MDL-68690 Search: Allow Solr to add documents in batches
Adding documents in batches instead of one at a time can make indexing using Solr significantly faster. This adds new API functions for search engines, including add_document_batch() to add a batch of documents, supports_add_document_batch(), get_batch_max_documents() and get_batch_max_content().
parent 49a9e8b07d
commit 0deb19468d
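For illustration, a rough sketch (not part of this commit) of how a search engine plugin could opt in to the new batch API. The plugin name, class, and send_to_backend() helper below are hypothetical, and the other methods an engine must implement are omitted:

    namespace search_myengine;

    class engine extends \core_search\engine {

        // Tell core search that this engine can accept batches of documents.
        public function supports_add_document_batch(): bool {
            return true;
        }

        /**
         * Adds a batch of documents (at most get_batch_max_documents()) in one request.
         *
         * @param \core_search\document[] $documents Documents to add
         * @param bool $fileindexing True if file indexing is to be used
         * @return int[] Successfully processed count, failed count, batch count
         */
        public function add_document_batch(array $documents, bool $fileindexing = false): array {
            $payload = [];
            foreach ($documents as $document) {
                $payload[] = $document->export_for_engine();
            }
            // Hypothetical helper that sends all documents in one request and
            // returns how many of them the back end rejected.
            $failed = $this->send_to_backend($payload);
            return [count($documents) - $failed, $failed, 1];
        }
    }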
@@ -218,8 +218,8 @@ abstract class engine {
     * and have the search engine back end add them
     * to the index.
     *
     * @param iterator $iterator the iterator of documents to index
     * @param searcharea $searcharea the area for the documents to index
     * @param \iterator $iterator the iterator of documents to index
     * @param base $searcharea the area for the documents to index
     * @param array $options document indexing options
     * @return array Processed document counts
     */
@@ -227,11 +227,15 @@ abstract class engine {
        $numrecords = 0;
        $numdocs = 0;
        $numdocsignored = 0;
        $numbatches = 0;
        $lastindexeddoc = 0;
        $firstindexeddoc = 0;
        $partial = false;
        $lastprogress = manager::get_current_time();

        $batchmode = $this->supports_add_document_batch();
        $currentbatch = [];

        foreach ($iterator as $document) {
            // Stop if we have exceeded the time limit (and there are still more items). Always
            // do at least one second's worth of documents otherwise it will never make progress.
@@ -255,10 +259,22 @@ abstract class engine {
                $searcharea->attach_files($document);
            }

            if ($this->add_document($document, $options['indexfiles'])) {
                $numdocs++;
            if ($batchmode && strlen($document->get('content')) <= $this->get_batch_max_content()) {
                $currentbatch[] = $document;
                if (count($currentbatch) >= $this->get_batch_max_documents()) {
                    [$processed, $failed, $batches] = $this->add_document_batch($currentbatch, $options['indexfiles']);
                    $numdocs += $processed;
                    $numdocsignored += $failed;
                    $numbatches += $batches;
                    $currentbatch = [];
                }
            } else {
                $numdocsignored++;
                if ($this->add_document($document, $options['indexfiles'])) {
                    $numdocs++;
                } else {
                    $numdocsignored++;
                }
                $numbatches++;
            }

            $lastindexeddoc = $document->get('modified');
@@ -279,7 +295,15 @@ abstract class engine {
            }
        }

        return array($numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial);
        // Add remaining documents from batch.
        if ($batchmode && $currentbatch) {
            [$processed, $failed, $batches] = $this->add_document_batch($currentbatch, $options['indexfiles']);
            $numdocs += $processed;
            $numdocsignored += $failed;
            $numbatches += $batches;
        }

        return [$numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial, $numbatches];
    }

    /**
@@ -473,6 +497,27 @@ abstract class engine {
     */
    abstract function add_document($document, $fileindexing = false);

    /**
     * Adds multiple documents to the search engine.
     *
     * It should return the number successfully processed, the number that failed, and the number
     * of batches they were processed in (for example if you add 100 documents and there is an
     * error processing one of those documents, and it took 4 batches, it would return [99, 1, 4]).
     *
     * If the engine implements this, it should return true to {@see supports_add_document_batch}.
     *
     * The system will only call this function with up to {@see get_batch_max_documents} documents,
     * and each document in the batch will have content no larger than specified by
     * {@see get_batch_max_content}.
     *
     * @param document[] $documents Documents to add
     * @param bool $fileindexing True if file indexing is to be used
     * @return int[] Array of three elements: successfully processed, failed, batch count
     */
    public function add_document_batch(array $documents, bool $fileindexing = false): array {
        throw new \coding_exception('add_document_batch not supported by this engine');
    }

    /**
     * Executes the query on the engine.
     *
@@ -653,4 +698,44 @@ abstract class engine {
    public function supports_users() {
        return false;
    }

    /**
     * Checks if the search engine supports adding documents in a batch.
     *
     * If it returns true to this function, the search engine must implement the add_document_batch
     * function.
     *
     * @return bool True if the search engine supports adding documents in a batch
     */
    public function supports_add_document_batch(): bool {
        return false;
    }

    /**
     * Gets the maximum number of documents to send together in batch mode.
     *
     * Only relevant if the engine returns true to {@see supports_add_document_batch}.
     *
     * Can be overridden by search engine if required.
     *
     * @return int Number of documents to send together in batch mode, default 100.
     */
    public function get_batch_max_documents(): int {
        return 100;
    }

    /**
     * Gets the maximum size of document content to be included in a shared batch (if the
     * document is bigger than this, it will be sent on its own; batching does not provide a
     * performance improvement for big documents anyway).
     *
     * Only relevant if the engine returns true to {@see supports_add_document_batch}.
     *
     * Can be overridden by search engine if required.
     *
     * @return int Max size in bytes, default 1MB
     */
    public function get_batch_max_content(): int {
        return 1024 * 1024;
    }
}
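If the defaults above do not suit a particular back end, the two limits can be overridden in the engine subclass, as the docblocks note ("Can be overridden by search engine if required"). A minimal sketch with hypothetical values, not part of this commit:

    // In a hypothetical engine subclass: send smaller batches, but allow larger documents in them.
    public function get_batch_max_documents(): int {
        return 50;
    }

    public function get_batch_max_content(): int {
        return 4 * 1024 * 1024; // 4MB instead of the default 1MB.
    }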
@@ -1152,8 +1152,20 @@ class manager {
                $recordset, array($searcharea, 'get_document'), $options));
        $result = $this->engine->add_documents($iterator, $searcharea, $options);
        $recordset->close();
        if (count($result) === 5) {
            list($numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial) = $result;
        $batchinfo = '';
        if (count($result) === 6) {
            [$numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial, $batches] = $result;
            // Only show the batch count if we actually batched any requests.
            if ($batches !== $numdocs + $numdocsignored) {
                $batchinfo = ' (' . $batches . ' batch' . ($batches === 1 ? '' : 'es') . ')';
            }
        } else if (count($result) === 5) {
            // Backward compatibility for engines that don't return a batch count.
            [$numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial] = $result;
            // Deprecated since Moodle 4.0 MDL-68690.
            // TODO: MDL-68776 This will be deleted in Moodle 4.4.
            debugging('engine::add_documents() should return $batches (5-value return is deprecated)',
                DEBUG_DEVELOPER);
        } else {
            throw new coding_exception('engine::add_documents() should return $partial (4-value return is deprecated)');
        }
@@ -1168,7 +1180,7 @@ class manager {
        }

        $progress->output('Processed ' . $numrecords . ' records containing ' . $numdocs .
                ' documents, in ' . $elapsed . ' seconds' . $partialtext . '.', 1);
                ' documents' . $batchinfo . ', in ' . $elapsed . ' seconds' . $partialtext . '.', 1);
    } else {
        $progress->output('No new documents to index.', 1);
    }
@@ -1305,8 +1317,20 @@ class manager {

        // Use this iterator to add documents.
        $result = $this->engine->add_documents($iterator, $searcharea, $options);
        if (count($result) === 5) {
            list($numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial) = $result;
        $batchinfo = '';
        if (count($result) === 6) {
            [$numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial, $batches] = $result;
            // Only show the batch count if we actually batched any requests.
            if ($batches !== $numdocs + $numdocsignored) {
                $batchinfo = ' (' . $batches . ' batch' . ($batches === 1 ? '' : 'es') . ')';
            }
        } else if (count($result) === 5) {
            // Backward compatibility for engines that don't return a batch count.
            [$numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial] = $result;
            // Deprecated since Moodle 4.0 MDL-68690.
            // TODO: MDL-68776 This will be deleted in Moodle 4.4 (as should the below bit).
            debugging('engine::add_documents() should return $batches (5-value return is deprecated)',
                DEBUG_DEVELOPER);
        } else {
            // Backward compatibility for engines that don't support partial adding.
            list($numrecords, $numdocs, $numdocsignored, $lastindexeddoc) = $result;
@@ -1318,7 +1342,7 @@ class manager {
        if ($numdocs > 0) {
            $elapsed = round((self::get_current_time() - $elapsed), 3);
            $progress->output('Processed ' . $numrecords . ' records containing ' . $numdocs .
                    ' documents, in ' . $elapsed . ' seconds' .
                    ' documents' . $batchinfo . ', in ' . $elapsed . ' seconds' .
                    ($partial ? ' (not complete)' : '') . '.', 1);
        } else {
            $progress->output('No documents to index.', 1);
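To illustrate what the manager now expects (all numbers below are made up), a batching engine's add_documents() returns six values, and the extra batch count feeds the progress line built above:

    // Hypothetical six-value return from engine::add_documents().
    [$numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial, $batches]
            = [2000, 1995, 5, 1587400000, false, 23];
    // With these values (and an elapsed time of, say, 4.2 seconds) the progress line reads:
    // "Processed 2000 records containing 1995 documents (23 batches), in 4.2 seconds."
    // A legacy engine returning only the first five values still works, but triggers a
    // DEBUG_DEVELOPER notice asking it to return $batches as well.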
@@ -753,6 +753,32 @@ class engine extends \core_search\engine {
        return true;
    }

    /**
     * Adds a batch of documents to the engine at once.
     *
     * @param \core_search\document[] $documents Documents to add
     * @param bool $fileindexing If true, indexes files (these are done one at a time)
     * @return int[] Array of three elements: successfully processed, failed, batch count
     */
    public function add_document_batch(array $documents, bool $fileindexing = false): array {
        $docdatabatch = [];
        foreach ($documents as $document) {
            $docdatabatch[] = $document->export_for_engine();
        }

        $resultcounts = $this->add_solr_documents($docdatabatch);

        // Files are processed one document at a time (if there are files it's slow anyway).
        if ($fileindexing) {
            foreach ($documents as $document) {
                // This will take care of updating all attached files in the index.
                $this->process_document_files($document);
            }
        }

        return $resultcounts;
    }

    /**
     * Replaces underlines at edges of words in the content with spaces.
     *
@@ -771,12 +797,12 @@ class engine extends \core_search\engine {
    }

    /**
     * Adds a text document to the search engine.
     * Creates a Solr document object.
     *
     * @param array $doc
     * @return bool
     * @param array $doc Array of document fields
     * @return \SolrInputDocument Created document
     */
    protected function add_solr_document($doc) {
    protected function create_solr_document(array $doc): \SolrInputDocument {
        $solrdoc = new \SolrInputDocument();

        // Replace underlines in the content with spaces. The reason for this is that for italic
@@ -786,10 +812,23 @@ class engine extends \core_search\engine {
            $doc['content'] = self::replace_underlines($doc['content']);
        }

        // Set all the fields.
        foreach ($doc as $field => $value) {
            $solrdoc->addField($field, $value);
        }

        return $solrdoc;
    }

    /**
     * Adds a text document to the search engine.
     *
     * @param array $doc
     * @return bool
     */
    protected function add_solr_document($doc) {
        $solrdoc = $this->create_solr_document($doc);

        try {
            $result = $this->get_search_client()->addDocument($solrdoc, true, static::AUTOCOMMIT_WITHIN);
            return true;
@@ -804,6 +843,50 @@ class engine extends \core_search\engine {
        return false;
    }

    /**
     * Adds multiple text documents to the search engine.
     *
     * @param array $docs Array of documents (each an array of fields) to add
     * @return int[] Array of success, failure, batch count
     * @throws \core_search\engine_exception
     */
    protected function add_solr_documents(array $docs): array {
        $solrdocs = [];
        foreach ($docs as $doc) {
            $solrdocs[] = $this->create_solr_document($doc);
        }

        try {
            // Add documents in a batch and report that they all succeeded.
            $this->get_search_client()->addDocuments($solrdocs, true, static::AUTOCOMMIT_WITHIN);
            return [count($solrdocs), 0, 1];
        } catch (\SolrClientException $e) {
            // If there is an exception, fall through...
            $donothing = true;
        } catch (\SolrServerException $e) {
            // If there is an exception, fall through...
            $donothing = true;
        }

        // When there is an error, we fall back to adding them individually so that we can report
        // which document(s) failed. Since it overwrites, adding the successful ones multiple
        // times won't hurt.
        $success = 0;
        $failure = 0;
        $batches = 0;
        foreach ($docs as $doc) {
            $result = $this->add_solr_document($doc);
            $batches++;
            if ($result) {
                $success++;
            } else {
                $failure++;
            }
        }

        return [$success, $failure, $batches];
    }

    /**
     * Index files attached to the document, ensuring the index matches the current document files.
     *
@@ -1446,6 +1529,15 @@ class engine extends \core_search\engine {
        return true;
    }

    /**
     * Solr supports adding documents in a batch.
     *
     * @return bool True
     */
    public function supports_add_document_batch(): bool {
        return true;
    }

    /**
     * Solr supports deleting the index for a context.
     *
@@ -1297,6 +1297,148 @@ class search_solr_engine_testcase extends advanced_testcase {
        $this->assert_raw_solr_query_result('content:xyzzy', []);
    }

    /**
     * Specific test of the add_document_batch function (also used in many other tests).
     */
    public function test_add_document_batch() {
        // Get a default document.
        $area = new core_mocksearch\search\mock_search_area();
        $record = $this->generator->create_record();
        $doc = $area->get_document($record);
        $originalid = $doc->get('id');

        // Now create 5 similar documents.
        $docs = [];
        for ($i = 1; $i <= 5; $i++) {
            $doc = $area->get_document($record);
            $doc->set('id', $originalid . '-' . $i);
            $doc->set('title', 'Batch ' . $i);
            $docs[$i] = $doc;
        }

        // Document 3 has a file attached.
        $fs = get_file_storage();
        $filerecord = new \stdClass();
        $filerecord->content = 'Some FileContents';
        $file = $this->generator->create_file($filerecord);
        $docs[3]->add_stored_file($file);

        // Add all these documents to the search engine.
        $this->assertEquals([5, 0, 1], $this->engine->add_document_batch($docs, true));
        $this->engine->area_index_complete($area->get_area_id());

        // Check all documents were indexed.
        $querydata = new stdClass();
        $querydata->q = 'Batch';
        $results = $this->search->search($querydata);
        $this->assertCount(5, $results);

        // Check it also finds based on the file.
        $querydata->q = 'FileContents';
        $results = $this->search->search($querydata);
        $this->assertCount(1, $results);
    }

    /**
     * Tests the batching logic, specifically the limit to 100 documents per
     * batch, and not batching very large documents.
     */
    public function test_batching() {
        $area = new core_mocksearch\search\mock_search_area();
        $record = $this->generator->create_record();
        $doc = $area->get_document($record);
        $originalid = $doc->get('id');

        // Up to 100 documents in 1 batch.
        $docs = [];
        for ($i = 1; $i <= 100; $i++) {
            $doc = $area->get_document($record);
            $doc->set('id', $originalid . '-' . $i);
            $docs[$i] = $doc;
        }
        [, , , , , $batches] = $this->engine->add_documents(
                new ArrayIterator($docs), $area, ['indexfiles' => true]);
        $this->assertEquals(1, $batches);

        // More than 100 needs 2 batches.
        $docs = [];
        for ($i = 1; $i <= 101; $i++) {
            $doc = $area->get_document($record);
            $doc->set('id', $originalid . '-' . $i);
            $docs[$i] = $doc;
        }
        [, , , , , $batches] = $this->engine->add_documents(
                new ArrayIterator($docs), $area, ['indexfiles' => true]);
        $this->assertEquals(2, $batches);

        // Small number but with some large documents that aren't batched.
        $docs = [];
        for ($i = 1; $i <= 10; $i++) {
            $doc = $area->get_document($record);
            $doc->set('id', $originalid . '-' . $i);
            $docs[$i] = $doc;
        }
        // This one is just small enough to fit.
        $docs[3]->set('content', str_pad('xyzzy ', 1024 * 1024, 'x'));
        // These two don't fit.
        $docs[5]->set('content', str_pad('xyzzy ', 1024 * 1024 + 1, 'x'));
        $docs[6]->set('content', str_pad('xyzzy ', 1024 * 1024 + 1, 'x'));
        [, , , , , $batches] = $this->engine->add_documents(
                new ArrayIterator($docs), $area, ['indexfiles' => true]);
        $this->assertEquals(3, $batches);

        // Check that all 3 of the large documents (added as batch or not) show up in results.
        $this->engine->area_index_complete($area->get_area_id());
        $querydata = new stdClass();
        $querydata->q = 'xyzzy';
        $results = $this->search->search($querydata);
        $this->assertCount(3, $results);
    }

    /**
     * Tests with large documents. The point of this test is that we stop batching
     * documents if they are bigger than 1MB, and the maximum batch count is 100,
     * so the maximum size batch will be about 100 1MB documents.
     */
    public function test_add_document_batch_large() {
        // This test is a bit slow and not that important to run every time...
        if (!PHPUNIT_LONGTEST) {
            $this->markTestSkipped('PHPUNIT_LONGTEST is not defined');
        }

        // Get a default document.
        $area = new core_mocksearch\search\mock_search_area();
        $record = $this->generator->create_record();
        $doc = $area->get_document($record);
        $originalid = $doc->get('id');

        // Now create 100 large documents.
        $size = 1024 * 1024;
        $docs = [];
        for ($i = 1; $i <= 100; $i++) {
            $doc = $area->get_document($record);
            $doc->set('id', $originalid . '-' . $i);
            $doc->set('title', 'Batch ' . $i);
            $doc->set('content', str_pad('', $size, 'Long text ' . $i . '. ', STR_PAD_RIGHT) . ' xyzzy');
            $docs[$i] = $doc;
        }

        // Add all these documents to the search engine.
        $this->engine->add_document_batch($docs, true);
        $this->engine->area_index_complete($area->get_area_id());

        // Check all documents were indexed, searching for text at end.
        $querydata = new stdClass();
        $querydata->q = 'xyzzy';
        $results = $this->search->search($querydata);
        $this->assertCount(100, $results);

        // Search for specific text that's only in one.
        $querydata->q = '42';
        $results = $this->search->search($querydata);
        $this->assertCount(1, $results);
    }

    /**
     * Carries out a raw Solr query using the Solr basic query syntax.
     *
@@ -1,6 +1,14 @@
This file describes API changes in /search/*,
information provided here is intended especially for developers.

=== 4.0 ===

* Search indexing now supports sending multiple documents to the server in a batch. This is implemented
  for the Solr search engine, where it significantly increases performance. For this to work, engines
  should implement the add_document_batch() function and return true from supports_add_document_batch().
  add_documents() also returns an additional value with the number of batches sent, which is used for
  the log display. Existing engines should continue to work unmodified.

=== 3.8 ===

* Search indexing supports time limits to make the scheduled task run more neatly since 3.4. In order for