MDL-68690 Search: Allow Solr to add documents in batches

Adding documents in batches instead of one at a time can make
indexing using Solr significantly faster.

This adds new API functions for search engines, including
add_document_batch() to add a batch of documents,
supports_add_document_batch(), get_batch_max_documents() and
get_batch_max_content().
This commit is contained in:
sam marshall 2020-05-13 12:11:21 +01:00
parent 49a9e8b07d
commit 0deb19468d
5 changed files with 367 additions and 16 deletions

View File

@ -218,8 +218,8 @@ abstract class engine {
* and have the search engine back end add them
* to the index.
*
* @param iterator $iterator the iterator of documents to index
* @param searcharea $searcharea the area for the documents to index
* @param \iterator $iterator the iterator of documents to index
* @param base $searcharea the area for the documents to index
* @param array $options document indexing options
* @return array Processed document counts
*/
@ -227,11 +227,15 @@ abstract class engine {
$numrecords = 0;
$numdocs = 0;
$numdocsignored = 0;
$numbatches = 0;
$lastindexeddoc = 0;
$firstindexeddoc = 0;
$partial = false;
$lastprogress = manager::get_current_time();
$batchmode = $this->supports_add_document_batch();
$currentbatch = [];
foreach ($iterator as $document) {
// Stop if we have exceeded the time limit (and there are still more items). Always
// do at least one second's worth of documents otherwise it will never make progress.
@ -255,10 +259,22 @@ abstract class engine {
$searcharea->attach_files($document);
}
if ($this->add_document($document, $options['indexfiles'])) {
$numdocs++;
if ($batchmode && strlen($document->get('content')) <= $this->get_batch_max_content()) {
$currentbatch[] = $document;
if (count($currentbatch) >= $this->get_batch_max_documents()) {
[$processed, $failed, $batches] = $this->add_document_batch($currentbatch, $options['indexfiles']);
$numdocs += $processed;
$numdocsignored += $failed;
$numbatches += $batches;
$currentbatch = [];
}
} else {
$numdocsignored++;
if ($this->add_document($document, $options['indexfiles'])) {
$numdocs++;
} else {
$numdocsignored++;
}
$numbatches++;
}
$lastindexeddoc = $document->get('modified');
@ -279,7 +295,15 @@ abstract class engine {
}
}
return array($numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial);
// Add remaining documents from batch.
if ($batchmode && $currentbatch) {
[$processed, $failed, $batches] = $this->add_document_batch($currentbatch, $options['indexfiles']);
$numdocs += $processed;
$numdocsignored += $failed;
$numbatches += $batches;
}
return [$numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial, $numbatches];
}
/**
@ -473,6 +497,27 @@ abstract class engine {
*/
abstract function add_document($document, $fileindexing = false);
/**
* Adds multiple documents to the search engine.
*
* It should return the number successfully processed, the number that failed, and the number
* of batches they were processed in (for example if you add 100 documents and there is an
* error processing one of those documents, and it took 4 batches, it would return [99, 1, 4]).
*
* If the engine implements this, it should return true to {@see supports_add_document_batch}.
*
* The system will only call this function with up to {@see get_batch_max_documents} documents,
* and each document in the batch will have content no larger than specified by
* {@see get_batch_max_content}.
*
* This default implementation always throws, so an engine that does not override it must
* leave {@see supports_add_document_batch} returning false.
*
* @param document[] $documents Documents to add
* @param bool $fileindexing True if file indexing is to be used
* @return int[] Array of three elements, successfully processed, failed processed, batch count
* @throws \coding_exception Always, in this base implementation
*/
public function add_document_batch(array $documents, bool $fileindexing = false): array {
throw new \coding_exception('add_document_batch not supported by this engine');
}
/**
* Executes the query on the engine.
*
@ -653,4 +698,44 @@ abstract class engine {
public function supports_users() {
return false;
}
/**
* Checks if the search engine supports adding documents in a batch.
*
* If an engine returns true from this function, it must also implement the
* add_document_batch function.
*
* This base implementation returns false; engines with batch support override it.
*
* @return bool True if the search engine supports adding documents in a batch
*/
public function supports_add_document_batch(): bool {
return false;
}
/**
* Gets the maximum number of documents to send together in batch mode.
*
* Only relevant if the engine returns true to {@see supports_add_document_batch}.
*
* Can be overridden by search engine if required.
*
* @return int Number of documents to send together in batch mode, default 100.
*/
public function get_batch_max_documents(): int {
return 100;
}
/**
* Gets the maximum size of document content to be included in a shared batch.
*
* If a document is bigger than this then it will be sent on its own; batching does not
* provide a performance improvement for big documents anyway.
*
* Only relevant if the engine returns true to {@see supports_add_document_batch}.
*
* Can be overridden by search engine if required.
*
* @return int Max size in bytes, default 1MB (1024 * 1024 bytes)
*/
public function get_batch_max_content(): int {
return 1024 * 1024;
}
}

View File

@ -1152,8 +1152,20 @@ class manager {
$recordset, array($searcharea, 'get_document'), $options));
$result = $this->engine->add_documents($iterator, $searcharea, $options);
$recordset->close();
if (count($result) === 5) {
list($numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial) = $result;
$batchinfo = '';
if (count($result) === 6) {
[$numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial, $batches] = $result;
// Only show the batch count if we actually batched any requests.
if ($batches !== $numdocs + $numdocsignored) {
$batchinfo = ' (' . $batches . ' batch' . ($batches === 1 ? '' : 'es') . ')';
}
} else if (count($result) === 5) {
// Backward compatibility for engines that don't return a batch count.
[$numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial] = $result;
// Deprecated since Moodle 4.0 MDL-68690.
// TODO: MDL-68776 This will be deleted in Moodle 4.4.
debugging('engine::add_documents() should return $batches (5-value return is deprecated)',
DEBUG_DEVELOPER);
} else {
throw new coding_exception('engine::add_documents() should return $partial (4-value return is deprecated)');
}
@ -1168,7 +1180,7 @@ class manager {
}
$progress->output('Processed ' . $numrecords . ' records containing ' . $numdocs .
' documents, in ' . $elapsed . ' seconds' . $partialtext . '.', 1);
' documents' . $batchinfo . ', in ' . $elapsed . ' seconds' . $partialtext . '.', 1);
} else {
$progress->output('No new documents to index.', 1);
}
@ -1305,8 +1317,20 @@ class manager {
// Use this iterator to add documents.
$result = $this->engine->add_documents($iterator, $searcharea, $options);
if (count($result) === 5) {
list($numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial) = $result;
$batchinfo = '';
if (count($result) === 6) {
[$numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial, $batches] = $result;
// Only show the batch count if we actually batched any requests.
if ($batches !== $numdocs + $numdocsignored) {
$batchinfo = ' (' . $batches . ' batch' . ($batches === 1 ? '' : 'es') . ')';
}
} else if (count($result) === 5) {
// Backward compatibility for engines that don't return a batch count.
[$numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial] = $result;
// Deprecated since Moodle 4.0 MDL-68690.
// TODO: MDL-68776 This will be deleted in Moodle 4.4 (as should the below bit).
debugging('engine::add_documents() should return $batches (5-value return is deprecated)',
DEBUG_DEVELOPER);
} else {
// Backward compatibility for engines that don't support partial adding.
list($numrecords, $numdocs, $numdocsignored, $lastindexeddoc) = $result;
@ -1318,7 +1342,7 @@ class manager {
if ($numdocs > 0) {
$elapsed = round((self::get_current_time() - $elapsed), 3);
$progress->output('Processed ' . $numrecords . ' records containing ' . $numdocs .
' documents, in ' . $elapsed . ' seconds' .
' documents' . $batchinfo . ', in ' . $elapsed . ' seconds' .
($partial ? ' (not complete)' : '') . '.', 1);
} else {
$progress->output('No documents to index.', 1);

View File

@ -753,6 +753,32 @@ class engine extends \core_search\engine {
return true;
}
/**
 * Sends several documents to the engine in one go.
 *
 * The exported data of every document is submitted together; attached files are then
 * handled separately, document by document, when file indexing is requested.
 *
 * @param \core_search\document[] $documents Documents to add
 * @param bool $fileindexing If true, indexes files (these are done one at a time)
 * @return int[] Array of three elements: successfully processed, failed processed, batch count
 */
public function add_document_batch(array $documents, bool $fileindexing = false): array {
    // Export each document to its engine field array, as a plain 0-based list.
    $exported = array_values(array_map(static function($item) {
        return $item->export_for_engine();
    }, $documents));

    $counts = $this->add_solr_documents($exported);

    if (!$fileindexing) {
        return $counts;
    }

    // Files are not batched (if there are files it's slow anyway). Note that this runs for
    // every document passed in, regardless of the success/failure counts reported above.
    foreach ($documents as $item) {
        // This will take care of updating all attached files in the index.
        $this->process_document_files($item);
    }

    return $counts;
}
/**
* Replaces underlines at edges of words in the content with spaces.
*
@ -771,12 +797,12 @@ class engine extends \core_search\engine {
}
/**
* Adds a text document to the search engine.
* Creates a Solr document object.
*
* @param array $doc
* @return bool
* @param array $doc Array of document fields
* @return \SolrInputDocument Created document
*/
protected function add_solr_document($doc) {
protected function create_solr_document(array $doc): \SolrInputDocument {
$solrdoc = new \SolrInputDocument();
// Replace underlines in the content with spaces. The reason for this is that for italic
@ -786,10 +812,23 @@ class engine extends \core_search\engine {
$doc['content'] = self::replace_underlines($doc['content']);
}
// Set all the fields.
foreach ($doc as $field => $value) {
$solrdoc->addField($field, $value);
}
return $solrdoc;
}
/**
* Adds a text document to the search engine.
*
* @param array $doc
* @return bool
*/
protected function add_solr_document($doc) {
$solrdoc = $this->create_solr_document($doc);
try {
$result = $this->get_search_client()->addDocument($solrdoc, true, static::AUTOCOMMIT_WITHIN);
return true;
@ -804,6 +843,50 @@ class engine extends \core_search\engine {
return false;
}
/**
 * Adds multiple text documents to the search engine.
 *
 * The documents are first sent as a single batch. If that request raises a Solr client or
 * server exception, each document is re-sent on its own so that the individual failures can
 * be counted; in that case the returned batch count is the number of individual requests.
 *
 * An empty input array results in no request at all and a return value of [0, 0, 0].
 *
 * @param array $docs Array of documents (each an array of fields) to add
 * @return int[] Array of success, failure, batch count
 * @throws \core_search\engine_exception
 */
protected function add_solr_documents(array $docs): array {
    // Nothing to index: do not issue an empty update request or count it as a batch.
    // NOTE(review): the Solr client is expected to reject an empty document array - confirm.
    if (!$docs) {
        return [0, 0, 0];
    }

    $solrdocs = [];
    foreach ($docs as $doc) {
        $solrdocs[] = $this->create_solr_document($doc);
    }

    try {
        // Add documents in a batch and report that they all succeeded.
        $this->get_search_client()->addDocuments($solrdocs, true, static::AUTOCOMMIT_WITHIN);
        return [count($solrdocs), 0, 1];
    } catch (\SolrClientException | \SolrServerException $e) {
        // Deliberately ignored here: the per-document fallback below works out which
        // document(s) failed.
        unset($e);
    }

    // When there is an error, we fall back to adding them individually so that we can report
    // which document(s) failed. Since it overwrites, adding the successful ones multiple
    // times won't hurt.
    $success = 0;
    $failure = 0;
    $batches = 0;
    foreach ($docs as $doc) {
        // Every individual request counts as one batch, whether or not it succeeds.
        $batches++;
        if ($this->add_solr_document($doc)) {
            $success++;
        } else {
            $failure++;
        }
    }

    return [$success, $failure, $batches];
}
/**
* Index files attached to the document, ensuring the index matches the current document files.
*
@ -1446,6 +1529,15 @@ class engine extends \core_search\engine {
return true;
}
/**
* Solr supports adding documents in a batch.
*
* Overrides the parent engine method to report that batch adding is available.
*
* @return bool True
*/
public function supports_add_document_batch(): bool {
return true;
}
/**
* Solr supports deleting the index for a context.
*

View File

@ -1297,6 +1297,148 @@ class search_solr_engine_testcase extends advanced_testcase {
$this->assert_raw_solr_query_result('content:xyzzy', []);
}
/**
 * Specific test of the add_document_batch function (also used in many other tests).
 *
 * Adds five documents (one of which has a file attached) in a single call with file indexing
 * enabled, then checks that all five are found by title and that the file content is
 * searchable too.
 */
public function test_add_document_batch() {
    // Get a default document.
    $area = new core_mocksearch\search\mock_search_area();
    $record = $this->generator->create_record();
    $doc = $area->get_document($record);
    $originalid = $doc->get('id');

    // Now create 5 similar documents.
    $docs = [];
    for ($i = 1; $i <= 5; $i++) {
        $doc = $area->get_document($record);
        $doc->set('id', $originalid . '-' . $i);
        $doc->set('title', 'Batch ' . $i);
        $docs[$i] = $doc;
    }

    // Document 3 has a file attached.
    $filerecord = new \stdClass();
    $filerecord->content = 'Some FileContents';
    $file = $this->generator->create_file($filerecord);
    $docs[3]->add_stored_file($file);

    // Add all these documents to the search engine: 5 succeeded, 0 failed, 1 batch.
    $this->assertEquals([5, 0, 1], $this->engine->add_document_batch($docs, true));
    $this->engine->area_index_complete($area->get_area_id());

    // Check all documents were indexed.
    $querydata = new stdClass();
    $querydata->q = 'Batch';
    $results = $this->search->search($querydata);
    $this->assertCount(5, $results);

    // Check it also finds based on the file.
    $querydata->q = 'FileContents';
    $results = $this->search->search($querydata);
    $this->assertCount(1, $results);
}
/**
 * Exercises the batching rules applied by add_documents: at most 100 documents
 * go into one batch, and documents with very large content are not batched.
 */
public function test_batching() {
    $area = new core_mocksearch\search\mock_search_area();
    $record = $this->generator->create_record();
    $baseid = $area->get_document($record)->get('id');

    // Builds the requested number of documents, keyed from 1, each with its own id.
    $makedocs = function(int $count) use ($area, $record, $baseid): array {
        $made = [];
        for ($index = 1; $index <= $count; $index++) {
            $item = $area->get_document($record);
            $item->set('id', $baseid . '-' . $index);
            $made[$index] = $item;
        }
        return $made;
    };

    // Indexes the given documents and hands back only the reported batch count
    // (the sixth element of the add_documents result).
    $countbatches = function(array $items) use ($area) {
        $result = $this->engine->add_documents(
                new ArrayIterator($items), $area, ['indexfiles' => true]);
        return $result[5];
    };

    // Up to 100 documents in 1 batch.
    $this->assertEquals(1, $countbatches($makedocs(100)));

    // More than 100 needs 2 batches.
    $this->assertEquals(2, $countbatches($makedocs(101)));

    // Small number but with some large documents that aren't batched.
    $mixed = $makedocs(10);

    // This one is just small enough to fit.
    $mixed[3]->set('content', str_pad('xyzzy ', 1024 * 1024, 'x'));

    // These two don't fit.
    foreach ([5, 6] as $toobig) {
        $mixed[$toobig]->set('content', str_pad('xyzzy ', 1024 * 1024 + 1, 'x'));
    }

    // Two oversized documents sent on their own, plus one shared batch for the rest.
    $this->assertEquals(3, $countbatches($mixed));

    // Check that all 3 of the large documents (added as batch or not) show up in results.
    $this->engine->area_index_complete($area->get_area_id());
    $query = new stdClass();
    $query->q = 'xyzzy';
    $found = $this->search->search($query);
    $this->assertCount(3, $found);
}
/**
 * Sends 100 documents, each carrying roughly 1MB of content, in one add_document_batch call.
 *
 * Batching stops for documents bigger than 1MB and a batch holds at most 100 documents,
 * so this is about the largest batch the engine should ever be asked to handle.
 */
public function test_add_document_batch_large() {
    // Slow and not essential, so only run when long tests are enabled.
    if (!PHPUNIT_LONGTEST) {
        $this->markTestSkipped('PHPUNIT_LONGTEST is not defined');
    }

    // Use a default document to obtain the base id.
    $area = new core_mocksearch\search\mock_search_area();
    $record = $this->generator->create_record();
    $baseid = $area->get_document($record)->get('id');

    // Build 100 large documents; each body repeats a numbered phrase and ends with a marker.
    $bytes = 1024 * 1024;
    $bigdocs = [];
    foreach (range(1, 100) as $number) {
        $filler = str_pad('', $bytes, 'Long text ' . $number . '. ', STR_PAD_RIGHT);
        $item = $area->get_document($record);
        $item->set('id', $baseid . '-' . $number);
        $item->set('title', 'Batch ' . $number);
        $item->set('content', $filler . ' xyzzy');
        $bigdocs[$number] = $item;
    }

    // Index the whole lot in one call.
    $this->engine->add_document_batch($bigdocs, true);
    $this->engine->area_index_complete($area->get_area_id());

    // The marker at the end of the content should match every document.
    $query = new stdClass();
    $query->q = 'xyzzy';
    $this->assertCount(100, $this->search->search($query));

    // Text that is only present in a single document.
    $query->q = '42';
    $this->assertCount(1, $this->search->search($query));
}
/**
* Carries out a raw Solr query using the Solr basic query syntax.
*

View File

@ -1,6 +1,14 @@
This file describes API changes in /search/*,
information provided here is intended especially for developers.
=== 4.0 ===
* Search indexing now supports sending multiple documents to the server in a batch. This is implemented
for the Solr search engine, where it significantly increases performance. For this to work, engines
should implement the add_document_batch() function and return true from supports_add_document_batch().
There is also an additional value returned from add_documents() with the number of batches
sent, which is used for the log display. Existing engines should continue to work unmodified.
=== 3.8 ===
* Search indexing supports time limits to make the scheduled task run more neatly since 3.4. In order for