MDL-59039 Global search: Allow partial indexing (in scheduled task)

This commit is contained in:
sam marshall 2017-05-25 18:19:06 +01:00
parent 350700bf8b
commit 67d6479581
13 changed files with 302 additions and 36 deletions

View File

@ -125,6 +125,9 @@ foreach ($searchareas as $area) {
$areasconfig[$areaid]->docsprocessed . ' , ' .
$areasconfig[$areaid]->recordsprocessed . ' , ' .
$areasconfig[$areaid]->docsignored;
if ($areasconfig[$areaid]->partial) {
$laststatus .= ' ' . get_string('searchpartial', 'admin');
}
} else {
$laststatus = '';
}

View File

@ -557,6 +557,13 @@ if ($hassiteconfig) {
$temp->add(new admin_setting_heading('searchengineheading', new lang_string('searchengine', 'admin'), ''));
$temp->add(new admin_setting_configselect('searchengine',
new lang_string('selectsearchengine', 'admin'), '', 'solr', $engines));
$temp->add(new admin_setting_heading('searchindexingheading', new lang_string('searchoptions', 'admin'), ''));
$temp->add(new admin_setting_configcheckbox('searchindexwhendisabled',
new lang_string('searchindexwhendisabled', 'admin'), new lang_string('searchindexwhendisabled_desc', 'admin'),
0));
$temp->add(new admin_setting_configduration('searchindextime',
new lang_string('searchindextime', 'admin'), new lang_string('searchindextime_desc', 'admin'),
600));
$ADMIN->add('searchplugins', $temp);
$ADMIN->add('searchplugins', new admin_externalpage('searchareas', new lang_string('searchareas', 'admin'),

View File

@ -985,10 +985,16 @@ $string['searchdeleteindex'] = 'Delete all indexed contents';
$string['searchengine'] = 'Search engine';
$string['searchindexactions'] = 'Index actions';
$string['searchindexdeleted'] = 'Index deleted';
$string['searchindextime'] = 'Indexing time limit';
$string['searchindextime_desc'] = 'When indexing large amounts of new content, the scheduled task will stop after this time limit is reached. It will continue the next time the task runs.';
$string['searchindexupdated'] = 'Search engine contents have been updated';
$string['searchindexwhendisabled'] = 'Index when disabled';
$string['searchindexwhendisabled_desc'] = 'Allows the scheduled task to build the search index even when search is disabled. This is useful if you want to build the index before the search facility appears to students.';
$string['searchinsettings'] = 'Search in settings';
$string['searchlastrun'] = 'Last run (time, # docs, # records, # ignores)';
$string['searchnotavailable'] = 'Search is not available';
$string['searchoptions'] = 'Search options';
$string['searchpartial'] = '(not yet fully indexed)';
$string['searchreindexed'] = 'All site contents have been reindexed.';
$string['searchreindexindex'] = 'Reindex all site contents';
$string['searchresults'] = 'Search results';

View File

@ -46,12 +46,13 @@ class search_index_task extends scheduled_task {
* Throw exceptions on errors (the job will be retried).
*/
public function execute() {
if (!\core_search\manager::is_global_search_enabled()) {
if (!\core_search\manager::is_global_search_enabled() &&
!get_config('core', 'searchindexwhendisabled')) {
return;
}
$globalsearch = \core_search\manager::instance();
// Indexing database records for modules + rich documents of forum.
$globalsearch->index();
$globalsearch->index(false, get_config('core', 'searchindextime'), new \text_progress_trace());
}
}

View File

@ -49,7 +49,8 @@ class search_optimize_task extends scheduled_task {
* Throw exceptions on errors (the job will be retried).
*/
public function execute() {
if (!\core_search\manager::is_global_search_enabled()) {
if (!\core_search\manager::is_global_search_enabled() &&
!get_config('core', 'searchindexwhendisabled')) {
return;
}

View File

@ -175,7 +175,8 @@ abstract class base {
list($componentname, $varname) = $this->get_config_var_name();
$config = [];
$settingnames = array('_enabled', '_indexingstart', '_indexingend', '_lastindexrun', '_docsignored', '_docsprocessed', '_recordsprocessed');
$settingnames = array('_enabled', '_indexingstart', '_indexingend', '_lastindexrun',
'_docsignored', '_docsprocessed', '_recordsprocessed', '_partial');
foreach ($settingnames as $name) {
$config[$varname . $name] = get_config($componentname, $varname . $name);
}
@ -209,6 +210,22 @@ abstract class base {
return set_config($varname . '_enabled', $isenabled, $componentname);
}
/**
 * Reports how long the most recent indexing run of this area took.
 *
 * The value is derived from the '_indexingstart' and '_indexingend' config values
 * stored for this area.
 *
 * @return int|bool Seconds spent on the last indexing run, or false when either timestamp is not set
 */
public function get_last_indexing_duration() {
    list($component, $prefix) = $this->get_config_var_name();
    $startedat = get_config($component, $prefix . '_indexingstart');
    $finishedat = get_config($component, $prefix . '_indexingend');

    // Without both timestamps there is no finished run to measure.
    if (!$startedat || !$finishedat) {
        return false;
    }

    return $finishedat - $startedat;
}
/**
* Returns true if this area uses file indexing.
*

View File

@ -213,8 +213,18 @@ abstract class engine {
$numdocs = 0;
$numdocsignored = 0;
$lastindexeddoc = 0;
$firstindexeddoc = 0;
$partial = false;
foreach ($iterator as $document) {
// Stop if we have exceeded the time limit (and there are still more items). Always
// do at least one second's worth of documents otherwise it will never make progress.
if ($lastindexeddoc !== $firstindexeddoc &&
!empty($options['stopat']) && microtime(true) >= $options['stopat']) {
$partial = true;
break;
}
if (!$document instanceof \core_search\document) {
continue;
}
@ -236,10 +246,13 @@ abstract class engine {
}
$lastindexeddoc = $document->get('modified');
if (!$firstindexeddoc) {
$firstindexeddoc = $lastindexeddoc;
}
$numrecords++;
}
return array($numrecords, $numdocs, $numdocsignored, $lastindexeddoc);
return array($numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial);
}
/**

View File

@ -521,11 +521,19 @@ class manager {
* Index all documents.
*
* @param bool $fullindex Whether we should reindex everything or not.
* @param float $timelimit Time limit in seconds (0 = no time limit)
* @param \progress_trace $progress Optional class for tracking progress
* @throws \moodle_exception
* @return bool Whether there was any updated document or not.
*/
public function index($fullindex = false) {
global $CFG;
public function index($fullindex = false, $timelimit = 0, \progress_trace $progress = null) {
// Cannot combine time limit with reindex.
if ($timelimit && $fullindex) {
throw new \coding_exception('Cannot apply time limit when reindexing');
}
if (!$progress) {
$progress = new \null_progress_trace();
}
// Unlimited time.
\core_php_time_limit::raise();
@ -536,11 +544,25 @@ class manager {
$sumdocs = 0;
$searchareas = $this->get_search_areas_list(true);
if ($timelimit) {
// If time is limited (and therefore we're not just indexing everything anyway), select
// an order for search areas. The intention here is to avoid a situation where a new
// large search area is enabled, and this means all our other search areas go out of
// date while that one is being indexed. To do this, we order by the time we spent
// indexing them last time we ran, meaning anything that took a very long time will be
// done last.
uasort($searchareas, function(\core_search\base $area1, \core_search\base $area2) {
return (int)$area1->get_last_indexing_duration() - (int)$area2->get_last_indexing_duration();
});
// Decide time to stop.
$stopat = microtime(true) + $timelimit;
}
foreach ($searchareas as $areaid => $searcharea) {
if (CLI_SCRIPT && !PHPUNIT_TEST) {
mtrace('Processing ' . $searcharea->get_visible_name() . ' area');
}
$progress->output('Processing area: ' . $searcharea->get_visible_name());
// Notify the engine that an area is starting.
$this->engine->area_index_starting($searcharea, $fullindex);
@ -556,7 +578,16 @@ class manager {
if ($fullindex === true) {
$referencestarttime = 0;
} else {
$referencestarttime = $prevtimestart;
$partial = get_config($componentconfigname, $varname . '_partial');
if ($partial) {
// When the previous index did not complete all data, we start from the time of the
// last document that was successfully indexed. (Note this will result in
// re-indexing that one document, but we can't avoid that because there may be
// other documents in the same second.)
$referencestarttime = intval(get_config($componentconfigname, $varname . '_lastindexrun'));
} else {
$referencestarttime = $prevtimestart;
}
}
// Getting the recordset from the area.
@ -565,27 +596,35 @@ class manager {
// Pass get_document as callback.
$fileindexing = $this->engine->file_indexing_enabled() && $searcharea->uses_file_indexing();
$options = array('indexfiles' => $fileindexing, 'lastindexedtime' => $prevtimestart);
if ($timelimit) {
$options['stopat'] = $stopat;
}
$iterator = new \core\dml\recordset_walk($recordset, array($searcharea, 'get_document'), $options);
list($numrecords,
$numdocs,
$numdocsignored,
$lastindexeddoc) = $this->engine->add_documents($iterator, $searcharea, $options);
$result = $this->engine->add_documents($iterator, $searcharea, $options);
if (count($result) === 5) {
list($numrecords, $numdocs, $numdocsignored, $lastindexeddoc, $partial) = $result;
} else {
// Backward compatibility for engines that don't support partial adding.
list($numrecords, $numdocs, $numdocsignored, $lastindexeddoc) = $result;
debugging('engine::add_documents() should return $partial (4-value return is deprecated)',
DEBUG_DEVELOPER);
$partial = false;
}
if (CLI_SCRIPT && !PHPUNIT_TEST) {
if ($numdocs > 0) {
$elapsed = round((microtime(true) - $elapsed), 3);
mtrace('Processed ' . $numrecords . ' records containing ' . $numdocs . ' documents for ' .
$searcharea->get_visible_name() . ' area, in ' . $elapsed . ' seconds.');
} else {
mtrace('No new documents to index for ' . $searcharea->get_visible_name() . ' area.');
}
if ($numdocs > 0) {
$elapsed = round((microtime(true) - $elapsed), 3);
$progress->output('Processed ' . $numrecords . ' records containing ' . $numdocs .
' documents, in ' . $elapsed . ' seconds' .
($partial ? ' (not complete)' : '') . '.', 1);
} else {
$progress->output('No new documents to index.', 1);
}
// Notify the engine this area is complete, and only mark times if true.
if ($this->engine->area_index_complete($searcharea, $numdocs, $fullindex)) {
$sumdocs += $numdocs;
// Store last index run once documents have been commited to the search engine.
// Store last index run once documents have been committed to the search engine.
set_config($varname . '_indexingstart', $indexingstart, $componentconfigname);
set_config($varname . '_indexingend', time(), $componentconfigname);
set_config($varname . '_docsignored', $numdocsignored, $componentconfigname);
@ -594,6 +633,18 @@ class manager {
if ($lastindexeddoc > 0) {
set_config($varname . '_lastindexrun', $lastindexeddoc, $componentconfigname);
}
if ($partial) {
set_config($varname . '_partial', 1, $componentconfigname);
} else {
unset_config($varname . '_partial', $componentconfigname);
}
} else {
$progress->output('Engine reported error.');
}
if ($timelimit && (microtime(true) >= $stopat)) {
$progress->output('Stopping indexing due to time limit.');
break;
}
}
@ -673,7 +724,8 @@ class manager {
*/
public function get_areas_config($searchareas) {
$vars = array('indexingstart', 'indexingend', 'lastindexrun', 'docsignored', 'docsprocessed', 'recordsprocessed');
$vars = array('indexingstart', 'indexingend', 'lastindexrun', 'docsignored',
'docsprocessed', 'recordsprocessed', 'partial');
$configsettings = [];
foreach ($searchareas as $searcharea) {

View File

@ -27,8 +27,9 @@ define('CLI_SCRIPT', true);
require(__DIR__.'/../../config.php');
require_once($CFG->libdir.'/clilib.php'); // cli only functions
list($options, $unrecognized) = cli_get_params(array('help' => false, 'force' => false, 'reindex' => false),
array('h' => 'help', 'f' => 'force', 'r' => 'reindex'));
list($options, $unrecognized) = cli_get_params(array('help' => false, 'force' => false,
'reindex' => false, 'timelimit' => 0),
array('h' => 'help', 'f' => 'force', 'r' => 'reindex', 't' => 'timelimit'));
if ($unrecognized) {
$unrecognized = implode("\n ", $unrecognized);
@ -40,18 +41,24 @@ if ($options['help']) {
"Index search data
Options:
-h, --help Print out this help
-r, --reindex Reindex data
-f, --force Allow indexer to run, even if global search is disabled.
-h, --help Print out this help
-r, --reindex Reindex data
-f, --force Allow indexer to run, even if global search is disabled.
-t=<n>, --timelimit=<n> Stop after indexing for specified time (in seconds)
Example:
Examples:
\$ sudo -u www-data /usr/bin/php search/cli/indexer.php --reindex
\$ sudo -u www-data /usr/bin/php search/cli/indexer.php --timelimit=300
";
echo $help;
die;
}
if ($options['timelimit'] && $options['reindex']) {
cli_error('Cannot apply time limit when reindexing');
}
if (!\core_search\manager::is_global_search_enabled() && empty($options['force'])) {
cli_error('Global search is disabled. Use --force if you want to force an index while disabled');
}
@ -70,13 +77,20 @@ if ($serverstatus !== true) {
$globalsearch = \core_search\manager::instance();
if (empty($options['reindex'])) {
echo "Running full index of site\n";
echo "==========================\n";
$globalsearch->index();
if ($options['timelimit']) {
$limitinfo = ' (max ' . $options['timelimit'] . ' seconds)';
$limitunderline = preg_replace('~.~', '=', $limitinfo);
echo "Running index of site$limitinfo\n";
echo "=====================$limitunderline\n";
} else {
echo "Running full index of site\n";
echo "==========================\n";
}
$globalsearch->index(false, $options['timelimit'], new text_progress_trace());
} else {
echo "Running full reindex of site\n";
echo "============================\n";
$globalsearch->index(true);
$globalsearch->index(true, 0, new text_progress_trace());
}
// Optimize index at last.

View File

@ -122,4 +122,8 @@ class mock_search_area extends \core_search\base {
public function get_context_url(\core_search\document $doc) {
return new \moodle_url('/index.php');
}
/**
 * Returns a fixed display name for the mock area.
 *
 * @param bool $lazyload Not used by this mock; the same literal is returned either way
 * @return string Always 'Mock search area'
 */
public function get_visible_name($lazyload = false) {
return 'Mock search area';
}
}

View File

@ -29,6 +29,12 @@ defined('MOODLE_INTERNAL') || die;
class engine extends \core_search\engine {
/** @var int If set, waits when adding each document (microseconds) */
protected $adddelay = 0;
/** @var \core_search\document[] Documents added */
protected $added = [];
public function is_installed() {
return true;
}
@ -38,7 +44,11 @@ class engine extends \core_search\engine {
}
public function add_document($document, $fileindexing = false) {
// No need to implement.
if ($this->adddelay) {
usleep($this->adddelay);
}
$this->added[] = $document;
return true;
}
public function execute_query($data, $usercontexts, $limit = 0) {
@ -64,4 +74,25 @@ class engine extends \core_search\engine {
public function get_query_total_count() {
return 0;
}
/**
 * Configures an artificial pause used to simulate the time taken to index a document.
 *
 * The value is stored in microseconds, truncated to a whole number.
 *
 * @param float $seconds Delay in seconds for each document
 */
public function set_add_delay($seconds) {
    $microseconds = $seconds * 1000000;
    $this->adddelay = (int)$microseconds;
}
/**
 * Hands back every document recorded since the previous call to this function,
 * and resets the record so the next call starts from an empty list.
 *
 * @return \core_search\document[] List of documents, in order added.
 */
public function get_and_clear_added_documents() {
    // Take a copy before emptying the internal list.
    $documents = $this->added;
    $this->added = [];

    return $documents;
}
}

View File

@ -116,6 +116,7 @@ class search_manager_testcase extends advanced_testcase {
$configs = $search->get_areas_config(array($this->forumpostareaid => $searcharea));
$this->assertEquals($start, $configs[$this->forumpostareaid]->indexingstart);
$this->assertEquals($end, $configs[$this->forumpostareaid]->indexingend);
$this->assertEquals(false, $configs[$this->forumpostareaid]->partial);
try {
$fakeareaid = \core_search\manager::generate_areaid('mod_unexisting', 'chihuaquita');
@ -132,6 +133,7 @@ class search_manager_testcase extends advanced_testcase {
$this->assertEquals(0, $config[$varname . '_indexingstart']);
$this->assertEquals(0, $config[$varname . '_indexingend']);
$this->assertEquals(0, $config[$varname . '_lastindexrun']);
$this->assertEquals(0, $config[$varname . '_partial']);
// No caching.
$configs = $search->get_areas_config(array($this->forumpostareaid => $searcharea));
$this->assertEquals(0, $configs[$this->forumpostareaid]->indexingstart);
@ -151,6 +153,114 @@ class search_manager_testcase extends advanced_testcase {
$this->assertEquals(0, $configs[$this->forumpostareaid]->indexingend);
}
/**
 * Tests the get_last_indexing_duration method in the base area class.
 *
 * Checks that the duration is false before any start/end times are stored, and that it
 * equals the difference between the stored end and start times afterwards.
 */
public function test_get_last_indexing_duration() {
    $this->resetAfterTest();

    $search = testable_core_search::instance();
    $searcharea = $search->get_search_area($this->forumpostareaid);

    // When never indexed, the duration is false.
    $this->assertSame(false, $searcharea->get_last_indexing_duration());

    // Set the start/end times. The clock is read only once and the start time is derived
    // from it, so the gap is exactly 100 seconds even if the second ticks over mid-test.
    list($componentname, $varname) = $searcharea->get_config_var_name();
    $end = time();
    $start = $end - 100;
    set_config($varname . '_indexingstart', $start, $componentname);
    set_config($varname . '_indexingend', $end, $componentname);

    // The duration should now be 100.
    $this->assertSame(100, $searcharea->get_last_indexing_duration());
}
/**
 * Tests that partial indexing works correctly.
 *
 * Scenario: index everything once, add three forum discussions with consecutive
 * modification times, then run time-limited index() calls against an engine that has been
 * slowed down with set_add_delay(). After each run the test checks which documents were
 * added and the state of the area's '_partial' config value.
 *
 * NOTE(review): the expectations depend on wall-clock timing (a 1.2 second delay per
 * document against limits of 1 and 2 seconds), so this test takes several seconds to run.
 */
public function test_partial_indexing() {
global $USER;
$this->resetAfterTest();
$this->setAdminUser();
// Create a course and a forum.
$generator = $this->getDataGenerator();
$course = $generator->create_course();
$forum = $generator->create_module('forum', ['course' => $course->id]);
// Index everything up to current. Ensure the course is older than current second so it
// definitely doesn't get indexed again next time.
$this->waitForSecond();
$search = testable_core_search::instance();
$search->index(false, 0);
// A run with no time limit must not leave the forum post area flagged as partial.
$searcharea = $search->get_search_area($this->forumpostareaid);
list($componentname, $varname) = $searcharea->get_config_var_name();
$this->assertFalse(get_config($componentname, $varname . '_partial'));
// Add 3 discussions to the forum, with modification times one second apart.
$now = time();
$generator->get_plugin_generator('mod_forum')->create_discussion(['course' => $course->id,
'forum' => $forum->id, 'userid' => $USER->id, 'timemodified' => $now,
'name' => 'Frog']);
$generator->get_plugin_generator('mod_forum')->create_discussion(['course' => $course->id,
'forum' => $forum->id, 'userid' => $USER->id, 'timemodified' => $now + 1,
'name' => 'Toad']);
$generator->get_plugin_generator('mod_forum')->create_discussion(['course' => $course->id,
'forum' => $forum->id, 'userid' => $USER->id, 'timemodified' => $now + 2,
'name' => 'Zombie']);
// Wait until all three modification times are in the past.
time_sleep_until($now + 3);
// Clear the count of added documents.
$search->get_engine()->get_and_clear_added_documents();
// Make the search engine delay while indexing each document.
$search->get_engine()->set_add_delay(1.2);
// Index with a limit of 2 seconds - it should index 2 of the documents (after the second
// one, it will have taken 2.4 seconds so it will stop).
$search->index(false, 2);
$added = $search->get_engine()->get_and_clear_added_documents();
$this->assertCount(2, $added);
$this->assertEquals('Frog', $added[0]->get('title'));
$this->assertEquals('Toad', $added[1]->get('title'));
$this->assertEquals(1, get_config($componentname, $varname . '_partial'));
// Add a label.
$generator->create_module('label', ['course' => $course->id, 'intro' => 'Vampire']);
// Wait to next second (so as to not reindex the label more than once, as it will now
// be timed before the indexing run).
$this->waitForSecond();
// Next index with 1 second limit should do the label and not the forum - the logic is,
// if it spent ages indexing an area last time, do that one last on next run.
$search->index(false, 1);
$added = $search->get_engine()->get_and_clear_added_documents();
$this->assertCount(1, $added);
$this->assertEquals('Vampire', $added[0]->get('title'));
// Index again with a 2 second limit - it will redo last post for safety (because of other
// things possibly having the same time second), and then do the remaining one. (Note:
// because it always does more than one second worth of items, it would actually index 2
// posts even if the limit were less than 2.)
$search->index(false, 2);
$added = $search->get_engine()->get_and_clear_added_documents();
$this->assertCount(2, $added);
$this->assertEquals('Toad', $added[0]->get('title'));
$this->assertEquals('Zombie', $added[1]->get('title'));
// The area has caught up, so the partial flag must be gone again.
$this->assertFalse(get_config($componentname, $varname . '_partial'));
// Index again - there should be nothing to index this time.
$search->index(false, 2);
$added = $search->get_engine()->get_and_clear_added_documents();
$this->assertCount(0, $added);
$this->assertFalse(get_config($componentname, $varname . '_partial'));
}
/**
* Adding this test here as get_areas_user_accesses process is the same, results just depend on the context level.
*

View File

@ -1,6 +1,13 @@
This file describes API changes in /search/*,
information provided here is intended especially for developers.
=== 3.4 ===
* Search indexing now supports time limits to make the scheduled task run more neatly. In order for
this to work, search engine plugins that override the add_documents() function will need to honour
the 'stopat' option and return an extra value ($partial) from this function (see base class in
engine.php). Unmodified plugins will still work, but without supporting time limits.
=== 3.2 ===
* Base search area classes have been renamed, please update your search areas to use the classes below: