Comments, update logic improved, rest of module add/delete functions added.

This commit is contained in:
mchampan 2006-08-21 00:50:29 +00:00
parent 0513f3bfa5
commit 791a4cece1
15 changed files with 218 additions and 34 deletions

View File

@ -1,3 +1,25 @@
2006/08/21
----------
Fixed index document count, and created new config variable to store
the size. (Search now has 3 global vars in $CFG, date, size and complete,
see indexer.php for var names). Index size is cached to provide an always
current value for the index - this is to take into account the fact that
deleted documents are in fact not removed from the index, but instead just
marked as deleted and not returned in search results. The actual document
still features in the index, and skews sizes. When the index optimiser is
completed in ZFS, then these deleted documents will be pruned, thus
correctly modifying the index size.
Additional commenting added.
Query page logic very slightly modified to clean up GET string a bit (removed
'p' variable).
Add/delete functions added to other document types.
A few TODO fields added to source, indicating changes still to come (or at
least to be considered).
2006/08/16
----------
Add/delete/update cron functions finished - can be called separately

View File

@ -22,11 +22,18 @@
$dbcontrol = new IndexDBControl();
$addition_count = 0;
mtrace('<pre>Starting index update (additions)...');
mtrace('Index size before: '.$index->count()."\n");
$indexdate = $CFG->search_indexer_run_date;
mtrace('<pre>Starting index update (additions)...');
mtrace('Index size before: '.$CFG->search_index_size."\n");
//get all modules
if ($mods = get_records_select('modules')) {
//append virtual modules onto array
$mods = array_merge($mods, search_get_additional_modules());
foreach ($mods as $mod) {
//build include file and function names
$class_file = $CFG->dirroot.'/search/documents/'.$mod->name.'_document.php';
$db_names_function = $mod->name.'_db_names';
$get_document_function = $mod->name.'_single_document';
@ -35,22 +42,29 @@
if (file_exists($class_file)) {
require_once($class_file);
//if both required functions exist
if (function_exists($db_names_function) and function_exists($get_document_function)) {
mtrace("Checking $mod->name module for additions.");
$values = $db_names_function();
$where = (isset($values[4])) ? $values[4] : '';
$sql = "select id, ".$values[0]." as docid from ".$values[1]."
where id not in
(select docid from ".SEARCH_DATABASE_TABLE." where doctype like '$mod->name')";
//select records in MODULE table, but not in SEARCH_DATABASE_TABLE
$sql = "select id, ".$values[0]." as docid from ".$values[1].
" where id not in".
" (select docid from ".SEARCH_DATABASE_TABLE." where doctype like '$mod->name')".
" and ".$values[2]." > $indexdate".
" $where";
$records = get_records_sql($sql);
//foreach record, build a module specific search document using the get_document function
if (is_array($records)) {
foreach($records as $record) {
$additions[] = $get_document_function($record->id);
} //foreach
} //if
//foreach document, add it to the index and database table
foreach ($additions as $add) {
++$addition_count;
@ -74,9 +88,11 @@
//commit changes
$index->commit();
//update index date
//update index date and size
set_config("search_indexer_run_date", time());
set_config("search_index_size", (int)$CFG->search_index_size + (int)$addition_count);
//print some additional info
mtrace("Added $addition_count documents.");
mtrace('Index size after: '.$index->count().'</pre>');

View File

@ -1,5 +1,17 @@
<?php
/* cron script to perform all the periodic search tasks
*
* delete.php
* updates the index by pruning deleted documents
*
* update.php
* updates document info in the index if the document has been modified since indexing
*
* add.php
* adds documents created since the last index run
*/
require_once('../config.php');
require_once("$CFG->dirroot/search/lib.php");

View File

@ -23,10 +23,13 @@
$deletion_count = 0;
mtrace('<pre>Starting clean-up of removed records...');
mtrace('Index size before: '.$index->count()."\n");
mtrace('Index size before: '.$CFG->search_index_size."\n");
if ($mods = get_records_select('modules')) {
$mods = array_merge($mods, search_get_additional_modules());
foreach ($mods as $mod) {
//build function names
$class_file = $CFG->dirroot.'/search/documents/'.$mod->name.'_document.php';
$delete_function = $mod->name.'_delete';
$db_names_function = $mod->name.'_db_names';
@ -39,13 +42,14 @@
mtrace("Checking $mod->name module for deletions.");
$values = $db_names_function();
$sql = "select id, docid from ".SEARCH_DATABASE_TABLE."
where doctype like '$mod->name'
and docid not in
(select ".$values[0]." from ".$values[1].")";
$sql = "select id, docid from ".SEARCH_DATABASE_TABLE.
" where doctype like '$mod->name'".
" and docid not in".
" (select ".$values[0]." from ".$values[1].")";
$records = get_records_sql($sql);
//build an array of all the deleted records
if (is_array($records)) {
foreach($records as $record) {
$deletions[] = $delete_function($record->docid);
@ -53,6 +57,7 @@
} //if
foreach ($deletions as $delete) {
//find the specific document in the index, using its docid and doctype as keys
$doc = $index->find("+docid:$delete +doctype:$mod->name");
//get the record, should only be one
@ -60,6 +65,7 @@
++$deletion_count;
mtrace(" Delete: $thisdoc->title (database id = $thisdoc->dbid, index id = $thisdoc->id, moodle instance id = $thisdoc->docid)");
//remove it from index and database table
$dbcontrol->delDocument($thisdoc);
$index->delete($thisdoc->id);
} //foreach
@ -74,8 +80,9 @@
//commit changes
$index->commit();
//update index date
//update index date and index size
set_config("search_indexer_run_date", time());
set_config("search_index_size", (int)$CFG->search_index_size - (int)$deletion_count);
mtrace("Finished $deletion_count removals.");
mtrace('Index size after: '.$index->count().'</pre>');

View File

@ -12,6 +12,7 @@
$this->addField(Zend_Search_Lucene_Field::UnIndexed('url', $doc->url));
$this->addField(Zend_Search_Lucene_Field::UnIndexed('date', $doc->date));
//additional data added on a per-module basis
$this->addField(Zend_Search_Lucene_Field::Binary('data', serialize($data)));
$this->addField(Zend_Search_Lucene_Field::Keyword('doctype', $document_type));

View File

@ -68,6 +68,30 @@
return $documents;
} //forum_get_content_for_index
//builds the search document for one forum post
//  $id - id of the row in forum_posts
//the post's discussion and that discussion's forum are looked up in turn, as
//the forum supplies the forum id and course id passed to the document
function forum_single_document($id) {
    //post -> discussion -> forum
    $post_set = get_recordset('forum_posts', 'id', $id);
    $post_row = $post_set->fields;

    $discussion_set = get_recordset('forum_discussions', 'id', $post_row['discussion']);
    $discussion_row = $discussion_set->fields;

    $forum_set = get_recordset('forum', 'id', $discussion_row['forum']);
    $forum_row = $forum_set->fields;

    //the group id comes from the post itself, everything else from the forum
    return new ForumSearchDocument($post_row, $forum_row['id'], $forum_row['course'], $post_row['groupid']);
} //forum_single_document
//identity mapping: returns the given value unchanged.
//the deletion cron passes each stale docid from the search table through
//<module>_delete() and uses the result as the docid key when looking the
//document up in the index
function forum_delete($info) {
return $info;
} //forum_delete
//names the cron scripts need to build their addition/update/deletion queries
//for forum posts; positions in the returned array are significant
function forum_db_names() {
    $names = array(
        'id',          //[0] primary id
        'forum_posts', //[1] table name
        'created',     //[2] time created field name
        'modified'     //[3] time modified field name
    );
    return $names;
} //forum_db_names
//reworked faster version from /mod/forum/lib.php
function forum_get_discussions_fast($forum) {
global $CFG, $USER;

View File

@ -7,7 +7,6 @@
* */
require_once("$CFG->dirroot/search/documents/document.php");
//require_once("$CFG->dirroot/mod/glossary/lib.php");
class GlossarySearchDocument extends SearchDocument {
public function __construct(&$entry, $glossary_id, $course_id, $group_id) {
@ -63,6 +62,7 @@
return $documents;
} //glossary_get_content_for_index
//returns a single glossary search document based on a glossary_entry id
function glossary_single_document($id) {
$entries = get_recordset('glossary_entries', 'id', $id);
$entry = $entries->fields;
@ -73,12 +73,16 @@
return new GlossarySearchDocument($entry, $entry['glossaryid'], $glossary['course'], -1);
} //glossary_single_document
//dummy delete function: returns the docid from the search table unchanged.
//the deletion cron passes each stale docid through <module>_delete() and uses
//the result as the docid key when looking the document up in the index
function glossary_delete($info) {
return $info;
} //glossary_delete
//returns the var names needed to build a sql query for addition/deletions
function glossary_db_names() {
return array('id', 'glossary_entries', 'timemodified');
//[primary id], [table name], [time created field name], [time modified field name]
return array('id', 'glossary_entries', 'timecreated', 'timemodified');
} //glossary_db_names
?>

View File

@ -58,4 +58,29 @@
return $documents;
} //resource_get_content_for_index
//returns a single resource search document based on a resource id
//  $id - id of the row in the resource table
//the query only matches resources whose alltext is not blank (empty, a single
//space or "&nbsp;") and whose type is not "file"
//NOTE(review): the table name is unprefixed and the SQL uses backticks and
//double-quoted string literals - looks MySQL-specific, confirm for other RDBMS
function resource_single_document($id) {
    //the id is concatenated straight into the SQL, so force it to an integer
    $resource_id = (int)$id;
    //no comma after the TYPE condition: the conditions are joined by AND only,
    //a trailing comma there is a SQL syntax error
    $resources = get_recordset_sql('SELECT *
                                    FROM `resource`
                                    WHERE alltext NOT LIKE ""
                                    AND alltext NOT LIKE " "
                                    AND alltext NOT LIKE "&nbsp;"
                                    AND TYPE != "file"
                                    AND id = '.$resource_id);
    $resource = $resources->fields;
    return new ResourceSearchDocument($resource);
} //resource_single_document
//identity mapping: returns the given value unchanged.
//the deletion cron passes each stale docid from the search table through
//<module>_delete() and uses the result as the docid key when looking the
//document up in the index
function resource_delete($info) {
return $info;
} //resource_delete
//returns the var names needed to build a sql query for addition/deletions
//[0] primary id, [1] table name, [2] time created field name,
//[3] time modified field name, [4] additional sql conditions
//the same field, timemodified, is given for both [2] and [3].
//[4] is appended by the additions cron to a query that already has a WHERE
//clause, so it must start with AND - a second WHERE makes the query invalid
function resource_db_names() {
    return array('id', 'resource', 'timemodified', 'timemodified', "AND alltext NOT LIKE '' AND alltext NOT LIKE ' ' AND alltext NOT LIKE '&nbsp;' AND TYPE != 'file'");
} //resource_db_names
?>

View File

@ -134,4 +134,25 @@
return $documents;
} //wiki_get_content_for_index
//builds the search document for one wiki page
//  $id - id of the row in wiki_pages
//the page's wiki field is used to look up the matching wiki_entries row, which
//supplies the wiki id, course id and group id passed to the document
function wiki_single_document($id) {
    //page -> entry
    $page_set = get_recordset('wiki_pages', 'id', $id);
    $page_row = $page_set->fields;

    $entry_set = get_recordset('wiki_entries', 'id', $page_row['wiki']);
    $entry_row = $entry_set->fields;

    return new WikiSearchDocument($page_row, $entry_row['wikiid'], $entry_row['course'], $entry_row['groupid']);
} //wiki_single_document
//identity mapping: returns the given value unchanged.
//the deletion cron passes each stale docid from the search table through
//<module>_delete() and uses the result as the docid key when looking the
//document up in the index
function wiki_delete($info) {
return $info;
} //wiki_delete
//names the cron scripts need to build their addition/update/deletion queries
//for wiki pages; positions in the returned array are significant
function wiki_db_names() {
    $names = array(
        'id',          //[0] primary id
        'wiki_pages',  //[1] table name
        'created',     //[2] time created field name
        'lastmodified' //[3] time modified field name
    );
    return $names;
} //wiki_db_names
?>

View File

@ -92,15 +92,17 @@
// * mod_get_content_for_index
//are the sole basis for including a module in the index at the moment.
if ($mods = get_records_select('modules' /*'index this module?' where statement*/)) {
$mods = array_merge($mods, search_get_additional_modules());
if ($mods = get_records_select('modules' /*'index this module?' where statement*/)) {
//add virtual modules onto the back of the array
$mods = array_merge($mods, search_get_additional_modules());
foreach ($mods as $mod) {
foreach ($mods as $mod) {
$class_file = $CFG->dirroot.'/search/documents/'.$mod->name.'_document.php';
if (file_exists($class_file)) {
include_once($class_file);
//build function names
$iter_function = $mod->name.'_iterator';
$index_function = $mod->name.'_get_content_for_index';
@ -163,5 +165,8 @@
//mark the time we last updated
set_config("search_indexer_run_date", time());
//and the index size
set_config("search_index_size", (int)$index->count());
?>

View File

@ -24,13 +24,15 @@
$this->path = $path;
//test to see if there is a valid index on disk, at the specified path
try {
$test_index = new Zend_Search_Lucene($this->path, false);
$validindex = true;
} catch(Exception $e) {
$validindex = false;
} //catch
//retrieve file system info about the index if it is valid
if ($validindex) {
$this->size = display_size(get_directory_size($this->path));
$index_dir = get_directory_list($this->path, '', false, false);
@ -42,11 +44,16 @@
$this->indexcount = 0;
} //else
$db_exists = false;
$db_exists = false; //for now
//get all the current tables in moodle
$admin_tables = $db->MetaTables();
//TODO: use new IndexDBControl class for database checks?
//check if our search table exists
if (in_array($CFG->prefix.SEARCH_DATABASE_TABLE, $admin_tables)) {
//retrieve database information if it does
$db_exists = true;
//total documents
@ -65,12 +72,14 @@
$this->types = array();
} //else
//check if the busy flag is set
if ($CFG->search_indexer_busy == '1') {
$this->complete = false;
} else {
$this->complete = true;
} //if
//get the last run date for the indexer
if ($this->valid() && $CFG->search_indexer_run_date) {
$this->time = $CFG->search_indexer_run_date;
} else {
@ -78,6 +87,7 @@
} //else
} //__construct
//returns false on error, and the error message via referenced variable $err
public function valid(&$err=null) {
$err = array();
$ret = true;
@ -100,6 +110,7 @@
return $ret;
} //valid
//is the index dir valid
public function is_valid_dir() {
if ($this->filecount > 0) {
return true;
@ -108,6 +119,7 @@
} //else
} //is_valid_dir
//is the db table valid
public function is_valid_db() {
if ($this->dbcount > 0) {
return true;
@ -116,6 +128,7 @@
} //else
} //is_valid_db
//shorthand get method for the class variables
public function __get($var) {
if (in_array($var, array_keys(get_class_vars(get_class($this))))) {
return $this->$var;
@ -126,9 +139,11 @@
/* DB Index control class
*
* Used to control the search index database table
* */
class IndexDBControl {
//does the table exist?
public function checkTableExists() {
global $CFG, $db;
@ -142,6 +157,7 @@
} //else
} //checkTableExists
//is our database setup valid?
public function checkDB() {
global $CFG, $db;
@ -159,6 +175,7 @@
return $ret;
} //checkDB
//add a document record to the table
public function addDocument($document=null) {
global $db;
@ -182,6 +199,7 @@
return $id;
} //addDocument
//remove a document record from the index
public function delDocument($document) {
global $db;

View File

@ -27,18 +27,22 @@
//check for php5, but don't die yet (see line 52)
if ($check = search_check_php5()) {
require_once("$CFG->dirroot/search/querylib.php");
require_once("$CFG->dirroot/search/querylib.php");
$advanced = (optional_param('a', '0', PARAM_INT) == '1') ? true : false;
$pages = (optional_param('p', '0', PARAM_INT) == '1') ? true : false;
$page_number = optional_param('page', -1, PARAM_INT);
$pages = ($page_number == -1) ? false : true;
$advanced = (optional_param('a', '0', PARAM_INT) == '1') ? true : false;
$query_string = optional_param('query_string', '', PARAM_CLEAN);
if ($pages && isset($_SESSION['search_advanced_query'])) {
//if both are set, then we are busy browsing through the result pages of an advanced query
$adv = unserialize($_SESSION['search_advanced_query']);
} else if ($advanced) {
//otherwise we are dealing with a new advanced query
unset($_SESSION['search_advanced_query']);
session_unregister('search_advanced_query');
//retrieve advanced query variables
$adv->mustappear = trim(optional_param('mustappear', '', PARAM_CLEAN), $chars);
$adv->notappear = trim(optional_param('notappear', '', PARAM_CLEAN), $chars);
$adv->canappear = trim(optional_param('canappear', '', PARAM_CLEAN), $chars);
@ -48,47 +52,59 @@
} //else
if ($advanced) {
//parse the advanced variables into a query string
//TODO: move out to external query class (QueryParse?)
//chars to strip from strings (whitespace)
$chars = ' \t\n\r\0\x0B,;';
$query_string = '';
//get all available module types
$module_types = array_merge(array('All'), array_values(search_get_document_types()));
$adv->module = in_array($adv->module, $module_types) ? $adv->module : 'All';
//convert '1 2' into '+1 +2' for required words field
if (strlen(trim($adv->mustappear)) > 0) {
$query_string = ' +'.implode(' +', preg_split("/[\s,;]+/", $adv->mustappear));
} //if
//convert '1 2' into '-1 -2' for not wanted words field
if (strlen(trim($adv->notappear)) > 0) {
$query_string .= ' -'.implode(' -', preg_split("/[\s,;]+/", $adv->notappear));
} //if
//this field is left untouched, apart from whitespace being stripped
if (strlen(trim($adv->canappear)) > 0) {
$query_string .= ' '.implode(' ', preg_split("/[\s,;]+/", $adv->canappear));
} //if
//add module restriction
if ($adv->module != 'All') {
$query_string .= ' +doctype:'.$adv->module;
} //if
//create title search string
if (strlen(trim($adv->title)) > 0) {
$query_string .= ' +title:'.implode(' +title:', preg_split("/[\s,;]+/", $adv->title));
} //if
//create author search string
if (strlen(trim($adv->author)) > 0) {
$query_string .= ' +author:'.implode(' +author:', preg_split("/[\s,;]+/", $adv->author));
} //if
//save our options if the query is valid
if (!empty($query_string)) {
$_SESSION['search_advanced_query'] = serialize($adv);
} //if
} //if
$page_number = optional_param('page', 1, PARAM_INT);
//normalise page number
if ($page_number < 1) {
$page_number = 1;
} //if
} //if
//run the query against the index
$sq = new SearchQuery($query_string, $page_number, 10, true);
} //if
@ -204,7 +220,8 @@
print 'Searching: ';
if ($sq->is_valid_index()) {
print $sq->index_count();
//use cached variable to show up-to-date index size (takes deletions into account)
print $CFG->search_index_size;
} else {
print "0";
} //else
@ -235,7 +252,10 @@
$hits = $sq->results();
if ($advanced) {
$page_links = preg_replace("/query_string=[^&]+/", 'a=1&p=1', $page_links);
//if in advanced mode, search options are saved in the session, so
//we can remove the query string var from the page links, and replace
//it with a=1 (Advanced = on) instead
$page_links = preg_replace("/query_string=[^&]+/", 'a=1', $page_links);
} //if
print "<ol>";

View File

@ -274,9 +274,10 @@
return count($this->results);
} //count
public function index_count() {
return $this->index->count();
} //index_count
//this shouldn't be in this class
//public function index_count() {
// return $this->index->count();
//} //index_count
public function is_valid() {
return ($this->validquery and $this->validindex);

View File

@ -87,10 +87,14 @@
$table->data[] = array('<strong>Database</strong>', '<em><strong>search_documents<strong></em>');
//add an extra field if we're admin
//add extra fields if we're admin
if (isadmin()) {
//don't want to confuse users if the two totals don't match (hint: they should)
$table->data[] = array('Documents in index', $indexinfo->indexcount);
//*cough* they should match if deletions were actually removed from the index,
//as it turns out, they're only marked as deleted and not returned in search results
$table->data[] = array('Deletions in index', (int)$indexinfo->indexcount - (int)$indexinfo->dbcount);
} //if
$table->data[] = array('Documents in database', $indexinfo->dbcount);

View File

@ -27,6 +27,8 @@
mtrace("<pre>Starting index update (updates)...\n");
if ($mods = get_records_select('modules')) {
$mods = array_merge($mods, search_get_additional_modules());
foreach ($mods as $mod) {
$class_file = $CFG->dirroot.'/search/documents/'.$mod->name.'_document.php';
$get_document_function = $mod->name.'_single_document';
@ -41,8 +43,10 @@
mtrace("Checking $mod->name module for updates.");
$values = $db_names_function();
$sql = "select id, ".$values[0]." as docid from ".$values[1]."
where ".$values[2]." > $indexdate";
//TODO: check 'in' syntax with other RDBMS' (add and update.php as well)
$sql = "select id, ".$values[0]." as docid from ".$values[1].
" where ".$values[3]." > $indexdate".
" and id in (select docid from ".SEARCH_DATABASE_TABLE.")";
$records = get_records_sql($sql);