1
0
mirror of https://github.com/phpbb/phpbb.git synced 2025-07-30 21:40:43 +02:00

- overhauled search system

- updated structure for search backend plugins
  - better result caching using ACM
  - search results no longer session restricted => link to them by copying the URL :)
  - in-topic search
  - indexing posts now uses search backend plugins
  - develop/search_fill.php working again
  - fulltext_mysql not working yet
- tiny bugfixes to ACM


git-svn-id: file:///svn/phpbb/trunk@5441 89ea8834-ac86-4346-8a33-228a782c2dd0
This commit is contained in:
Nils Adermann
2006-01-11 18:56:07 +00:00
parent 9ea5fa1768
commit 0e0b1120fb
22 changed files with 1522 additions and 1036 deletions

View File

@@ -57,7 +57,7 @@ class acm
if (!$var_names)
{
$var_requested[] = $row['var_name'];
$this->var_requested[] = $row['var_name'];
}
}
}
@@ -137,7 +137,7 @@ class acm
if ($var_name{0} == '_')
{
if (!in_array($this->var_requested, $var_name))
if (!in_array($var_name, $this->var_requested))
{
$this->var_requested[] = $var_name;

View File

@@ -154,6 +154,11 @@ class acm
{
global $phpEx;
if (!$this->_exists($var_name))
{
return;
}
if ($var_name == 'sql' && !empty($table))
{
$regex = '(' . ((is_array($table)) ? implode('|', $table) : $table) . ')';

View File

@@ -564,7 +564,7 @@ class acp_board
$dp = opendir($phpbb_root_path . 'includes/search');
while ($file = readdir($dp))
{
if (preg_match('#\.' . $phpEx . '$#', $file))
if ((preg_match('#\.' . $phpEx . '$#', $file)) && ($file != "search.$phpEx"))
{
$search_plugins[] = preg_replace('#^(.*?)\.' . $phpEx . '$#', '\1', $file);
}

View File

@@ -2057,6 +2057,11 @@ function page_footer()
// Tidy some table rows every week
$cron_type = 'tidy_database';
}
else if (time() - $config['search_last_gc'] > $config['search_gc'])
{
// Tidy the cache
$cron_type = 'tidy_search';
}
/**
* @todo add session garbage collection

View File

@@ -509,7 +509,7 @@ function delete_topics($where_type, $where_ids, $auto_sync = true)
*/
function delete_posts($where_type, $where_ids, $auto_sync = true)
{
global $db;
global $db, $config, $phpbb_root_path, $phpEx;
if (is_array($where_ids))
{
@@ -542,7 +542,7 @@ function delete_posts($where_type, $where_ids, $auto_sync = true)
$db->sql_transaction('begin');
$table_ary = array(POSTS_TABLE, REPORTS_TABLE, SEARCH_MATCH_TABLE);
$table_ary = array(POSTS_TABLE, REPORTS_TABLE);
foreach ($table_ary as $table)
{
@@ -552,6 +552,26 @@ function delete_posts($where_type, $where_ids, $auto_sync = true)
}
unset($table_ary);
// Remove the message from the search index
$search_type = $config['search_type'];
if (!file_exists($phpbb_root_path . 'includes/search/' . $search_type . '.' . $phpEx))
{
trigger_error('NO_SUCH_SEARCH_MODULE');
}
require("{$phpbb_root_path}includes/search/$search_type.$phpEx");
$error = false;
$search = new $search_type($error);
if ($error)
{
trigger_error($error);
}
$search->index_remove($where_ids);
delete_attachments('post', $post_ids, false);
$db->sql_transaction('commit');

View File

@@ -1212,325 +1212,4 @@ class parse_message extends bbcode_firstpass
}
}
/**
* @package phpBB3
* Parses a given message and updates/maintains the fulltext tables
* @todo replace fulltext_search in message_parser with search modules
*/
class fulltext_search
{
	/**
	* Splits a text into the list of lowercase words that should be indexed.
	*
	* Markup (comment-wrapped elements, entities, [code] blocks, BBCode tags) and
	* punctuation are stripped, stop words are removed, synonyms are replaced and
	* words shorter than $config['min_search_chars'] or longer than
	* $config['max_search_chars'] are dropped.
	*
	* The stop word and synonym lists are read once per request from
	* search_stopwords.txt and search_synonyms.txt in $user->lang_path and kept in
	* static variables. A missing or unreadable file simply yields an empty list.
	*
	* @param string $mode Not used by this method; kept for interface compatibility
	* @param string $text The text to split
	*
	* @return array|null The remaining words (original array keys are preserved, so the
	*                    list may have gaps), or null if $config['load_search_upd'] is off
	*/
	function split_words($mode, $text)
	{
		global $user, $config;
		static $drop_char_match, $drop_char_replace, $stopwords, $replace_synonym, $match_synonym;

		// Is the fulltext indexer disabled? If yes then we need not carry on
		if (!$config['load_search_upd'])
		{
			return;
		}

		if (!is_array($drop_char_match))
		{
			$drop_char_match = array('-', '^', '$', ';', '#', '&', '(', ')', '<', '>', '`', '\'', '"', '|', ',', '@', '_', '?', '%', '~', '.', '[', ']', '{', '}', ':', '\\', '/', '=', '\'', '!', '*');
			$drop_char_replace = array(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '', '', ' ', ' ', ' ', ' ', '', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '' , ' ', ' ', ' ', ' ', ' ');

			// Always start from empty lists so the sizeof() checks below are safe
			// even if one of the language files cannot be read
			$stopwords = array();
			$replace_synonym = array();
			$match_synonym = array();

			$stopwords_file = $user->lang_path . '/search_stopwords.txt';
			if (is_readable($stopwords_file))
			{
				$contents = file_get_contents($stopwords_file);
				if ($contents !== false)
				{
					$stopwords = explode("\n", str_replace("\r\n", "\n", $contents));
				}
			}

			$synonyms_file = $user->lang_path . '/search_synonyms.txt';
			if (is_readable($synonyms_file))
			{
				$contents = file_get_contents($synonyms_file);
				if ($contents !== false)
				{
					preg_match_all('#^(.*?) (.*?)$#ms', str_replace("\r\n", "\n", $contents), $synonyms);

					// Assign by value: binding a static variable to a reference detaches
					// it from its static storage, which would lose the lists after this call
					$replace_synonym = $synonyms[1];
					$match_synonym = $synonyms[2];
				}
			}
			unset($contents);
		}

		$match = array();
		// Comments for hardcoded bbcode elements (urls, smilies, html)
		$match[] = '#<!\-\- .* \-\->(.*?)<!\-\- .* \-\->#is';
		// New lines, carriage returns
		$match[] = "#[\n\r]+#";
		// NCRs like &nbsp; etc.
		$match[] = '#(&amp;|&)[\#a-z0-9]+?;#i';
		// Do not index code
		$match[] = '#\[code=?.*?(\:?[0-9a-z]{5,})\].*?\[\/code(\:?[0-9a-z]{5,})\]#is';
		// BBcode
		$match[] = '#\[\/?[a-z\*\+\-]+=?.*?(\:?[0-9a-z]{5,})\]#';

		$text = preg_replace($match, ' ', ' ' . strtolower(trim($text)) . ' ');

		$text = str_replace(array(' + ', ' - '), array(' and ', ' not '), $text);

		// Filter out non-alphabetical chars
		$text = str_replace($drop_char_match, $drop_char_replace, $text);

		// Split words
		$text = explode(' ', preg_replace('#\s+#', ' ', trim($text)));

		if (sizeof($stopwords))
		{
			$text = array_diff($text, $stopwords);
		}

		if (sizeof($replace_synonym))
		{
			$text = str_replace($replace_synonym, $match_synonym, $text);
		}

		// Drop words outside the configured length range
		foreach ($text as $index => $word)
		{
			if (strlen($word) < $config['min_search_chars'] || strlen($word) > $config['max_search_chars'])
			{
				unset($text[$index]);
			}
		}

		return $text;
	}

	/**
	* Builds a glued list of quoted, escaped words for use inside an SQL statement.
	*
	* @param array $words Words to quote
	* @param string $prefix Text placed before every escaped word (including the opening quote)
	* @param string $suffix Text placed after every escaped word (including the closing quote)
	* @param string $glue Separator between the entries
	*
	* @return string
	*/
	function _sql_word_list($words, $prefix, $suffix, $glue)
	{
		global $db;

		$entries = array();
		foreach ($words as $word)
		{
			$entries[] = $prefix . $db->sql_escape($word) . $suffix;
		}

		return implode($glue, $entries);
	}

	/**
	* Adds or updates the word index entries of a post.
	*
	* For $mode == 'edit' the words currently stored for the post are compared with
	* the new message/subject and only the differences are inserted/deleted; for
	* any other mode all words are added. Afterwards search_tidy() is run if
	* $config['search_last_gc'] is older than $config['search_gc'].
	*
	* @param string $mode 'edit' to diff against the existing index entries
	* @param int $post_id Id of the post being indexed
	* @param string $message Post text
	* @param string $subject Post subject (may be empty)
	*/
	function add($mode, $post_id, &$message, &$subject)
	{
		global $config, $db;

		// Is the fulltext indexer disabled? If yes then we need not carry on
		if (!$config['load_search_upd'])
		{
			return;
		}

		// The id is embedded into several queries below
		$post_id = (int) $post_id;

		// Split old and new post/subject to obtain array of 'words'
		$split_text = $this->split_words('post', $message);
		$split_title = ($subject) ? $this->split_words('post', $subject) : array();

		$words = array(
			'add'	=> array('post' => array(), 'title' => array()),
			'del'	=> array('post' => array(), 'title' => array())
		);

		// Both keys must exist even if the post has no indexed words yet
		$cur_words = array('post' => array(), 'title' => array());

		if ($mode == 'edit')
		{
			$sql = 'SELECT w.word_id, w.word_text, m.title_match
				FROM ' . SEARCH_WORD_TABLE . ' w, ' . SEARCH_MATCH_TABLE . " m
				WHERE m.post_id = $post_id
					AND w.word_id = m.word_id";
			$result = $db->sql_query($sql);

			while ($row = $db->sql_fetchrow($result))
			{
				$which = ($row['title_match']) ? 'title' : 'post';
				$cur_words[$which][$row['word_text']] = $row['word_id'];
			}
			$db->sql_freeresult($result);

			$words['add']['post'] = array_diff($split_text, array_keys($cur_words['post']));
			$words['add']['title'] = array_diff($split_title, array_keys($cur_words['title']));
			$words['del']['post'] = array_diff(array_keys($cur_words['post']), $split_text);
			$words['del']['title'] = array_diff(array_keys($cur_words['title']), $split_title);
		}
		else
		{
			$words['add']['post'] = $split_text;
			$words['add']['title'] = $split_title;
		}
		unset($split_text);
		unset($split_title);

		// Get unique words from the above arrays
		$unique_add_words = array_unique(array_merge($words['add']['post'], $words['add']['title']));

		// We now have unique arrays of all words to be added and removed and
		// individual arrays of added and removed words for text and title. What
		// we need to do now is add the new words (if they don't already exist)
		// and then add (or remove) matches between the words and this post
		if (sizeof($unique_add_words))
		{
			$sql = 'SELECT word_id, word_text
				FROM ' . SEARCH_WORD_TABLE . '
				WHERE word_text IN (' . $this->_sql_word_list($unique_add_words, "'", "'", ', ') . ')';
			$result = $db->sql_query($sql);

			$word_ids = array();
			while ($row = $db->sql_fetchrow($result))
			{
				$word_ids[$row['word_text']] = $row['word_id'];
			}
			$db->sql_freeresult($result);

			$new_words = array_diff($unique_add_words, array_keys($word_ids));
			unset($unique_add_words);

			if (sizeof($new_words))
			{
				switch (SQL_LAYER)
				{
					case 'mysql':
						$sql = 'INSERT INTO ' . SEARCH_WORD_TABLE . ' (word_text)
							VALUES ' . $this->_sql_word_list($new_words, "('", "')", ', ');
						$db->sql_query($sql);
					break;

					case 'mysql4':
					case 'mysqli':
					case 'mssql':
					case 'mssql_odbc':
					case 'sqlite':
						$sql = 'INSERT INTO ' . SEARCH_WORD_TABLE . ' (word_text) ' . $this->_sql_word_list($new_words, "SELECT '", "'", ' UNION ALL ');
						$db->sql_query($sql);
					break;

					default:
						foreach ($new_words as $word)
						{
							$sql = 'INSERT INTO ' . SEARCH_WORD_TABLE . " (word_text)
								VALUES ('" . $db->sql_escape($word) . "')";
							$db->sql_query($sql);
						}
					break;
				}
			}
			unset($new_words);
		}

		// Remove matches for words that are no longer part of the post
		foreach ($words['del'] as $word_in => $word_ary)
		{
			$title_match = ($word_in == 'title') ? 1 : 0;

			if (sizeof($word_ary))
			{
				$sql_in = array();
				foreach ($word_ary as $word)
				{
					$sql_in[] = $cur_words[$word_in][$word];
				}

				$sql = 'DELETE FROM ' . SEARCH_MATCH_TABLE . '
					WHERE word_id IN (' . implode(', ', $sql_in) . ")
						AND post_id = $post_id
						AND title_match = $title_match";
				$db->sql_query($sql);
				unset($sql_in);
			}
		}

		// Add matches for the new words
		foreach ($words['add'] as $word_in => $word_ary)
		{
			$title_match = ($word_in == 'title') ? 1 : 0;

			if (sizeof($word_ary))
			{
				$sql = 'INSERT INTO ' . SEARCH_MATCH_TABLE . " (post_id, word_id, title_match)
					SELECT $post_id, word_id, $title_match
					FROM " . SEARCH_WORD_TABLE . '
					WHERE word_text IN (' . $this->_sql_word_list($word_ary, "'", "'", ', ') . ')';
				$db->sql_query($sql);
			}
		}
		unset($words);

		// Run the cleanup infrequently, once per session cleanup
		if ($config['search_last_gc'] < time() - $config['search_gc'])
		{
			$this->search_tidy();
		}
	}

	/**
	* Tidies up the index tables: flags words that match more than 60% of all
	* posts as common and removes their matches, deletes words that are no longer
	* referenced by any match, and finally stores the current time in the
	* 'search_last_gc' config value.
	*/
	function search_tidy()
	{
		global $db, $config;

		// Is the fulltext indexer disabled? If yes then we need not carry on
		if (!$config['load_search_upd'])
		{
			return;
		}

		// Remove common (> 60% of posts ) words
		$sql = 'SELECT SUM(forum_posts) AS total_posts
			FROM ' . FORUMS_TABLE;
		$result = $db->sql_query($sql);

		$row = $db->sql_fetchrow($result);
		$db->sql_freeresult($result);

		// Only bother once the board has at least 100 posts
		if ($row['total_posts'] >= 100)
		{
			$sql = 'SELECT word_id
				FROM ' . SEARCH_MATCH_TABLE . '
				GROUP BY word_id
				HAVING COUNT(word_id) > ' . floor($row['total_posts'] * 0.6);
			$result = $db->sql_query($sql);

			if ($row = $db->sql_fetchrow($result))
			{
				$sql_in = array();
				do
				{
					$sql_in[] = $row['word_id'];
				}
				while ($row = $db->sql_fetchrow($result));

				$sql_in = implode(', ', $sql_in);

				$sql = 'UPDATE ' . SEARCH_WORD_TABLE . "
					SET word_common = 1
					WHERE word_id IN ($sql_in)";
				$db->sql_query($sql);

				$sql = 'DELETE FROM ' . SEARCH_MATCH_TABLE . "
					WHERE word_id IN ($sql_in)";
				$db->sql_query($sql);
				unset($sql_in);
			}
			$db->sql_freeresult($result);
		}

		// Remove words with no matches ... this is a potentially nasty query.
		// Group by the word table's id: m.word_id is NULL for every row selected
		// here, so grouping by it would collapse all orphaned words into one row.
		$sql = 'SELECT w.word_id
			FROM ' . SEARCH_WORD_TABLE . ' w
			LEFT JOIN ' . SEARCH_MATCH_TABLE . ' m ON (w.word_id = m.word_id)
			WHERE w.word_common = 0 AND m.word_id IS NULL
			GROUP BY w.word_id';
		$result = $db->sql_query($sql);

		if ($row = $db->sql_fetchrow($result))
		{
			$sql_in = array();
			do
			{
				$sql_in[] = $row['word_id'];
			}
			while ($row = $db->sql_fetchrow($result));

			$sql = 'DELETE FROM ' . SEARCH_WORD_TABLE . '
				WHERE word_id IN (' . implode(', ', $sql_in) . ')';
			$db->sql_query($sql);
			unset($sql_in);
		}
		$db->sql_freeresult($result);

		set_config('search_last_gc', time());
	}
}
?>

File diff suppressed because it is too large Load Diff

264
phpBB/includes/search/search.php Executable file
View File

@@ -0,0 +1,264 @@
<?php
/**
*
* @package search
* @version $Id$
* @copyright (c) 2005 phpBB Group
* @license http://opensource.org/licenses/gpl-license.php GNU Public License
*
*/
/**
* @ignore
*/
// Return codes of search_backend::obtain_ids(). Each state needs its own value,
// otherwise callers cannot tell "nothing cached" apart from "partially cached".
define('SEARCH_RESULT_NOT_IN_CACHE', 0);
define('SEARCH_RESULT_IN_CACHE', 1);
define('SEARCH_RESULT_INCOMPLETE', 2);
/**
* @package search
* search_backend
* optional base class for search plugins providing simple caching based on ACM
* and functions to retrieve ignore_words and synonyms
*/
class search_backend
{
	var $ignore_words = array();
	var $match_synonym = array();
	var $replace_synonym = array();
	var $split_words = array();
	var $common_words = array();

	/**
	* PHP 4 style constructor.
	*
	* @param mixed $error Always set to true: this base class is not a usable search plugin
	*/
	function search_backend(&$error)
	{
		// This class cannot be used as a search plugin
		$error = true;
	}

	/**
	* Stores a list of common words that should be ignored in $this->ignore_words and caches them
	*
	* The list is read from search_ignore_words.txt in $user->lang_path and kept in the
	* '_ignore_words' cache entry, indexed by $user->lang_name. Nothing is done if
	* $this->ignore_words is already filled.
	*/
	function get_ignore_words()
	{
		if (sizeof($this->ignore_words))
		{
			return;
		}

		global $user, $cache;

		$ignore_words = $cache->get('_ignore_words');

		if (!$ignore_words)
		{
			$ignore_words = array();
		}

		if (!isset($ignore_words[$user->lang_name]))
		{
			// normalise line endings: every "\r" becomes "\n", then double newlines are collapsed
			$contents = file_get_contents($user->lang_path . '/search_ignore_words.txt');
			$contents = str_replace("\n\n", "\n", str_replace("\r", "\n", $contents));

			$ignore_words[$user->lang_name] = explode("\n", $contents);
			$cache->put('_ignore_words', $ignore_words, 7200);
		}

		$this->ignore_words = $ignore_words[$user->lang_name];

		unset($ignore_words);
	}

	/**
	* Stores a list of synonyms that should be replaced in $this->match_synonym and $this->replace_synonym and caches them
	*
	* Every line of search_synonyms.txt in $user->lang_path is expected to hold two
	* whitespace separated words; the first one ends up in $this->replace_synonym,
	* the second one in $this->match_synonym. The lists are kept in the
	* '_match_synonym' cache entry, indexed by $user->lang_name.
	*/
	function get_synonyms()
	{
		if (sizeof($this->match_synonym))
		{
			return;
		}

		global $user, $cache;

		$match_synonym = $cache->get('_match_synonym');

		if (!$match_synonym)
		{
			$match_synonym = array();
		}

		if (!isset($match_synonym[$user->lang_name]))
		{
			// Leading and trailing blanks are optional, and only horizontal whitespace
			// is accepted between the words so a pair can never span two lines.
			preg_match_all('#^[ \t]*(\S+)[ \t]+(\S+)[ \t]*\r?$#m', file_get_contents($user->lang_path . '/search_synonyms.txt'), $match);

			$match_synonym[$user->lang_name] = array(
				'replace'	=> $match[1],
				'match'		=> $match[2]
			);

			$cache->put('_match_synonym', $match_synonym, 7200);
		}

		$this->replace_synonym = $match_synonym[$user->lang_name]['replace'];
		$this->match_synonym = $match_synonym[$user->lang_name]['match'];

		unset($match_synonym);
	}

	/**
	* Retrieves cached search results
	*
	* @param string search_key identifies the cached result set ('_search_results_' . $search_key)
	* @param int result_count will contain the number of all results for the search (not only for the current page)
	* @param array id_ary is filled with the ids belonging to the requested page that are stored in the cache
	* @param int start index of the first requested result
	* @param int per_page number of requested results
	* @param mixed sort_dir requested sort direction, compared against the direction stored with the cached set
	*
	* @return SEARCH_RESULT_NOT_IN_CACHE or SEARCH_RESULT_IN_CACHE or SEARCH_RESULT_INCOMPLETE
	*/
	function obtain_ids($search_key, &$result_count, &$id_ary, $start, $per_page, $sort_dir)
	{
		global $cache;

		$stored_ids = $cache->get('_search_results_' . $search_key);

		if (!$stored_ids)
		{
			// no search results cached for this search_key
			return SEARCH_RESULT_NOT_IN_CACHE;
		}

		// index -1 holds the total number of results, index -2 the sort direction of the cached set
		$result_count = $stored_ids[-1];
		$reverse_ids = ($stored_ids[-2] != $sort_dir);
		$complete = true;

		// change the start to the actual end of the current request if the sort direction differs
		// from the direction in the cache and reverse the ids later
		if ($reverse_ids)
		{
			$start = $result_count - $start - $per_page;

			// the user requested a page past the last index
			if ($start < 0)
			{
				return SEARCH_RESULT_NOT_IN_CACHE;
			}
		}

		$end = min($start + $per_page, $result_count);
		for ($i = $start; $i < $end; $i++)
		{
			if (isset($stored_ids[$i]))
			{
				$id_ary[] = $stored_ids[$i];
			}
			else
			{
				$complete = false;
			}
		}
		unset($stored_ids);

		if ($reverse_ids)
		{
			$id_ary = array_reverse($id_ary);
		}

		return ($complete) ? SEARCH_RESULT_IN_CACHE : SEARCH_RESULT_INCOMPLETE;
	}

	/**
	* Caches post/topic ids
	*
	* At most $config['search_block_size'] ids are stored per call. When the result set
	* is created and keywords or authors were given, the search is also recorded in
	* SEARCH_TABLE unless a row for this search_key already exists.
	*
	* @param string search_key identifies the cached result set
	* @param string keywords keywords of the search, stored in SEARCH_TABLE
	* @param array author_ary authors of the search, stored space separated in SEARCH_TABLE
	* @param int result_count total number of results of the search
	* @param array id_ary contains a list of post or topic ids that shall be cached, the first element
	* 	must have the absolute index $start in the result set.
	* @param int start absolute index of the first element of id_ary
	* @param mixed sort_dir sort direction the ids in id_ary are ordered by
	*/
	function save_ids($search_key, $keywords, $author_ary, $result_count, &$id_ary, $start, $sort_dir)
	{
		global $cache, $config, $db;

		$length = min(sizeof($id_ary), $config['search_block_size']);

		// nothing to cache so exit; range() below would otherwise produce two
		// descending keys for zero ids and array_combine() would fail
		if ($length < 1)
		{
			return;
		}

		$store_ids = array_slice($id_ary, 0, $length);

		// create a new resultset if there is none for this search_key yet
		// or add the ids to the existing resultset
		if (!($store = $cache->get('_search_results_' . $search_key)))
		{
			// add the current keywords to the recent searches in the cache which are listed on the search page
			if (!empty($keywords) || sizeof($author_ary))
			{
				$sql = 'SELECT search_time
					FROM ' . SEARCH_TABLE . '
					WHERE search_key = \'' . $db->sql_escape($search_key) . '\'';
				$result = $db->sql_query($sql);

				if (!$db->sql_fetchrow($result))
				{
					$sql_ary = array(
						'search_key'		=> $search_key,
						'search_time'		=> time(),
						'search_keywords'	=> $keywords,
						'search_authors'	=> implode(' ', $author_ary)
					);

					$sql = 'INSERT INTO ' . SEARCH_TABLE . ' ' . $db->sql_build_array('INSERT', $sql_ary);
					$db->sql_query($sql);
				}
				$db->sql_freeresult($result);
			}

			set_config('last_search_time', time());

			$store = array(-1 => $result_count, -2 => $sort_dir);
			$id_range = range($start, $start + $length - 1);
		}
		else
		{
			// we use one set of results for both sort directions so we have to calculate the indexes
			// for the reversed array and we also have to reverse the ids themselves
			if ($store[-2] != $sort_dir)
			{
				$store_ids = array_reverse($store_ids);
				$id_range = range($store[-1] - $start - $length, $store[-1] - $start - 1);
			}
			else
			{
				$id_range = range($start, $start + $length - 1);
			}
		}

		// append the ids; the + operator keeps entries that are already stored
		$store += array_combine($id_range, $store_ids);
		$cache->put('_search_results_' . $search_key, $store, $config['search_store_results']);

		unset($store);
		unset($store_ids);
		unset($id_range);
	}

	/**
	* Removes old entries from the search results table and removes searches with keywords that contain a word in $words.
	*
	* Cached result sets of searches whose keywords contain a '*' are destroyed as well
	* whenever $words is not empty.
	*
	* @param array words words whose cached searches have to be destroyed
	*/
	function destroy_cache($words)
	{
		global $db, $cache, $config;

		if (sizeof($words))
		{
			$sql_where = '';
			foreach ($words as $word)
			{
				$sql_where .= ' OR search_keywords LIKE \'%' . $db->sql_escape($word) . '%\'';
			}

			$sql = 'SELECT search_key
				FROM ' . SEARCH_TABLE . "
				WHERE search_keywords LIKE '%*%' $sql_where";
			$result = $db->sql_query($sql);

			while ($row = $db->sql_fetchrow($result))
			{
				$cache->destroy('_search_results_' . $row['search_key']);
			}
			// free the result of the SELECT above
			$db->sql_freeresult($result);
		}

		$sql = 'DELETE
			FROM ' . SEARCH_TABLE . '
			WHERE search_time < ' . (time() - $config['search_store_results']);
		$db->sql_query($sql);
	}
}
?>