mirror of
https://github.com/phpbb/phpbb.git
synced 2025-08-07 01:06:48 +02:00
- overhauled search system
- updated structure for search backend plugins
- better result caching using ACM
- search results no longer session restricted => link to them by copying the URL :)
- in-topic search
- indexing posts now uses search backend plugins
- develop/search_fill.php working again
- fulltext_mysql not working yet
- tiny bugfixes to ACM

git-svn-id: file:///svn/phpbb/trunk@5441 89ea8834-ac86-4346-8a33-228a782c2dd0
This commit is contained in:
@@ -1212,325 +1212,4 @@ class parse_message extends bbcode_firstpass
|
||||
}
|
||||
}
|
||||
|
||||
/**
* @package phpBB3
* Parses a given message and updates/maintains the fulltext tables
*
* The word table (SEARCH_WORD_TABLE) holds one row per distinct word
* (word_id, word_text, word_common); the match table (SEARCH_MATCH_TABLE)
* links words to posts (post_id, word_id, title_match).
*
* Every public method is a no-op while $config['load_search_upd'] is off.
*
* @todo replace fulltext_search in message_parser with search modules
*/
class fulltext_search
{
	/**
	* Split a text into the list of words that should be indexed.
	*
	* The text is lower-cased, stripped of markup (HTML comment wrappers,
	* entities, [code] blocks, BBCode tags) and punctuation, split on
	* whitespace, filtered against the language's stopword list, run through
	* the synonym replacement and finally restricted to words whose length lies
	* within $config['min_search_chars'] .. $config['max_search_chars'].
	*
	* @param string $mode	Unused by this implementation
	* @param string $text	Raw text to split
	* @return array|null	Words to index (keys are not re-numbered, duplicates
	*						are kept); null when the indexer is disabled
	*/
	function split_words($mode, $text)
	{
		global $user, $config;

		static $drop_char_match, $drop_char_replace, $stopwords, $replace_synonym, $match_synonym;

		// Is the fulltext indexer disabled? If yes then we need not carry on
		if (!$config['load_search_upd'])
		{
			return;
		}

		// The lookup tables are built once per request and cached in the statics
		if (!is_array($drop_char_match))
		{
			$drop_char_match = array('-', '^', '$', ';', '#', '&', '(', ')', '<', '>', '`', '\'', '"', '|', ',', '@', '_', '?', '%', '~', '.', '[', ']', '{', '}', ':', '\\', '/', '=', '\'', '!', '*');
			$drop_char_replace = array(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '', '', ' ', ' ', ' ', ' ', '', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '' , ' ', ' ', ' ', ' ', ' ');

			// Always end up with arrays so the sizeof() checks below are safe
			// even when a language ships no stopword or synonym file
			$stopwords = array();
			$replace_synonym = array();
			$match_synonym = array();

			$contents = $this->_read_lang_file($user->lang_path . '/search_stopwords.txt');
			if ($contents !== '')
			{
				$stopwords = explode("\n", str_replace("\r\n", "\n", $contents));
			}

			$contents = $this->_read_lang_file($user->lang_path . '/search_synonyms.txt');
			if ($contents !== '')
			{
				// Each line is "<word> <replacement>"
				preg_match_all('#^(.*?) (.*?)$#ms', $contents, $synonyms);

				// Copy by value: a reference assigned to a static variable is
				// not remembered on the next call, which would silently drop
				// the synonyms after the first invocation
				$replace_synonym = $synonyms[1];
				$match_synonym = $synonyms[2];
			}
		}

		$match = array();
		// Comments for hardcoded bbcode elements (urls, smilies, html)
		$match[] = '#<!\-\- .* \-\->(.*?)<!\-\- .* \-\->#is';
		// New lines, carriage returns
		$match[] = "#[\n\r]+#";
		// NCRs and other entities
		// NOTE(review): both alternatives are identical here; the first one
		// may have been an encoded ampersand originally - confirm against the
		// repository before changing the pattern
		$match[] = '#(&|&)[\#a-z0-9]+?;#i';
		// Do not index code
		$match[] = '#\[code=?.*?(\:?[0-9a-z]{5,})\].*?\[\/code(\:?[0-9a-z]{5,})\]#is';
		// BBcode
		$match[] = '#\[\/?[a-z\*\+\-]+=?.*?(\:?[0-9a-z]{5,})\]#';

		// Pad with spaces so the ' + ' / ' - ' replacement also works at the edges
		$text = preg_replace($match, ' ', ' ' . strtolower(trim($text)) . ' ');
		$text = str_replace(array(' + ', ' - '), array(' and ', ' not '), $text);

		// Filter out non-alphabetical chars
		$text = str_replace($drop_char_match, $drop_char_replace, $text);

		// Split words
		$text = explode(' ', preg_replace('#\s+#', ' ', trim($text)));

		if (sizeof($stopwords))
		{
			$text = array_diff($text, $stopwords);
		}

		if (sizeof($replace_synonym))
		{
			$text = str_replace($replace_synonym, $match_synonym, $text);
		}

		// Too short / too long words are dropped after splitting
		foreach ($text as $index => $word)
		{
			if (strlen($word) < $config['min_search_chars'] || strlen($word) > $config['max_search_chars'])
			{
				unset($text[$index]);
			}
		}

		return $text;
	}

	/**
	* Read an optional language support file.
	*
	* @param string $path	File to read
	* @return string		File contents; '' when the file cannot be opened,
	*						is empty or cannot be read
	*/
	function _read_lang_file($path)
	{
		// Missing files are expected (best effort), hence the suppression
		$fp = @fopen($path, 'rb');
		if (!$fp)
		{
			return '';
		}

		// fread() must not be called with a zero length
		$size = filesize($path);
		$contents = ($size > 0) ? fread($fp, $size) : '';

		// Only a handle that was actually opened is closed
		fclose($fp);

		return ($contents === false) ? '' : $contents;
	}

	/**
	* Build a comma separated list of single-quoted words for an SQL IN clause.
	*
	* The words are expected to come from split_words(), whose character
	* filter removes quotes and backslashes.
	*
	* @param array $word_ary	Words to quote
	* @return string			e.g. 'foo', 'bar'
	*/
	function _sql_word_list($word_ary)
	{
		$quoted = array();
		foreach ($word_ary as $word)
		{
			$quoted[] = "'" . $word . "'";
		}

		return implode(', ', $quoted);
	}

	/**
	* Index a new post or update the index of an edited post.
	*
	* For $mode 'edit' the words currently stored for the post are compared
	* with the new text so that only the differences are written; for every
	* other mode all words are added. Unknown words are inserted into the word
	* table first, afterwards the post <-> word matches are removed/added.
	* Finally search_tidy() is run if the last run is older than
	* $config['search_gc'] seconds.
	*
	* @param string $mode		'edit' to diff against the stored words
	* @param int $post_id		Post being indexed
	* @param string &$message	Post text
	* @param string &$subject	Post subject, may be empty
	* @return void
	*/
	function add($mode, $post_id, &$message, &$subject)
	{
		global $config, $db;

		// Is the fulltext indexer disabled? If yes then we need not carry on
		if (!$config['load_search_upd'])
		{
			return;
		}

		// The id is embedded into several statements, force it to be numeric once
		$post_id = (int) $post_id;

		// Split post/subject to obtain arrays of 'words'
		$split_text = $this->split_words('post', $message);
		$split_title = ($subject) ? $this->split_words('post', $subject) : array();

		// word_text => word_id of the words currently stored for this post.
		// Both keys always exist, a post may have no stored title or body words.
		$cur_words = array('post' => array(), 'title' => array());

		$words = array(
			'add'	=> array('post' => $split_text, 'title' => $split_title),
			'del'	=> array('post' => array(), 'title' => array()),
		);

		if ($mode == 'edit')
		{
			$sql = 'SELECT w.word_id, w.word_text, m.title_match
				FROM ' . SEARCH_WORD_TABLE . ' w, ' . SEARCH_MATCH_TABLE . " m
				WHERE m.post_id = $post_id
					AND w.word_id = m.word_id";
			$result = $db->sql_query($sql);

			while ($row = $db->sql_fetchrow($result))
			{
				$which = ($row['title_match']) ? 'title' : 'post';
				$cur_words[$which][$row['word_text']] = $row['word_id'];
			}
			$db->sql_freeresult($result);

			$words['add']['post'] = array_diff($split_text, array_keys($cur_words['post']));
			$words['add']['title'] = array_diff($split_title, array_keys($cur_words['title']));
			$words['del']['post'] = array_diff(array_keys($cur_words['post']), $split_text);
			$words['del']['title'] = array_diff(array_keys($cur_words['title']), $split_title);
		}
		unset($split_text);
		unset($split_title);

		// Get unique words from the above arrays
		$unique_add_words = array_unique(array_merge($words['add']['post'], $words['add']['title']));

		// Add those words to the word table which are not known yet
		if (sizeof($unique_add_words))
		{
			$sql = 'SELECT word_id, word_text
				FROM ' . SEARCH_WORD_TABLE . '
				WHERE word_text IN (' . $this->_sql_word_list($unique_add_words) . ')';
			$result = $db->sql_query($sql);

			$word_ids = array();
			while ($row = $db->sql_fetchrow($result))
			{
				$word_ids[$row['word_text']] = $row['word_id'];
			}
			$db->sql_freeresult($result);

			$new_words = array_diff($unique_add_words, array_keys($word_ids));
			unset($unique_add_words);

			if (sizeof($new_words))
			{
				switch (SQL_LAYER)
				{
					// One multi-row VALUES list
					case 'mysql':
						$values = array();
						foreach ($new_words as $word)
						{
							$values[] = "('" . $word . "')";
						}

						$sql = 'INSERT INTO ' . SEARCH_WORD_TABLE . ' (word_text)
							VALUES ' . implode(', ', $values);
						$db->sql_query($sql);
					break;

					// One INSERT ... SELECT ... UNION ALL statement
					case 'mysql4':
					case 'mysqli':
					case 'mssql':
					case 'mssql_odbc':
					case 'sqlite':
						$selects = array();
						foreach ($new_words as $word)
						{
							$selects[] = "SELECT '" . $word . "'";
						}

						$sql = 'INSERT INTO ' . SEARCH_WORD_TABLE . ' (word_text) ' . implode(' UNION ALL ', $selects);
						$db->sql_query($sql);
					break;

					// One statement per word for every other layer
					default:
						foreach ($new_words as $word)
						{
							$sql = 'INSERT INTO ' . SEARCH_WORD_TABLE . " (word_text)
								VALUES ('$word')";
							$db->sql_query($sql);
						}
					break;
				}
			}
			unset($new_words);
		}

		// Remove the matches of words which are no longer part of the post
		foreach ($words['del'] as $word_in => $word_ary)
		{
			$title_match = ($word_in == 'title') ? 1 : 0;

			if (sizeof($word_ary))
			{
				$sql_in = array();
				foreach ($word_ary as $word)
				{
					$sql_in[] = $cur_words[$word_in][$word];
				}

				$sql = 'DELETE FROM ' . SEARCH_MATCH_TABLE . '
					WHERE word_id IN (' . implode(', ', $sql_in) . ")
						AND post_id = $post_id
						AND title_match = $title_match";
				$db->sql_query($sql);
				unset($sql_in);
			}
		}

		// Add matches for the new words, ids are looked up by the database
		foreach ($words['add'] as $word_in => $word_ary)
		{
			$title_match = ($word_in == 'title') ? 1 : 0;

			if (sizeof($word_ary))
			{
				$sql = 'INSERT INTO ' . SEARCH_MATCH_TABLE . " (post_id, word_id, title_match)
					SELECT $post_id, word_id, $title_match
					FROM " . SEARCH_WORD_TABLE . '
					WHERE word_text IN (' . $this->_sql_word_list($word_ary) . ')';
				$db->sql_query($sql);
			}
		}

		unset($words);

		// Run the cleanup infrequently
		if ($config['search_last_gc'] < time() - $config['search_gc'])
		{
			$this->search_tidy();
		}
	}

	/**
	* Tidy up indexes: tag 'common words' and remove words which are no
	* longer referenced in the match table.
	*
	* Once the forums hold at least 100 posts, every word matched more often
	* than 60% of the post count is flagged as common and its matches are
	* deleted. Afterwards all non-common words without any match are deleted.
	* The time of the run is stored in the 'search_last_gc' config value.
	*
	* @return void
	*/
	function search_tidy()
	{
		global $db, $config;

		// Is the fulltext indexer disabled? If yes then we need not carry on
		if (!$config['load_search_upd'])
		{
			return;
		}

		// Remove common (> 60% of posts) words
		$sql = 'SELECT SUM(forum_posts) AS total_posts
			FROM ' . FORUMS_TABLE;
		$result = $db->sql_query($sql);
		$row = $db->sql_fetchrow($result);
		$db->sql_freeresult($result);

		// No row fetched counts as zero posts
		$total_posts = ($row) ? (int) $row['total_posts'] : 0;

		if ($total_posts >= 100)
		{
			$sql = 'SELECT word_id
				FROM ' . SEARCH_MATCH_TABLE . '
				GROUP BY word_id
				HAVING COUNT(word_id) > ' . (int) floor($total_posts * 0.6);
			$result = $db->sql_query($sql);

			$common_ids = array();
			while ($row = $db->sql_fetchrow($result))
			{
				$common_ids[] = $row['word_id'];
			}
			$db->sql_freeresult($result);

			if (sizeof($common_ids))
			{
				$sql_in = implode(', ', $common_ids);

				$sql = 'UPDATE ' . SEARCH_WORD_TABLE . "
					SET word_common = 1
					WHERE word_id IN ($sql_in)";
				$db->sql_query($sql);

				$sql = 'DELETE FROM ' . SEARCH_MATCH_TABLE . "
					WHERE word_id IN ($sql_in)";
				$db->sql_query($sql);
			}
			unset($common_ids);
		}

		// Remove words with no matches ... this is a potentially nasty query.
		// No GROUP BY here: the anti-join yields each unreferenced word exactly
		// once, whereas grouping by the (always NULL) m.word_id would fold all
		// of them into a single group and leave all but one word behind.
		$sql = 'SELECT w.word_id
			FROM ' . SEARCH_WORD_TABLE . ' w
			LEFT JOIN ' . SEARCH_MATCH_TABLE . ' m ON (w.word_id = m.word_id)
			WHERE w.word_common = 0 AND m.word_id IS NULL';
		$result = $db->sql_query($sql);

		$orphan_ids = array();
		while ($row = $db->sql_fetchrow($result))
		{
			$orphan_ids[] = $row['word_id'];
		}
		$db->sql_freeresult($result);

		if (sizeof($orphan_ids))
		{
			$sql = 'DELETE FROM ' . SEARCH_WORD_TABLE . '
				WHERE word_id IN (' . implode(', ', $orphan_ids) . ')';
			$db->sql_query($sql);
		}
		unset($orphan_ids);

		set_config('search_last_gc', time());
	}
}
|
||||
|
||||
?>
|
Reference in New Issue
Block a user