1
0
mirror of https://github.com/phpbb/phpbb.git synced 2025-07-30 21:40:43 +02:00

- search updates

git-svn-id: file:///svn/phpbb/trunk@5003 89ea8834-ac86-4346-8a33-228a782c2dd0
This commit is contained in:
Meik Sievertsen
2004-10-19 19:26:58 +00:00
parent 028c05a9a6
commit c091164d7e
10 changed files with 168 additions and 355 deletions

View File

@@ -111,7 +111,7 @@ class bbcode_firstpass extends bbcode
'url' => array('bbcode_id' => 3, 'regexp' => array('#\[url=?(.*?)?\](.*?)\[/url\]#ise' => "\$this->validate_url('\$1', '\$2')")),
'img' => array('bbcode_id' => 4, 'regexp' => array('#\[img\](https?://)([a-z0-9\-\.,\?!%\*_:;~\\&$@/=\+]+)\[/img\]#i' => '[img:' . $this->bbcode_uid . ']$1$2[/img:' . $this->bbcode_uid . ']')),
'size' => array('bbcode_id' => 5, 'regexp' => array('#\[size=([\-\+]?[1-2]?[0-9])\](.*?)\[/size\]#is' => '[size=$1:' . $this->bbcode_uid . ']$2[/size:' . $this->bbcode_uid . ']')),
'color' => array('bbcode_id' => 6, 'regexp' => array('!\[color=(#[0-9A-F]{6}|[a-z\-]+)\](.*?)\[/color\]!is' => '[color=$1:' . $this->bbcode_uid . ']$2[/color:' . $this->bbcode_uid . ']')),
'color' => array('bbcode_id' => 6, 'regexp' => array('#\[color=(#[0-9A-F]{6}|[a-z\-]+)\](.*?)\[/color\]#is' => '[color=$1:' . $this->bbcode_uid . ']$2[/color:' . $this->bbcode_uid . ']')),
'u' => array('bbcode_id' => 7, 'regexp' => array('#\[u\](.*?)\[/u\]#is' => '[u:' . $this->bbcode_uid . ']$1[/u:' . $this->bbcode_uid . ']')),
'list' => array('bbcode_id' => 9, 'regexp' => array('#\[list(=[a-z|0-9|(?:disc|circle|square))]+)?\].*\[/list\]#ise' => "\$this->bbcode_list('\$0')")),
'email' => array('bbcode_id' => 10, 'regexp' => array('#\[email=?(.*?)?\](.*?)\[/email\]#ise' => "\$this->validate_email('\$1', '\$2')")),
@@ -1084,7 +1084,7 @@ class parse_message extends bbcode_firstpass
// Parses a given message and updates/maintains the fulltext tables
class fulltext_search
{
function split_words($mode, $text, &$stopped_words)
function split_words($mode, $text)
{
global $user, $config;
@@ -1099,8 +1099,8 @@ class fulltext_search
if (!$drop_char_match)
{
$drop_char_match = array('^', '$', '&', '(', ')', '<', '>', '`', '\'', '"', '|', ',', '@', '_', '?', '%', '~', '.', '[', ']', '{', '}', ':', '\\', '/', '=', '#', '\'', ';', '!', '*');
$drop_char_replace = array(' ', ' ', ' ', ' ', ' ', ' ', ' ', '', '', ' ', ' ', ' ', ' ', '', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '' , ' ', ' ', ' ', ' ', ' ', ' ', ' ');
$drop_char_match = array('-', '^', '$', ';', '#', '&', '(', ')', '<', '>', '`', '\'', '"', '|', ',', '@', '_', '?', '%', '~', '.', '[', ']', '{', '}', ':', '\\', '/', '=', '\'', '!', '*');
$drop_char_replace = array(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '', '', ' ', ' ', ' ', ' ', '', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '' , ' ', ' ', ' ', ' ', ' ');
if ($fp = @fopen($user->lang_path . '/search_stopwords.txt', 'rb'))
{
@@ -1118,42 +1118,50 @@ class fulltext_search
}
$match = array();
// Comments for hardcoded bbcode elements (urls, smilies, html)
$match[] = '#<!\-\- .* \-\->(.*?)<!\-\- .* \-\->#is';
// New lines, carriage returns
$match[] = "#[\n\r]+#";
// NCRs like &nbsp; etc.
$match[] = '#&amp;[\#a-z0-9]+?;#i';
// URL's
$match[] = '#\b[\w]+:\/\/[a-z0-9\.\-]+(\/[a-z0-9\?\.%_\-\+=&\/]+)?#';
$match[] = '#(&amp;|&)[\#a-z0-9]+?;#i';
// Do not index code
$match[] = '#\[code(=.*)?(\:?[0-9a-z]{5,})\].*?\[\/code(\:?[0-9a-z]{5,})\]#is';
// BBcode
$match[] = '#\[img:[a-z0-9]{5,}\].*?\[\/img:[a-z0-9]{5,}\]#';
$match[] = '#\[\/?[a-z\*=\+\-]+(\:?[0-9a-z]+)?:[a-z0-9]{5,}(\:[a-z0-9]+)?=?.*?\]#';
$match[] = '#<!\-\-(.*?)\-\->#is';
// Sequences < min_search_chars & < max_search_chars
$match[] = '#\b([\S]{1,' . $config['min_search_chars'] . '}|[\S]{' . $config['max_search_chars'] . ',})\b#is';
$match[] = '#\[\/?[a-z\*\+\-]+(=.*)?(\:?[0-9a-z]{5,})\]#';
// Sequences > min_search_chars & < max_search_chars
// $match[] = '#\s([\b]{1,' . $config['min_search_chars'] . '}|[\b]{' . $config['max_search_chars'] . ',})\s#is';
// $match[] = '#\s((&\#[0-9]+;){1,' . $config['min_search_chars'] . '}|(&\#[0-9]+;){' . $config['max_search_chars'] . ',})\s#is';
// Filter out ; and # but not &#[0-9]+;
// $match[] = '#(&\#[0-9]+;)|;|\#|&#';
$replace = array('', '', '', '', '', '', '');
$text = preg_replace($match, $replace, ' ' . strtolower($text) . ' ');
$text = str_replace(' and ', ' + ', $text);
$text = str_replace(' not ', ' - ', $text);
$text = preg_replace($match, ' ', ' ' . strtolower(trim($text)) . ' ');
$text = str_replace(array(' + ', ' - '), array(' and ', ' not '), $text);
// Filter out non-alphabetical chars
$text = str_replace($drop_char_match, $drop_char_replace, $text);
// Split words
$text = explode(' ', preg_replace('#\s+#', ' ', $text));
$text = explode(' ', preg_replace('#\s+#', ' ', trim($text)));
if ($stopwords)
if (sizeof($stopwords))
{
$stopped_words = array_intersect($text, $stopwords);
$text = array_diff($text, $stopwords);
}
if ($replace_synonym)
if (sizeof($replace_synonym))
{
$text = str_replace($replace_synonym, $match_synonym, $text);
}
foreach ($text as $index => $word)
{
if (strlen($word) < $config['min_search_chars'] || strlen($word) > $config['max_search_chars'])
{
unset($text[$index]);
}
}
return $text;
}
@@ -1169,10 +1177,8 @@ class fulltext_search
}
// Split old and new post/subject to obtain array of 'words'
$stopped_words = array();
$split_text = $this->split_words('post', $message, $stopped_words);
$split_title = ($subject) ? $this->split_words('post', $subject, $stopped_words) : array();
unset($stopped_words);
$split_text = $this->split_words('post', $message);
$split_title = ($subject) ? $this->split_words('post', $subject) : array();
$words = array();
if ($mode == 'edit')
@@ -1230,7 +1236,6 @@ class fulltext_search
$new_words = array_diff($unique_add_words, array_keys($word_ids));
unset($unique_add_words);
$db->sql_return_on_error(true);
if (sizeof($new_words))
{
switch (SQL_LAYER)
@@ -1238,7 +1243,7 @@ class fulltext_search
case 'mysql':
case 'mysql4':
$sql = 'INSERT INTO ' . SEARCH_WORD_TABLE . ' (word_text)
VALUES ' . implode(', ', preg_replace('#^(.*)$#', '(\'$1\')', $new_words));
VALUES ' . implode(', ', preg_replace('#^(.*)$#', '(\'$1\')', $new_words));
$db->sql_query($sql);
break;
@@ -1258,7 +1263,6 @@ class fulltext_search
break;
}
}
$db->sql_return_on_error(false);
unset($new_words);
}
@@ -1343,7 +1347,7 @@ class fulltext_search
$sql_in[] = $row['word_id'];
}
while ($row = $db->sql_fetchrow($result));
$sql_in = implode(', ', $sql_in);
$sql = 'UPDATE ' . SEARCH_WORD_TABLE . "
@@ -1363,7 +1367,7 @@ class fulltext_search
$sql = 'SELECT w.word_id
FROM ' . SEARCH_WORD_TABLE . ' w
LEFT JOIN ' . SEARCH_MATCH_TABLE . ' m ON w.word_id = m.word_id
AND m.word_id IS NULL
WHERE w.word_common = 0 AND m.word_id IS NULL
GROUP BY m.word_id';
$result = $db->sql_query($sql);