* @license GNU General Public License, version 2 (GPL-2.0) * * For full copyright and license information, please see * the docs/CREDITS.txt file. * */ namespace phpbb\search\backend; use phpbb\auth\auth; use phpbb\config\config; use phpbb\db\driver\driver_interface; use phpbb\db\tools\tools_interface; use phpbb\event\dispatcher_interface; use phpbb\language\language; use phpbb\log\log; use phpbb\user; /** * Fulltext search based on the sphinx search daemon */ class fulltext_sphinx implements search_backend_interface { protected const SPHINX_MAX_MATCHES = 20000; protected const SPHINX_CONNECT_RETRIES = 3; protected const SPHINX_CONNECT_WAIT_TIME = 300; /** * Associative array holding index stats * @var array */ protected $stats = array(); /** * Holds the words entered by user, obtained by splitting the entered query on whitespace * @var array */ protected $split_words = array(); /** * Holds unique sphinx id * @var string */ protected $id; /** * Stores the names of both main and delta sphinx indexes * separated by a semicolon * @var string */ protected $indexes; /** * Sphinx search client object * @var \SphinxClient */ protected $sphinx; /** * Relative path to board root * @var string */ protected $phpbb_root_path; /** * PHP Extension * @var string */ protected $php_ext; /** * Auth object * @var auth */ protected $auth; /** * Config object * @var config */ protected $config; /** * Database connection * @var driver_interface */ protected $db; /** * Database Tools object * @var tools_interface */ protected $db_tools; /** * Stores the database type if supported by sphinx * @var string */ protected $dbtype; /** * phpBB event dispatcher object * @var dispatcher_interface */ protected $phpbb_dispatcher; /** * @var language */ protected $language; /** * @var log */ protected $log; /** * User object * @var user */ protected $user; /** * Stores the generated content of the sphinx config file * @var string */ protected $config_file_data = ''; /** * Contains tidied search 
query.
	 * Operators are prefixed in search query and common words excluded
	 * @var string
	 */
	protected $search_query = '';

	/**
	 * Constructor
	 * Creates a new \phpbb\search\backend\fulltext_sphinx, which is used as a search backend
	 *
	 * Stores the injected services, makes sure a unique sphinx id is present in
	 * the config, derives the delta/main index names from that id and points the
	 * SphinxClient at the configured host and port (localhost:9312 when unset).
	 *
	 * @param auth $auth Auth object
	 * @param config $config Config object
	 * @param driver_interface $db Database object
	 * @param tools_interface $db_tools
	 * @param dispatcher_interface $phpbb_dispatcher Event dispatcher object
	 * @param language $language
	 * @param log $log
	 * @param user $user User object
	 * @param string $phpbb_root_path Relative path to phpBB root
	 * @param string $phpEx PHP file extension
	 */
	public function __construct(auth $auth, config $config, driver_interface $db, tools_interface $db_tools, dispatcher_interface $phpbb_dispatcher, language $language, log $log, user $user, string $phpbb_root_path, string $phpEx)
	{
		$this->auth = $auth;
		$this->config = $config;
		$this->db = $db;
		$this->phpbb_dispatcher = $phpbb_dispatcher;
		$this->language = $language;
		$this->log = $log;
		$this->user = $user;
		$this->phpbb_root_path = $phpbb_root_path;
		$this->php_ext = $phpEx;
		$this->db_tools = $db_tools;

		if (!$this->config['fulltext_sphinx_id'])
		{
			$this->config->set('fulltext_sphinx_id', unique_id());
		}
		$this->id = $this->config['fulltext_sphinx_id'];
		$this->indexes = 'index_phpbb_' . $this->id . '_delta;index_phpbb_' . $this->id . '_main';

		if (!class_exists('SphinxClient'))
		{
			require($this->phpbb_root_path . 'includes/sphinxapi.' . $this->php_ext);
		}

		// Initialize sphinx client
		$this->sphinx = new \SphinxClient();

		$this->sphinx->SetServer(($this->config['fulltext_sphinx_host'] ? $this->config['fulltext_sphinx_host'] : 'localhost'), ($this->config['fulltext_sphinx_port'] ? (int) $this->config['fulltext_sphinx_port'] : 9312));
	}

	/**
	 * {@inheritdoc}
	 */
	public function get_name(): string
	{
		return 'Sphinx Fulltext';
	}

	/**
	 * {@inheritdoc}
	 *
	 * Available when the SQL layer is mysqli or postgres and the SphinxClient class is loaded.
	 */
	public function is_available(): bool
	{
		return in_array($this->db->get_sql_layer(), ['mysqli', 'postgres'], true) && class_exists('SphinxClient');
	}

	/**
	 * {@inheritdoc}
	 *
	 * Returns a localised error message when the backend is not available,
	 * false otherwise (after setting the search_gc interval).
	 */
	public function init()
	{
		if (!$this->is_available())
		{
			return $this->language->lang('FULLTEXT_SPHINX_WRONG_DATABASE');
		}

		// Move delta to main index each hour
		$this->config->set('search_gc', 3600);

		return false;
	}

	/**
	 * {@inheritdoc}
	 */
	public function get_search_query(): string
	{
		return $this->search_query;
	}

	/**
	 * {@inheritdoc}
	 */
	public function get_common_words(): array
	{
		return array();
	}

	/**
	 * {@inheritdoc}
	 */
	public function get_word_length()
	{
		return false;
	}

	/**
	 * {@inheritdoc}
	 *
	 * The tidied query is stored in $this->search_query with double quotes
	 * re-encoded as &quot;, keyword_search() decodes them again before querying.
	 */
	public function split_keywords(string &$keywords, string $terms): bool
	{
		// Keep quotes and new lines: decode the quote entity, flatten line breaks to spaces
		$keywords = str_replace(['&quot;', "\n"], ['"', ' '], trim($keywords));

		if ($terms == 'all')
		{
			// Replaces verbal operators OR and NOT with special characters | and -, unless appearing within quotation marks
			$match = ['#\sor\s(?=([^"]*"[^"]*")*[^"]*$)#i', '#\snot\s(?=([^"]*"[^"]*")*[^"]*$)#i'];
			$replace = [' | ', ' -'];

			$keywords = preg_replace($match, $replace, $keywords);
			$this->sphinx->SetMatchMode(SPH_MATCH_EXTENDED);
		}
		else
		{
			// Entity forms are listed before the raw characters so that an encoded
			// "&amp;" is removed as a whole instead of leaving "amp;" behind
			$match = ['\\', '(', ')', '|', '!', '@', '~', '/', '^', '$', '=', '&amp;', '&lt;', '&gt;', '&', '<', '>'];

			$keywords = str_replace($match, ' ', $keywords);
			$this->sphinx->SetMatchMode(SPH_MATCH_ANY);
		}

		if (strlen($keywords) > 0)
		{
			$this->search_query = str_replace('"', '&quot;', $keywords);
			return true;
		}

		return false;
	}

	/**
	 * {@inheritdoc}
	 *
	 * Configures sorting/grouping and filters on the sphinx client, runs the
	 * stored search query and fills $id_ary with post ids (type 'posts') or
	 * topic ids (any other type). When $start lies beyond the number of hits
	 * it is moved back to the first entry of the last page and the query is
	 * repeated.
	 *
	 * @return bool|int False when there is nothing to search for or sphinx
	 *                  returned no matches, otherwise the total_found value
	 *                  reported by sphinx
	 */
	public function keyword_search(string $type, string $fields, string $terms, array $sort_by_sql, string $sort_key, string $sort_dir, string $sort_days, array $ex_fid_ary, string $post_visibility, int $topic_id, array $author_ary, string $author_name, array &$id_ary, int &$start, int $per_page)
	{
		// No keywords? No posts.
		if (!strlen($this->search_query) && !count($author_ary))
		{
			return false;
		}

		$id_ary = array();

		// Sorting
		$sort_attributes = [
			'a'	=> 'poster_id',
			'f'	=> 'forum_id',
			'i'	=> 'post_subject',
			's'	=> 'post_subject',
		];
		$sort_ascending = ($sort_dir == 'a');

		if ($type == 'topics')
		{
			// 't' and unknown keys sort by the time of the topic's last post
			$sort_attribute = $sort_attributes[$sort_key] ?? 'topic_last_post_time';
			$this->sphinx->SetGroupBy('topic_id', SPH_GROUPBY_ATTR, $sort_attribute . ' ' . ($sort_ascending ? 'ASC' : 'DESC'));
		}
		else
		{
			// 't' and unknown keys sort by post time
			$sort_attribute = $sort_attributes[$sort_key] ?? 'post_time';
			$this->sphinx->SetSortMode($sort_ascending ? SPH_SORT_ATTR_ASC : SPH_SORT_ATTR_DESC, $sort_attribute);
		}

		// Most narrow filters first
		if ($topic_id)
		{
			$this->sphinx->SetFilter('topic_id', array($topic_id));
		}

		/**
		 * Allow modifying the Sphinx search options
		 *
		 * @event core.search_sphinx_keywords_modify_options
		 * @var	string	type				Searching type ('posts', 'topics')
		 * @var	string	fields				Searching fields ('titleonly', 'msgonly', 'firstpost', 'all')
		 * @var	string	terms				Searching terms ('all', 'any')
		 * @var	int		sort_days			Time, in days, of the oldest possible post to list
		 * @var	string	sort_key			The sort type used from the possible sort types
		 * @var	int		topic_id			Limit the search to this topic_id only
		 * @var	array	ex_fid_ary			Which forums not to search on
		 * @var	string	post_visibility		Post visibility data
		 * @var	array	author_ary			Array of user_id containing the users to filter the results to
		 * @var	string	author_name			The username to search on
		 * @var	object	sphinx				The Sphinx searchd client object
		 * @since 3.1.7-RC1
		 */
		$sphinx = $this->sphinx;
		$vars = array(
			'type',
			'fields',
			'terms',
			'sort_days',
			'sort_key',
			'topic_id',
			'ex_fid_ary',
			'post_visibility',
			'author_ary',
			'author_name',
			'sphinx',
		);
		extract($this->phpbb_dispatcher->trigger_event('core.search_sphinx_keywords_modify_options', compact($vars)));
		$this->sphinx = $sphinx;
		unset($sphinx);

		$search_query_prefix = '';

		switch ($fields)
		{
			case 'titleonly':
				// Only search the title
				if ($terms == 'all')
				{
					$search_query_prefix = '@title ';
				}
				// Weight for the title
				$this->sphinx->SetFieldWeights(array("title" => 5, "data" => 1));
				// 1 is first_post, 0 is not first post
				$this->sphinx->SetFilter('topic_first_post', array(1));
			break;

			case 'msgonly':
				// Only search the body
				if ($terms == 'all')
				{
					$search_query_prefix = '@data ';
				}
				// Weight for the body
				$this->sphinx->SetFieldWeights(array("title" => 1, "data" => 5));
			break;

			case 'firstpost':
				// More relative weight for the title, also search the body
				$this->sphinx->SetFieldWeights(array("title" => 5, "data" => 1));
				// 1 is first_post, 0 is not first post
				$this->sphinx->SetFilter('topic_first_post', array(1));
			break;

			default:
				// More relative weight for the title, also search the body
				$this->sphinx->SetFieldWeights(array("title" => 5, "data" => 1));
			break;
		}

		if (count($author_ary))
		{
			$this->sphinx->SetFilter('poster_id', $author_ary);
		}

		// As this is not simply possible at the moment, we limit the result to approved posts.
		// This will make it impossible for moderators to search unapproved and softdeleted posts,
		// but at least it will also cause the same for normal users.
		$this->sphinx->SetFilter('post_visibility', array(ITEM_APPROVED));

		if (count($ex_fid_ary))
		{
			// All forums that a user is allowed to access
			$fid_ary = array_unique(array_intersect(array_keys($this->auth->acl_getf('f_read', true)), array_keys($this->auth->acl_getf('f_search', true))));
			// All forums that the user wants to and can search in
			$search_forums = array_diff($fid_ary, $ex_fid_ary);

			if (count($search_forums))
			{
				$this->sphinx->SetFilter('forum_id', $search_forums);
			}
		}

		$this->sphinx->SetFilter('deleted', array(0));

		$this->sphinx->SetLimits((int) $start, (int) $per_page, max(self::SPHINX_MAX_MATCHES, (int) $start + $per_page));
		$result = $this->sphinx_query_with_retry($search_query_prefix);

		if ($this->sphinx->GetLastError())
		{
			$this->log->add('critical', $this->user->data['user_id'], $this->user->ip, 'LOG_SPHINX_ERROR', false, array($this->sphinx->GetLastError()));
			if ($this->auth->acl_get('a_'))
			{
				trigger_error($this->language->lang('SPHINX_SEARCH_FAILED', $this->sphinx->GetLastError()));
			}
			else
			{
				trigger_error($this->language->lang('SPHINX_SEARCH_FAILED_LOG'));
			}
		}

		// Query() hands back false on failure; do not index into it
		if (!is_array($result))
		{
			return false;
		}

		$result_count = $result['total_found'];

		if ($result_count && $start >= $result_count)
		{
			// $start is declared int: keep it an int instead of the float floor() returns
			$start = (int) floor(($result_count - 1) / $per_page) * $per_page;

			$this->sphinx->SetLimits((int) $start, (int) $per_page, max(self::SPHINX_MAX_MATCHES, (int) $start + $per_page));
			$result = $this->sphinx_query_with_retry($search_query_prefix);
		}

		$id_ary = array();
		if (isset($result['matches']))
		{
			if ($type == 'posts')
			{
				$id_ary = array_keys($result['matches']);
			}
			else
			{
				foreach ($result['matches'] as $match)
				{
					$id_ary[] = $match['attrs']['topic_id'];
				}
			}
		}
		else
		{
			return false;
		}

		$id_ary = array_slice($id_ary, 0, (int) $per_page);

		return $result_count;
	}

	/**
	 * Sends the stored search query to sphinx and retries on refused connections
	 *
	 * The query is retried up to SPHINX_CONNECT_RETRIES times, pausing
	 * SPHINX_CONNECT_WAIT_TIME (handed to usleep()) between attempts, as long as
	 * the last sphinx error contains "errno=111,".
	 *
	 * @param string $search_query_prefix Field prefix put in front of the cleaned query, may be empty
	 * @return array|false Whatever SphinxClient::Query() returned on the last attempt
	 */
	protected function sphinx_query_with_retry(string $search_query_prefix)
	{
		$query = $search_query_prefix . $this->sphinx_clean_search_string(str_replace('&quot;', '"', $this->search_query));

		$result = $this->sphinx->Query($query, $this->indexes);

		// Could be connection to localhost:9312 failed (errno=111,
		// msg=Connection refused) during rotate, retry if so
		$retries = self::SPHINX_CONNECT_RETRIES;
		while (!$result && (strpos($this->sphinx->GetLastError(), "errno=111,") !== false) && $retries--)
		{
			usleep(self::SPHINX_CONNECT_WAIT_TIME);
			$result = $this->sphinx->Query($query, $this->indexes);
		}

		return $result;
	}

	/**
	 * {@inheritdoc}
	 *
	 * Clears the stored search query, switches sphinx to full scan mode and
	 * delegates to keyword_search().
	 */
	public function author_search(string $type, bool $firstpost_only, array $sort_by_sql, string $sort_key, string $sort_dir, string $sort_days, array $ex_fid_ary, string $post_visibility, int $topic_id, array $author_ary, string $author_name, array &$id_ary, int &$start, int $per_page)
	{
		$this->search_query = '';

		$this->sphinx->SetMatchMode(SPH_MATCH_FULLSCAN);
		$fields = ($firstpost_only) ? 'firstpost' : 'all';
		$terms = 'all';

		return $this->keyword_search($type, $fields, $terms, $sort_by_sql, $sort_key, $sort_dir, $sort_days, $ex_fid_ary, $post_visibility, $topic_id, $author_ary, $author_name, $id_ary, $start, $per_page);
	}

	/**
	 * {@inheritdoc}
	 */
	public function supports_phrase_search(): bool
	{
		return false;
	}

	/**
	 * {@inheritdoc}
	 */
	public function index(string $mode, int $post_id, string &$message, string &$subject, int $poster_id, int $forum_id)
	{
		/**
		 * Event to modify method arguments before the Sphinx search index is updated
		 *
		 * @event core.search_sphinx_index_before
		 * @var	string	mode		Contains the post mode: edit, post, reply, quote
		 * @var	int		post_id		The id of the post which is modified/created
		 * @var	string	message		New or updated post content
		 * @var	string	subject		New or updated post subject
		 * @var	int		poster_id	Post author's user id
		 * @var	int		forum_id	The id of the forum in which the post is located
		 * @since 3.2.3-RC1
		 */
		$vars = array(
			'mode',
			'post_id',
			'message',
			'subject',
			'poster_id',
			'forum_id',
		);
		extract($this->phpbb_dispatcher->trigger_event('core.search_sphinx_index_before', compact($vars)));

		if ($mode == 'edit')
{
			$this->sphinx->UpdateAttributes($this->indexes, array('forum_id', 'poster_id'), array((int) $post_id => array((int) $forum_id, (int) $poster_id)));
		}
		else if ($mode != 'post' && $post_id)
		{
			// Update topic_last_post_time for full topic
			$sql_array = array(
				'SELECT'	=> 'p1.post_id',
				'FROM'		=> array(
					POSTS_TABLE	=> 'p1',
				),
				'LEFT_JOIN'	=> array(array(
					'FROM'	=> array(
						POSTS_TABLE	=> 'p2'
					),
					'ON'	=> 'p1.topic_id = p2.topic_id',
				)),
				'WHERE'		=> 'p2.post_id = ' . ((int) $post_id),
			);

			$sql = $this->db->sql_build_query('SELECT', $sql_array);
			$result = $this->db->sql_query($sql);

			// Every post of the topic receives the same, current timestamp
			$post_updates = array();
			$post_time = time();
			while ($row = $this->db->sql_fetchrow($result))
			{
				$post_updates[(int) $row['post_id']] = array($post_time);
			}
			$this->db->sql_freeresult($result);

			if (count($post_updates))
			{
				$this->sphinx->UpdateAttributes($this->indexes, array('topic_last_post_time'), $post_updates);
			}
		}
	}

	/**
	 * {@inheritdoc}
	 *
	 * Posts are not removed from the index here; their "deleted" attribute is
	 * set to 1, which keyword_search() filters on. $author_ids and $forum_ids
	 * are not used by this backend.
	 */
	public function index_remove(array $post_ids, array $author_ids, array $forum_ids): void
	{
		// Nothing to flag: do not bother searchd with an empty update
		if (!count($post_ids))
		{
			return;
		}

		$values = array();
		foreach ($post_ids as $post_id)
		{
			$values[$post_id] = array(1);
		}

		$this->sphinx->UpdateAttributes($this->indexes, array('deleted'), $values);
	}

	/**
	 * Records the current time as search_last_gc in the config
	 *
	 * Nothing else is cleaned up or destroyed by this backend.
	 */
	public function tidy(): void
	{
		$this->config->set('search_last_gc', time(), false);
	}

	/**
	 * {@inheritdoc}
	 *
	 * Creates the SPHINX_TABLE counter table and seeds it with a single row
	 * (counter_id 1, max_doc_id 0) unless the table already exists.
	 */
	public function create_index(int &$post_counter = 0): ?array
	{
		if (!$this->index_created())
		{
			$table_data = array(
				'COLUMNS'	=> array(
					'counter_id'	=> array('UINT', 0),
					'max_doc_id'	=> array('UINT', 0),
				),
				'PRIMARY_KEY'	=> 'counter_id',
			);
			$this->db_tools->sql_create_table(SPHINX_TABLE, $table_data);

			$data = array(
				'counter_id'	=> '1',
				'max_doc_id'	=> '0',
			);
			$sql = 'INSERT INTO ' . SPHINX_TABLE . ' ' .
$this->db->sql_build_array('INSERT', $data);
			$this->db->sql_query($sql);
		}

		return null;
	}

	/**
	 * {@inheritdoc}
	 *
	 * Drops the SPHINX_TABLE counter table when it exists.
	 */
	public function delete_index(?int &$post_counter = null): ?array
	{
		if ($this->index_created())
		{
			$this->db_tools->sql_table_drop(SPHINX_TABLE);
		}

		return null;
	}

	/**
	 * {@inheritdoc}
	 *
	 * The index counts as created when the SPHINX_TABLE counter table exists;
	 * $allow_new_files is not consulted.
	 */
	public function index_created($allow_new_files = true): bool
	{
		return (bool) $this->db_tools->sql_table_exists(SPHINX_TABLE);
	}

	/**
	 * {@inheritdoc}
	 *
	 * All three figures are reported as 0 while the index has not been created.
	 */
	public function index_stats()
	{
		if (empty($this->stats))
		{
			$this->get_stats();
		}

		// One table lookup is enough for all three entries
		$created = $this->index_created();

		return array(
			$this->language->lang('FULLTEXT_SPHINX_MAIN_POSTS')		=> $created ? $this->stats['main_posts'] : 0,
			$this->language->lang('FULLTEXT_SPHINX_DELTA_POSTS')	=> $created ? $this->stats['total_posts'] - $this->stats['main_posts'] : 0,
			$this->language->lang('FULLTEXT_MYSQL_TOTAL_POSTS')		=> $created ? $this->stats['total_posts'] : 0,
		);
	}

	/**
	 * Computes the stats and store them in the $this->stats associative array
	 *
	 * total_posts is the number of rows in the posts table, main_posts the
	 * number of posts whose id does not exceed the max_doc_id stored in the
	 * counter row. Nothing is computed while the index has not been created.
	 */
	protected function get_stats()
	{
		if ($this->index_created())
		{
			$sql = 'SELECT COUNT(post_id) as total_posts
				FROM ' . POSTS_TABLE;
			$result = $this->db->sql_query($sql);
			$this->stats['total_posts'] = (int) $this->db->sql_fetchfield('total_posts');
			$this->db->sql_freeresult($result);

			$sql = 'SELECT COUNT(p.post_id) as main_posts
				FROM ' . POSTS_TABLE . ' p, ' . SPHINX_TABLE . ' m
				WHERE p.post_id <= m.max_doc_id
					AND m.counter_id = 1';
			$result = $this->db->sql_query($sql);
			$this->stats['main_posts'] = (int) $this->db->sql_fetchfield('main_posts');
			$this->db->sql_freeresult($result);
		}
	}

	/**
	 * Cleans search query passed into Sphinx search engine, as follows:
	 * 1. Hyphenated words are replaced with keyword search for either the exact phrase with spaces
	 *    or as a single word without spaces eg search for "know-it-all" becomes ("know it all"|"knowitall*")
	 * 2. Words with apostrophes are contracted eg "it's" becomes "its"
	 * 3. &lt;, &gt;, &quot; and &amp; are decoded from HTML entities.
	 * 4. Following special characters used as search operators in Sphinx are preserved when used with correct syntax:
	 *    (a) quorum matching: "the world is a wonderful place"/3
	 *        Finds 3 of the words within the phrase. Number must be between 1 and 9.
	 *    (b) proximity search: "hello world"~10
	 *        Finds hello and world within 10 words of each other. Number can be between 1 and 99.
	 *    (c) strict word order: aaa << bbb << ccc
	 *        Finds "aaa" only where it appears before "bbb" and only where "bbb" appears before "ccc".
	 *    (d) exact match operator: if lemmatizer or stemming enabled,
	 *        search will find exact match only and ignore other grammatical forms of the same word stem.
	 *        eg. raining =cats and =dogs
	 *            will not return "raining cat and dog"
	 *        eg. ="search this exact phrase"
	 *            will not return "searched this exact phrase", "searching these exact phrases".
	 * 5. Special characters /, ~, << and = not complying with the correct syntax
	 *    and other reserved operators are escaped and searched literally.
	 *    Special characters not explicitly listed in charset_table or blend_chars in sphinx.conf
	 *    will not be indexed and keywords containing them will be ignored by Sphinx.
	 *    By default, only $, %, & and @ characters are indexed and searchable.
	 *    String transformation is in backend only and not visible to the end user
	 *    nor reflected in the results page URL or keyword highlighting.
	 *
	 * @param string $search_string
	 * @return string
	 */
	protected function sphinx_clean_search_string($search_string)
	{
		// Escape reserved operators, decode the four HTML entities, drop apostrophes
		$from = ['@', '^', '$', '!', '&lt;', '&gt;', '&quot;', '&amp;', '\''];
		$to = ['\@', '\^', '\$', '\!', '<', '>', '"', '&', ''];

		$search_string = str_replace($from, $to, $search_string);

		// Work on the reversed string so the lookahead effectively tests what
		// precedes the operator: a / or ~ that does not directly follow a quoted
		// phrase gets a backslash in front of it once the string is reversed back
		$search_string = strrev($search_string);
		$search_string = preg_replace(['#\/(?!"[^"]+")#', '#~(?!"[^"]+")#'], ['/\\', '~\\'], $search_string);
		$search_string = strrev($search_string);

		$match = [
			// quorum operator: only valid when followed by a single digit 1-9
			'#(/|\\\\/)(?![1-9](\s|$))#',
			// proximity operator: only valid when followed by a 1-2 digit number
			'#(~|\\\\~)(?!\d{1,2}(\s|$))#',
			// hyphenated words made of up to four parts
			'#((?:\p{L}|\p{N})+)-((?:\p{L}|\p{N})+)(?:-((?:\p{L}|\p{N})+))?(?:-((?:\p{L}|\p{N})+))?#i',
			// strict order operator with nothing after it
			'#<<\s*$#',
			// = that does not directly precede a word
			'#(\S\K=|=(?=\s)|=$)#',
		];
		$replace = ['\/', '\~', '("$1 $2 $3 $4"|$1$2$3$4*)', '\<\<', '\='];

		$search_string = preg_replace($match, $replace, $search_string);
		// Remove the padding left inside the phrase by unused hyphen groups
		$search_string = preg_replace('#\s+"\|#', '"|', $search_string);

		/**
		 * OPTIONAL: Thousands separator stripped from numbers, eg search for '90,000' is queried as '90000'.
		 * By default commas are stripped from search index so that '90,000' is indexed as '90000'
		 */
		// $search_string = preg_replace('#[0-9]{1,3}\K,(?=[0-9]{3})#', '', $search_string);

		return $search_string;
	}

	/**
	 * {@inheritdoc}
	 */
	public function get_acp_options(): array
	{
		$config_vars = array(
			'fulltext_sphinx_data_path' => 'string',
			'fulltext_sphinx_host' => 'string',
			'fulltext_sphinx_port' => 'string',
			'fulltext_sphinx_indexer_mem_limit' => 'int',
		);

		$tpl = ' ' . $this->language->lang('FULLTEXT_SPHINX_CONFIGURE'). '