MDL-65806 Search: Solr does not find words in italic

This commit is contained in:
sam marshall 2019-05-30 16:19:57 +01:00
parent f3507273e9
commit c207289127
2 changed files with 88 additions and 1 deletions

View File

@ -272,7 +272,7 @@ class engine extends \core_search\engine {
$query = new \SolrDisMaxQuery();
$this->set_query($query, $data->q);
$this->set_query($query, self::replace_underlines($data->q));
$this->add_fields($query);
// Search filters applied, we don't cache these filters as we don't want to pollute the cache with tmp filters
@ -750,6 +750,23 @@ class engine extends \core_search\engine {
return true;
}
/**
 * Removes underlines at the edges of words in the given string.
 *
 * For example '_frogs_' will become 'frogs', '_frogs and toads_' will become 'frogs and toads',
 * and 'frogs_and_toads' will be left as 'frogs_and_toads'.
 *
 * The reason for this is that for italic content_to_text puts _italic_ underlines at the start
 * and end of the italicised phrase (not between words). Solr treats underlines as part of the
 * word, which means that if you search for a word in italic then you can't find it.
 *
 * Matching is done in UTF-8 mode so that non-ASCII letters count as word characters; otherwise
 * an underline between words such as 'café_au_lait' would wrongly be treated as a word edge
 * and removed. If the string is not valid UTF-8 the UTF-8 match fails, so the byte-wise
 * pattern is used as a fallback rather than losing the content.
 *
 * @param string $str String to replace
 * @return string Replaced string
 */
protected static function replace_underlines(string $str): string {
    // The 'u' modifier makes \b Unicode-aware, so multibyte letters are not seen as word breaks.
    $result = preg_replace('~\b_|_\b~u', '', $str);
    if ($result === null) {
        // preg_replace returns null on error (e.g. malformed UTF-8); retry without UTF-8 mode.
        $result = preg_replace('~\b_|_\b~', '', $str);
    }
    // Never return null from a function declared to return string; keep the input as a last resort.
    return $result ?? $str;
}
/**
* Adds a text document to the search engine.
*
@ -758,6 +775,14 @@ class engine extends \core_search\engine {
*/
protected function add_solr_document($doc) {
$solrdoc = new \SolrInputDocument();
// Remove underlines at the edges of words in the content. The reason for this is that for
// italic text, content_to_text puts _italic_ underlines. Solr treats underlines as part of the
// word, which means that if you search for a word in italic then you can't find it.
if (array_key_exists('content', $doc)) {
$doc['content'] = self::replace_underlines($doc['content']);
}
foreach ($doc as $field => $value) {
$solrdoc->addField($field, $value);
}

View File

@ -1010,6 +1010,68 @@ class search_solr_engine_testcase extends advanced_testcase {
['Post1', 'Post2'], $results);
}
/**
 * Checks that words inside italic markup, and words with literal underlines, can be found
 * by search. (This used to fail.)
 */
public function test_italics() {
    global $USER;

    // Switch to the real search areas.
    $this->search->clear_static();
    $this->search->add_core_search_areas();

    // Set up a course containing a single forum.
    $generator = $this->getDataGenerator();
    $course = $generator->create_course();
    $forum = $generator->create_module('forum', ['course' => $course->id]);

    // Discussion title => message body. The bodies cover italic words, an italic phrase,
    // underlines between words, and literal leading/trailing underlines.
    $posts = [
        'Post1' => '<p>This is a post about <i>frogs</i>.</p>',
        'Post2' => '<p>This is a post about <i>toads and zombies</i>.</p>',
        'Post3' => '<p>This is a post about toads_and_zombies.</p>',
        'Post4' => '<p>This is a post about _leading and trailing_ underlines.</p>',
    ];

    // Create every discussion as the admin user.
    $this->setAdminUser();
    $forumgen = $generator->get_plugin_generator('mod_forum');
    foreach ($posts as $title => $message) {
        $forumgen->create_discussion([
            'course' => $course->id,
            'forum' => $forum->id,
            'userid' => $USER->id,
            'name' => $title,
            'message' => $message,
        ]);
    }

    // Index the data.
    $this->search->index();

    // Each entry is [query text, title of the only discussion expected in the results].
    $expectations = [
        // An italic word is found.
        ['frogs', 'Post1'],
        // Words from the italic phrase find post 2 (and not post 3).
        ['toads', 'Post2'],
        ['zombies', 'Post2'],
        // The underline-joined term finds post 3.
        ['toads_and_zombies', 'Post3'],
        // Queries with a leading or trailing underline find post 4.
        ['_leading', 'Post4'],
        ['trailing_', 'Post4'],
    ];

    $querydata = new stdClass();
    foreach ($expectations as $expectation) {
        list($text, $expectedtitle) = $expectation;
        $querydata->q = $text;
        $this->assert_result_titles([$expectedtitle], $this->search->search($querydata));
    }
}
/**
* Asserts that the returned documents have the expected titles (regardless of order).
*