mirror of
https://github.com/moodle/moodle.git
synced 2025-04-22 08:55:15 +02:00
Merge branch 'MDL-62042-master' of https://github.com/sammarshallou/moodle
This commit is contained in:
commit
590c774d37
@ -48,6 +48,11 @@ defined('MOODLE_INTERNAL') || die();
|
||||
*/
|
||||
class core_text {
|
||||
|
||||
/**
|
||||
* @var string[] Array of strings representing Unicode non-characters
|
||||
*/
|
||||
protected static $noncharacters;
|
||||
|
||||
/**
|
||||
* Return t3lib helper class, which is used for conversion between charsets
|
||||
*
|
||||
@ -628,6 +633,39 @@ class core_text {
|
||||
return $str;
|
||||
}
|
||||
|
||||
/**
|
||||
* There are a number of Unicode non-characters including the byte-order mark (which may appear
|
||||
* multiple times in a string) and also other ranges. These can cause problems for some
|
||||
* processing.
|
||||
*
|
||||
* This function removes the characters using string replace, so that the rest of the string
|
||||
* remains unchanged.
|
||||
*
|
||||
* @param string $value Input string
|
||||
* @return string Cleaned string value
|
||||
* @since Moodle 3.5
|
||||
*/
|
||||
public static function remove_unicode_non_characters($value) {
    // Build the list of all Unicode non-characters once and cache it in the static property,
    // so that later calls only pay for the str_replace.
    if (!self::$noncharacters) {
        // Flags and encoding are given explicitly so that the decoded strings are always
        // UTF-8 byte sequences, independent of the default_charset ini setting. Without
        // this, a non-UTF-8 default charset would leave the entities undecoded and the
        // replacement below would strip the literal entity text instead.
        // ENT_HTML401 is pinned because PHP's stricter document types (e.g. ENT_HTML5)
        // leave numeric entities for disallowed code points undecoded.
        $flags = ENT_QUOTES | ENT_HTML401;
        $encoding = 'UTF-8';
        $noncharacters = [];

        // This list of characters is based on the Unicode standard. It includes the last
        // two code points (U+nFFFE and U+nFFFF) of each of the code planes 0-16 inclusive...
        for ($plane = 0; $plane <= 16; $plane++) {
            // Plane 0 has no hex prefix (U+FFFE); plane 16 is '10' (U+10FFFE).
            $base = ($plane === 0 ? '' : dechex($plane));
            $noncharacters[] = html_entity_decode('&#x' . $base . 'fffe;', $flags, $encoding);
            $noncharacters[] = html_entity_decode('&#x' . $base . 'ffff;', $flags, $encoding);
        }

        // ...and the character range U+FDD0 to U+FDEF.
        for ($char = 0xfdd0; $char <= 0xfdef; $char++) {
            $noncharacters[] = html_entity_decode('&#x' . dechex($char) . ';', $flags, $encoding);
        }

        // Publish the table only once it is complete.
        self::$noncharacters = $noncharacters;
    }

    // Do character replacement. Plain string replacement leaves every other byte of the
    // input unchanged.
    return str_replace(self::$noncharacters, '', $value);
}
|
||||
|
||||
/**
|
||||
* Returns encoding options for select boxes, utf-8 and platform encoding first
|
||||
*
|
||||
|
@ -412,6 +412,27 @@ class core_text_testcase extends advanced_testcase {
|
||||
$this->assertSame($str.$bom, core_text::trim_utf8_bom($bom.$str.$bom));
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests the static remove_unicode_non_characters method.
|
||||
*/
|
||||
public function test_remove_unicode_non_characters() {
    // Confirm that texts which don't contain these characters are unchanged.
    $this->assertSame('Frogs!', core_text::remove_unicode_non_characters('Frogs!'));

    // Even if they contain some very scary characters: U+FFFD (replacement character) and
    // U+1D15F (a supplementary-plane character) are unusual but are not non-characters.
    $example = html_entity_decode('A&#xfffd;&#x1d15f;B');
    $this->assertSame($example, core_text::remove_unicode_non_characters($example));

    // Non-characters are removed wherever they may be, with other characters left.
    // The input uses numeric entities so the non-characters survive in the source file:
    // a leading one, ones between the letters (including one outside plane 0), and a
    // trailing one. U+FFFD between D and E is a valid character and must be kept.
    $example = html_entity_decode('&#xfffe;A&#xffff;B&#x8fffe;C&#xfdd0;D&#xfffd;E&#xfdd5;');
    $expected = html_entity_decode('ABCD&#xfffd;E');
    $this->assertSame($expected, core_text::remove_unicode_non_characters($example));

    // If you only have a non-character, you get empty string.
    $example = html_entity_decode('&#xfdd5;');
    $this->assertSame('', core_text::remove_unicode_non_characters($example));
}
|
||||
|
||||
/**
|
||||
* Tests the static get_encodings method.
|
||||
*/
|
||||
|
@ -291,6 +291,9 @@ class document implements \renderable, \templatable {
|
||||
if ($fielddata['type'] === 'int' || $fielddata['type'] === 'tdate') {
|
||||
$this->data[$fieldname] = intval($value);
|
||||
} else {
|
||||
// Remove disallowed Unicode characters.
|
||||
$value = \core_text::remove_unicode_non_characters($value);
|
||||
|
||||
// Replace all groups of line breaks and spaces by single spaces.
|
||||
$this->data[$fieldname] = preg_replace("/\s+/u", " ", $value);
|
||||
if ($this->data[$fieldname] === null) {
|
||||
|
@ -1134,6 +1134,43 @@ class search_solr_engine_testcase extends advanced_testcase {
|
||||
$this->assertEquals('C1P', $results[0]->get('title'));
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests with bogus content (that can be entered into Moodle) to see if it crashes.
|
||||
*/
|
||||
public function test_bogus_content() {
    $generator = $this->getDataGenerator();
    $course1 = $generator->create_course(['fullname' => 'Course 1']);
    $course1context = \context_course::instance($course1->id);

    // It is possible to enter into a Moodle database content containing these characters,
    // which are Unicode non-characters / byte order marks. If sent to Solr, these cause
    // failures. They are written as numeric entities so that they survive in the source
    // file; each record is a different non-character followed by the searchable word.
    $boguscontent = html_entity_decode('&#xfffe;') . 'frog';
    $this->create_search_record($course1->id, $course1context->id, 'C1', $boguscontent);
    $boguscontent = html_entity_decode('&#xffff;') . 'frog';
    $this->create_search_record($course1->id, $course1context->id, 'C1', $boguscontent);

    // Unicode Standard Version 9.0 - Core Specification, section 23.7, lists 66 non-characters
    // in total. Here are some of them - these work OK for me but it may depend on platform.
    // Both ends of the U+FDD0..U+FDEF range, then plane-end code points outside plane 0.
    $boguscontent = html_entity_decode('&#xfdd0;') . 'frog';
    $this->create_search_record($course1->id, $course1context->id, 'C1', $boguscontent);
    $boguscontent = html_entity_decode('&#xfdef;') . 'frog';
    $this->create_search_record($course1->id, $course1context->id, 'C1', $boguscontent);
    $boguscontent = html_entity_decode('&#x1fffe;') . 'frog';
    $this->create_search_record($course1->id, $course1context->id, 'C1', $boguscontent);
    $boguscontent = html_entity_decode('&#x10ffff;') . 'frog';
    $this->create_search_record($course1->id, $course1context->id, 'C1', $boguscontent);

    // Do the indexing (this will check it doesn't throw warnings).
    $this->search->index();

    // Confirm that all 6 documents are found in search.
    $querydata = new stdClass();
    $querydata->q = 'frog';
    $results = $this->search->search($querydata);
    $this->assertCount(6, $results);
}
|
||||
|
||||
/**
|
||||
* Adds a record to the mock search area, so that the search engine can find it later.
|
||||
*
|
||||
|
Loading…
x
Reference in New Issue
Block a user