Merge branch 'MDL-62042-master' of https://github.com/sammarshallou/moodle

2025-04-22 08:55:15 +02:00 · 2018-04-23 16:07:37 +02:00 · 2018-04-23 16:07:37 +02:00 · 590c774d37
commit 590c774d37
parent c5a8065258 ffa868a9e1
4 changed files with 99 additions and 0 deletions
--- a/lib/classes/text.php
+++ b/lib/classes/text.php
@ -48,6 +48,11 @@ defined('MOODLE_INTERNAL') || die();
 */
 class core_text {

+    /**
+     * @var string[] Array of strings representing Unicode non-characters
+     *
+     * Lazily built cache: left unset here and filled in by remove_unicode_non_characters()
+     * the first time that method runs, then reused as the search list for str_replace().
+     */
+    protected static $noncharacters;
+
    /**
     * Return t3lib helper class, which is used for conversion between charsets
     *
@ -628,6 +633,39 @@ class core_text {
        return $str;
    }

+    /**
+     * Removes all Unicode non-characters from a UTF-8 string.
+     *
+     * The Unicode standard reserves 66 code points as non-characters: the last two code points
+     * of each of the 17 planes (U+FFFE, U+FFFF, U+1FFFE, ... U+10FFFF) and the range U+FDD0 to
+     * U+FDEF. U+FFFE is the byte-swapped form of the byte-order mark; the byte-order mark itself
+     * (U+FEFF) is a valid character and is not removed by this function. Non-characters can
+     * cause problems for some processing.
+     *
+     * This function removes the characters using string replace, so that the rest of the string
+     * remains unchanged. A non-character is removed wherever, and however often, it appears.
+     *
+     * @param string $value Input string (expected to be UTF-8)
+     * @return string Cleaned string value
+     * @since Moodle 3.5
+     */
+    public static function remove_unicode_non_characters($value) {
+        // Set up list of all Unicode non-characters for fast replacing (built once, then cached).
+        if (!self::$noncharacters) {
+            // Decode a hexadecimal code point into its UTF-8 byte sequence. The flags and charset
+            // are explicit so the result does not depend on the default_charset ini setting or
+            // on the PHP version's default flags: with a non-UTF-8 charset the entity could be
+            // left undecoded, and we would then strip literal '&#x...;' text from the input.
+            // ENT_HTML401 is needed because stricter document types (e.g. ENT_HTML5) refuse to
+            // decode some or all of these code points.
+            $decode = function($hex) {
+                return html_entity_decode('&#x' . $hex . ';', ENT_QUOTES | ENT_HTML401, 'UTF-8');
+            };
+
+            $noncharacters = [];
+            // This list of characters is based on the Unicode standard. It includes the last two
+            // characters of each of the code planes 0-16 inclusive...
+            for ($plane = 0; $plane <= 16; $plane++) {
+                // Plane 0 has no prefix (U+FFFE); the others are prefixed by the plane number in
+                // hex (U+1FFFE ... U+10FFFE).
+                $base = ($plane === 0 ? '' : dechex($plane));
+                $noncharacters[] = $decode($base . 'fffe');
+                $noncharacters[] = $decode($base . 'ffff');
+            }
+            // ...And the character range U+FDD0 to U+FDEF.
+            for ($char = 0xfdd0; $char <= 0xfdef; $char++) {
+                $noncharacters[] = $decode(dechex($char));
+            }
+
+            // Publish the list only once it is complete.
+            self::$noncharacters = $noncharacters;
+        }
+
+        // Do character replacement.
+        return str_replace(self::$noncharacters, '', $value);
+    }
+
    /**
     * Returns encoding options for select boxes, utf-8 and platform encoding first
     *
--- a/lib/tests/text_test.php
+++ b/lib/tests/text_test.php
@ -412,6 +412,27 @@ class core_text_testcase extends advanced_testcase {
        $this->assertSame($str.$bom, core_text::trim_utf8_bom($bom.$str.$bom));
    }

+    /**
+     * Checks that core_text::remove_unicode_non_characters() strips non-characters and nothing else.
+     */
+    public function test_remove_unicode_non_characters() {
+        // Each case is a pair of [input, expected output].
+        $cases = [
+            // Plain text without any non-characters comes back untouched.
+            ['Frogs!', 'Frogs!'],
+            // Unusual but valid characters (U+FFFD, U+1D15F) must be preserved as well.
+            [
+                html_entity_decode('A&#xfffd;&#x1d15f;B'),
+                html_entity_decode('A&#xfffd;&#x1d15f;B'),
+            ],
+            // Non-characters vanish from the start, middle and end; everything else stays put.
+            [
+                html_entity_decode('&#xfffe;A&#xffff;B&#x8fffe;C&#xfdd0;D&#xfffd;E&#xfdd5;'),
+                html_entity_decode('ABCD&#xfffd;E'),
+            ],
+            // A string made of a single non-character collapses to the empty string.
+            [html_entity_decode('&#xfffe;'), ''],
+        ];
+
+        foreach ($cases as list($input, $expected)) {
+            $this->assertSame($expected, core_text::remove_unicode_non_characters($input));
+        }
+    }
+
    /**
     * Tests the static get_encodings method.
     */
--- a/search/classes/document.php
+++ b/search/classes/document.php
@ -291,6 +291,9 @@ class document implements \renderable, \templatable {
        if ($fielddata['type'] === 'int' || $fielddata['type'] === 'tdate') {
            $this->data[$fieldname] = intval($value);
        } else {
+            // Remove disallowed Unicode characters.
+            $value = \core_text::remove_unicode_non_characters($value);
+
            // Replace all groups of line breaks and spaces by single spaces.
            $this->data[$fieldname] = preg_replace("/\s+/u", " ", $value);
            if ($this->data[$fieldname] === null) {
--- a/search/engine/solr/tests/engine_test.php
+++ b/search/engine/solr/tests/engine_test.php
@ -1134,6 +1134,43 @@ class search_solr_engine_testcase extends advanced_testcase {
        $this->assertEquals('C1P', $results[0]->get('title'));
    }

+    /**
+     * Indexes content containing Unicode non-characters (which can be entered into Moodle) and
+     * checks that indexing and searching still work.
+     */
+    public function test_bogus_content() {
+        $course1 = $this->getDataGenerator()->create_course(['fullname' => 'Course 1']);
+        $course1context = \context_course::instance($course1->id);
+
+        // A Moodle database can hold content containing Unicode non-characters; if these are
+        // sent to Solr they cause failures. Unicode Standard Version 9.0 - Core Specification,
+        // section 23.7, lists 66 non-characters in total; this is a sample of them. The first
+        // two are the ones described as causing failures; the original author noted that the
+        // remaining four worked OK on their system, but that this may depend on platform.
+        $entities = [
+            '&#xfffe;',
+            '&#xffff;',
+            '&#xfdd0;',
+            '&#xfdef;',
+            '&#x1fffe;',
+            '&#x10ffff;',
+        ];
+        foreach ($entities as $entity) {
+            $boguscontent = html_entity_decode($entity) . 'frog';
+            $this->create_search_record($course1->id, $course1context->id, 'C1', $boguscontent);
+        }
+
+        // Run the indexer (this also verifies that no warnings are thrown).
+        $this->search->index();
+
+        // Every one of the 6 documents should be returned when searching for the plain word.
+        $querydata = (object)['q' => 'frog'];
+        $results = $this->search->search($querydata);
+        $this->assertCount(6, $results);
+    }
+
    /**
     * Adds a record to the mock search area, so that the search engine can find it later.
     *