diff --git a/lib/classes/text.php b/lib/classes/text.php index b99b5d6fc6c..7a44b1b807b 100644 --- a/lib/classes/text.php +++ b/lib/classes/text.php @@ -248,6 +248,30 @@ class core_text { return $result; } + /** + * Truncates a string to no more than a certain number of bytes in a multi-byte safe manner. + * UTF-8 only! + * + * Many of the other charsets we test for (like ISO-2022-JP and EUC-JP) are not supported + * by typo3, and will give invalid results, so we are supporting UTF-8 only. + * + * @param string $string String to truncate + * @param int $bytes Maximum length of bytes in the result + * @return string Portion of string specified by $bytes + * @since Moodle 3.1 + */ + public static function str_max_bytes($string, $bytes) { + if (function_exists('mb_strcut')) { + return mb_strcut($string, 0, $bytes, 'UTF-8'); + } + + $oldlevel = error_reporting(E_PARSE); + $result = self::typo3()->strtrunc('utf-8', $string, $bytes); + error_reporting($oldlevel); + + return $result; + } + /** * Finds the last occurrence of a character in a string within another. * UTF-8 ONLY safe mb_strrchr(). @@ -707,4 +731,4 @@ class core_text { } return implode(' ', $words); } -} \ No newline at end of file +} diff --git a/lib/tests/text_test.php b/lib/tests/text_test.php index 8aeac338431..53e9e57d09c 100644 --- a/lib/tests/text_test.php +++ b/lib/tests/text_test.php @@ -179,6 +179,56 @@ class core_text_testcase extends advanced_testcase { $this->assertSame(4, core_text::strlen($str, 'GB18030')); } + /** + * Test unicode safe string truncation. + */ + public function test_str_max_bytes() { + // These are all 3 byte characters, so this is a 12-byte string. + $str = '言語設定'; + + $this->assertEquals(12, strlen($str)); + + // Step back, shortening the string 1 byte at a time. Should remove in 1 char chunks. + $conv = core_text::str_max_bytes($str, 12); + $this->assertEquals(12, strlen($conv)); + $this->assertSame('言語設定', $conv); + $conv = core_text::str_max_bytes($str, 11); + $this->assertEquals(9, strlen($conv)); + $this->assertSame('言語設', $conv); + $conv = core_text::str_max_bytes($str, 10); + $this->assertEquals(9, strlen($conv)); + $this->assertSame('言語設', $conv); + $conv = core_text::str_max_bytes($str, 9); + $this->assertEquals(9, strlen($conv)); + $this->assertSame('言語設', $conv); + $conv = core_text::str_max_bytes($str, 8); + $this->assertEquals(6, strlen($conv)); + $this->assertSame('言語', $conv); + + // Now try a mixed byte string. + $str = '言語設a定'; + + $this->assertEquals(13, strlen($str)); + + $conv = core_text::str_max_bytes($str, 11); + $this->assertEquals(10, strlen($conv)); + $this->assertSame('言語設a', $conv); + $conv = core_text::str_max_bytes($str, 10); + $this->assertEquals(10, strlen($conv)); + $this->assertSame('言語設a', $conv); + $conv = core_text::str_max_bytes($str, 9); + $this->assertEquals(9, strlen($conv)); + $this->assertSame('言語設', $conv); + $conv = core_text::str_max_bytes($str, 8); + $this->assertEquals(6, strlen($conv)); + $this->assertSame('言語', $conv); + + // Test 0 byte case. + $conv = core_text::str_max_bytes($str, 0); + $this->assertEquals(0, strlen($conv)); + $this->assertSame('', $conv); + } + /** * Tests the static strtolower method. */ diff --git a/lib/upgrade.txt b/lib/upgrade.txt index d23f70200b5..515ca2601c0 100644 --- a/lib/upgrade.txt +++ b/lib/upgrade.txt @@ -70,6 +70,8 @@ information provided here is intended especially for developers. is now a part of \antivirus_clamav\scanner class methods. * \repository::antivir_scan_file() has been deprecated, \core\antivirus\manager::scan_file() that applies antivirus plugins is replacing its functionality. +* Added core_text::str_max_bytes() which safely truncates multi-byte strings to a + maximum number of bytes. === 3.0 === diff --git a/search/engine/solr/classes/document.php b/search/engine/solr/classes/document.php index c4ef6a89eb3..20c1f3ecf9a 100644 --- a/search/engine/solr/classes/document.php +++ b/search/engine/solr/classes/document.php @@ -53,7 +53,7 @@ class document extends \core_search\document { public static function format_string_for_engine($string) { // 2^15 default. We could convert this to a setting as is possible to // change the max in solr. - return substr($string, 0, 32766); + return \core_text::str_max_bytes($string, 32766); } /**