diff --git a/phpBB/includes/search/fulltext_native_improved.php b/phpBB/includes/search/fulltext_native_improved.php
index dac964a64a..a01ccd64ed 100644
--- a/phpBB/includes/search/fulltext_native_improved.php
+++ b/phpBB/includes/search/fulltext_native_improved.php
@@ -47,10 +47,18 @@ class fulltext_native_improved extends search_backend
 
 		$this->word_length = array('min' => $config['fulltext_native_min_chars'], 'max' => $config['fulltext_native_max_chars']);
 
+		/**
+		* Load the UTF tools
+		*/
 		if (!class_exists('utf_normalizer'))
 		{
 			include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
 		}
+		if (!function_exists('utf8_strlen'))
+		{
+			include($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx);
+		}
+
 		$error = false;
 	}
 
@@ -864,14 +872,6 @@ class fulltext_native_improved extends search_backend
 
 		$isset_min = $min - 1;
 
-		/**
-		* Load the UTF tools
-		*/
-		if (!function_exists('utf8_strlen'))
-		{
-			include($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx);
-		}
-
 		/**
 		* Clean up the string, remove HTML tags, remove BBCodes
 		*/
@@ -1259,11 +1259,6 @@ class fulltext_native_improved extends search_backend
 		$encoding = strtolower($encoding);
 		if ($encoding != 'utf-8')
 		{
-			if (!function_exists('utf8_recode'))
-			{
-				include($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx);
-			}
-
 			$text = utf8_recode($text, $encoding);
 		}
 
@@ -1277,7 +1272,7 @@ class fulltext_native_improved extends search_backend
 		/**
 		* Replace HTML entities and NCRs
 		*/
-		$text = html_entity_decode($this->decode_ncr($text), ENT_QUOTES);
+		$text = html_entity_decode(utf8_decode_ncr($text), ENT_QUOTES);
 
 		/**
 		* Load the UTF-8 normalizer
@@ -1481,60 +1476,6 @@ class fulltext_native_improved extends search_backend
 		return $ret;
 	}
 
-	/**
-	* Convert Numeric Character References to UTF-8 chars
-	*
-	* Notes:
-	*	- we do not convert NCRs recursively, if you pass &#38; it will return &
-	*	- we DO NOT check for the existence of the Unicode characters, therefore an entity
-	*	  may be converted to an inexistent codepoint
-	*
-	* @param	string	$text		String to convert, encoded in UTF-8 (no normal form required)
-	* @return	string				UTF-8 string where NCRs have been replaced with the actual chars
-	*/
-	function decode_ncr($text)
-	{
-		/**
-		* @todo replace me with preg_replace_callback() or a loop
-		*/
-		return preg_replace(
-			'/&#([0-9]{1,6});/e',
-			"\$this->cp_to_utf(\$1)",
-
-			preg_replace(
-				'/&#x([0-9A-F]{1,5});/ie',
-				"\$this->cp_to_utf(hexdec('\$1'))",
-				$text
-			)
-		);
-	}
-
-	/**
-	* Convert a codepoint to a UTF-8 char
-	*
-	* @param	integer	$cp	Unicode codepoint
-	* @return	string		UTF-8 string
-	*/
-	function cp_to_utf($cp)
-	{
-		if ($cp > 0xFFFF)
-		{
-			return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
-		}
-		elseif ($cp > 0x7FF)
-		{
-			return chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
-		}
-		elseif ($cp > 0x7F)
-		{
-			return chr(0xC0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3F));
-		}
-		else
-		{
-			return chr($cp);
-		}
-	}
-
 	/**
 	* Returns a list of options for the ACP to display
 	*/
diff --git a/phpBB/includes/utf/utf_tools.php b/phpBB/includes/utf/utf_tools.php
index 4d8ba05340..b3c3c5ed5e 100644
--- a/phpBB/includes/utf/utf_tools.php
+++ b/phpBB/includes/utf/utf_tools.php
@@ -127,4 +127,102 @@ function utf8_recode($string, $encoding)
 	die('Finish me!! '.basename(__FILE__).' at line '.__LINE__);
 }
 
+/**
+* Replace all UTF-8 chars that are not in ASCII with their NCR
+*
+* Only well-formed groups are converted: a lead byte followed by exactly the
+* number of continuation bytes it announces. Stray bytes are left untouched,
+* so malformed input is never turned into a bogus NCR.
+*
+* @param	string	$text		UTF-8 string in NFC
+* @return	string				String using NCRs for non-ASCII chars (pure ASCII when $text is valid UTF-8)
+*/
+function utf8_encode_ncr($text)
+{
+	return preg_replace_callback('#[\\xC2-\\xDF][\\x80-\\xBF]|[\\xE0-\\xEF][\\x80-\\xBF]{2}|[\\xF0-\\xF4][\\x80-\\xBF]{3}#', 'utf8_encode_ncr_callback', $text);
+}
+
+/**
+* Callback used in utf8_encode_ncr()
+*
+* Takes a UTF-8 char and replaces it with its NCR. Attention, $m is an array
+*
+* @param	array	$m			0-based numerically indexed array passed by preg_replace_callback()
+* @return	string				A HTML NCR if the character is valid, or the original string otherwise
+*/
+function utf8_encode_ncr_callback($m)
+{
+	switch (strlen($m[0]))
+	{
+		case 1:
+			return '&#' . ord($m[0]) . ';';
+
+		case 2:
+			return '&#' . (((ord($m[0][0]) & 0x1F) << 6) | (ord($m[0][1]) & 0x3F)) . ';';
+
+		case 3:
+			return '&#' . (((ord($m[0][0]) & 0x0F) << 12) | ((ord($m[0][1]) & 0x3F) << 6) | (ord($m[0][2]) & 0x3F)) . ';';
+
+		case 4:
+			return '&#' . (((ord($m[0][0]) & 0x07) << 18) | ((ord($m[0][1]) & 0x3F) << 12) | ((ord($m[0][2]) & 0x3F) << 6) | (ord($m[0][3]) & 0x3F)) . ';';
+
+		default:
+			return $m[0];
+	}
+}
+
+/**
+* Convert Numeric Character References to UTF-8 chars
+*
+* Notes:
+*	- we do not convert NCRs recursively, if you pass &#38; it will return &
+*	- we DO NOT check for the existence of the Unicode characters, therefore an entity
+*	  may be converted to an inexistent codepoint
+*	- NCRs naming UTF-16 surrogates (U+D800 to U+DFFF) are left untouched
+*
+* @param	string	$text		String to convert, encoded in UTF-8 (no normal form required)
+* @return	string				UTF-8 string where NCRs have been replaced with the actual chars
+*/
+function utf8_decode_ncr($text)
+{
+	return preg_replace_callback('/&#([0-9]{1,6}|x[0-9A-F]{1,5});/i', 'utf8_decode_ncr_callback', $text);
+}
+
+/**
+* Callback used in utf8_decode_ncr()
+*
+* Takes a NCR (in decimal or hexadecimal) and returns a UTF-8 char. Attention, $m is an array.
+* NCRs naming UTF-16 surrogates are returned unchanged; no other validation of the codepoint is done.
+*
+* @param	array	$m			0-based numerically indexed array passed by preg_replace_callback()
+* @return	string				UTF-8 char, or the original NCR when it names a surrogate
+*/
+function utf8_decode_ncr_callback($m)
+{
+	$cp = (strncasecmp($m[1], 'x', 1)) ? (int) $m[1] : hexdec(substr($m[1], 1));
+
+	// Surrogates are not characters; encoding them would produce ill-formed UTF-8
+	if ($cp >= 0xD800 && $cp <= 0xDFFF)
+	{
+		return $m[0];
+	}
+
+	if ($cp > 0xFFFF)
+	{
+		return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
+	}
+	elseif ($cp > 0x7FF)
+	{
+		return chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
+	}
+	elseif ($cp > 0x7F)
+	{
+		return chr(0xC0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3F));
+	}
+	else
+	{
+		return chr($cp);
+	}
+}
+
 ?>
\ No newline at end of file