mirror of
https://github.com/phpbb/phpbb.git
synced 2025-08-19 23:11:45 +02:00
[feature/patchwork-utf8] Normalize with intl, use patchwork/utf8 as fallback
This commit is contained in:
@@ -18,6 +18,13 @@ namespace phpbb\search;
|
||||
*/
|
||||
class fulltext_native extends \phpbb\search\base
|
||||
{
|
||||
const UTF8_HANGUL_FIRST = "\xEA\xB0\x80";
|
||||
const UTF8_HANGUL_LAST = "\xED\x9E\xA3";
|
||||
const UTF8_CJK_FIRST = "\xE4\xB8\x80";
|
||||
const UTF8_CJK_LAST = "\xE9\xBE\xBB";
|
||||
const UTF8_CJK_B_FIRST = "\xF0\xA0\x80\x80";
|
||||
const UTF8_CJK_B_LAST = "\xF0\xAA\x9B\x96";
|
||||
|
||||
/**
|
||||
* Associative array holding index stats
|
||||
* @var array
|
||||
@@ -93,7 +100,7 @@ class fulltext_native extends \phpbb\search\base
|
||||
protected $user;
|
||||
|
||||
/**
|
||||
* Initialises the fulltext_native search backend with min/max word length and makes sure the UTF-8 normalizer is loaded
|
||||
* Initialises the fulltext_native search backend with min/max word length
|
||||
*
|
||||
* @param boolean|string &$error is passed by reference and should either be set to false on success or an error message on failure
|
||||
*/
|
||||
@@ -110,10 +117,6 @@ class fulltext_native extends \phpbb\search\base
|
||||
/**
|
||||
* Load the UTF tools
|
||||
*/
|
||||
if (!class_exists('utf_normalizer'))
|
||||
{
|
||||
include($this->phpbb_root_path . 'includes/utf/utf_normalizer.' . $this->php_ext);
|
||||
}
|
||||
if (!function_exists('utf8_decode_ncr'))
|
||||
{
|
||||
include($this->phpbb_root_path . 'includes/utf/utf_tools.' . $this->php_ext);
|
||||
@@ -1175,9 +1178,9 @@ class fulltext_native extends \phpbb\search\base
|
||||
* Note: this could be optimized. If the codepoint is lower than the CJK range
|
||||
* (the lowest of the three) we know that it will also be lower than the Hangul and CJK Extension B ranges
|
||||
*/
|
||||
if ((strncmp($word, UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, UTF8_HANGUL_LAST, 3) > 0)
|
||||
&& (strncmp($word, UTF8_CJK_FIRST, 3) < 0 || strncmp($word, UTF8_CJK_LAST, 3) > 0)
|
||||
&& (strncmp($word, UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, UTF8_CJK_B_LAST, 4) > 0))
|
||||
if ((strncmp($word, self::UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, self::UTF8_HANGUL_LAST, 3) > 0)
|
||||
&& (strncmp($word, self::UTF8_CJK_FIRST, 3) < 0 || strncmp($word, self::UTF8_CJK_LAST, 3) > 0)
|
||||
&& (strncmp($word, self::UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, self::UTF8_CJK_B_LAST, 4) > 0))
|
||||
{
|
||||
$word = strtok(' ');
|
||||
continue;
|
||||
@@ -1544,8 +1547,6 @@ class fulltext_native extends \phpbb\search\base
|
||||
* @param string $allowed_chars String of special chars to allow
|
||||
* @param string $encoding Text encoding
|
||||
* @return string Cleaned up text, only alphanumeric chars are left
|
||||
*
|
||||
* @todo \normalizer::cleanup being able to be used?
|
||||
*/
|
||||
protected function cleanup($text, $allowed_chars = null, $encoding = 'utf-8')
|
||||
{
|
||||
@@ -1572,12 +1573,9 @@ class fulltext_native extends \phpbb\search\base
|
||||
$text = htmlspecialchars_decode(utf8_decode_ncr($text), ENT_QUOTES);
|
||||
|
||||
/**
|
||||
* Load the UTF-8 normalizer
|
||||
*
|
||||
* If we use it more widely, an instance of that class should be held in a
|
||||
* global variable instead
|
||||
* Normalize to NFC
|
||||
*/
|
||||
\utf_normalizer::nfc($text);
|
||||
$text = \Normalizer::normalize($text);
|
||||
|
||||
/**
|
||||
* The first thing we do is:
|
||||
@@ -1670,9 +1668,9 @@ class fulltext_native extends \phpbb\search\base
|
||||
$utf_char = substr($text, $pos, $utf_len);
|
||||
$pos += $utf_len;
|
||||
|
||||
if (($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST)
|
||||
|| ($utf_char >= UTF8_CJK_FIRST && $utf_char <= UTF8_CJK_LAST)
|
||||
|| ($utf_char >= UTF8_CJK_B_FIRST && $utf_char <= UTF8_CJK_B_LAST))
|
||||
if (($utf_char >= self::UTF8_HANGUL_FIRST && $utf_char <= self::UTF8_HANGUL_LAST)
|
||||
|| ($utf_char >= self::UTF8_CJK_FIRST && $utf_char <= self::UTF8_CJK_LAST)
|
||||
|| ($utf_char >= self::UTF8_CJK_B_FIRST && $utf_char <= self::UTF8_CJK_B_LAST))
|
||||
{
|
||||
/**
|
||||
* All characters within these ranges are valid
|
||||
|
Reference in New Issue
Block a user