mirror of
https://github.com/phpbb/phpbb.git
synced 2025-04-13 12:22:03 +02:00
[feature/patchwork-utf8] Normalize with intl, use patchwork/utf8 as fallback
This commit is contained in:
parent
1601b61ef9
commit
5a7caf6508
@ -27,6 +27,7 @@
|
||||
"require": {
|
||||
"php": ">=5.3.3",
|
||||
"lusitanian/oauth": "0.2.*",
|
||||
"patchwork/utf8": "1.1.*",
|
||||
"symfony/config": "2.5.*",
|
||||
"symfony/console": "2.5.*",
|
||||
"symfony/dependency-injection": "2.5.*",
|
||||
|
56
phpBB/composer.lock
generated
56
phpBB/composer.lock
generated
@ -68,6 +68,62 @@
|
||||
],
|
||||
"time": "2013-08-29 21:40:04"
|
||||
},
|
||||
{
|
||||
"name": "patchwork/utf8",
|
||||
"version": "v1.1.26",
|
||||
"source": {
|
||||
"type": "git",
|
||||
"url": "https://github.com/tchwork/utf8.git",
|
||||
"reference": "6b8e46603b49ee87ad6bceb314da94cc04ffcdce"
|
||||
},
|
||||
"dist": {
|
||||
"type": "zip",
|
||||
"url": "https://api.github.com/repos/tchwork/utf8/zipball/6b8e46603b49ee87ad6bceb314da94cc04ffcdce",
|
||||
"reference": "6b8e46603b49ee87ad6bceb314da94cc04ffcdce",
|
||||
"shasum": ""
|
||||
},
|
||||
"require": {
|
||||
"lib-pcre": ">=7.3",
|
||||
"php": ">=5.3.0"
|
||||
},
|
||||
"suggest": {
|
||||
"ext-iconv": "Use iconv for best performance",
|
||||
"ext-intl": "Use Intl for best performance",
|
||||
"ext-mbstring": "Use Mbstring for best performance"
|
||||
},
|
||||
"type": "library",
|
||||
"extra": {
|
||||
"branch-alias": {
|
||||
"dev-master": "1.1-dev"
|
||||
}
|
||||
},
|
||||
"autoload": {
|
||||
"psr-0": {
|
||||
"Patchwork": "class/",
|
||||
"Normalizer": "class/"
|
||||
}
|
||||
},
|
||||
"notification-url": "https://packagist.org/downloads/",
|
||||
"license": [
|
||||
"(Apache-2.0 or GPL-2.0)"
|
||||
],
|
||||
"authors": [
|
||||
{
|
||||
"name": "Nicolas Grekas",
|
||||
"email": "p@tchwork.com"
|
||||
}
|
||||
],
|
||||
"description": "Portable and performant UTF-8, Unicode and Grapheme Clusters for PHP",
|
||||
"homepage": "https://github.com/tchwork/utf8",
|
||||
"keywords": [
|
||||
"grapheme",
|
||||
"i18n",
|
||||
"unicode",
|
||||
"utf-8",
|
||||
"utf8"
|
||||
],
|
||||
"time": "2014-11-08 10:13:25"
|
||||
},
|
||||
{
|
||||
"name": "psr/log",
|
||||
"version": "1.0.0",
|
||||
|
@ -32,262 +32,11 @@ $phpbb_root_path = '../';
|
||||
$phpEx = substr(strrchr(__FILE__, '.'), 1);
|
||||
|
||||
echo "Checking for required files\n";
|
||||
download('http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt');
|
||||
download('http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt');
|
||||
download('http://www.unicode.org/Public/UNIDATA/UnicodeData.txt');
|
||||
echo "\n";
|
||||
|
||||
require_once($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
|
||||
$file_contents = array();
|
||||
|
||||
/**
|
||||
* Generate some Hangul/Jamo stuff
|
||||
*/
|
||||
echo "\nGenerating Hangul and Jamo tables\n";
|
||||
for ($i = 0; $i < UNICODE_HANGUL_LCOUNT; ++$i)
|
||||
{
|
||||
$utf_char = cp_to_utf(UNICODE_HANGUL_LBASE + $i);
|
||||
$file_contents['utf_normalizer_common']['utf_jamo_index'][$utf_char] = $i * UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT + UNICODE_HANGUL_SBASE;
|
||||
$file_contents['utf_normalizer_common']['utf_jamo_type'][$utf_char] = UNICODE_JAMO_L;
|
||||
}
|
||||
|
||||
for ($i = 0; $i < UNICODE_HANGUL_VCOUNT; ++$i)
|
||||
{
|
||||
$utf_char = cp_to_utf(UNICODE_HANGUL_VBASE + $i);
|
||||
$file_contents['utf_normalizer_common']['utf_jamo_index'][$utf_char] = $i * UNICODE_HANGUL_TCOUNT;
|
||||
$file_contents['utf_normalizer_common']['utf_jamo_type'][$utf_char] = UNICODE_JAMO_V;
|
||||
}
|
||||
|
||||
for ($i = 0; $i < UNICODE_HANGUL_TCOUNT; ++$i)
|
||||
{
|
||||
$utf_char = cp_to_utf(UNICODE_HANGUL_TBASE + $i);
|
||||
$file_contents['utf_normalizer_common']['utf_jamo_index'][$utf_char] = $i;
|
||||
$file_contents['utf_normalizer_common']['utf_jamo_type'][$utf_char] = UNICODE_JAMO_T;
|
||||
}
|
||||
|
||||
/**
* Load the CompositionExclusions table
*
* Fills $exclude with one entry per excluded codepoint: array(int codepoint => 1).
* Only lines starting with a hexadecimal digit are data lines; everything else
* (comments, blank lines) is skipped.
*/
echo "Loading CompositionExclusion\n";
$fp = fopen('CompositionExclusions.txt', 'rt');

if (!$fp)
{
	// feof() on an invalid handle never becomes true, so bail out instead of spinning
	die("Cannot open CompositionExclusions.txt for reading\n");
}

$exclude = array();
while (!feof($fp))
{
	$line = fgets($fp, 1024);

	if ($line === false)
	{
		// End of file or read error: nothing more to parse
		break;
	}

	// The haystack starts with a space so that a hex digit can never match at
	// offset 0; a falsy strpos() result therefore always means "not a data line"
	if (!strpos(' 0123456789ABCDEFabcdef', $line[0]))
	{
		continue;
	}

	$cp = strtok($line, ' ');

	if ($pos = strpos($cp, '..'))
	{
		$start = hexdec(substr($cp, 0, $pos));
		$end = hexdec(substr($cp, $pos + 2));

		// "XXXX..YYYY" ranges include both endpoints, so YYYY must be stored too
		for ($i = $start; $i <= $end; ++$i)
		{
			$exclude[$i] = 1;
		}
	}
	else
	{
		$exclude[hexdec($cp)] = 1;
	}
}
fclose($fp);
|
||||
|
||||
/**
|
||||
* Load QuickCheck tables
|
||||
*/
|
||||
echo "Generating QuickCheck tables\n";
|
||||
$fp = fopen('DerivedNormalizationProps.txt', 'rt');
|
||||
|
||||
while (!feof($fp))
|
||||
{
|
||||
$line = fgets($fp, 1024);
|
||||
|
||||
if (!strpos(' 0123456789ABCDEFabcdef', $line[0]))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
$p = array_map('trim', explode(';', strtok($line, '#')));
|
||||
|
||||
/**
|
||||
* Capture only NFC_QC, NFKC_QC
|
||||
*/
|
||||
if (!preg_match('#^NFK?C_QC$#', $p[1]))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($pos = strpos($p[0], '..'))
|
||||
{
|
||||
$start = hexdec(substr($p[0], 0, $pos));
|
||||
$end = hexdec(substr($p[0], $pos + 2));
|
||||
}
|
||||
else
|
||||
{
|
||||
$start = $end = hexdec($p[0]);
|
||||
}
|
||||
|
||||
if ($start >= UTF8_HANGUL_FIRST && $end <= UTF8_HANGUL_LAST)
|
||||
{
|
||||
/**
|
||||
* We do not store Hangul syllables in the array
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($p[2] == 'M')
|
||||
{
|
||||
$val = UNICODE_QC_MAYBE;
|
||||
}
|
||||
else
|
||||
{
|
||||
$val = UNICODE_QC_NO;
|
||||
}
|
||||
|
||||
if ($p[1] == 'NFKC_QC')
|
||||
{
|
||||
$file = 'utf_nfkc_qc';
|
||||
}
|
||||
else
|
||||
{
|
||||
$file = 'utf_nfc_qc';
|
||||
}
|
||||
|
||||
for ($i = $start; $i <= $end; ++$i)
|
||||
{
|
||||
/**
|
||||
* The vars have the same name as the file: $utf_nfc_qc is in utf_nfc_qc.php
|
||||
*/
|
||||
$file_contents[$file][$file][cp_to_utf($i)] = $val;
|
||||
}
|
||||
}
|
||||
fclose($fp);
|
||||
|
||||
/**
|
||||
* Do mappings
|
||||
*/
|
||||
echo "Loading Unicode decomposition mappings\n";
|
||||
$fp = fopen($phpbb_root_path . 'develop/UnicodeData.txt', 'rt');
|
||||
|
||||
$map = array();
|
||||
while (!feof($fp))
|
||||
{
|
||||
$p = explode(';', fgets($fp, 1024));
|
||||
$cp = hexdec($p[0]);
|
||||
|
||||
if (!empty($p[3]))
|
||||
{
|
||||
/**
|
||||
* Store combining class > 0
|
||||
*/
|
||||
$file_contents['utf_normalizer_common']['utf_combining_class'][cp_to_utf($cp)] = (int) $p[3];
|
||||
}
|
||||
|
||||
if (!isset($p[5]) || !preg_match_all('#[0-9A-F]+#', strip_tags($p[5]), $m))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if (strpos($p[5], '>'))
|
||||
{
|
||||
$map['NFKD'][$cp] = implode(' ', array_map('hexdec', $m[0]));
|
||||
}
|
||||
else
|
||||
{
|
||||
$map['NFD'][$cp] = $map['NFKD'][$cp] = implode(' ', array_map('hexdec', $m[0]));
|
||||
}
|
||||
}
|
||||
fclose($fp);
|
||||
|
||||
/**
|
||||
* Build the canonical composition table
|
||||
*/
|
||||
echo "Generating the Canonical Composition table\n";
|
||||
foreach ($map['NFD'] as $cp => $decomp_seq)
|
||||
{
|
||||
if (!strpos($decomp_seq, ' ') || isset($exclude[$cp]))
|
||||
{
|
||||
/**
|
||||
* Singletons are excluded from canonical composition
|
||||
*/
|
||||
continue;
|
||||
}
|
||||
|
||||
$utf_seq = implode('', array_map('cp_to_utf', explode(' ', $decomp_seq)));
|
||||
|
||||
if (!isset($file_contents['utf_canonical_comp']['utf_canonical_comp'][$utf_seq]))
|
||||
{
|
||||
$file_contents['utf_canonical_comp']['utf_canonical_comp'][$utf_seq] = cp_to_utf($cp);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Decompose the NF[K]D mappings recursively and prepare the file contents
|
||||
*/
|
||||
echo "Generating the Canonical and Compatibility Decomposition tables\n\n";
|
||||
foreach ($map as $type => $decomp_map)
|
||||
{
|
||||
foreach ($decomp_map as $cp => $decomp_seq)
|
||||
{
|
||||
$decomp_map[$cp] = decompose($decomp_map, $decomp_seq);
|
||||
}
|
||||
unset($decomp_seq);
|
||||
|
||||
if ($type == 'NFKD')
|
||||
{
|
||||
$file = 'utf_compatibility_decomp';
|
||||
$var = 'utf_compatibility_decomp';
|
||||
}
|
||||
else
|
||||
{
|
||||
$file = 'utf_canonical_decomp';
|
||||
$var = 'utf_canonical_decomp';
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate the corresponding file
|
||||
*/
|
||||
foreach ($decomp_map as $cp => $decomp_seq)
|
||||
{
|
||||
$file_contents[$file][$var][cp_to_utf($cp)] = implode('', array_map('cp_to_utf', explode(' ', $decomp_seq)));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Generate and/or alter the files
|
||||
*/
|
||||
foreach ($file_contents as $file => $contents)
|
||||
{
|
||||
/**
|
||||
* Generate a new file
|
||||
*/
|
||||
echo "Writing to $file.$phpEx\n";
|
||||
|
||||
if (!$fp = fopen($phpbb_root_path . 'includes/utf/data/' . $file . '.' . $phpEx, 'wb'))
|
||||
{
|
||||
trigger_error('Cannot open ' . $file . ' for write');
|
||||
}
|
||||
|
||||
fwrite($fp, '<?php');
|
||||
foreach ($contents as $var => $val)
|
||||
{
|
||||
fwrite($fp, "\n\$GLOBALS[" . my_var_export($var) . ']=' . my_var_export($val) . ";");
|
||||
}
|
||||
fclose($fp);
|
||||
}
|
||||
|
||||
echo "\n*** UTF-8 normalization tables done\n\n";
|
||||
|
||||
/**
|
||||
* Now we'll generate the files needed by the search indexer
|
||||
* Generate the files needed by the search indexer
|
||||
*/
|
||||
echo "Generating search indexer tables\n";
|
||||
|
||||
@ -424,32 +173,6 @@ die("\nAll done!\n");
|
||||
// Internal functions //
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/**
* Decompose a sequence recursively
*
* Every codepoint of the sequence that has an entry in the mapping is replaced
* by the (recursively expanded) sequence it maps to; codepoints without an
* entry are kept untouched.
*
* @param	array	$decomp_map	Decomposition mapping (codepoint => sequence), passed by reference
* @param	string	$decomp_seq	Decomposition sequence as decimal codepoints separated with a space
* @return	string				Decomposition sequence, fully decomposed
*/
function decompose(&$decomp_map, $decomp_seq)
{
	$pieces = explode(' ', $decomp_seq);

	foreach ($pieces as $idx => $codepoint)
	{
		if (!isset($decomp_map[$codepoint]))
		{
			// No further decomposition known for this codepoint
			continue;
		}

		$pieces[$idx] = decompose($decomp_map, $decomp_map[$codepoint]);
	}

	return implode(' ', $pieces);
}
|
||||
|
||||
|
||||
/**
|
||||
* Return a parsable string representation of a variable
|
||||
*
|
||||
@ -537,17 +260,6 @@ function hex_to_utf($hex)
|
||||
return cp_to_utf(hexdec($hex));
|
||||
}
|
||||
|
||||
/**
* Build a UTF-8 string out of a list of hexadecimal codepoints
*
* @param	string	$seq	Codepoints in hexadecimal, separated with a single space
* @return	string			Concatenation of the UTF-8 encoding of each codepoint
*/
function hexseq_to_utf($seq)
{
	$utf = '';

	foreach (explode(' ', $seq) as $hex)
	{
		$utf .= hex_to_utf($hex);
	}

	return $utf;
}
|
||||
|
||||
/**
|
||||
* Convert a codepoint to a UTF-8 char
|
||||
*
|
||||
|
@ -81,38 +81,3 @@ function utf8_to_unicode_callback($m)
|
||||
{
|
||||
return '\u' . str_pad(base_convert(utf8_ord($m[0]), 10, 16), 4, '0', STR_PAD_LEFT) . '';
|
||||
}
|
||||
|
||||
/**
* Normalize a string, or every element of an array of strings, to NFKC
*
* Loads the utf_normalizer class on demand. Empty input (as defined by empty())
* is handed back unchanged without touching the normalizer.
*
* @param	mixed	$strings	a string or an array of strings to normalize
* @return	mixed				the normalized content, preserving array keys if array given.
*/
function utf8_normalize_nfkc($strings)
{
	if (empty($strings))
	{
		return $strings;
	}

	if (!class_exists('utf_normalizer'))
	{
		global $phpbb_root_path, $phpEx;
		include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
	}

	if (is_array($strings))
	{
		// The normalizer works on its argument in place, so address each element by key
		foreach (array_keys($strings) as $key)
		{
			utf_normalizer::nfkc($strings[$key]);
		}

		return $strings;
	}

	utf_normalizer::nfkc($strings);

	return $strings;
}
|
||||
|
@ -1,394 +0,0 @@
|
||||
<?php
|
||||
/**
|
||||
*
|
||||
* This file is part of the phpBB Forum Software package.
|
||||
*
|
||||
* @copyright (c) phpBB Limited <https://www.phpbb.com>
|
||||
* @license GNU General Public License, version 2 (GPL-2.0)
|
||||
*
|
||||
* For full copyright and license information, please see
|
||||
* the docs/CREDITS.txt file.
|
||||
*
|
||||
*/
|
||||
|
||||
if (php_sapi_name() != 'cli')
{
	die("This program must be run from the command line.\n");
}

//
// Security message:
//
// This script is potentially dangerous.
// Remove or comment the next line (die(".... ) to enable this script.
// Do NOT FORGET to either remove this script or disable it after you have used it.
//
die("Please read the first lines of this script for instructions on how to enable it");

set_time_limit(0);
error_reporting(E_ALL);

define('IN_PHPBB', true);
$phpbb_root_path = '../';
$phpEx = substr(strrchr(__FILE__, '.'), 1);


/**
* Let's download some files we need
*/
download('http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt');
download('http://www.unicode.org/Public/UNIDATA/UnicodeData.txt');

/**
* Those are the tests we run
*
* Layout: normalization form => array(expected column => array(input columns)).
* c1..c5 are the first five ';'-separated fields of a NormalizationTest.txt line.
*/
$test_suite = array(
	/**
	* NFC
	*   c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3)
	*   c4 ==  NFC(c4) ==  NFC(c5)
	*/
	'NFC'	=>	array(
		'c2'	=>	array('c1', 'c2', 'c3'),
		'c4'	=>	array('c4', 'c5')
	),

	/**
	* NFD
	*   c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3)
	*   c5 ==  NFD(c4) ==  NFD(c5)
	*/
	'NFD'	=>	array(
		'c3'	=>	array('c1', 'c2', 'c3'),
		'c5'	=>	array('c4', 'c5')
	),

	/**
	* NFKC
	*   c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
	*/
	'NFKC'	=>	array(
		'c4'	=>	array('c1', 'c2', 'c3', 'c4', 'c5')
	),

	/**
	* NFKD
	*   c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
	*/
	'NFKD'	=>	array(
		'c5'	=>	array('c1', 'c2', 'c3', 'c4', 'c5')
	)
);

require_once($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);

/**
* Conformance test: run every form of the suite over every data line
*/
$i = $n = 0;
$failed = false;
$tested_chars = array();

$fp = fopen($phpbb_root_path . 'develop/NormalizationTest.txt', 'rb');

if (!$fp)
{
	// feof() on an invalid handle never becomes true, so stop here instead of looping forever
	die("Cannot open develop/NormalizationTest.txt for reading\n");
}

while (!feof($fp))
{
	$line = fgets($fp);

	if ($line === false)
	{
		// End of file or read error
		break;
	}

	++$n;

	if ($line[0] == '@')
	{
		// A line starting with '@' opens a new part of the test file
		if ($i)
		{
			echo "done\n";
		}

		$i = 0;
		echo "\n", substr($line, 1), "\n\n";
		continue;
	}

	// The leading space keeps a hex digit from matching at offset 0, so a falsy
	// strpos() result always means "not a data line"
	if (!strpos(' 0123456789ABCDEF', $line[0]))
	{
		continue;
	}

	if (++$i % 100 == 0)
	{
		echo $i, ' ';
	}

	list($c1, $c2, $c3, $c4, $c5) = explode(';', $line);
	$columns = compact('c1', 'c2', 'c3', 'c4', 'c5');

	if (!strpos($c1, ' '))
	{
		/**
		* We are currently testing a single character, we add it to the list of
		* characters we have processed so that we can exclude it when testing
		* for invariants
		*/
		$tested_chars[$c1] = 1;
	}

	foreach ($test_suite as $form => $serie)
	{
		// Same method name the invariant check further down uses (lower case)
		$method = strtolower($form);

		foreach ($serie as $expected => $tests)
		{
			$hex_expected = $columns[$expected];
			$utf_expected = hexseq_to_utf($hex_expected);

			foreach ($tests as $test)
			{
				// Normalize the INPUT column. Starting from the expected value
				// would only ever check that the expected value is stable.
				$utf_result = hexseq_to_utf($columns[$test]);

				// Direct static call: the normalizer rewrites its argument in
				// place, and call_user_func() does not pass arguments by reference
				utf_normalizer::$method($utf_result);

				if (strcmp($utf_expected, $utf_result))
				{
					$failed = true;
					$hex_result = utf_to_hexseq($utf_result);

					echo "\nFAILED $expected == $form($test) ($hex_expected != $hex_result)";
				}
			}
		}

		if ($failed)
		{
			die("\n\nFailed at line $n\n");
		}
	}
}
fclose($fp);

/**
* Test for invariants
*
* Every character of UnicodeData.txt that was not covered as a single-character
* c1 above is expected to be left unchanged by all four forms.
*/
echo "\n\nTesting for invariants...\n\n";

$fp = fopen($phpbb_root_path . 'develop/UnicodeData.txt', 'rt');

if (!$fp)
{
	die("Cannot open develop/UnicodeData.txt for reading\n");
}

$n = 0;
while (!feof($fp))
{
	if (++$n % 100 == 0)
	{
		echo $n, ' ';
	}

	$line = fgets($fp, 1024);

	if ($line === false)
	{
		// End of file or read error
		break;
	}

	if (!$pos = strpos($line, ';'))
	{
		continue;
	}

	$hex_tested = $hex_expected = substr($line, 0, $pos);

	if (isset($tested_chars[$hex_tested]))
	{
		continue;
	}

	$utf_expected = hex_to_utf($hex_expected);

	if ($utf_expected >= UTF8_SURROGATE_FIRST
	 && $utf_expected <= UTF8_SURROGATE_LAST)
	{
		/**
		* Surrogates are illegal on their own, we expect the normalizer
		* to return a replacement char
		*/
		$utf_expected = UTF8_REPLACEMENT;
		$hex_expected = utf_to_hexseq($utf_expected);
	}

	foreach (array('nfc', 'nfkc', 'nfd', 'nfkd') as $form)
	{
		$utf_result = $utf_expected;
		utf_normalizer::$form($utf_result);
		$hex_result = utf_to_hexseq($utf_result);

		if (strcmp($utf_expected, $utf_result))
		{
			$failed = true;

			echo "\nFAILED $hex_expected == $form($hex_tested) ($hex_expected != $hex_result)";
		}
	}

	if ($failed)
	{
		die("\n\nFailed at line $n\n");
	}
}
fclose($fp);

die("\n\nALL TESTS PASSED SUCCESSFULLY\n");
|
||||
|
||||
/**
* Download a file to the develop/ dir
*
* Nothing is fetched when a file with the same base name already exists there.
* Progress is printed as a running "N KiB" counter; the script dies if either
* the source or the destination cannot be opened.
*
* @param	string	$url		URL of the file to download
* @return	null
*/
function download($url)
{
	global $phpbb_root_path;

	$filename = basename($url);
	$target = $phpbb_root_path . 'develop/' . $filename;

	if (file_exists($target))
	{
		return;
	}

	echo 'Downloading from ', $url, ' ';

	$fpr = fopen($url, 'rb');
	if (!$fpr)
	{
		die("Can't download from $url\nPlease download it yourself and put it in the develop/ dir, kthxbai");
	}

	$fpw = fopen($target, 'wb');
	if (!$fpw)
	{
		die("Can't open develop/" . $filename . " for output... please check your permissions or something");
	}

	$written = 0;
	$progress = '';

	while (!feof($fpr))
	{
		$written += fwrite($fpw, fread($fpr, 32768));

		// Backspace over the previous counter before printing the new one
		echo str_repeat("\x08", strlen($progress));

		$progress = ($written >> 10) . ' KiB';
		echo $progress;
	}

	fclose($fpr);
	fclose($fpw);

	echo "\n";
}
|
||||
|
||||
/**
* Convert a UTF string to a sequence of codepoints in hexadecimal
*
* The width of each character is taken from the high nibble of its first byte
* (0xC/0xD: 2 bytes, 0xE: 3 bytes, 0xF: 4 bytes, anything else: 1 byte).
*
* @param	string	$str	UTF string
* @return	string			Upper case hexadecimal codepoints, zero-padded to at least
*							four digits and separated with a space
*/
function utf_to_hexseq($str)
{
	$hex_codes = array();
	$length = strlen($str);

	for ($offset = 0; $offset < $length; $offset += $width)
	{
		$lead = ord($str[$offset]);

		if ($lead >= 0xF0)
		{
			$width = 4;
		}
		else if ($lead >= 0xE0)
		{
			$width = 3;
		}
		else if ($lead >= 0xC0)
		{
			$width = 2;
		}
		else
		{
			$width = 1;
		}

		$codepoint = utf_to_cp(substr($str, $offset, $width));
		$hex_codes[] = str_pad(strtoupper(dechex($codepoint)), 4, '0', STR_PAD_LEFT);
	}

	return implode(' ', $hex_codes);
}
|
||||
|
||||
/**
* Convert a UTF-8 char to its codepoint
*
* Only the byte count decides how the char is decoded; the script dies when
* the input is not 1 to 4 bytes long.
*
* @param	string	$utf_char	UTF-8 char
* @return	integer				Unicode codepoint
*/
function utf_to_cp($utf_char)
{
	// Payload bits carried by the first byte, indexed by the length of the char
	$lead_masks = array(1 => 0xFF, 2 => 0x1F, 3 => 0x0F, 4 => 0x07);

	$length = strlen($utf_char);

	if (!isset($lead_masks[$length]))
	{
		die('UTF-8 chars can only be 1-4 bytes long');
	}

	$codepoint = ord($utf_char[0]) & $lead_masks[$length];

	// Each following byte contributes its six low bits
	for ($pos = 1; $pos < $length; ++$pos)
	{
		$codepoint = ($codepoint << 6) | (ord($utf_char[$pos]) & 0x3F);
	}

	return $codepoint;
}
|
||||
|
||||
/**
* Build a UTF-8 string out of a list of hexadecimal codepoints
*
* @param	string	$seq	Codepoints in hexadecimal, separated with a single space
* @return	string			Concatenation of the UTF-8 encoding of each codepoint
*/
function hexseq_to_utf($seq)
{
	$utf = '';

	foreach (explode(' ', $seq) as $hex)
	{
		$utf .= hex_to_utf($hex);
	}

	return $utf;
}
|
||||
|
||||
/**
* Convert a codepoint in hexadecimal to a UTF-8 char
*
* @param	string	$hex		Codepoint, in hexadecimal
* @return	string				UTF-8 char
*/
function hex_to_utf($hex)
{
	$codepoint = hexdec($hex);

	return cp_to_utf($codepoint);
}
|
||||
|
||||
/**
* Convert a codepoint to a UTF-8 char
*
* Codepoints up to 0x7F take one byte, up to 0x7FF two bytes, up to 0xFFFF
* three bytes and anything above four bytes.
*
* @param	integer	$cp	Unicode codepoint
* @return	string		UTF-8 string
*/
function cp_to_utf($cp)
{
	if ($cp <= 0x7F)
	{
		return chr($cp);
	}

	// Last continuation byte, shared by every multi-byte form
	$last = chr(0x80 | ($cp & 0x3F));

	if ($cp <= 0x7FF)
	{
		return chr(0xC0 | ($cp >> 6)) . $last;
	}

	// Second-to-last continuation byte, shared by the 3 and 4 byte forms
	$before_last = chr(0x80 | (($cp >> 6) & 0x3F));

	if ($cp <= 0xFFFF)
	{
		return chr(0xE0 | ($cp >> 12)) . $before_last . $last;
	}

	return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . $before_last . $last;
}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -1,4 +0,0 @@
|
||||
<?php
|
||||
$GLOBALS['utf_jamo_index']=array('ᄀ'=>44032,'ᄁ'=>44620,'ᄂ'=>45208,'ᄃ'=>45796,'ᄄ'=>46384,'ᄅ'=>46972,'ᄆ'=>47560,'ᄇ'=>48148,'ᄈ'=>48736,'ᄉ'=>49324,'ᄊ'=>49912,'ᄋ'=>50500,'ᄌ'=>51088,'ᄍ'=>51676,'ᄎ'=>52264,'ᄏ'=>52852,'ᄐ'=>53440,'ᄑ'=>54028,'ᄒ'=>54616,'ᅡ'=>0,'ᅢ'=>28,'ᅣ'=>56,'ᅤ'=>84,'ᅥ'=>112,'ᅦ'=>140,'ᅧ'=>168,'ᅨ'=>196,'ᅩ'=>224,'ᅪ'=>252,'ᅫ'=>280,'ᅬ'=>308,'ᅭ'=>336,'ᅮ'=>364,'ᅯ'=>392,'ᅰ'=>420,'ᅱ'=>448,'ᅲ'=>476,'ᅳ'=>504,'ᅴ'=>532,'ᅵ'=>560,'ᆧ'=>0,'ᆨ'=>1,'ᆩ'=>2,'ᆪ'=>3,'ᆫ'=>4,'ᆬ'=>5,'ᆭ'=>6,'ᆮ'=>7,'ᆯ'=>8,'ᆰ'=>9,'ᆱ'=>10,'ᆲ'=>11,'ᆳ'=>12,'ᆴ'=>13,'ᆵ'=>14,'ᆶ'=>15,'ᆷ'=>16,'ᆸ'=>17,'ᆹ'=>18,'ᆺ'=>19,'ᆻ'=>20,'ᆼ'=>21,'ᆽ'=>22,'ᆾ'=>23,'ᆿ'=>24,'ᇀ'=>25,'ᇁ'=>26,'ᇂ'=>27);
|
||||
$GLOBALS['utf_jamo_type']=array('ᄀ'=>0,'ᄁ'=>0,'ᄂ'=>0,'ᄃ'=>0,'ᄄ'=>0,'ᄅ'=>0,'ᄆ'=>0,'ᄇ'=>0,'ᄈ'=>0,'ᄉ'=>0,'ᄊ'=>0,'ᄋ'=>0,'ᄌ'=>0,'ᄍ'=>0,'ᄎ'=>0,'ᄏ'=>0,'ᄐ'=>0,'ᄑ'=>0,'ᄒ'=>0,'ᅡ'=>1,'ᅢ'=>1,'ᅣ'=>1,'ᅤ'=>1,'ᅥ'=>1,'ᅦ'=>1,'ᅧ'=>1,'ᅨ'=>1,'ᅩ'=>1,'ᅪ'=>1,'ᅫ'=>1,'ᅬ'=>1,'ᅭ'=>1,'ᅮ'=>1,'ᅯ'=>1,'ᅰ'=>1,'ᅱ'=>1,'ᅲ'=>1,'ᅳ'=>1,'ᅴ'=>1,'ᅵ'=>1,'ᆧ'=>2,'ᆨ'=>2,'ᆩ'=>2,'ᆪ'=>2,'ᆫ'=>2,'ᆬ'=>2,'ᆭ'=>2,'ᆮ'=>2,'ᆯ'=>2,'ᆰ'=>2,'ᆱ'=>2,'ᆲ'=>2,'ᆳ'=>2,'ᆴ'=>2,'ᆵ'=>2,'ᆶ'=>2,'ᆷ'=>2,'ᆸ'=>2,'ᆹ'=>2,'ᆺ'=>2,'ᆻ'=>2,'ᆼ'=>2,'ᆽ'=>2,'ᆾ'=>2,'ᆿ'=>2,'ᇀ'=>2,'ᇁ'=>2,'ᇂ'=>2);
|
||||
$GLOBALS['utf_combining_class']=array('̀'=>230,'́'=>230,'̂'=>230,'̃'=>230,'̄'=>230,'̅'=>230,'̆'=>230,'̇'=>230,'̈'=>230,'̉'=>230,'̊'=>230,'̋'=>230,'̌'=>230,'̍'=>230,'̎'=>230,'̏'=>230,'̐'=>230,'̑'=>230,'̒'=>230,'̓'=>230,'̔'=>230,'̕'=>232,'̖'=>220,'̗'=>220,'̘'=>220,'̙'=>220,'̚'=>232,'̛'=>216,'̜'=>220,'̝'=>220,'̞'=>220,'̟'=>220,'̠'=>220,'̡'=>202,'̢'=>202,'̣'=>220,'̤'=>220,'̥'=>220,'̦'=>220,'̧'=>202,'̨'=>202,'̩'=>220,'̪'=>220,'̫'=>220,'̬'=>220,'̭'=>220,'̮'=>220,'̯'=>220,'̰'=>220,'̱'=>220,'̲'=>220,'̳'=>220,'̴'=>1,'̵'=>1,'̶'=>1,'̷'=>1,'̸'=>1,'̹'=>220,'̺'=>220,'̻'=>220,'̼'=>220,'̽'=>230,'̾'=>230,'̿'=>230,'̀'=>230,'́'=>230,'͂'=>230,'̓'=>230,'̈́'=>230,'ͅ'=>240,'͆'=>230,'͇'=>220,'͈'=>220,'͉'=>220,'͊'=>230,'͋'=>230,'͌'=>230,'͍'=>220,'͎'=>220,'͐'=>230,'͑'=>230,'͒'=>230,'͓'=>220,'͔'=>220,'͕'=>220,'͖'=>220,'͗'=>230,'͘'=>232,'͙'=>220,'͚'=>220,'͛'=>230,'͜'=>233,'͝'=>234,'͞'=>234,'͟'=>233,'͠'=>234,'͡'=>234,'͢'=>233,'ͣ'=>230,'ͤ'=>230,'ͥ'=>230,'ͦ'=>230,'ͧ'=>230,'ͨ'=>230,'ͩ'=>230,'ͪ'=>230,'ͫ'=>230,'ͬ'=>230,'ͭ'=>230,'ͮ'=>230,'ͯ'=>230,'҃'=>230,'҄'=>230,'҅'=>230,'҆'=>230,'֑'=>220,'֒'=>230,'֓'=>230,'֔'=>230,'֕'=>230,'֖'=>220,'֗'=>230,'֘'=>230,'֙'=>230,'֚'=>222,'֛'=>220,'֜'=>230,'֝'=>230,'֞'=>230,'֟'=>230,'֠'=>230,'֡'=>230,'֢'=>220,'֣'=>220,'֤'=>220,'֥'=>220,'֦'=>220,'֧'=>220,'֨'=>230,'֩'=>230,'֪'=>220,'֫'=>230,'֬'=>230,'֭'=>222,'֮'=>228,'֯'=>230,'ְ'=>10,'ֱ'=>11,'ֲ'=>12,'ֳ'=>13,'ִ'=>14,'ֵ'=>15,'ֶ'=>16,'ַ'=>17,'ָ'=>18,'ֹ'=>19,'ֺ'=>19,'ֻ'=>20,'ּ'=>21,'ֽ'=>22,'ֿ'=>23,'ׁ'=>24,'ׂ'=>25,'ׄ'=>230,'ׅ'=>220,'ׇ'=>18,'ؐ'=>230,'ؑ'=>230,'ؒ'=>230,'ؓ'=>230,'ؔ'=>230,'ؕ'=>230,'ً'=>27,'ٌ'=>28,'ٍ'=>29,'َ'=>30,'ُ'=>31,'ِ'=>32,'ّ'=>33,'ْ'=>34,'ٓ'=>230,'ٔ'=>230,'ٕ'=>220,'ٖ'=>220,'ٗ'=>230,'٘'=>230,'ٙ'=>230,'ٚ'=>230,'ٛ'=>230,'ٜ'=>220,'ٝ'=>230,'ٞ'=>230,'ٰ'=>35,'ۖ'=>230,'ۗ'=>230,'ۘ'=>230,'ۙ'=>230,'ۚ'=>230,'ۛ'=>230,'ۜ'=>230,'۟'=>230,'۠'=>230,'ۡ'=>230,'ۢ'=>230,'ۣ'=>220,'ۤ'=>230,'ۧ'=>230,'ۨ'=>230,'۪'=>220,'۫'=>230,'۬'=>230,'ۭ'=>220,'ܑ'=>36,'ܰ'=>230,'ܱ'=>220,'ܲ'=>230,'ܳ'=>230,'ܴ'=>220,'ܵ'=>230,'ܶ'=>230,'ܷ'=>220,'ܸ'=>220,'
ܹ'=>220,'ܺ'=>230,'ܻ'=>220,'ܼ'=>220,'ܽ'=>230,'ܾ'=>220,'ܿ'=>230,'݀'=>230,'݁'=>230,'݂'=>220,'݃'=>230,'݄'=>220,'݅'=>230,'݆'=>220,'݇'=>230,'݈'=>220,'݉'=>230,'݊'=>230,'߫'=>230,'߬'=>230,'߭'=>230,'߮'=>230,'߯'=>230,'߰'=>230,'߱'=>230,'߲'=>220,'߳'=>230,'़'=>7,'्'=>9,'॑'=>230,'॒'=>220,'॓'=>230,'॔'=>230,'়'=>7,'্'=>9,'਼'=>7,'੍'=>9,'઼'=>7,'્'=>9,'଼'=>7,'୍'=>9,'்'=>9,'్'=>9,'ౕ'=>84,'ౖ'=>91,'಼'=>7,'್'=>9,'്'=>9,'්'=>9,'ุ'=>103,'ู'=>103,'ฺ'=>9,'่'=>107,'้'=>107,'๊'=>107,'๋'=>107,'ຸ'=>118,'ູ'=>118,'່'=>122,'້'=>122,'໊'=>122,'໋'=>122,'༘'=>220,'༙'=>220,'༵'=>220,'༷'=>220,'༹'=>216,'ཱ'=>129,'ི'=>130,'ུ'=>132,'ེ'=>130,'ཻ'=>130,'ོ'=>130,'ཽ'=>130,'ྀ'=>130,'ྂ'=>230,'ྃ'=>230,'྄'=>9,'྆'=>230,'྇'=>230,'࿆'=>220,'့'=>7,'္'=>9,'፟'=>230,'᜔'=>9,'᜴'=>9,'្'=>9,'៝'=>230,'ᢩ'=>228,'᤹'=>222,'᤺'=>230,'᤻'=>220,'ᨗ'=>230,'ᨘ'=>220,'᬴'=>7,'᭄'=>9,'᭫'=>230,'᭬'=>220,'᭭'=>230,'᭮'=>230,'᭯'=>230,'᭰'=>230,'᭱'=>230,'᭲'=>230,'᭳'=>230,'᷀'=>230,'᷁'=>230,'᷂'=>220,'᷃'=>230,'᷄'=>230,'᷅'=>230,'᷆'=>230,'᷇'=>230,'᷈'=>230,'᷉'=>230,'᷊'=>220,'᷾'=>230,'᷿'=>220,'⃐'=>230,'⃑'=>230,'⃒'=>1,'⃓'=>1,'⃔'=>230,'⃕'=>230,'⃖'=>230,'⃗'=>230,'⃘'=>1,'⃙'=>1,'⃚'=>1,'⃛'=>230,'⃜'=>230,'⃡'=>230,'⃥'=>1,'⃦'=>1,'⃧'=>230,'⃨'=>220,'⃩'=>230,'⃪'=>1,'⃫'=>1,'⃬'=>220,'⃭'=>220,'⃮'=>220,'⃯'=>220,'〪'=>218,'〫'=>228,'〬'=>232,'〭'=>222,'〮'=>224,'〯'=>224,'゙'=>8,'゚'=>8,'꠆'=>9,'ﬞ'=>26,'︠'=>230,'︡'=>230,'︢'=>230,'︣'=>230,'𐨍'=>220,'𐨏'=>230,'𐨸'=>230,'𐨹'=>1,'𐨺'=>220,'𐨿'=>9,'𝅥'=>216,'𝅦'=>216,'𝅧'=>1,'𝅨'=>1,'𝅩'=>1,'𝅭'=>226,'𝅮'=>216,'𝅯'=>216,'𝅰'=>216,'𝅱'=>216,'𝅲'=>216,'𝅻'=>220,'𝅼'=>220,'𝅽'=>220,'𝅾'=>220,'𝅿'=>220,'𝆀'=>220,'𝆁'=>220,'𝆂'=>220,'𝆅'=>230,'𝆆'=>230,'𝆇'=>230,'𝆈'=>230,'𝆉'=>230,'𝆊'=>220,'𝆋'=>220,'𝆪'=>230,'𝆫'=>230,'𝆬'=>230,'𝆭'=>230,'𝉂'=>230,'𝉃'=>230,'𝉄'=>230);
|
File diff suppressed because it is too large
Load Diff
@ -867,7 +867,6 @@ function utf8_recode($string, $encoding)
|
||||
|
||||
// Trigger an error?! For now just give bad data :-(
|
||||
trigger_error('Unknown encoding: ' . $encoding, E_USER_ERROR);
|
||||
//return $string; // use utf_normalizer::cleanup() ?
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1611,14 +1610,8 @@ function utf8_case_fold_nfkc($text, $option = 'full')
|
||||
// do the case fold
|
||||
$text = utf8_case_fold($text, $option);
|
||||
|
||||
if (!class_exists('utf_normalizer'))
|
||||
{
|
||||
global $phpbb_root_path, $phpEx;
|
||||
include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
|
||||
}
|
||||
|
||||
// convert to NFKC
|
||||
utf_normalizer::nfkc($text);
|
||||
Normalizer::normalize($text, Normalizer::NFKC);
|
||||
|
||||
// FC_NFKC_Closure, http://www.unicode.org/Public/5.0.0/ucd/DerivedNormalizationProps.txt
|
||||
$text = strtr($text, $fc_nfkc_closure);
|
||||
@ -1714,106 +1707,56 @@ function utf8_case_fold_nfc($text, $option = 'full')
|
||||
return $text;
|
||||
}
|
||||
|
||||
if (extension_loaded('intl'))
|
||||
/**
|
||||
* wrapper around PHP's native normalizer from intl
|
||||
* previously a PECL extension, included in the core since PHP 5.3.0
|
||||
* http://php.net/manual/en/normalizer.normalize.php
|
||||
*
|
||||
* @param mixed $strings a string or an array of strings to normalize
|
||||
* @return mixed the normalized content, preserving array keys if array given.
|
||||
*/
|
||||
function utf8_normalize_nfc($strings)
|
||||
{
|
||||
/**
|
||||
* wrapper around PHP's native normalizer from intl
|
||||
* previously a PECL extension, included in the core since PHP 5.3.0
|
||||
* http://php.net/manual/en/normalizer.normalize.php
|
||||
*
|
||||
* @param mixed $strings a string or an array of strings to normalize
|
||||
* @return mixed the normalized content, preserving array keys if array given.
|
||||
*/
|
||||
function utf8_normalize_nfc($strings)
|
||||
if (empty($strings))
|
||||
{
|
||||
if (empty($strings))
|
||||
return $strings;
|
||||
}
|
||||
|
||||
if (!is_array($strings))
|
||||
{
|
||||
if (Normalizer::isNormalized($strings))
|
||||
{
|
||||
return $strings;
|
||||
}
|
||||
|
||||
if (!is_array($strings))
|
||||
return (string) Normalizer::normalize($strings);
|
||||
}
|
||||
else
|
||||
{
|
||||
foreach ($strings as $key => $string)
|
||||
{
|
||||
if (Normalizer::isNormalized($strings))
|
||||
if (is_array($string))
|
||||
{
|
||||
return $strings;
|
||||
}
|
||||
return (string) Normalizer::normalize($strings);
|
||||
}
|
||||
else
|
||||
{
|
||||
foreach ($strings as $key => $string)
|
||||
{
|
||||
if (is_array($string))
|
||||
foreach ($string as $_key => $_string)
|
||||
{
|
||||
foreach ($string as $_key => $_string)
|
||||
{
|
||||
if (Normalizer::isNormalized($strings[$key][$_key]))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
$strings[$key][$_key] = (string) Normalizer::normalize($strings[$key][$_key]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (Normalizer::isNormalized($strings[$key]))
|
||||
if (Normalizer::isNormalized($strings[$key][$_key]))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
$strings[$key] = (string) Normalizer::normalize($strings[$key]);
|
||||
$strings[$key][$_key] = (string) Normalizer::normalize($strings[$key][$_key]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return $strings;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
/**
|
||||
* A wrapper function for the normalizer which takes care of including the class if
|
||||
* required and modifies the passed strings to be in NFC (Normalization Form Composition).
|
||||
*
|
||||
* @param mixed $strings a string or an array of strings to normalize
|
||||
* @return mixed the normalized content, preserving array keys if array given.
|
||||
*/
|
||||
function utf8_normalize_nfc($strings)
|
||||
{
|
||||
if (empty($strings))
|
||||
{
|
||||
return $strings;
|
||||
}
|
||||
|
||||
if (!class_exists('utf_normalizer'))
|
||||
{
|
||||
global $phpbb_root_path, $phpEx;
|
||||
include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
|
||||
}
|
||||
|
||||
if (!is_array($strings))
|
||||
{
|
||||
utf_normalizer::nfc($strings);
|
||||
}
|
||||
else if (is_array($strings))
|
||||
{
|
||||
foreach ($strings as $key => $string)
|
||||
else
|
||||
{
|
||||
if (is_array($string))
|
||||
if (Normalizer::isNormalized($strings[$key]))
|
||||
{
|
||||
foreach ($string as $_key => $_string)
|
||||
{
|
||||
utf_normalizer::nfc($strings[$key][$_key]);
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
utf_normalizer::nfc($strings[$key]);
|
||||
continue;
|
||||
}
|
||||
$strings[$key] = (string) Normalizer::normalize($strings[$key]);
|
||||
}
|
||||
}
|
||||
|
||||
return $strings;
|
||||
}
|
||||
|
||||
return $strings;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -633,14 +633,8 @@ function utf8_new_case_fold_nfkc($text, $option = 'full')
|
||||
// do the case fold
|
||||
$text = utf8_new_case_fold($text, $option);
|
||||
|
||||
if (!class_exists('utf_normalizer'))
|
||||
{
|
||||
global $phpbb_root_path, $phpEx;
|
||||
include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
|
||||
}
|
||||
|
||||
// convert to NFKC
|
||||
utf_new_normalizer::nfkc($text);
|
||||
$text = Normalizer::normalize($text, Normalizer::NFKC);
|
||||
|
||||
// FC_NFKC_Closure, http://www.unicode.org/Public/5.0.0/ucd/DerivedNormalizationProps.txt
|
||||
$text = strtr($text, $fc_nfkc_closure);
|
||||
|
@ -1,197 +0,0 @@
|
||||
<?php
|
||||
/**
|
||||
*
|
||||
* This file is part of the phpBB Forum Software package.
|
||||
*
|
||||
* @copyright (c) phpBB Limited <https://www.phpbb.com>
|
||||
* @license GNU General Public License, version 2 (GPL-2.0)
|
||||
*
|
||||
* For full copyright and license information, please see
|
||||
* the docs/CREDITS.txt file.
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* @ignore
|
||||
*/
|
||||
if (!defined('IN_PHPBB'))
|
||||
{
|
||||
exit;
|
||||
}
|
||||
|
||||
/**
* Normalize a string, or an array of strings, to NFC (Normalization Form Composition).
*
* Arrays are processed up to two levels deep: every element of the passed array,
* and every element of any array nested directly inside it, is handed to
* utf_new_normalizer::nfc(), which rewrites the value in place.
* Empty input (as defined by empty()) is returned untouched.
*
* @param	mixed	$strings	a string or an array of strings to normalize
* @return	mixed	the normalized content, preserving array keys if array given.
*/
function utf8_new_normalize_nfc($strings)
{
	// Nothing to normalize
	if (empty($strings))
	{
		return $strings;
	}

	// Plain value: normalize it directly
	if (!is_array($strings))
	{
		utf_new_normalizer::nfc($strings);

		return $strings;
	}

	foreach (array_keys($strings) as $outer_key)
	{
		if (!is_array($strings[$outer_key]))
		{
			utf_new_normalizer::nfc($strings[$outer_key]);
			continue;
		}

		// One level of nesting is supported
		foreach (array_keys($strings[$outer_key]) as $inner_key)
		{
			utf_new_normalizer::nfc($strings[$outer_key][$inner_key]);
		}
	}

	return $strings;
}
|
||||
|
||||
/**
* Thin normalizer front-end.
*
* Every method is invoked statically (utf_new_normalizer::nfc($str), ...), so the
* methods are declared static: calling a non-static method statically raises
* E_STRICT/deprecation notices on older PHP versions and is an Error on PHP 8.
*
* cleanup(), nfc() and nfkc() work on their argument by reference and do not
* return a value.
*/
class utf_new_normalizer
{
	/**
	* Validate, cleanup and normalize a string
	*
	* Replaces control characters (except \r, \n and \t) with an invalid byte
	* and hands the string to recompose() for conversion to Normal Form C,
	* canonical composition.
	*
	* @param	string	&$str	The dirty string, cleaned up in place
	* @return	null
	*/
	public static function cleanup(&$str)
	{
		// The string below is the list of all authorized characters, sorted by frequency in latin text
		$pos = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D");
		$len = strlen($str);

		if ($pos == $len)
		{
			// ASCII strings with no special chars return immediately
			return;
		}

		self::load_data('utf_nfc_qc');
		self::load_data('utf_canonical_decomp');

		// Replace any byte in the range 0x00..0x1F, except for \r, \n and \t
		// We replace those characters with a 0xFF byte, which is illegal in UTF-8 and will in turn be replaced with a UTF replacement char
		$str = strtr(
			$str,
			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
			"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
		);

		$str = self::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
	}

	/**
	* Validate and normalize a UTF string to NFC
	*
	* @param	string	&$str	Unchecked UTF string, normalized in place
	* @return	null
	*/
	public static function nfc(&$str)
	{
		$pos = strspn($str, UTF8_ASCII_RANGE);
		$len = strlen($str);

		if ($pos == $len)
		{
			// ASCII strings return immediately
			return;
		}

		self::load_data('utf_nfc_qc');
		self::load_data('utf_canonical_decomp');

		$str = self::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
	}

	/**
	* Validate and normalize a UTF string to NFKC
	*
	* @param	string	&$str	Unchecked UTF string, normalized in place
	* @return	null
	*/
	public static function nfkc(&$str)
	{
		$pos = strspn($str, UTF8_ASCII_RANGE);
		$len = strlen($str);

		if ($pos == $len)
		{
			// ASCII strings return immediately
			return;
		}

		self::load_data('utf_nfkc_qc');
		self::load_data('utf_compatibility_decomp');

		$str = self::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
	}

	/**
	* Recompose a UTF string
	*
	* Makes sure the canonical composition table is loaded and then delegates
	* the actual work to utf_normalizer::recompose().
	* NOTE(review): utf_normalizer must already be loaded by the caller - confirm.
	*
	* @param	string	$str			Unchecked UTF string
	* @param	integer	$pos			Position of the first UTF char (in bytes)
	* @param	integer	$len			Length of the string (in bytes)
	* @param	array	&$qc			Quick-check array, passed by reference but never modified
	* @param	array	&$decomp_map	Decomposition mapping, passed by reference but never modified
	* @return	string	The string, validated and recomposed
	*
	* @access	private
	*/
	public static function recompose($str, $pos, $len, &$qc, &$decomp_map)
	{
		global $utf_canonical_comp;

		// Load the canonical composition table
		if (!isset($utf_canonical_comp))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);
		}

		return utf_normalizer::recompose($str, $pos, $len, $qc, $decomp_map);
	}

	/**
	* Include a data table from includes/utf/data/ unless $GLOBALS already holds it
	*
	* The table name is both the key looked up in $GLOBALS and the basename of
	* the included file; the data file is presumably what populates that key.
	*
	* @param	string	$table	Table name, e.g. 'utf_nfc_qc'
	* @return	null
	*/
	private static function load_data($table)
	{
		if (!isset($GLOBALS[$table]))
		{
			global $phpbb_root_path, $phpEx;
			include($phpbb_root_path . 'includes/utf/data/' . $table . '.' . $phpEx);
		}
	}
}
|
@ -74,7 +74,6 @@ require($phpbb_root_path . 'includes/functions.' . $phpEx);
|
||||
require($phpbb_root_path . 'includes/functions_content.' . $phpEx);
|
||||
|
||||
require($phpbb_root_path . 'includes/constants.' . $phpEx);
|
||||
include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
|
||||
require($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx);
|
||||
|
||||
// Set PHP error handler to ours
|
||||
|
@ -102,7 +102,6 @@ phpbb_require_updated('includes/functions.' . $phpEx);
|
||||
phpbb_require_updated('includes/functions_content.' . $phpEx, true);
|
||||
|
||||
phpbb_include_updated('includes/functions_admin.' . $phpEx);
|
||||
phpbb_include_updated('includes/utf/utf_normalizer.' . $phpEx);
|
||||
phpbb_include_updated('includes/utf/utf_tools.' . $phpEx);
|
||||
phpbb_require_updated('includes/functions_install.' . $phpEx);
|
||||
|
||||
|
@ -18,6 +18,13 @@ namespace phpbb\search;
|
||||
*/
|
||||
class fulltext_native extends \phpbb\search\base
|
||||
{
|
||||
const UTF8_HANGUL_FIRST = "\xEA\xB0\x80";
|
||||
const UTF8_HANGUL_LAST = "\xED\x9E\xA3";
|
||||
const UTF8_CJK_FIRST = "\xE4\xB8\x80";
|
||||
const UTF8_CJK_LAST = "\xE9\xBE\xBB";
|
||||
const UTF8_CJK_B_FIRST = "\xF0\xA0\x80\x80";
|
||||
const UTF8_CJK_B_LAST = "\xF0\xAA\x9B\x96";
|
||||
|
||||
/**
|
||||
* Associative array holding index stats
|
||||
* @var array
|
||||
@ -93,7 +100,7 @@ class fulltext_native extends \phpbb\search\base
|
||||
protected $user;
|
||||
|
||||
/**
|
||||
* Initialises the fulltext_native search backend with min/max word length and makes sure the UTF-8 normalizer is loaded
|
||||
* Initialises the fulltext_native search backend with min/max word length
|
||||
*
|
||||
* @param boolean|string &$error is passed by reference and should either be set to false on success or an error message on failure
|
||||
*/
|
||||
@ -110,10 +117,6 @@ class fulltext_native extends \phpbb\search\base
|
||||
/**
|
||||
* Load the UTF tools
|
||||
*/
|
||||
if (!class_exists('utf_normalizer'))
|
||||
{
|
||||
include($this->phpbb_root_path . 'includes/utf/utf_normalizer.' . $this->php_ext);
|
||||
}
|
||||
if (!function_exists('utf8_decode_ncr'))
|
||||
{
|
||||
include($this->phpbb_root_path . 'includes/utf/utf_tools.' . $this->php_ext);
|
||||
@ -1175,9 +1178,9 @@ class fulltext_native extends \phpbb\search\base
|
||||
* Note: this could be optimized. If the codepoint is lower than Hangul's range
|
||||
* we know that it will also be lower than CJK ranges
|
||||
*/
|
||||
if ((strncmp($word, UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, UTF8_HANGUL_LAST, 3) > 0)
|
||||
&& (strncmp($word, UTF8_CJK_FIRST, 3) < 0 || strncmp($word, UTF8_CJK_LAST, 3) > 0)
|
||||
&& (strncmp($word, UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, UTF8_CJK_B_LAST, 4) > 0))
|
||||
if ((strncmp($word, self::UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, self::UTF8_HANGUL_LAST, 3) > 0)
|
||||
&& (strncmp($word, self::UTF8_CJK_FIRST, 3) < 0 || strncmp($word, self::UTF8_CJK_LAST, 3) > 0)
|
||||
&& (strncmp($word, self::UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, self::UTF8_CJK_B_LAST, 4) > 0))
|
||||
{
|
||||
$word = strtok(' ');
|
||||
continue;
|
||||
@ -1544,8 +1547,6 @@ class fulltext_native extends \phpbb\search\base
|
||||
* @param string $allowed_chars String of special chars to allow
|
||||
* @param string $encoding Text encoding
|
||||
* @return string Cleaned up text, only alphanumeric chars are left
|
||||
*
|
||||
* @todo \normalizer::cleanup being able to be used?
|
||||
*/
|
||||
protected function cleanup($text, $allowed_chars = null, $encoding = 'utf-8')
|
||||
{
|
||||
@ -1572,12 +1573,9 @@ class fulltext_native extends \phpbb\search\base
|
||||
$text = htmlspecialchars_decode(utf8_decode_ncr($text), ENT_QUOTES);
|
||||
|
||||
/**
|
||||
* Load the UTF-8 normalizer
|
||||
*
|
||||
* If we use it more widely, an instance of that class should be held in a
|
||||
* a global variable instead
|
||||
* Normalize to NFC
|
||||
*/
|
||||
\utf_normalizer::nfc($text);
|
||||
$text = \Normalizer::normalize($text);
|
||||
|
||||
/**
|
||||
* The first thing we do is:
|
||||
@ -1670,9 +1668,9 @@ class fulltext_native extends \phpbb\search\base
|
||||
$utf_char = substr($text, $pos, $utf_len);
|
||||
$pos += $utf_len;
|
||||
|
||||
if (($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST)
|
||||
|| ($utf_char >= UTF8_CJK_FIRST && $utf_char <= UTF8_CJK_LAST)
|
||||
|| ($utf_char >= UTF8_CJK_B_FIRST && $utf_char <= UTF8_CJK_B_LAST))
|
||||
if (($utf_char >= self::UTF8_HANGUL_FIRST && $utf_char <= self::UTF8_HANGUL_LAST)
|
||||
|| ($utf_char >= self::UTF8_CJK_FIRST && $utf_char <= self::UTF8_CJK_LAST)
|
||||
|| ($utf_char >= self::UTF8_CJK_B_FIRST && $utf_char <= self::UTF8_CJK_B_LAST))
|
||||
{
|
||||
/**
|
||||
* All characters within these ranges are valid
|
||||
|
@ -120,8 +120,9 @@ directory (above phpBB):
|
||||
Slow tests
|
||||
--------------
|
||||
|
||||
Certain tests, such as the UTF-8 normalizer or the DNS tests tend to be slow.
|
||||
Thus these tests are in the `slow` group, which is excluded by default. If you
|
||||
Certain tests, such as the DNS tests, tend to be slow.
|
||||
Thus these tests are in the `slow` group, which is excluded by default. You can
|
||||
enable slow tests by copying the phpunit.xml.all file to phpunit.xml. If you
|
||||
only want the slow tests, run:
|
||||
|
||||
$ phpBB/vendor/bin/phpunit --group slow
|
||||
|
@ -1,327 +0,0 @@
|
||||
<?php
|
||||
/**
|
||||
*
|
||||
* This file is part of the phpBB Forum Software package.
|
||||
*
|
||||
* @copyright (c) phpBB Limited <https://www.phpbb.com>
|
||||
* @license GNU General Public License, version 2 (GPL-2.0)
|
||||
*
|
||||
* For full copyright and license information, please see
|
||||
* the docs/CREDITS.txt file.
|
||||
*
|
||||
*/
|
||||
|
||||
require_once dirname(__FILE__) . '/../../phpBB/includes/utf/utf_normalizer.php';
|
||||
|
||||
/**
|
||||
* @group slow
|
||||
*/
|
||||
class phpbb_utf_normalizer_test extends phpbb_test_case
{
	/** Directory the Unicode data files are downloaded to */
	static private $data_dir;

	/**
	* Fetch the Unicode conformance data files once for the whole test case
	*/
	static public function setUpBeforeClass()
	{
		self::$data_dir = dirname(__FILE__) . '/../tmp';
		self::download('http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt', self::$data_dir);
		self::download('http://www.unicode.org/Public/UNIDATA/UnicodeData.txt', self::$data_dir);
	}

	/**
	* Run the conformance checks described in NormalizationTest.txt
	*
	* Every data line holds five columns (c1..c5) of space-separated hexadecimal
	* codepoints. For each normalization form, each listed input column is
	* normalized and compared with the expected column.
	*
	* @return	array	Hex codepoints (as keys) of the single characters covered by the data file
	*/
	public function test_normalizer()
	{
		$test_suite = array(
			/**
			* NFC
			*   c2 == NFC(c1) == NFC(c2) == NFC(c3)
			*   c4 == NFC(c4) == NFC(c5)
			*/
			'NFC'	=>	array(
				'c2'	=>	array('c1', 'c2', 'c3'),
				'c4'	=>	array('c4', 'c5')
			),

			/**
			* NFD
			*   c3 == NFD(c1) == NFD(c2) == NFD(c3)
			*   c5 == NFD(c4) == NFD(c5)
			*/
			'NFD'	=>	array(
				'c3'	=>	array('c1', 'c2', 'c3'),
				'c5'	=>	array('c4', 'c5')
			),

			/**
			* NFKC
			*   c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
			*/
			'NFKC'	=>	array(
				'c4'	=>	array('c1', 'c2', 'c3', 'c4', 'c5')
			),

			/**
			* NFKD
			*   c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
			*/
			'NFKD'	=>	array(
				'c5'	=>	array('c1', 'c2', 'c3', 'c4', 'c5')
			)
		);

		$tested_chars = array();

		$fp = $this->open_data_file('NormalizationTest.txt');

		// Reading until fgets() fails avoids handling a bogus "false" line at EOF
		while (($line = fgets($fp)) !== false)
		{
			// Data lines start with a hex digit; part headers (@...), comments and blank lines do not
			if ($line === '' || strpos('0123456789ABCDEF', $line[0]) === false)
			{
				continue;
			}

			list($c1, $c2, $c3, $c4, $c5) = explode(';', $line);
			$columns = compact('c1', 'c2', 'c3', 'c4', 'c5');

			if (strpos($c1, ' ') === false)
			{
				/**
				* We are currently testing a single character, we add it to the list of
				* characters we have processed so that we can exclude it when testing
				* for invariants
				*/
				$tested_chars[$c1] = 1;
			}

			foreach ($test_suite as $form => $serie)
			{
				foreach ($serie as $expected => $tests)
				{
					$hex_expected = $columns[$expected];
					$utf_expected = $this->hexseq_to_utf($hex_expected);

					foreach ($tests as $test)
					{
						// Normalize the input column itself; feeding the expected value
						// to the normalizer would only ever test idempotence
						$utf_result = $this->hexseq_to_utf($columns[$test]);
						call_user_func_array(array('utf_normalizer', $form), array(&$utf_result));

						$hex_result = $this->utf_to_hexseq($utf_result);
						$this->assertEquals($utf_expected, $utf_result, "$expected == $form($test) ($hex_expected != $hex_result)");
					}
				}
			}
		}
		fclose($fp);

		return $tested_chars;
	}

	/**
	* Check that characters not covered by NormalizationTest.txt are left
	* untouched by every normalization form
	*
	* @depends test_normalizer
	*
	* @param	array	$tested_chars	Hex codepoints (as keys) already covered by test_normalizer()
	*/
	public function test_invariants(array $tested_chars)
	{
		$fp = $this->open_data_file('UnicodeData.txt');

		while (($line = fgets($fp, 1024)) !== false)
		{
			if (!$pos = strpos($line, ';'))
			{
				continue;
			}

			$hex_tested = $hex_expected = substr($line, 0, $pos);

			if (isset($tested_chars[$hex_tested]))
			{
				continue;
			}

			$utf_tested = $utf_expected = $this->hex_to_utf($hex_tested);

			if ($utf_tested >= UTF8_SURROGATE_FIRST
				&& $utf_tested <= UTF8_SURROGATE_LAST)
			{
				/**
				* Surrogates are illegal on their own, we expect the normalizer
				* to return a replacement char
				*/
				$utf_expected = UTF8_REPLACEMENT;
				$hex_expected = $this->utf_to_hexseq($utf_expected);
			}

			foreach (array('nfc', 'nfkc', 'nfd', 'nfkd') as $form)
			{
				// Feed the tested character (not the expected output) to the normalizer
				$utf_result = $utf_tested;
				call_user_func_array(array('utf_normalizer', $form), array(&$utf_result));
				$hex_result = $this->utf_to_hexseq($utf_result);

				$this->assertEquals($utf_expected, $utf_result, "$hex_expected == $form($hex_tested) ($hex_expected != $hex_result)");
			}
		}
		fclose($fp);
	}

	/**
	* Open one of the downloaded data files for reading
	*
	* Fails loudly instead of handing "false" to feof()/fgets(), which would
	* otherwise loop forever or raise a type error.
	*
	* @param	string	$filename	Name of the file inside the data directory
	* @return	resource	Read handle
	* @throws	RuntimeException	If the file is missing or cannot be opened
	*/
	protected function open_data_file($filename)
	{
		$path = self::$data_dir . '/' . $filename;
		$fp = is_readable($path) ? fopen($path, 'rb') : false;

		if (!$fp)
		{
			throw new RuntimeException("Unable to open Unicode data file $path");
		}

		return $fp;
	}

	/**
	* Convert a UTF string to a sequence of codepoints in hexadecimal
	*
	* @param	string	$str	UTF string
	* @return	string	Unicode codepoints in uppercase hex (at least 4 digits each), separated with a space
	*/
	protected function utf_to_hexseq($str)
	{
		$pos = 0;
		$len = strlen($str);
		$ret = array();

		while ($pos < $len)
		{
			$c = $str[$pos];

			// The high nibble of the lead byte gives the length of the sequence
			switch ($c & "\xF0")
			{
				case "\xC0":
				case "\xD0":
					$utf_char = substr($str, $pos, 2);
					$pos += 2;
				break;

				case "\xE0":
					$utf_char = substr($str, $pos, 3);
					$pos += 3;
				break;

				case "\xF0":
					$utf_char = substr($str, $pos, 4);
					$pos += 4;
				break;

				default:
					$utf_char = $c;
					++$pos;
			}

			$ret[] = sprintf('%04X', $this->utf_to_cp($utf_char));
		}

		return implode(' ', $ret);
	}

	/**
	* Convert a UTF-8 char to its codepoint
	*
	* @param	string	$utf_char	UTF-8 char
	* @return	integer	Unicode codepoint
	* @throws	RuntimeException	If the char is not 1-4 bytes long
	*/
	protected function utf_to_cp($utf_char)
	{
		switch (strlen($utf_char))
		{
			case 1:
				return ord($utf_char);

			case 2:
				return ((ord($utf_char[0]) & 0x1F) << 6) | (ord($utf_char[1]) & 0x3F);

			case 3:
				return ((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F);

			case 4:
				return ((ord($utf_char[0]) & 0x07) << 18) | ((ord($utf_char[1]) & 0x3F) << 12) | ((ord($utf_char[2]) & 0x3F) << 6) | (ord($utf_char[3]) & 0x3F);

			default:
				throw new RuntimeException('UTF-8 chars can only be 1-4 bytes long');
		}
	}

	/**
	* Return a UTF string formed from a sequence of codepoints in hexadecimal
	*
	* @param	string	$seq	Sequence of codepoints, separated with a space
	* @return	string	UTF-8 string
	*/
	protected function hexseq_to_utf($seq)
	{
		return implode('', array_map(array($this, 'hex_to_utf'), explode(' ', $seq)));
	}

	/**
	* Convert a codepoint in hexadecimal to a UTF-8 char
	*
	* @param	string	$hex	Codepoint, in hexadecimal
	* @return	string	UTF-8 char
	*/
	protected function hex_to_utf($hex)
	{
		return $this->cp_to_utf(hexdec($hex));
	}

	/**
	* Convert a codepoint to a UTF-8 char
	*
	* @param	integer	$cp	Unicode codepoint
	* @return	string	UTF-8 string
	*/
	protected function cp_to_utf($cp)
	{
		if ($cp > 0xFFFF)
		{
			return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
		}
		else if ($cp > 0x7FF)
		{
			return chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
		}
		else if ($cp > 0x7F)
		{
			return chr(0xC0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3F));
		}
		else
		{
			return chr($cp);
		}
	}

	/**
	* Chunked download helper
	*
	* Does nothing when the target file already exists. Failures are reported
	* on standard output and otherwise ignored (best effort).
	*
	* @param	string	$url	URL of the file to fetch
	* @param	string	$to		Directory the file is stored in, under the basename of the URL
	*/
	static protected function download($url, $to)
	{
		$target = $to . '/' . basename($url);

		if (file_exists($target))
		{
			return;
		}

		if (!$fpr = fopen($url, 'rb'))
		{
			echo "Failed to download $url\n";
			return;
		}

		if (!$fpw = fopen($target, 'wb'))
		{
			// Do not leak the already opened source handle
			fclose($fpr);
			echo "Failed to open $target for writing\n";
			return;
		}

		$chunk = 32768;

		while (!feof($fpr))
		{
			$data = fread($fpr, $chunk);

			if ($data === false)
			{
				// Read error: stop instead of spinning on a stream that never reaches EOF
				break;
			}

			fwrite($fpw, $data);
		}
		fclose($fpr);
		fclose($fpw);
	}
}
|
Loading…
x
Reference in New Issue
Block a user