[feature/patchwork-utf8] Normalize with intl, use patchwork/utf8 as fallback

2025-08-09 10:16:36 +02:00 · 2013-11-20 13:47:31 +01:00
parent 1601b61ef9
commit 5a7caf6508
20 changed files with 110 additions and 2883 deletions
--- a/phpBB/develop/generate_utf_tables.php
+++ b/phpBB/develop/generate_utf_tables.php
@@ -32,262 +32,11 @@ $phpbb_root_path = '../';
 $phpEx = substr(strrchr(__FILE__, '.'), 1);

 echo "Checking for required files\n";
-download('http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt');
-download('http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt');
 download('http://www.unicode.org/Public/UNIDATA/UnicodeData.txt');
 echo "\n";

-require_once($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
-$file_contents = array();
-
 /**
-* Generate some Hangul/Jamo stuff
-*/
-echo "\nGenerating Hangul and Jamo tables\n";
-for ($i = 0; $i < UNICODE_HANGUL_LCOUNT; ++$i)
-{
-	$utf_char = cp_to_utf(UNICODE_HANGUL_LBASE + $i);
-	$file_contents['utf_normalizer_common']['utf_jamo_index'][$utf_char] = $i * UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT + UNICODE_HANGUL_SBASE;
-	$file_contents['utf_normalizer_common']['utf_jamo_type'][$utf_char] = UNICODE_JAMO_L;
-}
-
-for ($i = 0; $i < UNICODE_HANGUL_VCOUNT; ++$i)
-{
-	$utf_char = cp_to_utf(UNICODE_HANGUL_VBASE + $i);
-	$file_contents['utf_normalizer_common']['utf_jamo_index'][$utf_char] = $i * UNICODE_HANGUL_TCOUNT;
-	$file_contents['utf_normalizer_common']['utf_jamo_type'][$utf_char] = UNICODE_JAMO_V;
-}
-
-for ($i = 0; $i < UNICODE_HANGUL_TCOUNT; ++$i)
-{
-	$utf_char = cp_to_utf(UNICODE_HANGUL_TBASE + $i);
-	$file_contents['utf_normalizer_common']['utf_jamo_index'][$utf_char] = $i;
-	$file_contents['utf_normalizer_common']['utf_jamo_type'][$utf_char] = UNICODE_JAMO_T;
-}
-
-/**
-* Load the CompositionExclusions table
-*/
-echo "Loading CompositionExclusion\n";
-$fp = fopen('CompositionExclusions.txt', 'rt');
-
-$exclude = array();
-while (!feof($fp))
-{
-	$line = fgets($fp, 1024);
-
-	if (!strpos(' 0123456789ABCDEFabcdef', $line[0]))
-	{
-		continue;
-	}
-
-	$cp = strtok($line, ' ');
-
-	if ($pos = strpos($cp, '..'))
-	{
-		$start = hexdec(substr($cp, 0, $pos));
-		$end = hexdec(substr($cp, $pos + 2));
-
-		for ($i = $start; $i < $end; ++$i)
-		{
-			$exclude[$i] = 1;
-		}
-	}
-	else
-	{
-		$exclude[hexdec($cp)] = 1;
-	}
-}
-fclose($fp);
-
-/**
-* Load QuickCheck tables
-*/
-echo "Generating QuickCheck tables\n";
-$fp = fopen('DerivedNormalizationProps.txt', 'rt');
-
-while (!feof($fp))
-{
-	$line = fgets($fp, 1024);
-
-	if (!strpos(' 0123456789ABCDEFabcdef', $line[0]))
-	{
-		continue;
-	}
-
-	$p = array_map('trim', explode(';', strtok($line, '#')));
-
-	/**
-	* Capture only NFC_QC, NFKC_QC
-	*/
-	if (!preg_match('#^NFK?C_QC$#', $p[1]))
-	{
-		continue;
-	}
-
-	if ($pos = strpos($p[0], '..'))
-	{
-		$start = hexdec(substr($p[0], 0, $pos));
-		$end = hexdec(substr($p[0], $pos + 2));
-	}
-	else
-	{
-		$start = $end = hexdec($p[0]);
-	}
-
-	if ($start >= UTF8_HANGUL_FIRST && $end <= UTF8_HANGUL_LAST)
-	{
-		/**
-		* We do not store Hangul syllables in the array
-		*/
-		continue;
-	}
-
-	if ($p[2] == 'M')
-	{
-		$val = UNICODE_QC_MAYBE;
-	}
-	else
-	{
-		$val = UNICODE_QC_NO;
-	}
-
-	if ($p[1] == 'NFKC_QC')
-	{
-		$file = 'utf_nfkc_qc';
-	}
-	else
-	{
-		$file = 'utf_nfc_qc';
-	}
-
-	for ($i = $start; $i <= $end; ++$i)
-	{
-		/**
-		* The vars have the same name as the file: $utf_nfc_qc is in utf_nfc_qc.php
-		*/
-		$file_contents[$file][$file][cp_to_utf($i)] = $val;
-	}
-}
-fclose($fp);
-
-/**
-* Do mappings
-*/
-echo "Loading Unicode decomposition mappings\n";
-$fp = fopen($phpbb_root_path . 'develop/UnicodeData.txt', 'rt');
-
-$map = array();
-while (!feof($fp))
-{
-	$p = explode(';', fgets($fp, 1024));
-	$cp = hexdec($p[0]);
-
-	if (!empty($p[3]))
-	{
-		/**
-		* Store combining class > 0
-		*/
-		$file_contents['utf_normalizer_common']['utf_combining_class'][cp_to_utf($cp)] = (int) $p[3];
-	}
-
-	if (!isset($p[5]) || !preg_match_all('#[0-9A-F]+#', strip_tags($p[5]), $m))
-	{
-		continue;
-	}
-
-	if (strpos($p[5], '>'))
-	{
-		$map['NFKD'][$cp] = implode(' ', array_map('hexdec', $m[0]));
-	}
-	else
-	{
-		$map['NFD'][$cp] = $map['NFKD'][$cp] = implode(' ', array_map('hexdec', $m[0]));
-	}
-}
-fclose($fp);
-
-/**
-* Build the canonical composition table
-*/
-echo "Generating the Canonical Composition table\n";
-foreach ($map['NFD'] as $cp => $decomp_seq)
-{
-	if (!strpos($decomp_seq, ' ') || isset($exclude[$cp]))
-	{
-		/**
-		* Singletons are excluded from canonical composition
-		*/
-		continue;
-	}
-
-	$utf_seq = implode('', array_map('cp_to_utf', explode(' ', $decomp_seq)));
-
-	if (!isset($file_contents['utf_canonical_comp']['utf_canonical_comp'][$utf_seq]))
-	{
-		$file_contents['utf_canonical_comp']['utf_canonical_comp'][$utf_seq] = cp_to_utf($cp);
-	}
-}
-
-/**
-* Decompose the NF[K]D mappings recursively and prepare the file contents
-*/
-echo "Generating the Canonical and Compatibility Decomposition tables\n\n";
-foreach ($map as $type => $decomp_map)
-{
-	foreach ($decomp_map as $cp => $decomp_seq)
-	{
-		$decomp_map[$cp] = decompose($decomp_map, $decomp_seq);
-	}
-	unset($decomp_seq);
-
-	if ($type == 'NFKD')
-	{
-		$file = 'utf_compatibility_decomp';
-		$var = 'utf_compatibility_decomp';
-	}
-	else
-	{
-		$file = 'utf_canonical_decomp';
-		$var = 'utf_canonical_decomp';
-	}
-
-	/**
-	* Generate the corresponding file
-	*/
-	foreach ($decomp_map as $cp => $decomp_seq)
-	{
-		$file_contents[$file][$var][cp_to_utf($cp)] = implode('', array_map('cp_to_utf', explode(' ', $decomp_seq)));
-	}
-}
-
-/**
-* Generate and/or alter the files
-*/
-foreach ($file_contents as $file => $contents)
-{
-	/**
-	* Generate a new file
-	*/
-	echo "Writing to $file.$phpEx\n";
-
-	if (!$fp = fopen($phpbb_root_path . 'includes/utf/data/' . $file . '.' . $phpEx, 'wb'))
-	{
-		trigger_error('Cannot open ' . $file . ' for write');
-	}
-
-	fwrite($fp, '<?php');
-	foreach ($contents as $var => $val)
-	{
-		fwrite($fp, "\n\$GLOBALS[" . my_var_export($var) . ']=' . my_var_export($val) . ";");
-	}
-	fclose($fp);
-}
-
-echo "\n*** UTF-8 normalization tables done\n\n";
-
-/**
-* Now we'll generate the files needed by the search indexer
+* Generate the files needed by the search indexer
 */
 echo "Generating search indexer tables\n";

@@ -424,32 +173,6 @@ die("\nAll done!\n");
 //                             Internal functions                             //
 ////////////////////////////////////////////////////////////////////////////////

-/**
-* Decompose a sequence recusively
-*
-* @param	array	$decomp_map	Decomposition mapping, passed by reference
-* @param	string	$decomp_seq	Decomposition sequence as decimal codepoints separated with a space
-* @return	string				Decomposition sequence, fully decomposed
-*/
-function decompose(&$decomp_map, $decomp_seq)
-{
-	$ret = array();
-	foreach (explode(' ', $decomp_seq) as $cp)
-	{
-		if (isset($decomp_map[$cp]))
-		{
-			$ret[] = decompose($decomp_map, $decomp_map[$cp]);
-		}
-		else
-		{
-			$ret[] = $cp;
-		}
-	}
-
-	return implode(' ', $ret);
-}
-
-
 /**
 * Return a parsable string representation of a variable
 *
@@ -537,17 +260,6 @@ function hex_to_utf($hex)
 	return cp_to_utf(hexdec($hex));
 }

-/**
-* Return a UTF string formed from a sequence of codepoints in hexadecimal
-*
-* @param	string	$seq		Sequence of codepoints, separated with a space
-* @return	string				UTF-8 string
-*/
-function hexseq_to_utf($seq)
-{
-	return implode('', array_map('hex_to_utf', explode(' ', $seq)));
-}
-
 /**
 * Convert a codepoint to a UTF-8 char
 *
--- a/phpBB/develop/unicode_testing.php
+++ b/phpBB/develop/unicode_testing.php
@@ -81,38 +81,3 @@ function utf8_to_unicode_callback($m)
 {
 	return '\u' . str_pad(base_convert(utf8_ord($m[0]), 10, 16), 4, '0', STR_PAD_LEFT) . '';
 }
-
-/**
-* A wrapper function for the normalizer which takes care of including the class if required and modifies the passed strings
-* to be in NFKC
-*
-* @param	mixed	$strings	a string or an array of strings to normalize
-* @return	mixed				the normalized content, preserving array keys if array given.
-*/
-function utf8_normalize_nfkc($strings)
-{
-	if (empty($strings))
-	{
-		return $strings;
-	}
-
-	if (!class_exists('utf_normalizer'))
-	{
-		global $phpbb_root_path, $phpEx;
-		include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
-	}
-
-	if (!is_array($strings))
-	{
-		utf_normalizer::nfkc($strings);
-	}
-	else if (is_array($strings))
-	{
-		foreach ($strings as $key => $string)
-		{
-			utf_normalizer::nfkc($strings[$key]);
-		}
-	}
-
-	return $strings;
-}
--- a/phpBB/develop/utf_normalizer_test.php
+++ b/phpBB/develop/utf_normalizer_test.php
@@ -1,394 +0,0 @@
-<?php
-/**
-*
-* This file is part of the phpBB Forum Software package.
-*
-* @copyright (c) phpBB Limited <https://www.phpbb.com>
-* @license GNU General Public License, version 2 (GPL-2.0)
-*
-* For full copyright and license information, please see
-* the docs/CREDITS.txt file.
-*
-*/
-
-if (php_sapi_name() != 'cli')
-{
-	die("This program must be run from the command line.\n");
-}
-
-//
-// Security message:
-//
-// This script is potentially dangerous.
-// Remove or comment the next line (die(".... ) to enable this script.
-// Do NOT FORGET to either remove this script or disable it after you have used it.
-//
-die("Please read the first lines of this script for instructions on how to enable it");
-
-set_time_limit(0);
-error_reporting(E_ALL);
-
-define('IN_PHPBB', true);
-$phpbb_root_path = '../';
-$phpEx = substr(strrchr(__FILE__, '.'), 1);
-
-
-/**
-* Let's download some files we need
-*/
-download('http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt');
-download('http://www.unicode.org/Public/UNIDATA/UnicodeData.txt');
-
-/**
-* Those are the tests we run
-*/
-$test_suite = array(
-	/**
-	* NFC
-	*   c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3)
-	*   c4 ==  NFC(c4) ==  NFC(c5)
-	*/
-	'NFC'	=>	array(
-		'c2'	=>	array('c1', 'c2', 'c3'),
-		'c4'	=>	array('c4', 'c5')
-	),
-
-	/**
-	* NFD
-	*   c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3)
-	*   c5 ==  NFD(c4) ==  NFD(c5)
-	*/
-	'NFD'	=>	array(
-		'c3'	=>	array('c1', 'c2', 'c3'),
-		'c5'	=>	array('c4', 'c5')
-	),
-
-	/**
-	* NFKC
-	*   c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
-	*/
-	'NFKC'	=>	array(
-		'c4'	=>	array('c1', 'c2', 'c3', 'c4', 'c5')
-	),
-
-	/**
-	* NFKD
-	*   c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
-	*/
-	'NFKD'	=>	array(
-		'c5'	=>	array('c1', 'c2', 'c3', 'c4', 'c5')
-	)
-);
-
-require_once($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
-
-$i = $n = 0;
-$failed = false;
-$tested_chars = array();
-
-$fp = fopen($phpbb_root_path . 'develop/NormalizationTest.txt', 'rb');
-while (!feof($fp))
-{
-	$line = fgets($fp);
-	++$n;
-
-	if ($line[0] == '@')
-	{
-		if ($i)
-		{
-			echo "done\n";
-		}
-
-		$i = 0;
-		echo "\n", substr($line, 1), "\n\n";
-		continue;
-	}
-
-	if (!strpos(' 0123456789ABCDEF', $line[0]))
-	{
-		continue;
-	}
-
-	if (++$i % 100 == 0)
-	{
-		echo $i, ' ';
-	}
-
-	list($c1, $c2, $c3, $c4, $c5) = explode(';', $line);
-
-	if (!strpos($c1, ' '))
-	{
-		/**
-		* We are currently testing a single character, we add it to the list of
-		* characters we have processed so that we can exclude it when testing
-		* for invariants
-		*/
-		$tested_chars[$c1] = 1;
-	}
-
-	foreach ($test_suite as $form => $serie)
-	{
-		foreach ($serie as $expected => $tests)
-		{
-			$hex_expected = ${$expected};
-			$utf_expected = hexseq_to_utf($hex_expected);
-
-			foreach ($tests as $test)
-			{
-				$utf_result = $utf_expected;
-				call_user_func(array('utf_normalizer', $form), $utf_result);
-
-				if (strcmp($utf_expected, $utf_result))
-				{
-					$failed = true;
-					$hex_result = utf_to_hexseq($utf_result);
-
-					echo "\nFAILED $expected == $form($test) ($hex_expected != $hex_result)";
-				}
-			}
-		}
-
-		if ($failed)
-		{
-			die("\n\nFailed at line $n\n");
-		}
-	}
-}
-fclose($fp);
-
-/**
-* Test for invariants
-*/
-echo "\n\nTesting for invariants...\n\n";
-
-$fp = fopen($phpbb_root_path . 'develop/UnicodeData.txt', 'rt');
-
-$n = 0;
-while (!feof($fp))
-{
-	if (++$n % 100 == 0)
-	{
-		echo $n, ' ';
-	}
-
-	$line = fgets($fp, 1024);
-
-	if (!$pos = strpos($line, ';'))
-	{
-		continue;
-	}
-
-	$hex_tested = $hex_expected = substr($line, 0, $pos);
-
-	if (isset($tested_chars[$hex_tested]))
-	{
-		continue;
-	}
-
-	$utf_expected = hex_to_utf($hex_expected);
-
-	if ($utf_expected >= UTF8_SURROGATE_FIRST
-	 && $utf_expected <= UTF8_SURROGATE_LAST)
-	{
-		/**
-		* Surrogates are illegal on their own, we expect the normalizer
-		* to return a replacement char
-		*/
-		$utf_expected = UTF8_REPLACEMENT;
-		$hex_expected = utf_to_hexseq($utf_expected);
-	}
-
-	foreach (array('nfc', 'nfkc', 'nfd', 'nfkd') as $form)
-	{
-		$utf_result = $utf_expected;
-		utf_normalizer::$form($utf_result);
-		$hex_result = utf_to_hexseq($utf_result);
-//		echo "$form($utf_expected) == $utf_result\n";
-
-		if (strcmp($utf_expected, $utf_result))
-		{
-			$failed = 1;
-
-			echo "\nFAILED $hex_expected == $form($hex_tested) ($hex_expected != $hex_result)";
-		}
-	}
-
-	if ($failed)
-	{
-		die("\n\nFailed at line $n\n");
-	}
-}
-fclose($fp);
-
-die("\n\nALL TESTS PASSED SUCCESSFULLY\n");
-
-/**
-* Download a file to the develop/ dir
-*
-* @param	string	$url		URL of the file to download
-* @return	null
-*/
-function download($url)
-{
-	global $phpbb_root_path;
-
-	if (file_exists($phpbb_root_path . 'develop/' . basename($url)))
-	{
-		return;
-	}
-
-	echo 'Downloading from ', $url, ' ';
-
-	if (!$fpr = fopen($url, 'rb'))
-	{
-		die("Can't download from $url\nPlease download it yourself and put it in the develop/ dir, kthxbai");
-	}
-
-	if (!$fpw = fopen($phpbb_root_path . 'develop/' . basename($url), 'wb'))
-	{
-		die("Can't open develop/" . basename($url) . " for output... please check your permissions or something");
-	}
-
-	$i = 0;
-	$chunk = 32768;
-	$done = '';
-
-	while (!feof($fpr))
-	{
-		$i += fwrite($fpw, fread($fpr, $chunk));
-		echo str_repeat("\x08", strlen($done));
-
-		$done = ($i >> 10) . ' KiB';
-		echo $done;
-	}
-	fclose($fpr);
-	fclose($fpw);
-
-	echo "\n";
-}
-
-/**
-* Convert a UTF string to a sequence of codepoints in hexadecimal
-*
-* @param	string	$utf	UTF string
-* @return	integer			Unicode codepoints in hex
-*/
-function utf_to_hexseq($str)
-{
-	$pos = 0;
-	$len = strlen($str);
-	$ret = array();
-
-	while ($pos < $len)
-	{
-		$c = $str[$pos];
-		switch ($c & "\xF0")
-		{
-			case "\xC0":
-			case "\xD0":
-				$utf_char = substr($str, $pos, 2);
-				$pos += 2;
-				break;
-
-			case "\xE0":
-				$utf_char = substr($str, $pos, 3);
-				$pos += 3;
-				break;
-
-			case "\xF0":
-				$utf_char = substr($str, $pos, 4);
-				$pos += 4;
-				break;
-
-			default:
-				$utf_char = $c;
-				++$pos;
-		}
-
-		$hex = dechex(utf_to_cp($utf_char));
-
-		if (!isset($hex[3]))
-		{
-			$hex = substr('000' . $hex, -4);
-		}
-
-		$ret[] = $hex;
-	}
-
-	return strtr(implode(' ', $ret), 'abcdef', 'ABCDEF');
-}
-
-/**
-* Convert a UTF-8 char to its codepoint
-*
-* @param	string	$utf_char	UTF-8 char
-* @return	integer				Unicode codepoint
-*/
-function utf_to_cp($utf_char)
-{
-	switch (strlen($utf_char))
-	{
-		case 1:
-			return ord($utf_char);
-
-		case 2:
-			return ((ord($utf_char[0]) & 0x1F) << 6) | (ord($utf_char[1]) & 0x3F);
-
-		case 3:
-			return ((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F);
-
-		case 4:
-			return ((ord($utf_char[0]) & 0x07) << 18) | ((ord($utf_char[1]) & 0x3F) << 12) | ((ord($utf_char[2]) & 0x3F) << 6) | (ord($utf_char[3]) & 0x3F);
-
-		default:
-			die('UTF-8 chars can only be 1-4 bytes long');
-	}
-}
-
-/**
-* Return a UTF string formed from a sequence of codepoints in hexadecimal
-*
-* @param	string	$seq		Sequence of codepoints, separated with a space
-* @return	string				UTF-8 string
-*/
-function hexseq_to_utf($seq)
-{
-	return implode('', array_map('hex_to_utf', explode(' ', $seq)));
-}
-
-/**
-* Convert a codepoint in hexadecimal to a UTF-8 char
-*
-* @param	string	$hex		Codepoint, in hexadecimal
-* @return	string				UTF-8 char
-*/
-function hex_to_utf($hex)
-{
-	return cp_to_utf(hexdec($hex));
-}
-
-/**
-* Convert a codepoint to a UTF-8 char
-*
-* @param	integer	$cp			Unicode codepoint
-* @return	string				UTF-8 string
-*/
-function cp_to_utf($cp)
-{
-	if ($cp > 0xFFFF)
-	{
-		return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
-	}
-	else if ($cp > 0x7FF)
-	{
-		return chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
-	}
-	else if ($cp > 0x7F)
-	{
-		return chr(0xC0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3F));
-	}
-	else
-	{
-		return chr($cp);
-	}
-}