[feature/patchwork-utf8] Normalize with intl, use patchwork/utf8 as fallback

2025-08-01 22:40:39 +02:00 · 2013-11-20 13:47:31 +01:00
parent 1601b61ef9
commit 5a7caf6508
20 changed files with 110 additions and 2883 deletions
--- a/phpBB/composer.json
+++ b/phpBB/composer.json
@@ -27,6 +27,7 @@
 	"require": {
 		"php": ">=5.3.3",
 		"lusitanian/oauth": "0.2.*",
+		"patchwork/utf8": "1.1.*",
 		"symfony/config": "2.5.*",
 		"symfony/console": "2.5.*",
 		"symfony/dependency-injection": "2.5.*",
--- a/phpBB/composer.lock
+++ b/phpBB/composer.lock
@@ -68,6 +68,62 @@
            ],
            "time": "2013-08-29 21:40:04"
        },
+        {
+            "name": "patchwork/utf8",
+            "version": "v1.1.26",
+            "source": {
+                "type": "git",
+                "url": "https://github.com/tchwork/utf8.git",
+                "reference": "6b8e46603b49ee87ad6bceb314da94cc04ffcdce"
+            },
+            "dist": {
+                "type": "zip",
+                "url": "https://api.github.com/repos/tchwork/utf8/zipball/6b8e46603b49ee87ad6bceb314da94cc04ffcdce",
+                "reference": "6b8e46603b49ee87ad6bceb314da94cc04ffcdce",
+                "shasum": ""
+            },
+            "require": {
+                "lib-pcre": ">=7.3",
+                "php": ">=5.3.0"
+            },
+            "suggest": {
+                "ext-iconv": "Use iconv for best performance",
+                "ext-intl": "Use Intl for best performance",
+                "ext-mbstring": "Use Mbstring for best performance"
+            },
+            "type": "library",
+            "extra": {
+                "branch-alias": {
+                    "dev-master": "1.1-dev"
+                }
+            },
+            "autoload": {
+                "psr-0": {
+                    "Patchwork": "class/",
+                    "Normalizer": "class/"
+                }
+            },
+            "notification-url": "https://packagist.org/downloads/",
+            "license": [
+                "(Apache-2.0 or GPL-2.0)"
+            ],
+            "authors": [
+                {
+                    "name": "Nicolas Grekas",
+                    "email": "p@tchwork.com"
+                }
+            ],
+            "description": "Portable and performant UTF-8, Unicode and Grapheme Clusters for PHP",
+            "homepage": "https://github.com/tchwork/utf8",
+            "keywords": [
+                "grapheme",
+                "i18n",
+                "unicode",
+                "utf-8",
+                "utf8"
+            ],
+            "time": "2014-11-08 10:13:25"
+        },
        {
            "name": "psr/log",
            "version": "1.0.0",
--- a/phpBB/develop/generate_utf_tables.php
+++ b/phpBB/develop/generate_utf_tables.php
@@ -32,262 +32,11 @@ $phpbb_root_path = '../';
 $phpEx = substr(strrchr(__FILE__, '.'), 1);

 echo "Checking for required files\n";
-download('http://www.unicode.org/Public/UNIDATA/CompositionExclusions.txt');
-download('http://www.unicode.org/Public/UNIDATA/DerivedNormalizationProps.txt');
 download('http://www.unicode.org/Public/UNIDATA/UnicodeData.txt');
 echo "\n";

-require_once($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
-$file_contents = array();
-
 /**
-* Generate some Hangul/Jamo stuff
-*/
-echo "\nGenerating Hangul and Jamo tables\n";
-for ($i = 0; $i < UNICODE_HANGUL_LCOUNT; ++$i)
-{
-	$utf_char = cp_to_utf(UNICODE_HANGUL_LBASE + $i);
-	$file_contents['utf_normalizer_common']['utf_jamo_index'][$utf_char] = $i * UNICODE_HANGUL_VCOUNT * UNICODE_HANGUL_TCOUNT + UNICODE_HANGUL_SBASE;
-	$file_contents['utf_normalizer_common']['utf_jamo_type'][$utf_char] = UNICODE_JAMO_L;
-}
-
-for ($i = 0; $i < UNICODE_HANGUL_VCOUNT; ++$i)
-{
-	$utf_char = cp_to_utf(UNICODE_HANGUL_VBASE + $i);
-	$file_contents['utf_normalizer_common']['utf_jamo_index'][$utf_char] = $i * UNICODE_HANGUL_TCOUNT;
-	$file_contents['utf_normalizer_common']['utf_jamo_type'][$utf_char] = UNICODE_JAMO_V;
-}
-
-for ($i = 0; $i < UNICODE_HANGUL_TCOUNT; ++$i)
-{
-	$utf_char = cp_to_utf(UNICODE_HANGUL_TBASE + $i);
-	$file_contents['utf_normalizer_common']['utf_jamo_index'][$utf_char] = $i;
-	$file_contents['utf_normalizer_common']['utf_jamo_type'][$utf_char] = UNICODE_JAMO_T;
-}
-
-/**
-* Load the CompositionExclusions table
-*/
-echo "Loading CompositionExclusion\n";
-$fp = fopen('CompositionExclusions.txt', 'rt');
-
-$exclude = array();
-while (!feof($fp))
-{
-	$line = fgets($fp, 1024);
-
-	if (!strpos(' 0123456789ABCDEFabcdef', $line[0]))
-	{
-		continue;
-	}
-
-	$cp = strtok($line, ' ');
-
-	if ($pos = strpos($cp, '..'))
-	{
-		$start = hexdec(substr($cp, 0, $pos));
-		$end = hexdec(substr($cp, $pos + 2));
-
-		for ($i = $start; $i < $end; ++$i)
-		{
-			$exclude[$i] = 1;
-		}
-	}
-	else
-	{
-		$exclude[hexdec($cp)] = 1;
-	}
-}
-fclose($fp);
-
-/**
-* Load QuickCheck tables
-*/
-echo "Generating QuickCheck tables\n";
-$fp = fopen('DerivedNormalizationProps.txt', 'rt');
-
-while (!feof($fp))
-{
-	$line = fgets($fp, 1024);
-
-	if (!strpos(' 0123456789ABCDEFabcdef', $line[0]))
-	{
-		continue;
-	}
-
-	$p = array_map('trim', explode(';', strtok($line, '#')));
-
-	/**
-	* Capture only NFC_QC, NFKC_QC
-	*/
-	if (!preg_match('#^NFK?C_QC$#', $p[1]))
-	{
-		continue;
-	}
-
-	if ($pos = strpos($p[0], '..'))
-	{
-		$start = hexdec(substr($p[0], 0, $pos));
-		$end = hexdec(substr($p[0], $pos + 2));
-	}
-	else
-	{
-		$start = $end = hexdec($p[0]);
-	}
-
-	if ($start >= UTF8_HANGUL_FIRST && $end <= UTF8_HANGUL_LAST)
-	{
-		/**
-		* We do not store Hangul syllables in the array
-		*/
-		continue;
-	}
-
-	if ($p[2] == 'M')
-	{
-		$val = UNICODE_QC_MAYBE;
-	}
-	else
-	{
-		$val = UNICODE_QC_NO;
-	}
-
-	if ($p[1] == 'NFKC_QC')
-	{
-		$file = 'utf_nfkc_qc';
-	}
-	else
-	{
-		$file = 'utf_nfc_qc';
-	}
-
-	for ($i = $start; $i <= $end; ++$i)
-	{
-		/**
-		* The vars have the same name as the file: $utf_nfc_qc is in utf_nfc_qc.php
-		*/
-		$file_contents[$file][$file][cp_to_utf($i)] = $val;
-	}
-}
-fclose($fp);
-
-/**
-* Do mappings
-*/
-echo "Loading Unicode decomposition mappings\n";
-$fp = fopen($phpbb_root_path . 'develop/UnicodeData.txt', 'rt');
-
-$map = array();
-while (!feof($fp))
-{
-	$p = explode(';', fgets($fp, 1024));
-	$cp = hexdec($p[0]);
-
-	if (!empty($p[3]))
-	{
-		/**
-		* Store combining class > 0
-		*/
-		$file_contents['utf_normalizer_common']['utf_combining_class'][cp_to_utf($cp)] = (int) $p[3];
-	}
-
-	if (!isset($p[5]) || !preg_match_all('#[0-9A-F]+#', strip_tags($p[5]), $m))
-	{
-		continue;
-	}
-
-	if (strpos($p[5], '>'))
-	{
-		$map['NFKD'][$cp] = implode(' ', array_map('hexdec', $m[0]));
-	}
-	else
-	{
-		$map['NFD'][$cp] = $map['NFKD'][$cp] = implode(' ', array_map('hexdec', $m[0]));
-	}
-}
-fclose($fp);
-
-/**
-* Build the canonical composition table
-*/
-echo "Generating the Canonical Composition table\n";
-foreach ($map['NFD'] as $cp => $decomp_seq)
-{
-	if (!strpos($decomp_seq, ' ') || isset($exclude[$cp]))
-	{
-		/**
-		* Singletons are excluded from canonical composition
-		*/
-		continue;
-	}
-
-	$utf_seq = implode('', array_map('cp_to_utf', explode(' ', $decomp_seq)));
-
-	if (!isset($file_contents['utf_canonical_comp']['utf_canonical_comp'][$utf_seq]))
-	{
-		$file_contents['utf_canonical_comp']['utf_canonical_comp'][$utf_seq] = cp_to_utf($cp);
-	}
-}
-
-/**
-* Decompose the NF[K]D mappings recursively and prepare the file contents
-*/
-echo "Generating the Canonical and Compatibility Decomposition tables\n\n";
-foreach ($map as $type => $decomp_map)
-{
-	foreach ($decomp_map as $cp => $decomp_seq)
-	{
-		$decomp_map[$cp] = decompose($decomp_map, $decomp_seq);
-	}
-	unset($decomp_seq);
-
-	if ($type == 'NFKD')
-	{
-		$file = 'utf_compatibility_decomp';
-		$var = 'utf_compatibility_decomp';
-	}
-	else
-	{
-		$file = 'utf_canonical_decomp';
-		$var = 'utf_canonical_decomp';
-	}
-
-	/**
-	* Generate the corresponding file
-	*/
-	foreach ($decomp_map as $cp => $decomp_seq)
-	{
-		$file_contents[$file][$var][cp_to_utf($cp)] = implode('', array_map('cp_to_utf', explode(' ', $decomp_seq)));
-	}
-}
-
-/**
-* Generate and/or alter the files
-*/
-foreach ($file_contents as $file => $contents)
-{
-	/**
-	* Generate a new file
-	*/
-	echo "Writing to $file.$phpEx\n";
-
-	if (!$fp = fopen($phpbb_root_path . 'includes/utf/data/' . $file . '.' . $phpEx, 'wb'))
-	{
-		trigger_error('Cannot open ' . $file . ' for write');
-	}
-
-	fwrite($fp, '<?php');
-	foreach ($contents as $var => $val)
-	{
-		fwrite($fp, "\n\$GLOBALS[" . my_var_export($var) . ']=' . my_var_export($val) . ";");
-	}
-	fclose($fp);
-}
-
-echo "\n*** UTF-8 normalization tables done\n\n";
-
-/**
-* Now we'll generate the files needed by the search indexer
+* Generate the files needed by the search indexer
 */
 echo "Generating search indexer tables\n";

@@ -424,32 +173,6 @@ die("\nAll done!\n");
 //                             Internal functions                             //
 ////////////////////////////////////////////////////////////////////////////////

-/**
-* Decompose a sequence recusively
-*
-* @param	array	$decomp_map	Decomposition mapping, passed by reference
-* @param	string	$decomp_seq	Decomposition sequence as decimal codepoints separated with a space
-* @return	string				Decomposition sequence, fully decomposed
-*/
-function decompose(&$decomp_map, $decomp_seq)
-{
-	$ret = array();
-	foreach (explode(' ', $decomp_seq) as $cp)
-	{
-		if (isset($decomp_map[$cp]))
-		{
-			$ret[] = decompose($decomp_map, $decomp_map[$cp]);
-		}
-		else
-		{
-			$ret[] = $cp;
-		}
-	}
-
-	return implode(' ', $ret);
-}
-
-
 /**
 * Return a parsable string representation of a variable
 *
@@ -537,17 +260,6 @@ function hex_to_utf($hex)
 	return cp_to_utf(hexdec($hex));
 }

-/**
-* Return a UTF string formed from a sequence of codepoints in hexadecimal
-*
-* @param	string	$seq		Sequence of codepoints, separated with a space
-* @return	string				UTF-8 string
-*/
-function hexseq_to_utf($seq)
-{
-	return implode('', array_map('hex_to_utf', explode(' ', $seq)));
-}
-
 /**
 * Convert a codepoint to a UTF-8 char
 *
--- a/phpBB/develop/unicode_testing.php
+++ b/phpBB/develop/unicode_testing.php
@@ -81,38 +81,3 @@ function utf8_to_unicode_callback($m)
 {
 	return '\u' . str_pad(base_convert(utf8_ord($m[0]), 10, 16), 4, '0', STR_PAD_LEFT) . '';
 }
-
-/**
-* A wrapper function for the normalizer which takes care of including the class if required and modifies the passed strings
-* to be in NFKC
-*
-* @param	mixed	$strings	a string or an array of strings to normalize
-* @return	mixed				the normalized content, preserving array keys if array given.
-*/
-function utf8_normalize_nfkc($strings)
-{
-	if (empty($strings))
-	{
-		return $strings;
-	}
-
-	if (!class_exists('utf_normalizer'))
-	{
-		global $phpbb_root_path, $phpEx;
-		include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
-	}
-
-	if (!is_array($strings))
-	{
-		utf_normalizer::nfkc($strings);
-	}
-	else if (is_array($strings))
-	{
-		foreach ($strings as $key => $string)
-		{
-			utf_normalizer::nfkc($strings[$key]);
-		}
-	}
-
-	return $strings;
-}
--- a/phpBB/develop/utf_normalizer_test.php
+++ b/phpBB/develop/utf_normalizer_test.php
@@ -1,394 +0,0 @@
-<?php
-/**
-*
-* This file is part of the phpBB Forum Software package.
-*
-* @copyright (c) phpBB Limited <https://www.phpbb.com>
-* @license GNU General Public License, version 2 (GPL-2.0)
-*
-* For full copyright and license information, please see
-* the docs/CREDITS.txt file.
-*
-*/
-
-if (php_sapi_name() != 'cli')
-{
-	die("This program must be run from the command line.\n");
-}
-
-//
-// Security message:
-//
-// This script is potentially dangerous.
-// Remove or comment the next line (die(".... ) to enable this script.
-// Do NOT FORGET to either remove this script or disable it after you have used it.
-//
-die("Please read the first lines of this script for instructions on how to enable it");
-
-set_time_limit(0);
-error_reporting(E_ALL);
-
-define('IN_PHPBB', true);
-$phpbb_root_path = '../';
-$phpEx = substr(strrchr(__FILE__, '.'), 1);
-
-
-/**
-* Let's download some files we need
-*/
-download('http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt');
-download('http://www.unicode.org/Public/UNIDATA/UnicodeData.txt');
-
-/**
-* Those are the tests we run
-*/
-$test_suite = array(
-	/**
-	* NFC
-	*   c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3)
-	*   c4 ==  NFC(c4) ==  NFC(c5)
-	*/
-	'NFC'	=>	array(
-		'c2'	=>	array('c1', 'c2', 'c3'),
-		'c4'	=>	array('c4', 'c5')
-	),
-
-	/**
-	* NFD
-	*   c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3)
-	*   c5 ==  NFD(c4) ==  NFD(c5)
-	*/
-	'NFD'	=>	array(
-		'c3'	=>	array('c1', 'c2', 'c3'),
-		'c5'	=>	array('c4', 'c5')
-	),
-
-	/**
-	* NFKC
-	*   c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
-	*/
-	'NFKC'	=>	array(
-		'c4'	=>	array('c1', 'c2', 'c3', 'c4', 'c5')
-	),
-
-	/**
-	* NFKD
-	*   c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
-	*/
-	'NFKD'	=>	array(
-		'c5'	=>	array('c1', 'c2', 'c3', 'c4', 'c5')
-	)
-);
-
-require_once($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
-
-$i = $n = 0;
-$failed = false;
-$tested_chars = array();
-
-$fp = fopen($phpbb_root_path . 'develop/NormalizationTest.txt', 'rb');
-while (!feof($fp))
-{
-	$line = fgets($fp);
-	++$n;
-
-	if ($line[0] == '@')
-	{
-		if ($i)
-		{
-			echo "done\n";
-		}
-
-		$i = 0;
-		echo "\n", substr($line, 1), "\n\n";
-		continue;
-	}
-
-	if (!strpos(' 0123456789ABCDEF', $line[0]))
-	{
-		continue;
-	}
-
-	if (++$i % 100 == 0)
-	{
-		echo $i, ' ';
-	}
-
-	list($c1, $c2, $c3, $c4, $c5) = explode(';', $line);
-
-	if (!strpos($c1, ' '))
-	{
-		/**
-		* We are currently testing a single character, we add it to the list of
-		* characters we have processed so that we can exclude it when testing
-		* for invariants
-		*/
-		$tested_chars[$c1] = 1;
-	}
-
-	foreach ($test_suite as $form => $serie)
-	{
-		foreach ($serie as $expected => $tests)
-		{
-			$hex_expected = ${$expected};
-			$utf_expected = hexseq_to_utf($hex_expected);
-
-			foreach ($tests as $test)
-			{
-				$utf_result = $utf_expected;
-				call_user_func(array('utf_normalizer', $form), $utf_result);
-
-				if (strcmp($utf_expected, $utf_result))
-				{
-					$failed = true;
-					$hex_result = utf_to_hexseq($utf_result);
-
-					echo "\nFAILED $expected == $form($test) ($hex_expected != $hex_result)";
-				}
-			}
-		}
-
-		if ($failed)
-		{
-			die("\n\nFailed at line $n\n");
-		}
-	}
-}
-fclose($fp);
-
-/**
-* Test for invariants
-*/
-echo "\n\nTesting for invariants...\n\n";
-
-$fp = fopen($phpbb_root_path . 'develop/UnicodeData.txt', 'rt');
-
-$n = 0;
-while (!feof($fp))
-{
-	if (++$n % 100 == 0)
-	{
-		echo $n, ' ';
-	}
-
-	$line = fgets($fp, 1024);
-
-	if (!$pos = strpos($line, ';'))
-	{
-		continue;
-	}
-
-	$hex_tested = $hex_expected = substr($line, 0, $pos);
-
-	if (isset($tested_chars[$hex_tested]))
-	{
-		continue;
-	}
-
-	$utf_expected = hex_to_utf($hex_expected);
-
-	if ($utf_expected >= UTF8_SURROGATE_FIRST
-	 && $utf_expected <= UTF8_SURROGATE_LAST)
-	{
-		/**
-		* Surrogates are illegal on their own, we expect the normalizer
-		* to return a replacement char
-		*/
-		$utf_expected = UTF8_REPLACEMENT;
-		$hex_expected = utf_to_hexseq($utf_expected);
-	}
-
-	foreach (array('nfc', 'nfkc', 'nfd', 'nfkd') as $form)
-	{
-		$utf_result = $utf_expected;
-		utf_normalizer::$form($utf_result);
-		$hex_result = utf_to_hexseq($utf_result);
-//		echo "$form($utf_expected) == $utf_result\n";
-
-		if (strcmp($utf_expected, $utf_result))
-		{
-			$failed = 1;
-
-			echo "\nFAILED $hex_expected == $form($hex_tested) ($hex_expected != $hex_result)";
-		}
-	}
-
-	if ($failed)
-	{
-		die("\n\nFailed at line $n\n");
-	}
-}
-fclose($fp);
-
-die("\n\nALL TESTS PASSED SUCCESSFULLY\n");
-
-/**
-* Download a file to the develop/ dir
-*
-* @param	string	$url		URL of the file to download
-* @return	null
-*/
-function download($url)
-{
-	global $phpbb_root_path;
-
-	if (file_exists($phpbb_root_path . 'develop/' . basename($url)))
-	{
-		return;
-	}
-
-	echo 'Downloading from ', $url, ' ';
-
-	if (!$fpr = fopen($url, 'rb'))
-	{
-		die("Can't download from $url\nPlease download it yourself and put it in the develop/ dir, kthxbai");
-	}
-
-	if (!$fpw = fopen($phpbb_root_path . 'develop/' . basename($url), 'wb'))
-	{
-		die("Can't open develop/" . basename($url) . " for output... please check your permissions or something");
-	}
-
-	$i = 0;
-	$chunk = 32768;
-	$done = '';
-
-	while (!feof($fpr))
-	{
-		$i += fwrite($fpw, fread($fpr, $chunk));
-		echo str_repeat("\x08", strlen($done));
-
-		$done = ($i >> 10) . ' KiB';
-		echo $done;
-	}
-	fclose($fpr);
-	fclose($fpw);
-
-	echo "\n";
-}
-
-/**
-* Convert a UTF string to a sequence of codepoints in hexadecimal
-*
-* @param	string	$utf	UTF string
-* @return	integer			Unicode codepoints in hex
-*/
-function utf_to_hexseq($str)
-{
-	$pos = 0;
-	$len = strlen($str);
-	$ret = array();
-
-	while ($pos < $len)
-	{
-		$c = $str[$pos];
-		switch ($c & "\xF0")
-		{
-			case "\xC0":
-			case "\xD0":
-				$utf_char = substr($str, $pos, 2);
-				$pos += 2;
-				break;
-
-			case "\xE0":
-				$utf_char = substr($str, $pos, 3);
-				$pos += 3;
-				break;
-
-			case "\xF0":
-				$utf_char = substr($str, $pos, 4);
-				$pos += 4;
-				break;
-
-			default:
-				$utf_char = $c;
-				++$pos;
-		}
-
-		$hex = dechex(utf_to_cp($utf_char));
-
-		if (!isset($hex[3]))
-		{
-			$hex = substr('000' . $hex, -4);
-		}
-
-		$ret[] = $hex;
-	}
-
-	return strtr(implode(' ', $ret), 'abcdef', 'ABCDEF');
-}
-
-/**
-* Convert a UTF-8 char to its codepoint
-*
-* @param	string	$utf_char	UTF-8 char
-* @return	integer				Unicode codepoint
-*/
-function utf_to_cp($utf_char)
-{
-	switch (strlen($utf_char))
-	{
-		case 1:
-			return ord($utf_char);
-
-		case 2:
-			return ((ord($utf_char[0]) & 0x1F) << 6) | (ord($utf_char[1]) & 0x3F);
-
-		case 3:
-			return ((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F);
-
-		case 4:
-			return ((ord($utf_char[0]) & 0x07) << 18) | ((ord($utf_char[1]) & 0x3F) << 12) | ((ord($utf_char[2]) & 0x3F) << 6) | (ord($utf_char[3]) & 0x3F);
-
-		default:
-			die('UTF-8 chars can only be 1-4 bytes long');
-	}
-}
-
-/**
-* Return a UTF string formed from a sequence of codepoints in hexadecimal
-*
-* @param	string	$seq		Sequence of codepoints, separated with a space
-* @return	string				UTF-8 string
-*/
-function hexseq_to_utf($seq)
-{
-	return implode('', array_map('hex_to_utf', explode(' ', $seq)));
-}
-
-/**
-* Convert a codepoint in hexadecimal to a UTF-8 char
-*
-* @param	string	$hex		Codepoint, in hexadecimal
-* @return	string				UTF-8 char
-*/
-function hex_to_utf($hex)
-{
-	return cp_to_utf(hexdec($hex));
-}
-
-/**
-* Convert a codepoint to a UTF-8 char
-*
-* @param	integer	$cp			Unicode codepoint
-* @return	string				UTF-8 string
-*/
-function cp_to_utf($cp)
-{
-	if ($cp > 0xFFFF)
-	{
-		return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
-	}
-	else if ($cp > 0x7FF)
-	{
-		return chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
-	}
-	else if ($cp > 0x7F)
-	{
-		return chr(0xC0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3F));
-	}
-	else
-	{
-		return chr($cp);
-	}
-}
--- a/phpBB/includes/utf/data/utf_canonical_comp.php
+++ b/phpBB/includes/utf/data/utf_canonical_comp.php
--- a/phpBB/includes/utf/data/utf_canonical_decomp.php
+++ b/phpBB/includes/utf/data/utf_canonical_decomp.php
--- a/phpBB/includes/utf/data/utf_compatibility_decomp.php
+++ b/phpBB/includes/utf/data/utf_compatibility_decomp.php
--- a/phpBB/includes/utf/data/utf_nfc_qc.php
+++ b/phpBB/includes/utf/data/utf_nfc_qc.php
--- a/phpBB/includes/utf/data/utf_nfkc_qc.php
+++ b/phpBB/includes/utf/data/utf_nfkc_qc.php
--- a/phpBB/includes/utf/data/utf_normalizer_common.php
+++ b/phpBB/includes/utf/data/utf_normalizer_common.php
@@ -1,4 +0,0 @@
-<?php
-$GLOBALS['utf_jamo_index']=array('ᄀ'=>44032,'ᄁ'=>44620,'ᄂ'=>45208,'ᄃ'=>45796,'ᄄ'=>46384,'ᄅ'=>46972,'ᄆ'=>47560,'ᄇ'=>48148,'ᄈ'=>48736,'ᄉ'=>49324,'ᄊ'=>49912,'ᄋ'=>50500,'ᄌ'=>51088,'ᄍ'=>51676,'ᄎ'=>52264,'ᄏ'=>52852,'ᄐ'=>53440,'ᄑ'=>54028,'ᄒ'=>54616,'ᅡ'=>0,'ᅢ'=>28,'ᅣ'=>56,'ᅤ'=>84,'ᅥ'=>112,'ᅦ'=>140,'ᅧ'=>168,'ᅨ'=>196,'ᅩ'=>224,'ᅪ'=>252,'ᅫ'=>280,'ᅬ'=>308,'ᅭ'=>336,'ᅮ'=>364,'ᅯ'=>392,'ᅰ'=>420,'ᅱ'=>448,'ᅲ'=>476,'ᅳ'=>504,'ᅴ'=>532,'ᅵ'=>560,'ᆧ'=>0,'ᆨ'=>1,'ᆩ'=>2,'ᆪ'=>3,'ᆫ'=>4,'ᆬ'=>5,'ᆭ'=>6,'ᆮ'=>7,'ᆯ'=>8,'ᆰ'=>9,'ᆱ'=>10,'ᆲ'=>11,'ᆳ'=>12,'ᆴ'=>13,'ᆵ'=>14,'ᆶ'=>15,'ᆷ'=>16,'ᆸ'=>17,'ᆹ'=>18,'ᆺ'=>19,'ᆻ'=>20,'ᆼ'=>21,'ᆽ'=>22,'ᆾ'=>23,'ᆿ'=>24,'ᇀ'=>25,'ᇁ'=>26,'ᇂ'=>27);
-$GLOBALS['utf_jamo_type']=array('ᄀ'=>0,'ᄁ'=>0,'ᄂ'=>0,'ᄃ'=>0,'ᄄ'=>0,'ᄅ'=>0,'ᄆ'=>0,'ᄇ'=>0,'ᄈ'=>0,'ᄉ'=>0,'ᄊ'=>0,'ᄋ'=>0,'ᄌ'=>0,'ᄍ'=>0,'ᄎ'=>0,'ᄏ'=>0,'ᄐ'=>0,'ᄑ'=>0,'ᄒ'=>0,'ᅡ'=>1,'ᅢ'=>1,'ᅣ'=>1,'ᅤ'=>1,'ᅥ'=>1,'ᅦ'=>1,'ᅧ'=>1,'ᅨ'=>1,'ᅩ'=>1,'ᅪ'=>1,'ᅫ'=>1,'ᅬ'=>1,'ᅭ'=>1,'ᅮ'=>1,'ᅯ'=>1,'ᅰ'=>1,'ᅱ'=>1,'ᅲ'=>1,'ᅳ'=>1,'ᅴ'=>1,'ᅵ'=>1,'ᆧ'=>2,'ᆨ'=>2,'ᆩ'=>2,'ᆪ'=>2,'ᆫ'=>2,'ᆬ'=>2,'ᆭ'=>2,'ᆮ'=>2,'ᆯ'=>2,'ᆰ'=>2,'ᆱ'=>2,'ᆲ'=>2,'ᆳ'=>2,'ᆴ'=>2,'ᆵ'=>2,'ᆶ'=>2,'ᆷ'=>2,'ᆸ'=>2,'ᆹ'=>2,'ᆺ'=>2,'ᆻ'=>2,'ᆼ'=>2,'ᆽ'=>2,'ᆾ'=>2,'ᆿ'=>2,'ᇀ'=>2,'ᇁ'=>2,'ᇂ'=>2);
-$GLOBALS['utf_combining_class']=array('̀'=>230,'́'=>230,'̂'=>230,'̃'=>230,'̄'=>230,'̅'=>230,'̆'=>230,'̇'=>230,'̈'=>230,'̉'=>230,'̊'=>230,'̋'=>230,'̌'=>230,'̍'=>230,'̎'=>230,'̏'=>230,'̐'=>230,'̑'=>230,'̒'=>230,'̓'=>230,'̔'=>230,'̕'=>232,'̖'=>220,'̗'=>220,'̘'=>220,'̙'=>220,'̚'=>232,'̛'=>216,'̜'=>220,'̝'=>220,'̞'=>220,'̟'=>220,'̠'=>220,'̡'=>202,'̢'=>202,'̣'=>220,'̤'=>220,'̥'=>220,'̦'=>220,'̧'=>202,'̨'=>202,'̩'=>220,'̪'=>220,'̫'=>220,'̬'=>220,'̭'=>220,'̮'=>220,'̯'=>220,'̰'=>220,'̱'=>220,'̲'=>220,'̳'=>220,'̴'=>1,'̵'=>1,'̶'=>1,'̷'=>1,'̸'=>1,'̹'=>220,'̺'=>220,'̻'=>220,'̼'=>220,'̽'=>230,'̾'=>230,'̿'=>230,'̀'=>230,'́'=>230,'͂'=>230,'̓'=>230,'̈́'=>230,'ͅ'=>240,'͆'=>230,'͇'=>220,'͈'=>220,'͉'=>220,'͊'=>230,'͋'=>230,'͌'=>230,'͍'=>220,'͎'=>220,'͐'=>230,'͑'=>230,'͒'=>230,'͓'=>220,'͔'=>220,'͕'=>220,'͖'=>220,'͗'=>230,'͘'=>232,'͙'=>220,'͚'=>220,'͛'=>230,'͜'=>233,'͝'=>234,'͞'=>234,'͟'=>233,'͠'=>234,'͡'=>234,'͢'=>233,'ͣ'=>230,'ͤ'=>230,'ͥ'=>230,'ͦ'=>230,'ͧ'=>230,'ͨ'=>230,'ͩ'=>230,'ͪ'=>230,'ͫ'=>230,'ͬ'=>230,'ͭ'=>230,'ͮ'=>230,'ͯ'=>230,'҃'=>230,'҄'=>230,'҅'=>230,'҆'=>230,'֑'=>220,'֒'=>230,'֓'=>230,'֔'=>230,'֕'=>230,'֖'=>220,'֗'=>230,'֘'=>230,'֙'=>230,'֚'=>222,'֛'=>220,'֜'=>230,'֝'=>230,'֞'=>230,'֟'=>230,'֠'=>230,'֡'=>230,'֢'=>220,'֣'=>220,'֤'=>220,'֥'=>220,'֦'=>220,'֧'=>220,'֨'=>230,'֩'=>230,'֪'=>220,'֫'=>230,'֬'=>230,'֭'=>222,'֮'=>228,'֯'=>230,'ְ'=>10,'ֱ'=>11,'ֲ'=>12,'ֳ'=>13,'ִ'=>14,'ֵ'=>15,'ֶ'=>16,'ַ'=>17,'ָ'=>18,'ֹ'=>19,'ֺ'=>19,'ֻ'=>20,'ּ'=>21,'ֽ'=>22,'ֿ'=>23,'ׁ'=>24,'ׂ'=>25,'ׄ'=>230,'ׅ'=>220,'ׇ'=>18,'ؐ'=>230,'ؑ'=>230,'ؒ'=>230,'ؓ'=>230,'ؔ'=>230,'ؕ'=>230,'ً'=>27,'ٌ'=>28,'ٍ'=>29,'َ'=>30,'ُ'=>31,'ِ'=>32,'ّ'=>33,'ْ'=>34,'ٓ'=>230,'ٔ'=>230,'ٕ'=>220,'ٖ'=>220,'ٗ'=>230,'٘'=>230,'ٙ'=>230,'ٚ'=>230,'ٛ'=>230,'ٜ'=>220,'ٝ'=>230,'ٞ'=>230,'ٰ'=>35,'ۖ'=>230,'ۗ'=>230,'ۘ'=>230,'ۙ'=>230,'ۚ'=>230,'ۛ'=>230,'ۜ'=>230,'۟'=>230,'۠'=>230,'ۡ'=>230,'ۢ'=>230,'ۣ'=>220,'ۤ'=>230,'ۧ'=>230,'ۨ'=>230,'۪'=>220,'۫'=>230,'۬'=>230,'ۭ'=>220,'ܑ'=>36,'ܰ'=>230,'ܱ'=>220,'ܲ'=>230,'ܳ'=>230,'ܴ'=>220,'ܵ'=>230,'ܶ'=>230,'ܷ'=>220,'ܸ'=>220,'
ܹ'=>220,'ܺ'=>230,'ܻ'=>220,'ܼ'=>220,'ܽ'=>230,'ܾ'=>220,'ܿ'=>230,'݀'=>230,'݁'=>230,'݂'=>220,'݃'=>230,'݄'=>220,'݅'=>230,'݆'=>220,'݇'=>230,'݈'=>220,'݉'=>230,'݊'=>230,'߫'=>230,'߬'=>230,'߭'=>230,'߮'=>230,'߯'=>230,'߰'=>230,'߱'=>230,'߲'=>220,'߳'=>230,'़'=>7,'्'=>9,'॑'=>230,'॒'=>220,'॓'=>230,'॔'=>230,'়'=>7,'্'=>9,'਼'=>7,'੍'=>9,'઼'=>7,'્'=>9,'଼'=>7,'୍'=>9,'்'=>9,'్'=>9,'ౕ'=>84,'ౖ'=>91,'಼'=>7,'್'=>9,'്'=>9,'්'=>9,'ุ'=>103,'ู'=>103,'ฺ'=>9,'่'=>107,'้'=>107,'๊'=>107,'๋'=>107,'ຸ'=>118,'ູ'=>118,'່'=>122,'້'=>122,'໊'=>122,'໋'=>122,'༘'=>220,'༙'=>220,'༵'=>220,'༷'=>220,'༹'=>216,'ཱ'=>129,'ི'=>130,'ུ'=>132,'ེ'=>130,'ཻ'=>130,'ོ'=>130,'ཽ'=>130,'ྀ'=>130,'ྂ'=>230,'ྃ'=>230,'྄'=>9,'྆'=>230,'྇'=>230,'࿆'=>220,'့'=>7,'္'=>9,'፟'=>230,'᜔'=>9,'᜴'=>9,'្'=>9,'៝'=>230,'ᢩ'=>228,'᤹'=>222,'᤺'=>230,'᤻'=>220,'ᨗ'=>230,'ᨘ'=>220,'᬴'=>7,'᭄'=>9,'᭫'=>230,'᭬'=>220,'᭭'=>230,'᭮'=>230,'᭯'=>230,'᭰'=>230,'᭱'=>230,'᭲'=>230,'᭳'=>230,'᷀'=>230,'᷁'=>230,'᷂'=>220,'᷃'=>230,'᷄'=>230,'᷅'=>230,'᷆'=>230,'᷇'=>230,'᷈'=>230,'᷉'=>230,'᷊'=>220,'᷾'=>230,'᷿'=>220,'⃐'=>230,'⃑'=>230,'⃒'=>1,'⃓'=>1,'⃔'=>230,'⃕'=>230,'⃖'=>230,'⃗'=>230,'⃘'=>1,'⃙'=>1,'⃚'=>1,'⃛'=>230,'⃜'=>230,'⃡'=>230,'⃥'=>1,'⃦'=>1,'⃧'=>230,'⃨'=>220,'⃩'=>230,'⃪'=>1,'⃫'=>1,'⃬'=>220,'⃭'=>220,'⃮'=>220,'⃯'=>220,'〪'=>218,'〫'=>228,'〬'=>232,'〭'=>222,'〮'=>224,'〯'=>224,'゙'=>8,'゚'=>8,'꠆'=>9,'ﬞ'=>26,'︠'=>230,'︡'=>230,'︢'=>230,'︣'=>230,'𐨍'=>220,'𐨏'=>230,'𐨸'=>230,'𐨹'=>1,'𐨺'=>220,'𐨿'=>9,'𝅥'=>216,'𝅦'=>216,'𝅧'=>1,'𝅨'=>1,'𝅩'=>1,'𝅭'=>226,'𝅮'=>216,'𝅯'=>216,'𝅰'=>216,'𝅱'=>216,'𝅲'=>216,'𝅻'=>220,'𝅼'=>220,'𝅽'=>220,'𝅾'=>220,'𝅿'=>220,'𝆀'=>220,'𝆁'=>220,'𝆂'=>220,'𝆅'=>230,'𝆆'=>230,'𝆇'=>230,'𝆈'=>230,'𝆉'=>230,'𝆊'=>220,'𝆋'=>220,'𝆪'=>230,'𝆫'=>230,'𝆬'=>230,'𝆭'=>230,'𝉂'=>230,'𝉃'=>230,'𝉄'=>230);
--- a/phpBB/includes/utf/utf_normalizer.php
+++ b/phpBB/includes/utf/utf_normalizer.php
--- a/phpBB/includes/utf/utf_tools.php
+++ b/phpBB/includes/utf/utf_tools.php
@@ -867,7 +867,6 @@ function utf8_recode($string, $encoding)

 	// Trigger an error?! Fow now just give bad data :-(
 	trigger_error('Unknown encoding: ' . $encoding, E_USER_ERROR);
-	//return $string; // use utf_normalizer::cleanup() ?
 }

 /**
@@ -1611,14 +1610,8 @@ function utf8_case_fold_nfkc($text, $option = 'full')
 	// do the case fold
 	$text = utf8_case_fold($text, $option);

-	if (!class_exists('utf_normalizer'))
-	{
-		global $phpbb_root_path, $phpEx;
-		include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
-	}
-
 	// convert to NFKC
-	utf_normalizer::nfkc($text);
+	$text = Normalizer::normalize($text, Normalizer::NFKC);

 	// FC_NFKC_Closure, http://www.unicode.org/Public/5.0.0/ucd/DerivedNormalizationProps.txt
 	$text = strtr($text, $fc_nfkc_closure);
@@ -1714,106 +1707,56 @@ function utf8_case_fold_nfc($text, $option = 'full')
 	return $text;
 }

-if (extension_loaded('intl'))
+/**
+* Normalize to NFC through the Normalizer class: PHP's native one from intl
+* (formerly a PECL extension, bundled since PHP 5.3.0) or the patchwork/utf8 fallback.
+* http://php.net/manual/en/normalizer.normalize.php
+*
+* @param	mixed	$strings	a string or an array of strings to normalize
+* @return	mixed				the normalized content, preserving array keys if array given.
+*/
+function utf8_normalize_nfc($strings)
 {
-	/**
-	* wrapper around PHP's native normalizer from intl
-	* previously a PECL extension, included in the core since PHP 5.3.0
-	* http://php.net/manual/en/normalizer.normalize.php
-	*
-	* @param	mixed	$strings	a string or an array of strings to normalize
-	* @return	mixed				the normalized content, preserving array keys if array given.
-	*/
-	function utf8_normalize_nfc($strings)
+	if (empty($strings))
 	{
-		if (empty($strings))
+		return $strings;
+	}
+
+	if (!is_array($strings))
+	{
+		if (Normalizer::isNormalized($strings))
 		{
 			return $strings;
 		}
-
-		if (!is_array($strings))
+		return (string) Normalizer::normalize($strings);
+	}
+	else
+	{
+		foreach ($strings as $key => $string)
 		{
-			if (Normalizer::isNormalized($strings))
+			if (is_array($string))
 			{
-				return $strings;
-			}
-			return (string) Normalizer::normalize($strings);
-		}
-		else
-		{
-			foreach ($strings as $key => $string)
-			{
-				if (is_array($string))
+				foreach ($string as $_key => $_string)
 				{
-					foreach ($string as $_key => $_string)
-					{
-						if (Normalizer::isNormalized($strings[$key][$_key]))
-						{
-							continue;
-						}
-						$strings[$key][$_key] = (string) Normalizer::normalize($strings[$key][$_key]);
-					}
-				}
-				else
-				{
-					if (Normalizer::isNormalized($strings[$key]))
+					if (Normalizer::isNormalized($strings[$key][$_key]))
 					{
 						continue;
 					}
-					$strings[$key] = (string) Normalizer::normalize($strings[$key]);
+					$strings[$key][$_key] = (string) Normalizer::normalize($strings[$key][$_key]);
 				}
 			}
-		}
-
-		return $strings;
-	}
-}
-else
-{
-	/**
-	* A wrapper function for the normalizer which takes care of including the class if
-	* required and modifies the passed strings to be in NFC (Normalization Form Composition).
-	*
-	* @param	mixed	$strings	a string or an array of strings to normalize
-	* @return	mixed				the normalized content, preserving array keys if array given.
-	*/
-	function utf8_normalize_nfc($strings)
-	{
-		if (empty($strings))
-		{
-			return $strings;
-		}
-
-		if (!class_exists('utf_normalizer'))
-		{
-			global $phpbb_root_path, $phpEx;
-			include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
-		}
-
-		if (!is_array($strings))
-		{
-			utf_normalizer::nfc($strings);
-		}
-		else if (is_array($strings))
-		{
-			foreach ($strings as $key => $string)
+			else
 			{
-				if (is_array($string))
+				if (Normalizer::isNormalized($strings[$key]))
 				{
-					foreach ($string as $_key => $_string)
-					{
-						utf_normalizer::nfc($strings[$key][$_key]);
-					}
-				}
-				else
-				{
-					utf_normalizer::nfc($strings[$key]);
+					continue;
 				}
+				$strings[$key] = (string) Normalizer::normalize($strings[$key]);
 			}
 		}
-
-		return $strings;
 	}
+
+	return $strings;
 }

 /**
--- a/phpBB/install/data/confusables.php
+++ b/phpBB/install/data/confusables.php
@@ -633,14 +633,8 @@ function utf8_new_case_fold_nfkc($text, $option = 'full')
 	// do the case fold
 	$text = utf8_new_case_fold($text, $option);

-	if (!class_exists('utf_normalizer'))
-	{
-		global $phpbb_root_path, $phpEx;
-		include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
-	}
-
 	// convert to NFKC
-	utf_new_normalizer::nfkc($text);
+	$text = Normalizer::normalize($text, Normalizer::NFKC);

 	// FC_NFKC_Closure, http://www.unicode.org/Public/5.0.0/ucd/DerivedNormalizationProps.txt
 	$text = strtr($text, $fc_nfkc_closure);
--- a/phpBB/install/data/new_normalizer.php
+++ b/phpBB/install/data/new_normalizer.php
@@ -1,197 +0,0 @@
-<?php
-/**
-*
-* This file is part of the phpBB Forum Software package.
-*
-* @copyright (c) phpBB Limited <https://www.phpbb.com>
-* @license GNU General Public License, version 2 (GPL-2.0)
-*
-* For full copyright and license information, please see
-* the docs/CREDITS.txt file.
-*
-*/
-
-/**
-* @ignore
-*/
-if (!defined('IN_PHPBB'))
-{
-	exit;
-}
-
-/**
-* A wrapper function for the normalizer which takes care of including the class if required and modifies the passed strings
-* to be in NFC (Normalization Form Composition).
-*
-* @param	mixed	$strings	a string or an array of strings to normalize
-* @return	mixed				the normalized content, preserving array keys if array given.
-*/
-function utf8_new_normalize_nfc($strings)
-{
-	if (empty($strings))
-	{
-		return $strings;
-	}
-
-	if (!is_array($strings))
-	{
-		utf_new_normalizer::nfc($strings);
-	}
-	else if (is_array($strings))
-	{
-		foreach ($strings as $key => $string)
-		{
-			if (is_array($string))
-			{
-				foreach ($string as $_key => $_string)
-				{
-					utf_new_normalizer::nfc($strings[$key][$_key]);
-				}
-			}
-			else
-			{
-				utf_new_normalizer::nfc($strings[$key]);
-			}
-		}
-	}
-
-	return $strings;
-}
-
-class utf_new_normalizer
-{
-	/**
-	* Validate, cleanup and normalize a string
-	*
-	* The ultimate convenience function! Clean up invalid UTF-8 sequences,
-	* and convert to Normal Form C, canonical composition.
-	*
-	* @param	string	&$str	The dirty string
-	* @return	string			The same string, all shiny and cleaned-up
-	*/
-	function cleanup(&$str)
-	{
-		// The string below is the list of all autorized characters, sorted by frequency in latin text
-		$pos = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D");
-		$len = strlen($str);
-
-		if ($pos == $len)
-		{
-			// ASCII strings with no special chars return immediately
-			return;
-		}
-
-		// Note: we do not check for $GLOBALS['utf_canonical_decomp']. It is assumed they are always loaded together
-		if (!isset($GLOBALS['utf_nfc_qc']))
-		{
-			global $phpbb_root_path, $phpEx;
-			include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
-		}
-
-		if (!isset($GLOBALS['utf_canonical_decomp']))
-		{
-			global $phpbb_root_path, $phpEx;
-			include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
-		}
-
-		// Replace any byte in the range 0x00..0x1F, except for \r, \n and \t
-		// We replace those characters with a 0xFF byte, which is illegal in UTF-8 and will in turn be replaced with a UTF replacement char
-		$str = strtr(
-			$str,
-			"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
-			"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
-		);
-
-		$str = utf_new_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
-	}
-
-	/**
-	* Validate and normalize a UTF string to NFC
-	*
-	* @param	string	&$str	Unchecked UTF string
-	* @return	string			The string, validated and in normal form
-	*/
-	function nfc(&$str)
-	{
-		$pos = strspn($str, UTF8_ASCII_RANGE);
-		$len = strlen($str);
-
-		if ($pos == $len)
-		{
-			// ASCII strings return immediately
-			return;
-		}
-
-		if (!isset($GLOBALS['utf_nfc_qc']))
-		{
-			global $phpbb_root_path, $phpEx;
-			include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
-		}
-
-		if (!isset($GLOBALS['utf_canonical_decomp']))
-		{
-			global $phpbb_root_path, $phpEx;
-			include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
-		}
-
-		$str = utf_new_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
-	}
-
-	/**
-	* Validate and normalize a UTF string to NFKC
-	*
-	* @param	string	&$str	Unchecked UTF string
-	* @return	string			The string, validated and in normal form
-	*/
-	function nfkc(&$str)
-	{
-		$pos = strspn($str, UTF8_ASCII_RANGE);
-		$len = strlen($str);
-
-		if ($pos == $len)
-		{
-			// ASCII strings return immediately
-			return;
-		}
-
-		if (!isset($GLOBALS['utf_nfkc_qc']))
-		{
-			global $phpbb_root_path, $phpEx;
-			include($phpbb_root_path . 'includes/utf/data/utf_nfkc_qc.' . $phpEx);
-		}
-
-		if (!isset($GLOBALS['utf_compatibility_decomp']))
-		{
-			global $phpbb_root_path, $phpEx;
-			include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
-		}
-
-		$str = utf_new_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
-	}
-
-	/**
-	* Recompose a UTF string
-	*
-	* @param	string	$str			Unchecked UTF string
-	* @param	integer	$pos			Position of the first UTF char (in bytes)
-	* @param	integer	$len			Length of the string (in bytes)
-	* @param	array	&$qc			Quick-check array, passed by reference but never modified
-	* @param	array	&$decomp_map	Decomposition mapping, passed by reference but never modified
-	* @return	string					The string, validated and recomposed
-	*
-	* @access	private
-	*/
-	function recompose($str, $pos, $len, &$qc, &$decomp_map)
-	{
-		global $utf_canonical_comp;
-
-		// Load the canonical composition table
-		if (!isset($utf_canonical_comp))
-		{
-			global $phpbb_root_path, $phpEx;
-			include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);
-		}
-
-		return utf_normalizer::recompose($str, $pos, $len, $qc, $decomp_map);
-	}
-}
--- a/phpBB/install/database_update.php
+++ b/phpBB/install/database_update.php
@@ -74,7 +74,6 @@ require($phpbb_root_path . 'includes/functions.' . $phpEx);
 require($phpbb_root_path . 'includes/functions_content.' . $phpEx);

 require($phpbb_root_path . 'includes/constants.' . $phpEx);
-include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
 require($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx);

 // Set PHP error handler to ours
--- a/phpBB/install/index.php
+++ b/phpBB/install/index.php
@@ -102,7 +102,6 @@ phpbb_require_updated('includes/functions.' . $phpEx);
 phpbb_require_updated('includes/functions_content.' . $phpEx, true);

 phpbb_include_updated('includes/functions_admin.' . $phpEx);
-phpbb_include_updated('includes/utf/utf_normalizer.' . $phpEx);
 phpbb_include_updated('includes/utf/utf_tools.' . $phpEx);
 phpbb_require_updated('includes/functions_install.' . $phpEx);

--- a/phpBB/phpbb/search/fulltext_native.php
+++ b/phpBB/phpbb/search/fulltext_native.php
@@ -18,6 +18,13 @@ namespace phpbb\search;
 */
 class fulltext_native extends \phpbb\search\base
 {
+	const UTF8_HANGUL_FIRST = "\xEA\xB0\x80";
+	const UTF8_HANGUL_LAST = "\xED\x9E\xA3";
+	const UTF8_CJK_FIRST = "\xE4\xB8\x80";
+	const UTF8_CJK_LAST = "\xE9\xBE\xBB";
+	const UTF8_CJK_B_FIRST = "\xF0\xA0\x80\x80";
+	const UTF8_CJK_B_LAST = "\xF0\xAA\x9B\x96";
+
 	/**
 	 * Associative array holding index stats
 	 * @var array
@@ -93,7 +100,7 @@ class fulltext_native extends \phpbb\search\base
 	protected $user;

 	/**
-	* Initialises the fulltext_native search backend with min/max word length and makes sure the UTF-8 normalizer is loaded
+	* Initialises the fulltext_native search backend with min/max word length
 	*
 	* @param	boolean|string	&$error	is passed by reference and should either be set to false on success or an error message on failure
 	*/
@@ -110,10 +117,6 @@ class fulltext_native extends \phpbb\search\base
 		/**
 		* Load the UTF tools
 		*/
-		if (!class_exists('utf_normalizer'))
-		{
-			include($this->phpbb_root_path . 'includes/utf/utf_normalizer.' . $this->php_ext);
-		}
 		if (!function_exists('utf8_decode_ncr'))
 		{
 			include($this->phpbb_root_path . 'includes/utf/utf_tools.' . $this->php_ext);
@@ -1175,9 +1178,9 @@ class fulltext_native extends \phpbb\search\base
 				* Note: this could be optimized. If the codepoint is lower than Hangul's range
 				* we know that it will also be lower than CJK ranges
 				*/
-				if ((strncmp($word, UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, UTF8_HANGUL_LAST, 3) > 0)
-					&& (strncmp($word, UTF8_CJK_FIRST, 3) < 0 || strncmp($word, UTF8_CJK_LAST, 3) > 0)
-					&& (strncmp($word, UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, UTF8_CJK_B_LAST, 4) > 0))
+				if ((strncmp($word, self::UTF8_HANGUL_FIRST, 3) < 0 || strncmp($word, self::UTF8_HANGUL_LAST, 3) > 0)
+					&& (strncmp($word, self::UTF8_CJK_FIRST, 3) < 0 || strncmp($word, self::UTF8_CJK_LAST, 3) > 0)
+					&& (strncmp($word, self::UTF8_CJK_B_FIRST, 4) < 0 || strncmp($word, self::UTF8_CJK_B_LAST, 4) > 0))
 				{
 					$word = strtok(' ');
 					continue;
@@ -1544,8 +1547,6 @@ class fulltext_native extends \phpbb\search\base
 	* @param	string	$allowed_chars	String of special chars to allow
 	* @param	string	$encoding		Text encoding
 	* @return	string					Cleaned up text, only alphanumeric chars are left
-	*
-	* @todo \normalizer::cleanup being able to be used?
 	*/
 	protected function cleanup($text, $allowed_chars = null, $encoding = 'utf-8')
 	{
@@ -1572,12 +1573,9 @@ class fulltext_native extends \phpbb\search\base
 		$text = htmlspecialchars_decode(utf8_decode_ncr($text), ENT_QUOTES);

 		/**
-		* Load the UTF-8 normalizer
-		*
-		* If we use it more widely, an instance of that class should be held in a
-		* a global variable instead
+		* Normalize to NFC
 		*/
-		\utf_normalizer::nfc($text);
+		$text = \Normalizer::normalize($text);

 		/**
 		* The first thing we do is:
@@ -1670,9 +1668,9 @@ class fulltext_native extends \phpbb\search\base
 			$utf_char = substr($text, $pos, $utf_len);
 			$pos += $utf_len;

-			if (($utf_char >= UTF8_HANGUL_FIRST && $utf_char <= UTF8_HANGUL_LAST)
-				|| ($utf_char >= UTF8_CJK_FIRST && $utf_char <= UTF8_CJK_LAST)
-				|| ($utf_char >= UTF8_CJK_B_FIRST && $utf_char <= UTF8_CJK_B_LAST))
+			if (($utf_char >= self::UTF8_HANGUL_FIRST && $utf_char <= self::UTF8_HANGUL_LAST)
+				|| ($utf_char >= self::UTF8_CJK_FIRST && $utf_char <= self::UTF8_CJK_LAST)
+				|| ($utf_char >= self::UTF8_CJK_B_FIRST && $utf_char <= self::UTF8_CJK_B_LAST))
 			{
 				/**
 				* All characters within these ranges are valid
--- a/tests/RUNNING_TESTS.md
+++ b/tests/RUNNING_TESTS.md
@@ -120,8 +120,9 @@ directory (above phpBB):
 Slow tests
 --------------

-Certain tests, such as the UTF-8 normalizer or the DNS tests tend to be slow.
-Thus these tests are in the `slow` group, which is excluded by default. If you
+Certain tests, such as the DNS tests, tend to be slow.
+Thus these tests are in the `slow` group, which is excluded by default. You can
+enable slow tests by copying the phpunit.xml.all file to phpunit.xml. If you
 only want the slow tests, run:

    $ phpBB/vendor/bin/phpunit --group slow
--- a/tests/utf/normalizer_test.php
+++ b/tests/utf/normalizer_test.php
@@ -1,327 +0,0 @@
-<?php
-/**
-*
-* This file is part of the phpBB Forum Software package.
-*
-* @copyright (c) phpBB Limited <https://www.phpbb.com>
-* @license GNU General Public License, version 2 (GPL-2.0)
-*
-* For full copyright and license information, please see
-* the docs/CREDITS.txt file.
-*
-*/
-
-require_once dirname(__FILE__) . '/../../phpBB/includes/utf/utf_normalizer.php';
-
-/**
-* @group slow
-*/
-class phpbb_utf_normalizer_test extends phpbb_test_case
-{
-	static private $data_dir;
-
-	static public function setUpBeforeClass()
-	{
-		self::$data_dir = dirname(__file__) . '/../tmp';
-		self::download('http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt', self::$data_dir);
-		self::download('http://www.unicode.org/Public/UNIDATA/UnicodeData.txt', self::$data_dir);
-	}
-
-	public function test_normalizer()
-	{
-		$test_suite = array(
-			/**
-			* NFC
-			*   c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3)
-			*   c4 ==  NFC(c4) ==  NFC(c5)
-			*/
-			'NFC'	=>	array(
-				'c2'	=>	array('c1', 'c2', 'c3'),
-				'c4'	=>	array('c4', 'c5')
-			),
-
-			/**
-			* NFD
-			*   c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3)
-			*   c5 ==  NFD(c4) ==  NFD(c5)
-			*/
-			'NFD'	=>	array(
-				'c3'	=>	array('c1', 'c2', 'c3'),
-				'c5'	=>	array('c4', 'c5')
-			),
-
-			/**
-			* NFKC
-			*   c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5)
-			*/
-			'NFKC'	=>	array(
-				'c4'	=>	array('c1', 'c2', 'c3', 'c4', 'c5')
-			),
-
-			/**
-			* NFKD
-			*   c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5)
-			*/
-			'NFKD'	=>	array(
-				'c5'	=>	array('c1', 'c2', 'c3', 'c4', 'c5')
-			)
-		);
-
-		$tested_chars = array();
-
-		$fp = fopen(self::$data_dir . '/NormalizationTest.txt', 'rb');
-		while (!feof($fp))
-		{
-			$line = fgets($fp);
-
-			if ($line[0] == '@')
-			{
-				continue;
-			}
-
-			if (!strpos(' 0123456789ABCDEF', $line[0]))
-			{
-				continue;
-			}
-
-			list($c1, $c2, $c3, $c4, $c5) = explode(';', $line);
-
-			if (!strpos($c1, ' '))
-			{
-				/**
-				* We are currently testing a single character, we add it to the list of
-				* characters we have processed so that we can exclude it when testing
-				* for invariants
-				*/
-				$tested_chars[$c1] = 1;
-			}
-
-			foreach ($test_suite as $form => $serie)
-			{
-				foreach ($serie as $expected => $tests)
-				{
-					$hex_expected = ${$expected};
-					$utf_expected = $this->hexseq_to_utf($hex_expected);
-
-					foreach ($tests as $test)
-					{
-						$utf_result = $utf_expected;
-						call_user_func_array(array('utf_normalizer', $form), array(&$utf_result));
-
-						$hex_result = $this->utf_to_hexseq($utf_result);
-						$this->assertEquals($utf_expected, $utf_result, "$expected == $form($test) ($hex_expected != $hex_result)");
-					}
-				}
-			}
-		}
-		fclose($fp);
-
-		return $tested_chars;
-	}
-
-	/**
-	* @depends test_normalizer
-	*/
-	public function test_invariants(array $tested_chars)
-	{
-		$fp = fopen(self::$data_dir . '/UnicodeData.txt', 'rb');
-
-		while (!feof($fp))
-		{
-			$line = fgets($fp, 1024);
-
-			if (!$pos = strpos($line, ';'))
-			{
-				continue;
-			}
-
-			$hex_tested = $hex_expected = substr($line, 0, $pos);
-
-			if (isset($tested_chars[$hex_tested]))
-			{
-				continue;
-			}
-
-			$utf_expected = $this->hex_to_utf($hex_expected);
-
-			if ($utf_expected >= UTF8_SURROGATE_FIRST
-			 && $utf_expected <= UTF8_SURROGATE_LAST)
-			{
-				/**
-				* Surrogates are illegal on their own, we expect the normalizer
-				* to return a replacement char
-				*/
-				$utf_expected = UTF8_REPLACEMENT;
-				$hex_expected = $this->utf_to_hexseq($utf_expected);
-			}
-
-			foreach (array('nfc', 'nfkc', 'nfd', 'nfkd') as $form)
-			{
-				$utf_result = $utf_expected;
-				call_user_func_array(array('utf_normalizer', $form), array(&$utf_result));
-				$hex_result = $this->utf_to_hexseq($utf_result);
-
-				$this->assertEquals($utf_expected, $utf_result, "$hex_expected == $form($hex_tested) ($hex_expected != $hex_result)");
-			}
-		}
-		fclose($fp);
-	}
-
-	/**
-	* Convert a UTF string to a sequence of codepoints in hexadecimal
-	*
-	* @param	string	$utf	UTF string
-	* @return	integer			Unicode codepoints in hex
-	*/
-	protected function utf_to_hexseq($str)
-	{
-		$pos = 0;
-		$len = strlen($str);
-		$ret = array();
-
-		while ($pos < $len)
-		{
-			$c = $str[$pos];
-			switch ($c & "\xF0")
-			{
-				case "\xC0":
-				case "\xD0":
-					$utf_char = substr($str, $pos, 2);
-					$pos += 2;
-					break;
-
-				case "\xE0":
-					$utf_char = substr($str, $pos, 3);
-					$pos += 3;
-					break;
-
-				case "\xF0":
-					$utf_char = substr($str, $pos, 4);
-					$pos += 4;
-					break;
-
-				default:
-					$utf_char = $c;
-					++$pos;
-			}
-
-			$hex = dechex($this->utf_to_cp($utf_char));
-
-			if (!isset($hex[3]))
-			{
-				$hex = substr('000' . $hex, -4);
-			}
-
-			$ret[] = $hex;
-		}
-
-		return strtr(implode(' ', $ret), 'abcdef', 'ABCDEF');
-	}
-
-	/**
-	* Convert a UTF-8 char to its codepoint
-	*
-	* @param	string	$utf_char	UTF-8 char
-	* @return	integer				Unicode codepoint
-	*/
-	protected function utf_to_cp($utf_char)
-	{
-		switch (strlen($utf_char))
-		{
-			case 1:
-				return ord($utf_char);
-
-			case 2:
-				return ((ord($utf_char[0]) & 0x1F) << 6) | (ord($utf_char[1]) & 0x3F);
-
-			case 3:
-				return ((ord($utf_char[0]) & 0x0F) << 12) | ((ord($utf_char[1]) & 0x3F) << 6) | (ord($utf_char[2]) & 0x3F);
-
-			case 4:
-				return ((ord($utf_char[0]) & 0x07) << 18) | ((ord($utf_char[1]) & 0x3F) << 12) | ((ord($utf_char[2]) & 0x3F) << 6) | (ord($utf_char[3]) & 0x3F);
-
-			default:
-				throw new RuntimeException('UTF-8 chars can only be 1-4 bytes long');
-		}
-	}
-
-	/**
-	* Return a UTF string formed from a sequence of codepoints in hexadecimal
-	*
-	* @param	string	$seq		Sequence of codepoints, separated with a space
-	* @return	string				UTF-8 string
-	*/
-	protected function hexseq_to_utf($seq)
-	{
-		return implode('', array_map(array($this, 'hex_to_utf'), explode(' ', $seq)));
-	}
-
-	/**
-	* Convert a codepoint in hexadecimal to a UTF-8 char
-	*
-	* @param	string	$hex		Codepoint, in hexadecimal
-	* @return	string				UTF-8 char
-	*/
-	protected function hex_to_utf($hex)
-	{
-		return $this->cp_to_utf(hexdec($hex));
-	}
-
-	/**
-	* Convert a codepoint to a UTF-8 char
-	*
-	* @param	integer	$cp			Unicode codepoint
-	* @return	string				UTF-8 string
-	*/
-	protected function cp_to_utf($cp)
-	{
-		if ($cp > 0xFFFF)
-		{
-			return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
-		}
-		else if ($cp > 0x7FF)
-		{
-			return chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
-		}
-		else if ($cp > 0x7F)
-		{
-			return chr(0xC0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3F));
-		}
-		else
-		{
-			return chr($cp);
-		}
-	}
-
-	// chunked download helper
-	static protected function download($url, $to)
-	{
-		$target = $to . '/' . basename($url);
-
-		if (file_exists($target))
-		{
-			return;
-		}
-
-		if (!$fpr = fopen($url, 'rb'))
-		{
-			echo "Failed to download $url\n";
-			return;
-		}
-
-		if (!$fpw = fopen($target, 'wb'))
-		{
-			echo "Failed to open $target for writing\n";
-			return;
-		}
-
-		$chunk = 32768;
-
-		while (!feof($fpr))
-		{
-			fwrite($fpw, fread($fpr, $chunk));
-		}
-		fclose($fpr);
-		fclose($fpw);
-	}
-}