Changed: moved functions that encode/decode NCRs from and to UTF-8 to utf_tools.php

git-svn-id: file:///svn/phpbb/trunk@6187 89ea8834-ac86-4346-8a33-228a782c2dd0
2025-08-05 16:27:38 +02:00 · 2006-07-15 17:01:59 +00:00
parent 7b8f0da356
commit 29d92430c5
2 changed files with 96 additions and 68 deletions
--- a/phpBB/includes/search/fulltext_native_improved.php
+++ b/phpBB/includes/search/fulltext_native_improved.php
@@ -47,10 +47,18 @@ class fulltext_native_improved extends search_backend

 		$this->word_length = array('min' => $config['fulltext_native_min_chars'], 'max' => $config['fulltext_native_max_chars']);

+		/**
+		* Load the UTF tools
+		*/
 		if (!class_exists('utf_normalizer'))
 		{
 			include($phpbb_root_path . 'includes/utf/utf_normalizer.' . $phpEx);
 		}
+		if (!function_exists('utf8_strlen'))
+		{
+			include($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx);
+		}
+

 		$error = false;
 	}
@@ -864,14 +872,6 @@ class fulltext_native_improved extends search_backend

 		$isset_min = $min - 1;

-		/**
-		* Load the UTF tools
-		*/
-		if (!function_exists('utf8_strlen'))
-		{
-			include($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx);
-		}
-
 		/**
 		* Clean up the string, remove HTML tags, remove BBCodes
 		*/
@@ -1259,11 +1259,6 @@ class fulltext_native_improved extends search_backend
 		$encoding = strtolower($encoding);
 		if ($encoding != 'utf-8')
 		{
-			if (!function_exists('utf8_recode'))
-			{
-				include($phpbb_root_path . 'includes/utf/utf_tools.' . $phpEx);
-			}
-
 			$text = utf8_recode($text, $encoding);
 		}

@@ -1277,7 +1272,7 @@ class fulltext_native_improved extends search_backend
 		/**
 		* Replace HTML entities and NCRs
 		*/
-		$text = html_entity_decode($this->decode_ncr($text), ENT_QUOTES);
+		$text = html_entity_decode(utf8_decode_ncr($text), ENT_QUOTES);

 		/**
 		* Load the UTF-8 normalizer
@@ -1481,60 +1476,6 @@ class fulltext_native_improved extends search_backend
 		return $ret;
 	}

-	/**
-	* Convert Numeric Character References to UTF-8 chars
-	*
-	* Notes:
-	*  - decimal NCRs are not converted recursively, if you pass &#38;#38; it will return &#38;
-	*  - hexadecimal NCRs are decoded first and the result is then scanned for decimal NCRs
-	*  - we DO NOT check for the existence of the Unicode characters, therefore an entity may be converted to a nonexistent codepoint
-	*
-	* @param	string	$text		String to convert, encoded in UTF-8 (no normal form required)
-	* @return	string				UTF-8 string where NCRs have been replaced with the actual chars
-	*/
-	function decode_ncr($text)
-	{
-		/**
-		* @todo replace me with preg_replace_callback() or a loop (the /e modifier evaluates the replacement string as PHP code)
-		*/
-		return preg_replace(
-			'/&#([0-9]{1,6});/e',
-			"\$this->cp_to_utf(\$1)",
-			
-			preg_replace(
-				'/&#x([0-9A-F]{1,5});/ie',
-				"\$this->cp_to_utf(hexdec('\$1'))",
-				$text
-			)
-		);
-	}
-
-	/**
-	* Convert a codepoint to a UTF-8 char
-	* The sequence length (1 to 4 bytes) is chosen from the codepoint's magnitude; the codepoint itself is not validated
-	* @param	integer	$cp			Unicode codepoint
-	* @return	string				UTF-8 string
-	*/
-	function cp_to_utf($cp)
-	{
-		if ($cp > 0xFFFF)
-		{
-			return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
-		}
-		elseif ($cp > 0x7FF)
-		{
-			return chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
-		}
-		elseif ($cp > 0x7F)
-		{
-			return chr(0xC0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3F));
-		}
-		else
-		{
-			return chr($cp);
-		}
-	}
-
 	/**
 	* Returns a list of options for the ACP to display
 	*/
--- a/phpBB/includes/utf/utf_tools.php
+++ b/phpBB/includes/utf/utf_tools.php
@@ -127,4 +127,91 @@ function utf8_recode($string, $encoding)
 	die('Finish me!! '.basename(__FILE__).' at line '.__LINE__);
 }

+/**
+* Replace all UTF-8 chars that are not in ASCII with their decimal NCR (e.g. &#233;)
+* Each lead byte (0xC2-0xF4) plus the run of continuation bytes (0x80-0xBF) after it is handed to utf8_encode_ncr_callback()
+* @param	string	$text		UTF-8 string in NFC
+* @return	string				String with every matched sequence replaced by its NCR (pure ASCII when $text is well-formed UTF-8)
+*/
+function utf8_encode_ncr($text)
+{
+	return preg_replace_callback('#[\\xC2-\\xF4][\\x80-\\xBF]+#', 'utf8_encode_ncr_callback', $text);
+}
+
+/**
+* Callback used in utf8_encode_ncr()
+*
+* Takes a UTF-8 char ($m[0]) and replaces it with its decimal NCR, picking the bit masks from
+* its byte length. The lead byte is not checked against that length. Attention, $m is an array
+* @param	array	$m			0-based numerically indexed array passed by preg_replace_callback()
+* @return	string				A HTML NCR if $m[0] is 1 to 4 bytes long, or the original string otherwise
+*/
+function utf8_encode_ncr_callback($m)
+{
+	switch (strlen($m[0]))
+	{
+		case 1:
+			return '&#' . ord($m[0]) . ';';
+
+		case 2:
+			return '&#' . (((ord($m[0][0]) & 0x1F) << 6) | (ord($m[0][1]) & 0x3F)) . ';';
+
+		case 3:
+			return '&#' . (((ord($m[0][0]) & 0x0F) << 12) | ((ord($m[0][1]) & 0x3F) << 6) | (ord($m[0][2]) & 0x3F)) . ';';
+
+		case 4:
+			return '&#' . (((ord($m[0][0]) & 0x07) << 18) | ((ord($m[0][1]) & 0x3F) << 12) | ((ord($m[0][2]) & 0x3F) << 6) | (ord($m[0][3]) & 0x3F)) . ';';
+
+		default:
+			return $m[0];
+	}		
+}
+
+/**
+* Convert Numeric Character References to UTF-8 chars
+*
+* Notes:
+*  - only &#NNN; (1 to 6 decimal digits) and &#xHHH; (1 to 5 hex digits, any case) are matched
+*  - we do not convert NCRs recursively, if you pass &#38;#38; it will return &#38;
+*  - we DO NOT check for the existence of the Unicode characters, therefore an entity may be converted to a nonexistent codepoint
+*
+* @param	string	$text		String to convert, encoded in UTF-8 (no normal form required)
+* @return	string				UTF-8 string where NCRs have been replaced with the actual chars
+*/
+function utf8_decode_ncr($text)
+{
+	return preg_replace_callback('/&#([0-9]{1,6}|x[0-9A-F]{1,5});/i', 'utf8_decode_ncr_callback', $text);
+}
+
+/**
+* Callback used in utf8_decode_ncr()
+*
+* Takes a NCR (in decimal, or hexadecimal when $m[1] starts with x or X) and returns a UTF-8 char. Attention, $m is an array.
+* The codepoint is not validated here; it is encoded as 1 to 4 bytes depending on its magnitude (up to 0x7F, 0x7FF, 0xFFFF, or above).
+*
+* @param	array	$m			0-based numerically indexed array passed by preg_replace_callback(); $m[1] holds the digits, prefixed with x for hexadecimal
+* @return	string				UTF-8 char
+*/
+function utf8_decode_ncr_callback($m)
+{
+	$cp = (strncasecmp($m[1], 'x', 1)) ? $m[1] : hexdec(substr($m[1], 1));
+
+	if ($cp > 0xFFFF)
+	{
+		return chr(0xF0 | ($cp >> 18)) . chr(0x80 | (($cp >> 12) & 0x3F)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
+	}
+	elseif ($cp > 0x7FF)
+	{
+		return chr(0xE0 | ($cp >> 12)) . chr(0x80 | (($cp >> 6) & 0x3F)) . chr(0x80 | ($cp & 0x3F));
+	}
+	elseif ($cp > 0x7F)
+	{
+		return chr(0xC0 | ($cp >> 6)) . chr(0x80 | ($cp & 0x3F));
+	}
+	else
+	{
+		return chr($cp);
+	}
+}
+
 ?>