mirror of
https://github.com/phpbb/phpbb.git
synced 2025-07-30 21:40:43 +02:00
- fixes for the following bugs:
#5326 #5318 #5304 #5290 #5288 #5278 #5276 #5272 #5266 - also fixed the "Call-time pass-by-reference" bug #5252 - within this step changed the normalize calls to require references. - added captcha size variables to the class scope (suggestion was posted at area51) git-svn-id: file:///svn/phpbb/trunk@6584 89ea8834-ac86-4346-8a33-228a782c2dd0
This commit is contained in:
@@ -67,10 +67,10 @@ class utf_normalizer
|
||||
* The ultimate convenience function! Clean up invalid UTF-8 sequences,
|
||||
* and convert to Normal Form C, canonical composition.
|
||||
*
|
||||
* @param string $str The dirty string
|
||||
* @param string &$str The dirty string
|
||||
* @return string The same string, all shiny and cleaned-up
|
||||
*/
|
||||
function cleanup($str)
|
||||
function cleanup(&$str)
|
||||
{
|
||||
// The string below is the list of all authorized characters, sorted by frequency in Latin text
|
||||
$pos = strspn($str, "\x20\x65\x69\x61\x73\x6E\x74\x72\x6F\x6C\x75\x64\x5D\x5B\x63\x6D\x70\x27\x0A\x67\x7C\x68\x76\x2E\x66\x62\x2C\x3A\x3D\x2D\x71\x31\x30\x43\x32\x2A\x79\x78\x29\x28\x4C\x39\x41\x53\x2F\x50\x22\x45\x6A\x4D\x49\x6B\x33\x3E\x35\x54\x3C\x44\x34\x7D\x42\x7B\x38\x46\x77\x52\x36\x37\x55\x47\x4E\x3B\x4A\x7A\x56\x23\x48\x4F\x57\x5F\x26\x21\x4B\x3F\x58\x51\x25\x59\x5C\x09\x5A\x2B\x7E\x5E\x24\x40\x60\x7F\x0D");
|
||||
@@ -79,7 +79,7 @@ class utf_normalizer
|
||||
if ($pos == $len)
|
||||
{
|
||||
// ASCII strings with no special chars return immediately
|
||||
return $str;
|
||||
return;
|
||||
}
|
||||
|
||||
// Note: we do not check for $GLOBALS['utf_canonical_decomp']. It is assumed they are always loaded together
|
||||
@@ -91,23 +91,22 @@ class utf_normalizer
|
||||
|
||||
// Replace any byte in the range 0x00..0x1F, except for \r, \n and \t
|
||||
// We replace those characters with a 0xFF byte, which is illegal in UTF-8 and will in turn be replaced with a UTF replacement char
|
||||
return utf_normalizer::recompose(
|
||||
strtr(
|
||||
$str,
|
||||
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
|
||||
"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
|
||||
),
|
||||
$pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']
|
||||
$str = strtr(
|
||||
$str,
|
||||
"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0B\x0C\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F",
|
||||
"\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"
|
||||
);
|
||||
|
||||
$str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate and normalize a UTF string to NFC
|
||||
*
|
||||
* @param string $str Unchecked UTF string
|
||||
* @param string &$str Unchecked UTF string
|
||||
* @return string The string, validated and in normal form
|
||||
*/
|
||||
function nfc($str)
|
||||
function nfc(&$str)
|
||||
{
|
||||
$pos = strspn($str, UTF8_ASCII_RANGE);
|
||||
$len = strlen($str);
|
||||
@@ -115,7 +114,7 @@ class utf_normalizer
|
||||
if ($pos == $len)
|
||||
{
|
||||
// ASCII strings return immediately
|
||||
return $str;
|
||||
return;
|
||||
}
|
||||
|
||||
if (!isset($GLOBALS['utf_nfc_qc']))
|
||||
@@ -124,16 +123,16 @@ class utf_normalizer
|
||||
include($phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx);
|
||||
}
|
||||
|
||||
return utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
|
||||
$str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfc_qc'], $GLOBALS['utf_canonical_decomp']);
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate and normalize a UTF string to NFKC
|
||||
*
|
||||
* @param string $str Unchecked UTF string
|
||||
* @param string &$str Unchecked UTF string
|
||||
* @return string The string, validated and in normal form
|
||||
*/
|
||||
function nfkc($str)
|
||||
function nfkc(&$str)
|
||||
{
|
||||
$pos = strspn($str, UTF8_ASCII_RANGE);
|
||||
$len = strlen($str);
|
||||
@@ -141,7 +140,7 @@ class utf_normalizer
|
||||
if ($pos == $len)
|
||||
{
|
||||
// ASCII strings return immediately
|
||||
return $str;
|
||||
return;
|
||||
}
|
||||
|
||||
if (!isset($GLOBALS['utf_nfkc_qc']))
|
||||
@@ -156,16 +155,16 @@ class utf_normalizer
|
||||
include($phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx);
|
||||
}
|
||||
|
||||
return utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
|
||||
$str = utf_normalizer::recompose($str, $pos, $len, $GLOBALS['utf_nfkc_qc'], $GLOBALS['utf_compatibility_decomp']);
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate and normalize a UTF string to NFD
|
||||
*
|
||||
* @param string $str Unchecked UTF string
|
||||
* @param string &$str Unchecked UTF string
|
||||
* @return string The string, validated and in normal form
|
||||
*/
|
||||
function nfd($str)
|
||||
function nfd(&$str)
|
||||
{
|
||||
$pos = strspn($str, UTF8_ASCII_RANGE);
|
||||
$len = strlen($str);
|
||||
@@ -173,7 +172,7 @@ class utf_normalizer
|
||||
if ($pos == $len)
|
||||
{
|
||||
// ASCII strings return immediately
|
||||
return $str;
|
||||
return;
|
||||
}
|
||||
|
||||
if (!isset($GLOBALS['utf_canonical_decomp']))
|
||||
@@ -182,16 +181,16 @@ class utf_normalizer
|
||||
include($phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx);
|
||||
}
|
||||
|
||||
return utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']);
|
||||
$str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_canonical_decomp']);
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate and normalize a UTF string to NFKD
|
||||
*
|
||||
* @param string $str Unchecked UTF string
|
||||
* @param string &$str Unchecked UTF string
|
||||
* @return string The string, validated and in normal form
|
||||
*/
|
||||
function nfkd($str)
|
||||
function nfkd(&$str)
|
||||
{
|
||||
$pos = strspn($str, UTF8_ASCII_RANGE);
|
||||
$len = strlen($str);
|
||||
@@ -199,7 +198,7 @@ class utf_normalizer
|
||||
if ($pos == $len)
|
||||
{
|
||||
// ASCII strings return immediately
|
||||
return $str;
|
||||
return;
|
||||
}
|
||||
|
||||
if (!isset($GLOBALS['utf_compatibility_decomp']))
|
||||
@@ -208,7 +207,7 @@ class utf_normalizer
|
||||
include($phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx);
|
||||
}
|
||||
|
||||
return utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']);
|
||||
$str = utf_normalizer::decompose($str, $pos, $len, $GLOBALS['utf_compatibility_decomp']);
|
||||
}
|
||||
|
||||
|
||||
@@ -239,14 +238,7 @@ class utf_normalizer
|
||||
$tmp = '';
|
||||
$i = $tmp_pos = $last_cc = 0;
|
||||
|
||||
if ($pos)
|
||||
{
|
||||
$buffer = array(++$i => $str[$pos - 1]);
|
||||
}
|
||||
else
|
||||
{
|
||||
$buffer = array();
|
||||
}
|
||||
$buffer = ($pos) ? array(++$i => $str[$pos - 1]) : array();
|
||||
|
||||
// UTF char length array
|
||||
// This array is used to determine the length of a UTF character.
|
||||
@@ -325,6 +317,9 @@ class utf_normalizer
|
||||
{
|
||||
// Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
|
||||
// has been encoded in a five- or six-byte sequence
|
||||
/**
|
||||
* @todo $trailing_bytes always == 5?
|
||||
*/
|
||||
if ($utf_char[0] >= "\xF8")
|
||||
{
|
||||
if ($utf_char[0] < "\xF8")
|
||||
@@ -421,6 +416,9 @@ class utf_normalizer
|
||||
|
||||
default:
|
||||
// Five- and six-byte sequences no longer need to be checked for here
|
||||
/**
|
||||
* @todo $trailing_bytes always == 5?
|
||||
*/
|
||||
if ($utf_char > UTF8_MAX)
|
||||
{
|
||||
// Out of the Unicode range
|
||||
@@ -1011,7 +1009,7 @@ class utf_normalizer
|
||||
ksort($utf_sort);
|
||||
}
|
||||
|
||||
foreach($utf_sort as $utf_chars)
|
||||
foreach ($utf_sort as $utf_chars)
|
||||
{
|
||||
$tmp .= implode('', $utf_chars);
|
||||
}
|
||||
@@ -1365,17 +1363,17 @@ class utf_normalizer
|
||||
// LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).
|
||||
//
|
||||
// The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte
|
||||
if ($tIndex = $idx % UNICODE_HANGUL_TCOUNT)
|
||||
if ($t_index = $idx % UNICODE_HANGUL_TCOUNT)
|
||||
{
|
||||
if ($tIndex < 25)
|
||||
if ($t_index < 25)
|
||||
{
|
||||
$utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x86\x00";
|
||||
$utf_char[8] = chr(0xA7 + $tIndex);
|
||||
$utf_char[8] = chr(0xA7 + $t_index);
|
||||
}
|
||||
else
|
||||
{
|
||||
$utf_char = "\xE1\x84\x00\xE1\x85\x00\xE1\x87\x00";
|
||||
$utf_char[8] = chr(0x67 + $tIndex);
|
||||
$utf_char[8] = chr(0x67 + $t_index);
|
||||
}
|
||||
}
|
||||
else
|
||||
@@ -1478,7 +1476,6 @@ class utf_normalizer
|
||||
}
|
||||
|
||||
return $tmp;
|
||||
|
||||
}
|
||||
else if ($tmp_pos)
|
||||
{
|
||||
|
Reference in New Issue
Block a user