2006-07-10 01:53:30 +00:00
< ? php
/**
*
* @ package phpBB3
* @ version $Id $
* @ copyright ( c ) 2005 phpBB Group
* @ license http :// opensource . org / licenses / gpl - license . php GNU Public License
*
*/
/**
* Some Unicode characters encoded in UTF - 8
*
* Preserved for compatibility
*/
define ( 'UTF8_REPLACEMENT' , " \xEF \xBF \xBD " );
define ( 'UTF8_MAX' , " \xF4 \x8F \xBF \xBF " );
define ( 'UTF8_FFFE' , " \xEF \xBF \xBE " );
define ( 'UTF8_FFFF' , " \xEF \xBF \xBF " );
define ( 'UTF8_SURROGATE_FIRST' , " \xED \xA0 \x80 " );
define ( 'UTF8_SURROGATE_LAST' , " \xED \xBF \xBF " );
2006-07-10 03:05:27 +00:00
define ( 'UTF8_HANGUL_FIRST' , " \xEA \xB0 \x80 " );
define ( 'UTF8_HANGUL_LAST' , " \xED \x9E \xA3 " );
2006-07-10 01:53:30 +00:00
2006-07-15 15:44:54 +00:00
define ( 'UTF8_CJK_FIRST' , " \xE4 \xB8 \x80 " );
define ( 'UTF8_CJK_LAST' , " \xE9 \xBE \xBB " );
define ( 'UTF8_CJK_B_FIRST' , " \xF0 \xA0 \x80 \x80 " );
define ( 'UTF8_CJK_B_LAST' , " \xF0 \xAA \x9B \x96 " );
2006-11-03 23:09:16 +00:00
// Unset global variables
unset ( $GLOBALS [ 'utf_jamo_index' ], $GLOBALS [ 'utf_jamo_type' ], $GLOBALS [ 'utf_nfc_qc' ], $GLOBALS [ 'utf_combining_class' ], $GLOBALS [ 'utf_canonical_comp' ], $GLOBALS [ 'utf_canonical_decomp' ], $GLOBALS [ 'utf_nfkc_qc' ], $GLOBALS [ 'utf_compatibility_decomp' ]);
// NFC_QC and NFKC_QC values
define ( 'UNICODE_QC_MAYBE' , 0 );
define ( 'UNICODE_QC_NO' , 1 );
// Contains all the ASCII characters appearing in UTF-8, sorted by frequency
define ( 'UTF8_ASCII_RANGE' , " \x20 \x65 \x69 \x61 \x73 \x6E \x74 \x72 \x6F \x6C \x75 \x64 \x5D \x5B \x63 \x6D \x70 \x27 \x0A \x67 \x7C \x68 \x76 \x2E \x66 \x62 \x2C \x3A \x3D \x2D \x71 \x31 \x30 \x43 \x32 \x2A \x79 \x78 \x29 \x28 \x4C \x39 \x41 \x53 \x2F \x50 \x22 \x45 \x6A \x4D \x49 \x6B \x33 \x3E \x35 \x54 \x3C \x44 \x34 \x7D \x42 \x7B \x38 \x46 \x77 \x52 \x36 \x37 \x55 \x47 \x4E \x3B \x4A \x7A \x56 \x23 \x48 \x4F \x57 \x5F \x26 \x21 \x4B \x3F \x58 \x51 \x25 \x59 \x5C \x09 \x5A \x2B \x7E \x5E \x24 \x40 \x60 \x7F \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0B \x0C \x0D \x0E \x0F \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1A \x1B \x1C \x1D \x1E \x1F " );
// Contains all the tail bytes that can appear in the composition of a UTF-8 char
define ( 'UTF8_TRAILING_BYTES' , " \xA9 \xA0 \xA8 \x80 \xAA \x99 \xA7 \xBB \xAB \x89 \x94 \x82 \xB4 \xA2 \xAE \x83 \xB0 \xB9 \xB8 \x93 \xAF \xBC \xB3 \x81 \xA4 \xB2 \x9C \xA1 \xB5 \xBE \xBD \xBA \x98 \xAD \xB1 \x84 \x95 \xA6 \xB6 \x88 \x8D \x90 \xB7 \xBF \x92 \x85 \xA5 \x97 \x8C \x86 \xA3 \x8E \x9F \x8F \x87 \x91 \x9D \xAC \x9E \x8B \x96 \x9B \x8A \x9A " );
// Constants used by the Hangul [de]composition algorithms
define ( 'UNICODE_HANGUL_SBASE' , 0xAC00 );
define ( 'UNICODE_HANGUL_LBASE' , 0x1100 );
define ( 'UNICODE_HANGUL_VBASE' , 0x1161 );
define ( 'UNICODE_HANGUL_TBASE' , 0x11A7 );
define ( 'UNICODE_HANGUL_SCOUNT' , 11172 );
define ( 'UNICODE_HANGUL_LCOUNT' , 19 );
define ( 'UNICODE_HANGUL_VCOUNT' , 21 );
define ( 'UNICODE_HANGUL_TCOUNT' , 28 );
define ( 'UNICODE_HANGUL_NCOUNT' , 588 );
define ( 'UNICODE_JAMO_L' , 0 );
define ( 'UNICODE_JAMO_V' , 1 );
define ( 'UNICODE_JAMO_T' , 2 );
2006-07-15 15:44:54 +00:00
2006-11-03 23:09:16 +00:00
/**
* Unicode normalization routines
*
* @ package phpBB3
*/
class utf_normalizer
2006-07-10 01:53:30 +00:00
{
2006-08-22 21:26:06 +00:00
/**
2006-11-03 23:09:16 +00:00
* Validate , cleanup and normalize a string
*
* The ultimate convenience function ! Clean up invalid UTF - 8 sequences ,
* and convert to Normal Form C , canonical composition .
2006-08-22 21:26:06 +00:00
*
2006-11-15 15:35:50 +00:00
* @ param string & $str The dirty string
2006-11-03 23:09:16 +00:00
* @ return string The same string , all shiny and cleaned - up
2006-08-22 21:26:06 +00:00
*/
2006-11-15 15:35:50 +00:00
function cleanup ( & $str )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// The string below is the list of all autorized characters, sorted by frequency in latin text
$pos = strspn ( $str , " \x20 \x65 \x69 \x61 \x73 \x6E \x74 \x72 \x6F \x6C \x75 \x64 \x5D \x5B \x63 \x6D \x70 \x27 \x0A \x67 \x7C \x68 \x76 \x2E \x66 \x62 \x2C \x3A \x3D \x2D \x71 \x31 \x30 \x43 \x32 \x2A \x79 \x78 \x29 \x28 \x4C \x39 \x41 \x53 \x2F \x50 \x22 \x45 \x6A \x4D \x49 \x6B \x33 \x3E \x35 \x54 \x3C \x44 \x34 \x7D \x42 \x7B \x38 \x46 \x77 \x52 \x36 \x37 \x55 \x47 \x4E \x3B \x4A \x7A \x56 \x23 \x48 \x4F \x57 \x5F \x26 \x21 \x4B \x3F \x58 \x51 \x25 \x59 \x5C \x09 \x5A \x2B \x7E \x5E \x24 \x40 \x60 \x7F \x0D " );
$len = strlen ( $str );
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( $pos == $len )
{
// ASCII strings with no special chars return immediately
2006-11-15 15:35:50 +00:00
return ;
2006-11-03 23:09:16 +00:00
}
2006-08-22 21:26:06 +00:00
2006-11-03 23:09:16 +00:00
// Note: we do not check for $GLOBALS['utf_canonical_decomp']. It is assumed they are always loaded together
if ( ! isset ( $GLOBALS [ 'utf_nfc_qc' ]))
{
global $phpbb_root_path , $phpEx ;
include ( $phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx );
}
2006-08-22 21:26:06 +00:00
2006-11-03 23:09:16 +00:00
// Replace any byte in the range 0x00..0x1F, except for \r, \n and \t
// We replace those characters with a 0xFF byte, which is illegal in UTF-8 and will in turn be replaced with a UTF replacement char
2006-11-15 15:35:50 +00:00
$str = strtr (
$str ,
" \x00 \x01 \x02 \x03 \x04 \x05 \x06 \x07 \x08 \x0B \x0C \x0E \x0F \x10 \x11 \x12 \x13 \x14 \x15 \x16 \x17 \x18 \x19 \x1A \x1B \x1C \x1D \x1E \x1F " ,
" \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF \xFF "
2006-11-03 23:09:16 +00:00
);
2006-11-15 15:35:50 +00:00
$str = utf_normalizer :: recompose ( $str , $pos , $len , $GLOBALS [ 'utf_nfc_qc' ], $GLOBALS [ 'utf_canonical_decomp' ]);
2006-11-03 23:09:16 +00:00
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
/**
* Validate and normalize a UTF string to NFC
*
2006-11-15 15:35:50 +00:00
* @ param string & $str Unchecked UTF string
2006-11-03 23:09:16 +00:00
* @ return string The string , validated and in normal form
*/
2006-11-15 15:35:50 +00:00
function nfc ( & $str )
2006-11-03 23:09:16 +00:00
{
$pos = strspn ( $str , UTF8_ASCII_RANGE );
$len = strlen ( $str );
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( $pos == $len )
{
// ASCII strings return immediately
2006-11-15 15:35:50 +00:00
return ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
if ( ! isset ( $GLOBALS [ 'utf_nfc_qc' ]))
2006-08-22 21:26:06 +00:00
{
2006-11-03 23:09:16 +00:00
global $phpbb_root_path , $phpEx ;
include ( $phpbb_root_path . 'includes/utf/data/utf_nfc_qc.' . $phpEx );
2006-08-22 21:26:06 +00:00
}
2006-07-10 01:53:30 +00:00
2006-11-15 15:35:50 +00:00
$str = utf_normalizer :: recompose ( $str , $pos , $len , $GLOBALS [ 'utf_nfc_qc' ], $GLOBALS [ 'utf_canonical_decomp' ]);
2006-11-03 23:09:16 +00:00
}
/**
* Validate and normalize a UTF string to NFKC
*
2006-11-15 15:35:50 +00:00
* @ param string & $str Unchecked UTF string
2006-11-03 23:09:16 +00:00
* @ return string The string , validated and in normal form
*/
2006-11-15 15:35:50 +00:00
function nfkc ( & $str )
2006-11-03 23:09:16 +00:00
{
$pos = strspn ( $str , UTF8_ASCII_RANGE );
$len = strlen ( $str );
if ( $pos == $len )
2006-08-22 21:26:06 +00:00
{
2006-11-03 23:09:16 +00:00
// ASCII strings return immediately
2006-11-15 15:35:50 +00:00
return ;
2006-08-22 21:26:06 +00:00
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( ! isset ( $GLOBALS [ 'utf_nfkc_qc' ]))
2006-08-22 21:26:06 +00:00
{
2006-11-03 23:09:16 +00:00
global $phpbb_root_path , $phpEx ;
include ( $phpbb_root_path . 'includes/utf/data/utf_nfkc_qc.' . $phpEx );
2006-08-22 21:26:06 +00:00
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( ! isset ( $GLOBALS [ 'utf_canonical_comp' ]))
2006-08-22 21:26:06 +00:00
{
2006-11-03 23:09:16 +00:00
global $phpbb_root_path , $phpEx ;
include ( $phpbb_root_path . 'includes/utf/data/utf_canonical_comp.' . $phpEx );
2006-08-22 21:26:06 +00:00
}
2006-11-03 23:09:16 +00:00
2006-11-15 15:35:50 +00:00
$str = utf_normalizer :: recompose ( $str , $pos , $len , $GLOBALS [ 'utf_nfkc_qc' ], $GLOBALS [ 'utf_compatibility_decomp' ]);
2006-07-10 01:53:30 +00:00
}
/**
2006-11-03 23:09:16 +00:00
* Validate and normalize a UTF string to NFD
2006-07-10 01:53:30 +00:00
*
2006-11-15 15:35:50 +00:00
* @ param string & $str Unchecked UTF string
2006-11-03 23:09:16 +00:00
* @ return string The string , validated and in normal form
2006-07-10 01:53:30 +00:00
*/
2006-11-15 15:35:50 +00:00
function nfd ( & $str )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$pos = strspn ( $str , UTF8_ASCII_RANGE );
$len = strlen ( $str );
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( $pos == $len )
{
// ASCII strings return immediately
2006-11-15 15:35:50 +00:00
return ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
if ( ! isset ( $GLOBALS [ 'utf_canonical_decomp' ]))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
global $phpbb_root_path , $phpEx ;
include ( $phpbb_root_path . 'includes/utf/data/utf_canonical_decomp.' . $phpEx );
}
2006-07-10 01:53:30 +00:00
2006-11-15 15:35:50 +00:00
$str = utf_normalizer :: decompose ( $str , $pos , $len , $GLOBALS [ 'utf_canonical_decomp' ]);
2006-11-03 23:09:16 +00:00
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
/**
* Validate and normalize a UTF string to NFKD
*
2006-11-15 15:35:50 +00:00
* @ param string & $str Unchecked UTF string
2006-11-03 23:09:16 +00:00
* @ return string The string , validated and in normal form
*/
2006-11-15 15:35:50 +00:00
function nfkd ( & $str )
2006-11-03 23:09:16 +00:00
{
$pos = strspn ( $str , UTF8_ASCII_RANGE );
$len = strlen ( $str );
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( $pos == $len )
{
// ASCII strings return immediately
2006-11-15 15:35:50 +00:00
return ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
if ( ! isset ( $GLOBALS [ 'utf_compatibility_decomp' ]))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
global $phpbb_root_path , $phpEx ;
include ( $phpbb_root_path . 'includes/utf/data/utf_compatibility_decomp.' . $phpEx );
}
2006-07-10 01:53:30 +00:00
2006-11-15 15:35:50 +00:00
$str = utf_normalizer :: decompose ( $str , $pos , $len , $GLOBALS [ 'utf_compatibility_decomp' ]);
2006-11-03 23:09:16 +00:00
}
2006-07-10 01:53:30 +00:00
2006-08-22 21:26:06 +00:00
2006-11-03 23:09:16 +00:00
/**
* Recompose a UTF string
*
2006-11-17 19:37:57 +00:00
* @ param string $str Unchecked UTF string
* @ param integer $pos Position of the first UTF char ( in bytes )
* @ param integer $len Length of the string ( in bytes )
* @ param array & $qc Quick - check array , passed by reference but never modified
* @ param array & $decomp_map Decomposition mapping , passed by reference but never modified
* @ return string The string , validated and recomposed
2006-11-03 23:09:16 +00:00
*
* @ access private
*/
function recompose ( $str , $pos , $len , & $qc , & $decomp_map )
{
global $utf_combining_class , $utf_canonical_comp , $utf_jamo_type , $utf_jamo_index ;
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
// Load some commonly-used tables
if ( ! isset ( $utf_jamo_index , $utf_jamo_type , $utf_combining_class ))
2006-07-10 01:53:30 +00:00
{
2007-02-12 23:19:41 +00:00
global $phpbb_root_path , $phpEx ;
include ( $phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx );
2006-11-03 23:09:16 +00:00
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
// Buffer the last ASCII char before the UTF-8 stuff if applicable
$tmp = '' ;
$i = $tmp_pos = $last_cc = 0 ;
2006-07-10 01:53:30 +00:00
2006-11-15 15:35:50 +00:00
$buffer = ( $pos ) ? array ( ++ $i => $str [ $pos - 1 ]) : array ();
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
// UTF char length array
// This array is used to determine the length of a UTF character.
// Be $c the result of ($str[$pos] & "\xF0") --where $str is the string we're operating on and $pos
// the position of the cursor--, if $utf_len_mask[$c] does not exist, the byte is an ASCII char.
// Otherwise, if $utf_len_mask[$c] is greater than 0, we have a the leading byte of a multibyte character
// whose length is $utf_len_mask[$c] and if it is equal to 0, the byte is a trailing byte.
$utf_len_mask = array (
// Leading bytes masks
" \xC0 " => 2 , " \xD0 " => 2 , " \xE0 " => 3 , " \xF0 " => 4 ,
// Trailing bytes masks
" \x80 " => 0 , " \x90 " => 0 , " \xA0 " => 0 , " \xB0 " => 0
);
$extra_check = array (
" \xED " => 1 , " \xEF " => 1 , " \xC0 " => 1 , " \xC1 " => 1 , " \xE0 " => 1 , " \xF0 " => 1 ,
" \xF4 " => 1 , " \xF5 " => 1 , " \xF6 " => 1 , " \xF7 " => 1 , " \xF8 " => 1 , " \xF9 " => 1 ,
" \xFA " => 1 , " \xFB " => 1 , " \xFC " => 1 , " \xFD " => 1 , " \xFE " => 1 , " \xFF " => 1
);
$utf_validation_mask = array (
2 => " \xE0 \xC0 " ,
3 => " \xF0 \xC0 \xC0 " ,
4 => " \xF8 \xC0 \xC0 \xC0 "
);
$utf_validation_check = array (
2 => " \xC0 \x80 " ,
3 => " \xE0 \x80 \x80 " ,
4 => " \xF0 \x80 \x80 \x80 "
);
// Main loop
do
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// STEP 0: Capture the current char and buffer it
$c = $str [ $pos ];
$c_mask = $c & " \xF0 " ;
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( isset ( $utf_len_mask [ $c_mask ]))
2006-08-22 21:26:06 +00:00
{
2006-11-03 23:09:16 +00:00
// Byte at $pos is either a leading byte or a missplaced trailing byte
if ( $utf_len = $utf_len_mask [ $c_mask ])
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// Capture the char
$buffer [ ++ $i & 7 ] = $utf_char = substr ( $str , $pos , $utf_len );
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
// Let's find out if a thorough check is needed
if ( isset ( $qc [ $utf_char ]))
{
// If the UTF char is in the qc array then it may not be in normal form. We do nothing here, the actual processing is below this "if" block
}
else if ( isset ( $utf_combining_class [ $utf_char ]))
{
if ( $utf_combining_class [ $utf_char ] < $last_cc )
{
// A combining character that is NOT canonically ordered
}
else
2006-08-22 21:26:06 +00:00
{
2006-11-03 23:09:16 +00:00
// A combining character that IS canonically ordered, skip to the next char
$last_cc = $utf_combining_class [ $utf_char ];
$pos += $utf_len ;
continue ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
}
else
{
// At this point, $utf_char holds a UTF char that we know is not a NF[K]C_QC and is not a combining character.
// It can be a singleton, a canonical composite, a replacement char or an even an ill-formed bunch of bytes. Let's find out
$last_cc = 0 ;
// Check that we have the correct number of trailing bytes
if (( $utf_char & $utf_validation_mask [ $utf_len ]) != $utf_validation_check [ $utf_len ])
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
// has been encoded in a five- or six- byte sequence
if ( $utf_char [ 0 ] >= " \xF8 " )
2006-07-10 01:53:30 +00:00
{
2006-11-16 20:16:36 +00:00
if ( $utf_char [ 0 ] < " \xFC " )
2006-11-03 23:09:16 +00:00
{
$trailing_bytes = 4 ;
}
2006-11-16 20:16:36 +00:00
else if ( $utf_char [ 0 ] > " \xFD " )
2006-11-03 23:09:16 +00:00
{
$trailing_bytes = 0 ;
}
else
{
$trailing_bytes = 5 ;
}
2006-07-10 01:53:30 +00:00
}
else
{
2006-11-03 23:09:16 +00:00
$trailing_bytes = $utf_len - 1 ;
2006-08-22 21:26:06 +00:00
}
2006-11-03 23:09:16 +00:00
$tmp .= substr ( $str , $tmp_pos , $pos - $tmp_pos ) . UTF8_REPLACEMENT ;
$pos += strspn ( $str , UTF8_TRAILING_BYTES , ++ $pos , $trailing_bytes );
$tmp_pos = $pos ;
continue ;
2006-07-10 01:53:30 +00:00
}
2006-08-22 21:26:06 +00:00
2006-11-03 23:09:16 +00:00
if ( isset ( $extra_check [ $c ]))
{
switch ( $c )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// Note: 0xED is quite common in Korean
case " \xED " :
if ( $utf_char >= " \xED \xA0 \x80 " )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
$tmp .= substr ( $str , $tmp_pos , $pos - $tmp_pos ) . UTF8_REPLACEMENT ;
$pos += $utf_len ;
$tmp_pos = $pos ;
continue 2 ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
break ;
// Note: 0xEF is quite common in Japanese
case " \xEF " :
if ( $utf_char == " \xEF \xBF \xBE " || $utf_char == " \xEF \xBF \xBF " )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
$tmp .= substr ( $str , $tmp_pos , $pos - $tmp_pos ) . UTF8_REPLACEMENT ;
$pos += $utf_len ;
$tmp_pos = $pos ;
continue 2 ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
break ;
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
case " \xC0 " :
case " \xC1 " :
if ( $utf_char <= " \xC1 \xBF " )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
$tmp .= substr ( $str , $tmp_pos , $pos - $tmp_pos ) . UTF8_REPLACEMENT ;
$pos += $utf_len ;
$tmp_pos = $pos ;
continue 2 ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
break ;
case " \xE0 " :
if ( $utf_char <= " \xE0 \x9F \xBF " )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// Unicode char U+0000..U+07FF encoded in 3 bytes
$tmp .= substr ( $str , $tmp_pos , $pos - $tmp_pos ) . UTF8_REPLACEMENT ;
$pos += $utf_len ;
$tmp_pos = $pos ;
continue 2 ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
break ;
2006-08-22 21:26:06 +00:00
2006-11-03 23:09:16 +00:00
case " \xF0 " :
if ( $utf_char <= " \xF0 \x8F \xBF \xBF " )
{
// Unicode char U+0000..U+FFFF encoded in 4 bytes
$tmp .= substr ( $str , $tmp_pos , $pos - $tmp_pos ) . UTF8_REPLACEMENT ;
$pos += $utf_len ;
$tmp_pos = $pos ;
continue 2 ;
}
break ;
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
default :
// Five- and six- byte sequences do not need being checked for here anymore
if ( $utf_char > UTF8_MAX )
{
// Out of the Unicode range
if ( $utf_char [ 0 ] < " \xF8 " )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$trailing_bytes = 3 ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
else if ( $utf_char [ 0 ] < " \xFC " )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$trailing_bytes = 4 ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
else if ( $utf_char [ 0 ] > " \xFD " )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$trailing_bytes = 0 ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
else
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$trailing_bytes = 5 ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
$tmp .= substr ( $str , $tmp_pos , $pos - $tmp_pos ) . UTF8_REPLACEMENT ;
$pos += strspn ( $str , UTF8_TRAILING_BYTES , ++ $pos , $trailing_bytes );
$tmp_pos = $pos ;
continue 2 ;
}
break ;
2006-07-10 01:53:30 +00:00
}
}
2006-11-03 23:09:16 +00:00
// The char is a valid starter, move the cursor and go on
$pos += $utf_len ;
2006-07-10 01:53:30 +00:00
continue ;
}
2006-11-03 23:09:16 +00:00
}
else
{
// A trailing byte came out of nowhere, we will advance the cursor and treat the this byte and all following trailing bytes as if
// each of them was a Unicode replacement char
$spn = strspn ( $str , UTF8_TRAILING_BYTES , $pos );
$tmp .= substr ( $str , $tmp_pos , $pos - $tmp_pos ) . str_repeat ( UTF8_REPLACEMENT , $spn );
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
$pos += $spn ;
$tmp_pos = $pos ;
continue ;
}
2006-07-10 01:53:30 +00:00
2006-08-22 21:26:06 +00:00
2006-11-03 23:09:16 +00:00
// STEP 1: Decompose current char
// We have found a character that is either:
// - in the NFC_QC/NFKC_QC list
// - a non-starter char that is not canonically ordered
//
// We are going to capture the shortest UTF sequence that satisfies these two conditions:
//
// 1 - If the sequence does not start at the begginning of the string, it must begin with a starter,
// and that starter must not have the NF[K]C_QC property equal to "MAYBE"
//
// 2 - If the sequence does not end at the end of the string, it must end with a non-starter and be
// immediately followed by a starter that is not on the QC list
//
$utf_seq = array ();
$last_cc = 0 ;
$lpos = $pos ;
$pos += $utf_len ;
if ( isset ( $decomp_map [ $utf_char ]))
{
$_pos = 0 ;
$_len = strlen ( $decomp_map [ $utf_char ]);
2006-08-22 21:26:06 +00:00
2006-11-03 23:09:16 +00:00
do
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$_utf_len =& $utf_len_mask [ $decomp_map [ $utf_char ][ $_pos ] & " \xF0 " ];
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( isset ( $_utf_len ))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$utf_seq [] = substr ( $decomp_map [ $utf_char ], $_pos , $_utf_len );
$_pos += $_utf_len ;
}
else
{
$utf_seq [] = $decomp_map [ $utf_char ][ $_pos ];
++ $_pos ;
2006-07-10 01:53:30 +00:00
}
}
2006-11-03 23:09:16 +00:00
while ( $_pos < $_len );
}
else
{
// The char is not decomposable
$utf_seq = array ( $utf_char );
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
// STEP 2: Capture the starter
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
// Check out the combining class of the first character of the UTF sequence
$k = 0 ;
if ( isset ( $utf_combining_class [ $utf_seq [ 0 ]]) || $qc [ $utf_char ] == UNICODE_QC_MAYBE )
{
// Not a starter, inspect previous characters
// The last 8 characters are kept in a buffer so that we don't have to capture them everytime.
// This is enough for all real-life strings but even if it wasn't, we can capture characters in backward mode,
// although it is slower than this method.
//
// In the following loop, $j starts at the previous buffered character ($i - 1, because current character is
// at offset $i) and process them in backward mode until we find a starter.
//
// $k is the index on each UTF character inside of our UTF sequence. At this time, $utf_seq contains one or more
// characters numbered 0 to n. $k starts at 0 and for each char we prepend we pre-decrement it and for numbering
$starter_found = 0 ;
$j_min = max ( 1 , $i - 7 );
for ( $j = $i - 1 ; $j >= $j_min && $lpos > $tmp_pos ; -- $j )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$utf_char = $buffer [ $j & 7 ];
$lpos -= strlen ( $utf_char );
2006-08-22 21:26:06 +00:00
2006-11-03 23:09:16 +00:00
if ( isset ( $decomp_map [ $utf_char ]))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// The char is a composite, decompose for storage
$decomp_seq = array ();
$_pos = 0 ;
$_len = strlen ( $decomp_map [ $utf_char ]);
2006-08-22 21:26:06 +00:00
2006-11-03 23:09:16 +00:00
do
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$c = $decomp_map [ $utf_char ][ $_pos ];
$_utf_len =& $utf_len_mask [ $c & " \xF0 " ];
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( isset ( $_utf_len ))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$decomp_seq [] = substr ( $decomp_map [ $utf_char ], $_pos , $_utf_len );
$_pos += $_utf_len ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
else
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$decomp_seq [] = $c ;
++ $_pos ;
2006-08-22 21:26:06 +00:00
}
2006-11-03 23:09:16 +00:00
}
while ( $_pos < $_len );
// Prepend the UTF sequence with our decomposed sequence
if ( isset ( $decomp_seq [ 1 ]))
{
// The char expanded into several chars
$decomp_cnt = sizeof ( $decomp_seq );
foreach ( $decomp_seq as $decomp_i => $decomp_char )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$utf_seq [ $k + $decomp_i - $decomp_cnt ] = $decomp_char ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
$k -= $decomp_cnt ;
2006-07-10 01:53:30 +00:00
}
else
{
2006-11-03 23:09:16 +00:00
// Decomposed to a single char, easier to prepend
$utf_seq [ -- $k ] = $decomp_seq [ 0 ];
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
}
else
{
$utf_seq [ -- $k ] = $utf_char ;
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( ! isset ( $utf_combining_class [ $utf_seq [ $k ]]))
{
// We have found our starter
$starter_found = 1 ;
break ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( ! $starter_found && $lpos > $tmp_pos )
{
// The starter was not found in the buffer, let's rewind some more
do
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// $utf_len_mask contains the masks of both leading bytes and trailing bytes. If $utf_en > 0 then it's a leading byte, otherwise it's a trailing byte.
$c = $str [ -- $lpos ];
$c_mask = $c & " \xF0 " ;
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( isset ( $utf_len_mask [ $c_mask ]))
{
// UTF byte
if ( $utf_len = $utf_len_mask [ $c_mask ])
2006-08-22 21:26:06 +00:00
{
2006-11-03 23:09:16 +00:00
// UTF *leading* byte
$utf_char = substr ( $str , $lpos , $utf_len );
if ( isset ( $decomp_map [ $utf_char ]))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// Decompose the character
$decomp_seq = array ();
$_pos = 0 ;
$_len = strlen ( $decomp_map [ $utf_char ]);
2006-08-22 21:26:06 +00:00
2006-11-03 23:09:16 +00:00
do
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$c = $decomp_map [ $utf_char ][ $_pos ];
$_utf_len =& $utf_len_mask [ $c & " \xF0 " ];
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( isset ( $_utf_len ))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$decomp_seq [] = substr ( $decomp_map [ $utf_char ], $_pos , $_utf_len );
$_pos += $_utf_len ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
else
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$decomp_seq [] = $c ;
++ $_pos ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
}
while ( $_pos < $_len );
// Prepend the UTF sequence with our decomposed sequence
if ( isset ( $decomp_seq [ 1 ]))
{
// The char expanded into several chars
$decomp_cnt = sizeof ( $decomp_seq );
foreach ( $decomp_seq as $decomp_i => $utf_char )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$utf_seq [ $k + $decomp_i - $decomp_cnt ] = $utf_char ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
$k -= $decomp_cnt ;
2006-07-10 01:53:30 +00:00
}
else
{
2006-11-03 23:09:16 +00:00
// Decomposed to a single char, easier to prepend
$utf_seq [ -- $k ] = $decomp_seq [ 0 ];
2006-07-10 01:53:30 +00:00
}
}
2006-11-03 23:09:16 +00:00
else
{
$utf_seq [ -- $k ] = $utf_char ;
}
2006-08-22 21:26:06 +00:00
}
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
else
{
// ASCII char
$utf_seq [ -- $k ] = $c ;
}
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
while ( $lpos > $tmp_pos );
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
// STEP 3: Capture following combining modifiers
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
while ( $pos < $len )
{
$c_mask = $str [ $pos ] & " \xF0 " ;
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( isset ( $utf_len_mask [ $c_mask ]))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
if ( $utf_len = $utf_len_mask [ $c_mask ])
{
$utf_char = substr ( $str , $pos , $utf_len );
}
else
{
// A trailing byte came out of nowhere
// Trailing bytes are replaced with Unicode replacement chars, we will just ignore it for now, break out of the loop
// as if it was a starter (replacement chars ARE starters) and let the next loop replace it
break ;
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( isset ( $utf_combining_class [ $utf_char ]) || isset ( $qc [ $utf_char ]))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// Combining character, add it to the sequence and move the cursor
if ( isset ( $decomp_map [ $utf_char ]))
2006-08-22 21:26:06 +00:00
{
2006-11-03 23:09:16 +00:00
// Decompose the character
$_pos = 0 ;
$_len = strlen ( $decomp_map [ $utf_char ]);
2006-08-22 21:26:06 +00:00
2006-11-03 23:09:16 +00:00
do
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$c = $decomp_map [ $utf_char ][ $_pos ];
$_utf_len =& $utf_len_mask [ $c & " \xF0 " ];
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( isset ( $_utf_len ))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$utf_seq [] = substr ( $decomp_map [ $utf_char ], $_pos , $_utf_len );
$_pos += $_utf_len ;
}
else
{
$utf_seq [] = $c ;
++ $_pos ;
2006-07-10 01:53:30 +00:00
}
}
2006-11-03 23:09:16 +00:00
while ( $_pos < $_len );
2006-07-10 01:53:30 +00:00
}
else
{
2006-11-03 23:09:16 +00:00
$utf_seq [] = $utf_char ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
$pos += $utf_len ;
2006-07-10 01:53:30 +00:00
}
else
{
2006-11-03 23:09:16 +00:00
// Combining class 0 and no QC, break out of the loop
// Note: we do not know if that character is valid. If it's not, the next iteration will replace it
2006-07-10 01:53:30 +00:00
break ;
}
}
2006-11-03 23:09:16 +00:00
else
{
// ASCII chars are starters
break ;
}
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
// STEP 4: Sort and combine
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
// Here we sort...
$k_max = $k + sizeof ( $utf_seq );
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( ! $k && $k_max == 1 )
{
// There is only one char in the UTF sequence, add it then jump to the next iteration of main loop
2006-08-22 21:26:06 +00:00
// Note: the two commented lines below can be enabled under PHP5 for a very small performance gain in most cases
// if (substr_compare($str, $utf_seq[0], $lpos, $pos - $lpos))
// {
2006-11-03 23:09:16 +00:00
$tmp .= substr ( $str , $tmp_pos , $lpos - $tmp_pos ) . $utf_seq [ 0 ];
$tmp_pos = $pos ;
2006-08-22 21:26:06 +00:00
// }
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
continue ;
}
// ...there we combine
if ( isset ( $utf_combining_class [ $utf_seq [ $k ]]))
{
$starter = $nf_seq = '' ;
}
else
{
$starter = $utf_seq [ $k ++ ];
$nf_seq = '' ;
}
$utf_sort = array ();
// We add an empty char at the end of the UTF char sequence. It will act as a starter and trigger the sort/combine routine
// at the end of the string without altering it
$utf_seq [] = '' ;
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
do
{
$utf_char = $utf_seq [ $k ++ ];
if ( isset ( $utf_combining_class [ $utf_char ]))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$utf_sort [ $utf_combining_class [ $utf_char ]][] = $utf_char ;
2006-07-10 01:53:30 +00:00
}
else
{
2006-11-03 23:09:16 +00:00
if ( empty ( $utf_sort ))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// No combining characters... check for a composite of the two starters
if ( isset ( $utf_canonical_comp [ $starter . $utf_char ]))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// Good ol' composite character
$starter = $utf_canonical_comp [ $starter . $utf_char ];
}
else if ( isset ( $utf_jamo_type [ $utf_char ]))
{
// Current char is a composable jamo
if ( isset ( $utf_jamo_type [ $starter ]) && $utf_jamo_type [ $starter ] == UNICODE_JAMO_L && $utf_jamo_type [ $utf_char ] == UNICODE_JAMO_V )
2006-08-22 21:26:06 +00:00
{
2006-11-03 23:09:16 +00:00
// We have a L jamo followed by a V jamo, we are going to prefetch the next char to see if it's a T jamo
if ( isset ( $utf_jamo_type [ $utf_seq [ $k ]]) && $utf_jamo_type [ $utf_seq [ $k ]] == UNICODE_JAMO_T )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// L+V+T jamos, combine to a LVT Hangul syllable ($k is incremented)
$cp = $utf_jamo_index [ $starter ] + $utf_jamo_index [ $utf_char ] + $utf_jamo_index [ $utf_seq [ $k ]];
++ $k ;
2006-07-10 01:53:30 +00:00
}
else
{
2006-11-03 23:09:16 +00:00
// L+V jamos, combine to a LV Hangul syllable
$cp = $utf_jamo_index [ $starter ] + $utf_jamo_index [ $utf_char ];
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
$starter = chr ( 0xE0 | ( $cp >> 12 )) . chr ( 0x80 | (( $cp >> 6 ) & 0x3F )) . chr ( 0x80 | ( $cp & 0x3F ));
2006-07-10 01:53:30 +00:00
}
else
{
2006-11-03 23:09:16 +00:00
// Non-composable jamo, just add it to the sequence
2006-07-10 01:53:30 +00:00
$nf_seq .= $starter ;
$starter = $utf_char ;
}
}
else
{
2006-11-03 23:09:16 +00:00
// No composite, just add the first starter to the sequence then continue with the other one
$nf_seq .= $starter ;
$starter = $utf_char ;
}
}
else
{
ksort ( $utf_sort );
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
// For each class of combining characters
foreach ( $utf_sort as $cc => $utf_chars )
{
$j = 0 ;
2006-08-22 21:26:06 +00:00
2006-11-03 23:09:16 +00:00
do
{
// Look for a composite
if ( isset ( $utf_canonical_comp [ $starter . $utf_chars [ $j ]]))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// Found a composite, replace the starter
$starter = $utf_canonical_comp [ $starter . $utf_chars [ $j ]];
unset ( $utf_sort [ $cc ][ $j ]);
}
else
{
// No composite, all following characters in that class are blocked
break ;
2006-07-10 01:53:30 +00:00
}
}
2006-11-03 23:09:16 +00:00
while ( isset ( $utf_sort [ $cc ][ ++ $j ]));
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
// Add the starter to the normalized sequence, followed by non-starters in canonical order
$nf_seq .= $starter ;
2006-08-22 21:26:06 +00:00
2006-11-03 23:09:16 +00:00
foreach ( $utf_sort as $utf_chars )
{
if ( ! empty ( $utf_chars ))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$nf_seq .= implode ( '' , $utf_chars );
2006-07-10 01:53:30 +00:00
}
}
2006-11-03 23:09:16 +00:00
// Reset the array and go on
$utf_sort = array ();
$starter = $utf_char ;
2006-08-22 21:26:06 +00:00
}
}
}
2006-11-03 23:09:16 +00:00
while ( $k <= $k_max );
$tmp .= substr ( $str , $tmp_pos , $lpos - $tmp_pos ) . $nf_seq ;
$tmp_pos = $pos ;
}
else
{
// Only a ASCII char can make the program get here
//
// First we skip the current byte with ++$pos, then we quickly skip following ASCII chars with strspn().
//
// The first two "if"'s here can be removed, with the consequences of being faster on latin text (lots of ASCII) and slower on
// multi-byte text (where the only ASCII chars are spaces and punctuation)
if ( ++ $pos != $len )
2006-08-22 21:26:06 +00:00
{
2006-11-03 23:09:16 +00:00
if ( $str [ $pos ] < " \x80 " )
2006-08-22 21:26:06 +00:00
{
2006-11-03 23:09:16 +00:00
$pos += strspn ( $str , UTF8_ASCII_RANGE , ++ $pos );
$buffer [ ++ $i & 7 ] = $str [ $pos - 1 ];
}
else
{
$buffer [ ++ $i & 7 ] = $c ;
2006-07-10 01:53:30 +00:00
}
}
}
2006-11-03 23:09:16 +00:00
}
while ( $pos < $len );
2006-08-22 21:26:06 +00:00
2006-11-03 23:09:16 +00:00
// Now is time to return the string
if ( $tmp_pos )
{
// If the $tmp_pos cursor is not at the beggining of the string then at least one character was not in normal form. Replace $str with the fixed version
if ( $tmp_pos == $len )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
return $tmp ;
}
else
{
// The rightmost chunk of $str has not been appended to $tmp yet
return $tmp . substr ( $str , $tmp_pos );
2006-07-10 01:53:30 +00:00
}
}
2006-11-03 23:09:16 +00:00
// The string was already in normal form
return $str ;
}
/**
* Decompose a UTF string
*
2006-11-17 19:37:57 +00:00
* @ param string $str UTF string
* @ param integer $pos Position of the first UTF char ( in bytes )
* @ param integer $len Length of the string ( in bytes )
* @ param array & $decomp_map Decomposition mapping , passed by reference but never modified
* @ return string The string , decomposed and sorted canonically
2006-11-03 23:09:16 +00:00
*
* @ access private
*/
function decompose ( $str , $pos , $len , & $decomp_map )
{
2007-02-12 23:19:41 +00:00
global $utf_combining_class ;
2006-11-03 23:09:16 +00:00
// Load some commonly-used tables
if ( ! isset ( $utf_combining_class ))
2006-07-10 01:53:30 +00:00
{
2007-02-12 23:19:41 +00:00
global $phpbb_root_path , $phpEx ;
include ( $phpbb_root_path . 'includes/utf/data/utf_normalizer_common.' . $phpEx );
2006-11-03 23:09:16 +00:00
}
2006-08-22 21:26:06 +00:00
2006-11-03 23:09:16 +00:00
// UTF char length array
$utf_len_mask = array (
// Leading bytes masks
" \xC0 " => 2 , " \xD0 " => 2 , " \xE0 " => 3 , " \xF0 " => 4 ,
// Trailing bytes masks
" \x80 " => 0 , " \x90 " => 0 , " \xA0 " => 0 , " \xB0 " => 0
);
// Some extra checks are triggered on the first byte of a UTF sequence
$extra_check = array (
" \xED " => 1 , " \xEF " => 1 , " \xC0 " => 1 , " \xC1 " => 1 , " \xE0 " => 1 , " \xF0 " => 1 ,
" \xF4 " => 1 , " \xF5 " => 1 , " \xF6 " => 1 , " \xF7 " => 1 , " \xF8 " => 1 , " \xF9 " => 1 ,
" \xFA " => 1 , " \xFB " => 1 , " \xFC " => 1 , " \xFD " => 1 , " \xFE " => 1 , " \xFF " => 1
);
// These masks are used to check if a UTF sequence is well formed. Here are the only 3 lengths we acknowledge:
// - 2-byte: 110? ???? 10?? ????
// - 3-byte: 1110 ???? 10?? ???? 10?? ????
// - 4-byte: 1111 0??? 10?? ???? 10?? ???? 10?? ????
// Note that 5- and 6- byte sequences are automatically discarded
$utf_validation_mask = array (
2 => " \xE0 \xC0 " ,
3 => " \xF0 \xC0 \xC0 " ,
4 => " \xF8 \xC0 \xC0 \xC0 "
);
$utf_validation_check = array (
2 => " \xC0 \x80 " ,
3 => " \xE0 \x80 \x80 " ,
4 => " \xF0 \x80 \x80 \x80 "
);
$tmp = '' ;
$starter_pos = $pos ;
$tmp_pos = $last_cc = $sort = $dump = 0 ;
$utf_sort = array ();
// Main loop
do
{
// STEP 0: Capture the current char
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
$cur_mask = $str [ $pos ] & " \xF0 " ;
if ( isset ( $utf_len_mask [ $cur_mask ]))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
if ( $utf_len = $utf_len_mask [ $cur_mask ])
2006-08-22 21:26:06 +00:00
{
2006-11-03 23:09:16 +00:00
// Multibyte char
$utf_char = substr ( $str , $pos , $utf_len );
$pos += $utf_len ;
}
else
{
// A trailing byte came out of nowhere, we will treat it and all following trailing bytes as if each of them was a Unicode
// replacement char and we will advance the cursor
$spn = strspn ( $str , UTF8_TRAILING_BYTES , $pos );
if ( $dump )
2006-08-22 21:26:06 +00:00
{
2006-11-03 23:09:16 +00:00
$tmp .= substr ( $str , $tmp_pos , $starter_pos - $tmp_pos );
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
// Dump combiners
if ( ! empty ( $utf_sort ))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
if ( $sort )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
ksort ( $utf_sort );
}
2006-08-22 21:26:06 +00:00
2006-11-15 15:35:50 +00:00
foreach ( $utf_sort as $utf_chars )
2006-11-03 23:09:16 +00:00
{
$tmp .= implode ( '' , $utf_chars );
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
}
$tmp .= str_repeat ( UTF8_REPLACEMENT , $spn );
$dump = $sort = 0 ;
}
else
{
$tmp .= substr ( $str , $tmp_pos , $pos - $tmp_pos ) . str_repeat ( UTF8_REPLACEMENT , $spn );
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
$pos += $spn ;
$tmp_pos = $starter_pos = $pos ;
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
$utf_sort = array ();
$last_cc = 0 ;
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
continue ;
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
// STEP 1: Decide what to do with current char
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
// Now, in that order:
// - check if that character is decomposable
// - check if that character is a non-starter
// - check if that character requires extra checks to be performed
if ( isset ( $decomp_map [ $utf_char ]))
{
// Decompose the char
$_pos = 0 ;
$_len = strlen ( $decomp_map [ $utf_char ]);
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
do
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$c = $decomp_map [ $utf_char ][ $_pos ];
$_utf_len =& $utf_len_mask [ $c & " \xF0 " ];
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( isset ( $_utf_len ))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$_utf_char = substr ( $decomp_map [ $utf_char ], $_pos , $_utf_len );
$_pos += $_utf_len ;
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( isset ( $utf_combining_class [ $_utf_char ]))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// The character decomposed to a non-starter, buffer it for sorting
$utf_sort [ $utf_combining_class [ $_utf_char ]][] = $_utf_char ;
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( $utf_combining_class [ $_utf_char ] < $last_cc )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// Not canonically ordered, will require sorting
$sort = $dump = 1 ;
2006-07-10 01:53:30 +00:00
}
else
{
2006-11-03 23:09:16 +00:00
$dump = 1 ;
$last_cc = $utf_combining_class [ $_utf_char ];
2006-07-10 01:53:30 +00:00
}
}
else
{
2006-11-03 23:09:16 +00:00
// This character decomposition contains a starter, dump the buffer and continue
2006-07-10 01:53:30 +00:00
if ( $dump )
{
$tmp .= substr ( $str , $tmp_pos , $starter_pos - $tmp_pos );
2006-08-22 21:26:06 +00:00
// Dump combiners
2006-07-10 01:53:30 +00:00
if ( ! empty ( $utf_sort ))
{
if ( $sort )
{
ksort ( $utf_sort );
}
2006-08-22 21:26:06 +00:00
foreach ( $utf_sort as $utf_chars )
2006-07-10 01:53:30 +00:00
{
$tmp .= implode ( '' , $utf_chars );
}
}
2006-11-03 23:09:16 +00:00
$tmp .= $_utf_char ;
2006-07-10 01:53:30 +00:00
$dump = $sort = 0 ;
}
else
{
2006-11-03 23:09:16 +00:00
$tmp .= substr ( $str , $tmp_pos , $starter_pos - $tmp_pos ) . $_utf_char ;
2006-07-10 01:53:30 +00:00
}
$tmp_pos = $starter_pos = $pos ;
$utf_sort = array ();
$last_cc = 0 ;
}
}
else
{
2006-11-03 23:09:16 +00:00
// This character decomposition contains an ASCII char, which is a starter. Dump the buffer and continue
++ $_pos ;
2006-08-22 21:26:06 +00:00
2006-11-03 23:09:16 +00:00
if ( $dump )
2006-07-10 01:53:30 +00:00
{
$tmp .= substr ( $str , $tmp_pos , $starter_pos - $tmp_pos );
2006-11-03 23:09:16 +00:00
// Dump combiners
2006-07-10 01:53:30 +00:00
if ( ! empty ( $utf_sort ))
{
2006-11-03 23:09:16 +00:00
if ( $sort )
{
ksort ( $utf_sort );
}
2006-07-10 01:53:30 +00:00
2006-08-22 21:26:06 +00:00
foreach ( $utf_sort as $utf_chars )
2006-07-10 01:53:30 +00:00
{
$tmp .= implode ( '' , $utf_chars );
}
}
2006-11-03 23:09:16 +00:00
$tmp .= $c ;
2006-08-22 21:26:06 +00:00
$dump = $sort = 0 ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
else
{
$tmp .= substr ( $str , $tmp_pos , $pos - $utf_len - $tmp_pos ) . $c ;
}
$tmp_pos = $starter_pos = $pos ;
$utf_sort = array ();
$last_cc = 0 ;
}
}
while ( $_pos < $_len );
}
else if ( isset ( $utf_combining_class [ $utf_char ]))
{
// Combining character
if ( $utf_combining_class [ $utf_char ] < $last_cc )
{
// Not in canonical order
$sort = $dump = 1 ;
}
else
{
$last_cc = $utf_combining_class [ $utf_char ];
}
$utf_sort [ $utf_combining_class [ $utf_char ]][] = $utf_char ;
}
else
{
// Non-decomposable starter, check out if it's a Hangul syllable
if ( $utf_char < UTF8_HANGUL_FIRST || $utf_char > UTF8_HANGUL_LAST )
{
// Nope, regular UTF char, check that we have the correct number of trailing bytes
if (( $utf_char & $utf_validation_mask [ $utf_len ]) != $utf_validation_check [ $utf_len ])
{
// Current char isn't well-formed or legal: either one or several trailing bytes are missing, or the Unicode char
// has been encoded in a five- or six- byte sequence.
// Move the cursor back to its original position then advance it to the position it should really be at
$pos -= $utf_len ;
$tmp .= substr ( $str , $tmp_pos , $starter_pos - $tmp_pos );
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( ! empty ( $utf_sort ))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
ksort ( $utf_sort );
foreach ( $utf_sort as $utf_chars )
2006-08-22 21:26:06 +00:00
{
2006-11-03 23:09:16 +00:00
$tmp .= implode ( '' , $utf_chars );
}
$utf_sort = array ();
}
// Add a replacement char then another replacement char for every trailing byte.
//
// @todo I'm not entirely sure that's how we're supposed to mark invalidated byte sequences, check this
$spn = strspn ( $str , UTF8_TRAILING_BYTES , ++ $pos );
$tmp .= str_repeat ( UTF8_REPLACEMENT , $spn + 1 );
$dump = $sort = 0 ;
$pos += $spn ;
$tmp_pos = $pos ;
continue ;
}
if ( isset ( $extra_check [ $utf_char [ 0 ]]))
{
switch ( $utf_char [ 0 ])
{
// Note: 0xED is quite common in Korean
case " \xED " :
if ( $utf_char >= " \xED \xA0 \x80 " )
{
// Surrogates (U+D800..U+DFFF) are not allowed in UTF-8 (UTF sequence 0xEDA080..0xEDBFBF)
$tmp .= substr ( $str , $tmp_pos , $starter_pos - $tmp_pos );
if ( ! empty ( $utf_sort ))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
ksort ( $utf_sort );
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
foreach ( $utf_sort as $utf_chars )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$tmp .= implode ( '' , $utf_chars );
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
$utf_sort = array ();
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
$tmp .= UTF8_REPLACEMENT ;
$dump = $sort = 0 ;
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
$tmp_pos = $starter_pos = $pos ;
continue 2 ;
}
break ;
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
// Note: 0xEF is quite common in Japanese
case " \xEF " :
if ( $utf_char == " \xEF \xBF \xBE " || $utf_char == " \xEF \xBF \xBF " )
{
// U+FFFE and U+FFFF are explicitly disallowed (UTF sequence 0xEFBFBE..0xEFBFBF)
$tmp .= substr ( $str , $tmp_pos , $starter_pos - $tmp_pos );
if ( ! empty ( $utf_sort ))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
ksort ( $utf_sort );
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
foreach ( $utf_sort as $utf_chars )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$tmp .= implode ( '' , $utf_chars );
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
$utf_sort = array ();
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
$tmp .= UTF8_REPLACEMENT ;
$dump = $sort = 0 ;
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
$tmp_pos = $starter_pos = $pos ;
continue 2 ;
}
break ;
case " \xC0 " :
case " \xC1 " :
if ( $utf_char <= " \xC1 \xBF " )
{
// Overlong sequence: Unicode char U+0000..U+007F encoded as a double-byte UTF char
$tmp .= substr ( $str , $tmp_pos , $starter_pos - $tmp_pos );
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( ! empty ( $utf_sort ))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
ksort ( $utf_sort );
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
foreach ( $utf_sort as $utf_chars )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$tmp .= implode ( '' , $utf_chars );
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
$utf_sort = array ();
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
$tmp .= UTF8_REPLACEMENT ;
$dump = $sort = 0 ;
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
$tmp_pos = $starter_pos = $pos ;
continue 2 ;
}
break ;
case " \xE0 " :
if ( $utf_char <= " \xE0 \x9F \xBF " )
{
// Unicode char U+0000..U+07FF encoded in 3 bytes
$tmp .= substr ( $str , $tmp_pos , $starter_pos - $tmp_pos );
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( ! empty ( $utf_sort ))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
ksort ( $utf_sort );
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
foreach ( $utf_sort as $utf_chars )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$tmp .= implode ( '' , $utf_chars );
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
$utf_sort = array ();
}
$tmp .= UTF8_REPLACEMENT ;
$dump = $sort = 0 ;
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
$tmp_pos = $starter_pos = $pos ;
continue 2 ;
}
break ;
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
case " \xF0 " :
if ( $utf_char <= " \xF0 \x8F \xBF \xBF " )
{
// Unicode char U+0000..U+FFFF encoded in 4 bytes
$tmp .= substr ( $str , $tmp_pos , $starter_pos - $tmp_pos );
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
if ( ! empty ( $utf_sort ))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
ksort ( $utf_sort );
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
foreach ( $utf_sort as $utf_chars )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$tmp .= implode ( '' , $utf_chars );
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
$utf_sort = array ();
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
$tmp .= UTF8_REPLACEMENT ;
$dump = $sort = 0 ;
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
$tmp_pos = $starter_pos = $pos ;
continue 2 ;
}
break ;
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
default :
if ( $utf_char > UTF8_MAX )
{
// Out of the Unicode range
$tmp .= substr ( $str , $tmp_pos , $starter_pos - $tmp_pos );
if ( ! empty ( $utf_sort ))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
ksort ( $utf_sort );
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
foreach ( $utf_sort as $utf_chars )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$tmp .= implode ( '' , $utf_chars );
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
$utf_sort = array ();
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
$tmp .= UTF8_REPLACEMENT ;
$dump = $sort = 0 ;
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
$tmp_pos = $starter_pos = $pos ;
continue 2 ;
}
break ;
2006-07-10 01:53:30 +00:00
}
}
2006-11-03 23:09:16 +00:00
}
else
{
// Hangul syllable
$idx = ((( ord ( $utf_char [ 0 ]) & 0x0F ) << 12 ) | (( ord ( $utf_char [ 1 ]) & 0x3F ) << 6 ) | ( ord ( $utf_char [ 2 ]) & 0x3F )) - UNICODE_HANGUL_SBASE ;
2006-08-22 21:26:06 +00:00
2006-11-03 23:09:16 +00:00
// LIndex can only range from 0 to 18, therefore it cannot influence the first two bytes of the L Jamo, which allows us to hardcode them (based on LBase).
//
// The same goes for VIndex, but for TIndex there's a catch: the value of the third byte could exceed 0xBF and we would have to increment the second byte
2006-11-15 15:35:50 +00:00
if ( $t_index = $idx % UNICODE_HANGUL_TCOUNT )
2006-11-03 23:09:16 +00:00
{
2006-11-15 15:35:50 +00:00
if ( $t_index < 25 )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$utf_char = " \xE1 \x84 \x00 \xE1 \x85 \x00 \xE1 \x86 \x00 " ;
2006-11-15 15:35:50 +00:00
$utf_char [ 8 ] = chr ( 0xA7 + $t_index );
2006-07-10 01:53:30 +00:00
}
else
{
2006-11-03 23:09:16 +00:00
$utf_char = " \xE1 \x84 \x00 \xE1 \x85 \x00 \xE1 \x87 \x00 " ;
2006-11-15 15:35:50 +00:00
$utf_char [ 8 ] = chr ( 0x67 + $t_index );
2006-07-10 01:53:30 +00:00
}
}
2006-11-03 23:09:16 +00:00
else
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$utf_char = " \xE1 \x84 \x00 \xE1 \x85 \x00 " ;
2006-08-22 21:26:06 +00:00
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
$utf_char [ 2 ] = chr ( 0x80 + ( int ) ( $idx / UNICODE_HANGUL_NCOUNT ));
$utf_char [ 5 ] = chr ( 0xA1 + ( int ) (( $idx % UNICODE_HANGUL_NCOUNT ) / UNICODE_HANGUL_TCOUNT ));
// Just like other decompositions, the resulting Jamos must be dumped to the tmp string
$dump = 1 ;
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
// Do we need to dump stuff to the tmp string?
2006-07-10 01:53:30 +00:00
if ( $dump )
{
$tmp .= substr ( $str , $tmp_pos , $starter_pos - $tmp_pos );
2006-08-22 21:26:06 +00:00
// Dump combiners
2006-07-10 01:53:30 +00:00
if ( ! empty ( $utf_sort ))
{
if ( $sort )
{
ksort ( $utf_sort );
}
2006-08-22 21:26:06 +00:00
foreach ( $utf_sort as $utf_chars )
2006-07-10 01:53:30 +00:00
{
$tmp .= implode ( '' , $utf_chars );
}
}
2006-11-03 23:09:16 +00:00
$tmp .= $utf_char ;
2006-07-10 01:53:30 +00:00
$dump = $sort = 0 ;
2006-11-03 23:09:16 +00:00
$tmp_pos = $pos ;
2006-07-10 01:53:30 +00:00
}
$last_cc = 0 ;
$utf_sort = array ();
$starter_pos = $pos ;
}
}
2006-11-03 23:09:16 +00:00
else
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
// ASCII char, which happens to be a starter (as any other ASCII char)
if ( $dump )
2006-08-22 21:26:06 +00:00
{
2006-11-03 23:09:16 +00:00
$tmp .= substr ( $str , $tmp_pos , $starter_pos - $tmp_pos );
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
// Dump combiners
if ( ! empty ( $utf_sort ))
2006-08-22 21:26:06 +00:00
{
2006-11-03 23:09:16 +00:00
if ( $sort )
{
ksort ( $utf_sort );
}
foreach ( $utf_sort as $utf_chars )
{
$tmp .= implode ( '' , $utf_chars );
}
2006-08-22 21:26:06 +00:00
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
$tmp .= $str [ $pos ];
$dump = $sort = 0 ;
$tmp_pos = ++ $pos ;
$pos += strspn ( $str , UTF8_ASCII_RANGE , $pos );
}
else
{
$pos += strspn ( $str , UTF8_ASCII_RANGE , ++ $pos );
}
2006-07-10 01:53:30 +00:00
2006-11-03 23:09:16 +00:00
$last_cc = 0 ;
$utf_sort = array ();
$starter_pos = $pos ;
2006-08-22 21:26:06 +00:00
}
2006-11-03 23:09:16 +00:00
}
while ( $pos < $len );
// Now is time to return the string
if ( $dump )
{
$tmp .= substr ( $str , $tmp_pos , $starter_pos - $tmp_pos );
// Dump combiners
if ( ! empty ( $utf_sort ))
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
if ( $sort )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
ksort ( $utf_sort );
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
foreach ( $utf_sort as $utf_chars )
2006-07-10 01:53:30 +00:00
{
2006-11-03 23:09:16 +00:00
$tmp .= implode ( '' , $utf_chars );
2006-07-10 01:53:30 +00:00
}
}
2006-11-03 23:09:16 +00:00
return $tmp ;
}
else if ( $tmp_pos )
{
// If the $tmp_pos cursor was moved then at least one character was not in normal form. Replace $str with the fixed version
if ( $tmp_pos == $len )
{
// The $tmp_pos cursor is at the end of $str, therefore $tmp holds the whole $str
return $tmp ;
}
else
{
// The rightmost chunk of $str has not been appended to $tmp yet
return $tmp . substr ( $str , $tmp_pos );
}
2006-07-10 01:53:30 +00:00
}
2006-11-03 23:09:16 +00:00
// The string was already in normal form
return $str ;
2006-07-10 01:53:30 +00:00
}
}
2006-08-22 21:26:06 +00:00
?>