2005-10-17 15:48:46 +00:00
< ? php
/***************************************************************
2011-10-22 21:07:24 +02:00
* Copyright notice
*
* ( c ) 2003 - 2011 Kasper Skårhøj ( kasperYYYY @ typo3 . com )
* All rights reserved
*
* This script is part of the Typo3 project . The Typo3 project is
* free software ; you can redistribute it and / or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation ; either version 2 of the License , or
* ( at your option ) any later version .
*
* The GNU General Public License can be found at
* http :// www . gnu . org / copyleft / gpl . html .
*
* This script is distributed in the hope that it will be useful ,
* but WITHOUT ANY WARRANTY ; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE . See the
* GNU General Public License for more details .
*
* This copyright notice MUST APPEAR in all copies of the script !
***************************************************************/
2005-10-17 15:48:46 +00:00
/**
* Class for conversion between charsets .
*
2011-10-22 21:07:24 +02:00
* @ author Kasper Skårhøj < kasperYYYY @ typo3 . com >
2005-10-17 15:48:46 +00:00
* @ author Martin Kutschker < martin . t . kutschker @ blackbox . net >
*/
/**
* Notes on UTF - 8
*
* Functions working on UTF - 8 strings :
*
* - strchr / strstr
* - strrchr
* - substr_count
* - implode / explode / join
*
* Functions nearly working on UTF - 8 strings :
*
2005-12-13 17:21:53 +00:00
* - strlen : returns the length in BYTES , if you need the length in CHARACTERS use utf8_strlen
* - trim / ltrim / rtrim : the second parameter 'charlist' won ' t work for characters not contained in 7 - bit ASCII
2005-10-17 15:48:46 +00:00
* - strpos / strrpos : they return the BYTE position , if you need the CHARACTER position use utf8_strpos / utf8_strrpos
* - htmlentities : charset support for UTF - 8 only since PHP 4.3 . 0
2009-11-18 21:48:00 +00:00
* - preg_*: UTF-8 support is compiled into PHP by default nowadays, but could be unavailable; patterns need the "u" modifier
2005-10-17 15:48:46 +00:00
*
* Functions NOT working on UTF - 8 strings :
*
* - str * cmp
* - stristr
* - stripos
* - substr
* - strrev
* - split / spliti
* - ...
*
*/
/**
* Class for conversion between charsets
*
2011-10-22 21:07:24 +02:00
* @ author Kasper Skårhøj < kasperYYYY @ typo3 . com >
2005-10-17 15:48:46 +00:00
* @ author Martin Kutschker < martin . t . kutschker @ blackbox . net >
* @ package TYPO3
* @ subpackage t3lib
*/
class t3lib_cs {
2012-04-21 21:31:03 +02:00
/**
 * Locales service instance; assigned in the constructor.
 *
 * @var t3lib_l10n_Locales
 */
protected $locales;

/**
 * Byte value written for characters that have no equivalent in the target
 * charset (63 is the ASCII code of "?").
 *
 * @var integer
 */
public $noCharByteVal = 63;

/**
 * Cache for parsed conversion tables.
 *
 * @var array
 */
public $parsedCharsets = array();

/**
 * Cache for case folding data.
 *
 * @var array
 */
public $caseFolding = array();

/**
 * Cache for charset-to-ASCII mappings.
 *
 * @var array
 */
public $toASCII = array();

/**
 * Charsets that use two bytes per character (charset name => 1).
 *
 * @var array
 */
public $twoByteSets = array(
	'ucs-2' => 1, // 2-byte Unicode
);

/**
 * Charsets that use four bytes per character (charset name => 1).
 *
 * @var array
 */
public $fourByteSets = array(
	'ucs-4' => 1, // 4-byte Unicode
	'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
);

/**
 * Charsets that use a scheme like the Extended Unix Code (charset name => 1).
 *
 * @var array
 */
public $eucBasedSets = array(
	'gb2312' => 1, // Chinese, simplified.
	'big5' => 1, // Chinese, traditional.
	'euc-kr' => 1, // Korean
	'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
);
/**
 * Charset aliases (lowercase alias => canonical charset name), consulted by
 * parse_charset().
 *
 * @see http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
 * @see http://czyborra.com/charsets/iso8859.html
 * @var array
 */
public $synonyms = array(
	// ASCII
	'us' => 'ascii',
	'us-ascii' => 'ascii',
	// ISO-8859 family
	'cp819' => 'iso-8859-1',
	'ibm819' => 'iso-8859-1',
	'iso-ir-100' => 'iso-8859-1',
	'iso-ir-101' => 'iso-8859-2',
	'iso-ir-109' => 'iso-8859-3',
	'iso-ir-110' => 'iso-8859-4',
	'iso-ir-144' => 'iso-8859-5',
	'iso-ir-127' => 'iso-8859-6',
	'iso-ir-126' => 'iso-8859-7',
	'iso-ir-138' => 'iso-8859-8',
	'iso-ir-148' => 'iso-8859-9',
	'iso-ir-157' => 'iso-8859-10',
	'iso-ir-179' => 'iso-8859-13',
	'iso-ir-199' => 'iso-8859-14',
	'iso-ir-203' => 'iso-8859-15',
	'csisolatin1' => 'iso-8859-1',
	'csisolatin2' => 'iso-8859-2',
	'csisolatin3' => 'iso-8859-3',
	'csisolatin5' => 'iso-8859-9',
	'csisolatin8' => 'iso-8859-14',
	'csisolatin9' => 'iso-8859-15',
	'csisolatingreek' => 'iso-8859-7',
	'iso-celtic' => 'iso-8859-14',
	'latin1' => 'iso-8859-1',
	'latin2' => 'iso-8859-2',
	'latin3' => 'iso-8859-3',
	'latin5' => 'iso-8859-9',
	'latin6' => 'iso-8859-10',
	'latin8' => 'iso-8859-14',
	'latin9' => 'iso-8859-15',
	'l1' => 'iso-8859-1',
	'l2' => 'iso-8859-2',
	'l3' => 'iso-8859-3',
	'l5' => 'iso-8859-9',
	'l6' => 'iso-8859-10',
	'l8' => 'iso-8859-14',
	'l9' => 'iso-8859-15',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'tis-620' => 'iso-8859-11',
	// Windows code pages
	'win874' => 'windows-874',
	'win1250' => 'windows-1250',
	'win1251' => 'windows-1251',
	'win1252' => 'windows-1252',
	'win1253' => 'windows-1253',
	'win1254' => 'windows-1254',
	'win1255' => 'windows-1255',
	'win1256' => 'windows-1256',
	'win1257' => 'windows-1257',
	'win1258' => 'windows-1258',
	'cp1250' => 'windows-1250',
	'cp1251' => 'windows-1251',
	'cp1252' => 'windows-1252',
	'ms-ee' => 'windows-1250',
	'ms-ansi' => 'windows-1252',
	'ms-greek' => 'windows-1253',
	'ms-turk' => 'windows-1254',
	'winbaltrim' => 'windows-1257',
	// KOI8 / Mac
	'koi-8ru' => 'koi-8r',
	'koi8r' => 'koi-8r',
	'cp878' => 'koi-8r',
	'mac' => 'macroman',
	'macintosh' => 'macroman',
	// East Asian charsets
	'euc-cn' => 'gb2312',
	'x-euc-cn' => 'gb2312',
	'euccn' => 'gb2312',
	'cp936' => 'gb2312',
	'big-5' => 'big5',
	'cp950' => 'big5',
	'eucjp' => 'euc-jp',
	'sjis' => 'shift_jis',
	'shift-jis' => 'shift_jis',
	'cp932' => 'shift_jis',
	'cp949' => 'euc-kr',
	// Unicode encodings
	'utf7' => 'utf-7',
	'utf8' => 'utf-8',
	'utf16' => 'utf-16',
	'utf32' => 'utf-32',
	'ucs2' => 'ucs-2',
	'ucs4' => 'ucs-4',
);
2009-11-18 21:48:00 +00:00
/**
 * Mapping of language identifiers to script (language family) names.
 *
 * get_locale_charset() looks up the lowercased language part of a locale here
 * and then resolves the script name through $script_to_charset_unix or
 * $script_to_charset_windows, so every value should be a key of those tables.
 *
 * @var array
 */
public $lang_to_script = array(
	// iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
	'af' => 'west_european', // Afrikaans
	'ar' => 'arabic',
	'bg' => 'cyrillic', // Bulgarian
	'bs' => 'east_european', // Bosnian
	'cs' => 'east_european', // Czech
	'da' => 'west_european', // Danish
	'de' => 'west_european', // German
	'es' => 'west_european', // Spanish
	'et' => 'estonian',
	'eo' => 'unicode', // Esperanto
	'eu' => 'west_european', // Basque
	'fa' => 'arabic', // Persian
	'fi' => 'west_european', // Finnish
	'fo' => 'west_european', // Faroese
	'fr' => 'west_european', // French
	'ga' => 'west_european', // Irish
	'gl' => 'west_european', // Galician
	'gr' => 'greek',
	'he' => 'hebrew', // Hebrew (since 1998)
	'hi' => 'unicode', // Hindi
	'hr' => 'east_european', // Croatian
	'hu' => 'east_european', // Hungarian
	'iw' => 'hebrew', // Hebrew (til 1998)
	'is' => 'west_european', // Icelandic
	'it' => 'west_european', // Italian
	'ja' => 'japanese',
	'ka' => 'unicode', // Georgian
	'kl' => 'west_european', // Greenlandic
	'km' => 'unicode', // Khmer
	'ko' => 'korean',
	'lt' => 'lithuanian',
	'lv' => 'west_european', // Latvian/Lettish
	'nl' => 'west_european', // Dutch
	'no' => 'west_european', // Norwegian
	'nb' => 'west_european', // Norwegian Bokmal
	'nn' => 'west_european', // Norwegian Nynorsk
	'pl' => 'east_european', // Polish
	'pt' => 'west_european', // Portuguese
	'ro' => 'east_european', // Romanian
	'ru' => 'cyrillic', // Russian
	'sk' => 'east_european', // Slovak
	'sl' => 'east_european', // Slovenian
	'sr' => 'cyrillic', // Serbian
	'sv' => 'west_european', // Swedish
	'sq' => 'albanian', // Albanian
	'th' => 'thai',
	'uk' => 'cyrillic', // Ukrainian
	'vi' => 'vietnamese',
	'zh' => 'chinese',
	// MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
	// http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
	'afk' => 'west_european', // Afrikaans
	'ara' => 'arabic',
	'bgr' => 'cyrillic', // Bulgarian
	'cat' => 'west_european', // Catalan
	'chs' => 'simpl_chinese',
	'cht' => 'trad_chinese',
	'csy' => 'east_european', // Czech
	'dan' => 'west_european', // Danish
	'deu' => 'west_european', // German
	'dea' => 'west_european', // German (Austrian)
	'des' => 'west_european', // German (Swiss)
	'ena' => 'west_european', // English (Australian)
	'enc' => 'west_european', // English (Canadian)
	'eng' => 'west_european', // English
	'enz' => 'west_european', // English (New Zealand)
	'enu' => 'west_european', // English (United States)
	'euq' => 'west_european', // Basque
	'fos' => 'west_european', // Faroese
	'far' => 'arabic', // Persian
	'fin' => 'west_european', // Finnish
	'fra' => 'west_european', // French
	'frb' => 'west_european', // French (Belgian)
	'frc' => 'west_european', // French (Canadian)
	'frs' => 'west_european', // French (Swiss)
	'geo' => 'unicode', // Georgian
	'glg' => 'west_european', // Galician
	'ell' => 'greek',
	'heb' => 'hebrew',
	'hin' => 'unicode', // Hindi
	'hun' => 'east_european', // Hungarian
	// Was misspelled 'west_euorpean', which is not a key of the charset tables.
	'isl' => 'west_european', // Icelandic
	'ita' => 'west_european', // Italian
	'its' => 'west_european', // Italian (Swiss)
	'jpn' => 'japanese',
	'khm' => 'unicode', // Khmer
	'kor' => 'korean',
	'lth' => 'lithuanian',
	'lvi' => 'west_european', // Latvian/Lettish
	'msl' => 'west_european', // Malay
	'nlb' => 'west_european', // Dutch (Belgian)
	'nld' => 'west_european', // Dutch
	'nor' => 'west_european', // Norwegian (bokmal)
	'non' => 'west_european', // Norwegian (nynorsk)
	'plk' => 'east_european', // Polish
	'ptg' => 'west_european', // Portuguese
	'ptb' => 'west_european', // Portuguese (Brazil)
	'rom' => 'east_european', // Romanian
	'rus' => 'cyrillic', // Russian
	'slv' => 'east_european', // Slovenian
	'sky' => 'east_european', // Slovak
	'srl' => 'east_european', // Serbian (Latin)
	'srb' => 'cyrillic', // Serbian (Cyrillic)
	'esp' => 'west_european', // Spanish (trad. sort)
	'esm' => 'west_european', // Spanish (Mexican)
	'esn' => 'west_european', // Spanish (internat. sort)
	'sve' => 'west_european', // Swedish
	'sqi' => 'albanian', // Albanian
	'tha' => 'thai',
	'trk' => 'turkish',
	'ukr' => 'cyrillic', // Ukrainian
	// English language names
	'afrikaans' => 'west_european',
	'albanian' => 'albanian',
	'arabic' => 'arabic',
	'basque' => 'west_european',
	'bosnian' => 'east_european',
	// Bulgarian is written in Cyrillic; now consistent with 'bg' and 'bgr' above.
	'bulgarian' => 'cyrillic',
	'catalan' => 'west_european',
	'croatian' => 'east_european',
	'czech' => 'east_european',
	'danish' => 'west_european',
	'dutch' => 'west_european',
	'english' => 'west_european',
	'esperanto' => 'unicode',
	'estonian' => 'estonian',
	'faroese' => 'west_european',
	'farsi' => 'arabic',
	'finnish' => 'west_european',
	'french' => 'west_european',
	'galician' => 'west_european',
	'georgian' => 'unicode',
	'german' => 'west_european',
	'greek' => 'greek',
	'greenlandic' => 'west_european',
	'hebrew' => 'hebrew',
	'hindi' => 'unicode',
	'hungarian' => 'east_european',
	'icelandic' => 'west_european',
	'italian' => 'west_european',
	'khmer' => 'unicode',
	'latvian' => 'west_european',
	'lettish' => 'west_european',
	'lithuanian' => 'lithuanian',
	'malay' => 'west_european',
	'norwegian' => 'west_european',
	'persian' => 'arabic',
	'polish' => 'east_european',
	'portuguese' => 'west_european',
	'russian' => 'cyrillic',
	'romanian' => 'east_european',
	'serbian' => 'cyrillic',
	'slovak' => 'east_european',
	'slovenian' => 'east_european',
	'spanish' => 'west_european',
	'swedish' => 'west_european',
	'svedish' => 'west_european', // misspelled legacy key, kept for backward compatibility
	'thai' => 'thai',
	'that' => 'thai', // misspelled legacy key, kept for backward compatibility
	'turkish' => 'turkish',
	'ukrainian' => 'cyrillic',
);
/**
 * Mapping of script (language family) names to charsets on Unix.
 * get_locale_charset() falls back to 'utf-8' when an entry is missing or empty.
 *
 * @var array
 */
public $script_to_charset_unix = array(
	'west_european' => 'iso-8859-1',
	'estonian' => 'iso-8859-1',
	'east_european' => 'iso-8859-2',
	'baltic' => 'iso-8859-4',
	'cyrillic' => 'iso-8859-5',
	'arabic' => 'iso-8859-6',
	'greek' => 'iso-8859-7',
	'hebrew' => 'iso-8859-8',
	'turkish' => 'iso-8859-9',
	'thai' => 'iso-8859-11', // = TIS-620
	'lithuanian' => 'iso-8859-13',
	'chinese' => 'gb2312', // = euc-cn
	'japanese' => 'euc-jp',
	'korean' => 'euc-kr',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
	'vietnamese' => '',
	'unicode' => 'utf-8',
	'albanian' => 'utf-8',
);
/**
 * Mapping of script (language family) names to charsets on Windows.
 * get_locale_charset() falls back to 'windows-1252' when an entry is missing or empty.
 *
 * @var array
 */
public $script_to_charset_windows = array(
	'east_european' => 'windows-1250',
	'cyrillic' => 'windows-1251',
	'west_european' => 'windows-1252',
	'greek' => 'windows-1253',
	'turkish' => 'windows-1254',
	'hebrew' => 'windows-1255',
	'arabic' => 'windows-1256',
	'baltic' => 'windows-1257',
	'estonian' => 'windows-1257',
	'lithuanian' => 'windows-1257',
	'vietnamese' => 'windows-1258',
	'thai' => 'cp874',
	'korean' => 'cp949',
	'chinese' => 'gb2312',
	'japanese' => 'shift_jis',
	'simpl_chinese' => 'gb2312',
	'trad_chinese' => 'big5',
	'albanian' => 'windows-1250',
	'unicode' => 'utf-8',
);
/**
 * Mapping of complete locale names to charsets.
 *
 * get_locale_charset() lowercases the locale before it looks it up here, so
 * only lowercase keys can ever match.
 *
 * @var array
 */
public $locale_to_charset = array(
	'japanese.euc' => 'euc-jp',
	'ja_jp.ujis' => 'euc-jp',
	'korean.euc' => 'euc-kr',
	'sr@latn' => 'iso-8859-2',
	'sr@Latn' => 'iso-8859-2', // mixed-case legacy key, unreachable through get_locale_charset(); kept for direct readers
	'zh_cn' => 'gb2312',
	'zh_hk' => 'big5',
	'zh_tw' => 'big5',
);
/**
 * TYPO3 specific: system charset used for each system language in TYPO3.
 * An empty value means "iso-8859-1".
 *
 * @var array
 */
public $charSetArray = array(
	'af' => '',
	'ar' => 'iso-8859-6',
	'ba' => 'iso-8859-2',
	'bg' => 'windows-1251',
	'br' => '',
	'ca' => 'iso-8859-15',
	'ch' => 'gb2312',
	'cs' => 'windows-1250',
	'cz' => 'windows-1250',
	'da' => '',
	'de' => '',
	'dk' => '',
	'el' => 'iso-8859-7',
	'eo' => 'utf-8',
	'es' => '',
	'et' => 'iso-8859-4',
	'eu' => '',
	'fa' => 'utf-8',
	'fi' => '',
	'fo' => 'utf-8',
	'fr' => '',
	'fr_CA' => '',
	'ga' => '',
	'ge' => 'utf-8',
	'gl' => '',
	'gr' => 'iso-8859-7',
	'he' => 'utf-8',
	'hi' => 'utf-8',
	'hk' => 'big5',
	'hr' => 'windows-1250',
	'hu' => 'iso-8859-2',
	'is' => 'utf-8',
	'it' => '',
	'ja' => 'shift_jis',
	'jp' => 'shift_jis',
	'ka' => 'utf-8',
	'kl' => 'utf-8',
	'km' => 'utf-8',
	'ko' => 'euc-kr',
	'kr' => 'euc-kr',
	'lt' => 'windows-1257',
	'lv' => 'utf-8',
	'ms' => '',
	'my' => '',
	'nl' => '',
	'no' => '',
	'pl' => 'iso-8859-2',
	'pt' => '',
	'pt_BR' => '',
	'qc' => '',
	'ro' => 'iso-8859-2',
	'ru' => 'windows-1251',
	'se' => '',
	'si' => 'windows-1250',
	'sk' => 'windows-1250',
	'sl' => 'windows-1250',
	'sq' => 'utf-8',
	'sr' => 'utf-8',
	'sv' => '',
	'th' => 'iso-8859-11',
	'tr' => 'iso-8859-9',
	'ua' => 'windows-1251',
	'uk' => 'windows-1251',
	'vi' => 'utf-8',
	'vn' => 'utf-8',
	'zh' => 'big5',
);
/**
 * TYPO3 specific: ISO names used for each system language in TYPO3.
 * A missing key means the ISO name is the same as the TYPO3 language key.
 *
 * @deprecated since TYPO3 4.6, will be removed in TYPO3 6.0 - use t3lib_l10n_Locales::getIsoMapping()
 * @var array
 */
public $isoArray = array(
	'ba' => 'bs',
	'br' => 'pt_BR',
	'ch' => 'zh_CN',
	'cz' => 'cs',
	'dk' => 'da',
	'si' => 'sl',
	'se' => 'sv',
	'gl' => 'kl',
	'gr' => 'el',
	'hk' => 'zh_HK',
	'kr' => 'ko',
	'ua' => 'uk',
	'jp' => 'ja',
	'qc' => 'fr_CA',
	'vn' => 'vi',
	'ge' => 'ka',
	'ga' => 'gl',
);
2012-04-21 21:31:03 +02:00
/**
 * Default constructor.
 *
 * Obtains a t3lib_l10n_Locales instance through t3lib_div::makeInstance()
 * and stores it in $this->locales.
 */
public function __construct() {
	$localesInstance = t3lib_div::makeInstance('t3lib_l10n_Locales');
	$this->locales = $localesInstance;
}
2005-10-17 15:48:46 +00:00
/**
 * Normalizes a charset name: lowercases it, trims surrounding whitespace and
 * replaces it by its canonical name if it is listed in $this->synonyms.
 *
 * @param string $charset Input charset
 * @return string Normalized charset
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function parse_charset($charset) {
	$normalized = trim(strtolower($charset));

	return isset($this->synonyms[$normalized])
		? $this->synonyms[$normalized]
		: $normalized;
}
/**
 * Get the charset of a locale.
 *
 * Accepted locale forms:
 *
 * ln            language
 * ln_CN         language / country
 * ln_CN.cs      language / country / charset
 * ln_CN.cs@mod  language / country / charset / modifier
 *
 * Resolution order: exact match in $this->locale_to_charset, explicit charset
 * part, the "euro" modifier, then the language's script looked up in the
 * OS specific script-to-charset table. Unknown languages resolve to
 * 'windows-1252' when TYPO3_OS is 'WIN' and to 'utf-8' otherwise.
 *
 * @param string $locale Locale string
 * @return string Charset resolved for locale string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function get_locale_charset($locale) {
	// The locale is compared in lowercase, so keys of $locale_to_charset must be lowercase to match.
	$locale = strtolower($locale);

	// exact locale specific charset?
	if (isset($this->locale_to_charset[$locale])) {
		return $this->locale_to_charset[$locale];
	}

	// get modifier; array_pad() guarantees both list() targets exist when there is no "@"
	list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');

	// locale contains charset: use it
	list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
	if ($charset) {
		return $this->parse_charset($charset);
	}

	// modifier is 'euro' (after charset check, because of xx.utf-8@euro)
	if ($modifier == 'euro') {
		return 'iso-8859-15';
	}

	// get language (the country part is not needed); explode() always yields at least one element
	list($language) = explode('_', $locale);
	$script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';

	// !empty() keeps the original "missing or empty entry => default" semantics without raising notices
	if (TYPO3_OS == 'WIN') {
		$cs = !empty($this->script_to_charset_windows[$script])
			? $this->script_to_charset_windows[$script]
			: 'windows-1252';
	} else {
		$cs = !empty($this->script_to_charset_unix[$script])
			? $this->script_to_charset_unix[$script]
			: 'utf-8';
	}

	return $cs;
}
/********************************************
*
* Charset Conversion functions
*
********************************************/
/**
 * Convert from one charset to another charset.
 *
 * When the target is UTF-8 or no entity fallback is requested, the conversion
 * is first attempted with the PHP extension selected in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] ('mbstring',
 * 'iconv' or 'recode'). If that is not configured or yields FALSE, the
 * conversion is done by this class, going through UTF-8.
 *
 * @param string $str Input string
 * @param string $fromCS From charset (the current charset of the string)
 * @param string $toCS To charset (the output charset wanted)
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Converted string
 * @see convArray()
 */
function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
	// Nothing to do for identical charsets
	if ($fromCS == $toCS) {
		return $str;
	}

	// PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
	$nativeConversionAllowed = ($toCS == 'utf-8' || !$useEntityForNoChar);
	if ($nativeConversionAllowed) {
		$method = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'];
		$converted = FALSE;

		if ($method == 'mbstring') {
			// returns FALSE for unsupported charsets
			$converted = mb_convert_encoding($str, $toCS, $fromCS);
		} elseif ($method == 'iconv') {
			$converted = iconv($fromCS, $toCS . '//TRANSLIT', $str);
		} elseif ($method == 'recode') {
			$converted = recode_string($fromCS . '..' . $toCS, $str);
		}

		if (FALSE !== $converted) {
			return $converted;
		}
		// otherwise: fallback to TYPO3 conversion
	}

	// TYPO3 conversion: source charset -> UTF-8 -> target charset
	if ($fromCS != 'utf-8') {
		$str = $this->utf8_encode($str, $fromCS);
	}
	if ($toCS != 'utf-8') {
		$str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
	}

	return $str;
}
/**
 * Convert all string elements of an array from one charset to another charset.
 * Nested arrays are converted recursively; values of other types are left alone.
 * NOTICE: Array is passed by reference!
 *
 * @param array $array Input array, possibly multidimensional
 * @param string $fromCS From charset (the current charset of the string)
 * @param string $toCS To charset (the output charset wanted)
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return void
 * @see conv()
 */
function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
	foreach ($array as $key => $element) {
		if (is_string($element)) {
			$array[$key] = $this->conv($element, $fromCS, $toCS, $useEntityForNoChar);
		} elseif (is_array($element)) {
			// Recurse on the original element so the changes are written back
			$this->convArray($array[$key], $fromCS, $toCS, $useEntityForNoChar);
		}
	}
}
/**
 * Converts $str from $charset to UTF-8.
 *
 * The string is walked byte by byte. Bytes 0-127 of single-byte and EUC based
 * charsets are copied unchanged; everything else is looked up in
 * $this->parsedCharsets[$charset]['local']. Characters without a table entry
 * are replaced by chr($this->noCharByteVal).
 *
 * NOTE: if initCharset() reports failure, nothing is returned (NULL).
 *
 * @param string $str String in local charset to convert to UTF-8
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @return string Output string, converted to UTF-8
 */
function utf8_encode($str, $charset) {
	if ($charset === 'utf-8') {
		return $str;
	}

	// Parse conv. table if not already...
	if ($this->initCharset($charset)) {
		$strLen = strlen($str);
		$outStr = '';

		// Traverse each char in string.
		for ($a = 0; $a < $strLen; $a++) {
			$chr = substr($str, $a, 1);
			$ord = ord($chr);

			if (isset($this->twoByteSets[$charset])) {
				// The charset has two bytes per char: combine both, assuming big endian.
				// substr() instead of the removed $str{...} offset syntax; a missing
				// second byte counts as 0.
				$ord = $ord << 8 | ord(substr($str, $a + 1, 1));
				$a++;
			} elseif ($ord > 127) {
				// EUC uses two bytes above 127; we get both, advance the pointer and make $ord a 16bit int.
				// Shift-JIS: chars between 160 (0xA0) and 223 (0xDF) are single byte.
				if (isset($this->eucBasedSets[$charset]) && ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF))) {
					$a++;
					$ord = $ord * 256 + ord(substr($str, $a, 1));
				}
			} else {
				// It's just ASCII 0-127 and one byte. Transparent
				$outStr .= $chr;
				continue;
			}

			// Use the parsed conv. table entry if there is one, otherwise the "no char" byte.
			if (isset($this->parsedCharsets[$charset]['local'][$ord])) {
				$outStr .= $this->parsedCharsets[$charset]['local'][$ord];
			} else {
				$outStr .= chr($this->noCharByteVal);
			}
		}

		return $outStr;
	}
}
/**
 * Converts $str from UTF-8 to $charset.
 *
 * Every multibyte UTF-8 sequence is looked up in
 * $this->parsedCharsets[$charset]['utf8']. Sequences without a table entry
 * become a numeric entity (if $useEntityForNoChar is set) or
 * chr($this->noCharByteVal). Nothing is returned (NULL) if initCharset()
 * reports failure.
 *
 * @param string $str String in UTF-8 to convert to local charset
 * @param string $charset Charset, lowercase. Must be found in csconvtbl/ folder.
 * @param boolean $useEntityForNoChar If set, then characters that are not available in the destination character set will be encoded as numeric entities
 * @return string Output string, converted to local charset
 */
function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
	if ($charset === 'utf-8') {
		return $str;
	}

	// Parse conv. table if not already...
	if (!$this->initCharset($charset)) {
		return NULL;
	}

	$byteCount = strlen($str);
	$result = '';
	$pos = 0;

	// Traverse each byte in the UTF-8 string.
	while ($pos < $byteCount) {
		$byte = substr($str, $pos, 1);
		$code = ord($byte);

		if ($code <= 127) {
			// Just ASCII 0-127 and one byte. Transparent
			$result .= $byte;
		} elseif (($code & 64) === 0) {
			// A first byte must have the 7th bit set; this one is in the MIDDLE of a sequence.
			$result .= chr($this->noCharByteVal);
		} else {
			// Collect the sequence: every further leading 1-bit of the first byte announces one more byte.
			$sequence = $byte;
			for ($n = 0; $n < 8; $n++) {
				$code = $code << 1;
				if (($code & 128) === 0) {
					break;
				}
				$pos++;
				$sequence .= substr($str, $pos, 1);
			}

			if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
				// The local number
				$localNumber = $this->parsedCharsets[$charset]['utf8'][$sequence];
				if ($localNumber > 255) {
					// Greater than 255: split the number (16bit word assumed) in two chars.
					$result .= chr(($localNumber >> 8) & 255) . chr($localNumber & 255);
				} else {
					$result .= chr($localNumber);
				}
			} elseif ($useEntityForNoChar) {
				// Create num entity
				$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
			} else {
				// No char exists
				$result .= chr($this->noCharByteVal);
			}
		}

		$pos++;
	}

	return $result;
}
/**
 * Converts all chars > 127 to numeric entities.
 *
 * Each multibyte UTF-8 sequence is replaced by "&#" . number . ";", where the
 * number comes from utf8CharToUnumber(). A byte > 127 that cannot start a
 * sequence is replaced by chr($this->noCharByteVal).
 *
 * @param string $str Input string
 * @return string Output string
 */
function utf8_to_entities($str) {
	$byteCount = strlen($str);
	$result = '';
	$pos = 0;

	// Traverse each byte in the UTF-8 string.
	while ($pos < $byteCount) {
		$byte = substr($str, $pos, 1);
		$code = ord($byte);

		if ($code <= 127) {
			// Just ASCII 0-127 and one byte. Transparent
			$result .= $byte;
		} elseif (($code & 64) === 0) {
			// A first byte must have the 7th bit set; this one is in the MIDDLE of a sequence.
			$result .= chr($this->noCharByteVal);
		} else {
			// Collect the sequence: every further leading 1-bit of the first byte announces one more byte.
			$sequence = $byte;
			for ($n = 0; $n < 8; $n++) {
				$code = $code << 1;
				if (($code & 128) === 0) {
					break;
				}
				$pos++;
				$sequence .= substr($str, $pos, 1);
			}
			$result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
		}

		$pos++;
	}

	return $result;
}
/**
 * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars.
 *
 * Well-formed decimal and hexadecimal entities ("x" or "X") are converted
 * through UnumberToChar(). Malformed numeric entities (eg. "&#;" or "&#abc;")
 * and unknown named entities are left untouched.
 *
 * @param string $str Input string, UTF-8
 * @param boolean $alsoStdHtmlEnt If set, then all string-HTML entities (like &amp; or &pound;) will be converted as well
 * @return string Output string
 */
function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) {
	// Workaround for #39287: 3rd parameter for get_html_translation_table() was only added in PHP 5.3.4 and later
	// see http://php.net/manual/en/function.get-html-translation-table.php
	$applyPhpCompatibilityFix = version_compare(phpversion(), '5.3.4', '<');

	$trans_tbl = array();
	if ($alsoStdHtmlEnt) {
		if ($applyPhpCompatibilityFix === TRUE) {
			$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT));
		} else {
			$trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
		}
	}

	// Split on entities and keep their names: even indexes hold the plain text,
	// odd indexes the entity name without "&" and ";". No marker token has to be
	// injected into the string for this.
	$parts = preg_split('/&([#[:alnum:]]*);/', $str, -1, PREG_SPLIT_DELIM_CAPTURE);
	if ($parts === FALSE) {
		// PCRE failure: hand the input back unchanged
		return $str;
	}

	foreach ($parts as $k => $v) {
		// only take every second element
		if ($k % 2 === 0) {
			continue;
		}

		$matches = array();
		if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $v, $matches)) {
			// Hexadecimal entity
			$parts[$k] = $this->UnumberToChar(hexdec($matches[1]));
		} elseif (preg_match('/^#([0-9]+)$/', $v, $matches)) {
			// Decimal entity
			$parts[$k] = $this->UnumberToChar((int) $matches[1]);
		} elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) {
			// Other entities
			$v = $trans_tbl['&' . $v . ';'];
			if ($applyPhpCompatibilityFix === TRUE) {
				// Without the charset parameter the table is not UTF-8, so convert the character
				$v = $this->utf8_encode($v, 'iso-8859-1');
			}
			$parts[$k] = $v;
		} else {
			// No conversion: restore the entity as it was
			$parts[$k] = '&' . $v . ';';
		}
	}

	return implode('', $parts);
}
/**
 * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 *
 * The string is walked byte by byte. A byte below 128 is one character; a lead byte
 * (bit 7 and bit 6 set) pulls in as many following bytes as its leading 1-bits announce;
 * a stray continuation byte is reported as the "no char" value.
 *
 * @param string $str Input string, UTF-8
 * @param boolean $convEntities If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 * @param boolean $retChar If set, then instead of integer numbers the real UTF-8 char is returned.
 * @return array Output array with the char numbers
 */
function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
	// If entities must be registered as well, resolve them to UTF-8 first
	if ($convEntities) {
		$str = $this->entities_to_utf8($str, 1);
	}

	$result = array();
	$byteCount = strlen($str);
	for ($pos = 0; $pos < $byteCount; $pos++) {
		$byte = substr($str, $pos, 1);
		$code = ord($byte);

		// ASCII 0-127: one byte, passed through transparently
		if ($code <= 127) {
			$result[] = $retChar ? chr($code) : $code;
			continue;
		}

		// A lead byte must have bit 6 set; otherwise we are in the MIDDLE of a sequence
		if (!($code & 64)) {
			$result[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
			continue;
		}

		// Collect the lead byte plus one following byte per additional leading 1-bit
		$sequence = $byte;
		$shifted = $code;
		for ($n = 0; $n < 8; $n++) {
			$shifted = $shifted << 1;
			if (!($shifted & 128)) {
				break;
			}
			$pos++;
			$sequence .= substr($str, $pos, 1);
		}
		$result[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
	}

	return $result;
}
/**
 * Converts a UNICODE number to a UTF-8 multibyte character
 * Algorithm based on script found at From: http://czyborra.com/utf/
 * Unit-tested by Kasper
 *
 * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
 *
 *  bytes | bits | representation
 *      1 |    7 | 0vvvvvvv
 *      2 |   11 | 110vvvvv 10vvvvvv
 *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 *
 * @param integer $cbyte UNICODE integer
 * @return string UTF-8 multibyte character string; the single "no char" byte if the number does not fit into 31 bits
 * @see utf8CharToUnumber()
 */
function UnumberToChar($cbyte) {
	// 7-bit values are encoded as themselves
	if ($cbyte < 0x80) {
		return chr($cbyte);
	}

	// One row per multibyte length: exclusive upper limit, lead byte marker, number of continuation bytes
	$layouts = array(
		array(0x800, 0xC0, 1),
		array(0x10000, 0xE0, 2),
		array(0x200000, 0xF0, 3),
		array(0x4000000, 0xF8, 4),
		array(0x80000000, 0xFC, 5),
	);

	foreach ($layouts as $layout) {
		list($limit, $leadMarker, $trailCount) = $layout;
		if ($cbyte < $limit) {
			// Lead byte carries the topmost payload bits ...
			$encoded = chr($leadMarker | ($cbyte >> (6 * $trailCount)));
			// ... and each continuation byte carries the next six, most significant first
			for ($shift = 6 * ($trailCount - 1); $shift >= 0; $shift -= 6) {
				$encoded .= chr(0x80 | (($cbyte >> $shift) & 0x3F));
			}
			return $encoded;
		}
	}

	// Cannot express a 32-bit character in UTF-8
	return chr($this->noCharByteVal);
}
/**
 * Converts a UTF-8 Multibyte character to a UNICODE number
 * Unit-tested by Kasper
 *
 * Only the low six bits of each continuation byte are used and missing continuation
 * bytes count as zero. Lead bytes 0xFE/0xFF are not rejected.
 *
 * @param string $str UTF-8 multibyte character string
 * @param boolean $hex If set, then a hex. number is returned (as string, prefixed with "x").
 * @return integer UNICODE integer
 * @see UnumberToChar()
 */
function utf8CharToUnumber($str, $hex = 0) {
	$leadByte = ord(substr($str, 0, 1));

	if (($leadByte & 192) != 192) {
		// Top two bits are not both set, so this is no multibyte lead: the byte value is the result
		$codePoint = $leadByte;
	} else {
		// Every leading 1-bit after the first announces one continuation byte;
		// gather their six payload bits each as a binary string.
		$trailBits = '';
		$trailCount = 0;
		$shifted = $leadByte;
		while ($trailCount < 8) {
			$shifted = $shifted << 1;
			if (!($shifted & 128)) {
				break;
			}
			$trailBits .= sprintf('%06b', ord(substr($str, $trailCount + 1, 1)) & 0x3F);
			$trailCount++;
		}
		// The lead byte contributes its lowest (6 - number of continuation bytes) bits
		$leadBits = substr('00000000' . decbin($leadByte), -(6 - $trailCount));
		$codePoint = bindec($leadBits . $trailBits);
	}

	return $hex ? 'x' . dechex($codePoint) : $codePoint;
}
/********************************************
*
* Init functions
*
********************************************/
/**
 * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
 * This function is automatically called by the conversion functions
 *
 * The parsed table is kept in $this->parsedCharsets[$charset] as two maps:
 * 'local' (byte value above 127 => UTF-8 character) and 'utf8' (the reverse).
 * It is cached as a serialized file below typo3temp/cs/; a cache file that does not
 * unserialize to such a table is ignored and rebuilt from the conversion table.
 *
 * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
 *
 * @param string $charset The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
 * @return integer Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and loaded (from cache or by parsing).
 * @access private
 */
function initCharset($charset) {
	// Only process if the charset is not yet loaded:
	if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
		return 1;
	}

	// Conversion table filename:
	$charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
	if (!($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile))) {
		return FALSE;
	}

	// Cache file for charsets:
	// Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
	$cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
	$table = NULL;
	if ($cacheFile && @is_file($cacheFile)) {
		// The cache is written by this method itself (see below)
		$table = unserialize(t3lib_div::getUrl($cacheFile));
	}

	if (!(is_array($table) && isset($table['local'], $table['utf8']))) {
		// No (usable) cache: parse conversion table into lines
		$lines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1);
		$table = array('local' => array(), 'utf8' => array());

		// The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
		$whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
		$detectedType = '';

		foreach ($lines as $value) {
			// Comment line or blanks are ignored.
			if (!trim($value) || substr($value, 0, 1) == '#') {
				continue;
			}

			// Detect type if not done yet: (Done on first real line)
			if (!$detectedType) {
				$detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
			}

			if ($detectedType == 'ms-token') {
				// Pad so that a line without "=" does not read an undefined offset
				list($hexbyte, $utf8) = array_pad(preg_split('/[=:]/', $value, 3), 2, '');
			} else {
				$regA = array();
				if (!preg_match($whitespacedPattern, $value, $regA)) {
					// Line does not follow the detected syntax; it carries no mapping
					continue;
				}
				$hexbyte = $regA[1];
				$utf8 = 'U+' . $regA[2];
			}

			// Only values above 7-bit ASCII need a mapping
			$decval = hexdec(trim($hexbyte));
			if ($decval > 127) {
				// Strip the leading "U+" before reading the code point
				$utf8decval = hexdec(substr(trim($utf8), 2));
				$utf8Char = $this->UnumberToChar($utf8decval);
				$table['local'][$decval] = $utf8Char;
				$table['utf8'][$utf8Char] = $decval;
			}
		}

		if ($cacheFile) {
			t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($table));
		}
	}

	$this->parsedCharsets[$charset] = $table;
	return 2;
}
/**
 * This function initializes all UTF-8 character data tables.
 *
 * Builds $this->caseFolding['utf-8'] (toUpper / toLower / toTitle) and
 * $this->toASCII['utf-8'] from unidata/UnicodeData.txt, refined by
 * unidata/SpecialCasing.txt and unidata/Translit.txt when those files exist.
 * Both tables are written to cache files below typo3temp/cs/. Reading of
 * UnicodeData.txt stops at the first code point above U+FFFF.
 *
 * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
 *
 * @param string $mode Mode ("case", "ascii", ...). With any other value both tables are always rebuilt.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initUnicodeData($mode = NULL) {
	// cache files
	$cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
	$cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');

	// Only process if the tables are not yet loaded
	switch ($mode) {
		case 'case':
			if (isset($this->caseFolding['utf-8']) && is_array($this->caseFolding['utf-8'])) {
				return 1;
			}
			// Use cached version if possible; a cache that does not hold a table triggers a rebuild
			if ($cacheFileCase && @is_file($cacheFileCase)) {
				$cached = unserialize(t3lib_div::getUrl($cacheFileCase));
				if (is_array($cached)) {
					$this->caseFolding['utf-8'] = $cached;
					return 2;
				}
			}
			break;

		case 'ascii':
			if (isset($this->toASCII['utf-8']) && is_array($this->toASCII['utf-8'])) {
				return 1;
			}
			// Use cached version if possible; a cache that does not hold a table triggers a rebuild
			if ($cacheFileASCII && @is_file($cacheFileASCII)) {
				$cached = unserialize(t3lib_div::getUrl($cacheFileASCII));
				if (is_array($cached)) {
					$this->toASCII['utf-8'] = $cached;
					return 2;
				}
			}
			break;
	}

	// process main Unicode data file
	$unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
	if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
		return FALSE;
	}

	$fh = fopen($unicodeDataFile, 'rb');
	if (!$fh) {
		return FALSE;
	}

	// key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
	// note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
	$this->caseFolding['utf-8'] = array();
	$utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
	$utf8CaseFolding['toUpper'] = array();
	$utf8CaseFolding['toLower'] = array();
	$utf8CaseFolding['toTitle'] = array();

	$decomposition = array(); // array of temp. decompositions
	$mark = array(); // array of chars that are marks (eg. composing accents)
	$number = array(); // array of chars that are numbers (eg. digits)
	$omit = array(); // array of chars to be omitted (eg. Russian hard sign)

	while (!feof($fh)) {
		$line = fgets($fh, 4096);
		if ($line === FALSE) {
			// Nothing more to read (fgets() may fail before feof() reports the end)
			break;
		}
		if (trim($line) === '') {
			// Blank lines carry no record
			continue;
		}

		// A record has a lot of info; pad it so that short records do not read undefined offsets.
		// Used fields: 0 code point, 1 name, 2 category, 5 decomposition, 8 numeric value, 12-14 upper/lower/title mapping
		$fields = array_pad(explode(';', rtrim($line)), 15, '');
		$char = $fields[0];
		$name = $fields[1];
		$cat = $fields[2];
		$decomp = $fields[5];
		$num = $fields[8];
		$upper = $fields[12];
		$lower = $fields[13];
		$title = $fields[14];

		$ord = hexdec($char);
		if ($ord > 0xFFFF) {
			// only process the BMP
			break;
		}

		$utf8_char = $this->UnumberToChar($ord);

		if ($upper) {
			$utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
		}
		if ($lower) {
			$utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
		}
		// store "title" only when different from "upper" (only a few)
		// (strict comparison: these are hex strings and must not be compared as numbers)
		if ($title && $title !== $upper) {
			$utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
		}

		// First letter of the general category
		switch (substr($cat, 0, 1)) {
			case 'M': // mark (accent, umlaut, ...)
				$mark["U+$char"] = 1;
				break;

			case 'N': // numeric value
				if ($ord > 0x80 && $num != '') {
					$number["U+$char"] = $num;
				}
				break;
		}

		// accented Latin letters without "official" decomposition
		$match = array();
		if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
			$c = ord($match[2]);
			if ($match[1] == 'SMALL') {
				// ASCII lower case letters are 32 above their upper case counterparts
				$c += 32;
			}
			$decomposition["U+$char"] = array(dechex($c));
			continue;
		}

		$match = array();
		if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
			switch ($match[1]) {
				case '<circle>': // add parenthesis as circle replacement, eg (1)
					$match[2] = '0028 ' . $match[2] . ' 0029';
					break;

				case '<square>': // add square brackets as square replacement, eg [1]
					$match[2] = '005B ' . $match[2] . ' 005D';
					break;

				case '<compat>': // ignore multi char decompositions that start with a space
					if (preg_match('/^0020 /', $match[2])) {
						continue 2;
					}
					break;

				// ignore Arabic and vertical layout presentation decomposition
				case '<initial>':
				case '<medial>':
				case '<final>':
				case '<isolated>':
				case '<vertical>':
					continue 2;
			}
			$decomposition["U+$char"] = explode(' ', $match[2]);
		}
	}
	fclose($fh);

	// process additional Unicode data for casing (allow folded characters to expand into a sequence)
	$specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
	if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
		$fh = fopen($specialCasingFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === FALSE) {
					break;
				}
				// Comment lines or blanks are ignored
				if (substr($line, 0, 1) === '#' || trim($line) === '') {
					continue;
				}

				list($char, $lower, $title, $upper, $cond) = array_pad(t3lib_div::trimExplode(';', $line), 5, '');
				// Only unconditional mappings are used (no condition, or the field is just a trailing comment)
				if (!($cond === '' || substr($cond, 0, 1) === '#')) {
					continue;
				}

				$utf8_char = $this->UnumberToChar(hexdec($char));

				// Collect the mappings that differ from the character itself, in the order lower, title, upper
				$targets = array();
				if ($char !== $lower) {
					$targets['toLower'] = $lower;
				}
				if ($char !== $title && $title !== $upper) {
					$targets['toTitle'] = $title;
				}
				if ($char !== $upper) {
					$targets['toUpper'] = $upper;
				}

				foreach ($targets as $type => $sequence) {
					// A mapping is a space separated list of code points
					$folded = '';
					foreach (explode(' ', $sequence) as $codePoint) {
						$folded .= $this->UnumberToChar(hexdec($codePoint));
					}
					$utf8CaseFolding[$type][$utf8_char] = $folded;
				}
			}
			fclose($fh);
		}
	}

	// process custom decompositions
	$customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
	if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
		$fh = fopen($customTranslitFile, 'rb');
		if ($fh) {
			while (!feof($fh)) {
				$line = fgets($fh, 4096);
				if ($line === FALSE) {
					break;
				}
				// Comment lines or blanks are ignored
				if (substr($line, 0, 1) === '#' || trim($line) === '') {
					continue;
				}

				list($char, $translit) = array_pad(t3lib_div::trimExplode(';', $line), 2, '');
				if (!$translit) {
					// An empty transliteration means: drop this character
					$omit["U+$char"] = 1;
				}
				$decomposition["U+$char"] = explode(' ', $translit);
			}
			fclose($fh);
		}
	}

	// decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
	foreach ($decomposition as $from => $to) {
		$code_decomp = array();

		while ($code_value = array_shift($to)) {
			if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
				foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
					array_unshift($to, $cv);
				}
			} elseif (!isset($mark["U+$code_value"])) { // remove mark
				array_push($code_decomp, $code_value);
			}
		}
		if (count($code_decomp) || isset($omit[$from])) {
			$decomposition[$from] = $code_decomp;
		} else {
			unset($decomposition[$from]);
		}
	}

	// create ascii only mapping
	$this->toASCII['utf-8'] = array();
	$ascii =& $this->toASCII['utf-8'];

	foreach ($decomposition as $from => $to) {
		$code_decomp = array();
		while ($code_value = array_shift($to)) {
			$ord = hexdec($code_value);
			if ($ord > 127) {
				// skip decompositions containing non-ASCII chars
				continue 2;
			}
			array_push($code_decomp, chr($ord));
		}
		// Keys have the form "U+XXXX": strip the prefix so hexdec() only sees hex digits
		$ascii[$this->UnumberToChar(hexdec(substr($from, 2)))] = join('', $code_decomp);
	}

	// add numeric decompositions
	foreach ($number as $from => $to) {
		$utf8_char = $this->UnumberToChar(hexdec(substr($from, 2)));
		if (!isset($ascii[$utf8_char])) {
			$ascii[$utf8_char] = $to;
		}
	}

	if ($cacheFileCase) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
	}

	if ($cacheFileASCII) {
		t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
	}

	return 3;
}
/**
 * This function initializes the folding table for a charset other than UTF-8.
 * This function is automatically called by the case folding functions.
 *
 * The table is derived from the UTF-8 case folding table: every character of the
 * charset's conversion table is folded in UTF-8 and converted back, then the plain
 * ASCII a-z / A-Z mappings are added. The result is cached below typo3temp/cs/.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initCaseFolding($charset) {
	// Nothing to do when the case table is in memory already
	if (is_array($this->caseFolding[$charset])) {
		return 1;
	}

	// Prefer the cached table
	$cachePath = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
	if ($cachePath && @is_file($cachePath)) {
		$this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($cachePath));
		return 2;
	}

	// Both the charset's UTF-8 conversion table and the UTF-8 case folding
	// (used as the base conversion table) are required
	if (!$this->initCharset($charset) || !$this->initUnicodeData('case')) {
		return FALSE;
	}

	$placeholder = chr($this->noCharByteVal);
	$directions = array('toUpper', 'toLower', 'toTitle');

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);

		foreach ($directions as $direction) {
			$cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
			// Skip characters without a mapping and mappings that cannot be expressed in the charset
			if ($cc == '' || $cc == $placeholder) {
				continue;
			}
			$this->caseFolding[$charset][$direction][$c] = $cc;
		}
	}

	// add the ASCII case table (upper and lower case letters are 32 apart)
	foreach (range(ord('a'), ord('z')) as $code) {
		$this->caseFolding[$charset]['toUpper'][chr($code)] = chr($code - 32);
	}
	foreach (range(ord('A'), ord('Z')) as $code) {
		$this->caseFolding[$charset]['toLower'][chr($code)] = chr($code + 32);
	}

	if ($cachePath) {
		t3lib_div::writeFileToTypo3tempDir($cachePath, serialize($this->caseFolding[$charset]));
	}

	return 3;
}
/**
 * This function initializes the to-ASCII conversion table for a charset other than UTF-8.
 * This function is automatically called by the ASCII transliteration functions.
 *
 * Every character of the charset's conversion table that has an entry in the
 * UTF-8 to-ASCII table gets that entry, keyed by the character in the charset's
 * own encoding. The result is cached below typo3temp/cs/.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return integer Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
 * @access private
 */
function initToASCII($charset) {
	// Nothing to do when the table is in memory already
	if (is_array($this->toASCII[$charset])) {
		return 1;
	}

	// Prefer the cached table
	$cachePath = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
	if ($cachePath && @is_file($cachePath)) {
		$this->toASCII[$charset] = unserialize(t3lib_div::getUrl($cachePath));
		return 2;
	}

	// The charset's UTF-8 conversion table is needed ...
	if (!$this->initCharset($charset)) {
		return FALSE;
	}
	// ... as well as the UTF-8/ASCII transliteration, which is the base conversion table
	if (!$this->initUnicodeData('ascii')) {
		return FALSE;
	}

	foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
		// reconvert to charset (don't use chr() of numeric value, might be multi-byte)
		$c = $this->utf8_decode($utf8, $charset);
		if (!isset($this->toASCII['utf-8'][$utf8])) {
			continue;
		}
		$this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
	}

	if ($cachePath) {
		t3lib_div::writeFileToTypo3tempDir($cachePath, serialize($this->toASCII[$charset]));
	}

	return 3;
}
/********************************************
*
* String operation functions
*
********************************************/
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] the work is done by
 * mbstring or iconv; otherwise by the class' own UTF-8 / EUC routines or, for fixed
 * width charsets, by plain byte arithmetic.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters); if omitted, everything from $start to the end is returned
 * @return string The substring
 * @see substr(), mb_substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset, $string, $start, $len = NULL) {
	if ($len === 0 || $string === '') {
		return '';
	}

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		// cannot omit $len, when specifying charset
		// (loose comparison kept: in this branch '' and FALSE are treated like an omitted length, too)
		if ($len == NULL) {
			$enc = mb_internal_encoding(); // save internal encoding
			mb_internal_encoding($charset);
			$str = mb_substr($string, $start);
			mb_internal_encoding($enc); // restore internal encoding

			return $str;
		}
		return mb_substr($string, $start, $len, $charset);
	}

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'iconv') {
		// cannot omit $len, when specifying charset
		// (loose comparison kept: in this branch '' and FALSE are treated like an omitted length, too)
		if ($len == NULL) {
			$enc = iconv_get_encoding('internal_encoding'); // save internal encoding
			iconv_set_encoding('internal_encoding', $charset);
			$str = iconv_substr($string, $start);
			iconv_set_encoding('internal_encoding', $enc); // restore internal encoding

			return $str;
		}
		return iconv_substr($string, $start, $len, $charset);
	}

	if ($charset == 'utf-8') {
		return $this->utf8_substr($string, $start, $len);
	}
	if (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_substr($string, $start, $charset, $len);
	}

	// Fixed width charsets: character positions map to byte positions by a constant factor.
	// Everything not listed as two or four byte set is treated as single-byte encoding.
	if (!empty($this->twoByteSets[$charset])) {
		$bytesPerChar = 2;
	} elseif (!empty($this->fourByteSets[$charset])) {
		$bytesPerChar = 4;
	} else {
		$bytesPerChar = 1;
	}

	// An omitted length must not be multiplied (NULL * n is 0, which would yield an empty string)
	if ($len === NULL) {
		return substr($string, $start * $bytesPerChar);
	}
	return substr($string, $start * $bytesPerChar, $len * $bytesPerChar);
}
/**
 * Counts the number of characters.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Uses mbstring or iconv when $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] says so,
 * otherwise the class' own UTF-8 / EUC counters or, for fixed width charsets, the byte
 * length divided by the character width.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset, $string) {
	$backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

	if ($backend == 'mbstring') {
		return mb_strlen($string, $charset);
	}
	if ($backend == 'iconv') {
		return iconv_strlen($string, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strlen($string);
	}
	if ($this->eucBasedSets[$charset]) {
		return $this->euc_strlen($string, $charset);
	}
	if ($this->twoByteSets[$charset]) {
		return strlen($string) / 2;
	}
	if ($this->fourByteSets[$charset]) {
		return strlen($string) / 4;
	}

	// treat everything else as single-byte encoding
	return strlen($string);
}
2009-11-18 21:48:00 +00:00
/**
 * Method to crop strings using the mb_substr function.
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative
 * $len keeps the last abs($len) characters and prepends $crop. The string is
 * returned unchanged if $len is 0 or the string is not longer than abs($len).
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param integer $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
	// No crop length given
	if (intval($len) === 0) {
		return $string;
	}
	// Short enough already
	if (mb_strlen($string, $charset) <= abs($len)) {
		return $string;
	}

	if ($len > 0) {
		// Keep the head
		return mb_substr($string, 0, $len, $charset) . $crop;
	}

	// Keep the tail; the character count serves as "up to the end" length
	return $crop . mb_substr($string, $len, mb_strlen($string, $charset), $charset);
}
2005-10-17 15:48:46 +00:00
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive $len keeps the first $len characters and appends $crop, a negative
 * $len keeps the last abs($len) characters and prepends $crop. If nothing has to
 * be cut off, the string is returned unchanged (without $crop).
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset, $string, $len, $crop = '') {
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return $this->cropMbstring($charset, $string, $len, $crop);
	}

	if (intval($len) == 0) {
		return $string;
	}

	// Translate the character position into the byte position $i where the cut happens
	if ($charset == 'utf-8') {
		$i = $this->utf8_char2byte_pos($string, $len);
	} elseif (!empty($this->eucBasedSets[$charset])) {
		$i = $this->euc_char2byte_pos($string, $len, $charset);
	} elseif ($len > 0) {
		$i = $len;
	} else {
		$i = strlen($string) + $len;
		if ($i <= 0) {
			$i = FALSE;
		}
	}

	if ($i === FALSE) {
		// $len outside actual string length
		return $string;
	}

	$byteLength = strlen($string);
	if ($len > 0) {
		// Only crop if there is at least one byte at the cut position, i.e. something is cut off
		if ($i >= 0 && $i < $byteLength) {
			return substr($string, 0, $i) . $crop;
		}
	} elseif ($i >= 1 && $i <= $byteLength) {
		// Only crop if there is at least one byte in front of the cut position.
		// Checked explicitly: offset -1 must not be read as "last byte of the string".
		return $crop . substr($string, $i);
	}

	return $string;
}
/**
* Cuts a string short at a given byte length .
*
* @ param string The character set
* @ param string Character string
* @ param integer The byte length
* @ return string The shortened string
* @ see mb_strcut ()
* @ author Martin Kutschker < martin . t . kutschker @ blackbox . net >
*/
2011-10-22 21:07:24 +02:00
function strtrunc($charset, $string, $len) {
	// Nothing fits into a non-positive byte length.
	if ($len <= 0) {
		return '';
	}

	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return mb_strcut($string, 0, $len, $charset);
	}
	if ($charset == 'utf-8') {
		return $this->utf8_strtrunc($string, $len);
	}
	// !empty() keeps charsets missing from the lookup tables from raising
	// "undefined index" notices; they fall through to the single-byte handling.
	if (!empty($this->eucBasedSets[$charset])) {
		return $this->euc_strtrunc($string, $len, $charset);
	}

	if (!empty($this->twoByteSets[$charset])) {
		// don't cut at odd positions
		$len -= $len % 2;
	} elseif (!empty($this->fourByteSets[$charset])) {
		// realign to a position divisible by four
		$len -= $len % 4;
	}

	// treat everything else as single-byte encoding
	return substr($string, 0, $len);
}
/**
* Translates all characters of a string into their respective case values .
* Unlike strtolower () and strtoupper () this method is locale independent .
* Note that the string length may change !
2011-10-22 21:07:24 +02:00
* eg . lower case German " ß " ( sharp S ) becomes upper case " SS "
2005-10-17 15:48:46 +00:00
* Unit - tested by Kasper
* Real case folding is language dependent , this method ignores this fact .
*
* @ param string Character set of string
* @ param string Input string to convert case for
* @ param string Case keyword : " toLower " means lowercase conversion , anything else is uppercase ( use " toUpper " )
* @ return string The converted string
* @ author Martin Kutschker < martin . t . kutschker @ blackbox . net >
* @ see strtolower (), strtoupper ()
*/
2011-10-22 21:07:24 +02:00
function conv_case($charset, $string, $case) {
	// With mbstring configured the conversion is left to the extension.
	if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
		return ($case == 'toLower')
			? mb_strtolower($string, $charset)
			: mb_strtoupper($string, $charset);
	}

	// Otherwise pick the mapping routine matching the charset family.
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'case', $case);
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'case', $case);
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'case', $case);
}
2012-04-21 21:31:03 +02:00
/**
* Equivalent of lcfirst / ucfirst but using character set .
*
* @ param string $charset
* @ param string $string
* @ param string $case
* @ return string
* @ see t3lib_cs :: conv_case ()
*/
public function convCaseFirst($charset, $string, $case) {
	// Convert only the leading character, then append the untouched rest.
	$head = $this->conv_case($charset, $this->substr($charset, $string, 0, 1), $case);
	$tail = $this->substr($charset, $string, 1);
	return $head . $tail;
}
2005-10-17 15:48:46 +00:00
/**
2011-10-22 21:07:24 +02:00
* Converts special chars ( like æøåÆØÅ , umlauts etc ) to ascii equivalents ( usually double - bytes , like æ => ae etc . )
2005-10-17 15:48:46 +00:00
*
2012-09-02 16:10:57 +02:00
* @ param string $charset Character set of string
* @ param string $string Input string to convert
* @ return string The converted string
2005-10-17 15:48:46 +00:00
*/
2011-10-22 21:07:24 +02:00
function specCharsToASCII($charset, $string) {
	// Pick the 'ascii' mapping routine matching the charset family.
	if ($charset == 'utf-8') {
		return $this->utf8_char_mapping($string, 'ascii');
	}
	if (isset($this->eucBasedSets[$charset])) {
		return $this->euc_char_mapping($string, $charset, 'ascii');
	}

	// treat everything else as single-byte encoding
	return $this->sb_char_mapping($string, $charset, 'ascii');
}
2009-11-18 21:48:00 +00:00
/**
* converts the language codes that we get from the client ( usually HTTP_ACCEPT_LANGUAGE )
* into a TYPO3 - readable language code
* @ param $languageCodesList list of language codes . something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
2011-10-22 21:07:24 +02:00
* see http :// www . w3 . org / Protocols / rfc2616 / rfc2616 - sec14 . html #sec14.4
2009-11-18 21:48:00 +00:00
* @ return string a preferred language that TYPO3 supports , or " default " if none found
* @ author Benjamin Mack ( benni . typo3 . org )
*/
public function getPreferredClientLanguage($languageCodesList) {
	$allLanguageCodes = array();
	$selectedLanguage = 'default';

	// Get all languages where the TYPO3 code is the same as the ISO code.
	foreach ($this->charSetArray as $typo3Lang => $charSet) {
		$allLanguageCodes[$typo3Lang] = $typo3Lang;
	}

	// Get all languages where the TYPO3 code differs from the ISO code or needs
	// the country part. The ISO code replaces the default value stored for the
	// TYPO3 language key ("xx_YY" is rewritten to "xx-YY").
	foreach ($this->locales->getIsoMapping() as $typo3Lang => $isoLang) {
		$allLanguageCodes[$typo3Lang] = str_replace('_', '-', $isoLang);
	}

	// Move the ISO codes to the keys (they are looked up with isset() below).
	$allLanguageCodes = array_flip($allLanguageCodes);

	// Collect the client's languages as "code => quality"; a missing q-value counts as 1.0.
	$preferredLanguages = t3lib_div::trimExplode(',', $languageCodesList);
	$sortedPreferredLanguages = array();
	foreach ($preferredLanguages as $preferredLanguage) {
		$quality = 1.0;
		if (strpos($preferredLanguage, ';q=') !== FALSE) {
			$codeAndQuality = explode(';q=', $preferredLanguage);
			$preferredLanguage = $codeAndQuality[0];
			$quality = $codeAndQuality[1];
		}
		$sortedPreferredLanguages[$preferredLanguage] = $quality;
	}

	// Loop through the languages, highest quality first.
	arsort($sortedPreferredLanguages, SORT_NUMERIC);
	foreach ($sortedPreferredLanguages as $preferredLanguage => $quality) {
		if (isset($allLanguageCodes[$preferredLanguage])) {
			$selectedLanguage = $allLanguageCodes[$preferredLanguage];
			break;
		}

		// Retry without the country part. Taking element 0 of the exploded code
		// works for codes without a "-" as well and raises no "undefined offset" notice.
		$languageParts = explode('-', $preferredLanguage);
		$languageOnly = $languageParts[0];
		if (isset($allLanguageCodes[$languageOnly])) {
			$selectedLanguage = $allLanguageCodes[$languageOnly];
			break;
		}
	}

	// "en" is reported as "default".
	if (!$selectedLanguage || $selectedLanguage == 'en') {
		$selectedLanguage = 'default';
	}
	return $selectedLanguage;
}
2005-10-17 15:48:46 +00:00
/********************************************
*
* Internal string operation functions
*
********************************************/
/**
* Maps all characters of a string in a single byte charset .
*
* @ param string the string
* @ param string the charset
* @ param string mode : 'case' ( case folding ) or 'ascii' ( ASCII transliteration )
* @ param string 'case' : conversion 'toLower' or 'toUpper'
* @ return string the converted string
* @ author Martin Kutschker < martin . t . kutschker @ blackbox . net >
*/
2011-10-22 21:07:24 +02:00
function sb_char_mapping($str, $charset, $mode, $opt = '') {
	// Select the lookup table for the requested mode; without a table the
	// string is returned untouched.
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str;
			} // do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;
		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str;
			} // do nothing
			$map =& $this->toASCII[$charset];
			break;
		default:
			return $str;
	}

	// Map byte by byte. The loop is bounded by the byte length instead of probing
	// offsets past the end of the string, and uses [] offsets because the {} offset
	// syntax is no longer available in PHP 8.
	$str = (string) $str;
	$byteLength = strlen($str);
	$out = '';
	for ($i = 0; $i < $byteLength; $i++) {
		$c = $str[$i];
		$out .= isset($map[$c]) ? $map[$c] : $c;
	}
	return $out;
}
/********************************************
*
* Internal UTF - 8 string operation functions
*
********************************************/
/**
* Returns a part of a UTF - 8 string .
* Unit - tested by Kasper and works 100 % like substr () / mb_substr () for full range of $start / $len
*
* @ param string UTF - 8 string
* @ param integer Start position ( character position )
* @ param integer Length ( in characters )
* @ return string The substring
* @ see substr ()
* @ author Martin Kutschker < martin . t . kutschker @ blackbox . net >
*/
2011-10-22 21:07:24 +02:00
function utf8_substr($str, $start, $len = NULL) {
	// A length of zero always yields an empty string. NULL ("no length given")
	// is excluded explicitly so it is never handed to a string function.
	if ($len !== NULL && (string) $len === '0') {
		return '';
	}

	$byte_start = $this->utf8_char2byte_pos($str, $start);
	if ($byte_start === FALSE) {
		if ($start > 0) {
			return FALSE; // $start outside string length
		}
		// A $start at or before the beginning of the string means "from the first byte".
		$byte_start = 0;
	}

	$str = substr($str, $byte_start);

	if ($len != NULL) {
		$byte_end = $this->utf8_char2byte_pos($str, $len);
		if ($byte_end === FALSE) {
			// $len outside actual string length: a negative length that removes
			// more than is available yields a blank string, a positive one the rest.
			return $len < 0 ? '' : $str;
		}
		return substr($str, 0, $byte_end);
	}
	return $str;
}
/**
* Counts the number of characters of a string in UTF - 8.
* Unit - tested by Kasper and works 100 % like strlen () / mb_strlen ()
*
* @ param string UTF - 8 multibyte character string
* @ return integer The number of characters
* @ see strlen ()
* @ author Martin Kutschker < martin . t . kutschker @ blackbox . net >
*/
2011-10-22 21:07:24 +02:00
function utf8_strlen($str) {
	// Counts characters by counting every byte that starts one: single-byte
	// characters (0xxxxxxx) and multi-byte starting bytes (11xxxxxx).
	// Continuation bytes (10xxxxxx) are skipped.
	$str = (string) $str;
	$byteLength = strlen($str);
	$n = 0;
	// Bounded by the byte length and using [] offsets ({} offsets are gone in PHP 8).
	for ($i = 0; $i < $byteLength; $i++) {
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	return $n;
}
/**
* Truncates a string in UTF - 8 short at a given byte length .
*
* @ param string UTF - 8 multibyte character string
* @ param integer the byte length
* @ return string the shortened string
* @ see mb_strcut ()
* @ author Martin Kutschker < martin . t . kutschker @ blackbox . net >
*/
2011-10-22 21:07:24 +02:00
function utf8_strtrunc($str, $len) {
	// Cuts $str at $len bytes without splitting a multi-byte character: a
	// character that does not fit completely into $len bytes is dropped.
	$str = (string) $str;
	if ($len <= 0) {
		return '';
	}
	if ($len >= strlen($str)) {
		return $str;
	}

	$i = $len - 1;
	if (ord($str[$i]) & 0x80) { // the last kept byte is part of a multi-byte sequence
		// Walk back over continuation bytes (10xxxxxx) to the first byte of the sequence.
		while ($i > 0 && (ord($str[$i]) & 0xC0) == 0x80) {
			$i--;
		}
		$lead = ord($str[$i]);
		// Offset 0 is a valid start of a sequence, so it is inspected like any
		// other position instead of being treated as "nothing fits".
		if (($lead & 0xC0) == 0xC0) {
			// The number of leading 1-bits is the number of bytes of the sequence.
			for ($bc = 0; $lead & 0x80; $lead = ($lead << 1) & 0xFF) {
				$bc++;
			}
			if ($bc + $i > $len) {
				// The character would be cut in half: drop it entirely.
				return substr($str, 0, $i);
			}
		}
		// fallthru: multibyte char fits into length
	}
	return substr($str, 0, $len);
}
/**
* Find position of first occurrence of a string , both arguments are in UTF - 8.
*
* @ param string UTF - 8 string to search in
* @ param string UTF - 8 string to search for
* @ param integer Position to start the search
* @ return integer The character position
* @ see strpos ()
* @ author Martin Kutschker < martin . t . kutschker @ blackbox . net >
*/
2011-10-22 21:07:24 +02:00
function utf8_strpos($haystack, $needle, $offset = 0) {
	// Prefer the configured extension, if any.
	$backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($backend == 'mbstring') {
		return mb_strpos($haystack, $needle, $offset, 'utf-8');
	}
	if ($backend == 'iconv') {
		return iconv_strpos($haystack, $needle, $offset, 'utf-8');
	}

	// Fallback: search on byte level and translate the hit back into a character position.
	$startByte = $this->utf8_char2byte_pos($haystack, $offset);
	if ($startByte !== FALSE) {
		$foundByte = strpos($haystack, $needle, $startByte);
		if ($foundByte !== FALSE) {
			return $this->utf8_byte2char_pos($haystack, $foundByte);
		}
	}

	// offset beyond string length, or needle not found
	return FALSE;
}
/**
* Find position of last occurrence of a char in a string , both arguments are in UTF - 8.
*
* @ param string UTF - 8 string to search in
* @ param string UTF - 8 character to search for ( single character )
* @ return integer The character position
* @ see strrpos ()
* @ author Martin Kutschker < martin . t . kutschker @ blackbox . net >
*/
2011-10-22 21:07:24 +02:00
function utf8_strrpos($haystack, $needle) {
	// Prefer the configured extension, if any.
	$backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
	if ($backend == 'mbstring') {
		return mb_strrpos($haystack, $needle, 'utf-8');
	}
	if ($backend == 'iconv') {
		return iconv_strrpos($haystack, $needle, 'utf-8');
	}

	// Fallback: search on byte level and translate the hit back into a character position.
	$foundByte = strrpos($haystack, $needle);
	if ($foundByte !== FALSE) {
		return $this->utf8_byte2char_pos($haystack, $foundByte);
	}

	// needle not found
	return FALSE;
}
/**
* Translates a character position into an 'absolute' byte position .
* Unit tested by Kasper .
*
* @ param string UTF - 8 string
* @ param integer Character position ( negative values start from the end )
* @ return integer Byte position
* @ author Martin Kutschker < martin . t . kutschker @ blackbox . net >
*/
2011-10-22 21:07:24 +02:00
function utf8_char2byte_pos($str, $pos) {
	// Walks over the string (forwards for $pos >= 0, backwards from the end
	// otherwise) until abs($pos) characters were seen and returns the byte offset
	// reached. FALSE is returned when the walk runs off either end of the string.
	$str = (string) $str;
	$byteLength = strlen($str);
	$n = 0; // number of characters found
	$p = abs($pos); // number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLength - 1;
		$d = -1;
	}

	// The bounds are checked explicitly: since PHP 7.1 a negative string offset
	// addresses bytes from the end, so reading $str[-1] no longer signals that
	// the start of the string was passed.
	while ($i >= 0 && $i < $byteLength && $n < $p) {
		// single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) start a character
		if ((ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
		$i += $d;
	}

	if ($i < 0 || $i >= $byteLength) {
		return FALSE; // offset beyond string length
	}

	if ($pos >= 0) {
		// Skip trailing multi-byte data bytes (10xxxxxx). This may move $i to the
		// very end of the string, in which case the byte length is returned.
		while ($i < $byteLength && (ord($str[$i]) & 0xC0) == 0x80) {
			$i++;
		}
	} else {
		// correct offset: the walk stopped one byte before the character start
		$i++;
	}
	return $i;
}
/**
* Translates an 'absolute' byte position into a character position .
* Unit tested by Kasper .
*
* @ param string UTF - 8 string
* @ param integer byte position
* @ return integer character position
* @ author Martin Kutschker < martin . t . kutschker @ blackbox . net >
*/
2011-10-22 21:07:24 +02:00
function utf8_byte2char_pos($str, $pos) {
	// Counts the character starts found in the byte range 1..$pos, which is the
	// character position of the character starting at byte $pos.
	$str = (string) $str;
	$byteLength = strlen($str);

	// Validate the offset before counting: an empty string or a position outside
	// 0..byte length has no character position.
	if ($byteLength === 0 || $pos < 0 || $pos > $byteLength) {
		return FALSE;
	}

	$n = 0; // number of characters
	for ($i = $pos; $i > 0; $i--) {
		// The position directly behind the last byte counts as a boundary, so
		// $pos == byte length yields the number of characters of the string.
		// Otherwise single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) count.
		if ($i == $byteLength || (ord($str[$i]) & 0xC0) != 0x80) {
			$n++;
		}
	}
	return $n;
}
/**
* Maps all characters of an UTF - 8 string .
*
* @ param string UTF - 8 string
* @ param string mode : 'case' ( case folding ) or 'ascii' ( ASCII transliteration )
* @ param string 'case' : conversion 'toLower' or 'toUpper'
* @ return string the converted string
* @ author Martin Kutschker < martin . t . kutschker @ blackbox . net >
*/
2011-10-22 21:07:24 +02:00
function utf8_char_mapping($str, $mode, $opt = '') {
	if (!$this->initUnicodeData($mode)) {
		return $str;
	} // do nothing

	// Select the lookup table for the requested mode.
	switch ($mode) {
		case 'case':
			$map =& $this->caseFolding['utf-8'][$opt];
			break;
		case 'ascii':
			$map =& $this->toASCII['utf-8'];
			break;
		default:
			return $str;
	}

	$str = (string) $str;
	$byteLength = strlen($str);
	$out = '';
	for ($i = 0; $i < $byteLength; $i++) {
		$c = ord($str[$i]);
		if (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
			// The number of leading 1-bits is the number of bytes of the sequence.
			for ($bc = 0; $c & 0x80; $c = $c << 1) {
				$bc++;
			}
			$mbc = substr($str, $i, $bc);
			$i += $bc - 1;
		} else {
			// Single-byte character (0xxxxxxx). A stray continuation byte (10xxxxxx)
			// is handled as a character of its own as well, so it passes through
			// instead of repeating the previously mapped character.
			$mbc = $str[$i];
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}
	return $out;
}
/********************************************
*
* Internal EUC string operation functions
*
* Extended Unix Code :
* ASCII compatible 7 bit single bytes chars
* 8 bit two byte chars
*
* Shift - JIS is treated as a special case .
*
********************************************/
/**
* Cuts a string in the EUC charset family short at a given byte length .
*
* @ param string EUC multibyte character string
* @ param integer the byte length
* @ param string the charset
* @ return string the shortened string
* @ see mb_strcut ()
* @ author Martin Kutschker < martin . t . kutschker @ blackbox . net >
*/
2011-10-22 21:07:24 +02:00
function euc_strtrunc($str, $len, $charset) {
	// Cuts $str at $len bytes without splitting a double-byte character.
	$str = (string) $str;
	if ($len <= 0) {
		return '';
	}
	// String not longer than the supplied length: nothing to cut. Comparing the
	// byte lengths up front also covers a string whose last double-byte character
	// straddles $len, which has to be cut.
	if (strlen($str) <= $len) {
		return $str;
	}

	$sjis = ($charset == 'shift_jis');
	// Advance character by character until $len is reached or passed.
	// $i stays below $len, which is below the byte length, so $str[$i] exists.
	$i = 0;
	while ($i < $len) {
		$c = ord($str[$i]);
		if ($sjis) {
			$isDoubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		$i += $isDoubleByte ? 2 : 1;
	}

	if ($i > $len) {
		// we ended on a first byte: drop the incomplete character
		return substr($str, 0, $len - 1);
	}
	return substr($str, 0, $len);
}
2005-10-17 15:48:46 +00:00
/**
* Returns a part of a string in the EUC charset family .
*
* @ param string EUC multibyte character string
* @ param integer start position ( character position )
* @ param string the charset
* @ param integer length ( in characters )
* @ return string the substring
* @ author Martin Kutschker < martin . t . kutschker @ blackbox . net >
*/
2011-10-22 21:07:24 +02:00
function euc_substr($str, $start, $charset, $len = NULL) {
	// A length of zero always yields an empty string (same rule as utf8_substr()).
	if ($len !== NULL && (string) $len === '0') {
		return '';
	}

	$byte_start = $this->euc_char2byte_pos($str, $start, $charset);
	if ($byte_start === FALSE) {
		return FALSE;
	} // $start outside string length

	$str = substr($str, $byte_start);

	if ($len != NULL) {
		$byte_end = $this->euc_char2byte_pos($str, $len, $charset);
		if ($byte_end === FALSE) {
			// $len outside actual string length: a negative length that removes
			// more than is available yields a blank string, a positive one the rest.
			return $len < 0 ? '' : $str;
		}
		return substr($str, 0, $byte_end);
	}
	return $str;
}
/**
* Counts the number of characters of a string in the EUC charset family .
*
* @ param string EUC multibyte character string
* @ param string the charset
* @ return integer the number of characters
* @ see strlen ()
* @ author Martin Kutschker < martin . t . kutschker @ blackbox . net >
*/
2011-10-22 21:07:24 +02:00
function euc_strlen($str, $charset) {
	// Counts characters: a byte recognised as the first byte of a double-byte
	// character consumes the following byte as well.
	$str = (string) $str;
	$byteLength = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$n = 0;
	// Bounded by the byte length and using [] offsets ({} offsets are gone in PHP 8).
	for ($i = 0; $i < $byteLength; $i++) {
		$c = ord($str[$i]);
		if ($sjis) {
			$isDoubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		if ($isDoubleByte) {
			$i++; // advance a double-byte char
		}
		$n++;
	}
	return $n;
}
/**
* Translates a character position into an 'absolute' byte position .
*
* @ param string EUC multibyte character string
* @ param integer character position ( negative values start from the end )
* @ param string the charset
* @ return integer byte position
* @ author Martin Kutschker < martin . t . kutschker @ blackbox . net >
*/
2011-10-22 21:07:24 +02:00
function euc_char2byte_pos($str, $pos, $charset) {
	// Walks over the string (forwards for $pos >= 0, backwards from the end
	// otherwise) until abs($pos) characters were seen and returns the byte offset
	// reached. FALSE is returned when the walk runs off either end of the string.
	$str = (string) $str;
	$byteLength = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$n = 0; // number of characters seen
	$p = abs($pos); // number of characters wanted

	if ($pos >= 0) {
		$i = 0;
		$d = 1;
	} else {
		$i = $byteLength - 1;
		$d = -1;
	}

	// The bounds are checked explicitly: since PHP 7.1 a negative string offset
	// addresses bytes from the end, so reading $str[-1] no longer signals that
	// the start of the string was passed.
	// NOTE(review): when walking backwards the byte tested is the last byte of a
	// character; for shift_jis that byte may not match the lead-byte ranges - confirm.
	while ($i >= 0 && $i < $byteLength && $n < $p) {
		$c = ord($str[$i]);
		if ($sjis) {
			$isDoubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		if ($isDoubleByte) {
			$i += $d; // advance a double-byte char
		}
		$i += $d;
		$n++;
	}

	if ($i < 0 || $i >= $byteLength) {
		return FALSE; // offset beyond string length
	}

	if ($pos < 0) {
		$i++; // correct offset: the walk stopped one byte before the character start
	}
	return $i;
}
/**
* Maps all characters of a string in the EUC charset family .
*
* @ param string EUC multibyte character string
* @ param string the charset
* @ param string mode : 'case' ( case folding ) or 'ascii' ( ASCII transliteration )
* @ param string 'case' : conversion 'toLower' or 'toUpper'
* @ return string the converted string
* @ author Martin Kutschker < martin . t . kutschker @ blackbox . net >
*/
2011-10-22 21:07:24 +02:00
function euc_char_mapping($str, $charset, $mode, $opt = '') {
	// Select the lookup table for the requested mode; without a table the
	// string is returned untouched.
	switch ($mode) {
		case 'case':
			if (!$this->initCaseFolding($charset)) {
				return $str;
			} // do nothing
			$map =& $this->caseFolding[$charset][$opt];
			break;
		case 'ascii':
			if (!$this->initToASCII($charset)) {
				return $str;
			} // do nothing
			$map =& $this->toASCII[$charset];
			break;
		default:
			return $str;
	}

	$str = (string) $str;
	$byteLength = strlen($str);
	$sjis = ($charset == 'shift_jis');
	$out = '';
	// Bounded by the byte length and using [] offsets ({} offsets are gone in PHP 8).
	for ($i = 0; $i < $byteLength; $i++) {
		$mbc = $str[$i];
		$c = ord($mbc);
		if ($sjis) {
			$isDoubleByte = ($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0);
		} else {
			$isDoubleByte = ($c >= 0x80);
		}
		if ($isDoubleByte) { // a double-byte char: look up both bytes together
			$mbc = substr($str, $i, 2);
			$i++;
		}

		$out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
	}
	return $out;
}
}
2011-10-22 21:07:24 +02:00
if (
	defined('TYPO3_MODE')
	&& isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])
) {
	// Include the file registered under the XCLASS entry for this script.
	include_once $GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'];
}
2009-11-18 21:48:00 +00:00
2012-09-02 16:10:57 +02:00
?>