1
0
mirror of https://github.com/e107inc/e107.git synced 2025-07-31 03:40:37 +02:00

work on CHARSET

This commit is contained in:
marj
2009-10-30 20:58:52 +00:00
parent 68d962a5cc
commit 5ee7878f47
2 changed files with 54 additions and 25 deletions

View File

@@ -9,8 +9,8 @@
* General purpose file * General purpose file
* *
* $Source: /cvs_backup/e107_0.8/class2.php,v $ * $Source: /cvs_backup/e107_0.8/class2.php,v $
* $Revision: 1.151 $ * $Revision: 1.152 $
* $Date: 2009-10-28 15:42:16 $ * $Date: 2009-10-30 20:58:52 $
* $Author: marj_nl_fr $ * $Author: marj_nl_fr $
* *
*/ */
@@ -50,7 +50,7 @@ $oblev_before_start = ob_get_level();
if(!isset($_E107) || !is_array($_E107)) { $_E107 = array(); } if(!isset($_E107) || !is_array($_E107)) { $_E107 = array(); }
if(isset($_E107['cli']) && !isset($_E107['debug']) && isset($_SERVER["HTTP_USER_AGENT"])) if(isset($_E107['cli']) && !isset($_E107['debug']) && isset($_SERVER["HTTP_USER_AGENT"]))
{ {
exit; exit();
} }
if(!isset($_E107['cli'])) if(!isset($_E107['cli']))
@@ -726,7 +726,8 @@ e107_include_once(e_LANGUAGEDIR.e_LANGUAGE.'/'.e_LANGUAGE.'.php');
e107_include_once(e_LANGUAGEDIR.e_LANGUAGE."/".e_LANGUAGE.'_custom.php'); e107_include_once(e_LANGUAGEDIR.e_LANGUAGE."/".e_LANGUAGE.'_custom.php');
// Now we know the site CHARSET, define how to handle utf-8 as necessary // Now we know the site CHARSET, define how to handle utf-8 as necessary
$tp->initCharset(); // CHARSET is UTF-8, thus initCharset() is used in e_parse() constructor
// $tp->initCharset();
if($pref['sitelanguage'] != e_LANGUAGE && varset($pref['multilanguage']) && !$pref['multilanguage_subdomain']) if($pref['sitelanguage'] != e_LANGUAGE && varset($pref['multilanguage']) && !$pref['multilanguage_subdomain'])
{ {

View File

@@ -9,8 +9,8 @@
* Text processing and parsing functions * Text processing and parsing functions
* *
* $Source: /cvs_backup/e107_0.8/e107_handlers/e_parse_class.php,v $ * $Source: /cvs_backup/e107_0.8/e107_handlers/e_parse_class.php,v $
* $Revision: 1.71 $ * $Revision: 1.72 $
* $Date: 2009-10-30 20:05:17 $ * $Date: 2009-10-30 20:58:51 $
* $Author: marj_nl_fr $ * $Author: marj_nl_fr $
* *
*/ */
@@ -207,27 +207,43 @@ class e_parse
); );
function e_parse() /**
* Constructor - keep it public for backward compatibility
* (there are still some "new e_parse()" calls in the core)
*
* @return void
*/
public function __construct()
{ {
// initialise the type of UTF-8 processing methods depending on PHP version and mb string extension
$this->initCharset();
// Preprocess the supermods to be useful default arrays with all values // Preprocess the supermods to be useful default arrays with all values
foreach ($this->e_SuperMods as $key=>$val) foreach ($this->e_SuperMods as $key => $val)
{ {
// precalculate super defaults // precalculate super defaults
$this->e_SuperMods[$key] = array_merge($this->e_optDefault , $this->e_SuperMods [$key]); $this->e_SuperMods[$key] = array_merge($this->e_optDefault , $this->e_SuperMods[$key]);
$this->e_SuperMods[$key]['context'] = $key; $this->e_SuperMods[$key]['context'] = $key;
} }
} }
// This has to be a separate function - can't be called until CHARSET known /**
//TODO deprecated? * Initialise the type of UTF-8 processing methods depending on PHP version and mb string extension.
function initCharset() *
* NOTE: can't be called until CHARSET is known
* (but CHARSET is always UTF-8 now)
*
* @return void
*/
private function initCharset()
{ {
// Start by working out what, if anything, we do about utf-8 handling. // Start by working out what, if anything, we do about utf-8 handling.
// 'Do nothing' is the simple option // 'Do nothing' is the simple option
$this->utfAction = 0; $this->utfAction = 0;
if(strtolower(CHARSET) == 'utf-8') // CHARSET is utf-8
{ // if(strtolower(CHARSET) == 'utf-8')
// {
$this->isutf8 = TRUE; $this->isutf8 = TRUE;
if(version_compare(PHP_VERSION, '6.0.0') < 1) if(version_compare(PHP_VERSION, '6.0.0') < 1)
{ {
@@ -254,7 +270,7 @@ class e_parse
require (E_UTF8_PACK.'native/core.php'); require (E_UTF8_PACK.'native/core.php');
} }
} }
} // }
} }
@@ -407,7 +423,7 @@ class e_parse
} }
else else
{ {
$data = htmlspecialchars($data, ENT_QUOTES, CHARSET); $data = htmlspecialchars($data, ENT_QUOTES, 'UTF-8');
$data = str_replace('\\', '&#092;', $data); $data = str_replace('\\', '&#092;', $data);
$ret = preg_replace("/&amp;#(\d*?);/", "&#\\1;", $data); $ret = preg_replace("/&amp;#(\d*?);/", "&#\\1;", $data);
} }
@@ -754,23 +770,33 @@ class e_parse
} }
// Truncate a string to a maximum length $len - append the string $more if it was truncated /**
// Uses current CHARSET - for utf-8, returns $len characters rather than $len bytes * Truncate a string to a maximum length $len - append the string $more if it was truncated
function text_truncate($text, $len = 200, $more = "[more]") * Uses current CHARSET - for utf-8, returns $len characters rather than $len bytes
*
* @param string $text - string to process
* @param integer $len - maximum length, in characters, of the returned text
* @param string $more - string which will be appended if the text was truncated
* @return string
*/
public function text_truncate($text, $len = 200, $more = "[more]")
{ {
// Always valid // Always valid
if(strlen($text) <= $len) if(strlen($text) <= $len)
return $text; return $text;
/* shouldn't be needed
if (strtolower(CHARSET) !== 'utf-8') if (strtolower(CHARSET) !== 'utf-8')
{ {
// Non-utf-8 - one byte per character - simple (unless there's an entity involved) // Non-utf-8 - one byte per character - simple (unless there's an entity involved)
$ret = substr($text,0,$len); $ret = substr($text,0,$len);
} }
else else
*/
{ {
// Its a utf-8 string here - don't know whether its longer than allowed length yet // It's a utf-8 string here - don't know whether it's longer than allowed length yet
preg_match('#^(?:[\x00-\x7F]|[\xC0-\xFF][\x80-\xBF]+){0,0}'. preg_match('#^(?:[\x00-\x7F]|[\xC0-\xFF][\x80-\xBF]+){0,0}'.
'((?:[\x00-\x7F]|[\xC0-\xFF][\x80-\xBF]+){0,'.$len.'})(.{0,1}).*#s',$text,$matches); '((?:[\x00-\x7F]|[\xC0-\xFF][\x80-\xBF]+){0,'.$len.'})(.{0,1}).*#s',
$text, $matches);
// return if utf-8 length is less than max as well // return if utf-8 length is less than max as well
if (empty($matches[2])) if (empty($matches[2]))
return $text; return $text;
@@ -787,7 +813,7 @@ class e_parse
} }
function textclean ($text, $wrap=100) function textclean ($text, $wrap = 100)
{ {
$text = str_replace("\n\n\n", "\n\n", $text); $text = str_replace("\n\n\n", "\n\n", $text);
$text = $this->htmlwrap($text, $wrap); $text = $this->htmlwrap($text, $wrap);
@@ -982,8 +1008,10 @@ class e_parse
// Not sure whether checks are necessary now we've reorganised // Not sure whether checks are necessary now we've reorganised
// if (!$matches[3]) $bbcode = str_replace($search, $replace, $matches[4]); // if (!$matches[3]) $bbcode = str_replace($search, $replace, $matches[4]);
// Because we're bypassing most of the initial parser processing, we should be able to just reverse the effects of toDB() and execute the code // Because we're bypassing most of the initial parser processing, we should be able to just reverse the effects of toDB() and execute the code
if (!$matches[3]) $bbcode = html_entity_decode($matches[4], ENT_QUOTES, CHARSET); if (!$matches[3])
if (DB_INF_SHOW) echo "PHP after decode: ".htmlentities($bbcode)."<br /><br />"; $bbcode = html_entity_decode($matches[4], ENT_QUOTES, 'UTF-8');
if (DB_INF_SHOW)
echo "PHP after decode: ".htmlentities($bbcode)."<br /><br />";
break; break;
case 'html' : case 'html' :
$proc_funcs = TRUE; $proc_funcs = TRUE;
@@ -1267,7 +1295,7 @@ class e_parse
// URLs posted without HTML access may have an &amp; in them. // URLs posted without HTML access may have an &amp; in them.
$text = str_replace('&amp;', '&', $text); $text = str_replace('&amp;', '&', $text);
// Xhtml compliance. // Xhtml compliance.
$text = htmlspecialchars($text, ENT_QUOTES, CHARSET); $text = htmlspecialchars($text, ENT_QUOTES, 'UTF-8');
if(!preg_match('/&#|\'|"|\(|\)|<|>/s', $text)) if(!preg_match('/&#|\'|"|\(|\)|<|>/s', $text))
{ {
$text = $this->replaceConstants($text); $text = $this->replaceConstants($text);