1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-07-30 19:00:10 +02:00

[3.1.1] Fix Shift_JIS encoding wonkiness with yen symbols and whatnot

- Improve parseCDATA algorithm to take into account newline normalization
- Fix regression in FontFamily validator. We now have a legit parser in place, albeit somewhat limited in use. Will be superseded by parser for entire grammar
- Convert EncoderTest to new format

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1769 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang
2008-05-25 05:40:20 +00:00
parent 10530d7f81
commit bb16d8eae5
9 changed files with 242 additions and 97 deletions

View File

@@ -51,16 +51,13 @@ abstract class HTMLPurifier_AttrDef
*
* @warning This processing is inconsistent with XML's whitespace handling
* as specified by section 3.3.3 and referenced XHTML 1.0 section
* 4.7. Compliant processing requires all line breaks normalized
* to "\n", so the fix is not as simple as fixing it in this
* function. Trim and whitespace collapsing are supposed to only
* occur in NMTOKENs. However, note that we are NOT necessarily
* parsing XML, thus, this behavior may still be correct.
* 4.7. However, note that we are NOT necessarily
* parsing XML, thus, this behavior may still be correct. We
* assume that newlines have been normalized.
*/
public function parseCDATA($string) {
$string = trim($string);
$string = str_replace("\n", '', $string);
$string = str_replace(array("\r", "\t"), ' ', $string);
$string = str_replace(array("\n", "\t", "\r"), ' ', $string);
return $string;
}

View File

@@ -16,10 +16,10 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
'cursive' => true
);
$string = $this->parseCDATA($string);
// assume that no font names contain commas in them
$fonts = explode(',', $string);
$final = '';
$non_sgml = HTMLPurifier_Encoder::getNonSgmlCharacters();
foreach($fonts as $font) {
$font = trim($font);
if ($font === '') continue;
@@ -35,11 +35,33 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
$quote = $font[0];
if ($font[$length - 1] !== $quote) continue;
$font = substr($font, 1, $length - 2);
// double-backslash processing is buggy. Namely, it doesn't allow
// fonts that contain an adjacent quote, backslash, or comma
$font = str_replace("\\$quote", $quote, $font); // de-escape quote
$font = str_replace("\\\n", '', $font); // de-escape newlines
$font = str_replace("\\\\", "\\", $font); // de-escape double backslashes
$new_font = '';
for ($i = 0, $c = strlen($font); $i < $c; $i++) {
if ($font[$i] === '\\') {
$i++;
if ($i >= $c) {
$new_font .= '\\';
break;
}
if (ctype_xdigit($font[$i])) {
$code = $font[$i];
for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
if (!ctype_xdigit($font[$i])) break;
$code .= $font[$i];
}
$char = HTMLPurifier_Encoder::unichr(hexdec($code));
if (isset($non_sgml[$char])) continue;
$new_font .= $char;
if ($i < $c && trim($font[$i]) !== '') $i--;
continue;
}
if ($font[$i] === "\n") continue;
}
$new_font .= $font[$i];
}
$font = $new_font;
}
// $font is a pure representation of the font name

View File

@@ -7,6 +7,8 @@
class HTMLPurifier_Encoder
{
private static $nonSgmlCharacters;
/**
* Constructor throws fatal error if you attempt to instantiate class
*/
@@ -19,6 +21,24 @@ class HTMLPurifier_Encoder
*/
private static function muteErrorHandler() {}
/**
* Returns a lookup of UTF-8 character byte sequences that are non-SGML.
*/
public static function getNonSgmlCharacters() {
if (empty(HTMLPurifier_Encoder::$nonSgmlCharacters)) {
for ($i = 0; $i <= 31; $i++) {
// non-SGML ASCII chars
// save \r, \t and \n
if ($i == 9 || $i == 13 || $i == 10) continue;
HTMLPurifier_Encoder::$nonSgmlCharacters[chr($i)] = '';
}
for ($i = 127; $i <= 159; $i++) {
HTMLPurifier_Encoder::$nonSgmlCharacters[HTMLPurifier_Encoder::unichr($i)] = '';
}
}
return HTMLPurifier_Encoder::$nonSgmlCharacters;
}
/**
* Cleans a UTF-8 string for well-formedness and SGML validity
*
@@ -46,18 +66,7 @@ class HTMLPurifier_Encoder
*/
public static function cleanUTF8($str, $force_php = false) {
static $non_sgml_chars = array();
if (empty($non_sgml_chars)) {
for ($i = 0; $i <= 31; $i++) {
// non-SGML ASCII chars
// save \r, \t and \n
if ($i == 9 || $i == 13 || $i == 10) continue;
$non_sgml_chars[chr($i)] = '';
}
for ($i = 127; $i <= 159; $i++) {
$non_sgml_chars[HTMLPurifier_Encoder::unichr($i)] = '';
}
}
$non_sgml = HTMLPurifier_Encoder::getNonSgmlCharacters();
static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv');
@@ -66,7 +75,7 @@ class HTMLPurifier_Encoder
// This is an optimization: if the string is already valid UTF-8, no
// need to do iconv/php stuff. 99% of the time, this will be the case.
if (preg_match('/^.{1}/us', $str)) {
return strtr($str, $non_sgml_chars);
return strtr($str, $non_sgml);
}
if ($iconv && !$force_php) {
@@ -74,7 +83,7 @@ class HTMLPurifier_Encoder
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
$str = iconv('UTF-8', 'UTF-8//IGNORE', $str);
restore_error_handler();
return strtr($str, $non_sgml_chars);
return strtr($str, $non_sgml);
}
$mState = 0; // cached expected number of octets after the current octet
@@ -276,17 +285,20 @@ class HTMLPurifier_Encoder
* Converts a string to UTF-8 based on configuration.
*/
public static function convertToUTF8($str, $config, $context) {
static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv');
$encoding = $config->get('Core', 'Encoding');
if ($encoding === 'utf-8') return $str;
static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv');
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
$str = iconv($encoding, 'utf-8//IGNORE', $str);
// If the string is bjorked by Shift_JIS or a similar encoding
// that doesn't support all of ASCII, convert the naughty
// characters to their true byte-wise ASCII/UTF-8 equivalents.
$str = strtr($str, HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding));
restore_error_handler();
return $str;
} elseif ($encoding === 'iso-8859-1') {
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
$str = utf8_encode($str);
restore_error_handler();
return $str;
@@ -300,20 +312,28 @@ class HTMLPurifier_Encoder
* characters being omitted.
*/
public static function convertFromUTF8($str, $config, $context) {
static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv');
$encoding = $config->get('Core', 'Encoding');
if ($encoding === 'utf-8') return $str;
if ($config->get('Core', 'EscapeNonASCIICharacters')) {
static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv');
if ($escape = $config->get('Core', 'EscapeNonASCIICharacters')) {
$str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
}
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
// Undo our previous fix in convertToUTF8, otherwise iconv will barf
$ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
if (!$escape && !empty($ascii_fix)) {
$clear_fix = array();
foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = '';
$str = strtr($str, $clear_fix);
}
$str = strtr($str, array_flip($ascii_fix));
// Normal stuff
$str = iconv('utf-8', $encoding . '//IGNORE', $str);
restore_error_handler();
return $str;
} elseif ($encoding === 'iso-8859-1') {
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
$str = utf8_decode($str);
restore_error_handler();
return $str;
@@ -368,6 +388,47 @@ class HTMLPurifier_Encoder
return $result;
}
/**
* This expensive function tests whether or not a given character
* encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
* fail this test, and require special processing. Variable width
* encodings shouldn't ever fail.
*
* @param string $encoding Encoding name to test, as per iconv format
* @param bool $bypass Whether or not to bypass the precompiled arrays.
* @return Array of UTF-8 characters to their corresponding ASCII,
* which can be used to "undo" any overzealous iconv action.
*/
public static function testEncodingSupportsASCII($encoding, $bypass = false) {
static $encodings = array();
if (!$bypass) {
if (isset($encodings[$encoding])) return $encodings[$encoding];
$lenc = strtolower($encoding);
switch ($lenc) {
case 'shift_jis':
return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
case 'johab':
return array("\xE2\x82\xA9" => '\\');
}
if (strpos($lenc, 'iso-8859-') === 0) return array();
}
$ret = array();
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
if (iconv('UTF-8', $encoding, 'a') === false) return false;
for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
$c = chr($i);
if (iconv('UTF-8', "$encoding//IGNORE", $c) === '') {
// Reverse engineer: what's the UTF-8 equiv of this byte
// sequence? This assumes that there's no variable width
// encoding that doesn't support ASCII.
$ret[iconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
}
}
restore_error_handler();
$encodings[$encoding] = $ret;
return $ret;
}
}