From 6f25c39c3eb1d8e8becb19a515423d58f75a2e86 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Wed, 11 Jun 2008 19:01:22 +0000 Subject: [PATCH] [2.1.5] [MFH] Fix Shift_JIS bug. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1793 48356398-32a2-884e-a903-53898d9a118a --- NEWS | 7 +++ .../HTMLPurifier/AttrDef/CSS/FontFamily.php | 6 ++- library/HTMLPurifier/Encoder.php | 51 ++++++------------- tests/HTMLPurifier/EncoderTest.php | 2 + 4 files changed, 29 insertions(+), 37 deletions(-) diff --git a/NEWS b/NEWS index 91892389..153852ad 100644 --- a/NEWS +++ b/NEWS @@ -24,6 +24,13 @@ ERRATA - Disable percent height/width attributes for img - Fix stray backslashes in font-family; CSS Unicode character escapes are now properly resolved (although *only* in font-family). +- Improve parseCDATA algorithm to take into account newline normalization +- Account for browser confusion between Yen character and backslash in + Shift_JIS encoding. This fix generalizes to any other encoding which is not + a strict superset of printable ASCII. +- Improved adherence to Unicode by checking for non-character codepoints. + Thanks Geoffrey Sneddon for reporting. This may result in degraded + performance for extremely large inputs. . Added HTMLPurifier_UnitConverter and HTMLPurifier_Length for convenient handling of CSS-style lengths. HTMLPurifier_AttrDef_CSS_Length now uses this class. 
diff --git a/library/HTMLPurifier/AttrDef/CSS/FontFamily.php b/library/HTMLPurifier/AttrDef/CSS/FontFamily.php index eef3c179..7418368a 100644 --- a/library/HTMLPurifier/AttrDef/CSS/FontFamily.php +++ b/library/HTMLPurifier/AttrDef/CSS/FontFamily.php @@ -22,7 +22,6 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef // assume that no font names contain commas in them $fonts = explode(',', $string); $final = ''; - $non_sgml = HTMLPurifier_Encoder::getNonSgmlCharacters(); foreach($fonts as $font) { $font = trim($font); if ($font === '') continue; @@ -53,8 +52,11 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef if (!ctype_xdigit($font[$i])) break; $code .= $font[$i]; } + // We have to be extremely careful when adding + // new characters, to make sure we're not breaking + // the encoding. $char = HTMLPurifier_Encoder::unichr(hexdec($code)); - if (isset($non_sgml[$char])) continue; + if (HTMLPurifier_Encoder::cleanUTF8($char) === '') continue; $new_font .= $char; if ($i < $c && trim($font[$i]) !== '') $i--; continue; diff --git a/library/HTMLPurifier/Encoder.php b/library/HTMLPurifier/Encoder.php index 7f535229..4ec73606 100644 --- a/library/HTMLPurifier/Encoder.php +++ b/library/HTMLPurifier/Encoder.php @@ -68,24 +68,6 @@ class HTMLPurifier_Encoder function muteErrorHandler() {} /** - * Returns a lookup of UTF-8 character byte sequences that are non-SGML. 
- */ - function getNonSgmlCharacters() { - static $nonSgmlCharacters; - if (empty($nonSgmlCharacters)) { - for ($i = 0; $i <= 31; $i++) { - // non-SGML ASCII chars - // save \r, \t and \n - if ($i == 9 || $i == 13 || $i == 10) continue; - $nonSgmlCharacters[chr($i)] = ''; - } - for ($i = 127; $i <= 159; $i++) { - $nonSgmlCharacters[HTMLPurifier_Encoder::unichr($i)] = ''; - } - } - return $nonSgmlCharacters; - } - /** * Cleans a UTF-8 string for well-formedness and SGML validity * @@ -114,24 +96,13 @@ class HTMLPurifier_Encoder */ function cleanUTF8($str, $force_php = false) { - $non_sgml = HTMLPurifier_Encoder::getNonSgmlCharacters(); - - static $iconv = null; - if ($iconv === null) $iconv = function_exists('iconv'); - // UTF-8 validity is checked since PHP 4.3.5 // This is an optimization: if the string is already valid UTF-8, no - // need to do iconv/php stuff. 99% of the time, this will be the case. - if (preg_match('/^.{1}/us', $str)) { - return strtr($str, $non_sgml); - } - - if ($iconv && !$force_php) { - // do the shortcut way - set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler')); - $str = iconv('UTF-8', 'UTF-8//IGNORE', $str); - restore_error_handler(); - return strtr($str, $non_sgml); + // need to do PHP stuff. 99% of the time, this will be the case. 
+ // The regexp matches the XML char production, as well as excluding + // non-SGML codepoints U+007F to U+009F + if (preg_match('/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', $str)) { + return $str; } $mState = 0; // cached expected number of octets after the current octet @@ -242,7 +213,17 @@ class HTMLPurifier_Encoder ) { } elseif (0xFEFF != $mUcs4 && // omit BOM - !($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML + // check for valid Char unicode codepoints + ( + 0x9 == $mUcs4 || + 0xA == $mUcs4 || + 0xD == $mUcs4 || + (0x20 <= $mUcs4 && 0x7E >= $mUcs4) || + // 7F-9F is not strictly prohibited by XML, + // but it is non-SGML, and thus we don't allow it + (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) || + (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4) + ) ) { $out .= $char; } diff --git a/tests/HTMLPurifier/EncoderTest.php b/tests/HTMLPurifier/EncoderTest.php index 205fb8b6..fbae4ce8 100644 --- a/tests/HTMLPurifier/EncoderTest.php +++ b/tests/HTMLPurifier/EncoderTest.php @@ -27,6 +27,8 @@ class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness $this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML $this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8 + // invalid codepoints + $this->assertCleanUTF8("\xED\xB0\x80", ''); } function test_convertToUTF8_noConvert() {