From 6f25c39c3eb1d8e8becb19a515423d58f75a2e86 Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <edwardzyang@thewritingpot.com>
Date: Wed, 11 Jun 2008 19:01:22 +0000
Subject: [PATCH] [2.1.5] [MFH] Fix Shift_JIS bug.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1793 48356398-32a2-884e-a903-53898d9a118a
---
 NEWS                                          |  7 +++
 .../HTMLPurifier/AttrDef/CSS/FontFamily.php   |  6 ++-
 library/HTMLPurifier/Encoder.php              | 51 ++++++-------------
 tests/HTMLPurifier/EncoderTest.php            |  2 +
 4 files changed, 29 insertions(+), 37 deletions(-)

diff --git a/NEWS b/NEWS
index 91892389..153852ad 100644
--- a/NEWS
+++ b/NEWS
@@ -24,6 +24,13 @@ ERRATA
 - Disable percent height/width attributes for img
 - Fix stray backslashes in font-family; CSS Unicode character escapes are
   now properly resolved (although *only* in font-family).
+- Improve parseCDATA algorithm to take into account newline normalization
+- Account for browser confusion between Yen character and backslash in
+  Shift_JIS encoding. This fix generalizes to any other encoding which is not
+  a strict superset of printable ASCII.
+- Improved adherence to Unicode by checking for non-character codepoints.
+  Thanks Geoffrey Sneddon for reporting. This may result in degraded
+  performance for extremely large inputs.
 . Added HTMLPurifier_UnitConverter and HTMLPurifier_Length for convenient
   handling of CSS-style lengths. HTMLPurifier_AttrDef_CSS_Length now uses
   this class.
diff --git a/library/HTMLPurifier/AttrDef/CSS/FontFamily.php b/library/HTMLPurifier/AttrDef/CSS/FontFamily.php
index eef3c179..7418368a 100644
--- a/library/HTMLPurifier/AttrDef/CSS/FontFamily.php
+++ b/library/HTMLPurifier/AttrDef/CSS/FontFamily.php
@@ -22,7 +22,6 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
         // assume that no font names contain commas in them
         $fonts = explode(',', $string);
         $final = '';
-        $non_sgml = HTMLPurifier_Encoder::getNonSgmlCharacters();
         foreach($fonts as $font) {
             $font = trim($font);
             if ($font === '') continue;
@@ -53,8 +52,11 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
                                 if (!ctype_xdigit($font[$i])) break;
                                 $code .= $font[$i];
                             }
+                            // We have to be extremely careful when adding
+                            // new characters, to make sure we're not breaking
+                            // the encoding.
                             $char = HTMLPurifier_Encoder::unichr(hexdec($code));
-                            if (isset($non_sgml[$char])) continue;
+                            if (HTMLPurifier_Encoder::cleanUTF8($char) === '') continue;
                             $new_font .= $char;
                             if ($i < $c && trim($font[$i]) !== '') $i--;
                             continue;
diff --git a/library/HTMLPurifier/Encoder.php b/library/HTMLPurifier/Encoder.php
index 7f535229..4ec73606 100644
--- a/library/HTMLPurifier/Encoder.php
+++ b/library/HTMLPurifier/Encoder.php
@@ -68,24 +68,6 @@ class HTMLPurifier_Encoder
     function muteErrorHandler() {}
     
     /**
-     * Returns a lookup of UTF-8 character byte sequences that are non-SGML.
-     */
-    function getNonSgmlCharacters() {
-        static $nonSgmlCharacters;
-        if (empty($nonSgmlCharacters)) {
-            for ($i = 0; $i <= 31; $i++) {
-                // non-SGML ASCII chars
-                // save \r, \t and \n
-                if ($i == 9 || $i == 13 || $i == 10) continue;
-                $nonSgmlCharacters[chr($i)] = '';
-            }
-            for ($i = 127; $i <= 159; $i++) {
-                $nonSgmlCharacters[HTMLPurifier_Encoder::unichr($i)] = '';
-            }
-        }
-        return $nonSgmlCharacters;
-    }
-    
     /**
      * Cleans a UTF-8 string for well-formedness and SGML validity
      * 
@@ -114,24 +96,13 @@ class HTMLPurifier_Encoder
      */
     function cleanUTF8($str, $force_php = false) {
         
-        $non_sgml = HTMLPurifier_Encoder::getNonSgmlCharacters();
-        
-        static $iconv = null;
-        if ($iconv === null) $iconv = function_exists('iconv');
-        
         // UTF-8 validity is checked since PHP 4.3.5
         // This is an optimization: if the string is already valid UTF-8, no
-        // need to do iconv/php stuff. 99% of the time, this will be the case.
-        if (preg_match('/^.{1}/us', $str)) {
-            return strtr($str, $non_sgml);
-        }
-        
-        if ($iconv && !$force_php) {
-            // do the shortcut way
-            set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
-            $str = iconv('UTF-8', 'UTF-8//IGNORE', $str);
-            restore_error_handler();
-            return strtr($str, $non_sgml);
+        // need to do PHP stuff. 99% of the time, this will be the case.
+        // The regexp matches the XML char production, as well as excluding
+        // non-SGML codepoints U+007F to U+009F
+        if (preg_match('/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', $str)) {
+            return $str;
         }
         
         $mState = 0; // cached expected number of octets after the current octet
@@ -242,7 +213,17 @@ class HTMLPurifier_Encoder
                         ) {
                             
                         } elseif (0xFEFF != $mUcs4 && // omit BOM
-                            !($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML
+                            // check for valid Char unicode codepoints
+                            (
+                                0x9 == $mUcs4 ||
+                                0xA == $mUcs4 ||
+                                0xD == $mUcs4 ||
+                                (0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
+                                // 7F-9F is not strictly prohibited by XML,
+                                // but it is non-SGML, and thus we don't allow it
+                                (0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
+                                (0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
+                            )
                         ) {
                             $out .= $char;
                         }
diff --git a/tests/HTMLPurifier/EncoderTest.php b/tests/HTMLPurifier/EncoderTest.php
index 205fb8b6..fbae4ce8 100644
--- a/tests/HTMLPurifier/EncoderTest.php
+++ b/tests/HTMLPurifier/EncoderTest.php
@@ -27,6 +27,8 @@ class HTMLPurifier_EncoderTest extends HTMLPurifier_Harness
         $this->assertCleanUTF8("\xC2\x80", ''); // two byte invalid SGML
         $this->assertCleanUTF8("\xF3\xBF\xBF\xBF"); // valid four byte
         $this->assertCleanUTF8("\xDF\xFF", ''); // malformed UTF8
+        // invalid codepoints (UTF-8 encoded surrogate U+DC00)
+        $this->assertCleanUTF8("\xED\xB0\x80", '');
     }
     
     function test_convertToUTF8_noConvert() {