Fixed broken multibyte numeric entity conversion in Lexer::substituteNonSpecialEntities()

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@299 48356398-32a2-884e-a903-53898d9a118a
2025-10-15 14:14:24 +02:00 · 2006-08-18 17:49:33 +00:00
parent c393ef8a81
commit a33cd12f1a
3 changed files with 109 additions and 3 deletions
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@@ -172,21 +172,77 @@ class HTMLPurifier_Lexer
     * 
     * @warning Though this is public in order to let the callback happen,
     *          calling it directly is not recommended.
+     * @note Based on Feyd's function at
+     *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
+     *       which is in public domain.
+     * @note While we're going to do code point parsing anyway, a good
+     *       optimization would be to refuse to translate code points that
+     *       are non-SGML characters.  However, this could lead to duplication.
     * @param $matches  PCRE matches array, with 0 the entire match, and
     *                  either index 1, 2 or 3 set with a hex value, dec value,
     *                  or string (respectively).
     * @returns Replacement string.
     * @todo Implement string translations
     */
+    
+    // +----------+----------+----------+----------+
+    // | 33222222 | 22221111 | 111111   |          |
+    // | 10987654 | 32109876 | 54321098 | 76543210 | bit
+    // +----------+----------+----------+----------+
+    // |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
+    // |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
+    // |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
+    // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
+    // +----------+----------+----------+----------+
+    // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
+    // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes: 1114111 (0x0010FFFF)
+    // +----------+----------+----------+----------+ 
+    
    function nonSpecialEntityCallback($matches) {
        // replaces all but big five
        $entity = $matches[0];
        $is_num = (@$matches[0][1] === '#');
        if ($is_num) {
            $is_hex = (@$entity[2] === 'x');
-            $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
-            if (isset($this->_special_dec2str[$int]))  return $entity;
-            return chr($int);
+            $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
+            
+            // abort for special characters
+            if (isset($this->_special_dec2str[$code]))  return $entity;
+            
+            if($code > 1114111 or $code < 0 or
+              ($code >= 55296 and $code <= 57343) ) {
+                // bits are set outside the "valid" range as defined
+                // by UNICODE 4.1.0 
+                return '';
+            }
+            
+            $x = $y = $z = $w = 0; 
+            if ($code < 128) {
+                // regular ASCII character
+                $x = $code;
+            } else {
+                // set up bits for UTF-8
+                $x = ($code & 63) | 128;
+                if ($code < 2048) {
+                    $y = (($code & 2047) >> 6) | 192;
+                } else {
+                    $y = (($code & 4032) >> 6) | 128;
+                    if($code < 65536) {
+                        $z = (($code >> 12) & 15) | 224;
+                    } else {
+                        $z = (($code >> 12) & 63) | 128;
+                        $w = (($code >> 18) & 7)  | 240;
+                    }
+                } 
+            }
+            // set up the actual character
+            $ret = '';
+            if($w) $ret .= chr($w);
+            if($z) $ret .= chr($z);
+            if($y) $ret .= chr($y);
+            $ret .= chr($x); 
+            
+            return $ret;
        } else {
            if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
            if (!$this->_entity_lookup) {