More refactoring: bundling charset and entity stuff together makes little sense, so new HTMLPurifier/EntityParser.php.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@341 48356398-32a2-884e-a903-53898d9a118a
2025-10-16 22:46:06 +02:00 · 2006-08-30 02:21:39 +00:00
parent 89376a11e3
commit 0ac97774d4
7 changed files with 309 additions and 259 deletions
--- a/library/HTMLPurifier/Encoder.php
+++ b/library/HTMLPurifier/Encoder.php
@@ -3,164 +3,17 @@
 require_once 'HTMLPurifier/EntityLookup.php';

 /**
- * An HTML and UTF-8 specific encoder that cleans, unentity-izes and transforms.
+ * A UTF-8 specific character encoder that handles cleaning and transforming.
 */
 class HTMLPurifier_Encoder
 {
    
-    var $_entity_lookup;
-    
-    /**
-     * Callback regex string for parsing entities.
-     * @protected
-     */                             
-    var $_substituteEntitiesRegex =
-'/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z]+));?/';
-//     1. hex             2. dec      3. string
-    
-    
-    /**
-     * Decimal to parsed string conversion table for special entities.
-     * @protected
-     */
-    var $_special_dec2str =
-            array(
-                    34 => '"',
-                    38 => '&',
-                    39 => "'",
-                    60 => '<',
-                    62 => '>'
-            );
-    
-    /**
-     * Stripped entity names to decimal conversion table for special entities.
-     * @protected
-     */
-    var $_special_ent2dec =
-            array(
-                    'quot' => 34,
-                    'amp'  => 38,
-                    'lt'   => 60,
-                    'gt'   => 62
-            );
-    
-    /**
-     * Substitutes non-special entities with their parsed equivalents. Since
-     * running this whenever you have parsed character is t3h 5uck, we run
-     * it before everything else.
-     * 
-     * @protected
-     * @param $string String to have non-special entities parsed.
-     * @returns Parsed string.
-     */
-    function substituteNonSpecialEntities($string) {
-        // it will try to detect missing semicolons, but don't rely on it
-        return preg_replace_callback(
-            $this->_substituteEntitiesRegex,
-            array($this, 'nonSpecialEntityCallback'),
-            $string
-            );
-    }
-    
-    /**
-     * Callback function for substituteNonSpecialEntities() that does the work.
-     * 
-     * @warning Though this is public in order to let the callback happen,
-     *          calling it directly is not recommended.
-     * @note Based on Feyd's function at
-     *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
-     *       which is in public domain.
-     * @note While we're going to do code point parsing anyway, a good
-     *       optimization would be to refuse to translate code points that
-     *       are non-SGML characters.  However, this could lead to duplication.
-     * @param $matches  PCRE matches array, with 0 the entire match, and
-     *                  either index 1, 2 or 3 set with a hex value, dec value,
-     *                  or string (respectively).
-     * @returns Replacement string.
-     * @todo Implement string translations
-     */
-    
-    // +----------+----------+----------+----------+
-    // | 33222222 | 22221111 | 111111   |          |
-    // | 10987654 | 32109876 | 54321098 | 76543210 | bit
-    // +----------+----------+----------+----------+
-    // |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
-    // |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
-    // |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
-    // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
-    // +----------+----------+----------+----------+
-    // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
-    // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
-    // +----------+----------+----------+----------+ 
-    
-    function nonSpecialEntityCallback($matches) {
-        // replaces all but big five
-        $entity = $matches[0];
-        $is_num = (@$matches[0][1] === '#');
-        if ($is_num) {
-            $is_hex = (@$entity[2] === 'x');
-            $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
-            
-            // abort for special characters
-            if (isset($this->_special_dec2str[$code]))  return $entity;
-            
-            if($code > 1114111 or $code < 0 or
-              ($code >= 55296 and $code <= 57343) ) {
-                // bits are set outside the "valid" range as defined
-                // by UNICODE 4.1.0 
-                return '';
-            }
-            
-            $x = $y = $z = $w = 0; 
-            if ($code < 128) {
-                // regular ASCII character
-                $x = $code;
-            } else {
-                // set up bits for UTF-8
-                $x = ($code & 63) | 128;
-                if ($code < 2048) {
-                    $y = (($code & 2047) >> 6) | 192;
-                } else {
-                    $y = (($code & 4032) >> 6) | 128;
-                    if($code < 65536) {
-                        $z = (($code >> 12) & 15) | 224;
-                    } else {
-                        $z = (($code >> 12) & 63) | 128;
-                        $w = (($code >> 18) & 7)  | 240;
-                    }
-                } 
-            }
-            // set up the actual character
-            $ret = '';
-            if($w) $ret .= chr($w);
-            if($z) $ret .= chr($z);
-            if($y) $ret .= chr($y);
-            $ret .= chr($x); 
-            
-            return $ret;
-        } else {
-            if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
-            if (!$this->_entity_lookup) {
-                require_once 'HTMLPurifier/EntityLookup.php';
-                $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
-            }
-            if (isset($this->_entity_lookup->table[$matches[3]])) {
-                return $this->_entity_lookup->table[$matches[3]];
-            } else {
-                return $entity;
-            }
-        }
-    }
-    
    /**
     * Cleans a UTF-8 string for well-formedness and SGML validity
     * 
     * It will parse according to UTF-8 and return a valid UTF8 string, with
     * non-SGML codepoints excluded.
     * 
-     * @warning This function can find a lot of use, so we may be moving
-     *          it to a dedicated class.
-     * 
     * @note Just for reference, the non-SGML code points are 0 to 31 and
     *       127 to 159, inclusive.  However, we allow code points 9, 10
     *       and 13, which are the tab, line feed and carriage return
@@ -171,7 +24,8 @@ class HTMLPurifier_Encoder
     *       implemented with iconv using 'UTF-8//IGNORE', mbstring, or
     *       even the PCRE modifier 'u', these do not allow us to strip
     *       control characters or disallowed code points, and the latter
-     *       does not allow invalid UTF8 characters to be ignored.
+     *       does not allow invalid UTF-8 characters to be ignored.  Once
+     *       PHP 6 appears all our problems magically disappear.
     * 
     * @note Decomposing the string into Unicode code points is necessary
     *       because SGML disallows the use of specific code points, not
@@ -181,7 +35,16 @@ class HTMLPurifier_Encoder
     * 
     * @note Code adapted from utf8ToUnicode by Henri Sivonen and
     *       hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
-     *       LGPL license.  Notes on what changed are inside.
+     *       LGPL license.  Notes on what changed are inside, but in general,
+     *       the original code transformed UTF-8 text into an array of integer
+     *       Unicode codepoints. Understandably, transforming that back to
+     *       a string would be somewhat expensive, so the function was modded to
+     *       directly operate on the string.  However, this discourages code
+     *       reuse, and the logic enumerated here would be useful for any
+     *       function that needs to be able to understand UTF-8 characters.
+     *       As of right now, only smart lossless character encoding converters
+     *       would need that, and I'm probably not going to implement them.
+     *       Once again, PHP 6 should solve all our problems.
     */
    function cleanUTF8($str) {
        $mState = 0; // cached expected number of octets after the current octet
@@ -316,50 +179,6 @@ class HTMLPurifier_Encoder
        return $out;
    }
    
-    /**
-     * Substitutes only special entities with their parsed equivalents.
-     * 
-     * @notice We try to avoid calling this function because otherwise, it
-     * would have to be called a lot (for every parsed section).
-     * 
-     * @protected
-     * @param $string String to have non-special entities parsed.
-     * @returns Parsed string.
-     */
-    function substituteSpecialEntities($string) {
-        return preg_replace_callback(
-            $this->_substituteEntitiesRegex,
-            array('HTMLPurifier_Encoder', 'specialEntityCallback'),
-            $string);
-    }
-    
-    /**
-     * Callback function for substituteSpecialEntities() that does the work.
-     * 
-     * This callback has same syntax as nonSpecialEntityCallback().
-     * 
-     * @warning Though this is public in order to let the callback happen,
-     *          calling it directly is not recommended.
-     * @param $matches  PCRE-style matches array, with 0 the entire match, and
-     *                  either index 1, 2 or 3 set with a hex value, dec value,
-     *                  or string (respectively).
-     * @returns Replacement string.
-     */
-    function specialEntityCallback($matches) {
-        $entity = $matches[0];
-        $is_num = (@$matches[0][1] === '#');
-        if ($is_num) {
-            $is_hex = (@$entity[2] === 'x');
-            $int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
-            return isset($this->_special_dec2str[$int]) ?
-                $this->_special_dec2str[$int] :
-                $entity;
-        } else {
-            return isset($this->_special_ent2dec[$matches[3]]) ?
-                $this->_special_ent2dec[$matches[3]] :
-                $entity;
-        }
-    }
    
 }

--- a/library/HTMLPurifier/EntityParser.php
+++ b/library/HTMLPurifier/EntityParser.php
@@ -0,0 +1,212 @@
+<?php
+
+require_once 'HTMLPurifier/EntityLookup.php';
+
+/**
+ * Handles referencing and dereferencing character entities.
+ */
+class HTMLPurifier_EntityParser
+{
+    
+    /**
+     * Reference to entity lookup table.
+     * @protected
+     */
+    var $_entity_lookup;
+    
+    /**
+     * Callback regex string for parsing entities.
+     * @protected
+     */                             
+    var $_substituteEntitiesRegex =
+'/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z]+));?/';
+//     1. hex             2. dec      3. string
+    
+    
+    /**
+     * Decimal to parsed string conversion table for special entities.
+     * @protected
+     */
+    var $_special_dec2str =
+            array(
+                    34 => '"',
+                    38 => '&',
+                    39 => "'",
+                    60 => '<',
+                    62 => '>'
+            );
+    
+    /**
+     * Stripped entity names to decimal conversion table for special entities.
+     * @protected
+     */
+    var $_special_ent2dec =
+            array(
+                    'quot' => 34,
+                    'amp'  => 38,
+                    'lt'   => 60,
+                    'gt'   => 62
+            );
+    
+    /**
+     * Substitutes non-special entities with their parsed equivalents. Since
+     * running this every time character data is parsed would be wasteful,
+     * we run it before everything else.
+     * 
+     * @protected
+     * @param $string String to have non-special entities parsed.
+     * @returns Parsed string.
+     */
+    function substituteNonSpecialEntities($string) {
+        // The pattern makes the trailing semicolon optional, so entities
+        // missing one are still matched; callers should not depend on that.
+        $pattern  = $this->_substituteEntitiesRegex;
+        // Each match is handed to nonSpecialEntityCallback() on this object.
+        $callback = array($this, 'nonSpecialEntityCallback');
+        return preg_replace_callback($pattern, $callback, $string);
+    }
+    
+    /**
+     * Callback function for substituteNonSpecialEntities() that does the work.
+     * 
+     * @warning Though this is public in order to let the callback happen,
+     *          calling it directly is not recommended.
+     * @note Based on Feyd's function at
+     *       <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
+     *       which is in public domain.
+     * @note While we're going to do code point parsing anyway, a good
+     *       optimization would be to refuse to translate code points that
+     *       are non-SGML characters.  However, this could lead to duplication.
+     * @note This function is heavily intimate with the inner workings of
+     *       UTF-8 and would also be well suited in the Encoder class (or at
+     *       least deferring some processing to it).  This is also very
+     *       similar to the unichr function in
+     *       maintenance/generate-entity-file.php (although this is superior,
+     *       due to its sanity checks).
+     * @param $matches  PCRE matches array, with 0 the entire match, and
+     *                  either index 1, 2 or 3 set with a hex value, dec value,
+     *                  or string (respectively).
+     * @returns Replacement string.
+     * @todo Implement string translations
+     */
+    
+    // +----------+----------+----------+----------+
+    // | 33222222 | 22221111 | 111111   |          |
+    // | 10987654 | 32109876 | 54321098 | 76543210 | bit
+    // +----------+----------+----------+----------+
+    // |          |          |          | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
+    // |          |          | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
+    // |          | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
+    // | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
+    // +----------+----------+----------+----------+
+    // | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
+    // | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
+    // +----------+----------+----------+----------+ 
+    
+    function nonSpecialEntityCallback($matches) {
+        // expands one matched entity; special entities and unknown names stay as-is
+        $entity = $matches[0]; // the complete matched text
+        $is_num = (@$matches[0][1] === '#'); // '#' right after '&' means numeric
+        if ($is_num) {
+            $is_hex = (@$entity[2] === 'x'); // 'x' after the '#' means hexadecimal
+            $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2]; // group 1 = hex, 2 = dec
+            
+            // special characters keep their entity form: hand the match back untouched
+            if (isset($this->_special_dec2str[$code]))  return $entity;
+            
+            if($code > 1114111 or $code < 0 or
+              ($code >= 55296 and $code <= 57343) ) {
+                // above 0x10FFFF, negative, or in the surrogate range 0xD800..0xDFFF:
+                // not a valid scalar value (per Unicode 4.1.0), so the entity is dropped
+                return '';
+            }
+            
+            $x = $y = $z = $w = 0; // output bytes, $w first and $x last; 0 = unused
+            if ($code < 128) {
+                // regular ASCII character: a single byte equal to the code point
+                $x = $code;
+            } else {
+                // multi-byte UTF-8: the last byte always carries the low 6 bits
+                $x = ($code & 63) | 128;
+                if ($code < 2048) {
+                    $y = (($code & 2047) >> 6) | 192; // 2 bytes: lead byte 110yyyyy
+                } else {
+                    $y = (($code & 4032) >> 6) | 128; // bits 6-11 as a 10yyyyyy byte
+                    if($code < 65536) {
+                        $z = (($code >> 12) & 15) | 224; // 3 bytes: lead byte 1110zzzz
+                    } else {
+                        $z = (($code >> 12) & 63) | 128; // bits 12-17 as a 10wwzzzz byte
+                        $w = (($code >> 18) & 7)  | 240; // 4 bytes: lead byte 11110www
+                    }
+                } 
+            }
+            // build the string lead byte first, skipping the slots left at 0
+            $ret = '';
+            if($w) $ret .= chr($w);
+            if($z) $ret .= chr($z);
+            if($y) $ret .= chr($y);
+            $ret .= chr($x); // always emitted, even when it is 0
+            
+            return $ret;
+        } else {
+            if (isset($this->_special_ent2dec[$matches[3]])) return $entity; // special name: keep
+            if (!$this->_entity_lookup) { // lookup object is fetched on first use only
+                require_once 'HTMLPurifier/EntityLookup.php';
+                $this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
+            }
+            if (isset($this->_entity_lookup->table[$matches[3]])) {
+                return $this->_entity_lookup->table[$matches[3]]; // replacement stored for this name
+            } else {
+                return $entity; // name not in the table: leave the text untouched
+            }
+        }
+    }
+    
+    /**
+     * Substitutes only special entities with their parsed equivalents.
+     * 
+     * @notice We try to avoid calling this function because otherwise, it
+     * would have to be called a lot (for every parsed section).
+     * 
+     * @protected
+     * @param $string String to have special entities parsed.
+     * @returns Parsed string.
+     */
+    function substituteSpecialEntities($string) {
+        // Every entity-shaped match is passed to specialEntityCallback().
+        $handler = array($this, 'specialEntityCallback');
+        $regex   = $this->_substituteEntitiesRegex;
+        return preg_replace_callback($regex, $handler, $string);
+    }
+    
+    /**
+     * Callback function for substituteSpecialEntities() that does the work.
+     * 
+     * This callback has same syntax as nonSpecialEntityCallback().
+     * 
+     * @warning Though this is public in order to let the callback happen,
+     *          calling it directly is not recommended.
+     * @param $matches  PCRE-style matches array, with 0 the entire match, and
+     *                  either index 1, 2 or 3 set with a hex value, dec value,
+     *                  or string (respectively).
+     * @returns Replacement string.
+     */
+    function specialEntityCallback($matches) {
+        $entity = $matches[0]; // full matched text, returned as-is when nothing applies
+        if (isset($entity[1]) && $entity[1] === '#') {
+            // numeric form: group 1 is the hex value, group 2 the decimal one
+            $is_hex = (isset($entity[2]) && $entity[2] === 'x');
+            $code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
+        } elseif (isset($this->_special_ent2dec[$matches[3]])) {
+            // named form: map the name to its code point so the character
+            // itself is returned below, not the bare number (the old bug)
+            $code = $this->_special_ent2dec[$matches[3]];
+        } else {
+            return $entity; // not a special entity name: leave untouched
+        }
+        return isset($this->_special_dec2str[$code]) ? $this->_special_dec2str[$code] : $entity;
+    }
+    
+}
+
+?>
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@@ -2,6 +2,7 @@

 require_once 'HTMLPurifier/Token.php';
 require_once 'HTMLPurifier/Encoder.php';
+require_once 'HTMLPurifier/EntityParser.php';

 HTMLPurifier_ConfigDef::define(
    'Core', 'AcceptFullDocuments', true, 'bool',
@@ -57,6 +58,7 @@ class HTMLPurifier_Lexer
    
    function HTMLPurifier_Lexer() {
        $this->_encoder = new HTMLPurifier_Encoder();
+        $this->_entity_parser = new HTMLPurifier_EntityParser();
    }
    
    var $_encoder;
@@ -152,7 +154,7 @@ class HTMLPurifier_Lexer
        $html = $this->escapeCDATA($html);
        
        // expand entities that aren't the big five
-        $html = $this->_encoder->substituteNonSpecialEntities($html);
+        $html = $this->_entity_parser->substituteNonSpecialEntities($html);
        
        // clean into wellformed UTF-8 string for an SGML context: this has
        // to be done after entity expansion because the entities sometimes
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -66,7 +66,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
        if ($num_amp_2 <= $num_esc_amp) return $string;
        
        // hmm... now we have some uncommon entities. Use the callback.
-        $string = $this->_encoder->substituteSpecialEntities($string);
+        $string = $this->_entity_parser->substituteSpecialEntities($string);
        return $string;
    }