Remove a huge swath of duplicated function calls by factoring them into a normalize() function. Also make DirectLex's variable names consistent with the rest of the classes.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@340 48356398-32a2-884e-a903-53898d9a118a
2025-07-31 03:10:09 +02:00 · 2006-08-29 20:05:26 +00:00
parent 1de3088276
commit 89376a11e3
4 changed files with 46 additions and 51 deletions
--- a/library/HTMLPurifier/Lexer.php
+++ b/library/HTMLPurifier/Lexer.php
@@ -137,6 +137,31 @@ class HTMLPurifier_Lexer
        return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
    }
    
+    /**
+     * Prepares raw HTML for lexing: optionally isolates the document body,
+     * escapes CDATA sections, expands non-special entities, and finally
+     * cleans the result into a well-formed UTF-8 string.
+     * @param $html String of HTML to normalize
+     * @param $config Configuration object, queried for Core.AcceptFullDocuments
+     * @return Normalized HTML string
+     */
+    function normalize($html, $config) {
+        
+        // full documents are reduced to their body only when configured
+        $acceptsDocuments = $config->get('Core', 'AcceptFullDocuments');
+        $source = $acceptsDocuments ? $this->extractBody($html) : $html;
+        
+        // CDATA sections are escaped before any entity handling
+        $escaped = $this->escapeCDATA($source);
+        
+        // every entity except the big five gets expanded
+        $expanded = $this->_encoder->substituteNonSpecialEntities($escaped);
+        
+        // UTF-8 cleaning has to come last: the expanded entities sometimes
+        // represent characters that are not valid in an SGML context
+        return $this->_encoder->cleanUTF8($expanded);
+    }
+    
    /**
     * Takes a string of HTML (fragment or document) and returns the content
     */