Add CDATA support to the Lexers, and give PEARSax3 entity replacement.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@106 48356398-32a2-884e-a903-53898d9a118a
2025-08-08 07:06:46 +02:00 · 2006-07-23 23:04:34 +00:00
parent 5ce0ae7056
commit 609977f9f5
6 changed files with 165 additions and 65 deletions
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@@ -27,8 +27,14 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
    
    public function tokenizeHTML($string) {
        $doc = new DOMDocument();
+        
        // preprocess string
        $string = '<html><body><div>'.$string.'</div></body></html>';
+        
+        // replace and escape the CDATA sections, since parsing under HTML
+        // mode won't get 'em.
+        $string = $this->escapeCDATA($string);
+        
        @$doc->loadHTML($string); // mute all errors, handle it transparently
        return $this->tokenizeDOM(
            $doc->childNodes->item(1)-> // html
@@ -55,7 +61,8 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
        if ( !($node instanceof DOMElement) ) {
            if ($node instanceof DOMComment) {
                $tokens[] = new HTMLPurifier_Token_Comment($node->data);
-            } elseif ($node instanceof DOMText) {
+            } elseif ($node instanceof DOMText ||
+                      $node instanceof DOMCharacterData) {
                $tokens[] = new HTMLPurifier_Token_Text($node->data);
            }
            // quite possibly, the object wasn't handled, that's fine