Fix DOM bug where default encoding for HTML docs is not UTF-8.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@252 48356398-32a2-884e-a903-53898d9a118a
2025-07-31 03:10:09 +02:00 · 2006-08-14 13:27:18 +00:00
parent ebf0da9b78
commit 299236f695
3 changed files with 45 additions and 4 deletions
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@@ -27,18 +27,23 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
    
    public function tokenizeHTML($string) {
        $doc = new DOMDocument();
-        
-        // preprocess string
-        $string = '<html><body><div>'.$string.'</div></body></html>';
+        $doc->encoding = 'UTF-8'; // technically does nothing, but comprehensive
        
        // replace and escape the CDATA sections, since parsing under HTML
        // mode won't get 'em.
        $string = $this->escapeCDATA($string);
        
+        // wrap the fragment in a full document whose meta tag declares UTF-8, since loadHTML() does not default to UTF-8
+        $string =
+        '<html><head>'.
+        '<meta http-equiv="Content-Type" content="text/html;'.
+            ' charset=utf-8" />'.
+        '</head><body><div>'.$string.'</div></body></html>';
+        
        @$doc->loadHTML($string); // mute all errors, handle it transparently
        return $this->tokenizeDOM(
            $doc->childNodes->item(1)-> // html
-                  childNodes->item(0)-> // body
+                  childNodes->item(1)-> // body
                  childNodes->item(0)   // div
            );
    }