Malformed UTF-8 and non-SGML character detection and cleaning implemented

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@303 48356398-32a2-884e-a903-53898d9a118a
2025-08-06 14:16:32 +02:00 · 2006-08-19 17:53:59 +00:00
parent 53808ee34a
commit 973cc43b64
11 changed files with 131 additions and 58 deletions
--- a/library/HTMLPurifier/Lexer/PEARSax3.php
+++ b/library/HTMLPurifier/Lexer/PEARSax3.php
@@ -29,20 +29,21 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
     */
    var $tokens = array();
    
-    function tokenizeHTML($html, $config = null) {
+    function tokenizeHTML($string, $config = null) {
        if (!$config) $config = HTMLPurifier_Config::createDefault();
-        $html = $this->escapeCDATA($html);
+        $string = $this->escapeCDATA($string);
        if ($config->get('Core', 'AcceptFullDocuments')) {
-            $html = $this->extractBody($html);
+            $string = $this->extractBody($string);
        }
-        $html = $this->substituteNonSpecialEntities($html);
+        $string = $this->substituteNonSpecialEntities($string);
+        $string = $this->cleanUTF8($string);
        $parser=& new XML_HTMLSax3();
        $parser->set_object($this);
        $parser->set_element_handler('openHandler','closeHandler');
        $parser->set_data_handler('dataHandler');
        $parser->set_escape_handler('escapeHandler');
        $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
-        $parser->parse($html);
+        $parser->parse($string);
        $tokens = $this->tokens;
        $this->tokens = array();
        return $tokens;