diff --git a/library/HTMLPurifier/Lexer.php b/library/HTMLPurifier/Lexer.php index 031e8e3d..0da29766 100644 --- a/library/HTMLPurifier/Lexer.php +++ b/library/HTMLPurifier/Lexer.php @@ -137,6 +137,31 @@ class HTMLPurifier_Lexer return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8'); } + /** + * Takes a piece of HTML and normalizes it by converting entities, fixing + * encoding, extracting bits, and other good stuff. + */ + function normalize($html, $config) { + + // extract body from document if applicable + if ($config->get('Core', 'AcceptFullDocuments')) { + $html = $this->extractBody($html); + } + + // escape CDATA + $html = $this->escapeCDATA($html); + + // expand entities that aren't the big five + $html = $this->_encoder->substituteNonSpecialEntities($html); + + // clean into wellformed UTF-8 string for an SGML context: this has + // to be done after entity expansion because the entities sometimes + // represent non-SGML characters (horror, horror!) + $html = $this->_encoder->cleanUTF8($html); + + return $html; + } + /** * Takes a string of HTML (fragment or document) and returns the content */ diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php index e408fa84..202b4c8a 100644 --- a/library/HTMLPurifier/Lexer/DOMLex.php +++ b/library/HTMLPurifier/Lexer/DOMLex.php @@ -37,24 +37,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer public function tokenizeHTML($string, $config = null) { if (!$config) $config = HTMLPurifier_Config::createDefault(); - if ($config->get('Core', 'AcceptFullDocuments')) { - $string = $this->extractBody($string); - } - - $doc = new DOMDocument(); - $doc->encoding = 'UTF-8'; // technically does nothing, but whatever - - // replace and escape the CDATA sections, since parsing under HTML - // mode won't get 'em. - $string = $this->escapeCDATA($string); - - // substitute non-special entities. While DOM is perfectly capable - // of doing this, we need to get at the UTF-8 characters in - // cleanUTF8 - $string = $this->_encoder->substituteNonSpecialEntities($string); - - // clean it into well-formed UTF-8 string - $string = $this->_encoder->cleanUTF8($string); + $string = $this->normalize($string, $config); // preprocess string, essential for UTF-8 $string = @@ -66,6 +49,8 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer ' charset=utf-8" />'. '
'.$string.'
'; + $doc = new DOMDocument(); + $doc->encoding = 'UTF-8'; // technically does nothing, but whatever @$doc->loadHTML($string); // mute all errors, handle it transparently $tokens = array(); diff --git a/library/HTMLPurifier/Lexer/DirectLex.php b/library/HTMLPurifier/Lexer/DirectLex.php index 6951c491..7e749581 100644 --- a/library/HTMLPurifier/Lexer/DirectLex.php +++ b/library/HTMLPurifier/Lexer/DirectLex.php @@ -76,31 +76,16 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer */ var $_whitespace = "\x20\x09\x0D\x0A"; - function tokenizeHTML($string, $config = null) { + function tokenizeHTML($html, $config = null) { if (!$config) $config = HTMLPurifier_Config::createDefault(); - // some quick checking (if empty, return empty) - $string = @ (string) $string; - if ($string == '') return array(); - - if ($config->get('Core', 'AcceptFullDocuments')) { - $string = $this->extractBody($string); - } + $html = $this->normalize($html, $config); $cursor = 0; // our location in the text $inside_tag = false; // whether or not we're parsing the inside of a tag $array = array(); // result array - // escape CDATA - $string = $this->escapeCDATA($string); - - // expand entities THAT AREN'T THE BIG FIVE - $string = $this->_encoder->substituteNonSpecialEntities($string); - - // clean it into wellformed UTF-8 string - $string = $this->_encoder->cleanUTF8($string); - // infinite loop protection // has to be pretty big, since html docs can be big // we're allow two hundred thousand tags... more than enough? @@ -111,8 +96,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer // infinite loop protection if (++$loops > 200000) return array(); - $position_next_lt = strpos($string, '<', $cursor); - $position_next_gt = strpos($string, '>', $cursor); + $position_next_lt = strpos($html, '<', $cursor); + $position_next_gt = strpos($html, '>', $cursor); // triggers on "asdf" but not "asdf " if ($position_next_lt === $cursor) { @@ -126,7 +111,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer HTMLPurifier_Token_Text( $this->parseData( substr( - $string, $cursor, $position_next_lt - $cursor + $html, $cursor, $position_next_lt - $cursor ) ) ); @@ -136,13 +121,13 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer } elseif (!$inside_tag) { // We are not inside tag but there are no more tags // If we're already at the end, break - if ($cursor === strlen($string)) break; + if ($cursor === strlen($html)) break; // Create Text of rest of string $array[] = new HTMLPurifier_Token_Text( $this->parseData( substr( - $string, $cursor + $html, $cursor ) ) ); @@ -151,7 +136,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer // We are in tag and it is well formed // Grab the internals of the tag $strlen_segment = $position_next_gt - $cursor; - $segment = substr($string, $cursor, $strlen_segment); + $segment = substr($html, $cursor, $strlen_segment); // Check if it's a comment if ( @@ -232,7 +217,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer HTMLPurifier_Token_Text( '<' . $this->parseData( - substr($string, $cursor) + substr($html, $cursor) ) ); break; diff --git a/library/HTMLPurifier/Lexer/PEARSax3.php b/library/HTMLPurifier/Lexer/PEARSax3.php index c042d2f9..d2d90a12 100644 --- a/library/HTMLPurifier/Lexer/PEARSax3.php +++ b/library/HTMLPurifier/Lexer/PEARSax3.php @@ -30,23 +30,23 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer var $tokens = array(); function tokenizeHTML($string, $config = null) { + + $this->tokens = array(); + if (!$config) $config = HTMLPurifier_Config::createDefault(); - $string = $this->escapeCDATA($string); - if ($config->get('Core', 'AcceptFullDocuments')) { - $string = $this->extractBody($string); - } - $string = $this->_encoder->substituteNonSpecialEntities($string); - $string = $this->_encoder->cleanUTF8($string); + $string = $this->normalize($string, $config); + $parser=& new XML_HTMLSax3(); $parser->set_object($this); $parser->set_element_handler('openHandler','closeHandler'); $parser->set_data_handler('dataHandler'); $parser->set_escape_handler('escapeHandler'); $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1); + $parser->parse($string); - $tokens = $this->tokens; - $this->tokens = array(); - return $tokens; + + return $this->tokens; + } /**