diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php index 202b4c8a..1ba9675f 100644 --- a/library/HTMLPurifier/Lexer/DOMLex.php +++ b/library/HTMLPurifier/Lexer/DOMLex.php @@ -75,16 +75,15 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer protected function tokenizeDOM($node, &$tokens, $collect = false) { // recursive goodness! - // intercept non element nodes - - if ( isset($node->data) ) { - if ($node->nodeType === XML_TEXT_NODE || - $node->nodeType === XML_CDATA_SECTION_NODE) { - $tokens[] = $this->factory->createText($node->data); - } elseif ($node->nodeType === XML_COMMENT_NODE) { - $tokens[] = $this->factory->createComment($node->data); - } - // quite possibly, the object wasn't handled, that's fine + // intercept non element nodes. WE MUST catch all of them, + // but we're not getting the character reference nodes because + // those should have been preprocessed + if ($node->nodeType === XML_TEXT_NODE || + $node->nodeType === XML_CDATA_SECTION_NODE) { + $tokens[] = $this->factory->createText($node->data); + return; + } elseif ($node->nodeType === XML_COMMENT_NODE) { + $tokens[] = $this->factory->createComment($node->data); return; }