From 5662efc9366a36cfa1745f4b3d787101c3c9d60d Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Mon, 6 Mar 2017 22:54:54 -0800 Subject: [PATCH] Fix #78. Signed-off-by: Edward Z. Yang --- NEWS | 6 ++++++ library/HTMLPurifier/Lexer/DOMLex.php | 26 +++++++++++++++++++------- library/HTMLPurifier/Lexer/PH5P.php | 4 ++-- tests/HTMLPurifier/HTMLT/t78.htmlt | 7 +++++++ tests/HTMLPurifier/LexerTest.php | 14 +++++++++++--- 5 files changed, 45 insertions(+), 12 deletions(-) create mode 100644 tests/HTMLPurifier/HTMLT/t78.htmlt diff --git a/NEWS b/NEWS index f850e625..8ab039e3 100644 --- a/NEWS +++ b/NEWS @@ -22,6 +22,12 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier - We accidentally dropped certain Unicode characters if there was one or more invalid characters. This has been fixed, thanks to mpyw +- Fix for "Don't truncate upon encountering </div> when using DOMLex" + caused a regression with HTML 4.01 Strict parsing with libxml 2.9.1 + (and maybe later versions, but known OK with libxml 2.9.4). The + fix is to go about handling truncation a bit more cleverly so that + we can wrap with divs (sidestepping the bug) but slurping out the + rest of the text in case it ran off the end. (#78) # By default, when a link has a target attribute associated with it, we now also add rel="noopener" in order to prevent the new window from being able to overwrite diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php index b8181929..1406c506 100644 --- a/library/HTMLPurifier/Lexer/DOMLex.php +++ b/library/HTMLPurifier/Lexer/DOMLex.php @@ -72,12 +72,20 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer $doc->loadHTML($html); restore_error_handler(); + $body = $doc->getElementsByTagName('html')->item(0)-> // <html> + getElementsByTagName('body')->item(0); // <body> + + $div = $body->getElementsByTagName('div')->item(0); // <div>
$tokens = array(); - $this->tokenizeDOM( - $doc->getElementsByTagName('html')->item(0)-> // <html> - getElementsByTagName('body')->item(0), // <body> - $tokens - ); + $this->tokenizeDOM($div, $tokens); + // If the div has a sibling, that means we tripped across + // a premature </div> tag. So remove the div we parsed, + // and then tokenize the rest of body. We can't tokenize + // the sibling directly as we'll lose the tags in that case. + if ($div->nextSibling) { + $body->removeChild($div); + $this->tokenizeDOM($body, $tokens); + } return $tokens; } @@ -252,7 +260,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer * @param HTMLPurifier_Context $context * @return string */ - protected function wrapHTML($html, $config, $context) + protected function wrapHTML($html, $config, $context, $use_div = true) { $def = $config->getDefinition('HTML'); $ret = ''; @@ -271,7 +279,11 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer $ret .= ''; $ret .= ''; // No protection if $html contains a stray </div>! - $ret .= '' . $html . ''; + $ret .= ''; + if ($use_div) $ret .= '<div>'; + $ret .= $html; + if ($use_div) $ret .= '</div>'; + $ret .= ''; return $ret; } } diff --git a/library/HTMLPurifier/Lexer/PH5P.php b/library/HTMLPurifier/Lexer/PH5P.php index ff4fa218..39a677da 100644 --- a/library/HTMLPurifier/Lexer/PH5P.php +++ b/library/HTMLPurifier/Lexer/PH5P.php @@ -21,7 +21,7 @@ class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex public function tokenizeHTML($html, $config, $context) { $new_html = $this->normalize($html, $config, $context); - $new_html = $this->wrapHTML($new_html, $config, $context); + $new_html = $this->wrapHTML($new_html, $config, $context, false /* no div */); try { $parser = new HTML5($new_html); $doc = $parser->save(); @@ -34,7 +34,7 @@ class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex $tokens = array(); $this->tokenizeDOM( $doc->getElementsByTagName('html')->item(0)-> // <html> - getElementsByTagName('body')->item(0) // <body> + getElementsByTagName('body')->item(0) // <body> , $tokens ); diff --git a/tests/HTMLPurifier/HTMLT/t78.htmlt b/tests/HTMLPurifier/HTMLT/t78.htmlt new file mode 100644 index 00000000..adc3b532 --- /dev/null +++ b/tests/HTMLPurifier/HTMLT/t78.htmlt @@ -0,0 +1,7 @@ +--INI-- +HTML.Doctype = HTML 4.01 Strict +--HTML-- +Vetgedrukt Schuingedrukt Hou jij ook zo van vakjesdenken? +--EXPECT-- +Vetgedrukt Schuingedrukt Hou jij ook zo van vakjesdenken?
+--# vim: et sw=4 sts=4 diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php index 00e08097..e28dc9e9 100644 --- a/tests/HTMLPurifier/LexerTest.php +++ b/tests/HTMLPurifier/LexerTest.php @@ -814,13 +814,21 @@ div {} public function test_tokenizeHTML_prematureDivClose() { $this->assertTokenization( - '</div>dontdie', + '</div>dont<b>die</b>', array( new HTMLPurifier_Token_End('div'), - new HTMLPurifier_Token_Text('dontdie') + new HTMLPurifier_Token_Text('dont'), + new HTMLPurifier_Token_Start('b'), + new HTMLPurifier_Token_Text('die'), + new HTMLPurifier_Token_End('b'), ), array( - 'DOMLex' => $alt = array(new HTMLPurifier_Token_Text('dontdie')), + 'DOMLex' => $alt = array( + new HTMLPurifier_Token_Text('dont'), + new HTMLPurifier_Token_Start('b'), + new HTMLPurifier_Token_Text('die'), + new HTMLPurifier_Token_End('b') + ), 'PH5P' => $alt ) );