From 5662efc9366a36cfa1745f4b3d787101c3c9d60d Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang" <ezyang@cs.stanford.edu>
Date: Mon, 6 Mar 2017 22:54:54 -0800
Subject: [PATCH] Fix #78.

Signed-off-by: Edward Z. Yang <ezyang@cs.stanford.edu>
---
 NEWS                                  |  6 ++++++
 library/HTMLPurifier/Lexer/DOMLex.php | 26 +++++++++++++++++++-------
 library/HTMLPurifier/Lexer/PH5P.php   |  4 ++--
 tests/HTMLPurifier/HTMLT/t78.htmlt    |  7 +++++++
 tests/HTMLPurifier/LexerTest.php      | 14 +++++++++++---
 5 files changed, 45 insertions(+), 12 deletions(-)
 create mode 100644 tests/HTMLPurifier/HTMLT/t78.htmlt
diff --git a/NEWS b/NEWS
index f850e625..8ab039e3 100644
--- a/NEWS
+++ b/NEWS
@@ -22,6 +22,12 @@ NEWS ( CHANGELOG and HISTORY )                                     HTMLPurifier
 - We accidentally dropped certain Unicode characters if there was
   one or more invalid characters.  This has been fixed, thanks
   to mpyw <ryosuke_i_628@yahoo.co.jp>
+- Fix for "Don't truncate upon encountering </div> when using DOMLex"
+  caused a regression with HTML 4.01 Strict parsing with libxml 2.9.1
+  (and maybe later versions, but known OK with libxml 2.9.4).  The
+  fix is to handle truncation a bit more cleverly: we still wrap the
+  input in a div (sidestepping the bug), but slurp out the rest of
+  the text in case it ran off the end.  (#78)
 # By default, when a link has a target attribute associated
   with it, we now also add rel="noopener" in order to
   prevent the new window from being able to overwrite
diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php
index b8181929..1406c506 100644
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@@ -72,12 +72,20 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
         $doc->loadHTML($html);
         restore_error_handler();
 
+        $body = $doc->getElementsByTagName('html')->item(0)-> // <html>
+                      getElementsByTagName('body')->item(0);  // <body>
+
+        $div = $body->getElementsByTagName('div')->item(0); // <div>
         $tokens = array();
-        $this->tokenizeDOM(
-            $doc->getElementsByTagName('html')->item(0)-> // <html>
-            getElementsByTagName('body')->item(0), //   <body>
-            $tokens
-        );
+        $this->tokenizeDOM($div, $tokens);
+        // If the div has a sibling, that means we tripped across
+        // a premature </div> tag.  So remove the div we parsed,
+        // and then tokenize the rest of body.  We can't tokenize
+        // the sibling directly as we'll lose the tags in that case.
+        if ($div->nextSibling) {
+            $body->removeChild($div);
+            $this->tokenizeDOM($body, $tokens);
+        }
         return $tokens;
     }
 
@@ -252,7 +260,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
      * @param HTMLPurifier_Context $context
      * @return string
      */
-    protected function wrapHTML($html, $config, $context)
+    protected function wrapHTML($html, $config, $context, $use_div = true)
     {
         $def = $config->getDefinition('HTML');
         $ret = '';
@@ -271,7 +279,11 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
         $ret .= '<html><head>';
         $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
         // No protection if $html contains a stray </div>!
-        $ret .= '</head><body>' . $html . '</body></html>';
+        $ret .= '</head><body>';
+        if ($use_div) $ret .= '<div>';
+        $ret .= $html;
+        if ($use_div) $ret .= '</div>';
+        $ret .= '</body></html>';
         return $ret;
     }
 }
diff --git a/library/HTMLPurifier/Lexer/PH5P.php b/library/HTMLPurifier/Lexer/PH5P.php
index ff4fa218..39a677da 100644
--- a/library/HTMLPurifier/Lexer/PH5P.php
+++ b/library/HTMLPurifier/Lexer/PH5P.php
@@ -21,7 +21,7 @@ class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
     public function tokenizeHTML($html, $config, $context)
     {
         $new_html = $this->normalize($html, $config, $context);
-        $new_html = $this->wrapHTML($new_html, $config, $context);
+        $new_html = $this->wrapHTML($new_html, $config, $context, false /* no div */);
         try {
             $parser = new HTML5($new_html);
             $doc = $parser->save();
@@ -34,7 +34,7 @@ class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
         $tokens = array();
         $this->tokenizeDOM(
             $doc->getElementsByTagName('html')->item(0)-> // <html>
-                getElementsByTagName('body')->item(0) //   <body>
+                  getElementsByTagName('body')->item(0) //   <body>
             ,
             $tokens
         );
diff --git a/tests/HTMLPurifier/HTMLT/t78.htmlt b/tests/HTMLPurifier/HTMLT/t78.htmlt
new file mode 100644
index 00000000..adc3b532
--- /dev/null
+++ b/tests/HTMLPurifier/HTMLT/t78.htmlt
@@ -0,0 +1,7 @@
+--INI--
+HTML.Doctype = HTML 4.01 Strict
+--HTML--
+<b>Vetgedrukt</b> <i>Schuingedrukt</i> <span>Hou</span><iframe></iframe><script></script> jij ook zo van vakjesdenken?
+--EXPECT--
+<b>Vetgedrukt</b> <i>Schuingedrukt</i> <span>Hou</span> jij ook zo van vakjesdenken?
+--# vim: et sw=4 sts=4
diff --git a/tests/HTMLPurifier/LexerTest.php b/tests/HTMLPurifier/LexerTest.php
index 00e08097..e28dc9e9 100644
--- a/tests/HTMLPurifier/LexerTest.php
+++ b/tests/HTMLPurifier/LexerTest.php
@@ -814,13 +814,21 @@ div {}
     public function test_tokenizeHTML_prematureDivClose()
     {
         $this->assertTokenization(
-            '</div>dontdie',
+            '</div>dont<b>die</b>',
             array(
                 new HTMLPurifier_Token_End('div'),
-                new HTMLPurifier_Token_Text('dontdie')
+                new HTMLPurifier_Token_Text('dont'),
+                new HTMLPurifier_Token_Start('b'),
+                new HTMLPurifier_Token_Text('die'),
+                new HTMLPurifier_Token_End('b'),
             ),
             array(
-                'DOMLex' => $alt = array(new HTMLPurifier_Token_Text('dontdie')),
+                'DOMLex' => $alt = array(
+                    new HTMLPurifier_Token_Text('dont'),
+                    new HTMLPurifier_Token_Start('b'),
+                    new HTMLPurifier_Token_Text('die'),
+                    new HTMLPurifier_Token_End('b')
+                ),
                 'PH5P' => $alt
             )
         );