Fix #78.

Signed-off-by: Edward Z. Yang <ezyang@cs.stanford.edu>
2025-07-31 03:10:09 +02:00 · 2017-03-06 22:54:54 -08:00
parent 353c96f156
commit 5662efc936
5 changed files with 45 additions and 12 deletions
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@@ -72,12 +72,20 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
        $doc->loadHTML($html);
        restore_error_handler();

+        $body = $doc->getElementsByTagName('html')->item(0)-> // <html>
+                      getElementsByTagName('body')->item(0);  // <body>
+
+        $div = $body->getElementsByTagName('div')->item(0); // <div>
        $tokens = array();
-        $this->tokenizeDOM(
-            $doc->getElementsByTagName('html')->item(0)-> // <html>
-            getElementsByTagName('body')->item(0), //   <body>
-            $tokens
-        );
+        $this->tokenizeDOM($div, $tokens);
+        // If the div has a sibling, that means we tripped across
+        // a premature </div> tag.  So remove the div we parsed,
+        // and then tokenize the rest of body.  We can't tokenize
+        // the sibling directly as we'll lose the tags in that case.
+        if ($div->nextSibling) {
+            $body->removeChild($div);
+            $this->tokenizeDOM($body, $tokens);
+        }
        return $tokens;
    }

@@ -252,7 +260,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
     * @param HTMLPurifier_Context $context
     * @return string
     */
-    protected function wrapHTML($html, $config, $context)
+    protected function wrapHTML($html, $config, $context, $use_div = true)
    {
        $def = $config->getDefinition('HTML');
        $ret = '';
@@ -271,7 +279,11 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
        $ret .= '<html><head>';
        $ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
        // No protection if $html contains a stray </div>!
-        $ret .= '</head><body>' . $html . '</body></html>';
+        $ret .= '</head><body>';
+        if ($use_div) $ret .= '<div>';
+        $ret .= $html;
+        if ($use_div) $ret .= '</div>';
+        $ret .= '</body></html>';
        return $ret;
    }
 }
--- a/library/HTMLPurifier/Lexer/PH5P.php
+++ b/library/HTMLPurifier/Lexer/PH5P.php
@@ -21,7 +21,7 @@ class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
    public function tokenizeHTML($html, $config, $context)
    {
        $new_html = $this->normalize($html, $config, $context);
-        $new_html = $this->wrapHTML($new_html, $config, $context);
+        $new_html = $this->wrapHTML($new_html, $config, $context, false /* no div */);
        try {
            $parser = new HTML5($new_html);
            $doc = $parser->save();
@@ -34,7 +34,7 @@ class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
        $tokens = array();
        $this->tokenizeDOM(
            $doc->getElementsByTagName('html')->item(0)-> // <html>
-                getElementsByTagName('body')->item(0) //   <body>
+                  getElementsByTagName('body')->item(0) //   <body>
            ,
            $tokens
        );