Revamp entity decoding to be more like HTML5.

See %Core.LegacyEntityDecoder for more details. Signed-off-by: Edward Z. Yang <ezyang@cs.stanford.edu>
2025-07-31 03:10:09 +02:00 · 2017-03-07 13:34:55 -08:00
parent 66bbae73a9
commit 7e11c271b9
10 changed files with 272 additions and 35 deletions
--- a/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/library/HTMLPurifier/Lexer/DOMLex.php
@@ -77,14 +77,14 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer

        $div = $body->getElementsByTagName('div')->item(0); // <div>
        $tokens = array();
-        $this->tokenizeDOM($div, $tokens);
+        $this->tokenizeDOM($div, $tokens, $config);
        // If the div has a sibling, that means we tripped across
        // a premature </div> tag.  So remove the div we parsed,
        // and then tokenize the rest of body.  We can't tokenize
        // the sibling directly as we'll lose the tags in that case.
        if ($div->nextSibling) {
            $body->removeChild($div);
-            $this->tokenizeDOM($body, $tokens);
+            $this->tokenizeDOM($body, $tokens, $config);
        }
        return $tokens;
    }
@@ -96,7 +96,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
     * @param HTMLPurifier_Token[] $tokens   Array-list of already tokenized tokens.
     * @return HTMLPurifier_Token of node appended to previously passed tokens.
     */
-    protected function tokenizeDOM($node, &$tokens)
+    protected function tokenizeDOM($node, &$tokens, $config)
    {
        $level = 0;
        $nodes = array($level => new HTMLPurifier_Queue(array($node)));
@@ -105,7 +105,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
            while (!$nodes[$level]->isEmpty()) {
                $node = $nodes[$level]->shift(); // FIFO
                $collect = $level > 0 ? true : false;
-                $needEndingTag = $this->createStartNode($node, $tokens, $collect);
+                $needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);
                if ($needEndingTag) {
                    $closingNodes[$level][] = $node;
                }
@@ -135,7 +135,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
     * @return bool if the token needs an endtoken
     * @todo data and tagName properties don't seem to exist in DOMNode?
     */
-    protected function createStartNode($node, &$tokens, $collect)
+    protected function createStartNode($node, &$tokens, $collect, $config)
    {
        // intercept non element nodes. WE MUST catch all of them,
        // but we're not getting the character reference nodes because
@@ -159,7 +159,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
                    }
                }
            }
-            $tokens[] = $this->factory->createText($this->parseData($data));
+            $tokens[] = $this->factory->createText($this->parseText($data, $config));
            return false;
        } elseif ($node->nodeType === XML_COMMENT_NODE) {
            // this is code is only invoked for comments in script/style in versions
--- a/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/library/HTMLPurifier/Lexer/DirectLex.php
@@ -129,12 +129,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                // We are not inside tag and there still is another tag to parse
                $token = new
                HTMLPurifier_Token_Text(
-                    $this->parseData(
+                    $this->parseText(
                        substr(
                            $html,
                            $cursor,
                            $position_next_lt - $cursor
-                        )
+                        ), $config
                    )
                );
                if ($maintain_line_numbers) {
@@ -154,11 +154,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                // Create Text of rest of string
                $token = new
                HTMLPurifier_Token_Text(
-                    $this->parseData(
+                    $this->parseText(
                        substr(
                            $html,
                            $cursor
-                        )
+                        ), $config
                    )
                );
                if ($maintain_line_numbers) {
@@ -324,8 +324,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                $token = new
                HTMLPurifier_Token_Text(
                    '<' .
-                    $this->parseData(
-                        substr($html, $cursor)
+                    $this->parseText(
+                        substr($html, $cursor), $config
                    )
                );
                if ($maintain_line_numbers) {
@@ -429,7 +429,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
            if ($value === false) {
                $value = '';
            }
-            return array($key => $this->parseData($value));
+            return array($key => $this->parseAttr($value, $config));
        }

        // setup loop environment
@@ -518,7 +518,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                if ($value === false) {
                    $value = '';
                }
-                $array[$key] = $this->parseData($value);
+                $array[$key] = $this->parseAttr($value, $config);
                $cursor++;
            } else {
                // boolattr
--- a/library/HTMLPurifier/Lexer/PH5P.php
+++ b/library/HTMLPurifier/Lexer/PH5P.php
@@ -36,7 +36,7 @@ class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
            $doc->getElementsByTagName('html')->item(0)-> // <html>
                  getElementsByTagName('body')->item(0) //   <body>
            ,
-            $tokens
+            $tokens, $config
        );
        return $tokens;
    }
@@ -1515,6 +1515,7 @@ class HTML5
                // Consume the maximum number of characters possible, with the
                // consumed characters case-sensitively matching one of the
                // identifiers in the first column of the entities table.
+
                $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
                $len = strlen($e_name);

@@ -1547,7 +1548,7 @@ class HTML5

        // Return a character token for the character corresponding to the
        // entity name (as given by the second column of the entities table).
-        return html_entity_decode('&' . $entity . ';', ENT_QUOTES, 'UTF-8');
+        return html_entity_decode('&' . rtrim($entity, ';') . ';', ENT_QUOTES, 'UTF-8');
    }

    private function emitToken($token)