1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-08-06 22:26:31 +02:00

Revamp entity decoding to be more like HTML5.

See %Core.LegacyEntityDecoder for more details.

Signed-off-by: Edward Z. Yang <ezyang@cs.stanford.edu>
This commit is contained in:
Edward Z. Yang
2017-03-07 13:34:55 -08:00
parent 66bbae73a9
commit 7e11c271b9
10 changed files with 272 additions and 35 deletions

View File

@@ -77,14 +77,14 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
$div = $body->getElementsByTagName('div')->item(0); // <div>
$tokens = array();
$this->tokenizeDOM($div, $tokens);
$this->tokenizeDOM($div, $tokens, $config);
// If the div has a sibling, that means we tripped across
// a premature </div> tag. So remove the div we parsed,
// and then tokenize the rest of body. We can't tokenize
// the sibling directly as we'll lose the tags in that case.
if ($div->nextSibling) {
$body->removeChild($div);
$this->tokenizeDOM($body, $tokens);
$this->tokenizeDOM($body, $tokens, $config);
}
return $tokens;
}
@@ -96,7 +96,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
* @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
* @return HTMLPurifier_Token of node appended to previously passed tokens.
*/
protected function tokenizeDOM($node, &$tokens)
protected function tokenizeDOM($node, &$tokens, $config)
{
$level = 0;
$nodes = array($level => new HTMLPurifier_Queue(array($node)));
@@ -105,7 +105,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
while (!$nodes[$level]->isEmpty()) {
$node = $nodes[$level]->shift(); // FIFO
$collect = $level > 0 ? true : false;
$needEndingTag = $this->createStartNode($node, $tokens, $collect);
$needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);
if ($needEndingTag) {
$closingNodes[$level][] = $node;
}
@@ -135,7 +135,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
* @return bool if the token needs an endtoken
* @todo data and tagName properties don't seem to exist in DOMNode?
*/
protected function createStartNode($node, &$tokens, $collect)
protected function createStartNode($node, &$tokens, $collect, $config)
{
// intercept non element nodes. WE MUST catch all of them,
// but we're not getting the character reference nodes because
@@ -159,7 +159,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
}
}
}
$tokens[] = $this->factory->createText($this->parseData($data));
$tokens[] = $this->factory->createText($this->parseText($data, $config));
return false;
} elseif ($node->nodeType === XML_COMMENT_NODE) {
// this is code is only invoked for comments in script/style in versions