1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-07-31 03:10:09 +02:00

Revamp entity decoding to be more like HTML5.

See %Core.LegacyEntityDecoder for more details.

Signed-off-by: Edward Z. Yang <ezyang@cs.stanford.edu>
This commit is contained in:
Edward Z. Yang
2017-03-07 13:34:55 -08:00
parent 66bbae73a9
commit 7e11c271b9
10 changed files with 272 additions and 35 deletions

View File

@@ -77,14 +77,14 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
$div = $body->getElementsByTagName('div')->item(0); // <div>
$tokens = array();
$this->tokenizeDOM($div, $tokens);
$this->tokenizeDOM($div, $tokens, $config);
// If the div has a sibling, that means we tripped across
// a premature </div> tag. So remove the div we parsed,
// and then tokenize the rest of body. We can't tokenize
// the sibling directly as we'll lose the tags in that case.
if ($div->nextSibling) {
$body->removeChild($div);
$this->tokenizeDOM($body, $tokens);
$this->tokenizeDOM($body, $tokens, $config);
}
return $tokens;
}
@@ -96,7 +96,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
* @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
* @return HTMLPurifier_Token of node appended to previously passed tokens.
*/
protected function tokenizeDOM($node, &$tokens)
protected function tokenizeDOM($node, &$tokens, $config)
{
$level = 0;
$nodes = array($level => new HTMLPurifier_Queue(array($node)));
@@ -105,7 +105,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
while (!$nodes[$level]->isEmpty()) {
$node = $nodes[$level]->shift(); // FIFO
$collect = $level > 0 ? true : false;
$needEndingTag = $this->createStartNode($node, $tokens, $collect);
$needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);
if ($needEndingTag) {
$closingNodes[$level][] = $node;
}
@@ -135,7 +135,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
* @return bool if the token needs an endtoken
* @todo data and tagName properties don't seem to exist in DOMNode?
*/
protected function createStartNode($node, &$tokens, $collect)
protected function createStartNode($node, &$tokens, $collect, $config)
{
// intercept non element nodes. WE MUST catch all of them,
// but we're not getting the character reference nodes because
@@ -159,7 +159,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
}
}
}
$tokens[] = $this->factory->createText($this->parseData($data));
$tokens[] = $this->factory->createText($this->parseText($data, $config));
return false;
} elseif ($node->nodeType === XML_COMMENT_NODE) {
// this is code is only invoked for comments in script/style in versions

View File

@@ -129,12 +129,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// We are not inside tag and there still is another tag to parse
$token = new
HTMLPurifier_Token_Text(
$this->parseData(
$this->parseText(
substr(
$html,
$cursor,
$position_next_lt - $cursor
)
), $config
)
);
if ($maintain_line_numbers) {
@@ -154,11 +154,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// Create Text of rest of string
$token = new
HTMLPurifier_Token_Text(
$this->parseData(
$this->parseText(
substr(
$html,
$cursor
)
), $config
)
);
if ($maintain_line_numbers) {
@@ -324,8 +324,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
$token = new
HTMLPurifier_Token_Text(
'<' .
$this->parseData(
substr($html, $cursor)
$this->parseText(
substr($html, $cursor), $config
)
);
if ($maintain_line_numbers) {
@@ -429,7 +429,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
if ($value === false) {
$value = '';
}
return array($key => $this->parseData($value));
return array($key => $this->parseAttr($value, $config));
}
// setup loop environment
@@ -518,7 +518,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
if ($value === false) {
$value = '';
}
$array[$key] = $this->parseData($value);
$array[$key] = $this->parseAttr($value, $config);
$cursor++;
} else {
// boolattr

View File

@@ -36,7 +36,7 @@ class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
$doc->getElementsByTagName('html')->item(0)-> // <html>
getElementsByTagName('body')->item(0) // <body>
,
$tokens
$tokens, $config
);
return $tokens;
}
@@ -1515,6 +1515,7 @@ class HTML5
// Consume the maximum number of characters possible, with the
// consumed characters case-sensitively matching one of the
// identifiers in the first column of the entities table.
$e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
$len = strlen($e_name);
@@ -1547,7 +1548,7 @@ class HTML5
// Return a character token for the character corresponding to the
// entity name (as given by the second column of the entities table).
return html_entity_decode('&' . $entity . ';', ENT_QUOTES, 'UTF-8');
return html_entity_decode('&' . rtrim($entity, ';') . ';', ENT_QUOTES, 'UTF-8');
}
private function emitToken($token)