mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-07-31 03:10:09 +02:00
Revamp entity decoding to be more like HTML5.
See %Core.LegacyEntityDecoder for more details. Signed-off-by: Edward Z. Yang <ezyang@cs.stanford.edu>
This commit is contained in:
@@ -77,14 +77,14 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
||||
|
||||
$div = $body->getElementsByTagName('div')->item(0); // <div>
|
||||
$tokens = array();
|
||||
$this->tokenizeDOM($div, $tokens);
|
||||
$this->tokenizeDOM($div, $tokens, $config);
|
||||
// If the div has a sibling, that means we tripped across
|
||||
// a premature </div> tag. So remove the div we parsed,
|
||||
// and then tokenize the rest of body. We can't tokenize
|
||||
// the sibling directly as we'll lose the tags in that case.
|
||||
if ($div->nextSibling) {
|
||||
$body->removeChild($div);
|
||||
$this->tokenizeDOM($body, $tokens);
|
||||
$this->tokenizeDOM($body, $tokens, $config);
|
||||
}
|
||||
return $tokens;
|
||||
}
|
||||
@@ -96,7 +96,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
||||
* @param HTMLPurifier_Token[] $tokens Array-list of already tokenized tokens.
|
||||
* @return HTMLPurifier_Token of node appended to previously passed tokens.
|
||||
*/
|
||||
protected function tokenizeDOM($node, &$tokens)
|
||||
protected function tokenizeDOM($node, &$tokens, $config)
|
||||
{
|
||||
$level = 0;
|
||||
$nodes = array($level => new HTMLPurifier_Queue(array($node)));
|
||||
@@ -105,7 +105,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
||||
while (!$nodes[$level]->isEmpty()) {
|
||||
$node = $nodes[$level]->shift(); // FIFO
|
||||
$collect = $level > 0 ? true : false;
|
||||
$needEndingTag = $this->createStartNode($node, $tokens, $collect);
|
||||
$needEndingTag = $this->createStartNode($node, $tokens, $collect, $config);
|
||||
if ($needEndingTag) {
|
||||
$closingNodes[$level][] = $node;
|
||||
}
|
||||
@@ -135,7 +135,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
||||
* @return bool if the token needs an endtoken
|
||||
* @todo data and tagName properties don't seem to exist in DOMNode?
|
||||
*/
|
||||
protected function createStartNode($node, &$tokens, $collect)
|
||||
protected function createStartNode($node, &$tokens, $collect, $config)
|
||||
{
|
||||
// intercept non element nodes. WE MUST catch all of them,
|
||||
// but we're not getting the character reference nodes because
|
||||
@@ -159,7 +159,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
||||
}
|
||||
}
|
||||
}
|
||||
$tokens[] = $this->factory->createText($this->parseData($data));
|
||||
$tokens[] = $this->factory->createText($this->parseText($data, $config));
|
||||
return false;
|
||||
} elseif ($node->nodeType === XML_COMMENT_NODE) {
|
||||
// this is code is only invoked for comments in script/style in versions
|
||||
|
@@ -129,12 +129,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
// We are not inside tag and there still is another tag to parse
|
||||
$token = new
|
||||
HTMLPurifier_Token_Text(
|
||||
$this->parseData(
|
||||
$this->parseText(
|
||||
substr(
|
||||
$html,
|
||||
$cursor,
|
||||
$position_next_lt - $cursor
|
||||
)
|
||||
), $config
|
||||
)
|
||||
);
|
||||
if ($maintain_line_numbers) {
|
||||
@@ -154,11 +154,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
// Create Text of rest of string
|
||||
$token = new
|
||||
HTMLPurifier_Token_Text(
|
||||
$this->parseData(
|
||||
$this->parseText(
|
||||
substr(
|
||||
$html,
|
||||
$cursor
|
||||
)
|
||||
), $config
|
||||
)
|
||||
);
|
||||
if ($maintain_line_numbers) {
|
||||
@@ -324,8 +324,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
$token = new
|
||||
HTMLPurifier_Token_Text(
|
||||
'<' .
|
||||
$this->parseData(
|
||||
substr($html, $cursor)
|
||||
$this->parseText(
|
||||
substr($html, $cursor), $config
|
||||
)
|
||||
);
|
||||
if ($maintain_line_numbers) {
|
||||
@@ -429,7 +429,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
if ($value === false) {
|
||||
$value = '';
|
||||
}
|
||||
return array($key => $this->parseData($value));
|
||||
return array($key => $this->parseAttr($value, $config));
|
||||
}
|
||||
|
||||
// setup loop environment
|
||||
@@ -518,7 +518,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
if ($value === false) {
|
||||
$value = '';
|
||||
}
|
||||
$array[$key] = $this->parseData($value);
|
||||
$array[$key] = $this->parseAttr($value, $config);
|
||||
$cursor++;
|
||||
} else {
|
||||
// boolattr
|
||||
|
@@ -36,7 +36,7 @@ class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
|
||||
$doc->getElementsByTagName('html')->item(0)-> // <html>
|
||||
getElementsByTagName('body')->item(0) // <body>
|
||||
,
|
||||
$tokens
|
||||
$tokens, $config
|
||||
);
|
||||
return $tokens;
|
||||
}
|
||||
@@ -1515,6 +1515,7 @@ class HTML5
|
||||
// Consume the maximum number of characters possible, with the
|
||||
// consumed characters case-sensitively matching one of the
|
||||
// identifiers in the first column of the entities table.
|
||||
|
||||
$e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
|
||||
$len = strlen($e_name);
|
||||
|
||||
@@ -1547,7 +1548,7 @@ class HTML5
|
||||
|
||||
// Return a character token for the character corresponding to the
|
||||
// entity name (as given by the second column of the entities table).
|
||||
return html_entity_decode('&' . $entity . ';', ENT_QUOTES, 'UTF-8');
|
||||
return html_entity_decode('&' . rtrim($entity, ';') . ';', ENT_QUOTES, 'UTF-8');
|
||||
}
|
||||
|
||||
private function emitToken($token)
|
||||
|
Reference in New Issue
Block a user