1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-08-06 14:16:32 +02:00

Revamp entity decoding to be more like HTML5.

See %Core.LegacyEntityDecoder for more details.

Signed-off-by: Edward Z. Yang <ezyang@cs.stanford.edu>
This commit is contained in:
Edward Z. Yang
2017-03-07 13:34:55 -08:00
parent 66bbae73a9
commit 7e11c271b9
10 changed files with 272 additions and 35 deletions

View File

@@ -169,21 +169,24 @@ class HTMLPurifier_Lexer
'&#x27;' => "'"
);
public function parseText($string, $config) {
return $this->parseData($string, false, $config);
}
public function parseAttr($string, $config) {
return $this->parseData($string, true, $config);
}
/**
* Parses special entities into the proper characters.
*
* This string will translate escaped versions of the special characters
* into the correct ones.
*
* @warning
* You should be able to treat the output of this function as
* completely parsed, but that's only because all other entities should
* have been handled previously in substituteNonSpecialEntities()
*
* @param string $string String character data to be parsed.
* @return string Parsed character data.
*/
public function parseData($string)
public function parseData($string, $is_attr, $config)
{
// following functions require at least one character
if ($string === '') {
@@ -209,7 +212,15 @@ class HTMLPurifier_Lexer
}
// hmm... now we have some uncommon entities. Use the callback.
$string = $this->_entity_parser->substituteSpecialEntities($string);
if ($config->get('Core.LegacyEntityDecoder')) {
$string = $this->_entity_parser->substituteSpecialEntities($string);
} else {
if ($is_attr) {
$string = $this->_entity_parser->substituteAttrEntities($string);
} else {
$string = $this->_entity_parser->substituteTextEntities($string);
}
}
return $string;
}
@@ -323,7 +334,9 @@ class HTMLPurifier_Lexer
}
// expand entities that aren't the big five
$html = $this->_entity_parser->substituteNonSpecialEntities($html);
if ($config->get('Core.LegacyEntityDecoder')) {
$html = $this->_entity_parser->substituteNonSpecialEntities($html);
}
// clean into wellformed UTF-8 string for an SGML context: this has
// to be done after entity expansion because the entities sometimes