1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-08-02 20:27:40 +02:00

fix: catastrophic backtracking in Core.AggressivelyFixLt (#440)

This commit is contained in:
Kieran
2025-06-06 04:49:52 +01:00
committed by GitHub
parent 77ebd08632
commit 418eeb7dc0
2 changed files with 121 additions and 10 deletions

View File

@@ -52,14 +52,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
// attempt to armor stray angled brackets that cannot possibly
// form tags and thus are probably being used as emoticons
if ($config->get('Core.AggressivelyFixLt')) {
$char = '[^a-z!\/]';
$comment = "/<!--(.*?)(-->|\z)/is";
$html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
do {
$old = $html;
$html = preg_replace("/<($char)/i", '&lt;\\1', $html);
} while ($html !== $old);
$html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
$html = $this->aggressivelyFixLt($html);
}
// preprocess html, essential for UTF-8
@@ -288,7 +281,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
*/
public function callbackUndoCommentSubst($matches)
{
return '<!--' . strtr($matches[1], array('&amp;' => '&', '&lt;' => '<')) . $matches[2];
return '<!--' . $this->undoCommentSubstr($matches[1]) . $matches[2];
}
/**
@@ -299,7 +292,25 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
*/
public function callbackArmorCommentEntities($matches)
{
return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
return '<!--' . $this->armorEntities($matches[1]) . $matches[2];
}
/**
* @param string $string
* @return string
*/
protected function armorEntities($string)
{
return str_replace('&', '&amp;', $string);
}
/**
* @param string $string
* @return string
*/
protected function undoCommentSubstr($string)
{
return strtr($string, array('&amp;' => '&', '&lt;' => '<'));
}
/**
@@ -335,6 +346,66 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
$ret .= '</body></html>';
return $ret;
}
/**
* @param string $html
* @return string
*/
protected function aggressivelyFixLt($html)
{
$char = '[^a-z!\/]';
$html = $this->manipulateHtmlComments($html, array($this, 'armorEntities'));
do {
$old = $html;
$html = preg_replace("/<($char)/i", '&lt;\\1', $html);
} while ($html !== $old);
return $this->manipulateHtmlComments($html, array($this, 'undoCommentSubstr'));
}
/**
* Modify HTML comments in the given HTML content using a callback.
*
* @param string $html
* @param callable $callback
* @return string
*/
protected function manipulateHtmlComments($html, callable $callback)
{
$offset = 0;
$startTag = '<!--';
$endTag = '-->';
while (($startPos = strpos($html, $startTag, $offset)) !== false) {
$startPos += strlen($startTag); // Move past `<!--`
$endPos = strpos($html, $endTag, $startPos);
if ($endPos === false) {
// No matching ending comment tag found
break;
}
// Extract the original comment content
$commentContent = substr($html, $startPos, $endPos - $startPos);
// Apply the callback to the comment content
$newCommentContent = $callback($commentContent);
// Reconstruct the entire comment with the new content
$newComment = $startTag . $newCommentContent . $endTag;
// Replace the old comment in the HTML content with the new one
$html = substr($html, 0, $startPos - strlen($startTag)) .
$newComment .
substr($html, $endPos + strlen($endTag));
// Move offset to the end of the new comment for the next iteration
$offset = strpos($html, $newComment, $offset) + strlen($newComment);
}
return $html;
}
}
// vim: et sw=4 sts=4

View File

@@ -0,0 +1,40 @@
<?php
class HTMLPurifier_Lexer_DomLexTest extends HTMLPurifier_Harness
{
protected $domLex;
public function setUp()
{
$this->domLex = new HTMLPurifier_Lexer_DOMLex();
}
public function testCoreAggressivelyFixLtEmojis()
{
$context = new HTMLPurifier_Context();
$config = HTMLPurifier_Config::createDefault();
$output = $this->domLex->tokenizeHTML('<b><3</b>', $config, $context);
$this->assertIdentical($output, array(
new HTMLPurifier_Token_Start('b'),
new HTMLPurifier_Token_Text('<3'),
new HTMLPurifier_Token_End('b')
));
}
public function testCoreAggressivelyFixLtComments()
{
$context = new HTMLPurifier_Context();
$config = HTMLPurifier_Config::createDefault();
$output = $this->domLex->tokenizeHTML('<!-- Nested <!-- Not to be included --> comment -->', $config, $context);
$this->assertIdentical($output, array(
new HTMLPurifier_Token_Comment(' Nested <!-- Not to be included '),
new HTMLPurifier_Token_Text(' comment -->')
));
}
}
// vim: et sw=4 sts=4