mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-08-02 20:27:40 +02:00
fix: catastrophic backtracking in Core.AggressivelyFixLt (#440)
This commit is contained in:
@@ -52,14 +52,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
||||
// attempt to armor stray angled brackets that cannot possibly
|
||||
// form tags and thus are probably being used as emoticons
|
||||
if ($config->get('Core.AggressivelyFixLt')) {
|
||||
$char = '[^a-z!\/]';
|
||||
$comment = "/<!--(.*?)(-->|\z)/is";
|
||||
$html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
|
||||
do {
|
||||
$old = $html;
|
||||
$html = preg_replace("/<($char)/i", '<\\1', $html);
|
||||
} while ($html !== $old);
|
||||
$html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
|
||||
$html = $this->aggressivelyFixLt($html);
|
||||
}
|
||||
|
||||
// preprocess html, essential for UTF-8
|
||||
@@ -288,7 +281,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
||||
*/
|
||||
public function callbackUndoCommentSubst($matches)
{
    // preg_replace_callback() handler for the comment pattern:
    // $matches[1] is the comment body, $matches[2] the captured terminator
    // ('-->' or the empty end-of-input match).
    //
    // The substitution table lives in undoCommentSubstr() only, so this
    // callback and any other caller of that helper cannot drift apart.
    return '<!--' . $this->undoCommentSubstr($matches[1]) . $matches[2];
}
|
||||
|
||||
/**
|
||||
@@ -299,7 +292,25 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
||||
*/
|
||||
public function callbackArmorCommentEntities($matches)
{
    // preg_replace_callback() handler for the comment pattern:
    // $matches[1] is the comment body, $matches[2] the captured terminator
    // ('-->' or the empty end-of-input match).
    //
    // The escaping itself is delegated to armorEntities() so that there is
    // a single definition of what "armoring" a comment body means.
    return '<!--' . $this->armorEntities($matches[1]) . $matches[2];
}
|
||||
|
||||
/**
 * Escape every ampersand in the given text.
 *
 * After this call any "&lt;" that was literally present reads "&amp;lt;",
 * so it can be told apart from a "&lt;" inserted afterwards.
 * undoCommentSubstr() is the inverse operation.
 *
 * @param string $string
 * @return string
 */
protected function armorEntities($string)
{
    return str_replace('&', '&amp;', $string);
}

/**
 * Reverse armorEntities() and turn "&lt;" back into "<".
 *
 * @param string $string
 * @return string
 */
protected function undoCommentSubstr($string)
{
    // strtr() with an array never re-scans text it has already replaced,
    // so an armored "&amp;lt;" comes back as the literal "&lt;" instead of
    // being collapsed all the way down to "<".
    return strtr($string, array('&amp;' => '&', '&lt;' => '<'));
}
|
||||
|
||||
/**
|
||||
@@ -335,6 +346,66 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
||||
$ret .= '</body></html>';
|
||||
return $ret;
|
||||
}
|
||||
|
||||
/**
 * Escape stray "<" characters that cannot possibly open a tag.
 *
 * Comment bodies are passed through armorEntities() first and through
 * undoCommentSubstr() afterwards, which is expected to restore them to
 * their original text once the replacement pass has run.
 *
 * @param string $html
 * @return string
 */
protected function aggressivelyFixLt($html)
{
    // A "<" followed by anything other than a letter, "!" or "/" is treated
    // as a stray bracket.
    $char = '[^a-z!\/]';

    $html = $this->manipulateHtmlComments($html, array($this, 'armorEntities'));

    // Each match consumes the character after the "<", so in a run such as
    // "<<3" only the first bracket is converted per pass; repeat until the
    // string stops changing.
    do {
        $old = $html;
        $replaced = preg_replace("/<($char)/i", '&lt;\\1', $html);
        if ($replaced === null) {
            // PCRE failure: keep the last good string rather than
            // continuing with an empty document.
            $html = $old;
            break;
        }
        $html = $replaced;
    } while ($html !== $old);

    return $this->manipulateHtmlComments($html, array($this, 'undoCommentSubstr'));
}
|
||||
|
||||
/**
 * Rewrite the body of every terminated HTML comment using a callback.
 *
 * The input is scanned once from left to right. For each "<!--" that has
 * a following "-->", the text between the two is handed to $callback and
 * replaced by its return value; the delimiters themselves are kept.
 * Scanning stops at the first "<!--" without a matching "-->", and the
 * rest of the input is returned untouched.
 *
 * The result is assembled in a separate buffer, so the cost is linear in
 * the input length regardless of how many comments it contains.
 *
 * @param string $html
 * @param callable $callback Receives the comment body, returns its replacement
 * @return string
 */
protected function manipulateHtmlComments($html, callable $callback)
{
    $startTag = '<!--';
    $endTag = '-->';
    $startLen = strlen($startTag);
    $endLen = strlen($endTag);

    $result = '';
    $offset = 0; // first byte of $html not yet copied into $result

    while (($openPos = strpos($html, $startTag, $offset)) !== false) {
        $bodyPos = $openPos + $startLen; // Move past `<!--`
        $closePos = strpos($html, $endTag, $bodyPos);

        if ($closePos === false) {
            // No matching ending comment tag found
            break;
        }

        // Everything up to and including the opening delimiter is copied verbatim
        $result .= substr($html, $offset, $bodyPos - $offset);

        // Only the comment body goes through the callback
        $result .= $callback(substr($html, $bodyPos, $closePos - $bodyPos));
        $result .= $endTag;

        // Resume right after the closing delimiter of the original input
        $offset = $closePos + $endLen;
    }

    // Cast guards against substr() returning false at end of input on PHP < 8
    return $result . (string)substr($html, $offset);
}
|
||||
}
|
||||
|
||||
// vim: et sw=4 sts=4
|
||||
|
40
tests/HTMLPurifier/Lexer/DomLexTest.php
Normal file
40
tests/HTMLPurifier/Lexer/DomLexTest.php
Normal file
@@ -0,0 +1,40 @@
|
||||
<?php
|
||||
|
||||
class HTMLPurifier_Lexer_DomLexTest extends HTMLPurifier_Harness
{

    /**
     * Lexer under test, recreated before every test method.
     */
    protected $domLex;

    public function setUp()
    {
        $this->domLex = new HTMLPurifier_Lexer_DOMLex();
    }

    /**
     * A "<" directly followed by a digit must come out as text, not markup.
     */
    public function testCoreAggressivelyFixLtEmojis()
    {
        $tokens = $this->lexWithDefaults('<b><3</b>');

        $expected = array(
            new HTMLPurifier_Token_Start('b'),
            new HTMLPurifier_Token_Text('<3'),
            new HTMLPurifier_Token_End('b')
        );
        $this->assertIdentical($tokens, $expected);
    }

    /**
     * A comment ends at the first "-->"; an inner "<!--" stays part of the
     * comment body and whatever follows the terminator is plain text.
     */
    public function testCoreAggressivelyFixLtComments()
    {
        $tokens = $this->lexWithDefaults('<!-- Nested <!-- Not to be included --> comment -->');

        $expected = array(
            new HTMLPurifier_Token_Comment(' Nested <!-- Not to be included '),
            new HTMLPurifier_Token_Text(' comment -->')
        );
        $this->assertIdentical($tokens, $expected);
    }

    /**
     * Tokenize a fragment with a fresh context and the default configuration.
     *
     * @param string $html
     * @return array
     */
    private function lexWithDefaults($html)
    {
        $context = new HTMLPurifier_Context();
        $config = HTMLPurifier_Config::createDefault();

        return $this->domLex->tokenizeHTML($html, $config, $context);
    }

}
|
||||
|
||||
// vim: et sw=4 sts=4
|
Reference in New Issue
Block a user