mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-08-03 20:58:11 +02:00
fix: catastrophic backtracking in Core.AggressivelyFixLt (#440)
This commit is contained in:
@@ -52,14 +52,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
|||||||
// attempt to armor stray angled brackets that cannot possibly
|
// attempt to armor stray angled brackets that cannot possibly
|
||||||
// form tags and thus are probably being used as emoticons
|
// form tags and thus are probably being used as emoticons
|
||||||
if ($config->get('Core.AggressivelyFixLt')) {
|
if ($config->get('Core.AggressivelyFixLt')) {
|
||||||
$char = '[^a-z!\/]';
|
$html = $this->aggressivelyFixLt($html);
|
||||||
$comment = "/<!--(.*?)(-->|\z)/is";
|
|
||||||
$html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
|
|
||||||
do {
|
|
||||||
$old = $html;
|
|
||||||
$html = preg_replace("/<($char)/i", '<\\1', $html);
|
|
||||||
} while ($html !== $old);
|
|
||||||
$html = preg_replace_callback($comment, array($this, 'callbackUndoCommentSubst'), $html); // fix comments
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// preprocess html, essential for UTF-8
|
// preprocess html, essential for UTF-8
|
||||||
@@ -288,7 +281,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
|||||||
*/
|
*/
|
||||||
public function callbackUndoCommentSubst($matches)
{
    // $matches[1] is the captured comment body and $matches[2] whatever
    // followed it in the match; only the body is run through
    // undoCommentSubstr(), the rest is re-emitted unchanged.
    return '<!--' . $this->undoCommentSubstr($matches[1]) . $matches[2];
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -299,7 +292,25 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
|||||||
*/
|
*/
|
||||||
public function callbackArmorCommentEntities($matches)
{
    // $matches[1] is the captured comment body and $matches[2] whatever
    // followed it in the match; only the body is run through
    // armorEntities(), the rest is re-emitted unchanged.
    return '<!--' . $this->armorEntities($matches[1]) . $matches[2];
}
|
||||||
|
|
||||||
|
/**
 * Escape every ampersand in the given text as the `&amp;` entity.
 *
 * aggressivelyFixLt() applies this to comment bodies before it rewrites
 * stray `<` characters, so that undoCommentSubstr() can later tell the
 * ampersands that were already in the comment apart from the `&lt;`
 * entities introduced in between.
 *
 * @param string $string Text to armor.
 * @return string The text with each `&` replaced by `&amp;`.
 */
protected function armorEntities($string)
{
    return str_replace('&', '&amp;', $string);
}
|
||||||
|
|
||||||
|
/**
 * Reverse the entity substitutions made inside a comment body.
 *
 * Turns `&amp;` back into `&` and `&lt;` back into `<`. strtr() with an
 * array performs all replacements in a single pass and never rescans its
 * own output, so an armored `&amp;lt;` comes back as the literal text
 * `&lt;` rather than being decoded twice into `<`.
 *
 * @param string $string Comment body containing armored entities.
 * @return string The comment body with the substitutions undone.
 */
protected function undoCommentSubstr($string)
{
    return strtr($string, array('&amp;' => '&', '&lt;' => '<'));
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -335,6 +346,66 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
|||||||
$ret .= '</body></html>';
|
$ret .= '</body></html>';
|
||||||
return $ret;
|
return $ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Armor stray `<` characters that cannot start a tag by rewriting them
 * as `&lt;`, while leaving the contents of HTML comments as they were.
 *
 * Comment bodies are protected in three steps: their ampersands are
 * armored first, the bracket rewrite then runs over the whole document
 * (comments included), and finally the substitutions inside comments are
 * undone again.
 *
 * @param string $html Markup to process.
 * @return string Markup with stray `<` characters outside comments
 *                escaped.
 */
protected function aggressivelyFixLt($html)
{
    // A `<` is left alone only when followed by a letter, `!` or `/`.
    $char = '[^a-z!\/]';
    $html = $this->manipulateHtmlComments($html, array($this, 'armorEntities'));

    // Each match consumes the character after the bracket, so in a run
    // such as `<<3` a single pass skips the second `<`; repeat until the
    // text stops changing.
    do {
        $old = $html;
        $html = preg_replace("/<($char)/i", '&lt;\\1', $html);
    } while ($html !== $old);

    return $this->manipulateHtmlComments($html, array($this, 'undoCommentSubstr'));
}
|
||||||
|
|
||||||
|
/**
 * Modify HTML comments in the given HTML content using a callback.
 *
 * The input is scanned once from left to right. Every `<!--` that has a
 * matching `-->` gets its body replaced by the callback's return value;
 * all other text is copied through unchanged. Scanning resumes directly
 * after each closing `-->`, so callback output is never re-examined. An
 * opening `<!--` without a closing `-->` ends the scan and the remainder
 * of the input is returned untouched.
 *
 * The result is assembled piecewise instead of re-copying the whole
 * document for every comment, which keeps the work linear in the input
 * size even when the input contains a very large number of comments.
 *
 * @param string $html
 * @param callable $callback Receives a comment body (without the
 *                           delimiters) and returns its replacement.
 * @return string
 */
protected function manipulateHtmlComments($html, callable $callback)
{
    $startTag = '<!--';
    $endTag = '-->';
    $startLen = strlen($startTag);
    $endLen = strlen($endTag);

    $result = '';
    $offset = 0; // first byte of $html not yet copied into $result

    while (($openPos = strpos($html, $startTag, $offset)) !== false) {
        $bodyStart = $openPos + $startLen; // Move past `<!--`
        $closePos = strpos($html, $endTag, $bodyStart);

        if ($closePos === false) {
            // No matching ending comment tag found
            break;
        }

        // Everything up to and including the opening tag is kept verbatim,
        // followed by the rewritten body and the closing tag.
        $result .= substr($html, $offset, $bodyStart - $offset);
        $result .= $callback(substr($html, $bodyStart, $closePos - $bodyStart));
        $result .= $endTag;

        $offset = $closePos + $endLen;
    }

    // Append whatever follows the last processed comment.
    return $result . substr($html, $offset);
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// vim: et sw=4 sts=4
|
// vim: et sw=4 sts=4
|
||||||
|
40
tests/HTMLPurifier/Lexer/DomLexTest.php
Normal file
40
tests/HTMLPurifier/Lexer/DomLexTest.php
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
class HTMLPurifier_Lexer_DomLexTest extends HTMLPurifier_Harness
{

    /**
     * Lexer under test; replaced with a fresh instance in setUp().
     */
    protected $domLex;

    public function setUp()
    {
        $this->domLex = new HTMLPurifier_Lexer_DOMLex();
    }

    /**
     * A `<` that is followed by a digit inside an element is expected to
     * come out as part of a text token, not as markup.
     */
    public function testCoreAggressivelyFixLtEmojis()
    {
        $expected = array(
            new HTMLPurifier_Token_Start('b'),
            new HTMLPurifier_Token_Text('<3'),
            new HTMLPurifier_Token_End('b')
        );

        $this->assertIdentical($this->tokenizeWithDefaults('<b><3</b>'), $expected);
    }

    /**
     * The first `-->` is expected to terminate a comment even when another
     * `<!--` appears inside it; the trailing remainder becomes text.
     */
    public function testCoreAggressivelyFixLtComments()
    {
        $expected = array(
            new HTMLPurifier_Token_Comment(' Nested <!-- Not to be included '),
            new HTMLPurifier_Token_Text(' comment -->')
        );

        $this->assertIdentical(
            $this->tokenizeWithDefaults('<!-- Nested <!-- Not to be included --> comment -->'),
            $expected
        );
    }

    /**
     * Tokenize the markup with a default configuration and a fresh context.
     *
     * @param string $html
     * @return array Tokens produced by the lexer under test.
     */
    private function tokenizeWithDefaults($html)
    {
        $context = new HTMLPurifier_Context();
        $config = HTMLPurifier_Config::createDefault();

        return $this->domLex->tokenizeHTML($html, $config, $context);
    }

}
|
||||||
|
|
||||||
|
// vim: et sw=4 sts=4
|
Reference in New Issue
Block a user