diff --git a/NEWS b/NEWS index c2961f1f..c7360e20 100644 --- a/NEWS +++ b/NEWS @@ -10,7 +10,9 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier ========================== 2.1.0, unknown release date -(none) +! With %Core.AggressivelyFixLt, <3 and similar emoticons no longer + trigger HTML removal in PHP5 (DOMLex). This directive is not necessary + for PHP4 (DirectLex). 2.0.2, unknown release date (none) diff --git a/library/HTMLPurifier/Lexer/DOMLex.php b/library/HTMLPurifier/Lexer/DOMLex.php index 82865673..1816add4 100644 --- a/library/HTMLPurifier/Lexer/DOMLex.php +++ b/library/HTMLPurifier/Lexer/DOMLex.php @@ -3,6 +3,16 @@ require_once 'HTMLPurifier/Lexer.php'; require_once 'HTMLPurifier/TokenFactory.php'; +HTMLPurifier_ConfigSchema::define( + 'Core', 'AggressivelyFixLt', false, 'bool', ' +This directive enables aggressive pre-filter fixes HTML Purifier can +perform in order to ensure that open angled-brackets do not get killed +during parsing stage. Enabling this will result in two preg_replace_callback +calls and one preg_replace call for every bit of HTML passed through here. +It is not necessary and will have no effect for PHP 4. +This directive has been available since 2.1.0. +'); + /** * Parser that uses PHP 5's DOM extension (part of the core). * @@ -42,6 +52,16 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer $html = $this->normalize($html, $config, $context); + // attempt to armor stray angled brackets that cannot possibly + // form tags and thus are probably being used as emoticons + if ($config->get('Core', 'AggressivelyFixLt')) { + $char = '[^a-z!\/]'; + $comment = "/|\z)/is"; + $html = preg_replace_callback($comment, array('HTMLPurifier_Lexer_DOMLex', 'callbackArmorCommentEntities'), $html); + $html = preg_replace("/<($char)/i", '<\\1', $html); + $html = preg_replace_callback($comment, array('HTMLPurifier_Lexer_DOMLex', 'callbackUndoCommentSubst'), $html); // fix comments + } + // preprocess html, essential for UTF-8 $html = ''&','<'=>'<')) . $matches[2]; + } + + /** + * Callback function that entity-izes ampersands in comments so that + * callbackUndoCommentSubst doesn't clobber them + */ + function callbackArmorCommentEntities($matches) { + return '
'; @@ -306,6 +311,7 @@ class HTMLPurifier_LexerTest extends UnitTestCase new HTMLPurifier_Token_Empty('br') ); $sax_expect[20] = false; + $config[20] = HTMLPurifier_Config::create(array('Core.AggressivelyFixLt' => true)); // test comment parsing of missing end $input[21] = ''; + $expect[23] = array( + new HTMLPurifier_Token_Comment(' This comment < < & ') + ); + $sax_expect[23] = false; + $config[21] = HTMLPurifier_Config::create(array('Core.AggressivelyFixLt' => true)); $default_config = HTMLPurifier_Config::createDefault(); $default_context = new HTMLPurifier_Context();