From 631021733baf2b20b1b61f933db16e232589e16d Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Sun, 17 Feb 2013 15:47:38 -0800 Subject: [PATCH] Add %Core.DisableExcludes directive Signed-off-by: Edward Z. Yang --- NEWS | 4 +++ configdoc/usage.xml | 33 ++++++++++-------- library/HTMLPurifier/ConfigSchema/schema.ser | Bin 14784 -> 14880 bytes .../schema/Core.DisableExcludes.txt | 14 ++++++++ library/HTMLPurifier/Strategy/FixNesting.php | 22 ++++++++++-- .../HTMLPurifier/Strategy/FixNestingTest.php | 5 +++ 6 files changed, 62 insertions(+), 16 deletions(-) create mode 100644 library/HTMLPurifier/ConfigSchema/schema/Core.DisableExcludes.txt diff --git a/NEWS b/NEWS index bfc03050..842d9b3c 100644 --- a/NEWS +++ b/NEWS @@ -20,6 +20,10 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier ! Permit underscores in font families ! Support for page-break-* CSS3 properties when proprietary properties are enabled. +! New directive %Core.DisableExcludes; can be set to 'true' to turn off + SGML excludes checking. If HTML Purifier is removing too much text + and you don't care about full standards compliance, try setting this to + 'true'. - Use prepend for SPL autoloading on PHP 5.3 and later. - Fix bug with nofollow transform when pre-existing rel exists. 
- Fix bug where background:url() always gets lower-cased diff --git a/configdoc/usage.xml b/configdoc/usage.xml index 050d3384..79f38b85 100644 --- a/configdoc/usage.xml +++ b/configdoc/usage.xml @@ -24,32 +24,32 @@ - 214 + 215 - 218 + 219 - 222 + 223 - 226 + 227 - 296 + 302 - 310 + 316 @@ -80,18 +80,18 @@ 337 - 367 + 372 341 - 374 + 379 - 368 + 373 @@ -419,17 +419,17 @@ - 54 + 55 - 78 + 79 - 276 + 277 @@ -473,12 +473,12 @@ - 12 + 15 - 13 + 16 @@ -491,6 +491,11 @@ 70 + + + 57 + + 53 diff --git a/library/HTMLPurifier/ConfigSchema/schema.ser b/library/HTMLPurifier/ConfigSchema/schema.ser index 32a02c4c53bbc1c587e0a3c66800764a1f763323..fa0bacb9476cab9e69889141969c6fefd2b4419c 100644 GIT binary patch delta 133 zcmX?5yr5))IivYzd7=62(nbbWO3wL3sd_G%#feEdsjd~tIi)G7#Y)ymn|DgGa814@ e=s}!X6D}rWgU#=i=dsJ;HZ;@9d~=t!86N + This directive disables SGML-style exclusions, e.g. the exclusion of + <object> in any descendant of a + <pre> tag. Disabling excludes will allow some + invalid documents to pass through HTML Purifier, but HTML Purifier + will also be less likely to accidentally remove large documents during + processing. +

+--# vim: et sw=4 sts=4 diff --git a/library/HTMLPurifier/Strategy/FixNesting.php b/library/HTMLPurifier/Strategy/FixNesting.php index f8180239..d1588b93 100644 --- a/library/HTMLPurifier/Strategy/FixNesting.php +++ b/library/HTMLPurifier/Strategy/FixNesting.php @@ -26,6 +26,22 @@ * translated into text depends on the child definitions. * * @todo Enable nodes to be bubbled out of the structure. + * + * @warning This algorithm (though it may be hard to see) proceeds in + * a top-down fashion. Thus, parents are processed before + * children. This is easy to implement and has a nice efficiency + * benefit, in that if a node is removed, we never waste any + * time processing it, but it also means that if a child + * changes in a non-encapsulated way (e.g. it is removed), we + * need to go back and reprocess the parent to see if those + * changes resulted in problems for the parent. See + * [BACKTRACK] for an example of this. In the current + * implementation, this backtracking can only be triggered when + * a node is removed and if that node was the sole node, the + * parent would need to be removed. As such, it is easy to see + * that backtracking only incurs constant overhead. If more + * sophisticated backtracking is implemented, care must be + * taken to avoid nontermination or exponential blowup. */ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy @@ -38,6 +54,8 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy // get a copy of the HTML definition $definition = $config->getHTMLDefinition(); + $excludes_enabled = !$config->get('Core.DisableExcludes'); + // insert implicit "parent" node, will be removed at end. // DEFINITION CALL $parent_name = $definition->info_parent; @@ -147,7 +165,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy // parent exclusions. The array should not be very large, two // elements at most. 
$excluded = false; - if (!empty($exclude_stack)) { + if (!empty($exclude_stack) && $excludes_enabled) { foreach ($exclude_stack as $lookup) { if (isset($lookup[$tokens[$i]->name])) { $excluded = true; @@ -235,7 +253,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy // our current implementation claims that that case would // not allow empty, even if it did if (!$parent_def->child->allow_empty) { - // we need to do a double-check + // we need to do a double-check [BACKTRACK] $i = $parent_index; array_pop($stack); } diff --git a/tests/HTMLPurifier/Strategy/FixNestingTest.php b/tests/HTMLPurifier/Strategy/FixNestingTest.php index 9394352e..965ae2a8 100644 --- a/tests/HTMLPurifier/Strategy/FixNestingTest.php +++ b/tests/HTMLPurifier/Strategy/FixNestingTest.php @@ -139,6 +139,11 @@ class HTMLPurifier_Strategy_FixNestingTest extends HTMLPurifier_StrategyHarness $this->assertResult('
text
', '

text

'); } + function testDisabledExcludes() { + $this->config->set('Core.DisableExcludes', true); + $this->assertResult('
'); + } + } // vim: et sw=4 sts=4