From 631021733baf2b20b1b61f933db16e232589e16d Mon Sep 17 00:00:00 2001
From: "Edward Z. Yang"
Date: Sun, 17 Feb 2013 15:47:38 -0800
Subject: [PATCH] Add %Core.DisableExcludes directive
Signed-off-by: Edward Z. Yang
---
NEWS | 4 +++
configdoc/usage.xml | 33 ++++++++++--------
library/HTMLPurifier/ConfigSchema/schema.ser | Bin 14784 -> 14880 bytes
.../schema/Core.DisableExcludes.txt | 14 ++++++++
library/HTMLPurifier/Strategy/FixNesting.php | 22 ++++++++++--
.../HTMLPurifier/Strategy/FixNestingTest.php | 5 +++
6 files changed, 62 insertions(+), 16 deletions(-)
create mode 100644 library/HTMLPurifier/ConfigSchema/schema/Core.DisableExcludes.txt
diff --git a/NEWS b/NEWS
index bfc03050..842d9b3c 100644
--- a/NEWS
+++ b/NEWS
@@ -20,6 +20,10 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
! Permit underscores in font families
! Support for page-break-* CSS3 properties when proprietary properties
are enabled.
+! New directive %Core.DisableExcludes; can be set to 'true' to turn off
+ SGML excludes checking. If HTML Purifier is removing too much text
+ and you don't care about full standards compliance, try setting this to
+ 'true'.
- Use prepend for SPL autoloading on PHP 5.3 and later.
- Fix bug with nofollow transform when pre-existing rel exists.
- Fix bug where background:url() always gets lower-cased
diff --git a/configdoc/usage.xml b/configdoc/usage.xml
index 050d3384..79f38b85 100644
--- a/configdoc/usage.xml
+++ b/configdoc/usage.xml
@@ -24,32 +24,32 @@
- 214
+ 215
- 218
+ 219
- 222
+ 223
- 226
+ 227
- 296
+ 302
- 310
+ 316
@@ -80,18 +80,18 @@
337
- 367
+ 372
341
- 374
+ 379
- 368
+ 373
@@ -473,12 +473,12 @@
- 12
+ 15
- 13
+ 16
@@ -491,6 +491,11 @@
70
+
+
+ 57
+
+
53
diff --git a/library/HTMLPurifier/ConfigSchema/schema.ser b/library/HTMLPurifier/ConfigSchema/schema.ser
index 32a02c4c53bbc1c587e0a3c66800764a1f763323..fa0bacb9476cab9e69889141969c6fefd2b4419c 100644
GIT binary patch
delta 133
zcmX?5yr5))IivYzd7=62(nbbWO3wL3sd_G%#feEdsjd~tIi)G7#Y)ymn|DgGa814@
e=s}!X6D}rWgU#=i=dsJ;HZ;@9d~=t!86N
+ This directive disables SGML-style exclusions, e.g. the exclusion of
+ <code>&lt;object&gt;</code> in any descendant of a
+ <code>&lt;pre&gt;</code> tag. Disabling excludes will allow some
+ invalid documents to pass through HTML Purifier, but HTML Purifier
+ will also be less likely to accidentally remove large documents during
+ processing.
+
+--# vim: et sw=4 sts=4
diff --git a/library/HTMLPurifier/Strategy/FixNesting.php b/library/HTMLPurifier/Strategy/FixNesting.php
index f8180239..d1588b93 100644
--- a/library/HTMLPurifier/Strategy/FixNesting.php
+++ b/library/HTMLPurifier/Strategy/FixNesting.php
@@ -26,6 +26,22 @@
* translated into text depends on the child definitions.
*
* @todo Enable nodes to be bubbled out of the structure.
+ *
+ * @warning This algorithm (though it may be hard to see) proceeds in
+ * a top-down fashion. Thus, parents are processed before
+ * children. This is easy to implement and has a nice efficiency
+ * benefit, in that if a node is removed, we never waste any
+ * time processing it, but it also means that if a child
+ * changes in a non-encapsulated way (e.g. it is removed), we
+ * need to go back and reprocess the parent to see if those
+ * changes resulted in problems for the parent. See
+ * [BACKTRACK] for an example of this. In the current
+ * implementation, this backtracking can only be triggered when
+ * a node is removed and if that node was the sole node, the
+ * parent would need to be removed. As such, it is easy to see
+ * that backtracking only incurs constant overhead. If more
+ * sophisticated backtracking is implemented, care must be
+ * taken to avoid nontermination or exponential blowup.
*/
class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
@@ -38,6 +54,8 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
// get a copy of the HTML definition
$definition = $config->getHTMLDefinition();
+ $excludes_enabled = !$config->get('Core.DisableExcludes');
+
// insert implicit "parent" node, will be removed at end.
// DEFINITION CALL
$parent_name = $definition->info_parent;
@@ -147,7 +165,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
// parent exclusions. The array should not be very large, two
// elements at most.
$excluded = false;
- if (!empty($exclude_stack)) {
+ if (!empty($exclude_stack) && $excludes_enabled) {
foreach ($exclude_stack as $lookup) {
if (isset($lookup[$tokens[$i]->name])) {
$excluded = true;
@@ -235,7 +253,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
// our current implementation claims that that case would
// not allow empty, even if it did
if (!$parent_def->child->allow_empty) {
- // we need to do a double-check
+ // we need to do a double-check [BACKTRACK]
$i = $parent_index;
array_pop($stack);
}
diff --git a/tests/HTMLPurifier/Strategy/FixNestingTest.php b/tests/HTMLPurifier/Strategy/FixNestingTest.php
index 9394352e..965ae2a8 100644
--- a/tests/HTMLPurifier/Strategy/FixNestingTest.php
+++ b/tests/HTMLPurifier/Strategy/FixNestingTest.php
@@ -139,6 +139,11 @@ class HTMLPurifier_Strategy_FixNestingTest extends HTMLPurifier_StrategyHarness
$this->assertResult('text
', 'text
');
}
+ function testDisabledExcludes() {
+ $this->config->set('Core.DisableExcludes', true);
+ $this->assertResult('');
+ }
+
}
// vim: et sw=4 sts=4