diff --git a/library/HTMLPurifier/ChildDef.php b/library/HTMLPurifier/ChildDef.php index ca1accca..6555ea6e 100644 --- a/library/HTMLPurifier/ChildDef.php +++ b/library/HTMLPurifier/ChildDef.php @@ -11,12 +11,23 @@ // we may end up writing custom code for each HTML case // in order to make it self correcting + class HTMLPurifier_ChildDef +{ + var $type; + var $allow_empty; + function validateChildren($tokens_of_children) { + trigger_error('Call to abstract function', E_USER_ERROR); + } +} + +class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef { var $type = 'custom'; + var $allow_empty = false; var $dtd_regex; var $_pcre_regex; - function HTMLPurifier_ChildDef($dtd_regex) { + function HTMLPurifier_ChildDef_Custom($dtd_regex) { $this->dtd_regex = $dtd_regex; $this->_compileRegex(); } @@ -58,10 +69,11 @@ class HTMLPurifier_ChildDef return (bool) $okay; } } -class HTMLPurifier_ChildDef_Simple extends HTMLPurifier_ChildDef + +class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef { var $elements = array(); - function HTMLPurifier_ChildDef_Simple($elements) { + function HTMLPurifier_ChildDef_Required($elements) { if (is_string($elements)) { $elements = str_replace(' ', '', $elements); $elements = explode('|', $elements); @@ -71,12 +83,7 @@ class HTMLPurifier_ChildDef_Simple extends HTMLPurifier_ChildDef $this->elements = $elements; $this->gen = new HTMLPurifier_Generator(); } - function validateChildren() { - trigger_error('Cannot call abstract function!', E_USER_ERROR); - } -} -class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef_Simple -{ + var $allow_empty = false; var $type = 'required'; function validateChildren($tokens_of_children) { // if there are no tokens, delete parent node @@ -148,6 +155,7 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef_Simple // instead of a false (to delete the node) class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required { + var $allow_empty = true; var $type = 'optional'; function validateChildren($tokens_of_children) { $result = parent::validateChildren($tokens_of_children); @@ -159,6 +167,7 @@ class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required // placeholder class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef { + var $allow_empty = true; var $type = 'empty'; function HTMLPurifier_ChildDef_Empty() {} function validateChildren() { diff --git a/library/HTMLPurifier/Definition.php b/library/HTMLPurifier/Definition.php index 7ec774d0..0965bc69 100644 --- a/library/HTMLPurifier/Definition.php +++ b/library/HTMLPurifier/Definition.php @@ -56,13 +56,6 @@ class HTMLPurifier_Definition // these are condensed, however, with bad stuff taken out // screening process was done by hand - // The code makes certain assumptions about the structure of this - // definition for optimization reasons: - // - // FixNesting - There will never be a need for cascading removal - // of tags, usually triggered by a node requiring the - // existence of another node that may be deleted. - ////////////////////////////////////////////////////////////////////// // info[] : initializes the definition objects @@ -182,7 +175,7 @@ class HTMLPurifier_Definition $this->info['a']->child = $e_a_content; - $this->info['table']->child = new HTMLPurifier_ChildDef( + $this->info['table']->child = new HTMLPurifier_ChildDef_Custom( '(caption?, (col*|colgroup*), thead?, tfoot?, (tbody+|tr+))'); // not a real entity, watch the double underscore diff --git a/library/HTMLPurifier/Strategy/FixNesting.php b/library/HTMLPurifier/Strategy/FixNesting.php index 35e78e16..6f41d308 100644 --- a/library/HTMLPurifier/Strategy/FixNesting.php +++ b/library/HTMLPurifier/Strategy/FixNesting.php @@ -3,6 +3,9 @@ require_once 'HTMLPurifier/Strategy.php'; require_once 'HTMLPurifier/Definition.php'; +// EXTRA: provide a mechanism for elements to be bubbled OUT of a node +// or "Replace Nodes while including the parent nodes too" + class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy { @@ -13,12 +16,16 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy } function execute($tokens) { + // insert implicit "parent" node, will be removed at end $parent_name = $this->definition->info_parent; - array_unshift($tokens, new HTMLPurifier_Token_Start($parent_name)); $tokens[] = new HTMLPurifier_Token_End($parent_name); + // stack that contains the indexes of all parents, + // $stack[count($stack)-1] being the current parent + $stack = array(); + for ($i = 0, $size = count($tokens) ; $i < $size; ) { $child_tokens = array(); @@ -50,20 +57,16 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy // process result if ($result === true) { - // leave the nodes as is + // leave the node as is + + // register start token as a parental node start + $stack[] = $i; + + // move cursor to next possible start node + $i++; } elseif($result === false) { - // WARNING WARNING WARNING!!! - // While for the original DTD, there will never be - // cascading removal, more complex ones may have such - // a problem. - - // If you modify the info array such that an element - // that requires children may contain a child that requires - // children, you need to also scroll back and re-check that - // elements parent node - $length = $j - $i + 1; // remove entire node @@ -72,8 +75,18 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy // change size $size -= $length; - // ensure that we scroll to the next node - $i--; + // there is no start token to register, + // current node is now the next possible start node + // unless it turns out that we need to do a double-check + + $parent_index = $stack[count($stack)-1]; + $parent_name = $tokens[$parent_index]->name; + $parent_def = $this->definition->info[$parent_name]; + + if (!$parent_def->child->allow_empty) { + // we need to do a double-check + $i = $parent_index; + } } else { @@ -86,11 +99,24 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy $size -= $length; $size += count($result); + // register start token as a parental node start + $stack[] = $i; + + // move cursor to next possible start node + $i++; + } - // scroll to next node - $i++; - while ($i < $size and $tokens[$i]->type != 'start') $i++; + // We assume, at this point, that $i is the index of the token + // that is the first possible new start point for a node. + + // Test if the token indeed is a start tag, if not, move forward + // and test again. + while ($i < $size and $tokens[$i]->type != 'start') { + // pop a token index off the stack if we ended a node + if ($tokens[$i]->type == 'end') array_pop($stack); + $i++; + } } diff --git a/tests/HTMLPurifier/ChildDefTest.php b/tests/HTMLPurifier/ChildDefTest.php index 193e6d11..e86d2b4c 100644 --- a/tests/HTMLPurifier/ChildDefTest.php +++ b/tests/HTMLPurifier/ChildDefTest.php @@ -30,10 +30,10 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase } } - function test_complex() { + function test_custom() { // the table definition - $def = new HTMLPurifier_ChildDef( + $def = new HTMLPurifier_ChildDef_Custom( '(caption?, (col*|colgroup*), thead?, tfoot?, (tbody+|tr+))'); $inputs[0] = ''; @@ -56,12 +56,9 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase } - function test_simple() { + function test_parsing() { - // simple is actually an abstract class - // but we're unit testing some of the conv. functions it gives - - $def = new HTMLPurifier_ChildDef_Simple('foobar | bang |gizmo'); + $def = new HTMLPurifier_ChildDef_Required('foobar | bang |gizmo'); $this->assertEqual($def->elements, array( 'foobar' => true @@ -69,7 +66,7 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase ,'gizmo' => true )); - $def = new HTMLPurifier_ChildDef_Simple(array('href', 'src')); + $def = new HTMLPurifier_ChildDef_Required(array('href', 'src')); $this->assertEqual($def->elements, array( 'href' => true diff --git a/tests/HTMLPurifier/Strategy/FixNestingTest.php b/tests/HTMLPurifier/Strategy/FixNestingTest.php index 0a404298..7b4f5bc0 100644 --- a/tests/HTMLPurifier/Strategy/FixNestingTest.php +++ b/tests/HTMLPurifier/Strategy/FixNestingTest.php @@ -38,9 +38,29 @@ class HTMLPurifier_Strategy_FixNestingTest $expect[4] = ''; // test custom table definition + $inputs[5] = '
Cell 1
'; $expect[5] = '
Cell 1
'; + $inputs[6] = '
'; + $expect[6] = ''; + + // breaks without the redundant checking code + $inputs[7] = '
'; + $expect[7] = ''; + + // special case, prevents scrolling one back to find parent + $inputs[8] = '
'; + $expect[8] = ''; + + // cascading rollbacks + $inputs[9] = '
'; + $expect[9] = ''; + + // rollbacks twice + $inputs[10] = '
'; + $expect[10] = ''; + $this->assertStrategyWorks($strategy, $inputs, $expect); }