mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-08-06 14:16:32 +02:00
Rewrite FixNesting implementation to be tree-based.
This mega-patch rips out the FixNesting implementation and the related ChildDef components. The primary algorithmic change is the conversion from tokens to tree nodes, which are far more amenable to the style of processing that FixNesting uses. Additionally, FixNesting now goes bottom-up rather than top-down, which avoids the need to implement backtracking. This patch simplifies a good deal of the relevant logic, since we no longer need to continually recalculate the nesting structure while processing. However, converting to the alternate format incurs some overhead, so for small inputs these changes are not a win. One way to greatly reduce the constant factors would be to switch entirely to libxml's representation and never serialize tokens; this would, however, require rewriting the injectors. The iterative post-order traversal in FixNesting is a bit subtle: we have essentially reified the stack and continuations. Support for %Core.EscapeInvalidChildren has been removed. Signed-off-by: Edward Z. Yang <ezyang@mit.edu>
This commit is contained in:
@@ -2,6 +2,12 @@
|
||||
|
||||
/**
|
||||
* Definition for list containers ul and ol.
|
||||
*
|
||||
* What does this do? The big thing is to handle ol/ul at the top
|
||||
* level of list nodes, which should be handled specially by /folding/
|
||||
* them into the previous list node. We generally shouldn't ever
|
||||
* see other disallowed elements, because the autoclose behavior
|
||||
* in MakeWellFormed handles it.
|
||||
*/
|
||||
class HTMLPurifier_ChildDef_List extends HTMLPurifier_ChildDef
|
||||
{
|
||||
@@ -17,115 +23,55 @@ class HTMLPurifier_ChildDef_List extends HTMLPurifier_ChildDef
|
||||
public $elements = array('li' => true, 'ul' => true, 'ol' => true);
|
||||
|
||||
/**
|
||||
* @param array $tokens_of_children
|
||||
* @param array $children
|
||||
* @param HTMLPurifier_Config $config
|
||||
* @param HTMLPurifier_Context $context
|
||||
* @return array
|
||||
*/
|
||||
public function validateChildren($tokens_of_children, $config, $context)
|
||||
public function validateChildren($children, $config, $context)
|
||||
{
|
||||
// Flag for subclasses
|
||||
$this->whitespace = false;
|
||||
|
||||
// if there are no tokens, delete parent node
|
||||
if (empty($tokens_of_children)) {
|
||||
if (empty($children)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// the new set of children
|
||||
$result = array();
|
||||
|
||||
// current depth into the nest
|
||||
$nesting = 0;
|
||||
|
||||
// a little sanity check to make sure it's not ALL whitespace
|
||||
$all_whitespace = true;
|
||||
|
||||
$seen_li = false;
|
||||
$need_close_li = false;
|
||||
$current_li = false;
|
||||
|
||||
foreach ($tokens_of_children as $token) {
|
||||
if (!empty($token->is_whitespace)) {
|
||||
$result[] = $token;
|
||||
foreach ($children as $node) {
|
||||
if (!empty($node->is_whitespace)) {
|
||||
$result[] = $node;
|
||||
continue;
|
||||
}
|
||||
$all_whitespace = false; // phew, we're not talking about whitespace
|
||||
|
||||
if ($nesting == 1 && $need_close_li) {
|
||||
$result[] = new HTMLPurifier_Token_End('li');
|
||||
$nesting--;
|
||||
$need_close_li = false;
|
||||
}
|
||||
|
||||
$is_child = ($nesting == 0);
|
||||
|
||||
if ($token instanceof HTMLPurifier_Token_Start) {
|
||||
$nesting++;
|
||||
} elseif ($token instanceof HTMLPurifier_Token_End) {
|
||||
$nesting--;
|
||||
}
|
||||
|
||||
if ($is_child) {
|
||||
if ($token->name === 'li') {
|
||||
// good
|
||||
$seen_li = true;
|
||||
} elseif ($token->name === 'ul' || $token->name === 'ol') {
|
||||
// we want to tuck this into the previous li
|
||||
$need_close_li = true;
|
||||
$nesting++;
|
||||
if (!$seen_li) {
|
||||
// create a new li element
|
||||
$result[] = new HTMLPurifier_Token_Start('li');
|
||||
} else {
|
||||
// backtrack until </li> found
|
||||
while (true) {
|
||||
$t = array_pop($result);
|
||||
if ($t instanceof HTMLPurifier_Token_End) {
|
||||
// XXX actually, these invariants could very plausibly be violated
|
||||
// if we are doing silly things with modifying the set of allowed elements.
|
||||
// FORTUNATELY, it doesn't make a difference, since the allowed
|
||||
// elements are hard-coded here!
|
||||
if ($t->name !== 'li') {
|
||||
trigger_error("Only li present invariant violated in List ChildDef", E_USER_ERROR);
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
} elseif ($t instanceof HTMLPurifier_Token_Empty) { // bleagh
|
||||
if ($t->name !== 'li') {
|
||||
trigger_error("Only li present invariant violated in List ChildDef", E_USER_ERROR);
|
||||
return false;
|
||||
}
|
||||
// XXX this should have a helper for it...
|
||||
$result[] = new HTMLPurifier_Token_Start('li', $t->attr, $t->line, $t->col, $t->armor);
|
||||
break;
|
||||
} else {
|
||||
if (!$t->is_whitespace) {
|
||||
trigger_error(
|
||||
"Only whitespace present invariant violated in List ChildDef",
|
||||
E_USER_ERROR
|
||||
);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// start wrapping (this doesn't precisely mimic
|
||||
// browser behavior, but what browsers do is kind of
|
||||
// hard to mimic in a standards compliant way
|
||||
// XXX Actually, this has no impact in practice,
|
||||
// because this gets handled earlier. Arguably,
|
||||
// we should rip out all of that processing
|
||||
$result[] = new HTMLPurifier_Token_Start('li');
|
||||
$nesting++;
|
||||
$seen_li = true;
|
||||
$need_close_li = true;
|
||||
if ($node->name === 'li') {
|
||||
// good
|
||||
$current_li = $node;
|
||||
$result[] = $node;
|
||||
} else {
|
||||
// we want to tuck this into the previous li
|
||||
// Invariant: we expect the node to be ol/ul
|
||||
// ToDo: Make this more robust in the case of not ol/ul
|
||||
// by distinguishing between existing li and li created
|
||||
// to handle non-list elements; non-list elements should
|
||||
// not be appended to an existing li; only li created
|
||||
// for non-list. This distinction is not currently made.
|
||||
if ($current_li === false) {
|
||||
$current_li = new HTMLPurifier_Node_Element('li');
|
||||
$result[] = $current_li;
|
||||
}
|
||||
$current_li->children[] = $node;
|
||||
$current_li->empty = false; // XXX fascinating! Check for this error elsewhere ToDo
|
||||
}
|
||||
$result[] = $token;
|
||||
}
|
||||
if ($need_close_li) {
|
||||
$result[] = new HTMLPurifier_Token_End('li');
|
||||
}
|
||||
if (empty($result)) {
|
||||
return false;
|
||||
@@ -133,9 +79,6 @@ class HTMLPurifier_ChildDef_List extends HTMLPurifier_ChildDef
|
||||
if ($all_whitespace) {
|
||||
return false;
|
||||
}
|
||||
if ($tokens_of_children == $result) {
|
||||
return true;
|
||||
}
|
||||
return $result;
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user