1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-08-06 14:16:32 +02:00

Rewrite FixNesting implementation to be tree-based.

This mega-patch rips out the FixNesting implementation and the related
ChildDef components.  The primary algorithmic change is to convert from
use of tokens to tree nodes, which are far more amenable to the style
of processing that FixNesting uses.  Additionally, FixNesting has been
changed to go bottom-up rather than top-down, in order to avoid needing
to implement backtracking.

This patch simplifies a good deal of the relevant logic, since we no
longer need to continually recalculate the nesting structure when
processing things.  However, the conversion to the alternate format
incurs some overhead, so for small inputs these changes are not a win.
One possibility to greatly reduce the constant factors here is to switch
to entirely using libxml's representation, and never serializing tokens;
this would require one to rewrite injectors, however.

The iterative post-order traversal in FixNesting is a bit subtle, but
we have essentially reified the stack and continuations.

We've removed support for %Core.EscapeInvalidChildren.

Signed-off-by: Edward Z. Yang <ezyang@mit.edu>
This commit is contained in:
Edward Z. Yang
2013-10-20 22:18:59 -07:00
parent b3640e1af6
commit 0767bbc12d
22 changed files with 358 additions and 698 deletions

View File

@@ -41,22 +41,22 @@ class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
}
/**
* @param array $tokens_of_children
* @param HTMLPurifier_Node[] $children
* @param HTMLPurifier_Config $config
* @param HTMLPurifier_Context $context
* @return bool
*/
public function validateChildren($tokens_of_children, $config, $context)
public function validateChildren($children, $config, $context)
{
if ($context->get('IsInline') === false) {
return $this->block->validateChildren(
$tokens_of_children,
$children,
$config,
$context
);
} else {
return $this->inline->validateChildren(
$tokens_of_children,
$children,
$config,
$context
);

View File

@@ -73,31 +73,20 @@ class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
}
/**
* @param array $tokens_of_children
* @param HTMLPurifier_Node[] $children
* @param HTMLPurifier_Config $config
* @param HTMLPurifier_Context $context
* @return bool
*/
public function validateChildren($tokens_of_children, $config, $context)
public function validateChildren($children, $config, $context)
{
$list_of_children = '';
$nesting = 0; // depth into the nest
foreach ($tokens_of_children as $token) {
if (!empty($token->is_whitespace)) {
foreach ($children as $node) {
if (!empty($node->is_whitespace)) {
continue;
}
$is_child = ($nesting == 0); // direct
if ($token instanceof HTMLPurifier_Token_Start) {
$nesting++;
} elseif ($token instanceof HTMLPurifier_Token_End) {
$nesting--;
}
if ($is_child) {
$list_of_children .= $token->name . ',';
}
$list_of_children .= $node->name . ',';
}
// add leading comma to deal with stray comma declarations
$list_of_children = ',' . rtrim($list_of_children, ',');

View File

@@ -24,12 +24,12 @@ class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef
}
/**
* @param array $tokens_of_children
* @param HTMLPurifier_Node[] $children
* @param HTMLPurifier_Config $config
* @param HTMLPurifier_Context $context
* @return array
*/
public function validateChildren($tokens_of_children, $config, $context)
public function validateChildren($children, $config, $context)
{
return array();
}

View File

@@ -2,6 +2,12 @@
/**
* Definition for list containers ul and ol.
*
* What does this do? The big thing is to handle ol/ul at the top
* level of list nodes, which should be handled specially by /folding/
* them into the previous list node. We generally shouldn't ever
* see other disallowed elements, because the autoclose behavior
* in MakeWellFormed handles it.
*/
class HTMLPurifier_ChildDef_List extends HTMLPurifier_ChildDef
{
@@ -17,115 +23,55 @@ class HTMLPurifier_ChildDef_List extends HTMLPurifier_ChildDef
public $elements = array('li' => true, 'ul' => true, 'ol' => true);
/**
* @param array $tokens_of_children
* @param array $children
* @param HTMLPurifier_Config $config
* @param HTMLPurifier_Context $context
* @return array
*/
public function validateChildren($tokens_of_children, $config, $context)
public function validateChildren($children, $config, $context)
{
// Flag for subclasses
$this->whitespace = false;
// if there are no tokens, delete parent node
if (empty($tokens_of_children)) {
if (empty($children)) {
return false;
}
// the new set of children
$result = array();
// current depth into the nest
$nesting = 0;
// a little sanity check to make sure it's not ALL whitespace
$all_whitespace = true;
$seen_li = false;
$need_close_li = false;
$current_li = false;
foreach ($tokens_of_children as $token) {
if (!empty($token->is_whitespace)) {
$result[] = $token;
foreach ($children as $node) {
if (!empty($node->is_whitespace)) {
$result[] = $node;
continue;
}
$all_whitespace = false; // phew, we're not talking about whitespace
if ($nesting == 1 && $need_close_li) {
$result[] = new HTMLPurifier_Token_End('li');
$nesting--;
$need_close_li = false;
}
$is_child = ($nesting == 0);
if ($token instanceof HTMLPurifier_Token_Start) {
$nesting++;
} elseif ($token instanceof HTMLPurifier_Token_End) {
$nesting--;
}
if ($is_child) {
if ($token->name === 'li') {
// good
$seen_li = true;
} elseif ($token->name === 'ul' || $token->name === 'ol') {
// we want to tuck this into the previous li
$need_close_li = true;
$nesting++;
if (!$seen_li) {
// create a new li element
$result[] = new HTMLPurifier_Token_Start('li');
} else {
// backtrack until </li> found
while (true) {
$t = array_pop($result);
if ($t instanceof HTMLPurifier_Token_End) {
// XXX actually, these invariants could very plausibly be violated
// if we are doing silly things with modifying the set of allowed elements.
// FORTUNATELY, it doesn't make a difference, since the allowed
// elements are hard-coded here!
if ($t->name !== 'li') {
trigger_error("Only li present invariant violated in List ChildDef", E_USER_ERROR);
return false;
}
break;
} elseif ($t instanceof HTMLPurifier_Token_Empty) { // bleagh
if ($t->name !== 'li') {
trigger_error("Only li present invariant violated in List ChildDef", E_USER_ERROR);
return false;
}
// XXX this should have a helper for it...
$result[] = new HTMLPurifier_Token_Start('li', $t->attr, $t->line, $t->col, $t->armor);
break;
} else {
if (!$t->is_whitespace) {
trigger_error(
"Only whitespace present invariant violated in List ChildDef",
E_USER_ERROR
);
return false;
}
}
}
}
} else {
// start wrapping (this doesn't precisely mimic
// browser behavior, but what browsers do is kind of
// hard to mimic in a standards compliant way
// XXX Actually, this has no impact in practice,
// because this gets handled earlier. Arguably,
// we should rip out all of that processing
$result[] = new HTMLPurifier_Token_Start('li');
$nesting++;
$seen_li = true;
$need_close_li = true;
if ($node->name === 'li') {
// good
$current_li = $node;
$result[] = $node;
} else {
// we want to tuck this into the previous li
// Invariant: we expect the node to be ol/ul
// ToDo: Make this more robust in the case of not ol/ul
// by distinguishing between existing li and li created
// to handle non-list elements; non-list elements should
// not be appended to an existing li; only li created
// for non-list. This distinction is not currently made.
if ($current_li === false) {
$current_li = new HTMLPurifier_Node_Element('li');
$result[] = $current_li;
}
$current_li->children[] = $node;
$current_li->empty = false; // XXX fascinating! Check for this error elsewhere ToDo
}
$result[] = $token;
}
if ($need_close_li) {
$result[] = new HTMLPurifier_Token_End('li');
}
if (empty($result)) {
return false;
@@ -133,9 +79,6 @@ class HTMLPurifier_ChildDef_List extends HTMLPurifier_ChildDef
if ($all_whitespace) {
return false;
}
if ($tokens_of_children == $result) {
return true;
}
return $result;
}
}

View File

@@ -20,20 +20,20 @@ class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
public $type = 'optional';
/**
* @param array $tokens_of_children
* @param array $children
* @param HTMLPurifier_Config $config
* @param HTMLPurifier_Context $context
* @return array
*/
public function validateChildren($tokens_of_children, $config, $context)
public function validateChildren($children, $config, $context)
{
$result = parent::validateChildren($tokens_of_children, $config, $context);
// we assume that $tokens_of_children is not modified
$result = parent::validateChildren($children, $config, $context);
// we assume that $children is not modified
if ($result === false) {
if (empty($tokens_of_children)) {
if (empty($children)) {
return true;
} elseif ($this->whitespace) {
return $tokens_of_children;
return $children;
} else {
return array();
}

View File

@@ -50,30 +50,24 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
public $type = 'required';
/**
* @param array $tokens_of_children
* @param array $children
* @param HTMLPurifier_Config $config
* @param HTMLPurifier_Context $context
* @return array
*/
public function validateChildren($tokens_of_children, $config, $context)
public function validateChildren($children, $config, $context)
{
// Flag for subclasses
$this->whitespace = false;
// if there are no tokens, delete parent node
if (empty($tokens_of_children)) {
if (empty($children)) {
return false;
}
// the new set of children
$result = array();
// current depth into the nest
$nesting = 0;
// whether or not we're deleting a node
$is_deleting = false;
// whether or not parsed character data is allowed
// this controls whether or not we silently drop a tag
// or generate escaped HTML from it
@@ -82,51 +76,33 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
// a little sanity check to make sure it's not ALL whitespace
$all_whitespace = true;
// some configuration
$escape_invalid_children = $config->get('Core.EscapeInvalidChildren');
// generator
$gen = new HTMLPurifier_Generator($config, $context);
foreach ($tokens_of_children as $token) {
if (!empty($token->is_whitespace)) {
$result[] = $token;
$stack = array_reverse($children);
while (!empty($stack)) {
$node = array_pop($stack);
if (!empty($node->is_whitespace)) {
$result[] = $node;
continue;
}
$all_whitespace = false; // phew, we're not talking about whitespace
$is_child = ($nesting == 0);
if ($token instanceof HTMLPurifier_Token_Start) {
$nesting++;
} elseif ($token instanceof HTMLPurifier_Token_End) {
$nesting--;
}
if ($is_child) {
$is_deleting = false;
if (!isset($this->elements[$token->name])) {
$is_deleting = true;
if ($pcdata_allowed && $token instanceof HTMLPurifier_Token_Text) {
$result[] = $token;
} elseif ($pcdata_allowed && $escape_invalid_children) {
$result[] = new HTMLPurifier_Token_Text(
$gen->generateFromToken($token)
);
if (!isset($this->elements[$node->name])) {
// special case text
// XXX One of these ought to be redundant or something
if ($pcdata_allowed && $node instanceof HTMLPurifier_Node_Text) {
$result[] = $node;
continue;
}
// spill the child contents in
// ToDo: Make configurable
if ($node instanceof HTMLPurifier_Node_Element) {
for ($i = count($node->children) - 1; $i >= 0; $i--) {
$stack[] = $node->children[$i];
}
continue;
}
continue;
}
if (!$is_deleting || ($pcdata_allowed && $token instanceof HTMLPurifier_Token_Text)) {
$result[] = $token;
} elseif ($pcdata_allowed && $escape_invalid_children) {
$result[] =
new HTMLPurifier_Token_Text(
$gen->generateFromToken($token)
);
} else {
// drop silently
}
$result[] = $node;
}
if (empty($result)) {
return false;
@@ -135,9 +111,6 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
$this->whitespace = true;
return false;
}
if ($tokens_of_children == $result) {
return true;
}
return $result;
}
}

View File

@@ -43,68 +43,50 @@ class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Requi
}
/**
* @param array $tokens_of_children
* @param array $children
* @param HTMLPurifier_Config $config
* @param HTMLPurifier_Context $context
* @return array
*/
public function validateChildren($tokens_of_children, $config, $context)
public function validateChildren($children, $config, $context)
{
$this->init($config);
// trick the parent class into thinking it allows more
$this->elements = $this->fake_elements;
$result = parent::validateChildren($tokens_of_children, $config, $context);
$result = parent::validateChildren($children, $config, $context);
$this->elements = $this->real_elements;
if ($result === false) {
return array();
}
if ($result === true) {
$result = $tokens_of_children;
$result = $children;
}
$def = $config->getHTMLDefinition();
$block_wrap_start = new HTMLPurifier_Token_Start($def->info_block_wrapper);
$block_wrap_end = new HTMLPurifier_Token_End($def->info_block_wrapper);
$is_inline = false;
$depth = 0;
$block_wrap_name = $def->info_block_wrapper;
$block_wrap = false;
$ret = array();
// assuming that there are no comment tokens
foreach ($result as $i => $token) {
$token = $result[$i];
// ifs are nested for readability
if (!$is_inline) {
if (!$depth) {
if (($token instanceof HTMLPurifier_Token_Text && !$token->is_whitespace) ||
(!$token instanceof HTMLPurifier_Token_Text && !isset($this->elements[$token->name]))) {
$is_inline = true;
$ret[] = $block_wrap_start;
}
foreach ($result as $node) {
if ($block_wrap === false) {
if (($node instanceof HTMLPurifier_Node_Text && !$node->is_whitespace) ||
($node instanceof HTMLPurifier_Node_Element && !isset($this->elements[$node->name]))) {
$block_wrap = new HTMLPurifier_Node_Element($def->info_block_wrapper);
$ret[] = $block_wrap;
}
} else {
if (!$depth) {
// starting tokens have been inline text / empty
if ($token instanceof HTMLPurifier_Token_Start || $token instanceof HTMLPurifier_Token_Empty) {
if (isset($this->elements[$token->name])) {
// ended
$ret[] = $block_wrap_end;
$is_inline = false;
}
}
if ($node instanceof HTMLPurifier_Node_Element && isset($this->elements[$node->name])) {
$block_wrap = false;
}
}
$ret[] = $token;
if ($token instanceof HTMLPurifier_Token_Start) {
$depth++;
if ($block_wrap) {
$block_wrap->children[] = $node;
} else {
$ret[] = $node;
}
if ($token instanceof HTMLPurifier_Token_End) {
$depth--;
}
}
if ($is_inline) {
$ret[] = $block_wrap_end;
}
return $ret;
}

View File

@@ -59,230 +59,164 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
}
/**
* @param array $tokens_of_children
* @param array $children
* @param HTMLPurifier_Config $config
* @param HTMLPurifier_Context $context
* @return array
*/
public function validateChildren($tokens_of_children, $config, $context)
public function validateChildren($children, $config, $context)
{
if (empty($tokens_of_children)) {
if (empty($children)) {
return false;
}
// this ensures that the loop gets run one last time before closing
// up. It's a little bit of a hack, but it works! Just make sure you
// get rid of the token later.
$tokens_of_children[] = false;
// only one of these elements is allowed in a table
$caption = false;
$thead = false;
$tfoot = false;
// whitespace
$initial_ws = array();
$after_caption_ws = array();
$after_thead_ws = array();
$after_tfoot_ws = array();
// as many of these as you want
$cols = array();
$content = array();
$nesting = 0; // current depth so we can determine nodes
$is_collecting = false; // are we globbing together tokens to package
// into one of the collectors?
$collection = array(); // collected nodes
// INVARIANT: if $is_collecting, then !empty($collection)
// The converse does NOT hold, see [WHITESPACE]
$tag_index = 0; // the first node might be whitespace,
// so this tells us where the start tag is
$tbody_mode = false; // if true, then we need to wrap any stray
// <tr>s with a <tbody>.
// <tr>s with a <tbody>.
foreach ($tokens_of_children as $token) {
$is_child = ($nesting == 0);
$ws_accum =& $initial_ws;
if ($token === false) {
// terminating sequence started
} elseif ($token instanceof HTMLPurifier_Token_Start) {
$nesting++;
} elseif ($token instanceof HTMLPurifier_Token_End) {
$nesting--;
foreach ($children as $node) {
if ($node instanceof HTMLPurifier_Node_Comment) {
$ws_accum[] = $node;
continue;
}
// handle node collection
if ($is_collecting) {
if ($is_child) {
// okay, let's stash the tokens away
// first token tells us the type of the collection
switch ($collection[$tag_index]->name) {
case 'tbody':
$tbody_mode = true;
// fall through
case 'tr':
$content[] = $collection;
break;
case 'caption':
if ($caption !== false) {
break;
}
$caption = $collection;
break;
case 'thead':
case 'tfoot':
$tbody_mode = true;
// XXX This breaks rendering properties with
// Firefox, which never floats a <thead> to
// the top. Ever. (Our scheme will float the
// first <thead> to the top.) So maybe
// <thead>s that are not first should be
// turned into <tbody>? Very tricky, indeed.
// access the appropriate variable, $thead or $tfoot
$var = $collection[$tag_index]->name;
if ($$var === false) {
$$var = $collection;
} else {
// Oops, there's a second one! What
// should we do? Current behavior is to
// transmutate the first and last entries into
// tbody tags, and then put into content.
// Maybe a better idea is to *attach
// it* to the existing thead or tfoot?
// We don't do this, because Firefox
// doesn't float an extra tfoot to the
// bottom like it does for the first one.
$collection[$tag_index]->name = 'tbody';
$collection[count($collection) - 1]->name = 'tbody';
$content[] = $collection;
}
break;
case 'colgroup':
$cols[] = $collection;
break;
}
$collection = array();
$is_collecting = false;
$tag_index = 0;
} else {
// add the node to the collection
$collection[] = $token;
}
}
// terminate
if ($token === false) {
switch ($node->name) {
case 'tbody':
$tbody_mode = true;
// fall through
case 'tr':
$content[] = $node;
$ws_accum =& $content;
break;
}
if ($is_child) {
// determine what we're dealing with
if ($token->name == 'col') {
// the only empty tag in the possie, we can handle it
// immediately
$cols[] = array_merge($collection, array($token));
$collection = array();
$is_collecting = false;
$tag_index = 0;
continue;
case 'caption':
// there can only be one caption!
if ($caption !== false) break;
$caption = $node;
$ws_accum =& $after_caption_ws;
break;
case 'thead':
$tbody_mode = true;
// XXX This breaks rendering properties with
// Firefox, which never floats a <thead> to
// the top. Ever. (Our scheme will float the
// first <thead> to the top.) So maybe
// <thead>s that are not first should be
// turned into <tbody>? Very tricky, indeed.
if ($thead === false) {
$thead = $node;
$ws_accum =& $after_thead_ws;
} else {
// Oops, there's a second one! What
// should we do? Current behavior is to
// transmutate the first and last entries into
// tbody tags, and then put into content.
// Maybe a better idea is to *attach
// it* to the existing thead or tfoot?
// We don't do this, because Firefox
// doesn't float an extra tfoot to the
// bottom like it does for the first one.
$node->name = 'tbody';
$content[] = $node;
$ws_accum =& $content;
}
switch ($token->name) {
case 'caption':
case 'colgroup':
case 'thead':
case 'tfoot':
case 'tbody':
case 'tr':
$is_collecting = true;
$collection[] = $token;
continue;
default:
// [WHITESPACE] Whitespace is added to the
// collection without triggering collection
// mode. This is a hack to make whitespace
// 'sticky' (that is to say, we ought /not/ to
// drop whitespace.)
if (!empty($token->is_whitespace)) {
$collection[] = $token;
$tag_index++;
}
continue;
break;
case 'tfoot':
// see above for some aveats
$tbody_mode = true;
if ($tfoot === false) {
$tfoot = $node;
$ws_accum =& $after_tfoot_ws;
} else {
$node->name = 'tbody';
$content[] = $node;
$ws_accum =& $content;
}
break;
case 'colgroup':
case 'col':
$cols[] = $node;
$ws_accum =& $cols;
break;
case '#PCDATA':
// How is whitespace handled? We treat is as sticky to
// the *end* of the previous element. So all of the
// nonsense we have worked on is to keep things
// together.
if (!empty($node->is_whitespace)) {
$ws_accum[] = $node;
}
break;
}
}
if (empty($content)) {
return false;
}
// INVARIANT: all members of content are non-empty. This can
// be shown by observing when things are pushed onto content:
// they are only ever pushed when is_collecting is true, and
// collection is the only thing ever pushed; but it is known
// that collections are non-empty when is_collecting is true.
$ret = array();
$ret = $initial_ws;
if ($caption !== false) {
$ret = array_merge($ret, $caption);
$ret[] = $caption;
$ret = array_merge($ret, $after_caption_ws);
}
if ($cols !== false) {
foreach ($cols as $token_array) {
$ret = array_merge($ret, $token_array);
}
$ret = array_merge($ret, $cols);
}
if ($thead !== false) {
$ret = array_merge($ret, $thead);
$ret[] = $thead;
$ret = array_merge($ret, $after_thead_ws);
}
if ($tfoot !== false) {
$ret = array_merge($ret, $tfoot);
$ret[] = $tfoot;
$ret = array_merge($ret, $after_tfoot_ws);
}
if ($tbody_mode) {
// a little tricky, since the start of the collection may be
// whitespace
$inside_tbody = false;
foreach ($content as $token_array) {
// find the starting token
// INVARIANT: token_array is not empty
$t = NULL;
foreach ($token_array as $t) {
if ($t->name === 'tr' || $t->name === 'tbody') {
break;
// we have to shuffle tr into tbody
$current_tr_tbody = null;
foreach($content as $node) {
switch ($node->name) {
case 'tbody':
$current_tr_tbody = null;
$ret[] = $node;
break;
case 'tr':
if ($current_tr_tbody === null) {
$current_tr_tbody = new HTMLPurifier_Node_Element('tbody');
$ret[] = $current_tr_tbody;
}
} // iterator variable carries over
if ($t->name === 'tr') {
if ($inside_tbody) {
$ret = array_merge($ret, $token_array);
$current_tr_tbody->children[] = $node;
break;
case '#PCDATA':
assert($node->is_whitespace);
if ($current_tr_tbody === null) {
$ret[] = $node;
} else {
$ret[] = new HTMLPurifier_Token_Start('tbody');
$ret = array_merge($ret, $token_array);
$inside_tbody = true;
$current_tr_tbody->children[] = $node;
}
} elseif ($t->name === 'tbody') {
if ($inside_tbody) {
$ret[] = new HTMLPurifier_Token_End('tbody');
$inside_tbody = false;
$ret = array_merge($ret, $token_array);
} else {
$ret = array_merge($ret, $token_array);
}
} else {
trigger_error("tr/tbody in content invariant failed in Table ChildDef", E_USER_ERROR);
break;
}
}
if ($inside_tbody) {
$ret[] = new HTMLPurifier_Token_End('tbody');
}
} else {
foreach ($content as $token_array) {
// invariant: everything in here is <tr>s
$ret = array_merge($ret, $token_array);
}
$ret = array_merge($ret, $content);
}
if (!empty($collection) && $is_collecting == false) {
// grab the trailing space
$ret = array_merge($ret, $collection);
}
array_pop($tokens_of_children); // remove phantom token
return ($ret === $tokens_of_children) ? true : $ret;
return $ret;
}
}