1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-07-31 03:10:09 +02:00

Rewrite FixNesting implementation to be tree-based.

This mega-patch rips out the FixNesting implementation and the related
ChildDef components.  The primary algorithmic change is to convert from
use of tokens to tree nodes, which are far more amenable to the style
of processing that FixNesting uses.  Additionally, FixNesting has been
changed to go bottom-up rather than top-down, in order to avoid needing
to implement backtracking.

This patch simplifies a good deal of the relevant logic, since we no
longer need to continually recalculate the nesting structure when
processing things.  However, the conversion to the alternate format
incurs some overhead, so for small inputs these changes are not a win.
One possibility to greatly reduce the constant factors here is to switch
to entirely using libxml's representation, and never serializing tokens;
this would require one to rewrite injectors, however.

The iterative post-order traversal in FixNesting is a bit subtle, but
we have essentially reified the stack and continuations.

We've removed support for %Core.EscapeInvalidChildren.

Signed-off-by: Edward Z. Yang <ezyang@mit.edu>
This commit is contained in:
Edward Z. Yang
2013-10-20 22:18:59 -07:00
parent b3640e1af6
commit 0767bbc12d
22 changed files with 358 additions and 698 deletions

View File

@@ -43,68 +43,50 @@ class HTMLPurifier_ChildDef_StrictBlockquote extends HTMLPurifier_ChildDef_Requi
}
/**
* @param array $tokens_of_children
* @param array $children
* @param HTMLPurifier_Config $config
* @param HTMLPurifier_Context $context
* @return array
*/
public function validateChildren($tokens_of_children, $config, $context)
public function validateChildren($children, $config, $context)
{
$this->init($config);
// trick the parent class into thinking it allows more
$this->elements = $this->fake_elements;
$result = parent::validateChildren($tokens_of_children, $config, $context);
$result = parent::validateChildren($children, $config, $context);
$this->elements = $this->real_elements;
if ($result === false) {
return array();
}
if ($result === true) {
$result = $tokens_of_children;
$result = $children;
}
$def = $config->getHTMLDefinition();
$block_wrap_start = new HTMLPurifier_Token_Start($def->info_block_wrapper);
$block_wrap_end = new HTMLPurifier_Token_End($def->info_block_wrapper);
$is_inline = false;
$depth = 0;
$block_wrap_name = $def->info_block_wrapper;
$block_wrap = false;
$ret = array();
// assuming that there are no comment tokens
foreach ($result as $i => $token) {
$token = $result[$i];
// ifs are nested for readability
if (!$is_inline) {
if (!$depth) {
if (($token instanceof HTMLPurifier_Token_Text && !$token->is_whitespace) ||
(!$token instanceof HTMLPurifier_Token_Text && !isset($this->elements[$token->name]))) {
$is_inline = true;
$ret[] = $block_wrap_start;
}
foreach ($result as $node) {
if ($block_wrap === false) {
if (($node instanceof HTMLPurifier_Node_Text && !$node->is_whitespace) ||
($node instanceof HTMLPurifier_Node_Element && !isset($this->elements[$node->name]))) {
$block_wrap = new HTMLPurifier_Node_Element($def->info_block_wrapper);
$ret[] = $block_wrap;
}
} else {
if (!$depth) {
// starting tokens have been inline text / empty
if ($token instanceof HTMLPurifier_Token_Start || $token instanceof HTMLPurifier_Token_Empty) {
if (isset($this->elements[$token->name])) {
// ended
$ret[] = $block_wrap_end;
$is_inline = false;
}
}
if ($node instanceof HTMLPurifier_Node_Element && isset($this->elements[$node->name])) {
$block_wrap = false;
}
}
$ret[] = $token;
if ($token instanceof HTMLPurifier_Token_Start) {
$depth++;
if ($block_wrap) {
$block_wrap->children[] = $node;
} else {
$ret[] = $node;
}
if ($token instanceof HTMLPurifier_Token_End) {
$depth--;
}
}
if ($is_inline) {
$ret[] = $block_wrap_end;
}
return $ret;
}