1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-08-05 21:57:26 +02:00

Implement %AutoFormat.RemoveEmpty, end to start ref, and injector rewind.

Injector rewind: Injectors can now use the method rewind() in order to move
the input index backwards, so that they can reprocess tokens (other injectors
are not affected by a rewind). This functionality was necessary to implement
nested node removals in %AutoFormat.RemoveEmpty.

End to start ref: To facilitate rewinding, HTMLPurifier_Token_End now
maintains a reference called $start to the starting token for its node.

%AutoFormat.RemoveEmpty removes empty nodes. Lots of people have requested
it, so here is a partially effective implementation. Because it is implemented
as an Injector, it cannot handle empty nodes that are newly introduced by
later validators, specifically auto-closing and child validation.
The Injector is only meant to be used on HTML-ish languages.

Signed-off-by: Edward Z. Yang <edwardzyang@thewritingpot.com>
This commit is contained in:
Edward Z. Yang
2008-06-27 16:09:14 -04:00
parent fd384129bf
commit 700d5bcbfc
12 changed files with 252 additions and 24 deletions

View File

@@ -165,6 +165,7 @@ require 'HTMLPurifier/HTMLModule/Tidy/XHTML.php';
require 'HTMLPurifier/Injector/AutoParagraph.php';
require 'HTMLPurifier/Injector/Linkify.php';
require 'HTMLPurifier/Injector/PurifierLinkify.php';
require 'HTMLPurifier/Injector/RemoveEmpty.php';
require 'HTMLPurifier/Injector/SafeObject.php';
require 'HTMLPurifier/Lexer/DOMLex.php';
require 'HTMLPurifier/Lexer/DirectLex.php';

View File

@@ -159,6 +159,7 @@ require_once $__dir . '/HTMLPurifier/HTMLModule/Tidy/XHTML.php';
require_once $__dir . '/HTMLPurifier/Injector/AutoParagraph.php';
require_once $__dir . '/HTMLPurifier/Injector/Linkify.php';
require_once $__dir . '/HTMLPurifier/Injector/PurifierLinkify.php';
require_once $__dir . '/HTMLPurifier/Injector/RemoveEmpty.php';
require_once $__dir . '/HTMLPurifier/Injector/SafeObject.php';
require_once $__dir . '/HTMLPurifier/Lexer/DOMLex.php';
require_once $__dir . '/HTMLPurifier/Lexer/DirectLex.php';

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,44 @@
AutoFormat.RemoveEmpty
TYPE: bool
VERSION: 3.1.2
DEFAULT: false
--DESCRIPTION--
<p>
When enabled, HTML Purifier will attempt to remove empty elements that
contribute no semantic information to the document. The following types
of nodes will be removed:
</p>
<ul><li>
Tags with no attributes and no content, and that are not empty
elements (remove <code>&lt;a&gt;&lt;/a&gt;</code> but not
<code>&lt;br /&gt;</code>), and
</li>
<li>
Tags with no content, except for:<ul>
<li>The <code>colgroup</code> element, or</li>
<li>
Elements with the <code>id</code> or <code>name</code> attribute,
when those attributes are permitted on those elements.
</li>
</ul></li>
</ul>
<p>
Please be very careful when using this functionality; while it may not
seem that empty elements contain useful information, they can alter the
layout of a document given appropriate styling. This directive is most
useful when you are processing machine-generated HTML; please avoid using
it on regular user HTML.
</p>
<p>
Elements that contain only whitespace will be treated as empty. Non-breaking
spaces, however, do not count as whitespace.
</p>
<p>
This algorithm is not perfect; you may still notice some empty tags,
particularly if a node had elements, but those elements were later removed
because they were not permitted in that context, or tags that, after
being auto-closed by another tag, were empty. This is for safety reasons
to prevent clever code from breaking validation. The general rule of thumb:
if a tag looked empty on the way in, it will get removed; if HTML Purifier
made it empty, it will stay.
</p>

View File

@@ -54,6 +54,30 @@ abstract class HTMLPurifier_Injector
*/
public $needed = array();
/**
* Index of inputTokens to rewind to.
*/
protected $rewind = false;
/**
* Rewind to a spot to re-perform processing. This is useful if you
* deleted a node, and now need to see if this change affected any
* earlier nodes. Rewinding does not affect other injectors, and can
* result in infinite loops if not used carefully.
*/
public function rewind($index) {
// Only records the requested index; nothing is moved here. The stored
// value stays pending until getRewind() returns it and resets it to false.
$this->rewind = $index;
}
/**
* Retrieves rewind, and then unsets it.
*/
public function getRewind() {
    // Read-and-clear: hand back whatever rewind target is pending (false
    // when none was requested) and reset the slot so that the same request
    // is never returned twice.
    $pending = $this->rewind;
    $this->rewind = false;
    return $pending;
}
/**
* Prepares the injector by giving it the config and context objects:
* this allows references to important variables to be made within

View File

@@ -0,0 +1,40 @@
<?php
/**
 * Injector that removes empty elements: a start tag that is followed
 * (ignoring whitespace-only text) by its own end tag, or by nothing at all.
 * The colgroup element, and elements that still carry an id or name
 * attribute after attribute validation, are left alone.
 */
class HTMLPurifier_Injector_RemoveEmpty extends HTMLPurifier_Injector
{

    /**
     * Config and context objects handed to prepare(), plus the attribute
     * validator created there. $attrValidator is declared explicitly so that
     * prepare() does not create a dynamic property.
     */
    private $context, $config, $attrValidator;

    /**
     * Stores the config and context for later use by handleElement() and
     * creates the attribute validator.
     * @param $config Configuration object, forwarded to the parent and kept
     * @param $context Context object, forwarded to the parent and kept
     */
    public function prepare($config, $context) {
        // NOTE(review): the parent's return value is discarded here, exactly
        // as before; confirm whether callers expect it to be propagated.
        parent::prepare($config, $context);
        $this->config = $config;
        $this->context = $context;
        $this->attrValidator = new HTMLPurifier_AttrValidator();
    }

    /**
     * Checks whether the start tag in $token opens an empty node and, if so,
     * requests its removal.
     * @param $token Current token, by reference. When the node is to be
     *        removed it is overwritten with an integer: the number of
     *        tokens, counted from the current input index, to delete.
     */
    public function handleElement(&$token) {
        if (!$token instanceof HTMLPurifier_Token_Start) return;

        // Look ahead past whitespace-only text. After the loop, $next is the
        // first non-whitespace token, the last token inspected if only
        // whitespace followed, or false if nothing follows at all. $i is the
        // index where the scan stopped.
        $next = false;
        for ($i = $this->inputIndex + 1, $c = count($this->inputTokens); $i < $c; $i++) {
            $next = $this->inputTokens[$i];
            if ($next instanceof HTMLPurifier_Token_Text && $next->is_whitespace) continue;
            break;
        }

        if (!$next || ($next instanceof HTMLPurifier_Token_End && $next->name == $token->name)) {
            if ($token->name == 'colgroup') return;

            // Validate attributes now, so that only id/name attributes that
            // survive validation can keep the element alive.
            $this->attrValidator->validateToken($token, $this->config, $this->context);
            // Presumably marks the token so attribute validation is not
            // repeated on it later; confirm against the validation strategy.
            $token->armor['ValidateAttributes'] = true;
            if (isset($token->attr['id']) || isset($token->attr['name'])) return;

            // Integer replacement: the number of tokens to delete, covering
            // the start tag through the token at index $i.
            $token = $i - $this->inputIndex + 1;

            // Walk backwards over whitespace-only text to find where
            // reprocessing should resume, so that a parent which has just
            // become empty gets examined again.
            for ($b = $this->inputIndex - 1; $b > 0; $b--) {
                $prev = $this->inputTokens[$b];
                if ($prev instanceof HTMLPurifier_Token_Text && $prev->is_whitespace) continue;
                break;
            }
            // This is safe because we removed the token that triggered this.
            $this->rewind($b - 1);
            return;
        }
    }

}

View File

@@ -100,7 +100,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// injector handler code; duplicated for performance reasons
foreach ($this->injectors as $i => $injector) {
if (!$injector->skip) $injector->handleText($token);
if (is_array($token)) {
if (is_array($token) || is_int($token)) {
$this->currentInjector = $i;
break;
}
@@ -144,7 +144,9 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
if ($e) $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', $parent);
// insert parent end tag before this tag;
// end tag isn't processed, but this tag is processed again
$this->insertBefore(new HTMLPurifier_Token_End($parent->name));
$new_token = new HTMLPurifier_Token_End($parent->name);
$new_token->start = $parent;
$this->insertBefore($new_token);
continue;
}
@@ -157,7 +159,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
if ($ok) {
foreach ($this->injectors as $i => $injector) {
if (!$injector->skip) $injector->handleElement($token);
if (is_array($token)) {
if (is_array($token) || is_int($token)) {
$this->currentInjector = $i;
break;
}
@@ -189,6 +191,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// first, check for the simplest case: everything closes neatly
$current_parent = array_pop($this->currentNesting);
if ($current_parent->name == $token->name) {
$token->start = $current_parent;
foreach ($this->injectors as $i => $injector) {
$injector->notifyEnd($token);
}
@@ -236,6 +239,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$i]);
}
$new_token = new HTMLPurifier_Token_End($skipped_tags[$i]->name);
$new_token->start = $skipped_tags[$i];
$this->insertAfter($new_token);
//printTokens($tokens, $this->inputIndex);
//var_dump($this->currentNesting);
@@ -261,6 +265,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
}
// instead of splice, since we know this is the end
$tokens[] = $new_token = new HTMLPurifier_Token_End($this->currentNesting[$i]->name);
$new_token->start = $this->currentNesting[$i];
foreach ($this->injectors as $injector) {
$injector->notifyEnd($new_token);
}
@@ -313,34 +318,59 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
* If $token is false, the current token is deleted.
*/
protected function processToken($token, $config, $context) {
if (is_array($token)) {
if (is_array($token) || is_int($token)) {
// the original token was overloaded by an injector, time
// to some fancy acrobatics
// $this->inputIndex is decremented so that the entire set gets
// re-processed
array_splice($this->inputTokens, $this->inputIndex--, 1, $token);
// adjust the injector skips based on the array substitution
if (is_array($token)) {
array_splice($this->inputTokens, $this->inputIndex, 1, $token);
} else {
array_splice($this->inputTokens, $this->inputIndex, $token, array());
}
if ($this->injectors) {
$offset = count($token);
for ($i = 0; $i <= $this->currentInjector; $i++) {
// because of the skip back, we need to add one more
// for uninitialized injectors. I'm not exactly
// sure why this is the case, but I think it has to
// do with the fact that we're decrementing skips
// before re-checking text
if (!$this->injectors[$i]->skip) $this->injectors[$i]->skip++;
$this->injectors[$i]->skip += $offset;
$rewind = $this->injectors[$this->currentInjector]->getRewind();
if ($rewind < 0) $rewind = 0;
if ($rewind !== false) {
$offset = $this->inputIndex - $rewind;
if ($this->injectors) {
foreach ($this->injectors as $i => $injector) {
if ($i == $this->currentInjector) {
$injector->skip = 0;
} else {
$injector->skip += $offset;
}
}
}
for ($this->inputIndex--; $this->inputIndex >= $rewind; $this->inputIndex--) {
$prev = $this->inputTokens[$this->inputIndex];
if ($prev instanceof HTMLPurifier_Token_Start) array_pop($this->currentNesting);
elseif ($prev instanceof HTMLPurifier_Token_End) $this->currentNesting[] = $prev->start;
}
$this->inputIndex++;
} else {
// adjust the injector skips based on the array substitution
$offset = is_array($token) ? count($token) : 0;
for ($i = 0; $i <= $this->currentInjector; $i++) {
// because of the skip back, we need to add one more
// for uninitialized injectors. I'm not exactly
// sure why this is the case, but I think it has to
// do with the fact that we're decrementing skips
// before re-checking text
if (!$this->injectors[$i]->skip) $this->injectors[$i]->skip++;
$this->injectors[$i]->skip += $offset;
}
}
}
// ensure that we reprocess these tokens with the other injectors
--$this->inputIndex;
} elseif ($token) {
// regular case
$this->swap($token);
if ($token instanceof HTMLPurifier_Token_Start) {
$this->currentNesting[] = $token;
} elseif ($token instanceof HTMLPurifier_Token_End) {
array_pop($this->currentNesting); // not actually used
// not actually used
$token->start = array_pop($this->currentNesting);
}
} else {
$this->remove();

View File

@@ -9,5 +9,9 @@
*/
class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
{
/**
* Start token that opened the node this end tag closes. Assigned by
* HTMLPurifier_Strategy_MakeWellFormed as it matches or generates end
* tags, and read back when an injector rewind walks backwards over this
* token to restore the nesting stack. It has no default value, so it is
* null until MakeWellFormed assigns it. Please do not edit this!
*/
public $start;
}