1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-08-03 04:37:39 +02:00

svn:eol-style = native

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@97 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang
2006-07-23 00:11:03 +00:00
parent 39c16f5cfd
commit 14f481bcf6
21 changed files with 3219 additions and 3219 deletions

View File

@@ -1,11 +1,11 @@
<?php
/**
 * Minimal container that stores whatever definition value it is
 * constructed with.
 */
class HTMLPurifier_AttrDef
{
    /** Definition value exactly as handed to the constructor. */
    var $def;

    /**
     * PHP 4 style constructor.
     *
     * @param mixed $def Definition to keep; stored without modification.
     */
    function HTMLPurifier_AttrDef($def)
    {
        $this->def = $def;
    }
}
<?php
/**
 * Minimal container that stores whatever definition value it is
 * constructed with.
 */
class HTMLPurifier_AttrDef
{
    /** Definition value exactly as handed to the constructor. */
    var $def;

    /**
     * PHP 4 style constructor.
     *
     * @param mixed $def Definition to keep; stored without modification.
     */
    function HTMLPurifier_AttrDef($def)
    {
        $this->def = $def;
    }
}
?>

View File

@@ -1,169 +1,169 @@
<?php
// HTMLPurifier_ChildDef and inheritance have three types of output:
// true = leave nodes as is
// false = delete parent node and all children
// array(...) = replace children nodes with these
// this is the hardest one to implement. We'll use fancy regexp tricks
// right now, we only expect it to return TRUE or FALSE (it won't attempt
// to fix the tree)
// we may end up writing custom code for each HTML case
// in order to make it self correcting
/**
 * Checks the direct children of an element against a DTD-style content
 * model by translating that model into a PCRE pattern.
 *
 * validateChildren() in this hierarchy can answer in three ways:
 *   true       = leave the child nodes as they are
 *   false      = delete the parent node and all of its children
 *   array(...) = replace the children with the returned tokens
 * This base implementation only ever answers true or false.
 */
class HTMLPurifier_ChildDef
{
    /** Kind of definition; subclasses override this value. */
    var $type = 'custom';

    /** Content model exactly as passed to the constructor. */
    var $dtd_regex;

    /** PCRE body (no delimiters/anchors) built by _compileRegex(). */
    var $_pcre_regex;

    /**
     * PHP 4 style constructor.
     *
     * @param string $dtd_regex DTD-style content model, e.g. "(a, b)".
     */
    function HTMLPurifier_ChildDef($dtd_regex)
    {
        $this->dtd_regex = $dtd_regex;
        $this->_compileRegex();
    }

    /**
     * Converts $this->dtd_regex into $this->_pcre_regex.
     *
     * Spaces are removed, the model is wrapped in parentheses when it does
     * not already start with one, every "," becomes optional and every
     * element name is wrapped as "(,?name)" so it can match inside the
     * comma separated child list built by validateChildren().
     */
    function _compileRegex()
    {
        $raw = str_replace(' ', '', $this->dtd_regex);
        // Square-bracket offset instead of the removed $raw{0} syntax; the
        // emptiness guard avoids an "uninitialized string offset" notice.
        if ($raw === '' || $raw[0] != '(') {
            $raw = "($raw)";
        }
        $reg = str_replace(',', ',?', $raw);
        $reg = preg_replace('/([#a-zA-Z0-9_.-]+)/', '(,?\\0)', $reg);
        $this->_pcre_regex = $reg;
    }

    /**
     * Tests whether the direct children match the compiled content model.
     *
     * @param array $tokens_of_children Tokens between the parent's start and
     *                                  end tag, descendants included.
     * @return bool true when the comma separated list of direct child names
     *              matches the pattern, false otherwise.
     */
    function validateChildren($tokens_of_children)
    {
        $list_of_children = '';
        $nesting = 0; // depth into the nest
        foreach ($tokens_of_children as $token) {
            if (!empty($token->is_whitespace)) continue;

            // a token is a direct child when it is seen at depth zero,
            // i.e. before its own start tag increases the depth
            $is_child = ($nesting == 0);

            if ($token->type == 'start') {
                $nesting++;
            } elseif ($token->type == 'end') {
                $nesting--;
            }

            if ($is_child) {
                $list_of_children .= $token->name . ',';
            }
        }
        $list_of_children = rtrim($list_of_children, ',');

        $okay = preg_match(
            '/^' . $this->_pcre_regex . '$/',
            $list_of_children
        );

        return (bool) $okay;
    }
}
/**
 * Base for child definitions that are described by a plain set of allowed
 * element names instead of a DTD expression. It only stores the lookup
 * table and a generator; validateChildren() must be supplied by subclasses.
 */
class HTMLPurifier_ChildDef_Simple extends HTMLPurifier_ChildDef
{
    /** Lookup table: allowed element name => true. */
    var $elements = array();

    /** HTMLPurifier_Generator instance created in the constructor. */
    var $gen;

    /**
     * PHP 4 style constructor.
     *
     * @param string|array $elements Either a "|" separated list of names
     *                               (spaces are ignored) or an array of names.
     */
    function HTMLPurifier_ChildDef_Simple($elements)
    {
        if (is_string($elements)) {
            $elements = str_replace(' ', '', $elements);
            $elements = explode('|', $elements);
        }
        // turn the list of names into name => true for isset() lookups
        $elements = array_flip($elements);
        foreach ($elements as $i => $x) $elements[$i] = true;
        $this->elements = $elements;
        $this->gen = new HTMLPurifier_Generator();
    }

    /**
     * Abstract placeholder: always raises E_USER_ERROR.
     *
     * The parameter mirrors the signature used by the base class and by the
     * subclasses so that the overrides stay compatible with each other.
     *
     * @param array $tokens_of_children Unused here.
     */
    function validateChildren($tokens_of_children)
    {
        trigger_error('Cannot call abstract function!', E_USER_ERROR);
    }
}
/**
 * Child definition for elements that need at least one allowed child.
 */
class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef_Simple
{
    var $type = 'required';

    /**
     * Filters the children down to the allowed element set.
     *
     * @param array $tokens_of_children Tokens between the parent's tags.
     * @return bool|array false when the parent should be deleted (no tokens,
     *                    nothing kept, or whitespace only), true when the
     *                    children can stay untouched, otherwise the
     *                    replacement token list.
     */
    function validateChildren($tokens_of_children)
    {
        // no tokens at all: the parent node gets deleted
        if (empty($tokens_of_children)) {
            return false;
        }

        $kept = array();          // replacement children
        $depth = 0;               // current nesting depth
        $dropping = false;        // inside a disallowed direct child?
        // when character data is allowed, rejected tags are re-emitted as
        // text built by the generator instead of being silently dropped
        $text_ok = isset($this->elements['#PCDATA']);
        $only_whitespace = true;  // sanity flag: did we see anything else?

        foreach ($tokens_of_children as $token) {
            // whitespace is always carried over
            if (!empty($token->is_whitespace)) {
                $kept[] = $token;
                continue;
            }
            $only_whitespace = false;

            // depth is inspected before this token adjusts it
            $at_top = ($depth == 0);
            if ($token->type == 'start') {
                $depth++;
            } elseif ($token->type == 'end') {
                $depth--;
            }

            // each direct child decides afresh whether its subtree is dropped
            if ($at_top) {
                $dropping = !isset($this->elements[$token->name]);
            }

            if (!$dropping) {
                $kept[] = $token;
            } elseif ($text_ok) {
                $kept[] = new HTMLPurifier_Token_Text(
                    $this->gen->generateFromToken($token)
                );
            }
            // otherwise: drop silently
        }

        if (empty($kept) || $only_whitespace) {
            return false;
        }
        if ($tokens_of_children == $kept) {
            return true;
        }
        return $kept;
    }
}
// only altered behavior is that it returns an empty array
// instead of a false (to delete the node)
/**
 * Same filtering as HTMLPurifier_ChildDef_Required, except that a "delete
 * the parent" verdict (false) is turned into an empty replacement list.
 */
class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
{
    var $type = 'optional';

    /**
     * @param array $tokens_of_children Tokens between the parent's tags.
     * @return bool|array true, a replacement token list, or array() where
     *                    the parent implementation would have returned false.
     */
    function validateChildren($tokens_of_children)
    {
        $outcome = parent::validateChildren($tokens_of_children);
        return ($outcome === false) ? array() : $outcome;
    }
}
// placeholder
/**
 * Placeholder child definition for elements whose type is 'empty'.
 */
class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef
{
    var $type = 'empty';

    /** Intentionally empty: no content model has to be compiled. */
    function HTMLPurifier_ChildDef_Empty() {}

    /**
     * Always answers false.
     *
     * The optional parameter keeps the method compatible with the base
     * class signature while still allowing calls without arguments.
     *
     * @param array $tokens_of_children Ignored.
     * @return bool Always false.
     */
    function validateChildren($tokens_of_children = array())
    {
        return false;
    }
}
<?php
// HTMLPurifier_ChildDef and inheritance have three types of output:
// true = leave nodes as is
// false = delete parent node and all children
// array(...) = replace children nodes with these
// this is the hardest one to implement. We'll use fancy regexp tricks
// right now, we only expect it to return TRUE or FALSE (it won't attempt
// to fix the tree)
// we may end up writing custom code for each HTML case
// in order to make it self correcting
/**
 * Checks the direct children of an element against a DTD-style content
 * model by translating that model into a PCRE pattern.
 *
 * validateChildren() in this hierarchy can answer in three ways:
 *   true       = leave the child nodes as they are
 *   false      = delete the parent node and all of its children
 *   array(...) = replace the children with the returned tokens
 * This base implementation only ever answers true or false.
 */
class HTMLPurifier_ChildDef
{
    /** Kind of definition; subclasses override this value. */
    var $type = 'custom';

    /** Content model exactly as passed to the constructor. */
    var $dtd_regex;

    /** PCRE body (no delimiters/anchors) built by _compileRegex(). */
    var $_pcre_regex;

    /**
     * PHP 4 style constructor.
     *
     * @param string $dtd_regex DTD-style content model, e.g. "(a, b)".
     */
    function HTMLPurifier_ChildDef($dtd_regex)
    {
        $this->dtd_regex = $dtd_regex;
        $this->_compileRegex();
    }

    /**
     * Converts $this->dtd_regex into $this->_pcre_regex.
     *
     * Spaces are removed, the model is wrapped in parentheses when it does
     * not already start with one, every "," becomes optional and every
     * element name is wrapped as "(,?name)" so it can match inside the
     * comma separated child list built by validateChildren().
     */
    function _compileRegex()
    {
        $raw = str_replace(' ', '', $this->dtd_regex);
        // Square-bracket offset instead of the removed $raw{0} syntax; the
        // emptiness guard avoids an "uninitialized string offset" notice.
        if ($raw === '' || $raw[0] != '(') {
            $raw = "($raw)";
        }
        $reg = str_replace(',', ',?', $raw);
        $reg = preg_replace('/([#a-zA-Z0-9_.-]+)/', '(,?\\0)', $reg);
        $this->_pcre_regex = $reg;
    }

    /**
     * Tests whether the direct children match the compiled content model.
     *
     * @param array $tokens_of_children Tokens between the parent's start and
     *                                  end tag, descendants included.
     * @return bool true when the comma separated list of direct child names
     *              matches the pattern, false otherwise.
     */
    function validateChildren($tokens_of_children)
    {
        $list_of_children = '';
        $nesting = 0; // depth into the nest
        foreach ($tokens_of_children as $token) {
            if (!empty($token->is_whitespace)) continue;

            // a token is a direct child when it is seen at depth zero,
            // i.e. before its own start tag increases the depth
            $is_child = ($nesting == 0);

            if ($token->type == 'start') {
                $nesting++;
            } elseif ($token->type == 'end') {
                $nesting--;
            }

            if ($is_child) {
                $list_of_children .= $token->name . ',';
            }
        }
        $list_of_children = rtrim($list_of_children, ',');

        $okay = preg_match(
            '/^' . $this->_pcre_regex . '$/',
            $list_of_children
        );

        return (bool) $okay;
    }
}
/**
 * Base for child definitions that are described by a plain set of allowed
 * element names instead of a DTD expression. It only stores the lookup
 * table and a generator; validateChildren() must be supplied by subclasses.
 */
class HTMLPurifier_ChildDef_Simple extends HTMLPurifier_ChildDef
{
    /** Lookup table: allowed element name => true. */
    var $elements = array();

    /** HTMLPurifier_Generator instance created in the constructor. */
    var $gen;

    /**
     * PHP 4 style constructor.
     *
     * @param string|array $elements Either a "|" separated list of names
     *                               (spaces are ignored) or an array of names.
     */
    function HTMLPurifier_ChildDef_Simple($elements)
    {
        if (is_string($elements)) {
            $elements = str_replace(' ', '', $elements);
            $elements = explode('|', $elements);
        }
        // turn the list of names into name => true for isset() lookups
        $elements = array_flip($elements);
        foreach ($elements as $i => $x) $elements[$i] = true;
        $this->elements = $elements;
        $this->gen = new HTMLPurifier_Generator();
    }

    /**
     * Abstract placeholder: always raises E_USER_ERROR.
     *
     * The parameter mirrors the signature used by the base class and by the
     * subclasses so that the overrides stay compatible with each other.
     *
     * @param array $tokens_of_children Unused here.
     */
    function validateChildren($tokens_of_children)
    {
        trigger_error('Cannot call abstract function!', E_USER_ERROR);
    }
}
/**
 * Child definition for elements that need at least one allowed child.
 */
class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef_Simple
{
    var $type = 'required';

    /**
     * Filters the children down to the allowed element set.
     *
     * @param array $tokens_of_children Tokens between the parent's tags.
     * @return bool|array false when the parent should be deleted (no tokens,
     *                    nothing kept, or whitespace only), true when the
     *                    children can stay untouched, otherwise the
     *                    replacement token list.
     */
    function validateChildren($tokens_of_children)
    {
        // no tokens at all: the parent node gets deleted
        if (empty($tokens_of_children)) {
            return false;
        }

        $kept = array();          // replacement children
        $depth = 0;               // current nesting depth
        $dropping = false;        // inside a disallowed direct child?
        // when character data is allowed, rejected tags are re-emitted as
        // text built by the generator instead of being silently dropped
        $text_ok = isset($this->elements['#PCDATA']);
        $only_whitespace = true;  // sanity flag: did we see anything else?

        foreach ($tokens_of_children as $token) {
            // whitespace is always carried over
            if (!empty($token->is_whitespace)) {
                $kept[] = $token;
                continue;
            }
            $only_whitespace = false;

            // depth is inspected before this token adjusts it
            $at_top = ($depth == 0);
            if ($token->type == 'start') {
                $depth++;
            } elseif ($token->type == 'end') {
                $depth--;
            }

            // each direct child decides afresh whether its subtree is dropped
            if ($at_top) {
                $dropping = !isset($this->elements[$token->name]);
            }

            if (!$dropping) {
                $kept[] = $token;
            } elseif ($text_ok) {
                $kept[] = new HTMLPurifier_Token_Text(
                    $this->gen->generateFromToken($token)
                );
            }
            // otherwise: drop silently
        }

        if (empty($kept) || $only_whitespace) {
            return false;
        }
        if ($tokens_of_children == $kept) {
            return true;
        }
        return $kept;
    }
}
// only altered behavior is that it returns an empty array
// instead of a false (to delete the node)
/**
 * Same filtering as HTMLPurifier_ChildDef_Required, except that a "delete
 * the parent" verdict (false) is turned into an empty replacement list.
 */
class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
{
    var $type = 'optional';

    /**
     * @param array $tokens_of_children Tokens between the parent's tags.
     * @return bool|array true, a replacement token list, or array() where
     *                    the parent implementation would have returned false.
     */
    function validateChildren($tokens_of_children)
    {
        $outcome = parent::validateChildren($tokens_of_children);
        return ($outcome === false) ? array() : $outcome;
    }
}
// placeholder
/**
 * Placeholder child definition for elements whose type is 'empty'.
 */
class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef
{
    var $type = 'empty';

    /** Intentionally empty: no content model has to be compiled. */
    function HTMLPurifier_ChildDef_Empty() {}

    /**
     * Always answers false.
     *
     * The optional parameter keeps the method compatible with the base
     * class signature while still allowing calls without arguments.
     *
     * @param array $tokens_of_children Ignored.
     * @return bool Always false.
     */
    function validateChildren($tokens_of_children = array())
    {
        return false;
    }
}
?>

View File

@@ -1,445 +1,445 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/ChildDef.php';
require_once 'HTMLPurifier/Generator.php';
require_once 'HTMLPurifier/Token.php';
class HTMLPurifier_Definition
{
var $generator;
var $info = array();
var $info_closes_p = array(
// these are all block elements: blocks aren't allowed in P
'address' => true,
'blockquote' => true,
'dd' => true,
'dir' => true,
'div' => true,
'dl' => true,
'dt' => true,
'h1' => true,
'h2' => true,
'h3' => true,
'h4' => true,
'h5' => true,
'h6' => true,
'hr' => true,
'ol' => true,
'p' => true,
'pre' => true,
'table' => true,
'ul' => true
);
// PHP 4 style constructor: creates the generator that the other methods
// use (via generateFromToken) to turn rejected tags into text tokens.
function HTMLPurifier_Definition() {
$this->generator = new HTMLPurifier_Generator();
}
/**
 * Fills $this->info with one HTMLPurifier_ElementDef per supported element
 * name. The content models are assembled from "|" separated name lists
 * that follow the entity structure of the DTD.
 */
function loadData() {
    // emulates the structure of the DTD

    // entities: prefixed with e_ and _ replaces .
    // we don't use an array because that complicates interpolation
    // strings are used instead of arrays because if you use arrays,
    // you have to do some hideous manipulation with array_merge()

    // these are condensed, remember, with bad stuff taken out

    // transforms: font, menu, dir, center

    // DON'T MONKEY AROUND THIS unless you know what you are doing
    // and also know the assumptions the code makes about what this
    // contains for optimization purposes (see fixNesting)

    $e_special_extra = 'img';
    $e_special_basic = 'br | span | bdo';
    $e_special = "$e_special_basic | $e_special_extra";
    $e_fontstyle_extra = 'big | small';
    $e_fontstyle_basic = 'tt | i | b | u | s | strike';
    $e_fontstyle = "$e_fontstyle_basic | $e_fontstyle_extra";
    $e_phrase_extra = 'sub | sup';
    $e_phrase_basic = 'em | strong | dfn | code | q | samp | kbd | var'.
        ' | cite | abbr | acronym';
    $e_phrase = "$e_phrase_basic | $e_phrase_extra";
    $e_inline_forms = ''; // humor the dtd
    $e_misc_inline = 'ins | del';
    $e_misc = "$e_misc_inline";
    $e_inline = "a | $e_special | $e_fontstyle | $e_phrase".
        " | $e_inline_forms";
    // note the casing
    $e_Inline = new HTMLPurifier_ChildDef_Optional("#PCDATA | $e_inline".
        " | $e_misc_inline");
    $e_heading = 'h1|h2|h3|h4|h5|h6';
    $e_lists = 'ul | ol | dl';
    $e_blocktext = 'pre | hr | blockquote | address';
    $e_block = "p | $e_heading | div | $e_lists | $e_blocktext | table";
    $e_Flow = new HTMLPurifier_ChildDef_Optional("#PCDATA | $e_block".
        " | $e_inline | $e_misc");
    $e_a_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | $e_special".
        " | $e_fontstyle | $e_phrase | $e_inline_forms | $e_misc_inline");
    $e_pre_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | a".
        " | $e_special_basic | $e_fontstyle_basic | $e_phrase_basic".
        " | $e_inline_forms | $e_misc_inline");
    $e_form_content = new HTMLPurifier_ChildDef_Optional(''); //unused
    $e_form_button_content = new HTMLPurifier_ChildDef_Optional(''); // unused

    // elements whose content model is the flow set
    $this->info['ins'] =
    $this->info['del'] =
    $this->info['blockquote'] =
    $this->info['dd'] =
    $this->info['li'] =
    $this->info['div'] = new HTMLPurifier_ElementDef($e_Flow);

    // elements whose content model is the inline set
    // (each key is listed once; repeated entries were redundant)
    $this->info['em'] =
    $this->info['strong'] =
    $this->info['dfn'] =
    $this->info['code'] =
    $this->info['samp'] =
    $this->info['kbd'] =
    $this->info['var'] =
    $this->info['cite'] =
    $this->info['abbr'] =
    $this->info['acronym'] =
    $this->info['q'] =
    $this->info['sub'] =
    $this->info['tt'] =
    $this->info['sup'] =
    $this->info['i'] =
    $this->info['b'] =
    $this->info['big'] =
    $this->info['small'] =
    $this->info['u'] =
    $this->info['s'] =
    $this->info['strike'] =
    $this->info['bdo'] =
    $this->info['span'] =
    $this->info['dt'] =
    $this->info['p'] =
    $this->info['h1'] =
    $this->info['h2'] =
    $this->info['h3'] =
    $this->info['h4'] =
    $this->info['h5'] =
    $this->info['h6'] = new HTMLPurifier_ElementDef($e_Inline);

    // lists must contain at least one item
    $this->info['ol'] =
    $this->info['ul'] =
        new HTMLPurifier_ElementDef(
            new HTMLPurifier_ChildDef_Required('li')
        );

    $this->info['dl'] =
        new HTMLPurifier_ElementDef(
            new HTMLPurifier_ChildDef_Required('dt|dd')
        );

    $this->info['address'] =
        new HTMLPurifier_ElementDef(
            new HTMLPurifier_ChildDef_Optional("#PCDATA | p | $e_inline".
                " | $e_misc_inline")
        );

    // elements without content
    $this->info['img'] =
    $this->info['br'] =
    $this->info['hr'] = new HTMLPurifier_ElementDef(new HTMLPurifier_ChildDef_Empty());

    $this->info['pre'] = new HTMLPurifier_ElementDef($e_pre_content);

    $this->info['a'] = new HTMLPurifier_ElementDef($e_a_content);
}
/**
 * Runs the token list through every purification stage in order:
 * removeForeignElements, makeWellFormed, fixNesting, validateAttributes.
 *
 * @param array $tokens Token list to purify.
 * @return mixed Whatever the final stage (validateAttributes) returns.
 */
function purifyTokens($tokens) {
    if (empty($this->info)) {
        $this->loadData();
    }
    $known_only  = $this->removeForeignElements($tokens);
    $well_formed = $this->makeWellFormed($known_only);
    $nested      = $this->fixNesting($well_formed);
    return $this->validateAttributes($nested);
}
/**
 * Drops everything that is neither a tag nor text, and replaces tags whose
 * name has no entry in $this->info with a text token holding the tag's
 * generated HTML.
 *
 * @param array $tokens Token list.
 * @return array Filtered token list.
 */
function removeForeignElements($tokens) {
    if (empty($this->info)) {
        $this->loadData();
    }
    $kept = array();
    foreach ($tokens as $token) {
        if (!empty($token->is_tag)) {
            if (!isset($this->info[$token->name])) {
                // invalid tag, generate HTML and insert in
                $token = new HTMLPurifier_Token_Text(
                    $this->generator->generateFromToken($token)
                );
            }
            $kept[] = $token;
        } elseif ($token->type == 'text') {
            $kept[] = $token;
        }
        // comments and every other token type are stripped
    }
    return $kept;
}
// Balances the tag structure of a token list and returns the corrected list.
//
// Non-tag tokens pass through unchanged. For tags, the method
//  - converts start tokens of 'empty'-type elements into empty tokens and
//    empty tokens of other elements into a start/end pair,
//  - closes an open <p> (or <li>) when a start tag listed in $info_closes_p
//    (or another <li>) arrives directly inside it,
//  - turns end tags that match nothing on the open-element stack into text,
//  - emits the missing end tags when an end tag closes an outer element,
//  - and finally closes everything that is still open.
// Every tag name is looked up in $this->info without an isset() check.
// NOTE(review): presumably relies on removeForeignElements() having run
// first, as it does in purifyTokens() — confirm for other callers.
function makeWellFormed($tokens) {
if (empty($this->info)) $this->loadData();
$result = array();
// stack of start tokens that have not been closed yet
$current_nesting = array();
foreach ($tokens as $token) {
if (empty( $token->is_tag )) {
$result[] = $token;
continue;
}
$info = $this->info[$token->name]; // assumption but valid
// test if it claims to be a start tag but is empty
if ($info->child_def->type == 'empty' &&
$token->type == 'start' ) {
$result[] = new HTMLPurifier_Token_Empty($token->name,
$token->attributes);
continue;
}
// test if it claims to be empty but really is a start tag
if ($info->child_def->type != 'empty' &&
$token->type == 'empty' ) {
$result[] = new HTMLPurifier_Token_Start($token->name,
$token->attributes);
$result[] = new HTMLPurifier_Token_End($token->name);
continue;
}
// automatically insert empty tags
if ($token->type == 'empty') {
$result[] = $token;
continue;
}
// we give start tags precedence, so automatically accept unless...
// it's one of those special cases
if ($token->type == 'start') {
// if there's a parent, check for special case
if (!empty($current_nesting)) {
// the parent is popped for inspection; it stays popped when one of
// the auto-close cases below applies and is pushed back otherwise
$current_parent = array_pop($current_nesting);
// check if we're closing a P tag
if ($current_parent->name == 'p' &&
isset($this->info_closes_p[$token->name])
) {
$result[] = new HTMLPurifier_Token_End('p');
$result[] = $token;
$current_nesting[] = $token;
continue;
}
// check if we're closing a LI tag
if ($current_parent->name == 'li' &&
$token->name == 'li'
) {
$result[] = new HTMLPurifier_Token_End('li');
$result[] = $token;
$current_nesting[] = $token;
continue;
}
// this is more TIDY stuff
// we should also get some TABLE related code
// mismatched h#
$current_nesting[] = $current_parent; // undo the pop
}
$result[] = $token;
$current_nesting[] = $token;
continue;
}
// sanity check
if ($token->type != 'end') continue;
// okay, we're dealing with a closing tag
// make sure that we have something open
if (empty($current_nesting)) {
$result[] = new HTMLPurifier_Token_Text(
$this->generator->generateFromToken($token)
);
continue;
}
// first, check for the simplest case: everything closes neatly
// current_nesting is modified
$current_parent = array_pop($current_nesting);
if ($current_parent->name == $token->name) {
$result[] = $token;
continue;
}
// undo the array_pop
$current_nesting[] = $current_parent;
// okay, so we're trying to close the wrong tag
// scroll back the entire nest, trying to find our tag
// feature could be to specify how far you'd like to go
$size = count($current_nesting);
// -2 because -1 is the last element, but we already checked that
$skipped_tags = false;
for ($i = $size - 2; $i >= 0; $i--) {
if ($current_nesting[$i]->name == $token->name) {
// current nesting is modified
// array_splice removes index $i and everything after it from the
// stack and returns the removed tokens
$skipped_tags = array_splice($current_nesting, $i);
break;
}
}
// we still didn't find the tag, so translate to text
if ($skipped_tags === false) {
$result[] = new HTMLPurifier_Token_Text(
$this->generator->generateFromToken($token)
);
continue;
}
// okay, we found it, close all the skipped tags
// note that skipped tags contains the element we need closed
$size = count($skipped_tags);
// innermost element first, so the end tags come out properly nested
for ($i = $size - 1; $i >= 0; $i--) {
$result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name);
}
// done!
}
// we're at the end now, fix all still unclosed tags
if (!empty($current_nesting)) {
$size = count($current_nesting);
for ($i = $size - 1; $i >= 0; $i--) {
$result[] =
new HTMLPurifier_Token_End($current_nesting[$i]->name);
}
}
return $result;
}
// Validates the content of every element against its child definition and
// applies the verdict in place.
//
// The list is wrapped in a temporary <div> start/end pair so that top-level
// tokens have a parent. For each start token the method collects all tokens
// up to the matching end token, calls validateChildren() on the element's
// child_def and then keeps the node (true), removes the whole node (false)
// or replaces its content with the returned array. The wrapper tokens are
// removed before returning.
// NOTE(review): the inner scan has no bounds check, so it presumably relies
// on balanced start/end tokens such as makeWellFormed() produces — confirm.
function fixNesting($tokens) {
if (empty($this->info)) $this->loadData();
// insert implicit "parent" node, will be removed at end
array_unshift($tokens, new HTMLPurifier_Token_Start('div'));
$tokens[] = new HTMLPurifier_Token_End('div');
// $size is maintained by hand because $tokens is spliced inside the loop
for ($i = 0, $size = count($tokens) ; $i < $size; ) {
$child_tokens = array();
// scroll to the end of this node, and report number
for ($j = $i, $depth = 0; ; $j++) {
if ($tokens[$j]->type == 'start') {
$depth++;
// skip token assignment on first iteration
if ($depth == 1) continue;
} elseif ($tokens[$j]->type == 'end') {
$depth--;
// skip token assignment on last iteration
if ($depth == 0) break;
}
$child_tokens[] = $tokens[$j];
}
// $i is index of start token
// $j is index of end token
// have DTD child def validate children
$element_def = $this->info[$tokens[$i]->name];
$result = $element_def->child_def->validateChildren($child_tokens);
// process result
if ($result === true) {
// leave the nodes as is
} elseif($result === false) {
// WARNING WARNING WARNING!!!
// While for the original DTD, there will never be
// cascading removal, more complex ones may have such
// a problem.
// If you modify the info array such that an element
// that requires children may contain a child that requires
// children, you need to also scroll back and re-check that
// elements parent node
// start token, content and end token
$length = $j - $i + 1;
// remove entire node
array_splice($tokens, $i, $length);
// change size
$size -= $length;
// ensure that we scroll to the next node
$i--;
} else {
// content only: start and end token stay in place
$length = $j - $i - 1;
// replace node with $result
array_splice($tokens, $i + 1, $length, $result);
// change size
$size -= $length;
$size += count($result);
}
// scroll to next node
$i++;
while ($i < $size and $tokens[$i]->type != 'start') $i++;
}
// remove implicit divs
array_shift($tokens);
array_pop($tokens);
return $tokens;
}
/**
 * Attribute validation stage. No attribute checks are implemented yet, so
 * the tokens are handed back unchanged.
 *
 * @param array $tokens Token list.
 * @return array The same token list.
 */
function validateAttributes($tokens) {
    if (empty($this->info)) $this->loadData();
    // Without this return the method yields null, and purifyTokens(),
    // which returns this method's result, would lose the whole token list.
    return $tokens;
}
}
/**
 * Pairs an element's child definition with its attribute definitions.
 */
class HTMLPurifier_ElementDef
{
    /** Child definition object handed to the constructor. */
    var $child_def;

    /** Attribute definitions; empty array unless supplied. */
    var $attr_def = array();

    /**
     * PHP 4 style constructor.
     *
     * @param mixed $child_def Child definition to store.
     * @param array $attr_def  Attribute definitions to store.
     */
    function HTMLPurifier_ElementDef($child_def, $attr_def = array())
    {
        $this->attr_def = $attr_def;
        $this->child_def = $child_def;
    }
}
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/ChildDef.php';
require_once 'HTMLPurifier/Generator.php';
require_once 'HTMLPurifier/Token.php';
class HTMLPurifier_Definition
{
var $generator;
var $info = array();
var $info_closes_p = array(
// these are all block elements: blocks aren't allowed in P
'address' => true,
'blockquote' => true,
'dd' => true,
'dir' => true,
'div' => true,
'dl' => true,
'dt' => true,
'h1' => true,
'h2' => true,
'h3' => true,
'h4' => true,
'h5' => true,
'h6' => true,
'hr' => true,
'ol' => true,
'p' => true,
'pre' => true,
'table' => true,
'ul' => true
);
// PHP 4 style constructor: creates the generator that the other methods
// use (via generateFromToken) to turn rejected tags into text tokens.
function HTMLPurifier_Definition() {
$this->generator = new HTMLPurifier_Generator();
}
/**
 * Fills $this->info with one HTMLPurifier_ElementDef per supported element
 * name. The content models are assembled from "|" separated name lists
 * that follow the entity structure of the DTD.
 */
function loadData() {
    // emulates the structure of the DTD

    // entities: prefixed with e_ and _ replaces .
    // we don't use an array because that complicates interpolation
    // strings are used instead of arrays because if you use arrays,
    // you have to do some hideous manipulation with array_merge()

    // these are condensed, remember, with bad stuff taken out

    // transforms: font, menu, dir, center

    // DON'T MONKEY AROUND THIS unless you know what you are doing
    // and also know the assumptions the code makes about what this
    // contains for optimization purposes (see fixNesting)

    $e_special_extra = 'img';
    $e_special_basic = 'br | span | bdo';
    $e_special = "$e_special_basic | $e_special_extra";
    $e_fontstyle_extra = 'big | small';
    $e_fontstyle_basic = 'tt | i | b | u | s | strike';
    $e_fontstyle = "$e_fontstyle_basic | $e_fontstyle_extra";
    $e_phrase_extra = 'sub | sup';
    $e_phrase_basic = 'em | strong | dfn | code | q | samp | kbd | var'.
        ' | cite | abbr | acronym';
    $e_phrase = "$e_phrase_basic | $e_phrase_extra";
    $e_inline_forms = ''; // humor the dtd
    $e_misc_inline = 'ins | del';
    $e_misc = "$e_misc_inline";
    $e_inline = "a | $e_special | $e_fontstyle | $e_phrase".
        " | $e_inline_forms";
    // note the casing
    $e_Inline = new HTMLPurifier_ChildDef_Optional("#PCDATA | $e_inline".
        " | $e_misc_inline");
    $e_heading = 'h1|h2|h3|h4|h5|h6';
    $e_lists = 'ul | ol | dl';
    $e_blocktext = 'pre | hr | blockquote | address';
    $e_block = "p | $e_heading | div | $e_lists | $e_blocktext | table";
    $e_Flow = new HTMLPurifier_ChildDef_Optional("#PCDATA | $e_block".
        " | $e_inline | $e_misc");
    $e_a_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | $e_special".
        " | $e_fontstyle | $e_phrase | $e_inline_forms | $e_misc_inline");
    $e_pre_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | a".
        " | $e_special_basic | $e_fontstyle_basic | $e_phrase_basic".
        " | $e_inline_forms | $e_misc_inline");
    $e_form_content = new HTMLPurifier_ChildDef_Optional(''); //unused
    $e_form_button_content = new HTMLPurifier_ChildDef_Optional(''); // unused

    // elements whose content model is the flow set
    $this->info['ins'] =
    $this->info['del'] =
    $this->info['blockquote'] =
    $this->info['dd'] =
    $this->info['li'] =
    $this->info['div'] = new HTMLPurifier_ElementDef($e_Flow);

    // elements whose content model is the inline set
    // (each key is listed once; repeated entries were redundant)
    $this->info['em'] =
    $this->info['strong'] =
    $this->info['dfn'] =
    $this->info['code'] =
    $this->info['samp'] =
    $this->info['kbd'] =
    $this->info['var'] =
    $this->info['cite'] =
    $this->info['abbr'] =
    $this->info['acronym'] =
    $this->info['q'] =
    $this->info['sub'] =
    $this->info['tt'] =
    $this->info['sup'] =
    $this->info['i'] =
    $this->info['b'] =
    $this->info['big'] =
    $this->info['small'] =
    $this->info['u'] =
    $this->info['s'] =
    $this->info['strike'] =
    $this->info['bdo'] =
    $this->info['span'] =
    $this->info['dt'] =
    $this->info['p'] =
    $this->info['h1'] =
    $this->info['h2'] =
    $this->info['h3'] =
    $this->info['h4'] =
    $this->info['h5'] =
    $this->info['h6'] = new HTMLPurifier_ElementDef($e_Inline);

    // lists must contain at least one item
    $this->info['ol'] =
    $this->info['ul'] =
        new HTMLPurifier_ElementDef(
            new HTMLPurifier_ChildDef_Required('li')
        );

    $this->info['dl'] =
        new HTMLPurifier_ElementDef(
            new HTMLPurifier_ChildDef_Required('dt|dd')
        );

    $this->info['address'] =
        new HTMLPurifier_ElementDef(
            new HTMLPurifier_ChildDef_Optional("#PCDATA | p | $e_inline".
                " | $e_misc_inline")
        );

    // elements without content
    $this->info['img'] =
    $this->info['br'] =
    $this->info['hr'] = new HTMLPurifier_ElementDef(new HTMLPurifier_ChildDef_Empty());

    $this->info['pre'] = new HTMLPurifier_ElementDef($e_pre_content);

    $this->info['a'] = new HTMLPurifier_ElementDef($e_a_content);
}
/**
 * Runs the token list through every purification stage in order:
 * removeForeignElements, makeWellFormed, fixNesting, validateAttributes.
 *
 * @param array $tokens Token list to purify.
 * @return mixed Whatever the final stage (validateAttributes) returns.
 */
function purifyTokens($tokens) {
    if (empty($this->info)) {
        $this->loadData();
    }
    $known_only  = $this->removeForeignElements($tokens);
    $well_formed = $this->makeWellFormed($known_only);
    $nested      = $this->fixNesting($well_formed);
    return $this->validateAttributes($nested);
}
/**
 * Drops everything that is neither a tag nor text, and replaces tags whose
 * name has no entry in $this->info with a text token holding the tag's
 * generated HTML.
 *
 * @param array $tokens Token list.
 * @return array Filtered token list.
 */
function removeForeignElements($tokens) {
    if (empty($this->info)) {
        $this->loadData();
    }
    $kept = array();
    foreach ($tokens as $token) {
        if (!empty($token->is_tag)) {
            if (!isset($this->info[$token->name])) {
                // invalid tag, generate HTML and insert in
                $token = new HTMLPurifier_Token_Text(
                    $this->generator->generateFromToken($token)
                );
            }
            $kept[] = $token;
        } elseif ($token->type == 'text') {
            $kept[] = $token;
        }
        // comments and every other token type are stripped
    }
    return $kept;
}
// Balances the tag structure of a token list and returns the corrected list.
//
// Non-tag tokens pass through unchanged. For tags, the method
//  - converts start tokens of 'empty'-type elements into empty tokens and
//    empty tokens of other elements into a start/end pair,
//  - closes an open <p> (or <li>) when a start tag listed in $info_closes_p
//    (or another <li>) arrives directly inside it,
//  - turns end tags that match nothing on the open-element stack into text,
//  - emits the missing end tags when an end tag closes an outer element,
//  - and finally closes everything that is still open.
// Every tag name is looked up in $this->info without an isset() check.
// NOTE(review): presumably relies on removeForeignElements() having run
// first, as it does in purifyTokens() — confirm for other callers.
function makeWellFormed($tokens) {
if (empty($this->info)) $this->loadData();
$result = array();
// stack of start tokens that have not been closed yet
$current_nesting = array();
foreach ($tokens as $token) {
if (empty( $token->is_tag )) {
$result[] = $token;
continue;
}
$info = $this->info[$token->name]; // assumption but valid
// test if it claims to be a start tag but is empty
if ($info->child_def->type == 'empty' &&
$token->type == 'start' ) {
$result[] = new HTMLPurifier_Token_Empty($token->name,
$token->attributes);
continue;
}
// test if it claims to be empty but really is a start tag
if ($info->child_def->type != 'empty' &&
$token->type == 'empty' ) {
$result[] = new HTMLPurifier_Token_Start($token->name,
$token->attributes);
$result[] = new HTMLPurifier_Token_End($token->name);
continue;
}
// automatically insert empty tags
if ($token->type == 'empty') {
$result[] = $token;
continue;
}
// we give start tags precedence, so automatically accept unless...
// it's one of those special cases
if ($token->type == 'start') {
// if there's a parent, check for special case
if (!empty($current_nesting)) {
// the parent is popped for inspection; it stays popped when one of
// the auto-close cases below applies and is pushed back otherwise
$current_parent = array_pop($current_nesting);
// check if we're closing a P tag
if ($current_parent->name == 'p' &&
isset($this->info_closes_p[$token->name])
) {
$result[] = new HTMLPurifier_Token_End('p');
$result[] = $token;
$current_nesting[] = $token;
continue;
}
// check if we're closing a LI tag
if ($current_parent->name == 'li' &&
$token->name == 'li'
) {
$result[] = new HTMLPurifier_Token_End('li');
$result[] = $token;
$current_nesting[] = $token;
continue;
}
// this is more TIDY stuff
// we should also get some TABLE related code
// mismatched h#
$current_nesting[] = $current_parent; // undo the pop
}
$result[] = $token;
$current_nesting[] = $token;
continue;
}
// sanity check
if ($token->type != 'end') continue;
// okay, we're dealing with a closing tag
// make sure that we have something open
if (empty($current_nesting)) {
$result[] = new HTMLPurifier_Token_Text(
$this->generator->generateFromToken($token)
);
continue;
}
// first, check for the simplest case: everything closes neatly
// current_nesting is modified
$current_parent = array_pop($current_nesting);
if ($current_parent->name == $token->name) {
$result[] = $token;
continue;
}
// undo the array_pop
$current_nesting[] = $current_parent;
// okay, so we're trying to close the wrong tag
// scroll back the entire nest, trying to find our tag
// feature could be to specify how far you'd like to go
$size = count($current_nesting);
// -2 because -1 is the last element, but we already checked that
$skipped_tags = false;
for ($i = $size - 2; $i >= 0; $i--) {
if ($current_nesting[$i]->name == $token->name) {
// current nesting is modified
// array_splice removes index $i and everything after it from the
// stack and returns the removed tokens
$skipped_tags = array_splice($current_nesting, $i);
break;
}
}
// we still didn't find the tag, so translate to text
if ($skipped_tags === false) {
$result[] = new HTMLPurifier_Token_Text(
$this->generator->generateFromToken($token)
);
continue;
}
// okay, we found it, close all the skipped tags
// note that skipped tags contains the element we need closed
$size = count($skipped_tags);
// innermost element first, so the end tags come out properly nested
for ($i = $size - 1; $i >= 0; $i--) {
$result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name);
}
// done!
}
// we're at the end now, fix all still unclosed tags
if (!empty($current_nesting)) {
$size = count($current_nesting);
for ($i = $size - 1; $i >= 0; $i--) {
$result[] =
new HTMLPurifier_Token_End($current_nesting[$i]->name);
}
}
return $result;
}
// Validates the content of every element against its child definition and
// applies the verdict in place.
//
// The list is wrapped in a temporary <div> start/end pair so that top-level
// tokens have a parent. For each start token the method collects all tokens
// up to the matching end token, calls validateChildren() on the element's
// child_def and then keeps the node (true), removes the whole node (false)
// or replaces its content with the returned array. The wrapper tokens are
// removed before returning.
// NOTE(review): the inner scan has no bounds check, so it presumably relies
// on balanced start/end tokens such as makeWellFormed() produces — confirm.
function fixNesting($tokens) {
if (empty($this->info)) $this->loadData();
// insert implicit "parent" node, will be removed at end
array_unshift($tokens, new HTMLPurifier_Token_Start('div'));
$tokens[] = new HTMLPurifier_Token_End('div');
// $size is maintained by hand because $tokens is spliced inside the loop
for ($i = 0, $size = count($tokens) ; $i < $size; ) {
$child_tokens = array();
// scroll to the end of this node, and report number
for ($j = $i, $depth = 0; ; $j++) {
if ($tokens[$j]->type == 'start') {
$depth++;
// skip token assignment on first iteration
if ($depth == 1) continue;
} elseif ($tokens[$j]->type == 'end') {
$depth--;
// skip token assignment on last iteration
if ($depth == 0) break;
}
$child_tokens[] = $tokens[$j];
}
// $i is index of start token
// $j is index of end token
// have DTD child def validate children
$element_def = $this->info[$tokens[$i]->name];
$result = $element_def->child_def->validateChildren($child_tokens);
// process result
if ($result === true) {
// leave the nodes as is
} elseif($result === false) {
// WARNING WARNING WARNING!!!
// While for the original DTD, there will never be
// cascading removal, more complex ones may have such
// a problem.
// If you modify the info array such that an element
// that requires children may contain a child that requires
// children, you need to also scroll back and re-check that
// elements parent node
// start token, content and end token
$length = $j - $i + 1;
// remove entire node
array_splice($tokens, $i, $length);
// change size
$size -= $length;
// ensure that we scroll to the next node
$i--;
} else {
// content only: start and end token stay in place
$length = $j - $i - 1;
// replace node with $result
array_splice($tokens, $i + 1, $length, $result);
// change size
$size -= $length;
$size += count($result);
}
// scroll to next node
$i++;
while ($i < $size and $tokens[$i]->type != 'start') $i++;
}
// remove implicit divs
array_shift($tokens);
array_pop($tokens);
return $tokens;
}
/**
 * Attribute validation stage. No attribute checks are implemented yet, so
 * the tokens are handed back unchanged.
 *
 * @param array $tokens Token list.
 * @return array The same token list.
 */
function validateAttributes($tokens) {
    if (empty($this->info)) $this->loadData();
    // Without this return the method yields null, and purifyTokens(),
    // which returns this method's result, would lose the whole token list.
    return $tokens;
}
}
/**
 * Pairs an element's child definition with its attribute definitions.
 */
class HTMLPurifier_ElementDef
{
    /** Child definition object handed to the constructor. */
    var $child_def;

    /** Attribute definitions; empty array unless supplied. */
    var $attr_def = array();

    /**
     * PHP 4 style constructor.
     *
     * @param mixed $child_def Child definition to store.
     * @param array $attr_def  Attribute definitions to store.
     */
    function HTMLPurifier_ElementDef($child_def, $attr_def = array())
    {
        $this->attr_def = $attr_def;
        $this->child_def = $child_def;
    }
}
?>

View File

@@ -1,45 +1,45 @@
<?php
/**
 * Serializes token objects back into HTML source text.
 */
class HTMLPurifier_Generator
{
    /**
     * Serializes a whole token list.
     *
     * @param array $tokens Token objects.
     * @return string Concatenation of generateFromToken() for every token.
     */
    function generateFromTokens($tokens)
    {
        $pieces = array();
        foreach ($tokens as $token) {
            $pieces[] = $this->generateFromToken($token);
        }
        return implode('', $pieces);
    }

    /**
     * Serializes a single token according to its type.
     *
     * @param object $token Token with a ->type of 'start', 'end', 'empty'
     *                      or 'text'; any other type yields ''.
     * @return string HTML for the token; text data is entity-escaped.
     */
    function generateFromToken($token)
    {
        switch ($token->type) {
            case 'start':
                $attr = $this->generateAttributes($token->attributes);
                $open = '<' . $token->name;
                if ($attr) {
                    $open .= ' ';
                }
                return $open . $attr . '>';

            case 'end':
                return '</' . $token->name . '>';

            case 'empty':
                $attr = $this->generateAttributes($token->attributes);
                $open = '<' . $token->name;
                if ($attr) {
                    $open .= ' ';
                }
                return $open . $attr . ' />';

            case 'text':
                return htmlentities($token->data, ENT_COMPAT, 'UTF-8');

            default:
                return '';
        }
    }

    /**
     * Renders attributes as space separated key="value" pairs.
     *
     * @param array $assoc_array_of_attributes Attribute name => value.
     * @return string Pairs with entity-escaped values, '' for no attributes.
     */
    function generateAttributes($assoc_array_of_attributes)
    {
        $pairs = array();
        foreach ($assoc_array_of_attributes as $key => $value) {
            $pairs[] = $key . '="' . htmlentities($value, ENT_COMPAT, 'UTF-8') . '"';
        }
        return implode(' ', $pairs);
    }
}
<?php
/**
 * Serializes token objects back into HTML source text.
 */
class HTMLPurifier_Generator
{
    /**
     * Serializes a whole token list.
     *
     * @param array $tokens Token objects.
     * @return string Concatenation of generateFromToken() for every token.
     */
    function generateFromTokens($tokens)
    {
        $pieces = array();
        foreach ($tokens as $token) {
            $pieces[] = $this->generateFromToken($token);
        }
        return implode('', $pieces);
    }

    /**
     * Serializes a single token according to its type.
     *
     * @param object $token Token with a ->type of 'start', 'end', 'empty'
     *                      or 'text'; any other type yields ''.
     * @return string HTML for the token; text data is entity-escaped.
     */
    function generateFromToken($token)
    {
        switch ($token->type) {
            case 'start':
                $attr = $this->generateAttributes($token->attributes);
                $open = '<' . $token->name;
                if ($attr) {
                    $open .= ' ';
                }
                return $open . $attr . '>';

            case 'end':
                return '</' . $token->name . '>';

            case 'empty':
                $attr = $this->generateAttributes($token->attributes);
                $open = '<' . $token->name;
                if ($attr) {
                    $open .= ' ';
                }
                return $open . $attr . ' />';

            case 'text':
                return htmlentities($token->data, ENT_COMPAT, 'UTF-8');

            default:
                return '';
        }
    }

    /**
     * Renders attributes as space separated key="value" pairs.
     *
     * @param array $assoc_array_of_attributes Attribute name => value.
     * @return string Pairs with entity-escaped values, '' for no attributes.
     */
    function generateAttributes($assoc_array_of_attributes)
    {
        $pairs = array();
        foreach ($assoc_array_of_attributes as $key => $value) {
            $pairs[] = $key . '="' . htmlentities($value, ENT_COMPAT, 'UTF-8') . '"';
        }
        return implode(' ', $pairs);
    }
}
?>

View File

@@ -1,354 +1,354 @@
<?php
/*
TODO:
* Reread the XML spec and make sure I got everything right
* Add support for CDATA sections
* Have comments output with the leading and trailing --s
* Optimize and benchmark
* Check MF_Text behavior: shouldn't the info in there be raw (entities parsed?)
*/
require_once 'HTMLPurifier/Lexer.php';
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
{
// does this version of PHP support utf8 as entity function charset?
var $_entity_utf8;
// Constructor: records whether the running PHP can use UTF-8 as the charset
// of the entity functions (PHP 5 and later).
// An old-style constructor has to carry the name of the class that declares
// it; the previous method was named after the parent class, so it was not
// picked up as this class's constructor and $_entity_utf8 stayed unset.
function HTMLPurifier_Lexer_DirectLex() {
    $this->_entity_utf8 = version_compare(PHP_VERSION, '5', '>=');
}
// Former (misnamed) constructor, kept so any code that invokes it by name
// keeps working; it simply delegates to the real constructor.
function HTMLPurifier_Lexer() {
    $this->HTMLPurifier_Lexer_DirectLex();
}
// this is QUITE a knotty problem
//
// The main trouble is that, even while assuming UTF-8 is what we're
// using, we've got to deal with HTML entities (like &mdash;)
// Not even sure if the PHP 5 decoding function does that. Plus,
// SimpleTest doesn't use UTF-8!
//
// However, we MUST parse everything possible, because once you get
// to the HTML generator, it will escape everything possible (although
// that may not be correct, and we should be using htmlspecialchars() ).
//
// Nevertheless, strictly XML speaking, we cannot assume any character
// entities are defined except the htmlspecialchars() ones, so leaving
// the entities inside HERE is not acceptable. (plus, htmlspecialchars
// might convert them anyway). So EVERYTHING must get parsed.
//
// We may need to roll our own character entity lookup table. It's only
// about 250; fortunately, the decimal/hex ones map cleanly to UTF-8.
//
// Decodes the entities in $string and returns the decoded string.
// Which decoding path is taken depends on $this->_entity_utf8.
function parseData($string) {
    // we may want to let the user do a different char encoding,
    // although there is NO REASON why they shouldn't be able
    // to convert it to UTF-8 before they pass it to us
    // no support for less than PHP 4.3
    if ($this->_entity_utf8) {
        // PHP 5+, UTF-8 is nicely supported
        return @html_entity_decode($string, ENT_QUOTES, 'UTF-8');
    }
    // PHP 4, do compat stuff
    // NOTE(review): this step emits ISO-8859-1 bytes for named entities,
    // which are not valid UTF-8 -- still needs a real lookup table.
    $string = html_entity_decode($string, ENT_QUOTES, 'ISO-8859-1');
    // Decode whatever numeric references are left. This used to go through
    // the /e (eval) modifier and chr(), which truncated code points to one
    // byte and evaluated zero-padded numbers such as &#0065; as octal.
    return preg_replace_callback(
        '/&#(?:x([a-f0-9]+)|(\d+));/i',
        array(&$this, '_decodeNumericEntity'),
        $string
    );
}
// preg_replace_callback() helper for parseData(): converts one numeric
// character reference to its UTF-8 byte sequence.
// $matches[1] holds the hex digits (hex form), $matches[2] the decimal
// digits (decimal form). References that cannot be encoded (zero,
// surrogates, anything above U+10FFFF) are returned unchanged.
function _decodeNumericEntity($matches) {
    if (isset($matches[2]) && $matches[2] !== '') {
        $code = (int) $matches[2];
    } else {
        $code = hexdec($matches[1]);
    }
    if ($code < 1 || $code > 0x10FFFF || ($code >= 0xD800 && $code <= 0xDFFF)) {
        return $matches[0];
    }
    $code = (int) $code;
    if ($code < 0x80) {
        return chr($code);
    }
    if ($code < 0x800) {
        return chr(0xC0 | ($code >> 6))
             . chr(0x80 | ($code & 0x3F));
    }
    if ($code < 0x10000) {
        return chr(0xE0 | ($code >> 12))
             . chr(0x80 | (($code >> 6) & 0x3F))
             . chr(0x80 | ($code & 0x3F));
    }
    return chr(0xF0 | ($code >> 18))
         . chr(0x80 | (($code >> 12) & 0x3F))
         . chr(0x80 | (($code >> 6) & 0x3F))
         . chr(0x80 | ($code & 0x3F));
}
// Returns the index of the first single or double quote in $string at or
// after $offset, or false when there is none.
function nextQuote($string, $offset = 0) {
    $position = $offset + strcspn($string, '"\'', $offset);
    if ($position == strlen($string)) {
        // scanned to the end without meeting a quote
        return false;
    }
    return $position;
}
// Returns the index of the first space, tab, CR or LF in $string at or
// after $offset, or false when there is none.
function nextWhiteSpace($string, $offset = 0) {
    $position = $offset + strcspn($string, "\x20\x09\x0D\x0A", $offset);
    if ($position == strlen($string)) {
        // scanned to the end without meeting whitespace
        return false;
    }
    return $position;
}
// Splits an HTML string into a flat array of token objects
// (HTMLPurifier_Token_Text / _Comment / _Start / _End / _Empty).
// Returns an empty array for empty input, and also when the iteration cap
// below is exceeded.
// NOTE(review): text is decoded with html_entity_decode() directly instead
// of $this->parseData(), so no charset is passed -- confirm whether that is
// intended.
function tokenizeHTML($string) {
    // some quick checking (if empty, return empty)
    $string = @ (string) $string;
    if ($string == '') return array();

    $cursor = 0; // our location in the text
    $inside_tag = false; // whether or not we're parsing the inside of a tag
    $array = array(); // result array

    // infinite loop protection
    // has to be pretty big, since html docs can be big
    // we allow two hundred thousand iterations... more than enough?
    $loops = 0;

    while (true) {
        // infinite loop protection
        if (++$loops > 200000) return array();

        $position_next_lt = strpos($string, '<', $cursor);
        $position_next_gt = strpos($string, '>', $cursor);

        // triggers on "<b>asdf</b>" but not "asdf <b></b>"
        if ($position_next_lt === $cursor) {
            $inside_tag = true;
            $cursor++;
        }

        if (!$inside_tag && $position_next_lt !== false) {
            // We are not inside tag and there still is another tag to parse
            $array[] = new HTMLPurifier_Token_Text(
                html_entity_decode(
                    substr($string, $cursor, $position_next_lt - $cursor),
                    ENT_QUOTES
                )
            );
            $cursor = $position_next_lt + 1;
            $inside_tag = true;
            continue;
        } elseif (!$inside_tag) {
            // We are not inside tag but there are no more tags
            // If we're already at the end, break
            if ($cursor === strlen($string)) break;
            // Create Text of rest of string
            $array[] = new HTMLPurifier_Token_Text(
                html_entity_decode(substr($string, $cursor), ENT_QUOTES)
            );
            break;
        } elseif ($inside_tag && $position_next_gt !== false) {
            // We are in tag and it is well formed
            // Grab the internals of the tag
            $segment = substr($string, $cursor, $position_next_gt - $cursor);

            // Check if it's a comment
            if (
                substr($segment, 0, 3) == '!--' &&
                substr($segment, strlen($segment) - 2, 2) == '--'
            ) {
                $array[] = new HTMLPurifier_Token_Comment(
                    substr($segment, 3, strlen($segment) - 5)
                );
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Check if it's an end tag
            $is_end_tag = (strpos($segment, '/') === 0);
            if ($is_end_tag) {
                $type = substr($segment, 1);
                $array[] = new HTMLPurifier_Token_End($type);
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Check if it is explicitly self closing, if so, remove
            // trailing slash. Remember, we could have a tag like <br>, so
            // any later token processing scripts must convert improperly
            // classified EmptyTags from StartTags.
            //
            // The test used to compare the FIRST slash against the last
            // position, so any slash inside an attribute value
            // (<img src="a/b.png" />) hid the closing one. Now a trailing
            // slash counts when it is the only slash in the tag, or when it
            // directly follows whitespace or a closing quote. A slash glued
            // to an unquoted value (<a href=/foo/>) stays part of that
            // value, exactly as before.
            $segment_length = strlen($segment);
            $is_self_closing = false;
            if ($segment_length > 0 && $segment[$segment_length - 1] == '/') {
                if (strpos($segment, '/') === $segment_length - 1) {
                    $is_self_closing = true;
                } else {
                    // an earlier slash exists, so the length is at least 2
                    $before_slash = $segment[$segment_length - 2];
                    $is_self_closing =
                        (strpos("\"'\x20\x09\x0D\x0A", $before_slash) !== false);
                }
            }
            if ($is_self_closing) {
                $segment = substr($segment, 0, $segment_length - 1);
            }

            // Check if there are any attributes
            $position_first_space = $this->nextWhiteSpace($segment);
            if ($position_first_space === false) {
                if ($is_self_closing) {
                    $array[] = new HTMLPurifier_Token_Empty($segment);
                } else {
                    $array[] = new HTMLPurifier_Token_Start($segment);
                }
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Grab out all the data
            $type = substr($segment, 0, $position_first_space);
            $attribute_string = trim(substr($segment, $position_first_space));
            if ($attribute_string) {
                $attributes = $this->tokenizeAttributeString(
                    $attribute_string
                );
            } else {
                $attributes = array();
            }

            if ($is_self_closing) {
                $array[] = new HTMLPurifier_Token_Empty($type, $attributes);
            } else {
                $array[] = new HTMLPurifier_Token_Start($type, $attributes);
            }
            $cursor = $position_next_gt + 1;
            $inside_tag = false;
            continue;
        } else {
            // inside a tag that never closes: emit the rest as text,
            // putting back the '<' the cursor already stepped over
            $array[] = new HTMLPurifier_Token_Text(
                '<' .
                html_entity_decode(
                    substr($string, $cursor),
                    ENT_QUOTES
                )
            );
            break;
        }
    }
    return $array;
}
// Parses the attribute portion of a tag (everything after the tag name)
// into an associative array of attribute name => value.
// Attributes without a value map to their own name. Quotes around values
// are stripped; values are otherwise returned as written.
// Returns an empty array for empty input, or when more than 1000 loop
// iterations would be needed.
function tokenizeAttributeString($string) {
    $string = (string) $string; // quick typecast
    if ($string == '') return array(); // no attributes

    $whitespace = "\x20\x09\x0D\x0A";

    // let's see if we can abort as quickly as possible
    // one equal sign, no whitespace => one attribute
    $num_equal = substr_count($string, '=');
    // any of the four whitespace characters, at any position (a plain
    // strpos() for ' ' missed tabs/newlines and a space at index 0)
    $has_space = (strcspn($string, $whitespace) < strlen($string));

    if ($num_equal === 0 && !$has_space) {
        // bool attribute
        return array($string => $string);
    } elseif ($num_equal === 1 && !$has_space) {
        // only one attribute
        list($key, $quoted_value) = explode('=', $string);
        $quoted_value = trim($quoted_value);
        // compare against '' explicitly: "0" is a legitimate key/value
        // (border=0 used to come back with an empty value)
        if ($key === '') return array();
        if ($quoted_value === '') return array($key => '');
        $length = strlen($quoted_value);
        $first_char = $quoted_value[0];
        $last_char = $quoted_value[$length - 1];
        $open_quote = ($first_char == '"' || $first_char == "'");
        if ($open_quote && $length > 1 && $first_char == $last_char) {
            // well behaved
            $value = substr($quoted_value, 1, $length - 2);
        } elseif ($open_quote) {
            // not well behaved: opening quote without its partner
            $value = substr($quoted_value, 1);
        } else {
            $value = $quoted_value;
        }
        // substr() returns false instead of '' on some PHP versions
        return array($key => (string) $value);
    }

    // setup loop environment
    $array = array(); // return assoc array of attributes
    $cursor = 0; // current position in string (moves forward)
    $size = strlen($string); // size of the string (stays the same)

    // if we have unquoted attributes, the parser expects a terminating
    // space, so let's guarantee that there's always a terminating space.
    $string .= ' ';

    // infinite loop protection
    $loops = 0;

    while (true) {
        // infinite loop protection
        if (++$loops > 1000) return array();

        if ($cursor >= $size) {
            break;
        }

        // scroll past leading whitespace
        $cursor += strspn($string, $whitespace, $cursor);

        // grab the key: every character that is not whitespace or =
        $key_begin = $cursor;
        $cursor += strcspn($string, $whitespace . '=', $cursor);
        $key = (string) substr($string, $key_begin, $cursor - $key_begin);

        if ($key === '') {
            // Either a stray '=' or the end of input. Step over it so the
            // loop always makes progress; previously it spun here until the
            // guard above fired and discarded every attribute.
            $cursor++;
            continue;
        }

        // scroll past all whitespace
        $cursor += strspn($string, $whitespace, $cursor);

        if ($cursor >= $size) {
            // key was the last thing in the string: bool attribute
            $array[$key] = $key;
            break;
        }

        // if the next character is an equal sign, we've got a regular
        // pair, otherwise, it's a bool attribute
        if ($string[$cursor] == '=') {
            // key="value"
            $cursor++;
            $cursor += strspn($string, $whitespace, $cursor);

            // we might be in front of a quote right now
            $char = ($cursor < $size) ? $string[$cursor] : '';

            if ($char == '"' || $char == "'") {
                // it's quoted, end bound is $char
                $cursor++;
                $value_begin = $cursor;
                $value_end = strpos($string, $char, $cursor);
                if ($value_end === false) {
                    // Unterminated quote: take the rest of the input.
                    // Assigning strpos()'s false to the cursor used to
                    // restart parsing from offset 0.
                    $value_end = $size;
                }
                $cursor = $value_end;
            } else {
                // it's not quoted, end bound is whitespace
                $value_begin = $cursor;
                $cursor += strcspn($string, $whitespace, $cursor);
                $value_end = $cursor;
            }

            $array[$key] = (string) substr(
                $string, $value_begin, $value_end - $value_begin
            );
            $cursor++; // step over the closing quote / separating space
        } else {
            // boolattr
            $array[$key] = $key;
        }
    }
    return $array;
}
}
<?php
/*
TODO:
* Reread the XML spec and make sure I got everything right
* Add support for CDATA sections
* Have comments output with the leading and trailing --s
* Optimize and benchmark
* Check MF_Text behavior: shouldn't the info in there be raw (entities parsed?)
*/
require_once 'HTMLPurifier/Lexer.php';
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
{
// does this version of PHP support utf8 as entity function charset?
var $_entity_utf8;
// Constructor: records whether the running PHP can use UTF-8 as the charset
// of the entity functions (PHP 5 and later).
// An old-style constructor has to carry the name of the class that declares
// it; the previous method was named after the parent class, so it was not
// picked up as this class's constructor and $_entity_utf8 stayed unset.
function HTMLPurifier_Lexer_DirectLex() {
    $this->_entity_utf8 = version_compare(PHP_VERSION, '5', '>=');
}
// Former (misnamed) constructor, kept so any code that invokes it by name
// keeps working; it simply delegates to the real constructor.
function HTMLPurifier_Lexer() {
    $this->HTMLPurifier_Lexer_DirectLex();
}
// this is QUITE a knotty problem
//
// The main trouble is that, even while assuming UTF-8 is what we're
// using, we've got to deal with HTML entities (like &mdash;)
// Not even sure if the PHP 5 decoding function does that. Plus,
// SimpleTest doesn't use UTF-8!
//
// However, we MUST parse everything possible, because once you get
// to the HTML generator, it will escape everything possible (although
// that may not be correct, and we should be using htmlspecialchars() ).
//
// Nevertheless, strictly XML speaking, we cannot assume any character
// entities are defined except the htmlspecialchars() ones, so leaving
// the entities inside HERE is not acceptable. (plus, htmlspecialchars
// might convert them anyway). So EVERYTHING must get parsed.
//
// We may need to roll our own character entity lookup table. It's only
// about 250; fortunately, the decimal/hex ones map cleanly to UTF-8.
//
// Decodes the entities in $string and returns the decoded string.
// Which decoding path is taken depends on $this->_entity_utf8.
function parseData($string) {
    // we may want to let the user do a different char encoding,
    // although there is NO REASON why they shouldn't be able
    // to convert it to UTF-8 before they pass it to us
    // no support for less than PHP 4.3
    if ($this->_entity_utf8) {
        // PHP 5+, UTF-8 is nicely supported
        return @html_entity_decode($string, ENT_QUOTES, 'UTF-8');
    }
    // PHP 4, do compat stuff
    // NOTE(review): this step emits ISO-8859-1 bytes for named entities,
    // which are not valid UTF-8 -- still needs a real lookup table.
    $string = html_entity_decode($string, ENT_QUOTES, 'ISO-8859-1');
    // Decode whatever numeric references are left. This used to go through
    // the /e (eval) modifier and chr(), which truncated code points to one
    // byte and evaluated zero-padded numbers such as &#0065; as octal.
    return preg_replace_callback(
        '/&#(?:x([a-f0-9]+)|(\d+));/i',
        array(&$this, '_decodeNumericEntity'),
        $string
    );
}
// preg_replace_callback() helper for parseData(): converts one numeric
// character reference to its UTF-8 byte sequence.
// $matches[1] holds the hex digits (hex form), $matches[2] the decimal
// digits (decimal form). References that cannot be encoded (zero,
// surrogates, anything above U+10FFFF) are returned unchanged.
function _decodeNumericEntity($matches) {
    if (isset($matches[2]) && $matches[2] !== '') {
        $code = (int) $matches[2];
    } else {
        $code = hexdec($matches[1]);
    }
    if ($code < 1 || $code > 0x10FFFF || ($code >= 0xD800 && $code <= 0xDFFF)) {
        return $matches[0];
    }
    $code = (int) $code;
    if ($code < 0x80) {
        return chr($code);
    }
    if ($code < 0x800) {
        return chr(0xC0 | ($code >> 6))
             . chr(0x80 | ($code & 0x3F));
    }
    if ($code < 0x10000) {
        return chr(0xE0 | ($code >> 12))
             . chr(0x80 | (($code >> 6) & 0x3F))
             . chr(0x80 | ($code & 0x3F));
    }
    return chr(0xF0 | ($code >> 18))
         . chr(0x80 | (($code >> 12) & 0x3F))
         . chr(0x80 | (($code >> 6) & 0x3F))
         . chr(0x80 | ($code & 0x3F));
}
// Returns the index of the first single or double quote in $string at or
// after $offset, or false when there is none.
function nextQuote($string, $offset = 0) {
    $position = $offset + strcspn($string, '"\'', $offset);
    if ($position == strlen($string)) {
        // scanned to the end without meeting a quote
        return false;
    }
    return $position;
}
// Returns the index of the first space, tab, CR or LF in $string at or
// after $offset, or false when there is none.
function nextWhiteSpace($string, $offset = 0) {
    $position = $offset + strcspn($string, "\x20\x09\x0D\x0A", $offset);
    if ($position == strlen($string)) {
        // scanned to the end without meeting whitespace
        return false;
    }
    return $position;
}
// Splits an HTML string into a flat array of token objects
// (HTMLPurifier_Token_Text / _Comment / _Start / _End / _Empty).
// Returns an empty array for empty input, and also when the iteration cap
// below is exceeded.
// NOTE(review): text is decoded with html_entity_decode() directly instead
// of $this->parseData(), so no charset is passed -- confirm whether that is
// intended.
function tokenizeHTML($string) {
    // some quick checking (if empty, return empty)
    $string = @ (string) $string;
    if ($string == '') return array();

    $cursor = 0; // our location in the text
    $inside_tag = false; // whether or not we're parsing the inside of a tag
    $array = array(); // result array

    // infinite loop protection
    // has to be pretty big, since html docs can be big
    // we allow two hundred thousand iterations... more than enough?
    $loops = 0;

    while (true) {
        // infinite loop protection
        if (++$loops > 200000) return array();

        $position_next_lt = strpos($string, '<', $cursor);
        $position_next_gt = strpos($string, '>', $cursor);

        // triggers on "<b>asdf</b>" but not "asdf <b></b>"
        if ($position_next_lt === $cursor) {
            $inside_tag = true;
            $cursor++;
        }

        if (!$inside_tag && $position_next_lt !== false) {
            // We are not inside tag and there still is another tag to parse
            $array[] = new HTMLPurifier_Token_Text(
                html_entity_decode(
                    substr($string, $cursor, $position_next_lt - $cursor),
                    ENT_QUOTES
                )
            );
            $cursor = $position_next_lt + 1;
            $inside_tag = true;
            continue;
        } elseif (!$inside_tag) {
            // We are not inside tag but there are no more tags
            // If we're already at the end, break
            if ($cursor === strlen($string)) break;
            // Create Text of rest of string
            $array[] = new HTMLPurifier_Token_Text(
                html_entity_decode(substr($string, $cursor), ENT_QUOTES)
            );
            break;
        } elseif ($inside_tag && $position_next_gt !== false) {
            // We are in tag and it is well formed
            // Grab the internals of the tag
            $segment = substr($string, $cursor, $position_next_gt - $cursor);

            // Check if it's a comment
            if (
                substr($segment, 0, 3) == '!--' &&
                substr($segment, strlen($segment) - 2, 2) == '--'
            ) {
                $array[] = new HTMLPurifier_Token_Comment(
                    substr($segment, 3, strlen($segment) - 5)
                );
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Check if it's an end tag
            $is_end_tag = (strpos($segment, '/') === 0);
            if ($is_end_tag) {
                $type = substr($segment, 1);
                $array[] = new HTMLPurifier_Token_End($type);
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Check if it is explicitly self closing, if so, remove
            // trailing slash. Remember, we could have a tag like <br>, so
            // any later token processing scripts must convert improperly
            // classified EmptyTags from StartTags.
            //
            // The test used to compare the FIRST slash against the last
            // position, so any slash inside an attribute value
            // (<img src="a/b.png" />) hid the closing one. Now a trailing
            // slash counts when it is the only slash in the tag, or when it
            // directly follows whitespace or a closing quote. A slash glued
            // to an unquoted value (<a href=/foo/>) stays part of that
            // value, exactly as before.
            $segment_length = strlen($segment);
            $is_self_closing = false;
            if ($segment_length > 0 && $segment[$segment_length - 1] == '/') {
                if (strpos($segment, '/') === $segment_length - 1) {
                    $is_self_closing = true;
                } else {
                    // an earlier slash exists, so the length is at least 2
                    $before_slash = $segment[$segment_length - 2];
                    $is_self_closing =
                        (strpos("\"'\x20\x09\x0D\x0A", $before_slash) !== false);
                }
            }
            if ($is_self_closing) {
                $segment = substr($segment, 0, $segment_length - 1);
            }

            // Check if there are any attributes
            $position_first_space = $this->nextWhiteSpace($segment);
            if ($position_first_space === false) {
                if ($is_self_closing) {
                    $array[] = new HTMLPurifier_Token_Empty($segment);
                } else {
                    $array[] = new HTMLPurifier_Token_Start($segment);
                }
                $inside_tag = false;
                $cursor = $position_next_gt + 1;
                continue;
            }

            // Grab out all the data
            $type = substr($segment, 0, $position_first_space);
            $attribute_string = trim(substr($segment, $position_first_space));
            if ($attribute_string) {
                $attributes = $this->tokenizeAttributeString(
                    $attribute_string
                );
            } else {
                $attributes = array();
            }

            if ($is_self_closing) {
                $array[] = new HTMLPurifier_Token_Empty($type, $attributes);
            } else {
                $array[] = new HTMLPurifier_Token_Start($type, $attributes);
            }
            $cursor = $position_next_gt + 1;
            $inside_tag = false;
            continue;
        } else {
            // inside a tag that never closes: emit the rest as text,
            // putting back the '<' the cursor already stepped over
            $array[] = new HTMLPurifier_Token_Text(
                '<' .
                html_entity_decode(
                    substr($string, $cursor),
                    ENT_QUOTES
                )
            );
            break;
        }
    }
    return $array;
}
// Parses the attribute portion of a tag (everything after the tag name)
// into an associative array of attribute name => value.
// Attributes without a value map to their own name. Quotes around values
// are stripped; values are otherwise returned as written.
// Returns an empty array for empty input, or when more than 1000 loop
// iterations would be needed.
function tokenizeAttributeString($string) {
    $string = (string) $string; // quick typecast
    if ($string == '') return array(); // no attributes

    $whitespace = "\x20\x09\x0D\x0A";

    // let's see if we can abort as quickly as possible
    // one equal sign, no whitespace => one attribute
    $num_equal = substr_count($string, '=');
    // any of the four whitespace characters, at any position (a plain
    // strpos() for ' ' missed tabs/newlines and a space at index 0)
    $has_space = (strcspn($string, $whitespace) < strlen($string));

    if ($num_equal === 0 && !$has_space) {
        // bool attribute
        return array($string => $string);
    } elseif ($num_equal === 1 && !$has_space) {
        // only one attribute
        list($key, $quoted_value) = explode('=', $string);
        $quoted_value = trim($quoted_value);
        // compare against '' explicitly: "0" is a legitimate key/value
        // (border=0 used to come back with an empty value)
        if ($key === '') return array();
        if ($quoted_value === '') return array($key => '');
        $length = strlen($quoted_value);
        $first_char = $quoted_value[0];
        $last_char = $quoted_value[$length - 1];
        $open_quote = ($first_char == '"' || $first_char == "'");
        if ($open_quote && $length > 1 && $first_char == $last_char) {
            // well behaved
            $value = substr($quoted_value, 1, $length - 2);
        } elseif ($open_quote) {
            // not well behaved: opening quote without its partner
            $value = substr($quoted_value, 1);
        } else {
            $value = $quoted_value;
        }
        // substr() returns false instead of '' on some PHP versions
        return array($key => (string) $value);
    }

    // setup loop environment
    $array = array(); // return assoc array of attributes
    $cursor = 0; // current position in string (moves forward)
    $size = strlen($string); // size of the string (stays the same)

    // if we have unquoted attributes, the parser expects a terminating
    // space, so let's guarantee that there's always a terminating space.
    $string .= ' ';

    // infinite loop protection
    $loops = 0;

    while (true) {
        // infinite loop protection
        if (++$loops > 1000) return array();

        if ($cursor >= $size) {
            break;
        }

        // scroll past leading whitespace
        $cursor += strspn($string, $whitespace, $cursor);

        // grab the key: every character that is not whitespace or =
        $key_begin = $cursor;
        $cursor += strcspn($string, $whitespace . '=', $cursor);
        $key = (string) substr($string, $key_begin, $cursor - $key_begin);

        if ($key === '') {
            // Either a stray '=' or the end of input. Step over it so the
            // loop always makes progress; previously it spun here until the
            // guard above fired and discarded every attribute.
            $cursor++;
            continue;
        }

        // scroll past all whitespace
        $cursor += strspn($string, $whitespace, $cursor);

        if ($cursor >= $size) {
            // key was the last thing in the string: bool attribute
            $array[$key] = $key;
            break;
        }

        // if the next character is an equal sign, we've got a regular
        // pair, otherwise, it's a bool attribute
        if ($string[$cursor] == '=') {
            // key="value"
            $cursor++;
            $cursor += strspn($string, $whitespace, $cursor);

            // we might be in front of a quote right now
            $char = ($cursor < $size) ? $string[$cursor] : '';

            if ($char == '"' || $char == "'") {
                // it's quoted, end bound is $char
                $cursor++;
                $value_begin = $cursor;
                $value_end = strpos($string, $char, $cursor);
                if ($value_end === false) {
                    // Unterminated quote: take the rest of the input.
                    // Assigning strpos()'s false to the cursor used to
                    // restart parsing from offset 0.
                    $value_end = $size;
                }
                $cursor = $value_end;
            } else {
                // it's not quoted, end bound is whitespace
                $value_begin = $cursor;
                $cursor += strcspn($string, $whitespace, $cursor);
                $value_end = $cursor;
            }

            $array[$key] = (string) substr(
                $string, $value_begin, $value_end - $value_begin
            );
            $cursor++; // step over the closing quote / separating space
        } else {
            // boolattr
            $array[$key] = $key;
        }
    }
    return $array;
}
}
?>

View File

@@ -1,58 +1,58 @@
<?php
require_once 'XML/HTMLSax3.php'; // PEAR
require_once 'HTMLPurifier/Lexer.php';
// uses the PEAR class XML_HTMLSax3 to parse XML
// Lexer that delegates parsing to the PEAR class XML_HTMLSax3 and collects
// the events it fires into HTMLPurifier token objects.
class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
{
    // tokens gathered by the handler callbacks during the current parse
    var $tokens;

    // Parses $html and returns the resulting array of tokens.
    // Registers this object's handler methods on a fresh parser, so the
    // token list is reset at the start of every call.
    function tokenizeHTML($html) {
        $this->tokens = array();
        $parser=& new XML_HTMLSax3();
        $parser->set_object($this);
        $parser->set_element_handler('openHandler','closeHandler');
        $parser->set_data_handler('dataHandler');
        $parser->set_escape_handler('escapeHandler');
        $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
        $parser->parse($html);
        return $this->tokens;
    }

    // Element-open callback: records an Empty token when the parser
    // reports the tag as closed, a Start token otherwise.
    function openHandler(&$parser, $name, $attrs, $closed) {
        if ($closed) {
            $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
        } else {
            $this->tokens[] = new HTMLPurifier_Token_Start($name, $attrs);
        }
        return true;
    }

    // Element-close callback: records an End token.
    function closeHandler(&$parser, $name) {
        // HTMLSax3 seems to always send empty tags an extra close tag
        // check and ignore if you see it.
        // Only ignore it when it closes that very empty tag: the previous
        // check looked at the token type alone, so the real </b> in
        // "<b><br /></b>" was thrown away too. Also guards against an
        // empty token list (close tag as the first event).
        $count = count($this->tokens);
        if ($count > 0) {
            $last = $this->tokens[$count - 1];
            // token names are stored lowercased, so compare likewise
            if ($last->type == 'empty' && $last->name == strtolower($name)) {
                return true;
            }
        }
        $this->tokens[] = new HTMLPurifier_Token_End($name);
        return true;
    }

    // Character-data callback: records the data as a Text token.
    function dataHandler(&$parser, $data) {
        $this->tokens[] = new HTMLPurifier_Token_Text($data);
        return true;
    }

    // Escape callback: data beginning with '-' is recorded as a Comment
    // token (with the data passed through unchanged); everything else is
    // dropped.
    function escapeHandler(&$parser, $data) {
        if (strpos($data, '-') === 0) {
            $this->tokens[] = new HTMLPurifier_Token_Comment($data);
        }
        return true;
    }
}
<?php
require_once 'XML/HTMLSax3.php'; // PEAR
require_once 'HTMLPurifier/Lexer.php';
// uses the PEAR class XML_HTMLSax3 to parse XML
// Lexer that delegates parsing to the PEAR class XML_HTMLSax3 and collects
// the events it fires into HTMLPurifier token objects.
class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
{
    // tokens gathered by the handler callbacks during the current parse
    var $tokens;

    // Parses $html and returns the resulting array of tokens.
    // Registers this object's handler methods on a fresh parser, so the
    // token list is reset at the start of every call.
    function tokenizeHTML($html) {
        $this->tokens = array();
        $parser=& new XML_HTMLSax3();
        $parser->set_object($this);
        $parser->set_element_handler('openHandler','closeHandler');
        $parser->set_data_handler('dataHandler');
        $parser->set_escape_handler('escapeHandler');
        $parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
        $parser->parse($html);
        return $this->tokens;
    }

    // Element-open callback: records an Empty token when the parser
    // reports the tag as closed, a Start token otherwise.
    function openHandler(&$parser, $name, $attrs, $closed) {
        if ($closed) {
            $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
        } else {
            $this->tokens[] = new HTMLPurifier_Token_Start($name, $attrs);
        }
        return true;
    }

    // Element-close callback: records an End token.
    function closeHandler(&$parser, $name) {
        // HTMLSax3 seems to always send empty tags an extra close tag
        // check and ignore if you see it.
        // Only ignore it when it closes that very empty tag: the previous
        // check looked at the token type alone, so the real </b> in
        // "<b><br /></b>" was thrown away too. Also guards against an
        // empty token list (close tag as the first event).
        $count = count($this->tokens);
        if ($count > 0) {
            $last = $this->tokens[$count - 1];
            // token names are stored lowercased, so compare likewise
            if ($last->type == 'empty' && $last->name == strtolower($name)) {
                return true;
            }
        }
        $this->tokens[] = new HTMLPurifier_Token_End($name);
        return true;
    }

    // Character-data callback: records the data as a Text token.
    function dataHandler(&$parser, $data) {
        $this->tokens[] = new HTMLPurifier_Token_Text($data);
        return true;
    }

    // Escape callback: data beginning with '-' is recorded as a Comment
    // token (with the data passed through unchanged); everything else is
    // dropped.
    function escapeHandler(&$parser, $data) {
        if (strpos($data, '-') === 0) {
            $this->tokens[] = new HTMLPurifier_Token_Comment($data);
        }
        return true;
    }
}
?>

View File

@@ -1,60 +1,60 @@
<?php
// all objects here are immutable
// Common base class of every token type; declares no members of its own.
class HTMLPurifier_Token {} // abstract
// Shared base for the tag-like tokens: holds a tag name and its attributes.
class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
{
    // marks the token as a tag
    var $is_tag = true;
    // tag name, stored lowercased
    var $name;
    // associative array of attribute name => value
    var $attributes = array();

    // Stores $name (lowercasing it unless it already consists solely of
    // lowercase letters) and $attributes.
    function HTMLPurifier_Token_Tag($name, $attributes = array()) {
        if (!ctype_lower($name)) {
            $name = strtolower($name);
        }
        $this->name = $name;
        $this->attributes = $attributes;
    }
}
// start CONCRETE ones
// Tag token whose $type discriminator is 'start' (an opening tag).
class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
{
var $type = 'start';
}
// Tag token whose $type discriminator is 'empty' (a self-closed tag).
class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
{
var $type = 'empty';
}
// accepts attributes even though it really can't, for optimization reasons
// Tag token whose $type discriminator is 'end' (a closing tag).
// Inherits the attribute-accepting constructor of the tag base class.
class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
{
var $type = 'end';
}
// Token holding a run of character data.
class HTMLPurifier_Token_Text extends HTMLPurifier_Token
{
    // fixed pseudo-name used for text nodes
    var $name = '#PCDATA';
    var $type = 'text';
    // the character data itself
    var $data;
    // true when ctype_space() accepts the data
    var $is_whitespace = false;

    // Stores $data and records whether it is whitespace-only.
    function HTMLPurifier_Token_Text($data) {
        $this->data = $data;
        $this->is_whitespace = ctype_space($data) ? true : false;
    }

    // Returns a NEW text token whose data is this token's data followed by
    // the data of $text; neither operand is modified.
    function append($text) {
        $combined = $this->data . $text->data;
        return new HTMLPurifier_Token_Text($combined);
    }
}
// Token holding the contents of a comment.
class HTMLPurifier_Token_Comment extends HTMLPurifier_Token
{
// the comment text exactly as handed to the constructor
var $data;
var $type = 'comment';
// Stores $data without modification.
function HTMLPurifier_Token_Comment($data) {
$this->data = $data;
}
}
<?php
// all objects here are immutable
// Common base class of every token type; declares no members of its own.
class HTMLPurifier_Token {} // abstract
// Shared base for the tag-like tokens: holds a tag name and its attributes.
class HTMLPurifier_Token_Tag extends HTMLPurifier_Token // abstract
{
    // marks the token as a tag
    var $is_tag = true;
    // tag name, stored lowercased
    var $name;
    // associative array of attribute name => value
    var $attributes = array();

    // Stores $name (lowercasing it unless it already consists solely of
    // lowercase letters) and $attributes.
    function HTMLPurifier_Token_Tag($name, $attributes = array()) {
        if (!ctype_lower($name)) {
            $name = strtolower($name);
        }
        $this->name = $name;
        $this->attributes = $attributes;
    }
}
// start CONCRETE ones
// Tag token whose $type discriminator is 'start' (an opening tag).
class HTMLPurifier_Token_Start extends HTMLPurifier_Token_Tag
{
var $type = 'start';
}
// Tag token whose $type discriminator is 'empty' (a self-closed tag).
class HTMLPurifier_Token_Empty extends HTMLPurifier_Token_Tag
{
var $type = 'empty';
}
// accepts attributes even though it really can't, for optimization reasons
// Tag token whose $type discriminator is 'end' (a closing tag).
// Inherits the attribute-accepting constructor of the tag base class.
class HTMLPurifier_Token_End extends HTMLPurifier_Token_Tag
{
var $type = 'end';
}
// Token holding a run of character data.
class HTMLPurifier_Token_Text extends HTMLPurifier_Token
{
    // fixed pseudo-name used for text nodes
    var $name = '#PCDATA';
    var $type = 'text';
    // the character data itself
    var $data;
    // true when ctype_space() accepts the data
    var $is_whitespace = false;

    // Stores $data and records whether it is whitespace-only.
    function HTMLPurifier_Token_Text($data) {
        $this->data = $data;
        $this->is_whitespace = ctype_space($data) ? true : false;
    }

    // Returns a NEW text token whose data is this token's data followed by
    // the data of $text; neither operand is modified.
    function append($text) {
        $combined = $this->data . $text->data;
        return new HTMLPurifier_Token_Text($combined);
    }
}
// Token holding the contents of a comment.
class HTMLPurifier_Token_Comment extends HTMLPurifier_Token
{
// the comment text exactly as handed to the constructor
var $data;
var $type = 'comment';
// Stores $data without modification.
function HTMLPurifier_Token_Comment($data) {
$this->data = $data;
}
}
?>