diff --git a/AttrDef.php b/AttrDef.php index d454ea2e..19bb3805 100644 --- a/AttrDef.php +++ b/AttrDef.php @@ -1,608 +1,5 @@ true, - 'blockquote' => true, - 'dd' => true, - 'dir' => true, - 'div' => true, - 'dl' => true, - 'dt' => true, - 'h1' => true, - 'h2' => true, - 'h3' => true, - 'h4' => true, - 'h5' => true, - 'h6' => true, - 'hr' => true, - 'ol' => true, - 'p' => true, - 'pre' => true, - 'table' => true, - 'ul' => true - ); - - function HTMLPurifier_Definition() { - $this->generator = new HTMLPurifier_Generator(); - } - - function loadData() { - // emulates the structure of the DTD - - // entities: prefixed with e_ and _ replaces . - // we don't use an array because that complicates interpolation - // strings are used instead of arrays because if you use arrays, - // you have to do some hideous manipulation with array_merge() - - // these are condensed, remember, with bad stuff taken out - - // transforms: font, menu, dir, center - - // DON'T MONKEY AROUND THIS unless you know what you are doing - // and also know the assumptions the code makes about what this - // contains for optimization purposes (see fixNesting) - - $e_special_extra = 'img'; - $e_special_basic = 'br | span | bdo'; - $e_special = "$e_special_basic | $e_special_extra"; - $e_fontstyle_extra = 'big | small'; - $e_fontstyle_basic = 'tt | i | b | u | s | strike'; - $e_fontstyle = "$e_fontstyle_basic | $e_fontstyle_extra"; - $e_phrase_extra = 'sub | sup'; - $e_phrase_basic = 'em | strong | dfn | code | q | samp | kbd | var'. - ' | cite | abbr | acronym'; - $e_phrase = "$e_phrase_basic | $e_phrase_extra"; - $e_inline_forms = ''; // humor the dtd - $e_misc_inline = 'ins | del'; - $e_misc = "$e_misc_inline"; - $e_inline = "a | $e_special | $e_fontstyle | $e_phrase". - " | $e_inline_forms"; - // note the casing - $e_Inline = new HTMLPurifier_ChildDef_Optional("#PCDATA | $e_inline". - " | $e_misc_inline"); - $e_heading = 'h1|h2|h3|h4|h5|h6'; - $e_lists = 'ul | ol | dl'; - $e_blocktext = 'pre | hr | blockquote | address'; - $e_block = "p | $e_heading | div | $e_lists | $e_blocktext | table"; - $e_Flow = new HTMLPurifier_ChildDef_Optional("#PCDATA | $e_block". - " | $e_inline | $e_misc"); - $e_a_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | $e_special". - " | $e_fontstyle | $e_phrase | $e_inline_forms | $e_misc_inline"); - $e_pre_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | a". - " | $e_special_basic | $e_fontstyle_basic | $e_phrase_basic". - " | $e_inline_forms | $e_misc_inline"); - $e_form_content = new HTMLPurifier_ChildDef_Optional(''); //unused - $e_form_button_content = new HTMLPurifier_ChildDef_Optional(''); // unused - - $this->info['ins'] = - $this->info['del'] = - $this->info['blockquote'] = - $this->info['dd'] = - $this->info['li'] = - $this->info['div'] = new HTMLDTD_Element($e_Flow); - - $this->info['em'] = - $this->info['strong'] = - $this->info['dfn'] = - $this->info['code'] = - $this->info['samp'] = - $this->info['kbd'] = - $this->info['var'] = - $this->info['code'] = - $this->info['samp'] = - $this->info['kbd'] = - $this->info['var'] = - $this->info['cite'] = - $this->info['abbr'] = - $this->info['acronym'] = - $this->info['q'] = - $this->info['sub'] = - $this->info['tt'] = - $this->info['sup'] = - $this->info['i'] = - $this->info['b'] = - $this->info['big'] = - $this->info['small'] = - $this->info['u'] = - $this->info['s'] = - $this->info['strike'] = - $this->info['bdo'] = - $this->info['span'] = - $this->info['dt'] = - $this->info['p'] = - $this->info['h1'] = - $this->info['h2'] = - $this->info['h3'] = - $this->info['h4'] = - $this->info['h5'] = - $this->info['h6'] = new HTMLDTD_Element($e_Inline); - - $this->info['ol'] = - $this->info['ul'] = - new HTMLDTD_Element( - new HTMLPurifier_ChildDef_Required('li') - ); - - $this->info['dl'] = - new HTMLDTD_Element( - new HTMLPurifier_ChildDef_Required('dt|dd') - ); - $this->info['address'] = - new HTMLDTD_Element( - new HTMLPurifier_ChildDef_Optional("#PCDATA | p | $e_inline". - " | $e_misc_inline") - ); - - $this->info['img'] = - $this->info['br'] = - $this->info['hr'] = new HTMLDTD_Element(new HTMLPurifier_ChildDef_Empty()); - - $this->info['pre'] = new HTMLDTD_Element($e_pre_content); - - $this->info['a'] = new HTMLDTD_Element($e_a_content); - - } - - function purifyTokens($tokens) { - if (empty($this->info)) $this->loadData(); - $tokens = $this->removeForeignElements($tokens); - $tokens = $this->makeWellFormed($tokens); - $tokens = $this->fixNesting($tokens); - $tokens = $this->validateAttributes($tokens); - return $tokens; - } - - function removeForeignElements($tokens) { - if (empty($this->info)) $this->loadData(); - $result = array(); - foreach($tokens as $token) { - if (!empty( $token->is_tag )) { - if (!isset($this->info[$token->name])) { - // invalid tag, generate HTML and insert in - $token = new HTMLPurifier_Token_Text( - $this->generator->generateFromToken($token) - ); - } - } elseif ($token->type == 'comment') { - // strip comments - continue; - } elseif ($token->type == 'text') { - } else { - continue; - } - $result[] = $token; - } - return $result; - } - - function makeWellFormed($tokens) { - if (empty($this->info)) $this->loadData(); - $result = array(); - $current_nesting = array(); - foreach ($tokens as $token) { - if (empty( $token->is_tag )) { - $result[] = $token; - continue; - } - $info = $this->info[$token->name]; // assumption but valid - - // test if it claims to be a start tag but is empty - if ($info->child_def->type == 'empty' && - $token->type == 'start' ) { - - $result[] = new HTMLPurifier_Token_Empty($token->name, - $token->attributes); - continue; - } - - // test if it claims to be empty but really is a start tag - if ($info->child_def->type != 'empty' && - $token->type == 'empty' ) { - - $result[] = new HTMLPurifier_Token_Start($token->name, - $token->attributes); - $result[] = new HTMLPurifier_Token_End($token->name); - - continue; - } - - // automatically insert empty tags - if ($token->type == 'empty') { - $result[] = $token; - continue; - } - - // we give start tags precedence, so automatically accept unless... - // it's one of those special cases - if ($token->type == 'start') { - - // if there's a parent, check for special case - if (!empty($current_nesting)) { - $current_parent = array_pop($current_nesting); - - // check if we're closing a P tag - if ($current_parent->name == 'p' && - isset($this->info_closes_p[$token->name]) - ) { - $result[] = new HTMLPurifier_Token_End('p'); - $result[] = $token; - $current_nesting[] = $token; - continue; - } - - // check if we're closing a LI tag - if ($current_parent->name == 'li' && - $token->name == 'li' - ) { - $result[] = new HTMLPurifier_Token_End('li'); - $result[] = $token; - $current_nesting[] = $token; - continue; - } - - // this is more TIDY stuff - // we should also get some TABLE related code - // mismatched h# - - $current_nesting[] = $current_parent; // undo the pop - } - - $result[] = $token; - $current_nesting[] = $token; - continue; - } - - // sanity check - if ($token->type != 'end') continue; - - // okay, we're dealing with a closing tag - - // make sure that we have something open - if (empty($current_nesting)) { - $result[] = new HTMLPurifier_Token_Text( - $this->generator->generateFromToken($token) - ); - continue; - } - - // first, check for the simplest case: everything closes neatly - - // current_nesting is modified - $current_parent = array_pop($current_nesting); - if ($current_parent->name == $token->name) { - $result[] = $token; - continue; - } - - // undo the array_pop - $current_nesting[] = $current_parent; - - // okay, so we're trying to close the wrong tag - - // scroll back the entire nest, trying to find our tag - // feature could be to specify how far you'd like to go - $size = count($current_nesting); - // -2 because -1 is the last element, but we already checked that - $skipped_tags = false; - for ($i = $size - 2; $i >= 0; $i--) { - if ($current_nesting[$i]->name == $token->name) { - // current nesting is modified - $skipped_tags = array_splice($current_nesting, $i); - break; - } - } - - // we still didn't find the tag, so translate to text - if ($skipped_tags === false) { - $result[] = new HTMLPurifier_Token_Text( - $this->generator->generateFromToken($token) - ); - continue; - } - - // okay, we found it, close all the skipped tags - // note that skipped tags contains the element we need closed - $size = count($skipped_tags); - for ($i = $size - 1; $i >= 0; $i--) { - $result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name); - } - - // done! - - } - - // we're at the end now, fix all still unclosed tags - - if (!empty($current_nesting)) { - $size = count($current_nesting); - for ($i = $size - 1; $i >= 0; $i--) { - $result[] = - new HTMLPurifier_Token_End($current_nesting[$i]->name); - } - } - - return $result; - } - - function fixNesting($tokens) { - if (empty($this->info)) $this->loadData(); - - // insert implicit "parent" node, will be removed at end - array_unshift($tokens, new HTMLPurifier_Token_Start('div')); - $tokens[] = new HTMLPurifier_Token_End('div'); - - for ($i = 0, $size = count($tokens) ; $i < $size; ) { - - $child_tokens = array(); - - // scroll to the end of this node, and report number - for ($j = $i, $depth = 0; ; $j++) { - if ($tokens[$j]->type == 'start') { - $depth++; - // skip token assignment on first iteration - if ($depth == 1) continue; - } elseif ($tokens[$j]->type == 'end') { - $depth--; - // skip token assignment on last iteration - if ($depth == 0) break; - } - $child_tokens[] = $tokens[$j]; - } - - // $i is index of start token - // $j is index of end token - - // have DTD child def validate children - $element_def = $this->info[$tokens[$i]->name]; - $result = $element_def->child_def->validateChildren($child_tokens); - - // process result - if ($result === true) { - - // leave the nodes as is - - } elseif($result === false) { - - // WARNING WARNING WARNING!!! - // While for the original DTD, there will never be - // cascading removal, more complex ones may have such - // a problem. - - // If you modify the info array such that an element - // that requires children may contain a child that requires - // children, you need to also scroll back and re-check that - // elements parent node - - $length = $j - $i + 1; - - // remove entire node - array_splice($tokens, $i, $length); - - // change size - $size -= $length; - - // ensure that we scroll to the next node - $i--; - - } else { - - $length = $j - $i - 1; - - // replace node with $result - array_splice($tokens, $i + 1, $length, $result); - - // change size - $size -= $length; - $size += count($result); - - } - - // scroll to next node - $i++; - while ($i < $size and $tokens[$i]->type != 'start') $i++; - - } - - // remove implicit divs - array_shift($tokens); - array_pop($tokens); - - return $tokens; - - } - - function validateAttributes($tokens) { - if (empty($this->info)) $this->loadData(); - - } - -} - -class HTMLDTD_Element -{ - - var $child_def; - var $attr_def = array(); - - function HTMLDTD_Element($child_def, $attr_def = array()) { - $this->child_def = $child_def; - $this->attr_def = $attr_def; - } - -} - -// HTMLPurifier_ChildDef and inheritance have three types of output: -// true = leave nodes as is -// false = delete parent node and all children -// array(...) = replace children nodes with these - -// this is the hardest one to implement. We'll use fancy regexp tricks -// right now, we only expect it to return TRUE or FALSE (it won't attempt -// to fix the tree) - -// we may end up writing custom code for each HTML case -// in order to make it self correcting -class HTMLPurifier_ChildDef -{ - var $type = 'custom'; - var $dtd_regex; - var $_pcre_regex; - function HTMLPurifier_ChildDef($dtd_regex) { - $this->dtd_regex = $dtd_regex; - $this->_compileRegex(); - } - function _compileRegex() { - $raw = str_replace(' ', '', $this->dtd_regex); - if ($raw{0} != '(') { - $raw = "($raw)"; - } - $reg = str_replace(',', ',?', $raw); - $reg = preg_replace('/([#a-zA-Z0-9_.-]+)/', '(,?\\0)', $reg); - $this->_pcre_regex = $reg; - } - function validateChildren($tokens_of_children) { - $list_of_children = ''; - $nesting = 0; // depth into the nest - foreach ($tokens_of_children as $token) { - if (!empty($token->is_whitespace)) continue; - - $is_child = ($nesting == 0); // direct - - if ($token->type == 'start') { - $nesting++; - } elseif ($token->type == 'end') { - $nesting--; - } - - if ($is_child) { - $list_of_children .= $token->name . ','; - } - } - $list_of_children = rtrim($list_of_children, ','); - - $okay = - preg_match( - '/^'.$this->_pcre_regex.'$/', - $list_of_children - ); - - return (bool) $okay; - } -} -class HTMLPurifier_ChildDef_Simple extends HTMLPurifier_ChildDef -{ - var $elements = array(); - function HTMLPurifier_ChildDef_Simple($elements) { - if (is_string($elements)) { - $elements = str_replace(' ', '', $elements); - $elements = explode('|', $elements); - } - $elements = array_flip($elements); - foreach ($elements as $i => $x) $elements[$i] = true; - $this->elements = $elements; - $this->gen = new HTMLPurifier_Generator(); - } - function validateChildren() { - trigger_error('Cannot call abstract function!', E_USER_ERROR); - } -} -class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef_Simple -{ - var $type = 'required'; - function validateChildren($tokens_of_children) { - // if there are no tokens, delete parent node - if (empty($tokens_of_children)) return false; - - // the new set of children - $result = array(); - - // current depth into the nest - $nesting = 0; - - // whether or not we're deleting a node - $is_deleting = false; - - // whether or not parsed character data is allowed - // this controls whether or not we silently drop a tag - // or generate escaped HTML from it - $pcdata_allowed = isset($this->elements['#PCDATA']); - - // a little sanity check to make sure it's not ALL whitespace - $all_whitespace = true; - - foreach ($tokens_of_children as $token) { - if (!empty($token->is_whitespace)) { - $result[] = $token; - continue; - } - $all_whitespace = false; // phew, we're not talking about whitespace - - $is_child = ($nesting == 0); - - if ($token->type == 'start') { - $nesting++; - } elseif ($token->type == 'end') { - $nesting--; - } - - if ($is_child) { - $is_deleting = false; - if (!isset($this->elements[$token->name])) { - $is_deleting = true; - if ($pcdata_allowed) { - $result[] = new HTMLPurifier_Token_Text( - $this->gen->generateFromToken($token) - ); - } - continue; - } - } - if (!$is_deleting) { - $result[] = $token; - } elseif ($pcdata_allowed) { - $result[] = - new HTMLPurifier_Token_Text( - $this->gen->generateFromToken( $token ) - ); - } else { - // drop silently - } - } - if (empty($result)) return false; - if ($all_whitespace) return false; - if ($tokens_of_children == $result) return true; - return $result; - } -} - -// only altered behavior is that it returns an empty array -// instead of a false (to delete the node) -class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required -{ - var $type = 'optional'; - function validateChildren($tokens_of_children) { - $result = parent::validateChildren($tokens_of_children); - if ($result === false) return array(); - return $result; - } -} - -// placeholder -class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef -{ - var $type = 'empty'; - function HTMLPurifier_ChildDef_Empty() {} - function validateChildren() { - return false; - } -} - class HTMLPurifier_AttrDef { var $def; diff --git a/ChildDef.php b/ChildDef.php index d454ea2e..a2382cc3 100644 --- a/ChildDef.php +++ b/ChildDef.php @@ -1,442 +1,5 @@ true, - 'blockquote' => true, - 'dd' => true, - 'dir' => true, - 'div' => true, - 'dl' => true, - 'dt' => true, - 'h1' => true, - 'h2' => true, - 'h3' => true, - 'h4' => true, - 'h5' => true, - 'h6' => true, - 'hr' => true, - 'ol' => true, - 'p' => true, - 'pre' => true, - 'table' => true, - 'ul' => true - ); - - function HTMLPurifier_Definition() { - $this->generator = new HTMLPurifier_Generator(); - } - - function loadData() { - // emulates the structure of the DTD - - // entities: prefixed with e_ and _ replaces . - // we don't use an array because that complicates interpolation - // strings are used instead of arrays because if you use arrays, - // you have to do some hideous manipulation with array_merge() - - // these are condensed, remember, with bad stuff taken out - - // transforms: font, menu, dir, center - - // DON'T MONKEY AROUND THIS unless you know what you are doing - // and also know the assumptions the code makes about what this - // contains for optimization purposes (see fixNesting) - - $e_special_extra = 'img'; - $e_special_basic = 'br | span | bdo'; - $e_special = "$e_special_basic | $e_special_extra"; - $e_fontstyle_extra = 'big | small'; - $e_fontstyle_basic = 'tt | i | b | u | s | strike'; - $e_fontstyle = "$e_fontstyle_basic | $e_fontstyle_extra"; - $e_phrase_extra = 'sub | sup'; - $e_phrase_basic = 'em | strong | dfn | code | q | samp | kbd | var'. - ' | cite | abbr | acronym'; - $e_phrase = "$e_phrase_basic | $e_phrase_extra"; - $e_inline_forms = ''; // humor the dtd - $e_misc_inline = 'ins | del'; - $e_misc = "$e_misc_inline"; - $e_inline = "a | $e_special | $e_fontstyle | $e_phrase". - " | $e_inline_forms"; - // note the casing - $e_Inline = new HTMLPurifier_ChildDef_Optional("#PCDATA | $e_inline". - " | $e_misc_inline"); - $e_heading = 'h1|h2|h3|h4|h5|h6'; - $e_lists = 'ul | ol | dl'; - $e_blocktext = 'pre | hr | blockquote | address'; - $e_block = "p | $e_heading | div | $e_lists | $e_blocktext | table"; - $e_Flow = new HTMLPurifier_ChildDef_Optional("#PCDATA | $e_block". - " | $e_inline | $e_misc"); - $e_a_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | $e_special". - " | $e_fontstyle | $e_phrase | $e_inline_forms | $e_misc_inline"); - $e_pre_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | a". - " | $e_special_basic | $e_fontstyle_basic | $e_phrase_basic". - " | $e_inline_forms | $e_misc_inline"); - $e_form_content = new HTMLPurifier_ChildDef_Optional(''); //unused - $e_form_button_content = new HTMLPurifier_ChildDef_Optional(''); // unused - - $this->info['ins'] = - $this->info['del'] = - $this->info['blockquote'] = - $this->info['dd'] = - $this->info['li'] = - $this->info['div'] = new HTMLDTD_Element($e_Flow); - - $this->info['em'] = - $this->info['strong'] = - $this->info['dfn'] = - $this->info['code'] = - $this->info['samp'] = - $this->info['kbd'] = - $this->info['var'] = - $this->info['code'] = - $this->info['samp'] = - $this->info['kbd'] = - $this->info['var'] = - $this->info['cite'] = - $this->info['abbr'] = - $this->info['acronym'] = - $this->info['q'] = - $this->info['sub'] = - $this->info['tt'] = - $this->info['sup'] = - $this->info['i'] = - $this->info['b'] = - $this->info['big'] = - $this->info['small'] = - $this->info['u'] = - $this->info['s'] = - $this->info['strike'] = - $this->info['bdo'] = - $this->info['span'] = - $this->info['dt'] = - $this->info['p'] = - $this->info['h1'] = - $this->info['h2'] = - $this->info['h3'] = - $this->info['h4'] = - $this->info['h5'] = - $this->info['h6'] = new HTMLDTD_Element($e_Inline); - - $this->info['ol'] = - $this->info['ul'] = - new HTMLDTD_Element( - new HTMLPurifier_ChildDef_Required('li') - ); - - $this->info['dl'] = - new HTMLDTD_Element( - new HTMLPurifier_ChildDef_Required('dt|dd') - ); - $this->info['address'] = - new HTMLDTD_Element( - new HTMLPurifier_ChildDef_Optional("#PCDATA | p | $e_inline". - " | $e_misc_inline") - ); - - $this->info['img'] = - $this->info['br'] = - $this->info['hr'] = new HTMLDTD_Element(new HTMLPurifier_ChildDef_Empty()); - - $this->info['pre'] = new HTMLDTD_Element($e_pre_content); - - $this->info['a'] = new HTMLDTD_Element($e_a_content); - - } - - function purifyTokens($tokens) { - if (empty($this->info)) $this->loadData(); - $tokens = $this->removeForeignElements($tokens); - $tokens = $this->makeWellFormed($tokens); - $tokens = $this->fixNesting($tokens); - $tokens = $this->validateAttributes($tokens); - return $tokens; - } - - function removeForeignElements($tokens) { - if (empty($this->info)) $this->loadData(); - $result = array(); - foreach($tokens as $token) { - if (!empty( $token->is_tag )) { - if (!isset($this->info[$token->name])) { - // invalid tag, generate HTML and insert in - $token = new HTMLPurifier_Token_Text( - $this->generator->generateFromToken($token) - ); - } - } elseif ($token->type == 'comment') { - // strip comments - continue; - } elseif ($token->type == 'text') { - } else { - continue; - } - $result[] = $token; - } - return $result; - } - - function makeWellFormed($tokens) { - if (empty($this->info)) $this->loadData(); - $result = array(); - $current_nesting = array(); - foreach ($tokens as $token) { - if (empty( $token->is_tag )) { - $result[] = $token; - continue; - } - $info = $this->info[$token->name]; // assumption but valid - - // test if it claims to be a start tag but is empty - if ($info->child_def->type == 'empty' && - $token->type == 'start' ) { - - $result[] = new HTMLPurifier_Token_Empty($token->name, - $token->attributes); - continue; - } - - // test if it claims to be empty but really is a start tag - if ($info->child_def->type != 'empty' && - $token->type == 'empty' ) { - - $result[] = new HTMLPurifier_Token_Start($token->name, - $token->attributes); - $result[] = new HTMLPurifier_Token_End($token->name); - - continue; - } - - // automatically insert empty tags - if ($token->type == 'empty') { - $result[] = $token; - continue; - } - - // we give start tags precedence, so automatically accept unless... - // it's one of those special cases - if ($token->type == 'start') { - - // if there's a parent, check for special case - if (!empty($current_nesting)) { - $current_parent = array_pop($current_nesting); - - // check if we're closing a P tag - if ($current_parent->name == 'p' && - isset($this->info_closes_p[$token->name]) - ) { - $result[] = new HTMLPurifier_Token_End('p'); - $result[] = $token; - $current_nesting[] = $token; - continue; - } - - // check if we're closing a LI tag - if ($current_parent->name == 'li' && - $token->name == 'li' - ) { - $result[] = new HTMLPurifier_Token_End('li'); - $result[] = $token; - $current_nesting[] = $token; - continue; - } - - // this is more TIDY stuff - // we should also get some TABLE related code - // mismatched h# - - $current_nesting[] = $current_parent; // undo the pop - } - - $result[] = $token; - $current_nesting[] = $token; - continue; - } - - // sanity check - if ($token->type != 'end') continue; - - // okay, we're dealing with a closing tag - - // make sure that we have something open - if (empty($current_nesting)) { - $result[] = new HTMLPurifier_Token_Text( - $this->generator->generateFromToken($token) - ); - continue; - } - - // first, check for the simplest case: everything closes neatly - - // current_nesting is modified - $current_parent = array_pop($current_nesting); - if ($current_parent->name == $token->name) { - $result[] = $token; - continue; - } - - // undo the array_pop - $current_nesting[] = $current_parent; - - // okay, so we're trying to close the wrong tag - - // scroll back the entire nest, trying to find our tag - // feature could be to specify how far you'd like to go - $size = count($current_nesting); - // -2 because -1 is the last element, but we already checked that - $skipped_tags = false; - for ($i = $size - 2; $i >= 0; $i--) { - if ($current_nesting[$i]->name == $token->name) { - // current nesting is modified - $skipped_tags = array_splice($current_nesting, $i); - break; - } - } - - // we still didn't find the tag, so translate to text - if ($skipped_tags === false) { - $result[] = new HTMLPurifier_Token_Text( - $this->generator->generateFromToken($token) - ); - continue; - } - - // okay, we found it, close all the skipped tags - // note that skipped tags contains the element we need closed - $size = count($skipped_tags); - for ($i = $size - 1; $i >= 0; $i--) { - $result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name); - } - - // done! - - } - - // we're at the end now, fix all still unclosed tags - - if (!empty($current_nesting)) { - $size = count($current_nesting); - for ($i = $size - 1; $i >= 0; $i--) { - $result[] = - new HTMLPurifier_Token_End($current_nesting[$i]->name); - } - } - - return $result; - } - - function fixNesting($tokens) { - if (empty($this->info)) $this->loadData(); - - // insert implicit "parent" node, will be removed at end - array_unshift($tokens, new HTMLPurifier_Token_Start('div')); - $tokens[] = new HTMLPurifier_Token_End('div'); - - for ($i = 0, $size = count($tokens) ; $i < $size; ) { - - $child_tokens = array(); - - // scroll to the end of this node, and report number - for ($j = $i, $depth = 0; ; $j++) { - if ($tokens[$j]->type == 'start') { - $depth++; - // skip token assignment on first iteration - if ($depth == 1) continue; - } elseif ($tokens[$j]->type == 'end') { - $depth--; - // skip token assignment on last iteration - if ($depth == 0) break; - } - $child_tokens[] = $tokens[$j]; - } - - // $i is index of start token - // $j is index of end token - - // have DTD child def validate children - $element_def = $this->info[$tokens[$i]->name]; - $result = $element_def->child_def->validateChildren($child_tokens); - - // process result - if ($result === true) { - - // leave the nodes as is - - } elseif($result === false) { - - // WARNING WARNING WARNING!!! - // While for the original DTD, there will never be - // cascading removal, more complex ones may have such - // a problem. - - // If you modify the info array such that an element - // that requires children may contain a child that requires - // children, you need to also scroll back and re-check that - // elements parent node - - $length = $j - $i + 1; - - // remove entire node - array_splice($tokens, $i, $length); - - // change size - $size -= $length; - - // ensure that we scroll to the next node - $i--; - - } else { - - $length = $j - $i - 1; - - // replace node with $result - array_splice($tokens, $i + 1, $length, $result); - - // change size - $size -= $length; - $size += count($result); - - } - - // scroll to next node - $i++; - while ($i < $size and $tokens[$i]->type != 'start') $i++; - - } - - // remove implicit divs - array_shift($tokens); - array_pop($tokens); - - return $tokens; - - } - - function validateAttributes($tokens) { - if (empty($this->info)) $this->loadData(); - - } - -} - -class HTMLDTD_Element -{ - - var $child_def; - var $attr_def = array(); - - function HTMLDTD_Element($child_def, $attr_def = array()) { - $this->child_def = $child_def; - $this->attr_def = $attr_def; - } - -} - // HTMLPurifier_ChildDef and inheritance have three types of output: // true = leave nodes as is // false = delete parent node and all children @@ -603,12 +166,4 @@ class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef } } -class HTMLPurifier_AttrDef -{ - var $def; - function HTMLPurifier_AttrDef($def) { - $this->def = $def; - } -} - ?> \ No newline at end of file diff --git a/Definition.php b/Definition.php index d454ea2e..e0a17fd5 100644 --- a/Definition.php +++ b/Definition.php @@ -85,7 +85,7 @@ class HTMLPurifier_Definition $this->info['blockquote'] = $this->info['dd'] = $this->info['li'] = - $this->info['div'] = new HTMLDTD_Element($e_Flow); + $this->info['div'] = new HTMLPurifier_ElementDef($e_Flow); $this->info['em'] = $this->info['strong'] = @@ -121,31 +121,31 @@ class HTMLPurifier_Definition $this->info['h3'] = $this->info['h4'] = $this->info['h5'] = - $this->info['h6'] = new HTMLDTD_Element($e_Inline); + $this->info['h6'] = new HTMLPurifier_ElementDef($e_Inline); $this->info['ol'] = $this->info['ul'] = - new HTMLDTD_Element( + new HTMLPurifier_ElementDef( new HTMLPurifier_ChildDef_Required('li') ); $this->info['dl'] = - new HTMLDTD_Element( + new HTMLPurifier_ElementDef( new HTMLPurifier_ChildDef_Required('dt|dd') ); $this->info['address'] = - new HTMLDTD_Element( + new HTMLPurifier_ElementDef( new HTMLPurifier_ChildDef_Optional("#PCDATA | p | $e_inline". " | $e_misc_inline") ); $this->info['img'] = $this->info['br'] = - $this->info['hr'] = new HTMLDTD_Element(new HTMLPurifier_ChildDef_Empty()); + $this->info['hr'] = new HTMLPurifier_ElementDef(new HTMLPurifier_ChildDef_Empty()); - $this->info['pre'] = new HTMLDTD_Element($e_pre_content); + $this->info['pre'] = new HTMLPurifier_ElementDef($e_pre_content); - $this->info['a'] = new HTMLDTD_Element($e_a_content); + $this->info['a'] = new HTMLPurifier_ElementDef($e_a_content); } @@ -424,191 +424,17 @@ class HTMLPurifier_Definition } -class HTMLDTD_Element +class HTMLPurifier_ElementDef { var $child_def; var $attr_def = array(); - function HTMLDTD_Element($child_def, $attr_def = array()) { + function HTMLPurifier_ElementDef($child_def, $attr_def = array()) { $this->child_def = $child_def; $this->attr_def = $attr_def; } } -// HTMLPurifier_ChildDef and inheritance have three types of output: -// true = leave nodes as is -// false = delete parent node and all children -// array(...) = replace children nodes with these - -// this is the hardest one to implement. We'll use fancy regexp tricks -// right now, we only expect it to return TRUE or FALSE (it won't attempt -// to fix the tree) - -// we may end up writing custom code for each HTML case -// in order to make it self correcting -class HTMLPurifier_ChildDef -{ - var $type = 'custom'; - var $dtd_regex; - var $_pcre_regex; - function HTMLPurifier_ChildDef($dtd_regex) { - $this->dtd_regex = $dtd_regex; - $this->_compileRegex(); - } - function _compileRegex() { - $raw = str_replace(' ', '', $this->dtd_regex); - if ($raw{0} != '(') { - $raw = "($raw)"; - } - $reg = str_replace(',', ',?', $raw); - $reg = preg_replace('/([#a-zA-Z0-9_.-]+)/', '(,?\\0)', $reg); - $this->_pcre_regex = $reg; - } - function validateChildren($tokens_of_children) { - $list_of_children = ''; - $nesting = 0; // depth into the nest - foreach ($tokens_of_children as $token) { - if (!empty($token->is_whitespace)) continue; - - $is_child = ($nesting == 0); // direct - - if ($token->type == 'start') { - $nesting++; - } elseif ($token->type == 'end') { - $nesting--; - } - - if ($is_child) { - $list_of_children .= $token->name . ','; - } - } - $list_of_children = rtrim($list_of_children, ','); - - $okay = - preg_match( - '/^'.$this->_pcre_regex.'$/', - $list_of_children - ); - - return (bool) $okay; - } -} -class HTMLPurifier_ChildDef_Simple extends HTMLPurifier_ChildDef -{ - var $elements = array(); - function HTMLPurifier_ChildDef_Simple($elements) { - if (is_string($elements)) { - $elements = str_replace(' ', '', $elements); - $elements = explode('|', $elements); - } - $elements = array_flip($elements); - foreach ($elements as $i => $x) $elements[$i] = true; - $this->elements = $elements; - $this->gen = new HTMLPurifier_Generator(); - } - function validateChildren() { - trigger_error('Cannot call abstract function!', E_USER_ERROR); - } -} -class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef_Simple -{ - var $type = 'required'; - function validateChildren($tokens_of_children) { - // if there are no tokens, delete parent node - if (empty($tokens_of_children)) return false; - - // the new set of children - $result = array(); - - // current depth into the nest - $nesting = 0; - - // whether or not we're deleting a node - $is_deleting = false; - - // whether or not parsed character data is allowed - // this controls whether or not we silently drop a tag - // or generate escaped HTML from it - $pcdata_allowed = isset($this->elements['#PCDATA']); - - // a little sanity check to make sure it's not ALL whitespace - $all_whitespace = true; - - foreach ($tokens_of_children as $token) { - if (!empty($token->is_whitespace)) { - $result[] = $token; - continue; - } - $all_whitespace = false; // phew, we're not talking about whitespace - - $is_child = ($nesting == 0); - - if ($token->type == 'start') { - $nesting++; - } elseif ($token->type == 'end') { - $nesting--; - } - - if ($is_child) { - $is_deleting = false; - if (!isset($this->elements[$token->name])) { - $is_deleting = true; - if ($pcdata_allowed) { - $result[] = new HTMLPurifier_Token_Text( - $this->gen->generateFromToken($token) - ); - } - continue; - } - } - if (!$is_deleting) { - $result[] = $token; - } elseif ($pcdata_allowed) { - $result[] = - new HTMLPurifier_Token_Text( - $this->gen->generateFromToken( $token ) - ); - } else { - // drop silently - } - } - if (empty($result)) return false; - if ($all_whitespace) return false; - if ($tokens_of_children == $result) return true; - return $result; - } -} - -// only altered behavior is that it returns an empty array -// instead of a false (to delete the node) -class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required -{ - var $type = 'optional'; - function validateChildren($tokens_of_children) { - $result = parent::validateChildren($tokens_of_children); - if ($result === false) return array(); - return $result; - } -} - -// placeholder -class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef -{ - var $type = 'empty'; - function HTMLPurifier_ChildDef_Empty() {} - function validateChildren() { - return false; - } -} - -class HTMLPurifier_AttrDef -{ - var $def; - function HTMLPurifier_AttrDef($def) { - $this->def = $def; - } -} - ?> \ No newline at end of file diff --git a/tests/ChildDef.php b/tests/ChildDef.php index 39e0c062..a46147ea 100644 --- a/tests/ChildDef.php +++ b/tests/ChildDef.php @@ -125,265 +125,4 @@ class Test_HTMLPurifier_ChildDef extends UnitTestCase } -class Test_HTMLPurifier_Definition extends UnitTestCase -{ - - var $def, $lex; - - function Test_HTMLPurifier_Definition() { - $this->UnitTestCase(); - $this->def = new HTMLPurifier_Definition(); - $this->def->loadData(); - $this->lex = new HTMLPurifier_Lexer(); - } - - function test_removeForeignElements() { - - $inputs = array(); - $expect = array(); - - $inputs[0] = array(); - $expect[0] = $inputs[0]; - - $inputs[1] = array( - new HTMLPurifier_Token_Text('This is ') - ,new HTMLPurifier_Token_Start('b', array()) - ,new HTMLPurifier_Token_Text('bold') - ,new HTMLPurifier_Token_End('b') - ,new HTMLPurifier_Token_Text(' text') - ); - $expect[1] = $inputs[1]; - - $inputs[2] = array( - new HTMLPurifier_Token_Start('asdf') - ,new HTMLPurifier_Token_End('asdf') - ,new HTMLPurifier_Token_Start('d', array('href' => 'bang!')) - ,new HTMLPurifier_Token_End('d') - ,new HTMLPurifier_Token_Start('pooloka') - ,new HTMLPurifier_Token_Start('poolasdf') - ,new HTMLPurifier_Token_Start('ds', array('moogle' => '&')) - ,new HTMLPurifier_Token_End('asdf') - ,new HTMLPurifier_Token_End('asdf') - ); - $expect[2] = array( - new HTMLPurifier_Token_Text('') - ,new HTMLPurifier_Token_Text('') - ,new HTMLPurifier_Token_Text('') - ,new HTMLPurifier_Token_Text('') - ,new HTMLPurifier_Token_Text('') - ,new HTMLPurifier_Token_Text('') - ,new HTMLPurifier_Token_Text('') - ,new HTMLPurifier_Token_Text('') - ,new HTMLPurifier_Token_Text('') - ); - - foreach ($inputs as $i => $input) { - $result = $this->def->removeForeignElements($input); - $this->assertEqual($expect[$i], $result); - paintIf($result, $result != $expect[$i]); - } - - } - - function test_makeWellFormed() { - - $inputs = array(); - $expect = array(); - - $inputs[0] = array(); - $expect[0] = $inputs[0]; - - $inputs[1] = array( - new HTMLPurifier_Token_Text('This is ') - ,new HTMLPurifier_Token_Start('b') - ,new HTMLPurifier_Token_Text('bold') - ,new HTMLPurifier_Token_End('b') - ,new HTMLPurifier_Token_Text(' text') - ,new HTMLPurifier_Token_Empty('br') - ); - $expect[1] = $inputs[1]; - - $inputs[2] = array( - new HTMLPurifier_Token_Start('b') - ,new HTMLPurifier_Token_Text('Unclosed tag, gasp!') - ); - $expect[2] = array( - new HTMLPurifier_Token_Start('b') - ,new HTMLPurifier_Token_Text('Unclosed tag, gasp!') - ,new HTMLPurifier_Token_End('b') - ); - - $inputs[3] = array( - new HTMLPurifier_Token_Start('b') - ,new HTMLPurifier_Token_Start('i') - ,new HTMLPurifier_Token_Text('The b is closed, but the i is not') - ,new HTMLPurifier_Token_End('b') - ); - $expect[3] = array( - new HTMLPurifier_Token_Start('b') - ,new HTMLPurifier_Token_Start('i') - ,new HTMLPurifier_Token_Text('The b is closed, but the i is not') - ,new HTMLPurifier_Token_End('i') - ,new HTMLPurifier_Token_End('b') - ); - - $inputs[4] = array( - new HTMLPurifier_Token_Text('Hey, recycle unused end tags!') - ,new HTMLPurifier_Token_End('b') - ); - $expect[4] = array( - new HTMLPurifier_Token_Text('Hey, recycle unused end tags!') - ,new HTMLPurifier_Token_Text('') - ); - - $inputs[5] = array(new HTMLPurifier_Token_Start('br', array('style' => 'clear:both;'))); - $expect[5] = array(new HTMLPurifier_Token_Empty('br', array('style' => 'clear:both;'))); - - $inputs[6] = array(new HTMLPurifier_Token_Empty('div', array('style' => 'clear:both;'))); - $expect[6] = array( - new HTMLPurifier_Token_Start('div', array('style' => 'clear:both;')) - ,new HTMLPurifier_Token_End('div') - ); - - // test automatic paragraph closing - - $inputs[7] = array( - new HTMLPurifier_Token_Start('p') - ,new HTMLPurifier_Token_Text('Paragraph 1') - ,new HTMLPurifier_Token_Start('p') - ,new HTMLPurifier_Token_Text('Paragraph 2') - ); - $expect[7] = array( - new HTMLPurifier_Token_Start('p') - ,new HTMLPurifier_Token_Text('Paragraph 1') - ,new HTMLPurifier_Token_End('p') - ,new HTMLPurifier_Token_Start('p') - ,new HTMLPurifier_Token_Text('Paragraph 2') - ,new HTMLPurifier_Token_End('p') - ); - - $inputs[8] = array( - new HTMLPurifier_Token_Start('div') - ,new HTMLPurifier_Token_Start('p') - ,new HTMLPurifier_Token_Text('Paragraph 1 in a div') - ,new HTMLPurifier_Token_End('div') - ); - $expect[8] = array( - new HTMLPurifier_Token_Start('div') - ,new HTMLPurifier_Token_Start('p') - ,new HTMLPurifier_Token_Text('Paragraph 1 in a div') - ,new HTMLPurifier_Token_End('p') - ,new HTMLPurifier_Token_End('div') - ); - - // automatic list closing - - $inputs[9] = array( - new HTMLPurifier_Token_Start('ol') - - ,new HTMLPurifier_Token_Start('li') - ,new HTMLPurifier_Token_Text('Item 1') - - ,new HTMLPurifier_Token_Start('li') - ,new HTMLPurifier_Token_Text('Item 2') - - ,new HTMLPurifier_Token_End('ol') - ); - $expect[9] = array( - new HTMLPurifier_Token_Start('ol') - - ,new HTMLPurifier_Token_Start('li') - ,new HTMLPurifier_Token_Text('Item 1') - ,new HTMLPurifier_Token_End('li') - - ,new HTMLPurifier_Token_Start('li') - ,new HTMLPurifier_Token_Text('Item 2') - ,new HTMLPurifier_Token_End('li') - - ,new HTMLPurifier_Token_End('ol') - ); - - foreach ($inputs as $i => $input) { - $result = $this->def->makeWellFormed($input); - $this->assertEqual($expect[$i], $result); - paintIf($result, $result != $expect[$i]); - } - - } - - function test_fixNesting() { - $inputs = array(); - $expect = array(); - - // next id = 4 - - // legal inline nesting - $inputs[0] = array( - new HTMLPurifier_Token_Start('b'), - new HTMLPurifier_Token_Text('Bold text'), - new HTMLPurifier_Token_End('b'), - ); - $expect[0] = $inputs[0]; - - // legal inline and block - // as the parent element is considered FLOW - $inputs[1] = array( - new HTMLPurifier_Token_Start('a', array('href' => 'http://www.example.com/')), - new HTMLPurifier_Token_Text('Linky'), - new HTMLPurifier_Token_End('a'), - new HTMLPurifier_Token_Start('div'), - new HTMLPurifier_Token_Text('Block element'), - new HTMLPurifier_Token_End('div'), - ); - $expect[1] = $inputs[1]; - - // illegal block in inline, element -> text - $inputs[2] = array( - new HTMLPurifier_Token_Start('b'), - new HTMLPurifier_Token_Start('div'), - new HTMLPurifier_Token_Text('Illegal Div'), - new HTMLPurifier_Token_End('div'), - new HTMLPurifier_Token_End('b'), - ); - $expect[2] = array( - new HTMLPurifier_Token_Start('b'), - new HTMLPurifier_Token_Text('
'), - new HTMLPurifier_Token_Text('Illegal Div'), - new HTMLPurifier_Token_Text('
'), - new HTMLPurifier_Token_End('b'), - ); - - // test of empty set that's required, resulting in removal of node - $inputs[3] = array( - new HTMLPurifier_Token_Start('ul'), - new HTMLPurifier_Token_End('ul') - ); - $expect[3] = array(); - - // test illegal text which gets removed - $inputs[4] = array( - new HTMLPurifier_Token_Start('ul'), - new HTMLPurifier_Token_Text('Illegal Text'), - new HTMLPurifier_Token_Start('li'), - new HTMLPurifier_Token_Text('Legal item'), - new HTMLPurifier_Token_End('li'), - new HTMLPurifier_Token_End('ul') - ); - $expect[4] = array( - new HTMLPurifier_Token_Start('ul'), - new HTMLPurifier_Token_Start('li'), - new HTMLPurifier_Token_Text('Legal item'), - new HTMLPurifier_Token_End('li'), - new HTMLPurifier_Token_End('ul') - ); - - foreach ($inputs as $i => $input) { - $result = $this->def->fixNesting($input); - $this->assertEqual($expect[$i], $result); - paintIf($result, $result != $expect[$i]); - } - } - -} - ?> \ No newline at end of file diff --git a/tests/Definition.php b/tests/Definition.php index 39e0c062..37f141ea 100644 --- a/tests/Definition.php +++ b/tests/Definition.php @@ -1,130 +1,5 @@ lex = new HTMLPurifier_Lexer(); - $this->gen = new HTMLPurifier_Generator(); - parent::UnitTestCase(); - } - - function assertSeries($inputs, $expect, $def) { - foreach ($inputs as $i => $input) { - $tokens = $this->lex->tokenizeHTML($input); - $result = $def->validateChildren($tokens); - if (is_bool($expect[$i])) { - $this->assertIdentical($expect[$i], $result); - } else { - $result_html = $this->gen->generateFromTokens($result); - $this->assertEqual($expect[$i], $result_html); - paintIf($result_html, $result_html != $expect[$i]); - } - } - } - - function test_complex() { - - // the table definition - $def = new HTMLPurifier_ChildDef( - '(caption?, (col*|colgroup*), thead?, tfoot?, (tbody+|tr+))'); - - $inputs[0] = ''; - $expect[0] = false; - - // we really don't care what's inside, because if it turns out - // this tr is illegal, we'll end up re-evaluating the parent node - // anyway. - $inputs[1] = ''; - $expect[1] = true; - - $inputs[2] = '' . - ''; - $expect[2] = true; - - $inputs[3] = ''; - $expect[3] = true; - - $this->assertSeries($inputs, $expect, $def); - - } - - function test_simple() { - - // simple is actually an abstract class - // but we're unit testing some of the conv. functions it gives - - $def = new HTMLPurifier_ChildDef_Simple('foobar | bang |gizmo'); - $this->assertEqual($def->elements, - array( - 'foobar' => true - ,'bang' => true - ,'gizmo' => true - )); - - $def = new HTMLPurifier_ChildDef_Simple(array('href', 'src')); - $this->assertEqual($def->elements, - array( - 'href' => true - ,'src' => true - )); - } - - function test_required_pcdata_forbidden() { - - $def = new HTMLPurifier_ChildDef_Required('dt | dd'); - - $inputs[0] = array(); - $expect[0] = false; - - $inputs[1] = '
Term
Text in an illegal location'. - '
Definition
Illegal tag'; - - $expect[1] = '
Term
Definition
'; - - $inputs[2] = 'How do you do!'; - $expect[2] = false; - - // whitespace shouldn't trigger it - $inputs[3] = "\n
Definition
"; - $expect[3] = true; - - $inputs[4] ='
Definition
'; - $expect[4] = '
Definition
'; - - $inputs[5] = "\t "; - $expect[5] = false; - - $this->assertSeries($inputs, $expect, $def); - - } - - function test_required_pcdata_allowed() { - $def = new HTMLPurifier_ChildDef_Required('#PCDATA | b'); - - $inputs[0] = 'Bold text'; - $expect[0] = 'Bold text<img />'; - - $this->assertSeries($inputs, $expect, $def); - } - - function test_optional() { - $def = new HTMLPurifier_ChildDef_Optional('b | i'); - - $inputs[0] = 'Bold text'; - $expect[0] = 'Bold text'; - - $inputs[1] = 'Not allowed text'; - $expect[1] = ''; - - $this->assertSeries($inputs, $expect, $def); - } - -} - class Test_HTMLPurifier_Definition extends UnitTestCase { diff --git a/tests/index.php b/tests/index.php index 983580de..4ca16c77 100644 --- a/tests/index.php +++ b/tests/index.php @@ -12,6 +12,8 @@ require_once 'HTMLPurifier/HTMLPurifier.php'; require_once 'HTMLPurifier/Lexer.php'; require_once 'HTMLPurifier/Token.php'; require_once 'HTMLPurifier/Definition.php'; +require_once 'HTMLPurifier/AttrDef.php'; +require_once 'HTMLPurifier/ChildDef.php'; require_once 'HTMLPurifier/Generator.php'; $test = new GroupTest('HTMLPurifier'); @@ -20,6 +22,7 @@ $test->addTestFile('HTMLPurifier.php'); $test->addTestFile('Lexer.php'); //$test->addTestFile('Token.php'); $test->addTestFile('Definition.php'); +$test->addTestFile('ChildDef.php'); $test->addTestFile('Generator.php'); $test->run( new HtmlReporter() );