diff --git a/library/HTMLPurifier/Definition.php b/library/HTMLPurifier/Definition.php index ffc3ae78..7ec774d0 100644 --- a/library/HTMLPurifier/Definition.php +++ b/library/HTMLPurifier/Definition.php @@ -33,30 +33,6 @@ class HTMLPurifier_Definition var $info = array(); - // used solely by HTMLPurifier_Strategy_MakeWellFormed - var $info_closes_p = array( - // these are all block elements: blocks aren't allowed in P - 'address' => true, - 'blockquote' => true, - 'dd' => true, - 'dir' => true, - 'div' => true, - 'dl' => true, - 'dt' => true, - 'h1' => true, - 'h2' => true, - 'h3' => true, - 'h4' => true, - 'h5' => true, - 'h6' => true, - 'hr' => true, - 'ol' => true, - 'p' => true, - 'pre' => true, - 'table' => true, - 'ul' => true - ); - // used solely by HTMLPurifier_Strategy_ValidateAttributes var $info_global_attr = array(); @@ -75,7 +51,23 @@ class HTMLPurifier_Definition function HTMLPurifier_Definition() {} function setup() { + // emulates the structure of the DTD + // these are condensed, however, with bad stuff taken out + // screening process was done by hand + + // The code makes certain assumptions about the structure of this + // definition for optimization reasons: + // + // FixNesting - There will never be a need for cascading removal + // of tags, usually triggered by a node requiring the + // existence of another node that may be deleted. + + ////////////////////////////////////////////////////////////////////// + // info[] : initializes the definition objects + + // if you attempt to define rules later on for a tag not in this array + // PHP will create an stdclass $allowed_tags = array( @@ -84,28 +76,23 @@ class HTMLPurifier_Definition 'q', 'sub', 'tt', 'sup', 'i', 'b', 'big', 'small', 'u', 's', 'strike', 'bdo', 'span', 'dt', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul', 'dl', 'address', 'img', 'br', 'hr', - 'pre', 'a' + 'pre', 'a', 'table', 'caption', 'thead', 'tfoot', 'tbody', + 'colgroup', 'col', 'td', 'th', 'tr' ); foreach ($allowed_tags as $tag) { $this->info[$tag] = new HTMLPurifier_ElementDef(); } + ////////////////////////////////////////////////////////////////////// + // info[]->child : defines allowed children for elements + // entities: prefixed with e_ and _ replaces . + // we don't use an array because that complicates interpolation // strings are used instead of arrays because if you use arrays, // you have to do some hideous manipulation with array_merge() - // these are condensed, remember, with bad stuff taken out - - // transforms: font, menu, dir, center - - // DON'T MONKEY AROUND THIS unless you know what you are doing - // and also know the assumptions the code makes about what this - // contains for optimization purposes (see fixNesting) - - // child info - $e_special_extra = 'img'; $e_special_basic = 'br | span | bdo'; $e_special = "$e_special_basic | $e_special_extra"; @@ -140,13 +127,13 @@ class HTMLPurifier_Definition $this->info['ins']->child = $this->info['del']->child = - $this->info['blockquote']->child = + $this->info['blockquote']->child= $this->info['dd']->child = $this->info['li']->child = $this->info['div']->child = $e_Flow; $this->info['em']->child = - $this->info['strong']->child = + $this->info['strong']->child = $this->info['dfn']->child = $this->info['code']->child = $this->info['samp']->child = @@ -154,7 +141,7 @@ class HTMLPurifier_Definition $this->info['var']->child = $this->info['cite']->child = $this->info['abbr']->child = - $this->info['acronym']->child = + $this->info['acronym']->child = $this->info['q']->child = $this->info['sub']->child = $this->info['tt']->child = @@ -162,10 +149,10 @@ class HTMLPurifier_Definition $this->info['i']->child = $this->info['b']->child = $this->info['big']->child = - $this->info['small']->child = + $this->info['small']->child= $this->info['u']->child = $this->info['s']->child = - $this->info['strike']->child = + $this->info['strike']->child = $this->info['bdo']->child = $this->info['span']->child = $this->info['dt']->child = @@ -177,10 +164,12 @@ class HTMLPurifier_Definition $this->info['h5']->child = $this->info['h6']->child = $e_Inline; + // the only three required definitions, besides custom table code $this->info['ol']->child = $this->info['ul']->child = new HTMLPurifier_ChildDef_Required('li'); $this->info['dl']->child = new HTMLPurifier_ChildDef_Required('dt|dd'); + $this->info['address']->child = new HTMLPurifier_ChildDef_Optional("#PCDATA | p | $e_inline". " | $e_misc_inline"); @@ -193,7 +182,23 @@ class HTMLPurifier_Definition $this->info['a']->child = $e_a_content; - // attribute info + $this->info['table']->child = new HTMLPurifier_ChildDef( + '(caption?, (col*|colgroup*), thead?, tfoot?, (tbody+|tr+))'); + + // not a real entity, watch the double underscore + $e__row = new HTMLPurifier_ChildDef_Required('tr'); + $this->info['thead']->child = $e__row; + $this->info['tfoot']->child = $e__row; + $this->info['tbody']->child = $e__row; + $this->info['colgroup']->child = new HTMLPurifier_ChildDef_Optional('col'); + $this->info['col']->child = new HTMLPurifier_ChildDef_Empty(); + $this->info['tr']->child = new HTMLPurifier_ChildDef_Required('th | td'); + $this->info['th']->child = $e_Flow; + $this->info['td']->child = $e_Flow; + + ////////////////////////////////////////////////////////////////////// + // info[]->attr : defines allowed attributes for elements + // this doesn't include REQUIRED declarations, those are handled // by the transform classes @@ -205,6 +210,39 @@ class HTMLPurifier_Definition 'dir' => new HTMLPurifier_AttrDef_Enum(array('ltr','rtl'), false), ); + ////////////////////////////////////////////////////////////////////// + // UNIMP : info_tag_transform : transformations of tags + + // font -> span / attributes: size color face + // css: font-size color font-family + // menu -> ul + // dir -> ul + // center -> div / css: text-align: center; + + ////////////////////////////////////////////////////////////////////// + // info[]->auto_close : tags that automatically close another + + // these are all block elements: blocks aren't allowed in P + $this->info['p']->auto_close = array_flip(array( + 'address', 'blockquote', 'dd', 'dir', 'div', 'dl', 'dt', + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'ol', 'p', 'pre', + 'table', 'ul' + )); + + $this->info['li']->auto_close = array('li' => true); + + // we need TABLE and heading mismatch code + // we may need to make this more flexible for heading mismatch, + // or we can just create another info + + ////////////////////////////////////////////////////////////////////// + // UNIMP : info[]->attr_transform : attribute transformations in elements + + ////////////////////////////////////////////////////////////////////// + // UNIMP : info_attr_transform : global attribute transform (for xml:lang) + + // this might have bad implications for performance + } } @@ -212,8 +250,9 @@ class HTMLPurifier_Definition class HTMLPurifier_ElementDef { - var $child; var $attr = array(); + var $auto_close = array(); + var $child; } diff --git a/library/HTMLPurifier/Strategy/MakeWellFormed.php b/library/HTMLPurifier/Strategy/MakeWellFormed.php index 6f982df0..4192cbbf 100644 --- a/library/HTMLPurifier/Strategy/MakeWellFormed.php +++ b/library/HTMLPurifier/Strategy/MakeWellFormed.php @@ -59,35 +59,19 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy // if there's a parent, check for special case if (!empty($current_nesting)) { - $current_parent = array_pop($current_nesting); - // this ought to be moved to definition + $parent = array_pop($current_nesting); + $parent_name = $parent->name; + $parent_info = $this->definition->info[$parent_name]; - // check if we're closing a P tag - if ($current_parent->name == 'p' && - isset($this->definition->info_closes_p[$token->name]) - ) { - $result[] = new HTMLPurifier_Token_End('p'); + if (isset($parent_info->auto_close[$token->name])) { + $result[] = new HTMLPurifier_Token_End($parent_name); $result[] = $token; $current_nesting[] = $token; continue; } - // check if we're closing a LI tag - if ($current_parent->name == 'li' && - $token->name == 'li' - ) { - $result[] = new HTMLPurifier_Token_End('li'); - $result[] = $token; - $current_nesting[] = $token; - continue; - } - - // this is more TIDY stuff - // we should also get some TABLE related code - // mismatched h# - - $current_nesting[] = $current_parent; // undo the pop + $current_nesting[] = $parent; // undo the pop } $result[] = $token; diff --git a/tests/HTMLPurifier/Strategy/FixNestingTest.php b/tests/HTMLPurifier/Strategy/FixNestingTest.php index 2b2d95b7..0a404298 100644 --- a/tests/HTMLPurifier/Strategy/FixNestingTest.php +++ b/tests/HTMLPurifier/Strategy/FixNestingTest.php @@ -37,6 +37,10 @@ class HTMLPurifier_Strategy_FixNestingTest $inputs[4] = ''; $expect[4] = ''; + // test custom table definition + $inputs[5] = '
Cell 1
'; + $expect[5] = '
Cell 1
'; + $this->assertStrategyWorks($strategy, $inputs, $expect); }