diff --git a/docs/spec.txt b/docs/spec.txt index 946aa61f..2b4684b2 100644 --- a/docs/spec.txt +++ b/docs/spec.txt @@ -1,9 +1,7 @@ -HTML Purifier Specification +HTML Purifier by Edward Z. Yang -== Introduction == - There are a number of ad hoc HTML filtering solutions out there on the web (some examples including HTML_Safe, kses and SafeHtmlChecker.class.php) that claim to filter HTML properly, preventing malicious JavaScript and layout @@ -56,29 +54,6 @@ HTML tags. Things like blog comments are, in all likelihood, most appropriately written in an extremely restrictive set of markup that doesn't require all this functionality (or not written in HTML at all). - - -== STAGE 1 - parsing == - - Status: A (see source, mainly internals and UTF-8) - -The Lexer (currently we have three choices) handles parsing into Tokens. - -Here are the mappings for Lexer_PEARSax3 - -* Start(name, attributes) is openHandler -* End(name) is closeHandler -* Empty(name, attributes) is openHandler (is in array of empties) -* Data(parse(text)) is dataHandler -* Comment(text) is escapeHandler (has leading -) -* Data(text) is escapeHandler (has leading [, CDATA) - -Ignorable/not being implemented (although we probably want to output them raw): -* ProcessingInstructions(text) is piHandler -* JavaOrASPInstructions(text) is jaspHandler - - - == STAGE 2 - remove foreign elements == Status: A- (transformations need to be implemented) diff --git a/library/HTMLPurifier/Definition.php b/library/HTMLPurifier/Definition.php index 6012bec3..2ece6ac1 100644 --- a/library/HTMLPurifier/Definition.php +++ b/library/HTMLPurifier/Definition.php @@ -34,6 +34,7 @@ class HTMLPurifier_Definition 'table' => true, 'ul' => true ); + var $info_global_attr = array(); function instance() { static $instance = null; @@ -49,6 +50,20 @@ class HTMLPurifier_Definition function setup() { // emulates the structure of the DTD + $allowed_tags = + array( + 'ins', 'del', 'blockquote', 'dd', 'li', 'div', 'em', 'strong', + 'dfn', 'code', 'samp', 'kbd', 'var', 'cite', 'abbr', 'acronym', + 'q', 'sub', 'tt', 'sup', 'i', 'b', 'big', 'small', 'u', 's', + 'strike', 'bdo', 'span', 'dt', 'p', 'h1', 'h2', 'h3', 'h4', + 'h5', 'h6', 'ol', 'ul', 'dl', 'address', 'img', 'br', 'hr', + 'pre', 'a' + ); + + foreach ($allowed_tags as $tag) { + $this->info[$tag] = new HTMLPurifier_ElementDef(); + } + // entities: prefixed with e_ and _ replaces . // we don't use an array because that complicates interpolation // strings are used instead of arrays because if you use arrays, @@ -96,73 +111,67 @@ class HTMLPurifier_Definition $e_form_content = new HTMLPurifier_ChildDef_Optional(''); //unused $e_form_button_content = new HTMLPurifier_ChildDef_Optional(''); // unused - $this->info['child'] = array(); + $this->info['ins']->child = + $this->info['del']->child = + $this->info['blockquote']->child = + $this->info['dd']->child = + $this->info['li']->child = + $this->info['div']->child = $e_Flow; - $this->info['child']['ins'] = - $this->info['child']['del'] = - $this->info['child']['blockquote'] = - $this->info['child']['dd'] = - $this->info['child']['li'] = - $this->info['child']['div'] = $e_Flow; + $this->info['em']->child = + $this->info['strong']->child = + $this->info['dfn']->child = + $this->info['code']->child = + $this->info['samp']->child = + $this->info['kbd']->child = + $this->info['var']->child = + $this->info['cite']->child = + $this->info['abbr']->child = + $this->info['acronym']->child = + $this->info['q']->child = + $this->info['sub']->child = + $this->info['tt']->child = + $this->info['sup']->child = + $this->info['i']->child = + $this->info['b']->child = + $this->info['big']->child = + $this->info['small']->child = + $this->info['u']->child = + $this->info['s']->child = + $this->info['strike']->child = + $this->info['bdo']->child = + $this->info['span']->child = + $this->info['dt']->child = + $this->info['p']->child = + $this->info['h1']->child = + $this->info['h2']->child = + $this->info['h3']->child = + $this->info['h4']->child = + $this->info['h5']->child = + $this->info['h6']->child = $e_Inline; - $this->info['child']['em'] = - $this->info['child']['strong'] = - $this->info['child']['dfn'] = - $this->info['child']['code'] = - $this->info['child']['samp'] = - $this->info['child']['kbd'] = - $this->info['child']['var'] = - $this->info['child']['code'] = - $this->info['child']['samp'] = - $this->info['child']['kbd'] = - $this->info['child']['var'] = - $this->info['child']['cite'] = - $this->info['child']['abbr'] = - $this->info['child']['acronym'] = - $this->info['child']['q'] = - $this->info['child']['sub'] = - $this->info['child']['tt'] = - $this->info['child']['sup'] = - $this->info['child']['i'] = - $this->info['child']['b'] = - $this->info['child']['big'] = - $this->info['child']['small'] = - $this->info['child']['u'] = - $this->info['child']['s'] = - $this->info['child']['strike'] = - $this->info['child']['bdo'] = - $this->info['child']['span'] = - $this->info['child']['dt'] = - $this->info['child']['p'] = - $this->info['child']['h1'] = - $this->info['child']['h2'] = - $this->info['child']['h3'] = - $this->info['child']['h4'] = - $this->info['child']['h5'] = - $this->info['child']['h6'] = $e_Inline; + $this->info['ol']->child = + $this->info['ul']->child = new HTMLPurifier_ChildDef_Required('li'); - $this->info['child']['ol'] = - $this->info['child']['ul'] = new HTMLPurifier_ChildDef_Required('li'); - - $this->info['child']['dl'] = new HTMLPurifier_ChildDef_Required('dt|dd'); - $this->info['child']['address'] = + $this->info['dl']->child = new HTMLPurifier_ChildDef_Required('dt|dd'); + $this->info['address']->child = new HTMLPurifier_ChildDef_Optional("#PCDATA | p | $e_inline". " | $e_misc_inline"); - $this->info['child']['img'] = - $this->info['child']['br'] = - $this->info['child']['hr'] = new HTMLPurifier_ChildDef_Empty(); + $this->info['img']->child = + $this->info['br']->child = + $this->info['hr']->child = new HTMLPurifier_ChildDef_Empty(); - $this->info['child']['pre'] = $e_pre_content; + $this->info['pre']->child = $e_pre_content; - $this->info['child']['a'] = $e_a_content; + $this->info['a']->child = $e_a_content; // attribute info // this doesn't include REQUIRED declarations, those are handled // by the transform classes // attrs, included in almost every single one except for a few - $this->info['attr']['*'] = array( + $this->info_global_attr = array( // core attrs 'id' => new HTMLPurifier_AttrDef_ID(), // i18n @@ -176,13 +185,8 @@ class HTMLPurifier_Definition class HTMLPurifier_ElementDef { - var $child_def; - var $attr_def = array(); - - function HTMLPurifier_ElementDef($child_def, $attr_def = array()) { - $this->child_def = $child_def; - $this->attr_def = $attr_def; - } + var $child; + var $attr = array(); } diff --git a/library/HTMLPurifier/Strategy/FixNesting.php b/library/HTMLPurifier/Strategy/FixNesting.php index ad54ed65..1bca623c 100644 --- a/library/HTMLPurifier/Strategy/FixNesting.php +++ b/library/HTMLPurifier/Strategy/FixNesting.php @@ -38,8 +38,11 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy // $i is index of start token // $j is index of end token + + // DEFINITION CALL + $child_def = $this->definition->info[$tokens[$i]->name]->child; + // have DTD child def validate children - $child_def = $this->definition->info['child'][$tokens[$i]->name]; $result = $child_def->validateChildren($child_tokens); // process result diff --git a/library/HTMLPurifier/Strategy/MakeWellFormed.php b/library/HTMLPurifier/Strategy/MakeWellFormed.php index 4e58dd49..397e6916 100644 --- a/library/HTMLPurifier/Strategy/MakeWellFormed.php +++ b/library/HTMLPurifier/Strategy/MakeWellFormed.php @@ -23,7 +23,9 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy $result[] = $token; continue; } - $info = $this->definition->info['child'][$token->name]; // assumption but valid + + // DEFINITION CALL + $info = $this->definition->info[$token->name]->child; // test if it claims to be a start tag but is empty if ($info->type == 'empty' && diff --git a/library/HTMLPurifier/Strategy/RemoveForeignElements.php b/library/HTMLPurifier/Strategy/RemoveForeignElements.php index e5755bb8..ff8eb001 100644 --- a/library/HTMLPurifier/Strategy/RemoveForeignElements.php +++ b/library/HTMLPurifier/Strategy/RemoveForeignElements.php @@ -4,6 +4,13 @@ require_once 'HTMLPurifier/Strategy.php'; require_once 'HTMLPurifier/Definition.php'; require_once 'HTMLPurifier/Generator.php'; +/** + * Removes all unrecognized tags from the list of tokens. + * + * This strategy iterates through all the tokens and removes unrecognized + * tokens. + */ + class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy { @@ -19,7 +26,8 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy $result = array(); foreach($tokens as $token) { if (!empty( $token->is_tag )) { - if (!isset($this->definition->info['child'][$token->name])) { + // DEFINITION CALL + if (!isset($this->definition->info[$token->name])) { // invalid tag, generate HTML and insert in $token = new HTMLPurifier_Token_Text( $this->generator->generateFromToken($token) diff --git a/library/HTMLPurifier/Strategy/ValidateAttributes.php b/library/HTMLPurifier/Strategy/ValidateAttributes.php index e36dcb3c..a9e21980 100644 --- a/library/HTMLPurifier/Strategy/ValidateAttributes.php +++ b/library/HTMLPurifier/Strategy/ValidateAttributes.php @@ -15,13 +15,14 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy function execute($tokens) { $accumulator = new HTMLPurifier_IDAccumulator(); - $d_defs = $this->definition->info['attr']['*']; + $d_defs = $this->definition->info_global_attr; foreach ($tokens as $key => $token) { if ($token->type !== 'start' && $token->type !== 'end') continue; - $name = $token->name; + + // DEFINITION CALL + $defs = $this->definition->info[$token->name]->attr; + $attr = $token->attributes; - $defs = isset($this->definition->info['attr'][$name]) ? - $this->definition->attr[$name] : array(); $changed = false; foreach ($attr as $attr_key => $value) { if ( isset($defs[$attr_key]) ) {