diff --git a/library/HTMLPurifier/AttrCollection.php b/library/HTMLPurifier/AttrCollection.php index 332a264c..6d92ca1f 100644 --- a/library/HTMLPurifier/AttrCollection.php +++ b/library/HTMLPurifier/AttrCollection.php @@ -10,6 +10,11 @@ require_once 'HTMLPurifier/AttrDef/Lang.php'; class HTMLPurifier_AttrCollection { + /** + * Associative array of attribute collections, indexed by name + * @note Technically, the composition of these is more complicated, + * but we bypass it using our own excludes property + */ var $info = array( 'Core' => array( 0 => array('Style'), @@ -27,18 +32,29 @@ class HTMLPurifier_AttrCollection ) ); + /** + * Sets up direct objects not registered to HTMLPurifier_AttrTypes + */ function HTMLPurifier_AttrCollection() { // setup direct objects $this->info['I18N']['xml:lang'] = $this->info['I18N']['lang'] = new HTMLPurifier_AttrDef_Lang(); } + /** + * Performs all expansions on internal data for use by other inclusions + * It also collects all attribute collection extensions from + * modules + * @param $attr_types HTMLPurifier_AttrTypes instance + * @param $modules Hash array of HTMLPurifier_HTMLModule members + */ function setup($attr_types, $modules) { $info =& $this->info; + // load extensions from the modules foreach ($modules as $module) { foreach ($module->attr_collection as $coll_i => $coll) { foreach ($coll as $attr_i => $attr) { - if ($attr_i === 0) { + if ($attr_i === 0 && isset($info[$coll_i][$attr_i])) { // merge in includes $info[$coll_i][$attr_i] = array_merge( $info[$coll_i][$attr_i], $attr); @@ -48,6 +64,7 @@ class HTMLPurifier_AttrCollection } } } + // perform internal expansions and inclusions foreach ($info as $name => $attr) { // merge attribute collections that include others $this->performInclusions($info[$name]); @@ -56,6 +73,11 @@ class HTMLPurifier_AttrCollection } } + /** + * Takes a reference to an attribute associative array and performs + * all inclusions specified by the zero index. 
+ * @param &$attr Reference to attribute array + */ function performInclusions(&$attr) { if (!isset($attr[0])) return; $merge = $attr[0]; @@ -74,6 +96,12 @@ class HTMLPurifier_AttrCollection unset($attr[0]); } + /** + * Expands all string identifiers in an attribute array by replacing + * them with the appropriate values inside HTMLPurifier_AttrTypes + * @param &$attr Reference to attribute array + * @param $attr_types HTMLPurifier_AttrTypes instance + */ function expandIdentifiers(&$attr, $attr_types) { foreach ($attr as $def_i => $def) { if ($def_i === 0) continue; diff --git a/library/HTMLPurifier/AttrTypes.php b/library/HTMLPurifier/AttrTypes.php index 59ed4e56..7b7f4ca9 100644 --- a/library/HTMLPurifier/AttrTypes.php +++ b/library/HTMLPurifier/AttrTypes.php @@ -10,7 +10,15 @@ require_once 'HTMLPurifier/AttrDef/URI.php'; */ class HTMLPurifier_AttrTypes { + /** + * Lookup array of attribute string identifiers to concrete implementations + * @public + */ var $info = array(); + + /** + * Constructs the info array + */ function HTMLPurifier_AttrTypes() { $this->info['NMTOKENS'] = new HTMLPurifier_AttrDef_Nmtokens(); $this->info['CDATA'] = new HTMLPurifier_AttrDef_Text(); @@ -19,6 +27,10 @@ class HTMLPurifier_AttrTypes $this->info['URI'] = new HTMLPurifier_AttrDef_URI(); $this->info['Pixels'] = new HTMLPurifier_AttrDef_Pixels(); $this->info['Length'] = new HTMLPurifier_AttrDef_Length(); + $this->info['MultiLength'] = new HTMLPurifier_AttrDef_MultiLength(); + // number is really a positive integer, according to XML one or + // more digits + $this->info['Number'] = new HTMLPurifier_AttrDef_Integer(false, false, true); } } diff --git a/library/HTMLPurifier/HTMLModule.php b/library/HTMLPurifier/HTMLModule.php index 0f9480b2..c5d3d2be 100644 --- a/library/HTMLPurifier/HTMLModule.php +++ b/library/HTMLPurifier/HTMLModule.php @@ -12,10 +12,59 @@ class HTMLPurifier_HTMLModule { + /** + * List of elements that the module implements. 
+ * @note This is only for convention, as a module will often loop + * through the $elements array to define HTMLPurifier_ElementDef + * in the $info array. + * @protected + */ var $elements = array(); + + /** + * Associative array of element names to element definitions. + * Some definitions may be incomplete, to be merged in later + * with the full definition. + * @public + */ var $info = array(); + + /** + * Associative array of content set names to content set additions. + * This is commonly used to, say, add an A element to the Inline + * content set. + * @public + */ var $content_sets = array(); + + /** + * Associative array of attribute collection names to attribute + * collection additions. More rarely used for adding attributes to + * the global collections. Example is the StyleAttribute module adding + * the style attribute to the Core. + * @public + */ var $attr_collection = array(); + + /** + * Boolean flag that indicates whether or not getChildDef is implemented. + * For optimization reasons: may save a call to a function. Be sure + * to set it if you do implement getChildDef(), otherwise it will have + * no effect! + * @public + */ + var $defines_child_def = false; + + /** + * Retrieves a proper HTMLPurifier_ChildDef subclass based on + * content_model and content_model_type member variables of + * the HTMLPurifier_ElementDef class. There is a similar function + * in HTMLPurifier_HTMLDefinition. 
+ * @param $def HTMLPurifier_ElementDef instance + * @return HTMLPurifier_ChildDef subclass + * @public + */ + function getChildDef($def) {return false;} } ?> \ No newline at end of file diff --git a/library/HTMLPurifier/HTMLModule/Bdo.php b/library/HTMLPurifier/HTMLModule/Bdo.php index 5cadb4c2..bb8e7ad3 100644 --- a/library/HTMLPurifier/HTMLModule/Bdo.php +++ b/library/HTMLPurifier/HTMLModule/Bdo.php @@ -1,6 +1,7 @@ info['bdo']->attr = array( 0 => array('Core'), 'dir' => $dir, // required + // The Abstract Module specification has the attribute + // inclusions wrong for bdo: bdo allows + // xml:lang too (and we'll toss in lang for good measure, + // though it is not allowed for XHTML 1.1, this will + // be managed with a global attribute transform) 'lang' => 'Lang', 'xml:lang' => 'Lang' ); $this->info['bdo']->content_model = '#PCDATA | Inline'; $this->info['bdo']->content_model_type = 'optional'; - $this->info['bdo']->content_model_type = 'optional'; - $this->info['bdo']->attr_transform_post[] = new HTMLPurifier_AttrTransform_BdoDir(); + // provides fallback behavior if dir's missing (dir is required) + $this->info['bdo']->attr_transform_post[] = + new HTMLPurifier_AttrTransform_BdoDir(); } } diff --git a/library/HTMLPurifier/HTMLModule/Edit.php b/library/HTMLPurifier/HTMLModule/Edit.php index e622baaf..7f58ed41 100644 --- a/library/HTMLPurifier/HTMLModule/Edit.php +++ b/library/HTMLPurifier/HTMLModule/Edit.php @@ -1,9 +1,11 @@ 'URI', // 'datetime' => 'Datetime' // Datetime not implemented ); - $this->info[$element]->content_model = '#PCDATA | Inline ! #PCDATA | Flow'; + // Inline context ! Block context (exclamation mark is + // separator, see getChildDef for parsing) + $this->info[$element]->content_model = + '#PCDATA | Inline ! 
#PCDATA | Flow'; + // HTML 4.01 specifies that ins/del must not contain block + // elements when used in an inline context, chameleon is + // a complicated workaround to achieve this effect $this->info[$element]->content_model_type = 'chameleon'; } } + var $defines_child_def = true; + function getChildDef($def) { + if ($def->content_model_type != 'chameleon') return false; + $value = explode('!', $def->content_model); + return new HTMLPurifier_ChildDef_Chameleon($value[0], $value[1]); + } + } ?> \ No newline at end of file diff --git a/library/HTMLPurifier/HTMLModule/Image.php b/library/HTMLPurifier/HTMLModule/Image.php index ff4fd43f..e758a052 100644 --- a/library/HTMLPurifier/HTMLModule/Image.php +++ b/library/HTMLPurifier/HTMLModule/Image.php @@ -2,8 +2,13 @@ require_once 'HTMLPurifier/HTMLModule.php'; +require_once 'HTMLPurifier/AttrDef/URI.php'; +require_once 'HTMLPurifier/AttrTransform/ImgRequired.php'; + /** * XHTML 1.1 Image Module provides basic image embedding. + * @note There is specialized code for removing empty images in + * HTMLPurifier_Strategy_RemoveForeignElements */ class HTMLPurifier_HTMLModule_Image extends HTMLPurifier_HTMLModule { diff --git a/library/HTMLPurifier/HTMLModule/List.php b/library/HTMLPurifier/HTMLModule/List.php index dcab1f36..b70fdc75 100644 --- a/library/HTMLPurifier/HTMLModule/List.php +++ b/library/HTMLPurifier/HTMLModule/List.php @@ -10,11 +10,14 @@ class HTMLPurifier_HTMLModule_List extends HTMLPurifier_HTMLModule var $elements = array('dl', 'dt', 'dd', 'ol', 'ul', 'li'); var $info = array(); - // technically speaking, the List content set is a fully formed + // According to the abstract schema, the List content set is a fully formed // one or more expr, but it invariably occurs in an optional declaration // so we're not going to do that subtlety. It might cause trouble // if a user defines "List" and expects that multiple lists are // allowed to be specified, but then again, that's not very intuitive. 
+ // Furthermore, the actual XML Schema may disagree. Regardless, + // we don't have support for such nested expressions without using + // the incredibly inefficient and draconic Custom ChildDef. var $content_sets = array('List' => 'dl | ol | ul', 'Flow' => 'List'); function HTMLPurifier_HTMLModule_List() { @@ -33,6 +36,7 @@ class HTMLPurifier_HTMLModule_List extends HTMLPurifier_HTMLModule $this->info['dt']->content_model_type = 'optional'; $this->info['dl']->content_model = 'dt | dd'; $this->info['dl']->content_model_type = 'required'; + // this could be a LOT more robust $this->info['li']->auto_close = array('li' => true); } diff --git a/library/HTMLPurifier/HTMLModule/Presentation.php b/library/HTMLPurifier/HTMLModule/Presentation.php index 1b05b90b..45f41e57 100644 --- a/library/HTMLPurifier/HTMLModule/Presentation.php +++ b/library/HTMLPurifier/HTMLModule/Presentation.php @@ -3,7 +3,14 @@ require_once 'HTMLPurifier/HTMLModule.php'; /** - * XHTML 1.1 Presentation Module, defines hypertext links. Text Extension Module. + * XHTML 1.1 Presentation Module, defines simple presentation-related + * markup. Text Extension Module. + * @note The official XML Schema and DTD specs further divide this into + * two modules: + * - Block Presentation (hr) + * - Inline Presentation (b, big, i, small, sub, sup, tt) + * We have chosen not to heed this distinction, as content_sets + * provides satisfactory disambiguation. */ class HTMLPurifier_HTMLModule_Presentation extends HTMLPurifier_HTMLModule { diff --git a/library/HTMLPurifier/HTMLModule/StyleAttribute.php b/library/HTMLPurifier/HTMLModule/StyleAttribute.php index c42a43c8..8adae5b8 100644 --- a/library/HTMLPurifier/HTMLModule/StyleAttribute.php +++ b/library/HTMLPurifier/HTMLModule/StyleAttribute.php @@ -1,14 +1,18 @@ array('style' => false), + // The inclusion routine differs from the Abstract Modules but + // is in line with the DTD and XML Schemas. 
+ 'Style' => array('style' => false), // see constructor 'Core' => array(0 => array('Style')) ); diff --git a/library/HTMLPurifier/HTMLModule/Tables.php b/library/HTMLPurifier/HTMLModule/Tables.php index 10263a2a..2865b4f4 100644 --- a/library/HTMLPurifier/HTMLModule/Tables.php +++ b/library/HTMLPurifier/HTMLModule/Tables.php @@ -1,6 +1,7 @@ info['caption']->content_model = '#PCDATA | Inline'; $this->info['caption']->content_model_type = 'optional'; - $this->info['table']->content_model = 'caption?, ( col* | colgroup* ), (( thead?, tfoot?, tbody+ ) | ( tr+ ))'; + // Is done directly because it doesn't leverage substitution + // mechanisms. True model is: + // 'caption?, ( col* | colgroup* ), (( thead?, tfoot?, tbody+ ) | ( tr+ ))' + $this->info['table']->content_model = new HTMLPurifier_ChildDef_Table(); $this->info['table']->content_model_type = 'table'; $this->info['td']->content_model = diff --git a/library/HTMLPurifier/HTMLModule/Text.php b/library/HTMLPurifier/HTMLModule/Text.php index 1aacb273..68900826 100644 --- a/library/HTMLPurifier/HTMLModule/Text.php +++ b/library/HTMLPurifier/HTMLModule/Text.php @@ -54,11 +54,21 @@ class HTMLPurifier_HTMLModule_Text extends HTMLPurifier_HTMLModule $this->info[$element]->content_model_type = 'optional'; } } + // SGML permits exclusions for all descendants, but this is + // not possible with DTDs or XML Schemas. W3C has elected to + // use complicated compositions of content_models to simulate + // exclusion for children, but we go the simpler, SGML-style + // route of flat-out exclusions. Note that the Abstract Module + // is blithely unaware of such distinctions. 
+ $this->info['pre']->excludes = array_flip(array( + 'img', 'big', 'small', + 'object', 'applet', 'font', 'basefont' // generally not allowed + )); $this->info['p']->auto_close = array_flip(array( - 'address', 'blockquote', 'dd', 'dir', 'div', 'dl', 'dt', - 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'ol', 'p', 'pre', - 'table', 'ul' - )); + 'address', 'blockquote', 'dd', 'dir', 'div', 'dl', 'dt', + 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'ol', 'p', 'pre', + 'table', 'ul' + )); } } diff --git a/library/HTMLPurifier/Printer/HTMLDefinition.php b/library/HTMLPurifier/Printer/HTMLDefinition.php index fb1a357f..cb327cb6 100644 --- a/library/HTMLPurifier/Printer/HTMLDefinition.php +++ b/library/HTMLPurifier/Printer/HTMLDefinition.php @@ -173,6 +173,7 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer * @param $array Tag lookup array in form of array('tagname' => true) */ function listifyTagLookup($array) { + ksort($array); $list = array(); foreach ($array as $name => $discard) { if ($name !== '#PCDATA' && !isset($this->def->info[$name])) continue; @@ -187,6 +188,7 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer * @todo Also add information about internal state */ function listifyObjectList($array) { + ksort($array); $list = array(); foreach ($array as $discard => $obj) { $list[] = $this->getClass($obj, 'AttrTransform_'); @@ -199,6 +201,7 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer * @param $array Array hash in form of array('attrname' => HTMLPurifier_AttrDef) */ function listifyAttr($array) { + ksort($array); $list = array(); foreach ($array as $name => $obj) { if ($obj === false) continue; diff --git a/library/HTMLPurifier/XHTMLDefinition.php b/library/HTMLPurifier/XHTMLDefinition.php index f8502d77..4e4d762f 100644 --- a/library/HTMLPurifier/XHTMLDefinition.php +++ b/library/HTMLPurifier/XHTMLDefinition.php @@ -5,6 +5,19 @@ require_once 'HTMLPurifier/HTMLDefinition.php'; require_once 
'HTMLPurifier/AttrTypes.php'; require_once 'HTMLPurifier/AttrCollection.php'; +// we'll manage loading extremely commonly used attr definitions +require_once 'HTMLPurifier/AttrDef.php'; +require_once 'HTMLPurifier/AttrDef/Enum.php'; + +// technically speaking, these includes would be more appropriate for +// other modules, but we're going to include all the common ones. A +// custom one would have to be fed in as an actual object +require_once 'HTMLPurifier/ChildDef.php'; +require_once 'HTMLPurifier/ChildDef/Empty.php'; +require_once 'HTMLPurifier/ChildDef/Required.php'; +require_once 'HTMLPurifier/ChildDef/Optional.php'; +require_once 'HTMLPurifier/ChildDef/StrictBlockquote.php'; + require_once 'HTMLPurifier/HTMLModule.php'; require_once 'HTMLPurifier/HTMLModule/Text.php'; require_once 'HTMLPurifier/HTMLModule/Hypertext.php'; @@ -22,11 +35,35 @@ require_once 'HTMLPurifier/HTMLModule/StyleAttribute.php'; class HTMLPurifier_XHTMLDefinition extends HTMLPurifier_HTMLDefinition { + /** + * Array of HTMLPurifier_HTMLModule instances, indexed by module name + * @public + */ var $modules = array(); + + /** + * Instance of HTMLPurifier_AttrTypes + * @public + */ var $attr_types; + + /** + * Instance of HTMLPurifier_AttrCollection + * @public + */ var $attr_collection; + + /** + * Nested lookup array of content set name (Block, Inline) to + * element name to whether or not it belongs in that content set. + * @public + */ var $content_sets; + /** + * Performs low-cost, preliminary initialization. + * @param $config Instance of HTMLPurifier_Config + */ function HTMLPurifier_XHTMLDefinition($config) { $this->modules['Text'] = new HTMLPurifier_HTMLModule_Text(); @@ -44,6 +81,12 @@ class HTMLPurifier_XHTMLDefinition extends HTMLPurifier_HTMLDefinition } + /** + * Processes internals into form usable by HTMLPurifier internals. + * Modifying the definition after calling this function should not + * be done. 
+ * @param $config Instance of HTMLPurifier_Config + */ function setup($config) { // perform attribute collection substitutions @@ -93,7 +136,10 @@ class HTMLPurifier_XHTMLDefinition extends HTMLPurifier_HTMLDefinition $content_model = $def->content_model; if (is_string($content_model)) { if (strpos($content_model, 'Inline') !== false) { - $def->descendants_are_inline = true; + if ($name != 'del' && $name != 'ins') { + // this is for you, ins/del + $def->descendants_are_inline = true; + } } $def->content_model = str_replace( $content_sets_keys, $content_sets_values, $content_model); @@ -116,10 +162,18 @@ class HTMLPurifier_XHTMLDefinition extends HTMLPurifier_HTMLDefinition } + /** + * Sets up attribute transformations + * @param $config Instance of HTMLPurifier_Config + */ function setupAttrTransform($config) { $this->info_attr_transform_post[] = new HTMLPurifier_AttrTransform_Lang(); } + /** + * Sets up block wrapper based on config + * @param $config Instance of HTMLPurifier_Config + */ function setupBlockWrapper($config) { $block_wrapper = $config->get('HTML', 'BlockWrapper'); if (isset($this->content_sets['Block'][$block_wrapper])) { @@ -130,6 +184,10 @@ class HTMLPurifier_XHTMLDefinition extends HTMLPurifier_HTMLDefinition } } + /** + * Sets up parent of fragment based on config + * @param $config Instance of HTMLPurifier_Config + */ function setupParent($config) { $parent = $config->get('HTML', 'Parent'); if (isset($this->info[$parent])) { @@ -141,10 +199,18 @@ class HTMLPurifier_XHTMLDefinition extends HTMLPurifier_HTMLDefinition $this->info_parent_def = $this->info[$this->info_parent]; } + /** + * Instantiates a ChildDef based on content_model and content_model_type + * member variables in HTMLPurifier_ElementDef + * @note This will also defer to modules for custom HTMLPurifier_ChildDef + * subclasses that need content set expansion + * @param $def HTMLPurifier_ElementDef to have ChildDef extracted + * @return HTMLPurifier_ChildDef corresponding to 
ElementDef + */ function getChildDef($def) { $value = $def->content_model; - $type = $def->content_model_type; - switch ($type) { + if (is_object($value)) return $value; // direct object, return + switch ($def->content_model_type) { case 'required': return new HTMLPurifier_ChildDef_Required($value); case 'optional': @@ -153,18 +219,29 @@ class HTMLPurifier_XHTMLDefinition extends HTMLPurifier_HTMLDefinition return new HTMLPurifier_ChildDef_Empty(); case 'strictblockquote': return new HTMLPurifier_ChildDef_StrictBlockquote($value); - case 'table': - return new HTMLPurifier_ChildDef_Table(); - case 'chameleon': - $value = explode('!', $value); - return new HTMLPurifier_ChildDef_Chameleon($value[0], $value[1]); case 'custom': return new HTMLPurifier_ChildDef_Custom($value); } - if ($value) return new HTMLPurifier_ChildDef_Optional($value); - return new HTMLPurifier_ChildDef_Empty(); + // defer to modules, see if they know what child_def to use + foreach ($this->modules as $module) { + if (!$module->defines_child_def) continue; // save a func call + $return = $module->getChildDef($def); + if ($return !== false) return $return; + } + // error-out + trigger_error( + 'Could not determine which ChildDef class to instantiate', + E_USER_ERROR + ); + return false; } + /** + * Converts a string list of elements separated by pipes into + * a lookup array. + * @param $string List of elements + * @return Lookup array of elements + */ function convertToLookup($string) { $array = explode('|', str_replace(' ', '', $string)); $ret = array();