diff --git a/library/HTMLPurifier/AttrCollection.php b/library/HTMLPurifier/AttrCollection.php
index 332a264c..6d92ca1f 100644
--- a/library/HTMLPurifier/AttrCollection.php
+++ b/library/HTMLPurifier/AttrCollection.php
@@ -10,6 +10,11 @@ require_once 'HTMLPurifier/AttrDef/Lang.php';
class HTMLPurifier_AttrCollection
{
+ /**
+ * Associative array of attribute collections, indexed by name
+ * @note Technically, the composition of these is more complicated,
+ * but we bypass it using our own excludes property
+ */
var $info = array(
'Core' => array(
0 => array('Style'),
@@ -27,18 +32,29 @@ class HTMLPurifier_AttrCollection
)
);
+ /**
+ * Sets up direct objects not registered to HTMLPurifier_AttrTypes
+ */
function HTMLPurifier_AttrCollection() {
// setup direct objects
$this->info['I18N']['xml:lang'] =
$this->info['I18N']['lang'] = new HTMLPurifier_AttrDef_Lang();
}
+ /**
+ * Performs all expansions on internal data for use by other inclusions
+ * It also collects all attribute collection extensions from
+ * modules
+ * @param $attr_types HTMLPurifier_AttrTypes instance
+ * @param $modules Hash array of HTMLPurifier_HTMLModule members
+ */
function setup($attr_types, $modules) {
$info =& $this->info;
+ // load extensions from the modules
foreach ($modules as $module) {
foreach ($module->attr_collection as $coll_i => $coll) {
foreach ($coll as $attr_i => $attr) {
- if ($attr_i === 0) {
+ if ($attr_i === 0 && isset($info[$coll_i][$attr_i])) {
// merge in includes
$info[$coll_i][$attr_i] = array_merge(
$info[$coll_i][$attr_i], $attr);
@@ -48,6 +64,7 @@ class HTMLPurifier_AttrCollection
}
}
}
+ // perform internal expansions and inclusions
foreach ($info as $name => $attr) {
// merge attribute collections that include others
$this->performInclusions($info[$name]);
@@ -56,6 +73,11 @@ class HTMLPurifier_AttrCollection
}
}
+ /**
+ * Takes a reference to an attribute associative array and performs
+ * all inclusions specified by the zero index.
+ * @param &$attr Reference to attribute array
+ */
function performInclusions(&$attr) {
if (!isset($attr[0])) return;
$merge = $attr[0];
@@ -74,6 +96,12 @@ class HTMLPurifier_AttrCollection
unset($attr[0]);
}
+ /**
+ * Expands all string identifiers in an attribute array by replacing
+ * them with the appropriate values inside HTMLPurifier_AttrTypes
+ * @param &$attr Reference to attribute array
+ * @param $attr_types HTMLPurifier_AttrTypes instance
+ */
function expandIdentifiers(&$attr, $attr_types) {
foreach ($attr as $def_i => $def) {
if ($def_i === 0) continue;
diff --git a/library/HTMLPurifier/AttrTypes.php b/library/HTMLPurifier/AttrTypes.php
index 59ed4e56..7b7f4ca9 100644
--- a/library/HTMLPurifier/AttrTypes.php
+++ b/library/HTMLPurifier/AttrTypes.php
@@ -10,7 +10,15 @@ require_once 'HTMLPurifier/AttrDef/URI.php';
*/
class HTMLPurifier_AttrTypes
{
+ /**
+ * Lookup array of attribute string identifiers to concrete implementations
+ * @public
+ */
var $info = array();
+
+ /**
+ * Constructs the info array
+ */
function HTMLPurifier_AttrTypes() {
$this->info['NMTOKENS'] = new HTMLPurifier_AttrDef_Nmtokens();
$this->info['CDATA'] = new HTMLPurifier_AttrDef_Text();
@@ -19,6 +27,10 @@ class HTMLPurifier_AttrTypes
$this->info['URI'] = new HTMLPurifier_AttrDef_URI();
$this->info['Pixels'] = new HTMLPurifier_AttrDef_Pixels();
$this->info['Length'] = new HTMLPurifier_AttrDef_Length();
+ $this->info['MultiLength'] = new HTMLPurifier_AttrDef_MultiLength();
+ // number is really a positive integer; according to XML, one or
+ // more digits
+ $this->info['Number'] = new HTMLPurifier_AttrDef_Integer(false, false, true);
}
}
diff --git a/library/HTMLPurifier/HTMLModule.php b/library/HTMLPurifier/HTMLModule.php
index 0f9480b2..c5d3d2be 100644
--- a/library/HTMLPurifier/HTMLModule.php
+++ b/library/HTMLPurifier/HTMLModule.php
@@ -12,10 +12,59 @@
class HTMLPurifier_HTMLModule
{
+ /**
+ * List of elements that the module implements.
+ * @note This is only for convention, as a module will often loop
+ * through the $elements array to define HTMLPurifier_ElementDef
+ * in the $info array.
+ * @protected
+ */
var $elements = array();
+
+ /**
+ * Associative array of element names to element definitions.
+ * Some definitions may be incomplete, to be merged in later
+ * with the full definition.
+ * @public
+ */
var $info = array();
+
+ /**
+ * Associative array of content set names to content set additions.
+ * This is commonly used to, say, add an A element to the Inline
+ * content set.
+ * @public
+ */
var $content_sets = array();
+
+ /**
+ * Associative array of attribute collection names to attribute
+ * collection additions. More rarely used for adding attributes to
+ * the global collections. Example is the StyleAttribute module adding
+ * the style attribute to the Core.
+ * @public
+ */
var $attr_collection = array();
+
+ /**
+ * Boolean flag that indicates whether or not getChildDef is implemented.
+ * For optimization reasons: may save a call to a function. Be sure
+ * to set it if you do implement getChildDef(), otherwise it will have
+ * no effect!
+ * @public
+ */
+ var $defines_child_def = false;
+
+ /**
+ * Retrieves a proper HTMLPurifier_ChildDef subclass based on
+ * content_model and content_model_type member variables of
+ * the HTMLPurifier_ElementDef class. There is a similar function
+ * in HTMLPurifier_HTMLDefinition.
+ * @param $def HTMLPurifier_ElementDef instance
+ * @return HTMLPurifier_ChildDef subclass
+ * @public
+ */
+ function getChildDef($def) {return false;}
}
?>
\ No newline at end of file
diff --git a/library/HTMLPurifier/HTMLModule/Bdo.php b/library/HTMLPurifier/HTMLModule/Bdo.php
index 5cadb4c2..bb8e7ad3 100644
--- a/library/HTMLPurifier/HTMLModule/Bdo.php
+++ b/library/HTMLPurifier/HTMLModule/Bdo.php
@@ -1,6 +1,7 @@
info['bdo']->attr = array(
0 => array('Core'),
'dir' => $dir, // required
+ // The Abstract Module specification has the attribute
+ // inclusions wrong for bdo: bdo allows
+ // xml:lang too (and we'll toss in lang for good measure,
+ // though it is not allowed for XHTML 1.1, this will
+ // be managed with a global attribute transform)
'lang' => 'Lang',
'xml:lang' => 'Lang'
);
$this->info['bdo']->content_model = '#PCDATA | Inline';
$this->info['bdo']->content_model_type = 'optional';
- $this->info['bdo']->content_model_type = 'optional';
- $this->info['bdo']->attr_transform_post[] = new HTMLPurifier_AttrTransform_BdoDir();
+ // provides fallback behavior if dir's missing (dir is required)
+ $this->info['bdo']->attr_transform_post[] =
+ new HTMLPurifier_AttrTransform_BdoDir();
}
}
diff --git a/library/HTMLPurifier/HTMLModule/Edit.php b/library/HTMLPurifier/HTMLModule/Edit.php
index e622baaf..7f58ed41 100644
--- a/library/HTMLPurifier/HTMLModule/Edit.php
+++ b/library/HTMLPurifier/HTMLModule/Edit.php
@@ -1,9 +1,11 @@
'URI',
// 'datetime' => 'Datetime' // Datetime not implemented
);
- $this->info[$element]->content_model = '#PCDATA | Inline ! #PCDATA | Flow';
+ // Inline context ! Block context (exclamation mark is
+ // separator, see getChildDef for parsing)
+ $this->info[$element]->content_model =
+ '#PCDATA | Inline ! #PCDATA | Flow';
+ // HTML 4.01 specifies that ins/del must not contain block
+ // elements when used in an inline context, chameleon is
+ // a complicated workaround to achieve this effect
$this->info[$element]->content_model_type = 'chameleon';
}
}
+ var $defines_child_def = true;
+ function getChildDef($def) {
+ if ($def->content_model_type != 'chameleon') return false;
+ $value = explode('!', $def->content_model);
+ return new HTMLPurifier_ChildDef_Chameleon($value[0], $value[1]);
+ }
+
}
?>
\ No newline at end of file
diff --git a/library/HTMLPurifier/HTMLModule/Image.php b/library/HTMLPurifier/HTMLModule/Image.php
index ff4fd43f..e758a052 100644
--- a/library/HTMLPurifier/HTMLModule/Image.php
+++ b/library/HTMLPurifier/HTMLModule/Image.php
@@ -2,8 +2,13 @@
require_once 'HTMLPurifier/HTMLModule.php';
+require_once 'HTMLPurifier/AttrDef/URI.php';
+require_once 'HTMLPurifier/AttrTransform/ImgRequired.php';
+
/**
* XHTML 1.1 Image Module provides basic image embedding.
+ * @note There is specialized code for removing empty images in
+ * HTMLPurifier_Strategy_RemoveForeignElements
*/
class HTMLPurifier_HTMLModule_Image extends HTMLPurifier_HTMLModule
{
diff --git a/library/HTMLPurifier/HTMLModule/List.php b/library/HTMLPurifier/HTMLModule/List.php
index dcab1f36..b70fdc75 100644
--- a/library/HTMLPurifier/HTMLModule/List.php
+++ b/library/HTMLPurifier/HTMLModule/List.php
@@ -10,11 +10,14 @@ class HTMLPurifier_HTMLModule_List extends HTMLPurifier_HTMLModule
var $elements = array('dl', 'dt', 'dd', 'ol', 'ul', 'li');
var $info = array();
- // technically speaking, the List content set is a fully formed
+ // According to the abstract schema, the List content set is a fully formed
// one or more expr, but it invariably occurs in an optional declaration
// so we're not going to do that subtlety. It might cause trouble
// if a user defines "List" and expects that multiple lists are
// allowed to be specified, but then again, that's not very intuitive.
+ // Furthermore, the actual XML Schema may disagree. Regardless,
+ // we don't have support for such nested expressions without using
+ // the incredibly inefficient and draconian Custom ChildDef.
var $content_sets = array('List' => 'dl | ol | ul', 'Flow' => 'List');
function HTMLPurifier_HTMLModule_List() {
@@ -33,6 +36,7 @@ class HTMLPurifier_HTMLModule_List extends HTMLPurifier_HTMLModule
$this->info['dt']->content_model_type = 'optional';
$this->info['dl']->content_model = 'dt | dd';
$this->info['dl']->content_model_type = 'required';
+ // this could be a LOT more robust
$this->info['li']->auto_close = array('li' => true);
}
diff --git a/library/HTMLPurifier/HTMLModule/Presentation.php b/library/HTMLPurifier/HTMLModule/Presentation.php
index 1b05b90b..45f41e57 100644
--- a/library/HTMLPurifier/HTMLModule/Presentation.php
+++ b/library/HTMLPurifier/HTMLModule/Presentation.php
@@ -3,7 +3,14 @@
require_once 'HTMLPurifier/HTMLModule.php';
/**
- * XHTML 1.1 Presentation Module, defines hypertext links. Text Extension Module.
+ * XHTML 1.1 Presentation Module, defines simple presentation-related
+ * markup. Text Extension Module.
+ * @note The official XML Schema and DTD specs further divide this into
+ * two modules:
+ * - Block Presentation (hr)
+ * - Inline Presentation (b, big, i, small, sub, sup, tt)
+ * We have chosen not to heed this distinction, as content_sets
+ * provides satisfactory disambiguation.
*/
class HTMLPurifier_HTMLModule_Presentation extends HTMLPurifier_HTMLModule
{
diff --git a/library/HTMLPurifier/HTMLModule/StyleAttribute.php b/library/HTMLPurifier/HTMLModule/StyleAttribute.php
index c42a43c8..8adae5b8 100644
--- a/library/HTMLPurifier/HTMLModule/StyleAttribute.php
+++ b/library/HTMLPurifier/HTMLModule/StyleAttribute.php
@@ -1,14 +1,18 @@
array('style' => false),
+ // The inclusion routine differs from the Abstract Modules but
+ // is in line with the DTD and XML Schemas.
+ 'Style' => array('style' => false), // see constructor
'Core' => array(0 => array('Style'))
);
diff --git a/library/HTMLPurifier/HTMLModule/Tables.php b/library/HTMLPurifier/HTMLModule/Tables.php
index 10263a2a..2865b4f4 100644
--- a/library/HTMLPurifier/HTMLModule/Tables.php
+++ b/library/HTMLPurifier/HTMLModule/Tables.php
@@ -1,6 +1,7 @@
info['caption']->content_model = '#PCDATA | Inline';
$this->info['caption']->content_model_type = 'optional';
- $this->info['table']->content_model = 'caption?, ( col* | colgroup* ), (( thead?, tfoot?, tbody+ ) | ( tr+ ))';
+ // Is done directly because it doesn't leverage substitution
+ // mechanisms. True model is:
+ // 'caption?, ( col* | colgroup* ), (( thead?, tfoot?, tbody+ ) | ( tr+ ))'
+ $this->info['table']->content_model = new HTMLPurifier_ChildDef_Table();
$this->info['table']->content_model_type = 'table';
$this->info['td']->content_model =
diff --git a/library/HTMLPurifier/HTMLModule/Text.php b/library/HTMLPurifier/HTMLModule/Text.php
index 1aacb273..68900826 100644
--- a/library/HTMLPurifier/HTMLModule/Text.php
+++ b/library/HTMLPurifier/HTMLModule/Text.php
@@ -54,11 +54,21 @@ class HTMLPurifier_HTMLModule_Text extends HTMLPurifier_HTMLModule
$this->info[$element]->content_model_type = 'optional';
}
}
+ // SGML permits exclusions for all descendants, but this is
+ // not possible with DTDs or XML Schemas. W3C has elected to
+ // use complicated compositions of content_models to simulate
+ // exclusion for children, but we go the simpler, SGML-style
+ // route of flat-out exclusions. Note that the Abstract Module
+ // is blithely unaware of such distinctions.
+ $this->info['pre']->excludes = array_flip(array(
+ 'img', 'big', 'small',
+ 'object', 'applet', 'font', 'basefont' // generally not allowed
+ ));
$this->info['p']->auto_close = array_flip(array(
- 'address', 'blockquote', 'dd', 'dir', 'div', 'dl', 'dt',
- 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'ol', 'p', 'pre',
- 'table', 'ul'
- ));
+ 'address', 'blockquote', 'dd', 'dir', 'div', 'dl', 'dt',
+ 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'hr', 'ol', 'p', 'pre',
+ 'table', 'ul'
+ ));
}
}
diff --git a/library/HTMLPurifier/Printer/HTMLDefinition.php b/library/HTMLPurifier/Printer/HTMLDefinition.php
index fb1a357f..cb327cb6 100644
--- a/library/HTMLPurifier/Printer/HTMLDefinition.php
+++ b/library/HTMLPurifier/Printer/HTMLDefinition.php
@@ -173,6 +173,7 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer
* @param $array Tag lookup array in form of array('tagname' => true)
*/
function listifyTagLookup($array) {
+ ksort($array);
$list = array();
foreach ($array as $name => $discard) {
if ($name !== '#PCDATA' && !isset($this->def->info[$name])) continue;
@@ -187,6 +188,7 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer
* @todo Also add information about internal state
*/
function listifyObjectList($array) {
+ ksort($array);
$list = array();
foreach ($array as $discard => $obj) {
$list[] = $this->getClass($obj, 'AttrTransform_');
@@ -199,6 +201,7 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer
* @param $array Array hash in form of array('attrname' => HTMLPurifier_AttrDef)
*/
function listifyAttr($array) {
+ ksort($array);
$list = array();
foreach ($array as $name => $obj) {
if ($obj === false) continue;
diff --git a/library/HTMLPurifier/XHTMLDefinition.php b/library/HTMLPurifier/XHTMLDefinition.php
index f8502d77..4e4d762f 100644
--- a/library/HTMLPurifier/XHTMLDefinition.php
+++ b/library/HTMLPurifier/XHTMLDefinition.php
@@ -5,6 +5,19 @@ require_once 'HTMLPurifier/HTMLDefinition.php';
require_once 'HTMLPurifier/AttrTypes.php';
require_once 'HTMLPurifier/AttrCollection.php';
+// we'll manage loading extremely commonly used attr definitions
+require_once 'HTMLPurifier/AttrDef.php';
+require_once 'HTMLPurifier/AttrDef/Enum.php';
+
+// technically speaking, these includes would be more appropriate for
+// other modules, but we're going to include all the common ones. A
+// custom one would have to be fed in as an actual object
+require_once 'HTMLPurifier/ChildDef.php';
+require_once 'HTMLPurifier/ChildDef/Empty.php';
+require_once 'HTMLPurifier/ChildDef/Required.php';
+require_once 'HTMLPurifier/ChildDef/Optional.php';
+require_once 'HTMLPurifier/ChildDef/StrictBlockquote.php';
+
require_once 'HTMLPurifier/HTMLModule.php';
require_once 'HTMLPurifier/HTMLModule/Text.php';
require_once 'HTMLPurifier/HTMLModule/Hypertext.php';
@@ -22,11 +35,35 @@ require_once 'HTMLPurifier/HTMLModule/StyleAttribute.php';
class HTMLPurifier_XHTMLDefinition extends HTMLPurifier_HTMLDefinition
{
+ /**
+ * Array of HTMLPurifier_HTMLModule instances, indexed by module name
+ * @public
+ */
var $modules = array();
+
+ /**
+ * Instance of HTMLPurifier_AttrTypes
+ * @public
+ */
var $attr_types;
+
+ /**
+ * Instance of HTMLPurifier_AttrCollection
+ * @public
+ */
var $attr_collection;
+
+ /**
+ * Nested lookup array of content set name (Block, Inline) to
+ * element name to whether or not it belongs in that content set.
+ * @public
+ */
var $content_sets;
+ /**
+ * Performs low-cost, preliminary initialization.
+ * @param $config Instance of HTMLPurifier_Config
+ */
function HTMLPurifier_XHTMLDefinition($config) {
$this->modules['Text'] = new HTMLPurifier_HTMLModule_Text();
@@ -44,6 +81,12 @@ class HTMLPurifier_XHTMLDefinition extends HTMLPurifier_HTMLDefinition
}
+ /**
+ * Processes internals into form usable by HTMLPurifier internals.
+ * Modifying the definition after calling this function should not
+ * be done.
+ * @param $config Instance of HTMLPurifier_Config
+ */
function setup($config) {
// perform attribute collection substitutions
@@ -93,7 +136,10 @@ class HTMLPurifier_XHTMLDefinition extends HTMLPurifier_HTMLDefinition
$content_model = $def->content_model;
if (is_string($content_model)) {
if (strpos($content_model, 'Inline') !== false) {
- $def->descendants_are_inline = true;
+ if ($name != 'del' && $name != 'ins') {
+ // this is for you, ins/del
+ $def->descendants_are_inline = true;
+ }
}
$def->content_model = str_replace(
$content_sets_keys, $content_sets_values, $content_model);
@@ -116,10 +162,18 @@ class HTMLPurifier_XHTMLDefinition extends HTMLPurifier_HTMLDefinition
}
+ /**
+ * Sets up attribute transformations
+ * @param $config Instance of HTMLPurifier_Config
+ */
function setupAttrTransform($config) {
$this->info_attr_transform_post[] = new HTMLPurifier_AttrTransform_Lang();
}
+ /**
+ * Sets up block wrapper based on config
+ * @param $config Instance of HTMLPurifier_Config
+ */
function setupBlockWrapper($config) {
$block_wrapper = $config->get('HTML', 'BlockWrapper');
if (isset($this->content_sets['Block'][$block_wrapper])) {
@@ -130,6 +184,10 @@ class HTMLPurifier_XHTMLDefinition extends HTMLPurifier_HTMLDefinition
}
}
+ /**
+ * Sets up parent of fragment based on config
+ * @param $config Instance of HTMLPurifier_Config
+ */
function setupParent($config) {
$parent = $config->get('HTML', 'Parent');
if (isset($this->info[$parent])) {
@@ -141,10 +199,18 @@ class HTMLPurifier_XHTMLDefinition extends HTMLPurifier_HTMLDefinition
$this->info_parent_def = $this->info[$this->info_parent];
}
+ /**
+ * Instantiates a ChildDef based on content_model and content_model_type
+ * member variables in HTMLPurifier_ElementDef
+ * @note This will also defer to modules for custom HTMLPurifier_ChildDef
+ * subclasses that need content set expansion
+ * @param $def HTMLPurifier_ElementDef to have ChildDef extracted
+ * @return HTMLPurifier_ChildDef corresponding to ElementDef
+ */
function getChildDef($def) {
$value = $def->content_model;
- $type = $def->content_model_type;
- switch ($type) {
+ if (is_object($value)) return $value; // direct object, return
+ switch ($def->content_model_type) {
case 'required':
return new HTMLPurifier_ChildDef_Required($value);
case 'optional':
@@ -153,18 +219,29 @@ class HTMLPurifier_XHTMLDefinition extends HTMLPurifier_HTMLDefinition
return new HTMLPurifier_ChildDef_Empty();
case 'strictblockquote':
return new HTMLPurifier_ChildDef_StrictBlockquote($value);
- case 'table':
- return new HTMLPurifier_ChildDef_Table();
- case 'chameleon':
- $value = explode('!', $value);
- return new HTMLPurifier_ChildDef_Chameleon($value[0], $value[1]);
case 'custom':
return new HTMLPurifier_ChildDef_Custom($value);
}
- if ($value) return new HTMLPurifier_ChildDef_Optional($value);
- return new HTMLPurifier_ChildDef_Empty();
+ // defer to modules, see if they know what child_def to use
+ foreach ($this->modules as $module) {
+ if (!$module->defines_child_def) continue; // save a func call
+ $return = $module->getChildDef($def);
+ if ($return !== false) return $return;
+ }
+ // error-out
+ trigger_error(
+ 'Could not determine which ChildDef class to instantiate',
+ E_USER_ERROR
+ );
+ return false;
}
+ /**
+ * Converts a string list of elements separated by pipes into
+ * a lookup array.
+ * @param $string List of elements
+ * @return Lookup array of elements
+ */
function convertToLookup($string) {
$array = explode('|', str_replace(' ', '', $string));
$ret = array();