1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-07-13 18:46:34 +02:00

Release 2.0.0, merged in 1026 to HEAD.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/strict@1179 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
Edward Z. Yang
2007-06-21 00:36:12 +00:00
parent c35eb3e95f
commit 0101311193
172 changed files with 7713 additions and 2520 deletions

View File

@ -2,6 +2,8 @@
require_once 'HTMLPurifier/HTMLModule.php';
require_once 'HTMLPurifier/ElementDef.php';
require_once 'HTMLPurifier/Doctype.php';
require_once 'HTMLPurifier/DoctypeRegistry.php';
require_once 'HTMLPurifier/ContentSets.php';
require_once 'HTMLPurifier/AttrTypes.php';
@ -23,14 +25,20 @@ require_once 'HTMLPurifier/HTMLModule/Image.php';
require_once 'HTMLPurifier/HTMLModule/StyleAttribute.php';
require_once 'HTMLPurifier/HTMLModule/Legacy.php';
require_once 'HTMLPurifier/HTMLModule/Target.php';
require_once 'HTMLPurifier/HTMLModule/Scripting.php';
require_once 'HTMLPurifier/HTMLModule/XMLCommonAttributes.php';
require_once 'HTMLPurifier/HTMLModule/NonXMLCommonAttributes.php';
// proprietary modules
require_once 'HTMLPurifier/HTMLModule/TransformToStrict.php';
require_once 'HTMLPurifier/HTMLModule/TransformToXHTML11.php';
// tidy modules
require_once 'HTMLPurifier/HTMLModule/Tidy.php';
require_once 'HTMLPurifier/HTMLModule/Tidy/XHTMLAndHTML4.php';
require_once 'HTMLPurifier/HTMLModule/Tidy/XHTML.php';
require_once 'HTMLPurifier/HTMLModule/Tidy/XHTMLStrict.php';
require_once 'HTMLPurifier/HTMLModule/Tidy/Proprietary.php';
HTMLPurifier_ConfigSchema::define(
'HTML', 'Doctype', null, 'string/null',
'Doctype to use, valid values are HTML 4.01 Transitional, HTML 4.01 '.
'Doctype to use, pre-defined values are HTML 4.01 Transitional, HTML 4.01 '.
'Strict, XHTML 1.0 Transitional, XHTML 1.0 Strict, XHTML 1.1. '.
'Technically speaking this is not actually a doctype (as it does '.
'not identify a corresponding DTD), but we are using this name '.
@ -38,173 +46,159 @@ HTMLPurifier_ConfigSchema::define(
'like %Core.XHTML or %HTML.Strict.'
);
HTMLPurifier_ConfigSchema::define(
'HTML', 'Trusted', false, 'bool',
'Indicates whether or not the user input is trusted or not. If the '.
'input is trusted, a more expansive set of allowed tags and attributes '.
'will be used. This directive has been available since 2.0.0.'
);
HTMLPurifier_ConfigSchema::define(
'HTML', 'AllowedModules', null, 'lookup/null', '
<p>
A doctype comes with a set of usual modules to use. Without having
to muck about with the doctypes, you can quickly activate or
disable these modules by specifying which modules you wish to allow
with this directive. This is most useful for unit testing specific
modules, although end users may find it useful for their own ends.
</p>
<p>
If you specify a module that does not exist, the manager will silently
fail to use it, so be careful! User-defined modules are not affected
by this directive. Modules defined in %HTML.CoreModules are not
affected by this directive. This directive has been available since 2.0.0.
</p>
');
HTMLPurifier_ConfigSchema::define(
'HTML', 'CoreModules', array(
'Structure' => true,
'Text' => true,
'Hypertext' => true,
'List' => true,
'NonXMLCommonAttributes' => true,
'XMLCommonAttributes' => true,
'CommonAttributes' => true
), 'lookup', '
<p>
Certain modularized doctypes (XHTML, namely), have certain modules
that must be included for the doctype to be a conforming document
type: put those modules here. By default, XHTML\'s core modules
are used. You can set this to a blank array to disable core module
protection, but this is not recommended. This directive has been
available since 2.0.0.
</p>
');
class HTMLPurifier_HTMLModuleManager
{
/**
* Array of HTMLPurifier_Module instances, indexed by module's class name.
* All known modules, regardless of use, are in this array.
* Instance of HTMLPurifier_DoctypeRegistry
* @public
*/
var $doctypes;
/**
* Instance of current doctype
* @public
*/
var $doctype;
/**
* Instance of HTMLPurifier_AttrTypes
* @public
*/
var $attrTypes;
/**
* Active instances of modules for the specified doctype are
* indexed, by name, in this array.
*/
var $modules = array();
/**
* String doctype we will validate against. See $validModules for use.
*
* @note
* There is a special doctype '*' that acts both as the "default"
* doctype if a customized system only defines one doctype and
* also a catch-all doctype that gets merged into all the other
* module collections. When possible, use a private collection to
* share modules between doctypes: this special doctype is to
* make life more convenient for users.
* Array of recognized HTMLPurifier_Module instances, indexed by
* module's class name. This array is usually lazy loaded, but a
* user can overload a module by pre-emptively registering it.
*/
var $doctype;
var $doctypeAliases = array(); /**< Lookup array of strings to real doctypes */
var $registeredModules = array();
/**
* Associative array: $collections[$type][$doctype] = list of modules.
* This is used to logically separate types of functionality so that
* based on the doctype and other configuration settings they may
* be easily switched on and off. Custom setups may not need
* to use this abstraction, opting to have only one big collection
* with one valid doctype.
* List of extra modules that were added by the user using addModule().
* These get unconditionally merged into the current doctype, whatever
* it may be.
*/
var $collections = array();
var $userModules = array();
/**
* Modules that may be used in a valid doctype of this kind.
* Correctional and leniency modules should not be placed in this
* array unless the user said so: don't stuff every possible lenient
* module for this doctype in here.
* Associative array of element name to list of modules that have
* definitions for the element; this array is dynamically filled.
*/
var $validModules = array();
var $validCollections = array(); /**< Collections to merge into $validModules */
/**
* Modules that we will allow in input, subset of $validModules. Single
* element definitions may result in us consulting validModules.
*/
var $activeModules = array();
var $activeCollections = array(); /**< Collections to merge into $activeModules */
var $counter = 0; /**< Designates next available integer order for modules. */
var $initialized = false; /**< Says whether initialize() was called */
/**
* Specifies what doctype to siphon new modules from addModule() to,
* or false to disable the functionality. Must be used in conjunction
* with $autoCollection.
*/
var $autoDoctype = false;
/**
* Specifies what collection to siphon new modules from addModule() to,
* or false to disable the functionality. Must be used in conjunction
* with $autoDoctype.
*/
var $autoCollection = false;
/** Associative array of element name to defining modules (always array) */
var $elementLookup = array();
/** List of prefixes we should use for resolving small names */
/** List of prefixes we should use for registering small names */
var $prefixes = array('HTMLPurifier_HTMLModule_');
var $contentSets; /**< Instance of HTMLPurifier_ContentSets */
var $attrTypes; /**< Instance of HTMLPurifier_AttrTypes */
var $contentSets; /**< Instance of HTMLPurifier_ContentSets */
var $attrCollections; /**< Instance of HTMLPurifier_AttrCollections */
/**
* @param $blank If true, don't do any initializing
*/
function HTMLPurifier_HTMLModuleManager($blank = false) {
/** If set to true, unsafe elements and attributes will be allowed */
var $trusted = false;
function HTMLPurifier_HTMLModuleManager() {
// the only editable internal object. The rest need to
// be manipulated through modules
// editable internal objects
$this->attrTypes = new HTMLPurifier_AttrTypes();
$this->doctypes = new HTMLPurifier_DoctypeRegistry();
if (!$blank) $this->initialize();
// setup default HTML doctypes
}
function initialize() {
$this->initialized = true;
// load default modules to the recognized modules list (not active)
$modules = array(
// define
'CommonAttributes',
'Text', 'Hypertext', 'List', 'Presentation',
'Edit', 'Bdo', 'Tables', 'Image', 'StyleAttribute',
'Target',
// define-redefine
'Legacy',
// redefine
'TransformToStrict', 'TransformToXHTML11'
// module reuse
$common = array(
'CommonAttributes', 'Text', 'Hypertext', 'List',
'Presentation', 'Edit', 'Bdo', 'Tables', 'Image',
'StyleAttribute', 'Scripting'
);
foreach ($modules as $module) {
$this->addModule($module);
}
$transitional = array('Legacy', 'Target');
$xml = array('XMLCommonAttributes');
$non_xml = array('NonXMLCommonAttributes');
// Safe modules for supported doctypes. These are included
// in the valid and active module lists by default
$this->collections['Safe'] = array(
'_Common' => array( // leading _ indicates private
'CommonAttributes', 'Text', 'Hypertext', 'List',
'Presentation', 'Edit', 'Bdo', 'Tables', 'Image',
'StyleAttribute'
),
// HTML definitions, defer to XHTML definitions
'HTML 4.01 Transitional' => array(array('XHTML 1.0 Transitional')),
'HTML 4.01 Strict' => array(array('XHTML 1.0 Strict')),
// XHTML definitions
'XHTML 1.0 Transitional' => array( array('XHTML 1.0 Strict'), 'Legacy', 'Target' ),
'XHTML 1.0 Strict' => array(array('_Common')),
'XHTML 1.1' => array(array('_Common')),
$this->doctypes->register(
'HTML 4.01 Transitional', false,
array_merge($common, $transitional, $non_xml),
array('Tidy_Transitional', 'Tidy_Proprietary')
);
// Modules that specify elements that are unsafe from untrusted
// third-parties. These should be registered in $validModules but
// almost never $activeModules unless you really know what you're
// doing.
$this->collections['Unsafe'] = array();
// Modules to import if lenient mode (attempt to convert everything
// to a valid representation) is on. These must not be in $validModules
// unless specified so.
$this->collections['Lenient'] = array(
'HTML 4.01 Strict' => array(array('XHTML 1.0 Strict')),
'XHTML 1.0 Strict' => array('TransformToStrict'),
'XHTML 1.1' => array(array('XHTML 1.0 Strict'), 'TransformToXHTML11')
$this->doctypes->register(
'HTML 4.01 Strict', false,
array_merge($common, $non_xml),
array('Tidy_Strict', 'Tidy_Proprietary')
);
// Modules to import if correctional mode (correct everything that
// is feasible to strict mode) is on. These must not be in $validModules
// unless specified so.
$this->collections['Correctional'] = array(
'HTML 4.01 Transitional' => array(array('XHTML 1.0 Transitional')),
'XHTML 1.0 Transitional' => array('TransformToStrict'), // probably want a different one
$this->doctypes->register(
'XHTML 1.0 Transitional', true,
array_merge($common, $transitional, $xml, $non_xml),
array('Tidy_Transitional', 'Tidy_XHTML', 'Tidy_Proprietary')
);
// User-space modules, custom code or whatever
$this->collections['Extension'] = array();
$this->doctypes->register(
'XHTML 1.0 Strict', true,
array_merge($common, $xml, $non_xml),
array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_XHTMLStrict', 'Tidy_Proprietary')
);
// setup active versus valid modules. ORDER IS IMPORTANT!
// definition modules
$this->makeCollectionActive('Safe');
$this->makeCollectionValid('Unsafe');
// redefinition modules
$this->makeCollectionActive('Lenient');
$this->makeCollectionActive('Correctional');
$this->autoDoctype = '*';
$this->autoCollection = 'Extension';
$this->doctypes->register(
'XHTML 1.1', true,
array_merge($common, $xml),
array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary') // Tidy_XHTML1_1
);
}
/**
* Adds a module to the recognized module list. This does not
* do anything else: the module must be added to a corresponding
* collection to be "activated".
* Registers a module to the recognized module list, useful for
* overloading pre-existing modules.
* @param $module Mixed: string module name, with or without
* HTMLPurifier_HTMLModule prefix, or instance of
* subclass of HTMLPurifier_HTMLModule.
@ -217,10 +211,15 @@ class HTMLPurifier_HTMLModuleManager
* - Check for literal object name
* - Throw fatal error
* If your object name collides with an internal class, specify
* your module manually.
* your module manually. All modules must have been included
* externally: registerModule will not perform inclusions for you!
* @warning If your module has the same name as an already loaded
* module, your module will overload the old one WITHOUT
* warning.
*/
function addModule($module) {
function registerModule($module) {
if (is_string($module)) {
// attempt to load the module
$original_module = $module;
$ok = false;
foreach ($this->prefixes as $prefix) {
@ -240,16 +239,19 @@ class HTMLPurifier_HTMLModuleManager
}
$module = new $module();
}
$module->order = $this->counter++; // assign then increment
$this->modules[$module->name] = $module;
if ($this->autoDoctype !== false && $this->autoCollection !== false) {
$this->collections[$this->autoCollection][$this->autoDoctype][] = $module->name;
if (empty($module->name)) {
trigger_error('Module instance of ' . get_class($module) . ' must have name');
return;
}
$this->registeredModules[$module->name] = $module;
}
/**
* Safely tests for class existence without invoking __autoload in PHP5
* or greater.
* @param $name String class name to test
* @note If any other class needs it, we'll need to stash in a
* conjectured "compatibility" class
* @private
*/
function _classExists($name) {
@ -265,55 +267,63 @@ class HTMLPurifier_HTMLModuleManager
}
/**
* Makes a collection active, while also making it valid if not
* already done so. See $activeModules for the semantics of "active".
* @param $collection_name Name of collection to activate
* Adds a module to the current doctype by first registering it,
* and then tacking it on to the active doctype
*/
function makeCollectionActive($collection_name) {
if (!in_array($collection_name, $this->validCollections)) {
$this->makeCollectionValid($collection_name);
}
$this->activeCollections[] = $collection_name;
function addModule($module) {
$this->registerModule($module);
if (is_object($module)) $module = $module->name;
$this->userModules[] = $module;
}
/**
* Makes a collection valid. See $validModules for the semantics of "valid"
*/
function makeCollectionValid($collection_name) {
$this->validCollections[] = $collection_name;
}
/**
* Adds a class prefix that addModule() will use to resolve a
* Adds a class prefix that registerModule() will use to resolve a
* string name to a concrete class
*/
function addPrefix($prefix) {
$this->prefixes[] = (string) $prefix;
$this->prefixes[] = $prefix;
}
/**
* Performs processing on modules, after being called you may
* use getElement() and getElements()
* @param $config Instance of HTMLPurifier_Config
*/
function setup($config) {
// load up the autocollection
if ($this->autoCollection !== false) {
$this->makeCollectionActive($this->autoCollection);
$this->trusted = $config->get('HTML', 'Trusted');
// generate
$this->doctype = $this->doctypes->make($config);
$modules = $this->doctype->modules;
// take out the default modules that aren't allowed
$lookup = $config->get('HTML', 'AllowedModules');
$special_cases = $config->get('HTML', 'CoreModules');
if (is_array($lookup)) {
foreach ($modules as $k => $m) {
if (isset($special_cases[$m])) continue;
if (!isset($lookup[$m])) unset($modules[$k]);
}
}
// retrieve the doctype
$this->doctype = $this->getDoctype($config);
if (isset($this->doctypeAliases[$this->doctype])) {
$this->doctype = $this->doctypeAliases[$this->doctype];
// merge in custom modules
$modules = array_merge($modules, $this->userModules);
foreach ($modules as $module) {
$this->processModule($module);
}
// process module collections to module name => module instance form
foreach ($this->collections as $col_i => $x) {
$this->processCollections($this->collections[$col_i]);
foreach ($this->doctype->tidyModules as $module) {
$this->processModule($module);
if (method_exists($this->modules[$module], 'construct')) {
$this->modules[$module]->construct($config);
}
}
$this->validModules = $this->assembleModules($this->validCollections);
$this->activeModules = $this->assembleModules($this->activeCollections);
// setup lookup table based on all valid modules
foreach ($this->validModules as $module) {
foreach ($this->modules as $module) {
foreach ($module->info as $name => $def) {
if (!isset($this->elementLookup[$name])) {
$this->elementLookup[$name] = array();
@ -324,214 +334,51 @@ class HTMLPurifier_HTMLModuleManager
// note the different choice
$this->contentSets = new HTMLPurifier_ContentSets(
// content models that contain non-allowed elements are
// harmless because RemoveForeignElements will ensure
// they never get in anyway, and there is usually no
// reason why you should want to restrict a content
// model beyond what is mandated by the doctype.
// Note, however, that this means redefinitions of
// content models can't be tossed in validModules willy-nilly:
// that stuff still is regulated by configuration.
$this->validModules
// content set assembly deals with all possible modules,
// not just ones deemed to be "safe"
$this->modules
);
$this->attrCollections = new HTMLPurifier_AttrCollections(
$this->attrTypes,
// only explicitly allowed modules are allowed to affect
// the global attribute collections. This means there's
// a distinction between loading the Bdo module, and the
// bdo element: Bdo will enable the dir attribute on all
// elements, while bdo will only define the bdo element,
// which will not have an editable directionality. This might
// catch people who are loading only elements by surprise, so
// we should consider loading an entire module if all the
// elements it defines are requested by the user, especially
// if it affects the global attribute collections.
$this->activeModules
// there is no way to directly disable a global attribute,
// but using AllowedAttributes or simply not including
// the module in your custom doctype should be sufficient
$this->modules
);
}
/**
* Takes a list of collections and merges together all the defined
* modules for the current doctype from those collections.
* @param $collections List of collection suffixes we should grab
* modules from (like 'Safe' or 'Lenient')
* Takes a module and adds it to the active module collection,
* registering it if necessary.
*/
function assembleModules($collections) {
$modules = array();
$numOfCollectionsUsed = 0;
foreach ($collections as $name) {
$disable_global = false;
if (!isset($this->collections[$name])) {
trigger_error("$name collection is undefined", E_USER_ERROR);
continue;
}
$cols = $this->collections[$name];
if (isset($cols[$this->doctype])) {
if (isset($cols[$this->doctype]['*'])) {
unset($cols[$this->doctype]['*']);
$disable_global = true;
}
$modules += $cols[$this->doctype];
$numOfCollectionsUsed++;
}
// accept catch-all doctype
if (
$this->doctype !== '*' &&
isset($cols['*']) &&
!$disable_global
) {
$modules += $cols['*'];
}
function processModule($module) {
if (!isset($this->registeredModules[$module]) || is_object($module)) {
$this->registerModule($module);
}
if ($numOfCollectionsUsed < 1) {
// possible XSS injection if user-specified doctypes
// are allowed
trigger_error("Doctype {$this->doctype} does not exist, ".
"check for typos (if you desire a doctype that allows ".
"no elements, use an empty array collection)", E_USER_ERROR);
}
return $modules;
$this->modules[$module] = $this->registeredModules[$module];
}
/**
* Takes a collection and performs inclusions and substitutions for it.
* @param $cols Reference to collections class member variable
* Retrieves merged element definitions.
* @return Array of HTMLPurifier_ElementDef
*/
function processCollections(&$cols) {
// $cols is the set of collections
// $col_i is the name (index) of a collection
// $col is a collection/list of modules
// perform inclusions
foreach ($cols as $col_i => $col) {
$seen = array();
if (!empty($col[0]) && is_array($col[0])) {
$seen[$col_i] = true; // recursion reporting
$includes = $col[0];
unset($cols[$col_i][0]); // remove inclusions value, recursion guard
} else {
$includes = array();
}
if (empty($includes)) continue;
for ($i = 0; isset($includes[$i]); $i++) {
$inc = $includes[$i];
if (isset($seen[$inc])) {
trigger_error(
"Circular inclusion detected in $col_i collection",
E_USER_ERROR
);
continue;
} else {
$seen[$inc] = true;
}
if (!isset($cols[$inc])) {
trigger_error(
"Collection $col_i tried to include undefined ".
"collection $inc", E_USER_ERROR);
continue;
}
foreach ($cols[$inc] as $module) {
if (is_array($module)) { // another inclusion!
foreach ($module as $inc2) $includes[] = $inc2;
continue;
}
$cols[$col_i][] = $module; // merge in the other modules
}
}
}
// replace with real modules, invert module from list to
// assoc array of module name to module instance
foreach ($cols as $col_i => $col) {
$ignore_global = false;
$order = array();
foreach ($col as $module_i => $module) {
unset($cols[$col_i][$module_i]);
if (is_array($module)) {
trigger_error("Illegal inclusion array at index".
" $module_i found collection $col_i, inclusion".
" arrays must be at start of collection (index 0)",
E_USER_ERROR);
continue;
}
if ($module_i === '*' && $module === false) {
$ignore_global = true;
continue;
}
if (!isset($this->modules[$module])) {
trigger_error(
"Collection $col_i references undefined ".
"module $module",
E_USER_ERROR
);
continue;
}
$module = $this->modules[$module];
$cols[$col_i][$module->name] = $module;
$order[$module->name] = $module->order;
}
array_multisort(
$order, SORT_ASC, SORT_NUMERIC, $cols[$col_i]
);
if ($ignore_global) $cols[$col_i]['*'] = false;
}
// delete pseudo-collections
foreach ($cols as $col_i => $col) {
if ($col_i[0] == '_') unset($cols[$col_i]);
}
}
/**
* Retrieves the doctype from the configuration object
*/
function getDoctype($config) {
$doctype = $config->get('HTML', 'Doctype');
if ($doctype !== null) {
return $doctype;
}
if (!$this->initialized) {
// don't do HTML-oriented backwards compatibility stuff
// use either the auto-doctype, or the catch-all doctype
return $this->autoDoctype ? $this->autoDoctype : '*';
}
// this is backwards-compatibility stuff
if ($config->get('Core', 'XHTML')) {
$doctype = 'XHTML 1.0';
} else {
$doctype = 'HTML 4.01';
}
if ($config->get('HTML', 'Strict')) {
$doctype .= ' Strict';
} else {
$doctype .= ' Transitional';
}
return $doctype;
}
/**
* Retrieves merged element definitions for all active elements.
* @note We may want to generate an elements array during setup
* and pass that on, because a specific combination of
* elements may trigger the loading of a module.
* @param $config Instance of HTMLPurifier_Config, for determining
* stray elements.
*/
function getElements($config) {
function getElements() {
$elements = array();
foreach ($this->activeModules as $module) {
foreach ($this->modules as $module) {
foreach ($module->info as $name => $v) {
if (isset($elements[$name])) continue;
$elements[$name] = $this->getElement($name, $config);
// if element is not safe, don't use it
if (!$this->trusted && ($v->safe === false)) continue;
$elements[$name] = $this->getElement($name);
}
}
// standalone elements now loaded
// remove dud elements, this happens when an element that
// appeared to be safe actually wasn't
foreach ($elements as $n => $v) {
if ($v === false) unset($elements[$n]);
}
return $elements;
@ -540,13 +387,16 @@ class HTMLPurifier_HTMLModuleManager
/**
* Retrieves a single merged element definition
* @param $name Name of element
* @param $config Instance of HTMLPurifier_Config, may not be necessary.
* @param $trusted Boolean trusted overriding parameter: set to true
* if you want the full version of an element
* @return Merged HTMLPurifier_ElementDef
*/
function getElement($name, $config) {
function getElement($name, $trusted = null) {
$def = false;
if ($trusted === null) $trusted = $this->trusted;
$modules = $this->validModules;
$modules = $this->modules;
if (!isset($this->elementLookup[$name])) {
return false;
@ -555,9 +405,23 @@ class HTMLPurifier_HTMLModuleManager
foreach($this->elementLookup[$name] as $module_name) {
$module = $modules[$module_name];
$new_def = $module->info[$name];
// copy is used because, ideally speaking, the original
// definition should not be modified. Usually, this will
// make no difference, but for consistency's sake
$new_def = $module->info[$name]->copy();
// refuse to create/merge in a definition that is deemed unsafe
if (!$trusted && ($new_def->safe === false)) {
$def = false;
continue;
}
if (!$def && $new_def->standalone) {
// element with unknown safety is not to be trusted.
// however, a merge-in definition with undefined safety
// is fine
if (!$trusted && !$new_def->safe) continue;
$def = $new_def;
} elseif ($def) {
$def->mergeIn($new_def);
@ -583,6 +447,13 @@ class HTMLPurifier_HTMLModuleManager
$this->contentSets->generateChildDef($def, $module);
}
// add information on required attributes
foreach ($def->attr as $attr_name => $attr_def) {
if ($attr_def->required) {
$def->required_attr[] = $attr_name;
}
}
return $def;