mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-08-03 12:47:56 +02:00
Compare commits
18 Commits
v4.11.0
...
v2.1.1-str
Author | SHA1 | Date | |
---|---|---|---|
|
80c60bb9b5 | ||
|
503e76081b | ||
|
678a593e62 | ||
|
495164e938 | ||
|
42858ad594 | ||
|
5ecb11f19a | ||
|
0101311193 | ||
|
c35eb3e95f | ||
|
b829e76bbf | ||
|
e967680250 | ||
|
dd2fd06591 | ||
|
cec7a1c087 | ||
|
c2d3d5b859 | ||
|
9a84e11f34 | ||
|
37ea1673dd | ||
|
5395d8b4bd | ||
|
c980e76197 | ||
|
2bf912d528 |
2
Doxyfile
2
Doxyfile
@@ -4,7 +4,7 @@
|
|||||||
# Project related configuration options
|
# Project related configuration options
|
||||||
#---------------------------------------------------------------------------
|
#---------------------------------------------------------------------------
|
||||||
PROJECT_NAME = HTML Purifier
|
PROJECT_NAME = HTML Purifier
|
||||||
PROJECT_NUMBER = 1.3.2
|
PROJECT_NUMBER = 2.1.1
|
||||||
OUTPUT_DIRECTORY = "C:/Documents and Settings/Edward/My Documents/My Webs/htmlpurifier/docs/doxygen"
|
OUTPUT_DIRECTORY = "C:/Documents and Settings/Edward/My Documents/My Webs/htmlpurifier/docs/doxygen"
|
||||||
CREATE_SUBDIRS = NO
|
CREATE_SUBDIRS = NO
|
||||||
OUTPUT_LANGUAGE = English
|
OUTPUT_LANGUAGE = English
|
||||||
|
103
INSTALL
103
INSTALL
@@ -1,4 +1,3 @@
|
|||||||
|
|
||||||
Install
|
Install
|
||||||
How to install HTML Purifier
|
How to install HTML Purifier
|
||||||
|
|
||||||
@@ -10,10 +9,11 @@ should make sure a few things are properly done.
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
1. Compatibility
|
1. Compatibility
|
||||||
|
|
||||||
HTML Purifier works in both PHP 4 and PHP 5, from PHP 4.3.9 and up. It has no
|
HTML Purifier works in both PHP 4 and PHP 5, from PHP 4.3.2 and up. It has no
|
||||||
core dependencies with other libraries. (Whoopee!)
|
core dependencies with other libraries.
|
||||||
|
|
||||||
Optional extensions are iconv (usually installed) and tidy (also common).
|
Optional extensions are iconv (usually installed) and tidy (also common).
|
||||||
If you use UTF-8 and don't plan on pretty-printing HTML, you can get away with
|
If you use UTF-8 and don't plan on pretty-printing HTML, you can get away with
|
||||||
@@ -46,7 +46,10 @@ HTML Purifier is all about web-standards, so accordingly your webpages should
|
|||||||
be standards compliant. HTML Purifier can deal with these doctypes:
|
be standards compliant. HTML Purifier can deal with these doctypes:
|
||||||
|
|
||||||
* XHTML 1.0 Transitional (default)
|
* XHTML 1.0 Transitional (default)
|
||||||
|
* XHTML 1.0 Strict
|
||||||
* HTML 4.01 Transitional
|
* HTML 4.01 Transitional
|
||||||
|
* HTML 4.01 Strict
|
||||||
|
* XHTML 1.1 (sans Ruby)
|
||||||
|
|
||||||
...and these character encodings:
|
...and these character encodings:
|
||||||
|
|
||||||
@@ -65,11 +68,11 @@ the doctype from this code in your HTML documents:
|
|||||||
<meta http-equiv="Content-type" content="text/html;charset=ENCODING">
|
<meta http-equiv="Content-type" content="text/html;charset=ENCODING">
|
||||||
|
|
||||||
For legacy codebases these declarations may be missing. If that is the case,
|
For legacy codebases these declarations may be missing. If that is the case,
|
||||||
STOP, and read up on character encodings and doctypes (in that order). Here
|
STOP, and read docs/enduser-utf8.html
|
||||||
are some links:
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
* http://www.joelonsoftware.com/articles/Unicode.html
|
|
||||||
* http://alistapart.com/stories/doctype/
|
|
||||||
|
|
||||||
You may currently be vulnerable to XSS and other security threats, and HTML
|
You may currently be vulnerable to XSS and other security threats, and HTML
|
||||||
Purifier won't be able to fix that.
|
Purifier won't be able to fix that.
|
||||||
@@ -86,7 +89,7 @@ into configuring things just for the heck of it, skip to 4.3).
|
|||||||
* Am I using UTF-8?
|
* Am I using UTF-8?
|
||||||
* Am I using XHTML 1.0 Transitional?
|
* Am I using XHTML 1.0 Transitional?
|
||||||
|
|
||||||
If you answered yes to any of these questions, instantiate a configuration
|
If you answered no to any of these questions, instantiate a configuration
|
||||||
object and read on:
|
object and read on:
|
||||||
|
|
||||||
$config = HTMLPurifier_Config::createDefault();
|
$config = HTMLPurifier_Config::createDefault();
|
||||||
@@ -113,36 +116,42 @@ websites):
|
|||||||
|
|
||||||
Note that HTML Purifier's support for non-Unicode encodings is crippled by the
|
Note that HTML Purifier's support for non-Unicode encodings is crippled by the
|
||||||
fact that any character not supported by that encoding will be silently
|
fact that any character not supported by that encoding will be silently
|
||||||
dropped, EVEN if it is ampersand escaped. This is a current limitation of
|
dropped, EVEN if it is ampersand escaped. If you want to work around
|
||||||
HTML Purifier that we are NOT actively working to fix. Patches are welcome,
|
this, you are welcome to read docs/enduser-utf8.html for a fix,
|
||||||
but there are so many other gotchas and problems in I18N for non-Unicode
|
but please be cognizant of the issues the "solution" creates (for this
|
||||||
encodings that this functionality is low priority. See
|
reason, I do not include the solution in this document).
|
||||||
<http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html> for a more
|
|
||||||
detailed lowdown on the topic.
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
4.2. Setting a different doctype
|
4.2. Setting a different doctype
|
||||||
|
|
||||||
For those of you stuck using HTML 4.01 Transitional, you can disable
|
For those of you using HTML 4.01 Transitional, you can disable
|
||||||
XHTML output like this:
|
XHTML output like this:
|
||||||
|
|
||||||
$config->set('Core', 'XHTML', false);
|
$config->set('HTML', 'Doctype', 'HTML 4.01 Transitional');
|
||||||
|
|
||||||
I recommend that you use XHTML, although not as much as I recommend UTF-8. If
|
Other supported doctypes include:
|
||||||
your HTML 4.01 page validates, good for you!
|
|
||||||
|
|
||||||
Currently, we can only guarantee transitional-compliant output, future
|
|
||||||
versions will also allow strict-compliant output.
|
* HTML 4.01 Strict
|
||||||
|
* HTML 4.01 Transitional
|
||||||
|
* XHTML 1.0 Strict
|
||||||
|
* XHTML 1.0 Transitional
|
||||||
|
* XHTML 1.1
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
4.3. Other settings
|
4.3. Other settings
|
||||||
|
|
||||||
There are more configuration directives which can be read about
|
There are more configuration directives which can be read about
|
||||||
here: <http://hp.jpsband.org/live/configdoc/plain.html> They're a bit boring,
|
here: <http://htmlpurifier.org/live/configdoc/plain.html> They're a bit boring,
|
||||||
but they can help out for those of you who like to exert maximum control over
|
but they can help out for those of you who like to exert maximum control over
|
||||||
your code.
|
your code. Some of the more interesting ones are configurable at the
|
||||||
|
demo <http://htmlpurifier.org/demo.php> and are well worth looking into
|
||||||
|
for your own system.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
@@ -159,13 +168,15 @@ The interface is mind-numbingly simple:
|
|||||||
$clean_html = $purifier->purify( $dirty_html );
|
$clean_html = $purifier->purify( $dirty_html );
|
||||||
|
|
||||||
That's it! For more examples, check out docs/examples/ (they aren't very
|
That's it! For more examples, check out docs/examples/ (they aren't very
|
||||||
different though). Also, SLOW gives advice on what to do if HTML Purifier
|
different though). Also, docs/enduser-slow.html gives advice on what to
|
||||||
is slowing down your application.
|
do if HTML Purifier is slowing down your application.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
6. Quick install
|
6. Quick install
|
||||||
|
|
||||||
|
First, make sure library/HTMLPurifier/DefinitionCache/Serializer is
|
||||||
|
writable by the webserver (see Section 7: Caching below for details).
|
||||||
If your website is in UTF-8 and XHTML Transitional, use this code:
|
If your website is in UTF-8 and XHTML Transitional, use this code:
|
||||||
|
|
||||||
<?php
|
<?php
|
||||||
@@ -181,9 +192,47 @@ If your website is in a different encoding or doctype, use this code:
|
|||||||
require_once '/path/to/htmlpurifier/library/HTMLPurifier.auto.php';
|
require_once '/path/to/htmlpurifier/library/HTMLPurifier.auto.php';
|
||||||
|
|
||||||
$config = HTMLPurifier_Config::createDefault();
|
$config = HTMLPurifier_Config::createDefault();
|
||||||
$config->set('Core', 'Encoding', 'ISO-8859-1'); //replace with your encoding
|
$config->set('Core', 'Encoding', 'ISO-8859-1'); // replace with your encoding
|
||||||
$config->set('Core', 'XHTML', true); //replace with false if HTML 4.01
|
$config->set('HTML', 'Doctype', 'HTML 4.01 Transitional'); // replace with your doctype
|
||||||
$purifier = new HTMLPurifier($config);
|
$purifier = new HTMLPurifier($config);
|
||||||
|
|
||||||
$clean_html = $purifier->purify($dirty_html);
|
$clean_html = $purifier->purify($dirty_html);
|
||||||
?>
|
?>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
7. Caching
|
||||||
|
|
||||||
|
HTML Purifier generates some cache files (generally one or two) to speed up
|
||||||
|
its execution. For maximum performance, make sure that
|
||||||
|
library/HTMLPurifier/DefinitionCache/Serializer is writeable by the webserver.
|
||||||
|
|
||||||
|
If you are in the library/ folder of HTML Purifier, you can set the
|
||||||
|
appropriate permissions using:
|
||||||
|
|
||||||
|
chmod -R 0755 HTMLPurifier/DefinitionCache/Serializer
|
||||||
|
|
||||||
|
If the above command doesn't work, you may need to assign write permissions
|
||||||
|
to all. This may be necessary if your webserver runs as nobody, but is
|
||||||
|
not recommended since it means any other user can write files in the
|
||||||
|
directory. Use:
|
||||||
|
|
||||||
|
chmod -R 0777 HTMLPurifier/DefinitionCache/Serializer
|
||||||
|
|
||||||
|
You can also chmod files via your FTP client; this option
|
||||||
|
is usually accessible by right clicking the corresponding directory and
|
||||||
|
then selecting "chmod" or "file permissions".
|
||||||
|
|
||||||
|
Starting with 2.0.1, HTML Purifier will generate friendly error messages
|
||||||
|
that will tell you exactly what you have to chmod the directory to; if in doubt,
|
||||||
|
follow its advice.
|
||||||
|
|
||||||
|
If you are unable or unwilling to give write permissions to the cache
|
||||||
|
directory, you can either disable the cache (and suffer a performance
|
||||||
|
hit):
|
||||||
|
|
||||||
|
$config->set('Core', 'DefinitionCache', null);
|
||||||
|
|
||||||
|
Or move the cache directory somewhere else (no trailing slash):
|
||||||
|
|
||||||
|
$config->set('Cache', 'SerializerPath', '/home/user/absolute/path');
|
||||||
|
71
INSTALL.fr.utf8
Normal file
71
INSTALL.fr.utf8
Normal file
@@ -0,0 +1,71 @@
|
|||||||
|
|
||||||
|
Installation
|
||||||
|
Comment installer HTML Purifier
|
||||||
|
|
||||||
|
Attention: Ce document est encodé en UTF-8. Si les lettres avec les accents
|
||||||
|
sont illisibles, prenez un meilleur éditeur de texte.
|
||||||
|
|
||||||
|
À L'Aide: Je ne suis pas un diseur natif de français. Si vous trouvez une
|
||||||
|
erreur dans ce document, racontez-moi! Merci.
|
||||||
|
|
||||||
|
|
||||||
|
L'installation de HTML Purifier est très simple, parce qu'il ne doit pas
|
||||||
|
la configuration. Dans le pied de ce document, les utilisateurs
|
||||||
|
impatient peuvent trouver le code, mais je recommande que vous lisez
|
||||||
|
ce document pour quelques choses.
|
||||||
|
|
||||||
|
|
||||||
|
1. Compatibilité
|
||||||
|
|
||||||
|
HTML Purifier fonctionne dans PHP 4 et PHP 5. PHP 4.3.2 est le dernier
|
||||||
|
version que je le testais. Il ne dépend de les autre librairies.
|
||||||
|
|
||||||
|
Les extensions optionnel est iconv (en général déjà installer) et
|
||||||
|
tidy (répandu aussi). Si vous utilisez UTF-8 et ne voulez pas
|
||||||
|
l'indentation, vous pouvez utiliser HTML Purifier sans ces extensions.
|
||||||
|
|
||||||
|
|
||||||
|
2. Inclure la librairie
|
||||||
|
|
||||||
|
Utilisez:
|
||||||
|
|
||||||
|
require_once '/path/to/library/HTMLPurifier.auto.php';
|
||||||
|
|
||||||
|
...quand vous devez utiliser HTML Purifier (ne inclure pas quand vous
|
||||||
|
ne devez pas, parce que HTML Purifier est très grand.)
|
||||||
|
|
||||||
|
Si vous n'aime pas que HTML Purifier change vos include_path, on peut
|
||||||
|
change vos include_path, et:
|
||||||
|
|
||||||
|
require_once 'HTMLPurifier.php';
|
||||||
|
|
||||||
|
Seulement les contents dans library/ est essentiel; vous peut enlever
|
||||||
|
les autre fichiers quand vous est dans une atmosphère professionnel.
|
||||||
|
|
||||||
|
|
||||||
|
[En cours de construction]
|
||||||
|
|
||||||
|
|
||||||
|
6. Installation vite
|
||||||
|
|
||||||
|
Si votre site web est en UTF-8 et XHTML Transitional, utilisez:
|
||||||
|
|
||||||
|
<?php
|
||||||
|
require_once '/path/to/htmlpurifier/library/HTMLPurifier.auto.php';
|
||||||
|
|
||||||
|
$purificateur = new HTMLPurifier();
|
||||||
|
$html_propre = $purificateur->purify($html_salle);
|
||||||
|
?>
|
||||||
|
|
||||||
|
Sinon, utilisez:
|
||||||
|
|
||||||
|
<?php
|
||||||
|
require_once '/path/to/htmlpurifier/library/HTMLPurifier.auto.php';
|
||||||
|
|
||||||
|
$config = HTMLPurifier_Config::createDefault();
|
||||||
|
$config->set('Core', 'Encoding', 'ISO-8859-1'); //remplacez avec votre encoding
|
||||||
|
$config->set('Core', 'XHTML', true); //remplacez avec false si HTML 4.01
|
||||||
|
$purificateur = new HTMLPurifier($config);
|
||||||
|
|
||||||
|
$html_propre = $purificateur->purify($html_salle);
|
||||||
|
?>
|
275
NEWS
275
NEWS
@@ -9,11 +9,276 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
|||||||
. Internal change
|
. Internal change
|
||||||
==========================
|
==========================
|
||||||
|
|
||||||
1.4.0, unknown release date
|
2.1.1, released 2007-08-04
|
||||||
(major feature release)
|
- Fix show-stopper bug in %URI.MakeAbsolute functionality
|
||||||
|
- Fix PHP4 syntax error in standalone version
|
||||||
|
. Add prefix directory to include path for standalone, this prevents
|
||||||
|
other installations from clobbering the standalone's URI schemes
|
||||||
|
. Single test methods can be invoked by prefixing with __only
|
||||||
|
|
||||||
1.3.3, unknown release date, may be dropped
|
2.1.0, released 2007-08-02
|
||||||
(security/bugfix/minor feature release)
|
# flush-htmldefinition-cache.php superseded in favor of a generic
|
||||||
|
flush-definition-cache.php script, you can clear a specific cache
|
||||||
|
by passing its name as a parameter to the script
|
||||||
|
! Phorum mod implemented for HTML Purifier
|
||||||
|
! With %Core.AggressivelyFixLt, <3 and similar emoticons no longer
|
||||||
|
trigger HTML removal in PHP5 (DOMLex). This directive is not necessary
|
||||||
|
for PHP4 (DirectLex).
|
||||||
|
! Standalone file now available, which greatly reduces the amount of
|
||||||
|
includes (although there are still a few files that reside in the
|
||||||
|
standalone folder)
|
||||||
|
! Relative URIs can now be transformed into their absolute equivalents
|
||||||
|
using %URI.Base and %URI.MakeAbsolute
|
||||||
|
! Ruby implemented for XHTML 1.1
|
||||||
|
! You can now define custom URI filtering behavior, see enduser-uri-filter.html
|
||||||
|
for more details
|
||||||
|
! UTF-8 font names now supported in CSS
|
||||||
|
- AutoFormatters emit friendly error messages if tags or attributes they
|
||||||
|
need are not allowed
|
||||||
|
- ConfigForm's compactification of directive names is now configurable
|
||||||
|
- AutoParagraph autoformatter algorithm refined after field-testing
|
||||||
|
- XHTML 1.1 now applies XHTML 1.0 Strict cleanup routines, namely
|
||||||
|
blockquote wrapping
|
||||||
|
- Contents of <style> tags removed by default when tags are removed
|
||||||
|
. HTMLPurifier_Config->getSerial() implemented, this is extremely useful
|
||||||
|
for output cache invalidation
|
||||||
|
. ConfigForm printer now can retrieve CSS and JS files as strings, in
|
||||||
|
case HTML Purifier's directory is not publicly accessible
|
||||||
|
. Introduce new text/itext configuration directive values: these represent
|
||||||
|
longer strings that would be more appropriately edited with a textarea
|
||||||
|
. Allow newlines to act as separators for lists, hashes, lookups and
|
||||||
|
%HTML.Allowed
|
||||||
|
. ConfigForm generates textareas instead of text inputs for lists, hashes,
|
||||||
|
lookups, text and itext fields
|
||||||
|
. Hidden element content removal genericized: %Core.HiddenElements can
|
||||||
|
be used to customize this behavior, by default <script> and <style> are
|
||||||
|
hidden
|
||||||
|
. Added HTMLPURIFIER_PREFIX constant, should be used instead of dirname(__FILE__)
|
||||||
|
. Custom ChildDef added to default include list
|
||||||
|
. URIScheme reflection improved: will not attempt to include file if class
|
||||||
|
already exists. May clobber autoload, so I need to keep an eye on it
|
||||||
|
. ConfigSchema heavily optimized, will only collect information and validate
|
||||||
|
definitions when HTMLPURIFIER_SCHEMA_STRICT is true.
|
||||||
|
. AttrDef_URI unit tests and implementation refactored
|
||||||
|
. benchmarks/ directory now protected from public view with .htaccess file;
|
||||||
|
run the tests via command line
|
||||||
|
. URI scheme is munged off if there is no authority and the scheme is the
|
||||||
|
default one
|
||||||
|
. All unit tests inherit from HTMLPurifier_Harness, not UnitTestCase
|
||||||
|
. Interface for URIScheme changed
|
||||||
|
. Generic URI object to hold components of URI added, most systems involved
|
||||||
|
in URI validation have been migrated to use it
|
||||||
|
. Custom filtering for URIs factored out to URIDefinition interface for
|
||||||
|
maximum extensibility
|
||||||
|
|
||||||
|
2.0.1, released 2007-06-27
|
||||||
|
! Tag auto-closing now based on a ChildDef heuristic rather than a
|
||||||
|
manually set auto_close array; some behavior may change
|
||||||
|
! Experimental AutoFormat functionality added: auto-paragraph and
|
||||||
|
linkify your HTML input by setting %AutoFormat.AutoParagraph and
|
||||||
|
%AutoFormat.Linkify to true
|
||||||
|
! Newlines normalized internally, and then converted back to the
|
||||||
|
value of PHP_EOL. If this is not desired, set your newline format
|
||||||
|
using %Output.Newline.
|
||||||
|
! Beta error collection, messages are implemented for the most generic
|
||||||
|
cases involving Lexing or Strategies
|
||||||
|
- Clean up special case code for <script> tags
|
||||||
|
- Reorder includes for DefinitionCache decorators, fixes a possible
|
||||||
|
missing class error
|
||||||
|
- Fixed bug where manually modified definitions were not saved via cache
|
||||||
|
(mostly harmless, except for the fact that it would be a little slower)
|
||||||
|
- Configuration objects with different serials do not clobber each
|
||||||
|
other when revision numbers are unequal
|
||||||
|
- Improve Serializer DefinitionCache directory permissions checks
|
||||||
|
- DefinitionCache no longer throws errors when it encounters old
|
||||||
|
serial files that do not conform to the current style
|
||||||
|
- Stray xmlns attributes removed from configuration documentation
|
||||||
|
- configForm.php smoketest no longer has XSS vulnerability due to
|
||||||
|
unescaped print_r output
|
||||||
|
- Printer adheres to configuration's directives on output format
|
||||||
|
- Fix improperly named form field in ConfigForm printer
|
||||||
|
. Rewire some test-cases to swallow errors rather than expect them
|
||||||
|
. HTMLDefinition printer updated with some of the new attributes
|
||||||
|
. DefinitionCache keys reordered to reflect precedence: version number,
|
||||||
|
hash, then revision number
|
||||||
|
. %Core.DefinitionCache renamed to %Cache.DefinitionImpl
|
||||||
|
. Interlinking in configuration documentation added using
|
||||||
|
Injector_PurifierLinkify
|
||||||
|
. Directives now keep track of aliases to themselves
|
||||||
|
. Error collector now requires a severity to be passed, use PHP's internal
|
||||||
|
error constants for this
|
||||||
|
. HTMLPurifier_Config::getAllowedDirectivesForForm implemented, allows
|
||||||
|
much easier selective embedding of configuration values
|
||||||
|
. Doctype objects now accept public and system DTD identifiers
|
||||||
|
. %HTML.Doctype is now constrained by specific values, to specify a custom
|
||||||
|
doctype use new %HTML.CustomDoctype
|
||||||
|
. ConfigForm truncates long directives to keep the form small, and does
|
||||||
|
not re-output namespaces
|
||||||
|
|
||||||
|
2.0.0, released 2007-06-20
|
||||||
|
# Completely refactored HTMLModuleManager, decentralizing safety
|
||||||
|
information
|
||||||
|
# Transform modules changed to Tidy modules, which offer more flexibility
|
||||||
|
and better modularization
|
||||||
|
# Configuration object now finalizes itself when a read operation is
|
||||||
|
performed on it, ensuring that its internal state stays consistent.
|
||||||
|
To revert this behavior, you can set the $autoFinalize member variable
|
||||||
|
off, but it's not recommended.
|
||||||
|
# New compact syntax for AttrDef objects that can be used to instantiate
|
||||||
|
new objects via make()
|
||||||
|
# Definitions (esp. HTMLDefinition) are now cached for a significant
|
||||||
|
performance boost. You can disable caching by setting %Core.DefinitionCache
|
||||||
|
to null. You CANNOT edit raw definitions without setting the corresponding
|
||||||
|
DefinitionID directive (%HTML.DefinitionID for HTMLDefinition).
|
||||||
|
# Contents between <script> tags are now completely removed if <script>
|
||||||
|
is not allowed
|
||||||
|
# Prototype-declarations for Lexer removed in favor of configuration
|
||||||
|
determination of Lexer implementations.
|
||||||
|
! HTML Purifier now works in PHP 4.3.2.
|
||||||
|
! Configuration form-editing API makes tweaking HTMLPurifier_Config a
|
||||||
|
breeze!
|
||||||
|
! Configuration directives that accept hashes now allow new string
|
||||||
|
format: key1:value1,key2:value2
|
||||||
|
! ConfigDoc now factored into OOP design
|
||||||
|
! All deprecated elements now natively supported
|
||||||
|
! Implement TinyMCE styled whitelist specification format in
|
||||||
|
%HTML.Allowed
|
||||||
|
! Config object gives more friendly error messages when things go wrong
|
||||||
|
! Advanced API implemented: easy functions for creating elements (addElement)
|
||||||
|
and attributes (addAttribute) on HTMLDefinition
|
||||||
|
! Add native support for required attributes
|
||||||
|
- Deprecated and removed EnableRedundantUTF8Cleaning. It didn't even work!
|
||||||
|
- DOMLex will not emit errors when a custom error handler that does not
|
||||||
|
honor error_reporting is used
|
||||||
|
- StrictBlockquote child definition refrains from wrapping whitespace
|
||||||
|
in tags now.
|
||||||
|
- Bug resulting from tag transforms to non-allowed elements fixed
|
||||||
|
- ChildDef_Custom's regex generation has been improved, removing several
|
||||||
|
false positives
|
||||||
|
. Unit test for ElementDef created, ElementDef behavior modified to
|
||||||
|
be more flexible
|
||||||
|
. Added convenience functions for HTMLModule constructors
|
||||||
|
. AttrTypes now has accessor functions that should be used instead
|
||||||
|
of directly manipulating info
|
||||||
|
. TagTransform_Center deprecated in favor of generic TagTransform_Simple
|
||||||
|
. Add extra protection in AttrDef_URI against phantom Schemes
|
||||||
|
. Doctype object added to HTMLDefinition which describes certain aspects
|
||||||
|
of the operational document type
|
||||||
|
. Lexer is now pre-emptively included, with a conditional include for the
|
||||||
|
PHP5 only version.
|
||||||
|
. HTMLDefinition and CSSDefinition have a common parent class: Definition.
|
||||||
|
. DirectLex can now track line-numbers
|
||||||
|
. Preliminary error collector is in place, although no code actually reports
|
||||||
|
errors yet
|
||||||
|
. Factor out most of ValidateAttributes to new AttrValidator class
|
||||||
|
|
||||||
|
1.6.1, released 2007-05-05
|
||||||
|
! Support for more deprecated attributes via transformations:
|
||||||
|
+ hspace and vspace in img
|
||||||
|
+ size and noshade in hr
|
||||||
|
+ nowrap in td
|
||||||
|
+ clear in br
|
||||||
|
+ align in caption, table, img and hr
|
||||||
|
+ type in ul, ol and li
|
||||||
|
! DirectLex now preserves text in which a < bracket is followed by
|
||||||
|
a non-alphanumeric character. This means that certain emoticons
|
||||||
|
are now preserved.
|
||||||
|
! %Core.RemoveInvalidImg is now operational, when set to false invalid
|
||||||
|
images will hang around with an empty src
|
||||||
|
! target attribute in a tag supported, use %Attr.AllowedFrameTargets
|
||||||
|
to enable
|
||||||
|
! CSS property white-space now allows nowrap (supported in all modern
|
||||||
|
browsers) but not others (which have spotty browser implementations)
|
||||||
|
! XHTML 1.1 mode now sort-of works without any fatal errors, and
|
||||||
|
lang is now moved over to xml:lang.
|
||||||
|
! Attribute transformation smoketest available at smoketests/attrTransform.php
|
||||||
|
! Transformation of font's size attribute now handles super-large numbers
|
||||||
|
- Possibly fatal bug with __autoload() fixed in module manager
|
||||||
|
- Invert HTMLModuleManager->addModule() processing order to check
|
||||||
|
prefixes first and then the literal module
|
||||||
|
- Empty strings get converted to empty arrays instead of arrays with
|
||||||
|
an empty string in them.
|
||||||
|
- Merging in attribute lists now works.
|
||||||
|
. Demo script removed: it has been added to the website's repository
|
||||||
|
. Basic.php script modified to work out of the box
|
||||||
|
. Refactor AttrTransform classes to reduce duplication
|
||||||
|
. AttrTransform_TextAlign axed in favor of a more general
|
||||||
|
AttrTransform_EnumToCSS, refer to HTMLModule/TransformToStrict.php to
|
||||||
|
see how the new equivalent is implemented
|
||||||
|
. Unit tests now use exclusively assertIdentical
|
||||||
|
|
||||||
|
1.6.0, released 2007-04-01
|
||||||
|
! Support for most common deprecated attributes via transformations:
|
||||||
|
+ bgcolor in td, th, tr and table
|
||||||
|
+ border in img
|
||||||
|
+ name in a and img
|
||||||
|
+ width in td, th and hr
|
||||||
|
+ height in td, th
|
||||||
|
! Support for CSS attribute 'height' added
|
||||||
|
! Support for rel and rev attributes in a tags added, use %Attr.AllowedRel
|
||||||
|
and %Attr.AllowedRev to activate
|
||||||
|
- You can define ID blacklists using regular expressions via
|
||||||
|
%Attr.IDBlacklistRegexp
|
||||||
|
- Error messages are emitted when you attempt to "allow" elements or
|
||||||
|
attributes that HTML Purifier does not support
|
||||||
|
|
||||||
|
|
||||||
|
- Fix segfault in unit test. The problem is not very reproducible and
|
||||||
|
I don't know what causes it, but a six line patch fixed it.
|
||||||
|
|
||||||
|
1.5.0, released 2007-03-23
|
||||||
|
! Added a rudimentary I18N and L10N system modeled off MediaWiki. It
|
||||||
|
doesn't actually do anything yet, but keep your eyes peeled.
|
||||||
|
! docs/enduser-utf8.html explains how to use UTF-8 and HTML Purifier
|
||||||
|
! Newly structured HTMLDefinition modeled off of XHTML 1.1 modules.
|
||||||
|
I am loath to release beta quality APIs, but this is exactly that;
|
||||||
|
don't use the internal interfaces if you're not willing to do migration
|
||||||
|
later on.
|
||||||
|
- Allow 'x' subtag in language codes
|
||||||
|
- Fixed buggy chameleon-support for ins and del
|
||||||
|
. Added support for IDREF attributes (i.e. for)
|
||||||
|
. Renamed HTMLPurifier_AttrDef_Class to HTMLPurifier_AttrDef_Nmtokens
|
||||||
|
. Removed context variable ParentType, replaced with IsInline, which
|
||||||
|
is false when you're not inline and an integer of the parent that
|
||||||
|
caused you to become inline when you are (so possibly zero)
|
||||||
|
. Removed ElementDef->type in favor of ElementDef->descendants_are_inline
|
||||||
|
and HTMLDefinition->content_sets
|
||||||
|
. StrictBlockquote now reports what elements it's supposed to allow,
|
||||||
|
rather than what it does allow
|
||||||
|
. Removed HTMLDefinition->info_flow_elements in favor of
|
||||||
|
HTMLDefinition->content_sets['Flow']
|
||||||
|
. Removed redundant "exclusionary" definitions from DTD roster
|
||||||
|
. StrictBlockquote now requires a construction parameter as if it
|
||||||
|
were a Required ChildDef, this is the "real" set of allowed elements
|
||||||
|
. AttrDef partitioned into HTML, CSS and URI segments
|
||||||
|
. Modify Youtube filter regexp to be multiline
|
||||||
|
. Require both PHP5 and DOM extension in order to use DOMLex, fixes
|
||||||
|
some edge cases where a DOMDocument class exists in a PHP4 environment
|
||||||
|
due to DOM XML extension.
|
||||||
|
|
||||||
|
1.4.1, released 2007-01-21
|
||||||
|
! docs/enduser-youtube.html updated according to new functionality
|
||||||
|
- YouTube IDs can have underscores and dashes
|
||||||
|
|
||||||
|
1.4.0, released 2007-01-21
|
||||||
|
! Implemented list-style-image, URIs now allowed in list-style
|
||||||
|
! Implemented background-image, background-repeat, background-attachment
|
||||||
|
and background-position CSS properties. Shorthand property background
|
||||||
|
supports all of these properties.
|
||||||
|
! Configuration documentation looks nicer
|
||||||
|
! Added %Core.EscapeNonASCIICharacters to workaround loss of Unicode
|
||||||
|
characters while %Core.Encoding is set to a non-UTF-8 encoding.
|
||||||
|
! Support for configuration directive aliases added
|
||||||
|
! Config object can now be instantiated from ini files
|
||||||
|
! YouTube preservation code added to the core, with two lines of code
|
||||||
|
you can add it as a filter to your code. See smoketests/preserveYouTube.php
|
||||||
|
for sample code.
|
||||||
|
! Moved SLOW to docs/enduser-slow.html and added code examples
|
||||||
|
- Replaced version check with functionality check for DOM (thanks Stephen
|
||||||
|
Khoo)
|
||||||
|
. Added smoketest 'all.php', which loads all other smoketests via frames
|
||||||
|
. Implemented AttrDef_CSSURI for url(http://google.com) style declarations
|
||||||
|
. Added convenient single test selector form on test runner
|
||||||
|
|
||||||
1.3.2, released 2006-12-25
|
1.3.2, released 2006-12-25
|
||||||
! HTMLPurifier object now accepts configuration arrays, no need to manually
|
! HTMLPurifier object now accepts configuration arrays, no need to manually
|
||||||
@@ -156,4 +421,4 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
|||||||
! First public release, most functionality implemented. Notable omissions are:
|
! First public release, most functionality implemented. Notable omissions are:
|
||||||
+ Shorthand CSS properties
|
+ Shorthand CSS properties
|
||||||
+ Table CSS properties
|
+ Table CSS properties
|
||||||
+ Deprecated attribute transformations
|
+ Deprecated attribute transformations
|
25
README
25
README
@@ -1,13 +1,22 @@
|
|||||||
|
|
||||||
README
|
README
|
||||||
All about HTMLPurifier
|
All about HTML Purifier
|
||||||
|
|
||||||
HTMLPurifier is an HTML filtering solution. It uses a unique combination of
|
HTML Purifier is an HTML filtering solution that uses a unique combination
|
||||||
robust whitelists and aggressive parsing to ensure that not only are XSS
|
of robust whitelists and aggressive parsing to ensure that not only are
|
||||||
attacks thwarted, but the resulting HTML is standards compliant.
|
XSS attacks thwarted, but the resulting HTML is standards compliant.
|
||||||
|
|
||||||
See INSTALL on how to use the library. See docs/ for more developer-oriented
|
HTML Purifier is oriented towards richly formatted documents from
|
||||||
documentation as well as some code examples. Users of TinyMCE or FCKeditor
|
untrusted sources that require CSS and a full tag-set. This library can
|
||||||
may be especially interested in WYSIWYG.
|
be configured to accept a more restrictive set of tags, but it won't be
|
||||||
|
as efficient as more bare-bones parsers. It will, however, do the job
|
||||||
|
right, which may be more important.
|
||||||
|
|
||||||
HTMLPurifier can be found on the web at: http://hp.jpsband.org/
|
Places to go:
|
||||||
|
|
||||||
|
* See INSTALL for a quick installation guide
|
||||||
|
* See docs/ for developer-oriented documentation, code examples and
|
||||||
|
an in-depth installation guide.
|
||||||
|
* See WYSIWYG for information on editors like TinyMCE and FCKeditor
|
||||||
|
|
||||||
|
HTML Purifier can be found on the web at: http://htmlpurifier.org/
|
||||||
|
40
SLOW
40
SLOW
@@ -1,40 +0,0 @@
|
|||||||
|
|
||||||
SLOW
|
|
||||||
also known as the HELP ME LIBRARY IS TOO SLOW MY PAGE TAKE TOO LONG LOAD page
|
|
||||||
|
|
||||||
HTML Purifier is a very powerful library. But with power comes great
|
|
||||||
responsibility, or, at least, longer execution times. Remember, this
|
|
||||||
library isn't lightly grazing over submitted HTML: it's deconstructing
|
|
||||||
the whole thing, rigorously checking the parts, and then putting it
|
|
||||||
back together.
|
|
||||||
|
|
||||||
So, if it so turns out that HTML Purifier is kinda too slow for outbound
|
|
||||||
filtering, you've got a few options:
|
|
||||||
|
|
||||||
1. Inbound filtering - perform filtering of HTML when it's submitted by the
|
|
||||||
user. Since the user is already submitting something, an extra half a
|
|
||||||
second tacked on to the load time probably isn't going to be that huge of
|
|
||||||
a problem. Then, displaying the content is simply a matter of outputting
|
|
||||||
it directly from your database/filesystem. The trouble with this method is
|
|
||||||
that your user loses the original text, and when doing edits, will be
|
|
||||||
handling the filtered text. While this may be a good thing, especially if
|
|
||||||
you're using a WYSIWYG editor, it can also result in data-loss if a user
|
|
||||||
makes a typo.
|
|
||||||
|
|
||||||
2. Caching the filtered output - accept the submitted text and put it
|
|
||||||
unaltered into the database, but then also generate a filtered version and
|
|
||||||
stash that in the database. Serve the filtered version to readers, and the
|
|
||||||
unaltered version to editors. If need be, you can invalidate the cache and
|
|
||||||
have the cached filtered version be regenerated on the first page view. Pros?
|
|
||||||
Full data retention. Cons? It's more complicated, and opens other editors
|
|
||||||
up to XSS if they are using a WYSIWYG editor (to fix that, they'd have to
|
|
||||||
be able to get their hands on the *really* original text served in plaintext
|
|
||||||
mode).
|
|
||||||
|
|
||||||
In short, inbound filtering is almost as simple as outbound filtering, but
|
|
||||||
it has some drawbacks which cannot be fixed unless you save both the original
|
|
||||||
and the filtered versions.
|
|
||||||
|
|
||||||
There is a third option: profile and optimize HTMLPurifier yourself. Be sure
|
|
||||||
to report back your results if you decide to do that! Especially if you
|
|
||||||
port HTML Purifier to C++. ;-)
|
|
136
TODO
136
TODO
@@ -1,97 +1,87 @@
|
|||||||
|
|
||||||
TODO List
|
TODO List
|
||||||
|
|
||||||
= KEY ====================
|
= KEY ====================
|
||||||
# Flagship
|
# Flagship
|
||||||
- Regular
|
- Regular
|
||||||
? At-risk
|
? Maybe I'll Do It
|
||||||
==========================
|
==========================
|
||||||
|
|
||||||
1.4 release
|
If no interest is expressed for a feature that may require a considerable
|
||||||
# More extensive URI filtering schemes (see docs/proposal-new-directives.txt)
|
amount of effort to implement, it may get endlessly delayed. Do not be
|
||||||
# Allow for background-image and list-style-image (intrinsically tied to above)
|
afraid to cast your vote for the next feature to be implemented!
|
||||||
# Add hooks for custom behavior (for instance, YouTube preservation)
|
|
||||||
- Aggressive caching
|
|
||||||
? Rich set* methods and config file loaders for HTMLPurifier_Config
|
|
||||||
? Configuration profiles: sets of directives that get set with one func call
|
|
||||||
? ConfigSchema directive aliases (so we can rename some of them)
|
|
||||||
? URI validation routines tighter (see docs/dev-code-quality.html) (COMPLEX)
|
|
||||||
|
|
||||||
1.5 release
|
2.2 release [Error'ed]
|
||||||
# Error logging for filtering/cleanup procedures
|
# Error logging for filtering/cleanup procedures
|
||||||
- Requires I18N facilities to be created first (COMPLEX)
|
- XSS-attempt detection
|
||||||
|
|
||||||
1.6 release
|
2.3 release [Do What I Mean, Not What I Say]
|
||||||
# Add pre-packaged "levels" of cleaning (custom behavior already done)
|
|
||||||
- More fine-grained control over escaping behavior
|
|
||||||
- Silently drop content in between SCRIPT tags (can be generalized to allow
|
|
||||||
specification of elements that, when detected as foreign, trigger removal
|
|
||||||
of children, although unbalanced tags could wreak havoc (or at least
|
|
||||||
delete the rest of the document)).
|
|
||||||
|
|
||||||
1.7 release
|
|
||||||
# Additional support for poorly written HTML
|
# Additional support for poorly written HTML
|
||||||
- Implement all non-essential attribute transforms (BIG!)
|
|
||||||
- Microsoft Word HTML cleaning (i.e. MsoNormal, but research essential!)
|
- Microsoft Word HTML cleaning (i.e. MsoNormal, but research essential!)
|
||||||
- Friendly strict handling of <address> (block -> <br>)
|
- Friendly strict handling of <address> (block -> <br>)
|
||||||
|
|
||||||
2.0 release
|
|
||||||
# Formatters for plaintext (COMPLEX)
|
|
||||||
- Auto-paragraphing (be sure to leverage fact that we know when things
|
|
||||||
shouldn't be paragraphed, such as lists and tables).
|
|
||||||
- Linkify URLs
|
|
||||||
- Smileys
|
|
||||||
- Linkification for HTML Purifier docs: notably configuration and classes
|
|
||||||
|
|
||||||
3.0 release
|
|
||||||
- Extended HTML capabilities based on namespacing and tag transforms (COMPLEX)
|
|
||||||
- Hooks for adding custom processors to custom namespaced tags and
|
|
||||||
attributes, offer default implementation
|
|
||||||
- Lots of documentation and samples
|
|
||||||
- XHTML 1.1 support
|
|
||||||
|
|
||||||
Ongoing
|
|
||||||
- Lots of profiling, make it faster!
|
|
||||||
- Plugins for major CMSes (COMPLEX)
|
|
||||||
- Drupal
|
|
||||||
- WordPress
|
|
||||||
- eFiction
|
|
||||||
- more! (look for ones that use WYSIWYGs)
|
|
||||||
|
|
||||||
Unknown release (on a scratch-an-itch basis)
|
|
||||||
- Fixes for Firefox's inability to handle COL alignment props (Bug 915)
|
|
||||||
- Automatically add non-breaking spaces to empty table cells when
|
|
||||||
empty-cells:show is applied to have compatibility with Internet Explorer
|
|
||||||
- Convert RTL/LTR override characters to <bdo> tags, or vice versa on demand.
|
|
||||||
Also, enable disabling of directionality
|
|
||||||
- Append something to duplicate IDs so they're still usable (impl. note: the
|
|
||||||
dupe detector would also need to detect the suffix as well)
|
|
||||||
- Have 'lang' attribute be checked against official lists
|
|
||||||
|
|
||||||
Encoding workarounds
|
|
||||||
- Non-lossy dumb alternate character encoding transformations, achieved by
|
|
||||||
numerically encoding all non-ASCII characters
|
|
||||||
- Semi-lossy dumb alternate character encoding transformations, achieved by
|
|
||||||
encoding all characters that have string entity equivalents
|
|
||||||
|
|
||||||
Requested
|
|
||||||
- Native content compression, whitespace stripping (don't rely on Tidy, make
|
|
||||||
sure we don't remove from <pre> or related tags)
|
|
||||||
- Win32 Phalanger C# binaries (?)
|
|
||||||
- Remove redundant tags, ex. <u><u>Underlined</u></u>. Implementation notes:
|
- Remove redundant tags, ex. <u><u>Underlined</u></u>. Implementation notes:
|
||||||
1. Analyzing which tags to remove duplicates
|
1. Analyzing which tags to remove duplicates
|
||||||
2. Ensure attributes are merged into the parent tag
|
2. Ensure attributes are merged into the parent tag
|
||||||
3. Extend the tag exclusion system to specify whether or not the
|
3. Extend the tag exclusion system to specify whether or not the
|
||||||
contents should be dropped or not (currently, there's code that could do
|
contents should be dropped or not (currently, there's code that could do
|
||||||
something like this if it didn't drop the inner text too.)
|
something like this if it didn't drop the inner text too.)
|
||||||
- More user-friendly warnings when %HTML.Allow* attempts to specify a
|
- Remove <span> tags that don't do anything (no attributes)
|
||||||
tag or attribute that is not supported
|
- Remove empty inline tags<i></i>
|
||||||
- Allow specifying global attributes on a tag-by-tag basis in
|
- Append something to duplicate IDs so they're still usable (impl. note: the
|
||||||
%HTML.AllowAttributes
|
dupe detector would also need to detect the suffix as well)
|
||||||
- Parse TinyMCE whitelist into our %HTML.Allow* whitelists
|
|
||||||
- XSS-attempt detection
|
2.4 release [It's All About Trust] (floating)
|
||||||
|
# Implement untrusted, dangerous elements/attributes
|
||||||
|
# Implement IDREF support (harder than it seems, since you cannot have
|
||||||
|
IDREFs to non-existent IDs)
|
||||||
|
|
||||||
|
3.0 release [Beyond HTML]
|
||||||
|
# Legit token based CSS parsing (will require revamping almost every
|
||||||
|
AttrDef class)
|
||||||
|
# More control over allowed CSS properties (maybe modularize it in the
|
||||||
|
same fashion!)
|
||||||
|
# Formatters for plaintext
|
||||||
|
- Smileys
|
||||||
|
- Standardize token armor for all areas of processing
|
||||||
|
- Fixes for Firefox's inability to handle COL alignment props (Bug 915)
|
||||||
|
- Automatically add non-breaking spaces to empty table cells when
|
||||||
|
empty-cells:show is applied to have compatibility with Internet Explorer
|
||||||
|
- Convert RTL/LTR override characters to <bdo> tags, or vice versa on demand.
|
||||||
|
Also, enable disabling of directionality
|
||||||
|
|
||||||
|
4.0 release [To XML and Beyond]
|
||||||
|
- Extended HTML capabilities based on namespacing and tag transforms (COMPLEX)
|
||||||
|
- Hooks for adding custom processors to custom namespaced tags and
|
||||||
|
attributes, offer default implementation
|
||||||
|
- Lots of documentation and samples
|
||||||
|
|
||||||
|
Ongoing
|
||||||
|
- Lots of profiling, make it faster!
|
||||||
|
- Plugins for major CMSes (COMPLEX)
|
||||||
|
- phpBB
|
||||||
|
- eFiction
|
||||||
|
- more! (look for ones that use WYSIWYGs)
|
||||||
|
- Complete basic smoketests
|
||||||
|
|
||||||
|
Unknown release (on a scratch-an-itch basis)
|
||||||
|
? Semi-lossy dumb alternate character encoding transformations, achieved by
|
||||||
|
encoding all characters that have string entity equivalents
|
||||||
|
? Have 'lang' attribute be checked against official lists
|
||||||
|
- Abstract ChildDef_BlockQuote to work with all elements that only
|
||||||
|
allow blocks in them, required or optional
|
||||||
|
- Reorganize Unit Tests
|
||||||
|
- Refactor loop tests: Lexer
|
||||||
|
- Reorganize configuration directives (Create more namespaces! Get messy!)
|
||||||
|
- Advanced URI filtering schemes (see docs/proposal-new-directives.txt)
|
||||||
|
- Implement lenient <ruby> child validation
|
||||||
|
- Explain how to use HTML Purifier in non-PHP languages / create
|
||||||
|
a simple command line stub (or complicated?)
|
||||||
|
|
||||||
|
Requested
|
||||||
|
|
||||||
Wontfix
|
Wontfix
|
||||||
- Non-lossy smart alternate character encoding transformations (unless
|
- Non-lossy smart alternate character encoding transformations (unless
|
||||||
patch provided)
|
patch provided)
|
||||||
- Pretty-printing HTML, users can use Tidy on the output on entire page
|
- Pretty-printing HTML, users can use Tidy on the output on entire page
|
||||||
|
- Native content compression, whitespace stripping (don't rely on Tidy, make
|
||||||
|
sure we don't remove from <pre> or related tags): use gzip if this is
|
||||||
|
really important
|
||||||
|
10
WHATSNEW
Normal file
10
WHATSNEW
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
In version 2.1, HTML Purifier's URI validation and filtering handling
|
||||||
|
system has been revamped with a new, extensible URIFilter system. Also
|
||||||
|
notable features include preservation of emoticons in PHP5 with
|
||||||
|
%Core.AggressivelyFixLt, standalone and lite download versions,
|
||||||
|
transforming relative URIs to absolute URIs, Ruby in XHTML 1.1, a Phorum
|
||||||
|
mod, and UTF-8 font names. Notable bug-fixes include refinement of
|
||||||
|
the auto-paragraphing algorithm (no longer experimental), better XHTML
|
||||||
|
1.1 support and the removal of the contents of <style> elements. Version
|
||||||
|
2.1.1 amends a few bugs in some of the newly introduced features, namely
|
||||||
|
running the standalone download version in PHP4 and %URI.MakeAbsolute.
|
3
WYSIWYG
3
WYSIWYG
@@ -16,6 +16,3 @@ trouble. Therein lies the solution:
|
|||||||
HTML Purifier is perfect for filtering pure-HTML input from WYSIWYG editors.
|
HTML Purifier is perfect for filtering pure-HTML input from WYSIWYG editors.
|
||||||
|
|
||||||
Enough said.
|
Enough said.
|
||||||
|
|
||||||
There is a proof-of-concept integration of HTML Purifier with the Mantis
|
|
||||||
bugtracker at http://hp.jpsband.org/mantis/
|
|
||||||
|
BIN
art/1000passes.png
Normal file
BIN
art/1000passes.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 3.4 KiB |
BIN
art/100cases.png
Normal file
BIN
art/100cases.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 2.7 KiB |
1
benchmarks/.htaccess
Normal file
1
benchmarks/.htaccess
Normal file
@@ -0,0 +1 @@
|
|||||||
|
Deny from all
|
@@ -7,6 +7,7 @@ set_include_path(get_include_path() . PATH_SEPARATOR . '../library/');
|
|||||||
|
|
||||||
require_once 'HTMLPurifier/ConfigSchema.php';
|
require_once 'HTMLPurifier/ConfigSchema.php';
|
||||||
require_once 'HTMLPurifier/Config.php';
|
require_once 'HTMLPurifier/Config.php';
|
||||||
|
require_once 'HTMLPurifier/Context.php';
|
||||||
|
|
||||||
$LEXERS = array();
|
$LEXERS = array();
|
||||||
$RUNS = isset($GLOBALS['HTMLPurifierTest']['Runs'])
|
$RUNS = isset($GLOBALS['HTMLPurifierTest']['Runs'])
|
||||||
@@ -93,11 +94,14 @@ function print_lexers() {
|
|||||||
function do_benchmark($name, $document) {
|
function do_benchmark($name, $document) {
|
||||||
global $LEXERS, $RUNS;
|
global $LEXERS, $RUNS;
|
||||||
|
|
||||||
|
$config = HTMLPurifier_Config::createDefault();
|
||||||
|
$context = new HTMLPurifier_Context();
|
||||||
|
|
||||||
$timer = new RowTimer($name);
|
$timer = new RowTimer($name);
|
||||||
$timer->start();
|
$timer->start();
|
||||||
|
|
||||||
foreach($LEXERS as $key => $lexer) {
|
foreach($LEXERS as $key => $lexer) {
|
||||||
for ($i=0; $i<$RUNS; $i++) $tokens = $lexer->tokenizeHTML($document);
|
for ($i=0; $i<$RUNS; $i++) $tokens = $lexer->tokenizeHTML($document, $config, $context);
|
||||||
$timer->setMarker($key);
|
$timer->setMarker($key);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -161,4 +165,4 @@ echo '<div>Random input was: ' .
|
|||||||
?>
|
?>
|
||||||
|
|
||||||
|
|
||||||
</body></html>
|
</body></html>
|
||||||
|
@@ -5,12 +5,13 @@ set_include_path(get_include_path() . PATH_SEPARATOR . '../library/');
|
|||||||
require_once 'HTMLPurifier/ConfigSchema.php';
|
require_once 'HTMLPurifier/ConfigSchema.php';
|
||||||
require_once 'HTMLPurifier/Config.php';
|
require_once 'HTMLPurifier/Config.php';
|
||||||
require_once 'HTMLPurifier/Lexer/DirectLex.php';
|
require_once 'HTMLPurifier/Lexer/DirectLex.php';
|
||||||
|
require_once 'HTMLPurifier/Context.php';
|
||||||
|
|
||||||
$input = file_get_contents('samples/Lexer/4.html');
|
$input = file_get_contents('samples/Lexer/4.html');
|
||||||
$lexer = new HTMLPurifier_Lexer_DirectLex();
|
$lexer = new HTMLPurifier_Lexer_DirectLex();
|
||||||
|
$config = HTMLPurifier_Config::createDefault();
|
||||||
|
$context = new HTMLPurifier_Context();
|
||||||
|
|
||||||
for ($i = 0; $i < 10; $i++) {
|
for ($i = 0; $i < 10; $i++) {
|
||||||
$tokens = $lexer->tokenizeHTML($input);
|
$tokens = $lexer->tokenizeHTML($input, $config, $context);
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
12
benchmarks/Trace.php
Normal file
12
benchmarks/Trace.php
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
ini_set('xdebug.trace_format', 1);
|
||||||
|
ini_set('xdebug.show_mem_delta', true);
|
||||||
|
|
||||||
|
xdebug_start_trace(dirname(__FILE__) . '/Trace');
|
||||||
|
require_once '../library/HTMLPurifier.auto.php';
|
||||||
|
|
||||||
|
$purifier = new HTMLPurifier();
|
||||||
|
|
||||||
|
$data = $purifier->purify(file_get_contents('samples/Lexer/4.html'));
|
||||||
|
xdebug_stop_trace();
|
@@ -2,217 +2,44 @@
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Generates XML and HTML documents describing configuration.
|
* Generates XML and HTML documents describing configuration.
|
||||||
|
* @note PHP 5 only!
|
||||||
*/
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
TODO:
|
TODO:
|
||||||
- make XML format richer (see below)
|
- make XML format richer (see XMLSerializer_ConfigSchema)
|
||||||
- extend XSLT transformation (see the corresponding XSLT file)
|
- extend XSLT transformation (see the corresponding XSLT file)
|
||||||
- allow generation of packaged docs that can be easily moved
|
- allow generation of packaged docs that can be easily moved
|
||||||
- multipage documentation
|
- multipage documentation
|
||||||
- determine how to multilingualize
|
- determine how to multilingualize
|
||||||
- factor out code into classes
|
- add blurbs to ToC
|
||||||
*/
|
*/
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Check and configure environment
|
|
||||||
|
|
||||||
if (version_compare('5', PHP_VERSION, '>')) exit('Requires PHP 5 or higher.');
|
if (version_compare('5', PHP_VERSION, '>')) exit('Requires PHP 5 or higher.');
|
||||||
error_reporting(E_ALL);
|
error_reporting(E_ALL); // probably not possible to use E_STRICT
|
||||||
|
|
||||||
|
define('HTMLPURIFIER_SCHEMA_STRICT', true); // description data needs to be collected
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
// load dual-libraries
|
||||||
// Include HTML Purifier library
|
require_once '../library/HTMLPurifier.auto.php';
|
||||||
|
require_once 'library/ConfigDoc.auto.php';
|
||||||
|
|
||||||
set_include_path('../library' . PATH_SEPARATOR . get_include_path());
|
$purifier = HTMLPurifier::getInstance(array(
|
||||||
require_once 'HTMLPurifier.php';
|
'AutoFormat.PurifierLinkify' => true
|
||||||
|
));
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Setup convenience functions
|
|
||||||
|
|
||||||
function appendHTMLDiv($document, $node, $html) {
|
|
||||||
global $purifier;
|
|
||||||
$html = $purifier->purify($html);
|
|
||||||
$dom_html = $document->createDocumentFragment();
|
|
||||||
$dom_html->appendXML($html);
|
|
||||||
|
|
||||||
$dom_div = $document->createElement('div');
|
|
||||||
$dom_div->setAttribute('xmlns', 'http://www.w3.org/1999/xhtml');
|
|
||||||
$dom_div->appendChild($dom_html);
|
|
||||||
|
|
||||||
$node->appendChild($dom_div);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Load copies of HTMLPurifier_ConfigDef and HTMLPurifier
|
|
||||||
|
|
||||||
$schema = HTMLPurifier_ConfigSchema::instance();
|
$schema = HTMLPurifier_ConfigSchema::instance();
|
||||||
$purifier = new HTMLPurifier();
|
$style = 'plain'; // use $_GET in the future
|
||||||
|
$configdoc = new ConfigDoc();
|
||||||
|
$output = $configdoc->generate($schema, $style);
|
||||||
|
|
||||||
|
// write out
|
||||||
// ---------------------------------------------------------------------------
|
file_put_contents("$style.html", $output);
|
||||||
// Generate types.xml, a document describing the constraint "type"
|
|
||||||
|
|
||||||
$types_document = new DOMDocument('1.0', 'UTF-8');
|
|
||||||
$types_root = $types_document->createElement('types');
|
|
||||||
$types_document->appendChild($types_root);
|
|
||||||
$types_document->formatOutput = true;
|
|
||||||
foreach ($schema->types as $name => $expanded_name) {
|
|
||||||
$types_type = $types_document->createElement('type', $expanded_name);
|
|
||||||
$types_type->setAttribute('id', $name);
|
|
||||||
$types_root->appendChild($types_type);
|
|
||||||
}
|
|
||||||
$types_document->save('types.xml');
|
|
||||||
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Generate configdoc.xml, a document documenting configuration directives
|
|
||||||
|
|
||||||
$dom_document = new DOMDocument('1.0', 'UTF-8');
|
|
||||||
$dom_root = $dom_document->createElement('configdoc');
|
|
||||||
$dom_document->appendChild($dom_root);
|
|
||||||
$dom_document->formatOutput = true;
|
|
||||||
|
|
||||||
// add the name of the application
|
|
||||||
$dom_root->appendChild($dom_document->createElement('title', 'HTML Purifier'));
|
|
||||||
|
|
||||||
/*
|
|
||||||
TODO for XML format:
|
|
||||||
- create a definition (DTD or other) once interface stabilizes
|
|
||||||
*/
|
|
||||||
|
|
||||||
foreach($schema->info as $namespace_name => $namespace_info) {
|
|
||||||
|
|
||||||
$dom_namespace = $dom_document->createElement('namespace');
|
|
||||||
$dom_root->appendChild($dom_namespace);
|
|
||||||
|
|
||||||
$dom_namespace->setAttribute('id', $namespace_name);
|
|
||||||
$dom_namespace->appendChild(
|
|
||||||
$dom_document->createElement('name', $namespace_name)
|
|
||||||
);
|
|
||||||
$dom_namespace_description = $dom_document->createElement('description');
|
|
||||||
$dom_namespace->appendChild($dom_namespace_description);
|
|
||||||
appendHTMLDiv($dom_document, $dom_namespace_description,
|
|
||||||
$schema->info_namespace[$namespace_name]->description);
|
|
||||||
|
|
||||||
foreach ($namespace_info as $name => $info) {
|
|
||||||
|
|
||||||
$dom_directive = $dom_document->createElement('directive');
|
|
||||||
$dom_namespace->appendChild($dom_directive);
|
|
||||||
|
|
||||||
$dom_directive->setAttribute('id', $namespace_name . '.' . $name);
|
|
||||||
$dom_directive->appendChild(
|
|
||||||
$dom_document->createElement('name', $name)
|
|
||||||
);
|
|
||||||
|
|
||||||
$dom_constraints = $dom_document->createElement('constraints');
|
|
||||||
$dom_directive->appendChild($dom_constraints);
|
|
||||||
|
|
||||||
$dom_type = $dom_document->createElement('type', $info->type);
|
|
||||||
if ($info->allow_null) {
|
|
||||||
$dom_type->setAttribute('allow-null', 'yes');
|
|
||||||
}
|
|
||||||
$dom_constraints->appendChild($dom_type);
|
|
||||||
|
|
||||||
if ($info->allowed !== true) {
|
|
||||||
$dom_allowed = $dom_document->createElement('allowed');
|
|
||||||
$dom_constraints->appendChild($dom_allowed);
|
|
||||||
foreach ($info->allowed as $allowed => $bool) {
|
|
||||||
$dom_allowed->appendChild(
|
|
||||||
$dom_document->createElement('value', $allowed)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
$raw_default = $schema->defaults[$namespace_name][$name];
|
|
||||||
if (is_bool($raw_default)) {
|
|
||||||
$default = $raw_default ? 'true' : 'false';
|
|
||||||
} elseif (is_string($raw_default)) {
|
|
||||||
$default = "\"$raw_default\"";
|
|
||||||
} elseif (is_null($raw_default)) {
|
|
||||||
$default = 'null';
|
|
||||||
} else {
|
|
||||||
$default = print_r(
|
|
||||||
$schema->defaults[$namespace_name][$name], true
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
$dom_default = $dom_document->createElement('default', $default);
|
|
||||||
|
|
||||||
// remove this once we get a DTD
|
|
||||||
$dom_default->setAttribute('xml:space', 'preserve');
|
|
||||||
|
|
||||||
$dom_constraints->appendChild($dom_default);
|
|
||||||
|
|
||||||
$dom_descriptions = $dom_document->createElement('descriptions');
|
|
||||||
$dom_directive->appendChild($dom_descriptions);
|
|
||||||
|
|
||||||
foreach ($info->descriptions as $file => $file_descriptions) {
|
|
||||||
foreach ($file_descriptions as $line => $description) {
|
|
||||||
$dom_description = $dom_document->createElement('description');
|
|
||||||
$dom_description->setAttribute('file', $file);
|
|
||||||
$dom_description->setAttribute('line', $line);
|
|
||||||
appendHTMLDiv($dom_document, $dom_description, $description);
|
|
||||||
$dom_descriptions->appendChild($dom_description);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
// print_r($dom_document->saveXML());
|
|
||||||
|
|
||||||
// save a copy of the raw XML
|
|
||||||
$dom_document->save('configdoc.xml');
|
|
||||||
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Generate final output using XSLT
|
|
||||||
|
|
||||||
// load the stylesheet
|
|
||||||
$xsl_stylesheet_name = 'plain';
|
|
||||||
$xsl_stylesheet = "styles/$xsl_stylesheet_name.xsl";
|
|
||||||
$xsl_dom_stylesheet = new DOMDocument();
|
|
||||||
$xsl_dom_stylesheet->load($xsl_stylesheet);
|
|
||||||
|
|
||||||
// setup the XSLT processor
|
|
||||||
$xsl_processor = new XSLTProcessor();
|
|
||||||
|
|
||||||
// perform the transformation
|
|
||||||
$xsl_processor->importStylesheet($xsl_dom_stylesheet);
|
|
||||||
$html_output = $xsl_processor->transformToXML($dom_document);
|
|
||||||
|
|
||||||
// some slight fudges to preserve backwards compatibility
|
|
||||||
$html_output = str_replace('/>', ' />', $html_output); // <br /> not <br>
|
|
||||||
$html_output = str_replace(' xmlns=""', '', $html_output); // rm unnecessary xmlns
|
|
||||||
|
|
||||||
if (class_exists('Tidy')) {
|
|
||||||
// cleanup output
|
|
||||||
$config = array(
|
|
||||||
'indent' => true,
|
|
||||||
'output-xhtml' => true,
|
|
||||||
'wrap' => 80
|
|
||||||
);
|
|
||||||
$tidy = new Tidy;
|
|
||||||
$tidy->parseString($html_output, $config, 'utf8');
|
|
||||||
$tidy->cleanRepair();
|
|
||||||
$html_output = (string) $tidy;
|
|
||||||
}
|
|
||||||
|
|
||||||
// write it to a file (todo: parse into separate pages)
|
|
||||||
file_put_contents("$xsl_stylesheet_name.html", $html_output);
|
|
||||||
|
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
|
|
||||||
// Output for instant feedback
|
|
||||||
|
|
||||||
if (php_sapi_name() != 'cli') {
|
if (php_sapi_name() != 'cli') {
|
||||||
echo $html_output;
|
// output = instant feedback
|
||||||
|
echo $output;
|
||||||
} else {
|
} else {
|
||||||
echo 'Files generated successfully.';
|
echo 'Files generated successfully.';
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
9
configdoc/library/ConfigDoc.auto.php
Normal file
9
configdoc/library/ConfigDoc.auto.php
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
/**
|
||||||
|
* This is a stub include that automatically configures the include path.
|
||||||
|
*/
|
||||||
|
|
||||||
|
set_include_path(dirname(__FILE__) . PATH_SEPARATOR . get_include_path() );
|
||||||
|
require_once 'ConfigDoc.php';
|
||||||
|
|
38
configdoc/library/ConfigDoc.php
Normal file
38
configdoc/library/ConfigDoc.php
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
require_once 'ConfigDoc/HTMLXSLTProcessor.php';
|
||||||
|
require_once 'ConfigDoc/XMLSerializer/Types.php';
|
||||||
|
require_once 'ConfigDoc/XMLSerializer/ConfigSchema.php';
|
||||||
|
|
||||||
|
class ConfigDoc
|
||||||
|
{
|
||||||
|
|
||||||
|
function generate($schema, $xsl_stylesheet_name = 'plain', $parameters = array()) {
|
||||||
|
// generate types document, describing type constraints
|
||||||
|
$types_serializer = new ConfigDoc_XMLSerializer_Types();
|
||||||
|
$types_document = $types_serializer->serialize($schema);
|
||||||
|
$types_document->save(dirname(__FILE__) . '/../types.xml'); // only ONE
|
||||||
|
|
||||||
|
// generate configdoc.xml, documents configuration directives
|
||||||
|
$schema_serializer = new ConfigDoc_XMLSerializer_ConfigSchema();
|
||||||
|
$schema_document = $schema_serializer->serialize($schema);
|
||||||
|
$schema_document->save('configdoc.xml');
|
||||||
|
|
||||||
|
// setup transformation
|
||||||
|
$xsl_stylesheet = dirname(__FILE__) . "/../styles/$xsl_stylesheet_name.xsl";
|
||||||
|
$xslt_processor = new ConfigDoc_HTMLXSLTProcessor();
|
||||||
|
$xslt_processor->setParameters($parameters);
|
||||||
|
$xslt_processor->importStylesheet($xsl_stylesheet);
|
||||||
|
|
||||||
|
return $xslt_processor->transformToHTML($schema_document);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Remove any generated files
|
||||||
|
*/
|
||||||
|
function cleanup() {
|
||||||
|
unlink('configdoc.xml');
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
62
configdoc/library/ConfigDoc/HTMLXSLTProcessor.php
Normal file
62
configdoc/library/ConfigDoc/HTMLXSLTProcessor.php
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
/**
 * XSLT processor wrapper geared towards producing HTML documents. It
 * delegates to an internal XSLTProcessor instead of extending it, so only
 * the methods declared here are available.
 */
class ConfigDoc_HTMLXSLTProcessor
{
    
    /**
     * Wrapped XSLTProcessor instance that performs the transformations
     */
    protected $xsltProcessor;
    
    public function __construct() {
        $this->xsltProcessor = new XSLTProcessor();
    }
    
    /**
     * Registers the stylesheet that later transformations will apply
     * @param $xsl XSLT DOM tree, or filename of the XSL transformation
     * @return Whatever XSLTProcessor::importStylesheet() returns
     */
    public function importStylesheet($xsl) {
        $stylesheet = $xsl;
        if (is_string($stylesheet)) {
            // a filename was passed: load it into a DOM tree first
            $stylesheet = new DOMDocument();
            $stylesheet->load($xsl);
        }
        return $this->xsltProcessor->importStylesheet($stylesheet);
    }
    
    /**
     * Runs the imported stylesheet over an XML document and post-processes
     * the result into HTML-friendly markup
     * @param $xml XML DOM tree
     * @return Transformed markup as a string
     */
    public function transformToHTML($xml) {
        $markup = $this->xsltProcessor->transformToXML($xml);
        
        // fudges for HTML backwards compatibility; the pairs are applied
        // in order, exactly like three consecutive str_replace() calls
        $markup = str_replace(
            array(
                '/>',                                     // <br /> not <br/>
                ' xmlns=""',                              // rm unnecessary xmlns
                ' xmlns="http://www.w3.org/1999/xhtml"',  // rm unnecessary xmlns
            ),
            array(' />', '', ''),
            $markup
        );
        
        // without the Tidy extension the markup is returned as-is
        if (!class_exists('Tidy')) {
            return $markup;
        }
        
        // cleanup output
        $tidy_config = array(
            'indent' => true,
            'output-xhtml' => true,
            'wrap' => 80
        );
        $tidy = new Tidy;
        $tidy->parseString($markup, $tidy_config, 'utf8');
        $tidy->cleanRepair();
        return (string) $tidy;
    }
    
    /**
     * Sets several stylesheet parameters at once, all in the default
     * (empty) namespace
     * @param $options Associative array of parameter name to value
     */
    public function setParameters($options) {
        foreach ($options as $param_name => $param_value) {
            $this->xsltProcessor->setParameter('', $param_name, $param_value);
        }
    }
    
}
|
||||||
|
|
25
configdoc/library/ConfigDoc/XMLSerializer.php
Normal file
25
configdoc/library/ConfigDoc/XMLSerializer.php
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
/**
 * Base class of the XMLSerializer hierarchy, which consists of classes
 * that take objects and serialize them into XML (specifically DOM) form.
 * This super-class holds convenience functions shared by those classes.
 */
class ConfigDoc_XMLSerializer
{
    
    /**
     * Purifies an HTML snippet and appends it to a node, wrapped in a
     * div that carries the XHTML namespace attribute
     * @param $document DOMDocument used to create the new nodes
     * @param $node Node the div is appended to
     * @param $html HTML string; passed through HTMLPurifier before being
     *              parsed as an XML fragment
     */
    protected function appendHTMLDiv($document, $node, $html) {
        $clean_html = HTMLPurifier::getInstance()->purify($html);
        
        $wrapper = $document->createElement('div');
        $wrapper->setAttribute('xmlns', 'http://www.w3.org/1999/xhtml');
        
        // parse the purified markup into a fragment and move it into the div
        $fragment = $document->createDocumentFragment();
        $fragment->appendXML($clean_html);
        $wrapper->appendChild($fragment);
        
        $node->appendChild($wrapper);
    }
    
}
|
||||||
|
|
123
configdoc/library/ConfigDoc/XMLSerializer/ConfigSchema.php
Normal file
123
configdoc/library/ConfigDoc/XMLSerializer/ConfigSchema.php
Normal file
@@ -0,0 +1,123 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
require_once 'ConfigDoc/XMLSerializer.php';
|
||||||
|
|
||||||
|
class ConfigDoc_XMLSerializer_ConfigSchema extends ConfigDoc_XMLSerializer
{
    
    /**
     * Serializes a schema into DOM form
     * @todo Split into sub-serializers
     * @param $schema HTMLPurifier_ConfigSchema to serialize
     * @return DOMDocument whose root element is <configdoc>
     */
    public function serialize($schema) {
        $dom_document = new DOMDocument('1.0', 'UTF-8');
        $dom_root = $dom_document->createElement('configdoc');
        $dom_document->appendChild($dom_root);
        $dom_document->formatOutput = true;
        
        // add the name of the application
        $dom_root->appendChild(
            $this->createTextElement($dom_document, 'title', 'HTML Purifier')
        );
        
        /*
        TODO for XML format:
        - create a definition (DTD or other) once interface stabilizes
        */
        
        foreach ($schema->info as $namespace_name => $namespace_info) {
            
            $dom_namespace = $dom_document->createElement('namespace');
            $dom_root->appendChild($dom_namespace);
            
            $dom_namespace->setAttribute('id', $namespace_name);
            $dom_namespace->appendChild(
                $this->createTextElement($dom_document, 'name', $namespace_name)
            );
            $dom_namespace_description = $dom_document->createElement('description');
            $dom_namespace->appendChild($dom_namespace_description);
            $this->appendHTMLDiv($dom_document, $dom_namespace_description,
                $schema->info_namespace[$namespace_name]->description);
            
            foreach ($namespace_info as $name => $info) {
                
                // alias entries get no <directive> of their own; a
                // directive's aliases come from its directiveAliases list
                if ($info->class == 'alias') continue;
                
                $dom_directive = $dom_document->createElement('directive');
                $dom_namespace->appendChild($dom_directive);
                
                $dom_directive->setAttribute('id', $namespace_name . '.' . $name);
                $dom_directive->appendChild(
                    $this->createTextElement($dom_document, 'name', $name)
                );
                
                $dom_aliases = $dom_document->createElement('aliases');
                $dom_directive->appendChild($dom_aliases);
                foreach ($info->directiveAliases as $alias) {
                    $dom_aliases->appendChild(
                        $this->createTextElement($dom_document, 'alias', $alias)
                    );
                }
                
                $dom_constraints = $dom_document->createElement('constraints');
                $dom_directive->appendChild($dom_constraints);
                
                $dom_type = $this->createTextElement($dom_document, 'type', $info->type);
                if ($info->allow_null) {
                    $dom_type->setAttribute('allow-null', 'yes');
                }
                $dom_constraints->appendChild($dom_type);
                
                // $info->allowed is either true (no restriction) or a
                // lookup array whose keys are the permitted values
                if ($info->allowed !== true) {
                    $dom_allowed = $dom_document->createElement('allowed');
                    $dom_constraints->appendChild($dom_allowed);
                    foreach ($info->allowed as $allowed => $bool) {
                        $dom_allowed->appendChild(
                            $this->createTextElement($dom_document, 'value', $allowed)
                        );
                    }
                }
                
                // render the default value as human-readable text
                $raw_default = $schema->defaults[$namespace_name][$name];
                if (is_bool($raw_default)) {
                    $default = $raw_default ? 'true' : 'false';
                } elseif (is_string($raw_default)) {
                    $default = "\"$raw_default\"";
                } elseif (is_null($raw_default)) {
                    $default = 'null';
                } else {
                    $default = print_r($raw_default, true);
                }
                
                $dom_default = $this->createTextElement($dom_document, 'default', $default);
                
                // remove this once we get a DTD
                $dom_default->setAttribute('xml:space', 'preserve');
                
                $dom_constraints->appendChild($dom_default);
                
                $dom_descriptions = $dom_document->createElement('descriptions');
                $dom_directive->appendChild($dom_descriptions);
                
                foreach ($info->descriptions as $file => $file_descriptions) {
                    foreach ($file_descriptions as $line => $description) {
                        $dom_description = $dom_document->createElement('description');
                        // refuse to write $file if it's a full path
                        if (str_replace('\\', '/', realpath($file)) != $file) {
                            $dom_description->setAttribute('file', $file);
                            $dom_description->setAttribute('line', $line);
                        }
                        $this->appendHTMLDiv($dom_document, $dom_description, $description);
                        $dom_descriptions->appendChild($dom_description);
                    }
                }
                
            }
            
        }
        
        return $dom_document;
        
    }
    
    /**
     * Creates an element whose content is a literal text node
     *
     * The value argument of DOMDocument::createElement() is not escaped
     * for ampersands, so text such as "a & b" would be read as the start
     * of an entity reference. Going through createTextNode() stores the
     * text verbatim and lets the serializer escape it.
     *
     * @param $document DOMDocument used to create the nodes
     * @param $name Tag name of the element to create
     * @param $text Text content; cast to string, and left out entirely
     *              when it is the empty string
     * @return DOMElement that has not yet been attached to the tree
     */
    private function createTextElement($document, $name, $text) {
        $element = $document->createElement($name);
        $text = (string) $text;
        if ($text !== '') {
            $element->appendChild($document->createTextNode($text));
        }
        return $element;
    }
    
}
|
||||||
|
|
26
configdoc/library/ConfigDoc/XMLSerializer/Types.php
Normal file
26
configdoc/library/ConfigDoc/XMLSerializer/Types.php
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
require_once 'ConfigDoc/XMLSerializer.php';
|
||||||
|
|
||||||
|
class ConfigDoc_XMLSerializer_Types extends ConfigDoc_XMLSerializer
{
    
    /**
     * Serializes the types in a schema into DOM form
     * @param $schema HTMLPurifier_ConfigSchema owner of types to serialize
     * @return DOMDocument with a <types> root holding one <type> element
     *         per entry of $schema->types; the array key becomes the id
     *         attribute and the array value becomes the element text
     */
    public function serialize($schema) {
        $types_document = new DOMDocument('1.0', 'UTF-8');
        $types_root = $types_document->createElement('types');
        $types_document->appendChild($types_root);
        $types_document->formatOutput = true;
        foreach ($schema->types as $name => $expanded_name) {
            $types_type = $types_document->createElement('type');
            // Add the text through a text node: the value argument of
            // createElement() does not escape ampersands, so a name such
            // as "A & B" would be parsed as a broken entity reference.
            $expanded_name = (string) $expanded_name;
            if ($expanded_name !== '') {
                $types_type->appendChild(
                    $types_document->createTextNode($expanded_name)
                );
            }
            $types_type->setAttribute('id', $name);
            $types_root->appendChild($types_type);
        }
        return $types_document;
    }
    
}
|
||||||
|
|
@@ -1,3 +1,6 @@
|
|||||||
|
|
||||||
|
body {margin:1em 4em;}
|
||||||
|
|
||||||
table {border-collapse:collapse;}
|
table {border-collapse:collapse;}
|
||||||
table td, table th {padding:0.2em;}
|
table td, table th {padding:0.2em;}
|
||||||
|
|
||||||
@@ -8,3 +11,14 @@ table.constraints td pre {margin:0;}
|
|||||||
|
|
||||||
#toc {list-style-type:none; font-weight:bold;}
|
#toc {list-style-type:none; font-weight:bold;}
|
||||||
#toc ul {list-style-type:disc; font-weight:normal;}
|
#toc ul {list-style-type:disc; font-weight:normal;}
|
||||||
|
|
||||||
|
.description p {margin-top:0;margin-bottom:1em;}
|
||||||
|
|
||||||
|
#library, h1 {text-align:center; font-family:Garamond, serif;
|
||||||
|
font-variant:small-caps;}
|
||||||
|
#library {font-size:1em;}
|
||||||
|
h1 {margin-top:0;}
|
||||||
|
h2 {border-bottom:1px solid #CCC; font-family:sans-serif; font-weight:normal;
|
||||||
|
font-size:1.3em;}
|
||||||
|
h3 {font-family:sans-serif; font-size:1.1em; font-weight:bold; }
|
||||||
|
h4 {font-family:sans-serif; font-size:0.9em; font-weight:bold; }
|
||||||
|
@@ -12,18 +12,21 @@
|
|||||||
indent = "no"
|
indent = "no"
|
||||||
media-type = "text/html"
|
media-type = "text/html"
|
||||||
/>
|
/>
|
||||||
|
<xsl:param name="css" select="'styles/plain.css'"/>
|
||||||
|
<xsl:param name="title" select="'Configuration Documentation'"/>
|
||||||
|
|
||||||
<xsl:variable name="typeLookup" select="document('../types.xml')" />
|
<xsl:variable name="typeLookup" select="document('../types.xml')" />
|
||||||
|
|
||||||
<xsl:template match="/">
|
<xsl:template match="/">
|
||||||
<html lang="en" xml:lang="en">
|
<html lang="en" xml:lang="en">
|
||||||
<head>
|
<head>
|
||||||
<title><xsl:value-of select="/configdoc/title" /> Configuration Documentation</title>
|
<title><xsl:value-of select="$title" /> - <xsl:value-of select="/configdoc/title" /></title>
|
||||||
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
|
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
|
||||||
<link rel="stylesheet" type="text/css" href="styles/plain.css" />
|
<link rel="stylesheet" type="text/css" href="{$css}" />
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<h1><xsl:value-of select="/configdoc/title" /> Configuration Documentation</h1>
|
<div id="library"><xsl:value-of select="/configdoc/title" /></div>
|
||||||
|
<h1><xsl:value-of select="$title" /></h1>
|
||||||
<h2>Table of Contents</h2>
|
<h2>Table of Contents</h2>
|
||||||
<ul id="toc">
|
<ul id="toc">
|
||||||
<xsl:apply-templates mode="toc" />
|
<xsl:apply-templates mode="toc" />
|
||||||
@@ -69,23 +72,45 @@
|
|||||||
<xsl:apply-templates />
|
<xsl:apply-templates />
|
||||||
</xsl:template>
|
</xsl:template>
|
||||||
<xsl:template match="directive/name">
|
<xsl:template match="directive/name">
|
||||||
|
<xsl:apply-templates select="../aliases/alias" mode="anchor" />
|
||||||
<h3 id="{../@id}"><xsl:value-of select="../@id" /></h3>
|
<h3 id="{../@id}"><xsl:value-of select="../@id" /></h3>
|
||||||
</xsl:template>
|
</xsl:template>
|
||||||
|
<xsl:template match="alias" mode="anchor">
|
||||||
|
<a id="{.}"></a>
|
||||||
|
</xsl:template>
|
||||||
|
|
||||||
|
<!-- Do not pass through -->
|
||||||
|
<xsl:template match="alias"></xsl:template>
|
||||||
|
|
||||||
<xsl:template match="directive/constraints">
|
<xsl:template match="directive/constraints">
|
||||||
<table class="constraints">
|
<table class="constraints">
|
||||||
<xsl:apply-templates />
|
<xsl:apply-templates />
|
||||||
<!-- Calculated other values -->
|
<!-- Calculated other values -->
|
||||||
<tr>
|
<xsl:if test="../descriptions/description[@file]">
|
||||||
<th>Used by:</th>
|
<tr>
|
||||||
<td>
|
<th>Used by:</th>
|
||||||
<xsl:for-each select="../descriptions/description">
|
<td>
|
||||||
<xsl:if test="position()>1">, </xsl:if>
|
<xsl:for-each select="../descriptions/description">
|
||||||
<xsl:value-of select="@file" />
|
<xsl:if test="position()>1">, </xsl:if>
|
||||||
</xsl:for-each>
|
<xsl:value-of select="@file" />
|
||||||
</td>
|
</xsl:for-each>
|
||||||
</tr>
|
</td>
|
||||||
|
</tr>
|
||||||
|
</xsl:if>
|
||||||
|
<xsl:if test="../aliases/alias">
|
||||||
|
<xsl:apply-templates select="../aliases" mode="constraints" />
|
||||||
|
</xsl:if>
|
||||||
</table>
|
</table>
|
||||||
</xsl:template>
|
</xsl:template>
|
||||||
|
<xsl:template match="directive/aliases" mode="constraints">
|
||||||
|
<th>Aliases:</th>
|
||||||
|
<td>
|
||||||
|
<xsl:for-each select="alias">
|
||||||
|
<xsl:if test="position()>1">, </xsl:if>
|
||||||
|
<xsl:value-of select="." />
|
||||||
|
</xsl:for-each>
|
||||||
|
</td>
|
||||||
|
</xsl:template>
|
||||||
<xsl:template match="directive//description">
|
<xsl:template match="directive//description">
|
||||||
<div class="description">
|
<div class="description">
|
||||||
<xsl:copy-of select="div/node()" />
|
<xsl:copy-of select="div/node()" />
|
||||||
@@ -123,4 +148,4 @@
|
|||||||
</tr>
|
</tr>
|
||||||
</xsl:template>
|
</xsl:template>
|
||||||
|
|
||||||
</xsl:stylesheet>
|
</xsl:stylesheet>
|
||||||
|
213
docs/dev-advanced-api.html
Normal file
213
docs/dev-advanced-api.html
Normal file
@@ -0,0 +1,213 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
||||||
|
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||||
|
<meta name="description" content="Functional specification for HTML Purifier's advanced API for defining custom filtering behavior." />
|
||||||
|
<link rel="stylesheet" type="text/css" href="style.css" />
|
||||||
|
|
||||||
|
<title>Advanced API - HTML Purifier</title>
|
||||||
|
|
||||||
|
</head><body>
|
||||||
|
|
||||||
|
<h1>Advanced API</h1>
|
||||||
|
|
||||||
|
<div id="filing">Filed under Development</div>
|
||||||
|
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||||
|
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||||
|
|
||||||
|
<p>HTML Purifier currently natively supports only a subset of HTML's
|
||||||
|
allowed elements, attributes, and behavior; specifically, this subset
|
||||||
|
is the set of elements that are safe for untrusted users to use.
|
||||||
|
However, HTML Purifier is often utilized to ensure standards-compliance
|
||||||
|
from input that is trusted (making it a sort of Tidy substitute),
|
||||||
|
and often users need to define new elements or attributes. The
|
||||||
|
advanced API is oriented specifically for these use-cases.</p>
|
||||||
|
|
||||||
|
<p>Our goals are to let the user:</p>
|
||||||
|
|
||||||
|
<dl>
|
||||||
|
<dt>Select</dt>
|
||||||
|
<dd><ul>
|
||||||
|
<li>Doctype</li>
|
||||||
|
<!-- <li>Filterset</li> -->
|
||||||
|
<li>Elements / Attributes / Modules</li>
|
||||||
|
<li>Tidy</li>
|
||||||
|
</ul></dd>
|
||||||
|
<dt>Customize</dt>
|
||||||
|
<dd><ul>
|
||||||
|
<li>Attributes</li>
|
||||||
|
<li>Elements</li>
|
||||||
|
<!--<li>Doctypes</li>-->
|
||||||
|
</ul></dd>
|
||||||
|
</dl>
|
||||||
|
|
||||||
|
<h2>Select</h2>
|
||||||
|
|
||||||
|
<p>For basic use, the user will have to specify some basic parameters. This
|
||||||
|
is not strictly necessary, as HTML Purifier's default setting will always
|
||||||
|
output safe code, but is required for standards-compliant output.</p>
|
||||||
|
|
||||||
|
<h3>Selecting a Doctype</h3>
|
||||||
|
|
||||||
|
<p>The first thing to select is the <strong>doctype</strong>. This
|
||||||
|
is essential for standards-compliant output.</p>
|
||||||
|
|
||||||
|
<p class="technical">This identifier is based
|
||||||
|
on the name the W3C has given to the document type and <em>not</em>
|
||||||
|
the DTD identifier.</p>
|
||||||
|
|
||||||
|
<p>This parameter is set via the configuration object:</p>
|
||||||
|
|
||||||
|
<pre>$config->set('HTML', 'Doctype', 'XHTML 1.0 Transitional');</pre>
|
||||||
|
|
||||||
|
<p>Due to historical reasons, the default doctype is XHTML 1.0
|
||||||
|
Transitional; however, we really shouldn't be guessing what the user's
|
||||||
|
doctype is. Fortunately, people who can't be bothered to set this won't
|
||||||
|
be bothered when their pages stop validating.</p>
|
||||||
|
|
||||||
|
<h3>Selecting Elements / Attributes / Modules</h3>
|
||||||
|
|
||||||
|
<p>HTML Purifier will, by default, allow as many elements and attributes
|
||||||
|
as possible. However, a user may decide to roll their own filterset by
|
||||||
|
selecting modules, elements and attributes to allow for their own
|
||||||
|
specific use-case. This can be done using %HTML.Allowed:</p>
|
||||||
|
|
||||||
|
<pre>$config->set('HTML', 'Allowed', 'a[href|title],em,p,blockquote');</pre>
|
||||||
|
|
||||||
|
<p class="technical">The directive %HTML.Allowed is a convenience feature
|
||||||
|
that may be fully expressed with the legacy interface.</p>
|
||||||
|
|
||||||
|
<p>We currently support another interface from older versions:</p>
|
||||||
|
|
||||||
|
<pre>$config->set('HTML', 'AllowedElements', 'a,em,p,blockquote');
|
||||||
|
$config->set('HTML', 'AllowedAttributes', 'a.href,a.title');</pre>
|
||||||
|
|
||||||
|
<p>A user may also choose to allow modules using a specialized
|
||||||
|
directive:</p>
|
||||||
|
|
||||||
|
<pre>$config->set('HTML', 'AllowedModules', 'Hypertext,Text,Lists');</pre>
|
||||||
|
|
||||||
|
<p>But it is not expected that this feature will be widely used.</p>
|
||||||
|
|
||||||
|
<p class="technical">Module selection will work slightly differently
|
||||||
|
from the other AllowedElements and AllowedAttributes directives by
|
||||||
|
directly modifying the doctype you are operating in, in the spirit of
|
||||||
|
XHTML 1.1's modularization. We stop users from shooting themselves in the
|
||||||
|
foot by mandating the modules in %HTML.CoreModules be used.</p>
|
||||||
|
|
||||||
|
<p class="technical">Modules are distinguished from regular elements by the
|
||||||
|
case of their first letter. While XML distinguishes between and allows
|
||||||
|
lower and uppercase letters in element names, XHTML uses only lower-case
|
||||||
|
element names for sake of consistency.</p>
|
||||||
|
|
||||||
|
<h3>Selecting Tidy</h3>
|
||||||
|
|
||||||
|
<p>The name of this segment of functionality is inspired by Dave
|
||||||
|
Raggett's program HTML Tidy, which purported to help clean up HTML. In
|
||||||
|
HTML Purifier, Tidy functionality involves turning unsupported and
|
||||||
|
deprecated elements into standards-compliant ones, maintaining
|
||||||
|
backwards compatibility, and enforcing best practices.</p>
|
||||||
|
|
||||||
|
<p>This is a complicated feature, and is explained more in depth at
|
||||||
|
<a href="enduser-tidy.html">the Tidy documentation page</a>.</p>
|
||||||
|
|
||||||
|
<!--
|
||||||
|
<h3>Unified selector</h3>
|
||||||
|
|
||||||
|
<p>Because selecting each and every one of these configuration options
|
||||||
|
is a chore, we may wish to offer a specialized configuration method
|
||||||
|
for selecting a filterset. Possibility:</p>
|
||||||
|
|
||||||
|
<pre>function selectFilter($doctype, $filterset, $tidy)</pre>
|
||||||
|
|
||||||
|
<p>...which is simply a light wrapper over the individual configuration
|
||||||
|
calls. A custom config file format or text format could also be adopted.</p>
|
||||||
|
-->
|
||||||
|
|
||||||
|
<h2>Customize</h2>
|
||||||
|
|
||||||
|
<p>By reviewing topic posts in the support forum, we determined that
|
||||||
|
there were two primarily demanded customization features people wanted:
|
||||||
|
to add an attribute to an existing element, and to add an element.
|
||||||
|
Thus, we'll want to create convenience functions for these common
|
||||||
|
use-cases.</p>
|
||||||
|
|
||||||
|
<p>Note that the functions described here are only available if
|
||||||
|
a raw copy of <code>HTMLPurifier_HTMLDefinition</code> was retrieved.
|
||||||
|
Furthermore, caching may prevent your changes from immediately
|
||||||
|
being seen: consult <a href="enduser-customize.html">enduser-customize.html</a> on how
|
||||||
|
to work around this.</p>
|
||||||
|
|
||||||
|
<h3>Attributes</h3>
|
||||||
|
|
||||||
|
<p>An attribute is bound to an element by a name and has a specific
|
||||||
|
<code>AttrDef</code> that validates it. The interface is therefore:</p>
|
||||||
|
|
||||||
|
<pre>function addAttribute($element, $attribute, $attribute_def);</pre>
|
||||||
|
|
||||||
|
<p>Example of the functionality in action:</p>
|
||||||
|
|
||||||
|
<pre>$def->addAttribute('a', 'rel', 'Enum#nofollow');</pre>
|
||||||
|
|
||||||
|
<p>The <code>$attribute_def</code> value is flexible,
|
||||||
|
to make things simpler. It can be a literal object or:</p>
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<!--<li>Class name: We'll instantiate it for you</li>
|
||||||
|
<li>Function name: We'll create an <code>HTMLPurifier_AttrDef_Anonymous</code>
|
||||||
|
class with that function registered as a callback.</li>-->
|
||||||
|
<li>String attribute type: We'll use <code>HTMLPurifier_AttrTypes</code>
|
||||||
|
to resolve it for you. Any data that follows a hash mark (#) will
|
||||||
|
be used to customize the attribute type: in the example above,
|
||||||
|
we specify which values for Enum to allow.</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<h3>Elements</h3>
|
||||||
|
|
||||||
|
<p>An element requires certain information as specified by
|
||||||
|
<code>HTMLPurifier_ElementDef</code>. However, not all of it is necessary;
|
||||||
|
the usual things required are:</p>
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<li>Attributes</li>
|
||||||
|
<li>Content model/type</li>
|
||||||
|
<li>Registration in a content set</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<p>This suggests an API like this:</p>
|
||||||
|
|
||||||
|
<pre>function addElement($element, $type, $contents,
|
||||||
|
$attr_collections = array(), $attributes = array());</pre>
|
||||||
|
|
||||||
|
<p>Each parameter explained in depth:</p>
|
||||||
|
|
||||||
|
<dl>
|
||||||
|
<dt><code>$element</code></dt>
|
||||||
|
<dd>Element name, ex. 'label'</dd>
|
||||||
|
<dt><code>$type</code></dt>
|
||||||
|
<dd>Content set to register in, ex. 'Inline' or 'Flow'</dd>
|
||||||
|
<dt><code>$contents</code></dt>
|
||||||
|
<dd>Description of allowed children. This is a merged form of
|
||||||
|
<code>HTMLPurifier_ElementDef</code>'s member variables
|
||||||
|
<code>$content_model</code> and <code>$content_model_type</code>,
|
||||||
|
where the form is <q>Type: Model</q>, ex. 'Optional: Inline'.
|
||||||
|
There are also a number of predefined templates one may use.</dd>
|
||||||
|
<dt><code>$attr_collections</code></dt>
|
||||||
|
<dd>Array (or string if only one) of attribute collection(s) to
|
||||||
|
merge into the attributes array.</dd>
|
||||||
|
<dt><code>$attributes</code></dt>
|
||||||
|
<dd>Array of attribute names to attribute definitions, much like
|
||||||
|
the above-described attribute customization.</dd>
|
||||||
|
</dl>
|
||||||
|
|
||||||
|
<p>A possible usage:</p>
|
||||||
|
|
||||||
|
<pre>$def->addElement('font', 'Inline', 'Optional: Inline', 'Common',
|
||||||
|
array('color' => 'Color'));</pre>
|
||||||
|
|
||||||
|
<p>See <code>HTMLPurifier/HTMLModule.php</code> for details.</p>
|
||||||
|
|
||||||
|
<div id="version">$Id$</div>
|
||||||
|
|
||||||
|
</body></html>
|
@@ -1,32 +1,17 @@
|
|||||||
<?xml version="1.0" encoding="UTF-8"?>
|
|
||||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
|
||||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
|
||||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
|
|
||||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
|
||||||
<meta name="description" content="Discusses code quality issues and places that need to be refactored in HTML Purifier." />
|
|
||||||
<link rel="stylesheet" type="text/css" href="./style.css" />
|
|
||||||
|
|
||||||
<title>Code Quality Issues - HTML Purifier</title>
|
Code Quality Issues
|
||||||
|
|
||||||
</head><body>
|
Okay, face it. Programmers can get lazy, cut corners, or make mistakes. They
|
||||||
|
|
||||||
<h1>Code Quality Issues</h1>
|
|
||||||
|
|
||||||
<div id="filing">Filed under Development</div>
|
|
||||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
|
||||||
|
|
||||||
<p>Okay, face it. Programmers can get lazy, cut corners, or make mistakes. They
|
|
||||||
also can do quick prototypes, and then forget to rewrite them later. Well,
|
also can do quick prototypes, and then forget to rewrite them later. Well,
|
||||||
while I can't list mistakes in here, I can list prototype-like segments
|
while I can't list mistakes in here, I can list prototype-like segments
|
||||||
of code that should be aggressively refactored. This does not list
|
of code that should be aggressively refactored. This does not list
|
||||||
optimization issues, that needs to be done after intense profiling.</p>
|
optimization issues, that needs to be done after intense profiling.
|
||||||
|
|
||||||
<pre>
|
|
||||||
docs/examples/demo.php - ad hoc HTML/PHP soup to the extreme
|
docs/examples/demo.php - ad hoc HTML/PHP soup to the extreme
|
||||||
|
|
||||||
AttrDef
|
AttrDef - a lot of duplication, more generic classes need to be created;
|
||||||
Class - doesn't support Unicode characters (fringe); uses regular
|
a lot of strtolower() calls, no legit casing
|
||||||
expressions
|
Class - doesn't support Unicode characters (fringe); uses regular expressions
|
||||||
Lang - code duplication; premature optimization
|
Lang - code duplication; premature optimization
|
||||||
Length - easily mistaken for CSSLength
|
Length - easily mistaken for CSSLength
|
||||||
URI - multiple regular expressions; missing validation for parts (?)
|
URI - multiple regular expressions; missing validation for parts (?)
|
||||||
@@ -36,16 +21,8 @@ ConfigSchema - redefinition is a mess
|
|||||||
Strategy
|
Strategy
|
||||||
FixNesting - cannot bubble nodes out of structures, duplicated checks
|
FixNesting - cannot bubble nodes out of structures, duplicated checks
|
||||||
for special-case parent node
|
for special-case parent node
|
||||||
MakeWellFormed - insufficient automatic closing definitions (check HTML
|
|
||||||
spec for optional end tags, also, closing based on type (block/inline)
|
|
||||||
might be efficient).
|
|
||||||
RemoveForeignElements - should be run in parallel with MakeWellFormed
|
RemoveForeignElements - should be run in parallel with MakeWellFormed
|
||||||
URIScheme - needs to have callable generic checks
|
URIScheme - needs to have callable generic checks
|
||||||
mailto - doesn't validate emails, doesn't validate querystring
|
mailto - doesn't validate emails, doesn't validate querystring
|
||||||
news - doesn't validate opaque path
|
news - doesn't validate opaque path
|
||||||
nntp - doesn't constrain path
|
nntp - doesn't constrain path
|
||||||
</pre>
|
|
||||||
|
|
||||||
<div id="version">$Id$</div>
|
|
||||||
|
|
||||||
</body></html>
|
|
@@ -14,6 +14,7 @@
|
|||||||
|
|
||||||
<div id="filing">Filed under Development</div>
|
<div id="filing">Filed under Development</div>
|
||||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||||
|
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||||
|
|
||||||
<p>The classes in this library follow a few naming conventions, which may
|
<p>The classes in this library follow a few naming conventions, which may
|
||||||
help you find the correct functionality more quickly. Here they are:</p>
|
help you find the correct functionality more quickly. Here they are:</p>
|
||||||
@@ -78,4 +79,4 @@ help you find the correct functionality more quickly. Here they are:</p>
|
|||||||
|
|
||||||
<div id="version">$Id$</div>
|
<div id="version">$Id$</div>
|
||||||
|
|
||||||
</body></html>
|
</body></html>
|
||||||
|
@@ -14,6 +14,7 @@
|
|||||||
|
|
||||||
<div id="filing">Filed under Development</div>
|
<div id="filing">Filed under Development</div>
|
||||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||||
|
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||||
|
|
||||||
<p>Here are some possible optimization techniques we can apply to code sections if
|
<p>Here are some possible optimization techniques we can apply to code sections if
|
||||||
they turn out to be slow. Be sure not to prematurely optimize: if you get
|
they turn out to be slow. Be sure not to prematurely optimize: if you get
|
||||||
@@ -29,4 +30,4 @@ that itch, put it here!</p>
|
|||||||
|
|
||||||
<div id="version">$Id$</div>
|
<div id="version">$Id$</div>
|
||||||
|
|
||||||
</body></html>
|
</body></html>
|
||||||
|
@@ -32,6 +32,7 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
|
|||||||
|
|
||||||
<div id="filing">Filed under Development</div>
|
<div id="filing">Filed under Development</div>
|
||||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||||
|
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||||
|
|
||||||
<h2>Key</h2>
|
<h2>Key</h2>
|
||||||
|
|
||||||
@@ -59,7 +60,7 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
|
|||||||
<tbody>
|
<tbody>
|
||||||
<tr><th colspan="2">Standard</th></tr>
|
<tr><th colspan="2">Standard</th></tr>
|
||||||
<tr class="css1 impl-yes"><td>background-color</td><td>COMPOSITE(<color>, transparent)</td></tr>
|
<tr class="css1 impl-yes"><td>background-color</td><td>COMPOSITE(<color>, transparent)</td></tr>
|
||||||
<tr class="css1 impl-yes"><td>background</td><td>SHORTHAND, only for color, see below for info on background-image and friends</td></tr>
|
<tr class="css1 impl-yes"><td>background</td><td>SHORTHAND, currently alias for background-color</td></tr>
|
||||||
<tr class="css1 impl-yes"><td>border</td><td>SHORTHAND, MULTIPLE</td></tr>
|
<tr class="css1 impl-yes"><td>border</td><td>SHORTHAND, MULTIPLE</td></tr>
|
||||||
<tr class="css1 impl-yes"><td>border-color</td><td>MULTIPLE</td></tr>
|
<tr class="css1 impl-yes"><td>border-color</td><td>MULTIPLE</td></tr>
|
||||||
<tr class="css1 impl-yes"><td>border-style</td><td>MULTIPLE</td></tr>
|
<tr class="css1 impl-yes"><td>border-style</td><td>MULTIPLE</td></tr>
|
||||||
@@ -141,17 +142,17 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
|
|||||||
|
|
||||||
<tbody>
|
<tbody>
|
||||||
<tr><th colspan="2">Unknown</th></tr>
|
<tr><th colspan="2">Unknown</th></tr>
|
||||||
<tr class="danger css1"><td>background-image</td><td>Dangerous, target milestone 1.3</td></tr>
|
<tr class="danger css1 impl-yes"><td>background-image</td><td>Dangerous</td></tr>
|
||||||
<tr class="css1"><td>background-attachment</td><td>ENUM(scroll, fixed),
|
<tr class="css1 impl-yes"><td>background-attachment</td><td>ENUM(scroll, fixed),
|
||||||
Depends on background-image</td></tr>
|
Depends on background-image</td></tr>
|
||||||
<tr class="css1"><td>background-position</td><td>Depends on background-image</td></tr>
|
<tr class="css1 impl-yes"><td>background-position</td><td>Depends on background-image</td></tr>
|
||||||
<tr class="danger impl-no"><td>cursor</td><td>Dangerous but fluffy</td></tr>
|
<tr class="danger impl-no"><td>cursor</td><td>Dangerous but fluffy</td></tr>
|
||||||
<tr class="danger css1"><td>display</td><td>ENUM(...), Dangerous but interesting;
|
<tr class="danger css1"><td>display</td><td>ENUM(...), Dangerous but interesting;
|
||||||
will not implement list-item, run-in (Opera only) or table (no IE);
|
will not implement list-item, run-in (Opera only) or table (no IE);
|
||||||
inline-block has incomplete IE6 support and requires -moz-inline-box
|
inline-block has incomplete IE6 support and requires -moz-inline-box
|
||||||
for Mozilla. Unknown target milestone.</td></tr>
|
for Mozilla. Unknown target milestone.</td></tr>
|
||||||
<tr><td class="css1">height</td><td>Interesting, why use it? Unknown target milestone.</td></tr>
|
<tr class="css1 impl-yes"><td>height</td><td>Interesting, why use it? Unknown target milestone.</td></tr>
|
||||||
<tr class="danger css1"><td>list-style-image</td><td>Dangerous? Target milestone 1.3</td></tr>
|
<tr class="danger css1 impl-yes"><td>list-style-image</td><td>Dangerous?</td></tr>
|
||||||
<tr class="impl-no"><td>max-height</td><td rowspan="4">No IE 5/6</td></tr>
|
<tr class="impl-no"><td>max-height</td><td rowspan="4">No IE 5/6</td></tr>
|
||||||
<tr class="impl-no"><td>min-height</td></tr>
|
<tr class="impl-no"><td>min-height</td></tr>
|
||||||
<tr class="impl-no"><td>max-width</td></tr>
|
<tr class="impl-no"><td>max-width</td></tr>
|
||||||
@@ -167,9 +168,9 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
|
|||||||
<tr class="impl-no"><td>quotes</td><td>May be dropped from CSS2, fairly useless for inline context</td></tr>
|
<tr class="impl-no"><td>quotes</td><td>May be dropped from CSS2, fairly useless for inline context</td></tr>
|
||||||
<tr class="impl-no"><td>visibility</td><td>ENUM(visible, hidden, collapse),
|
<tr class="impl-no"><td>visibility</td><td>ENUM(visible, hidden, collapse),
|
||||||
Dangerous</td></tr>
|
Dangerous</td></tr>
|
||||||
<tr class="css1 feature"><td>white-space</td><td>ENUM(normal, pre, nowrap, pre-wrap,
|
<tr class="css1 feature impl-partial"><td>white-space</td><td>ENUM(normal, pre, nowrap, pre-wrap,
|
||||||
pre-line), Spotty implementation:
|
pre-line), Spotty implementation:
|
||||||
pre (no IE 5/6), nowrap (no IE 5),
|
pre (no IE 5/6), <em>nowrap</em> (no IE 5, supported),
|
||||||
pre-wrap (only Opera), pre-line (no support). Fixable? Unknown target milestone.</td></tr>
|
pre-wrap (only Opera), pre-line (no support). Fixable? Unknown target milestone.</td></tr>
|
||||||
</tbody>
|
</tbody>
|
||||||
|
|
||||||
@@ -230,21 +231,21 @@ Mozilla on inside and needs -moz-outline, no IE support.</td></tr>
|
|||||||
|
|
||||||
<tbody>
|
<tbody>
|
||||||
<tr><th colspan="3">CSS</th></tr>
|
<tr><th colspan="3">CSS</th></tr>
|
||||||
<tr class="impl-yes"><td>style</td><td>All</td><td>Not all properties may be implemented, parser is good though.</td></tr>
|
<tr class="impl-yes"><td>style</td><td>All</td><td>Parser is reasonably functional. Status here doesn't count individual properties.</td></tr>
|
||||||
</tbody>
|
</tbody>
|
||||||
|
|
||||||
<tbody>
|
<tbody>
|
||||||
<tr><th colspan="3">Questionable</th></tr>
|
<tr><th colspan="3">Questionable</th></tr>
|
||||||
<tr class="impl-no"><td>accesskey</td><td>A</td><td>May interfere with main interface</td></tr>
|
<tr class="impl-no"><td>accesskey</td><td>A</td><td>May interfere with main interface</td></tr>
|
||||||
<tr class="impl-no"><td>tabindex</td><td>A</td><td>May interfere with main interface</td></tr>
|
<tr class="impl-no"><td>tabindex</td><td>A</td><td>May interfere with main interface</td></tr>
|
||||||
<tr><td>target</td><td>A</td><td>Config enabled, only useful for frame layouts, disallowed in strict</td></tr>
|
<tr class="impl-yes"><td>target</td><td>A</td><td>Config enabled, only useful for frame layouts, disallowed in strict</td></tr>
|
||||||
</tbody>
|
</tbody>
|
||||||
|
|
||||||
<tbody>
|
<tbody>
|
||||||
<tr><th colspan="3">Miscellaneous</th></tr>
|
<tr><th colspan="3">Miscellaneous</th></tr>
|
||||||
<tr><td>datetime</td><td>DEL, INS</td><td>No visible effect, ISO format</td></tr>
|
<tr><td>datetime</td><td>DEL, INS</td><td>No visible effect, ISO format</td></tr>
|
||||||
<tr><td>rel</td><td>A</td><td>Largely user-defined: nofollow, tag (see microformats)</td></tr>
|
<tr class="impl-yes"><td>rel</td><td>A</td><td>Largely user-defined: nofollow, tag (see microformats)</td></tr>
|
||||||
<tr><td>rev</td><td>A</td><td>Largely user-defined: vote-*</td></tr>
|
<tr class="impl-yes"><td>rev</td><td>A</td><td>Largely user-defined: vote-*</td></tr>
|
||||||
<tr class="feature"><td>axis</td><td>TD, TH</td><td>W3C only: No browser implementation</td></tr>
|
<tr class="feature"><td>axis</td><td>TD, TH</td><td>W3C only: No browser implementation</td></tr>
|
||||||
<tr class="feature"><td>char</td><td>COL, COLGROUP, TBODY, TD, TFOOT, TH, THEAD, TR</td><td>W3C only: No browser implementation</td></tr>
|
<tr class="feature"><td>char</td><td>COL, COLGROUP, TBODY, TD, TFOOT, TH, THEAD, TR</td><td>W3C only: No browser implementation</td></tr>
|
||||||
<tr class="feature"><td>headers</td><td>TD, TH</td><td>W3C only: No browser implementation</td></tr>
|
<tr class="feature"><td>headers</td><td>TD, TH</td><td>W3C only: No browser implementation</td></tr>
|
||||||
@@ -261,41 +262,41 @@ Mozilla on inside and needs -moz-outline, no IE support.</td></tr>
|
|||||||
</tbody>
|
</tbody>
|
||||||
|
|
||||||
<tbody>
|
<tbody>
|
||||||
<tr><th colspan="3">Transform, target milestone 1.4</th></tr>
|
<tr><th colspan="3">Transform</th></tr>
|
||||||
<tr><td rowspan="5">align</td><td>CAPTION</td><td>Near-equiv style 'caption-side', drop left and right</td></tr>
|
<tr class="impl-yes"><td rowspan="5">align</td><td>CAPTION</td><td>'caption-side' for top/bottom, 'text-align' for left/right</td></tr>
|
||||||
<tr><td>IMG</td><td rowspan="2">Margin-left and margin-right = auto or parent div</td></tr>
|
<tr class="impl-yes"><td>IMG</td><td rowspan="3">See specimens/html-align-to-css.html</td></tr>
|
||||||
<tr><td>TABLE</td></tr>
|
<tr class="impl-yes"><td>TABLE</td></tr>
|
||||||
<tr><td>HR</td><td>Equivalent style 'text-align' (IE tested)</td></tr>
|
<tr class="impl-yes"><td>HR</td></tr>
|
||||||
<tr class="impl-yes"><td>H1, H2, H3, H4, H5, H6, P</td><td>Equivalent style 'text-align'</td></tr>
|
<tr class="impl-yes"><td>H1, H2, H3, H4, H5, H6, P</td><td>Equivalent style 'text-align'</td></tr>
|
||||||
<tr class="required impl-yes"><td>alt</td><td>IMG</td><td>Required, insert image filename if src is present or default invalid image text</td></tr>
|
<tr class="required impl-yes"><td>alt</td><td>IMG</td><td>Required, insert image filename if src is present or default invalid image text</td></tr>
|
||||||
<tr><td rowspan="3">bgcolor</td><td>TABLE</td><td>Equivalent style 'background-color' (IE tested)</td></tr>
|
<tr class="impl-yes"><td rowspan="3">bgcolor</td><td>TABLE</td><td>Superset style 'background-color'</td></tr>
|
||||||
<tr><td>TR</td><td>Equivalent style 'background-color' (IE tested)</td></tr>
|
<tr class="impl-yes"><td>TR</td><td>Superset style 'background-color'</td></tr>
|
||||||
<tr><td>TD, TH</td><td>Equivalent style 'background-color'</td></tr>
|
<tr class="impl-yes"><td>TD, TH</td><td>Superset style 'background-color'</td></tr>
|
||||||
<tr><td>border</td><td>IMG</td><td>Equivalent style 'border-width', only applies when link present</td></tr>
|
<tr class="impl-yes"><td>border</td><td>IMG</td><td>Equivalent style <code>border:[number]px solid</code></td></tr>
|
||||||
<tr><td>clear</td><td>BR</td><td>Near-equiv style 'clear', transform 'all' into 'both'</td></tr>
|
<tr class="impl-yes"><td>clear</td><td>BR</td><td>Near-equiv style 'clear', transform 'all' into 'both'</td></tr>
|
||||||
<tr class="impl-no"><td>compact</td><td>DL, OL, UL</td><td>Boolean, needs custom CSS class; rarely used anyway</td></tr>
|
<tr class="impl-no"><td>compact</td><td>DL, OL, UL</td><td>Boolean, needs custom CSS class; rarely used anyway</td></tr>
|
||||||
<tr class="required impl-yes"><td>dir</td><td>BDO</td><td>Required, insert ltr (or configuration value) if none</td></tr>
|
<tr class="required impl-yes"><td>dir</td><td>BDO</td><td>Required, insert ltr (or configuration value) if none</td></tr>
|
||||||
<tr><td>height</td><td>TD, TH</td><td>Near-equiv style 'height', needs px suffix if original was in pixels</td></tr>
|
<tr class="impl-yes"><td>height</td><td>TD, TH</td><td>Near-equiv style 'height', needs px suffix if original was in pixels</td></tr>
|
||||||
<tr><td>hspace</td><td>IMG</td><td>Near-equiv styles 'margin-top' and 'margin-bottom', needs px suffix</td></tr>
|
<tr class="impl-yes"><td>hspace</td><td>IMG</td><td>Near-equiv styles 'margin-left' and 'margin-right', needs px suffix</td></tr>
|
||||||
<tr class="impl-yes"><td>lang</td><td>*</td><td>Copy value to xml:lang</td></tr>
|
<tr class="impl-yes"><td>lang</td><td>*</td><td>Copy value to xml:lang</td></tr>
|
||||||
<tr><td rowspan="2">name</td><td>IMG</td><td>Turn into ID</td></tr>
|
<tr class="impl-yes"><td rowspan="2">name</td><td>IMG</td><td>Turn into ID</td></tr>
|
||||||
<tr><td>A</td><td>Turn into ID? (not deprecated, though in which specs?)</td></tr>
|
<tr class="impl-yes"><td>A</td><td>Turn into ID</td></tr>
|
||||||
<tr><td>noshade</td><td>HR</td><td>Boolean, style 'border-style:solid;'</td></tr>
|
<tr class="impl-yes"><td>noshade</td><td>HR</td><td>Boolean, style 'border-style:solid;'</td></tr>
|
||||||
<tr><td>nowrap</td><td>TD, TH</td><td>Boolean, style 'white-space:nowrap;' (not compat with IE5)</td></tr>
|
<tr class="impl-yes"><td>nowrap</td><td>TD, TH</td><td>Boolean, style 'white-space:nowrap;' (not compat with IE5)</td></tr>
|
||||||
<tr><td>size</td><td>HR</td><td>Near-equiv 'width', needs px suffix if original was pixels</td></tr>
|
<tr class="impl-yes"><td>size</td><td>HR</td><td>Near-equiv 'height', needs px suffix if original was pixels</td></tr>
|
||||||
<tr class="required impl-yes"><td>src</td><td>IMG</td><td>Required, insert blank or default img if not set</td></tr>
|
<tr class="required impl-yes"><td>src</td><td>IMG</td><td>Required, insert blank or default img if not set</td></tr>
|
||||||
<tr class="impl-yes"><td>start</td><td>OL</td><td>Poorly supported 'counter-reset', allowed in loose, dropped in strict</td></tr>
|
<tr class="impl-yes"><td>start</td><td>OL</td><td>Poorly supported 'counter-reset', allowed in loose, dropped in strict</td></tr>
|
||||||
<tr><td rowspan="3">type</td><td>LI</td><td rowspan="3">Equivalent style 'list-style-type', different allowed values though. (needs testing)</td></tr>
|
<tr class="impl-yes"><td rowspan="3">type</td><td>LI</td><td rowspan="3">Equivalent style 'list-style-type', different allowed values though. (needs testing)</td></tr>
|
||||||
<tr><td>OL</td></tr>
|
<tr class="impl-yes"><td>OL</td></tr>
|
||||||
<tr><td>UL</td></tr>
|
<tr class="impl-yes"><td>UL</td></tr>
|
||||||
<tr class="impl-yes"><td>value</td><td>LI</td><td>Poorly supported 'counter-reset', allowed in loose, dropped in strict</td></tr>
|
<tr class="impl-yes"><td>value</td><td>LI</td><td>Poorly supported 'counter-reset', allowed in loose, dropped in strict</td></tr>
|
||||||
<tr><td>vspace</td><td>IMG</td><td>Near-equiv styles 'margin-left' and 'margin-right', needs px suffix, see hspace</td></tr>
|
<tr class="impl-yes"><td>vspace</td><td>IMG</td><td>Near-equiv styles 'margin-top' and 'margin-bottom', needs px suffix, see hspace</td></tr>
|
||||||
<tr><td rowspan="2">width</td><td>HR</td><td rowspan="2">Near-equiv style 'width', needs px suffix if original was pixels</td></tr>
|
<tr class="impl-yes"><td rowspan="2">width</td><td>HR</td><td rowspan="2">Near-equiv style 'width', needs px suffix if original was pixels</td></tr>
|
||||||
<tr><td>TD, TH</td></tr>
|
<tr class="impl-yes"><td>TD, TH</td></tr>
|
||||||
</tbody>
|
</tbody>
|
||||||
|
|
||||||
</table>
|
</table>
|
||||||
|
|
||||||
<div id="version">$Id$</div>
|
<div id="version">$Id$</div>
|
||||||
|
|
||||||
</body></html>
|
</body></html>
|
||||||
|
786
docs/enduser-customize.html
Normal file
786
docs/enduser-customize.html
Normal file
@@ -0,0 +1,786 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
||||||
|
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||||
|
<meta name="description" content="Tutorial for customizing HTML Purifier's tag and attribute sets." />
|
||||||
|
<link rel="stylesheet" type="text/css" href="style.css" />
|
||||||
|
|
||||||
|
<title>Customize - HTML Purifier</title>
|
||||||
|
|
||||||
|
</head><body>
|
||||||
|
|
||||||
|
<h1 class="subtitled">Customize!</h1>
|
||||||
|
<div class="subtitle">HTML Purifier is a Swiss-Army Knife</div>
|
||||||
|
|
||||||
|
<div id="filing">Filed under End-User</div>
|
||||||
|
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||||
|
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
You may have heard of the <a href="dev-advanced-api.html">Advanced API</a>.
|
||||||
|
If you're interested in reading dry prose and boring functional
|
||||||
|
specifications, feel free to click that link to get a no-nonsense overview
|
||||||
|
on the Advanced API. For the rest of us, there's this tutorial. By the time
|
||||||
|
you're finished reading this, you should have a pretty good idea on
|
||||||
|
how to implement custom tags and attributes that HTML Purifier may not have.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h2>Is it necessary?</h2>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Before we even write any code, it is paramount to consider whether or
|
||||||
|
not the code we're writing is necessary or not. HTML Purifier, by default,
|
||||||
|
contains a large set of elements and attributes: large enough so that
|
||||||
|
<em>any</em> element or attribute in XHTML 1.0 (and its HTML variant)
|
||||||
|
that can be safely used by the general public is implemented.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
So what needs to be implemented? (Feel free to skip this section if
|
||||||
|
you know what you want).
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h3>XHTML 1.0</h3>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
All of the modules listed below are based off of the
|
||||||
|
<a href="http://www.w3.org/TR/2001/REC-xhtml-modularization-20010410/abstract_modules.html#sec_5.2.">modularization of
|
||||||
|
XHTML</a>, which, while technically for XHTML 1.1, is quite a useful
|
||||||
|
resource.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<li>Structure</li>
|
||||||
|
<li>Frames</li>
|
||||||
|
<li>Applets (deprecated)</li>
|
||||||
|
<li>Forms</li>
|
||||||
|
<li>Image maps</li>
|
||||||
|
<li>Objects</li>
|
||||||
|
<li>Frames</li>
|
||||||
|
<li>Events</li>
|
||||||
|
<li>Meta-information</li>
|
||||||
|
<li>Style sheets</li>
|
||||||
|
<li>Link (not hypertext)</li>
|
||||||
|
<li>Base</li>
|
||||||
|
<li>Name</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
If you don't recognize it, you probably don't need it. But the curious
|
||||||
|
can look all of these modules up in the above-mentioned document. Note
|
||||||
|
that inline scripting comes packaged with HTML Purifier (more on this
|
||||||
|
later).
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h3>XHTML 1.1</h3>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
We have not implemented the
|
||||||
|
<a href="http://www.w3.org/TR/2001/REC-ruby-20010531/">Ruby module</a>,
|
||||||
|
which defines a set of tags
|
||||||
|
for publishing short annotations for text, used mostly in Japanese
|
||||||
|
and Chinese school texts.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h3>XHTML 2.0</h3>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
<a href="http://www.w3.org/TR/xhtml2/">XHTML 2.0</a> is still a
|
||||||
|
working draft, so any elements introduced in the
|
||||||
|
specification have not been implemented and will not be implemented
|
||||||
|
until we get a recommendation or proposal. Because XHTML 2.0 is
|
||||||
|
an entirely new markup language, implementing rules for it will be
|
||||||
|
no easy task.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h3>HTML 5</h3>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
<a href="http://www.whatwg.org/specs/web-apps/current-work/">HTML 5</a>
|
||||||
|
is a fork of HTML 4.01 by WHATWG, who believed that XHTML 2.0 was headed
|
||||||
|
in the wrong direction. It too is a working draft, and may change
|
||||||
|
drastically before publication, but it should be noted that the
|
||||||
|
<code>canvas</code> tag has been implemented by many browser vendors.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h3>Proprietary</h3>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
There are a number of proprietary tags still in the wild. Many of them
|
||||||
|
have been documented in <a href="ref-proprietary-tags.txt">ref-proprietary-tags.txt</a>,
|
||||||
|
but there is currently no implementation for any of them.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h3>Extensions</h3>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
There are also a number of other XML languages out there that can
|
||||||
|
be embedded in HTML documents: two of the most popular are MathML and
|
||||||
|
SVG, and I frequently get requests to implement these. But they are
|
||||||
|
expansive, comprehensive specifications, and it would take far too long
|
||||||
|
to implement them <em>correctly</em> (most systems I've seen go as far
|
||||||
|
as whitelisting tags and no further; come on, what about nesting!)
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Word of warning: HTML Purifier is currently <em>not</em> namespace
|
||||||
|
aware.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h2>Giving back</h2>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
As you may imagine from the details above (don't be abashed if you didn't
|
||||||
|
read it all: a glance over would have done), there's quite a bit that
|
||||||
|
HTML Purifier doesn't implement. Recent architectural changes have
|
||||||
|
allowed HTML Purifier to implement elements and attributes that are not
|
||||||
|
safe! Don't worry, they won't be activated unless you set %HTML.Trusted
|
||||||
|
to true, but they certainly help out users who need to put, say, forms
|
||||||
|
on their page and don't want to go through the trouble of reading this
|
||||||
|
and implementing it themselves.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
So any of the above that you implement for your own application could
|
||||||
|
help out some other poor sap on the other side of the globe. Help us
|
||||||
|
out, and send back code so that it can be hammered into a module and
|
||||||
|
released with the core. Any code would be greatly appreciated!
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h2>And now...</h2>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Enough philosophical talk, time for some code:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<pre>$config = HTMLPurifier_Config::createDefault();
|
||||||
|
$config->set('HTML', 'DefinitionID', 'enduser-customize.html tutorial');
|
||||||
|
$config->set('HTML', 'DefinitionRev', 1);
|
||||||
|
$def =& $config->getHTMLDefinition(true);</pre>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Assuming that HTML Purifier has already been properly loaded (hint:
|
||||||
|
include <code>HTMLPurifier.auto.php</code>), this code will set up
|
||||||
|
the environment that you need to start customizing the HTML definition.
|
||||||
|
What's going on?
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<li>
|
||||||
|
The first three lines are regular configuration code:
|
||||||
|
<ul>
|
||||||
|
<li>
|
||||||
|
%HTML.DefinitionID is set to a unique identifier for your
|
||||||
|
custom HTML definition. This prevents it from clobbering
|
||||||
|
other custom definitions on the same installation.
|
||||||
|
</li>
|
||||||
|
<li>
|
||||||
|
%HTML.DefinitionRev is a revision integer of your HTML
|
||||||
|
definition. Because HTML definitions are cached, you'll need
|
||||||
|
to increment this whenever you make a change in order to flush
|
||||||
|
the cache.
|
||||||
|
</li>
|
||||||
|
</ul>
|
||||||
|
</li>
|
||||||
|
<li>
|
||||||
|
The fourth line retrieves a raw <code>HTMLPurifier_HTMLDefinition</code>
|
||||||
|
object that we will be tweaking. If the parameter was removed, we
|
||||||
|
would be retrieving a fully formed definition object, which is somewhat
|
||||||
|
useless for customization purposes.
|
||||||
|
</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<h3>Broken backwards-compatibility</h3>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
If you have already been twiddling around with the raw
|
||||||
|
HTML definition object, you'll be noticing that you're getting an error
|
||||||
|
when you attempt to retrieve the raw definition object without specifying
|
||||||
|
a DefinitionID. It is vital to caching (see below) that you make a unique
|
||||||
|
name for your customized definition, so make up something right now and
|
||||||
|
things will operate again.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h2>Turn off caching</h2>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
To make development easier, we're going to temporarily turn off
|
||||||
|
definition caching:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<pre>$config = HTMLPurifier_Config::createDefault();
|
||||||
|
$config->set('HTML', 'DefinitionID', 'enduser-customize.html tutorial');
|
||||||
|
$config->set('HTML', 'DefinitionRev', 1);
|
||||||
|
<strong>$config->set('Core', 'DefinitionCache', null); // remove this later!</strong>
|
||||||
|
$def =& $config->getHTMLDefinition(true);</pre>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
A few things should be mentioned about the caching mechanism before
|
||||||
|
we move on. For performance reasons, HTML Purifier caches generated
|
||||||
|
<code>HTMLPurifier_Definition</code> objects in serialized files
|
||||||
|
stored (by default) in <code>library/HTMLPurifier/DefinitionCache/Serializer</code>.
|
||||||
|
A lot of processing is done in order to create these objects, so it
|
||||||
|
makes little sense to repeat the same processing over and over again
|
||||||
|
whenever HTML Purifier is called.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
In order to identify a cache entry, HTML Purifier uses three variables:
|
||||||
|
the library's version number, the value of %HTML.DefinitionRev and
|
||||||
|
a serial of relevant configuration. Whenever any of these changes,
|
||||||
|
a new HTML definition is generated. Notice that there is no way
|
||||||
|
for the definition object to track changes to customizations: here, it
|
||||||
|
is up to you to supply appropriate information to DefinitionID and
|
||||||
|
DefinitionRev.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h2 id="addAttribute">Add an attribute</h2>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
For this example, we're going to implement the <code>target</code> attribute found
|
||||||
|
on <code>a</code> elements. To implement an attribute, we have to
|
||||||
|
ask a few questions:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<ol>
|
||||||
|
<li>What element is it found on?</li>
|
||||||
|
<li>What is its name?</li>
|
||||||
|
<li>Is it required or optional?</li>
|
||||||
|
<li>What are valid values for it?</li>
|
||||||
|
</ol>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
The first three are easy: the element is <code>a</code>, the attribute
|
||||||
|
is <code>target</code>, and it is not a required attribute. (If it
|
||||||
|
were required, we'd need to append an asterisk to the attribute name;
|
||||||
|
you'll see an example of this in the addElement() example).
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
The last question is a little trickier.
|
||||||
|
Let's allow the special values: _blank, _self, _target and _top.
|
||||||
|
The form of this is called an <strong>enumeration</strong>, a list of
|
||||||
|
valid values, although only one can be used at a time. To translate
|
||||||
|
this into code form, we write:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<pre>$config = HTMLPurifier_Config::createDefault();
|
||||||
|
$config->set('HTML', 'DefinitionID', 'enduser-customize.html tutorial');
|
||||||
|
$config->set('HTML', 'DefinitionRev', 1);
|
||||||
|
$config->set('Core', 'DefinitionCache', null); // remove this later!
|
||||||
|
$def =& $config->getHTMLDefinition(true);
|
||||||
|
<strong>$def->addAttribute('a', 'target', 'Enum#_blank,_self,_target,_top');</strong></pre>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
The <code>Enum#_blank,_self,_target,_top</code> does all the magic.
|
||||||
|
The string is split into two parts, separated by a hash mark (#):
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<ol>
|
||||||
|
<li>The first part is the name of what we call an <code>AttrDef</code></li>
|
||||||
|
<li>The second part is the parameter of the above-mentioned <code>AttrDef</code></li>
|
||||||
|
</ol>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
If that sounds vague and generic, it's because it is! HTML Purifier defines
|
||||||
|
an assortment of different attribute types one can use, and each of these
|
||||||
|
has its own specialized parameter format. Here are some of the more useful
|
||||||
|
ones:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<table class="table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Type</th>
|
||||||
|
<th>Format</th>
|
||||||
|
<th>Description</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<th>Enum</th>
|
||||||
|
<td><em>[s:]</em>value1,value2,...</td>
|
||||||
|
<td>
|
||||||
|
Attribute with a number of valid values, one of which may be used. When
|
||||||
|
s: is present, the enumeration is case sensitive.
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Bool</th>
|
||||||
|
<td>attribute_name</td>
|
||||||
|
<td>
|
||||||
|
Boolean attribute, with only one valid value: the name
|
||||||
|
of the attribute.
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>CDATA</th>
|
||||||
|
<td></td>
|
||||||
|
<td>
|
||||||
|
Attribute of arbitrary text. Can also be referred to as <strong>Text</strong>
|
||||||
|
(the specification makes a semantic distinction between the two).
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>ID</th>
|
||||||
|
<td></td>
|
||||||
|
<td>
|
||||||
|
Attribute that specifies a unique ID
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Pixels</th>
|
||||||
|
<td></td>
|
||||||
|
<td>
|
||||||
|
Attribute that specifies an integer pixel length
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Length</th>
|
||||||
|
<td></td>
|
||||||
|
<td>
|
||||||
|
Attribute that specifies a pixel or percentage length
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>NMTOKENS</th>
|
||||||
|
<td></td>
|
||||||
|
<td>
|
||||||
|
Attribute that specifies a number of name tokens, example: the
|
||||||
|
<code>class</code> attribute
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>URI</th>
|
||||||
|
<td></td>
|
||||||
|
<td>
|
||||||
|
Attribute that specifies a URI, example: the <code>href</code>
|
||||||
|
attribute
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Number</th>
|
||||||
|
<td></td>
|
||||||
|
<td>
|
||||||
|
Attribute that specifies a positive integer number
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
For a complete list, consult
|
||||||
|
<a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/AttrTypes.php"><code>library/HTMLPurifier/AttrTypes.php</code></a>;
|
||||||
|
more information on attributes that accept parameters can be found on their
|
||||||
|
respective includes in
|
||||||
|
<a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/AttrDef/"><code>library/HTMLPurifier/AttrDef</code></a>.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Sometimes, the restrictive list in AttrTypes just doesn't cut it. Don't
|
||||||
|
sweat: you can also use a fully instantiated object as the value. The
|
||||||
|
equivalent, verbose form of the above example is:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<pre>$config = HTMLPurifier_Config::createDefault();
|
||||||
|
$config->set('HTML', 'DefinitionID', 'enduser-customize.html tutorial');
|
||||||
|
$config->set('HTML', 'DefinitionRev', 1);
|
||||||
|
$config->set('Core', 'DefinitionCache', null); // remove this later!
|
||||||
|
$def =& $config->getHTMLDefinition(true);
|
||||||
|
<strong>$def->addAttribute('a', 'target', new HTMLPurifier_AttrDef_Enum(
|
||||||
|
array('_blank','_self','_target','_top')
|
||||||
|
));</strong></pre>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Trust me, you'll learn to love the shorthand.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h2>Add an element</h2>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Adding attributes is really small-fry stuff, though, and it was possible
|
||||||
|
to add them (albeit a bit more wordy) prior to 2.0. The real gem of
|
||||||
|
the Advanced API is adding elements. There are five questions to
|
||||||
|
ask when adding a new element:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<ol>
|
||||||
|
<li>What is the element's name?</li>
|
||||||
|
<li>What content set does this element belong to?</li>
|
||||||
|
<li>What are the allowed children of this element?</li>
|
||||||
|
<li>What attributes does the element allow that are general?</li>
|
||||||
|
<li>What attributes does the element allow that are specific to this element?</li>
|
||||||
|
</ol>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
It's a mouthful, and you'll be slightly lost if you're not familiar with
|
||||||
|
the HTML specification, so let's explain them step by step.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h3>Content set</h3>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
The HTML specification defines two major content sets: Inline
|
||||||
|
and Block. Each of these
|
||||||
|
content sets contains a list of elements: Inline contains things like
|
||||||
|
<code>span</code> and <code>b</code> while Block contains things like
|
||||||
|
<code>div</code> and <code>blockquote</code>.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
These content sets amount to a macro mechanism for HTML definition. Most
|
||||||
|
elements in HTML are organized into one of these two sets, and most
|
||||||
|
elements in HTML allow elements from one of these sets. If we had
|
||||||
|
to write each element verbatim into each other element's allowed
|
||||||
|
children, we would have ridiculously large lists; instead we use
|
||||||
|
content sets to compactify the declaration.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Practically speaking, there are several useful values you can use here:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<table class="table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Content set</th>
|
||||||
|
<th>Description</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<th>Inline</th>
|
||||||
|
<td>Character level elements, text</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Block</th>
|
||||||
|
<td>Block-like elements, like paragraphs and lists</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th><em>false</em></th>
|
||||||
|
<td>
|
||||||
|
Any element that doesn't fit into the mold, for example <code>li</code>
|
||||||
|
or <code>tr</code>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
By specifying a valid value here, all other elements that use that
|
||||||
|
content set will also allow your element, without you having to do
|
||||||
|
anything. If you specify <em>false</em>, you'll have to register
|
||||||
|
your element manually.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h3>Allowed children</h3>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Allowed children defines the elements that this element can contain.
|
||||||
|
The allowed values may range from none to a complex regexp depending on
|
||||||
|
your element.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
If you've ever taken a look at the HTML DTDs before, you may have
|
||||||
|
noticed declarations like this:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<pre>&lt;!ELEMENT LI - O (%flow;)* -- list item --&gt;</pre>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
The <code>(%flow;)*</code> indicates the allowed children of the
|
||||||
|
<code>li</code> tag: <code>li</code> allows any number of flow
|
||||||
|
elements as its children. In HTML Purifier, we'd write it like
|
||||||
|
<code>Flow</code> (here's where the content sets we were
|
||||||
|
discussing earlier come into play). There are three shorthand content models you
|
||||||
|
can specify:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<table class="table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Content model</th>
|
||||||
|
<th>Description</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<th>Empty</th>
|
||||||
|
<td>No children allowed, like <code>br</code> or <code>hr</code></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Inline</th>
|
||||||
|
<td>Any number of inline elements and text, like <code>span</code></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Flow</th>
|
||||||
|
<td>Any number of inline elements, block elements and text, like <code>div</code></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
This covers 90% of all the cases out there, but what about elements that
|
||||||
|
break the mold like <code>ul</code>? This guy requires at least one
|
||||||
|
child, and the only valid children for it are <code>li</code>. The
|
||||||
|
content model is: <code>Required: li</code>. There are two parts: the
|
||||||
|
first type determines what <code>ChildDef</code> will be used to validate
|
||||||
|
content models. The most common values are:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<table class="table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Type</th>
|
||||||
|
<th>Description</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<th>Required</th>
|
||||||
|
<td>Children must be one or more of the valid elements</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Optional</th>
|
||||||
|
<td>Children can be any number of the valid elements</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Custom</th>
|
||||||
|
<td>Children must follow the DTD-style regex</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
You can also implement your own <code>ChildDef</code>: this was done
|
||||||
|
for a few special cases in HTML Purifier such as <code>Chameleon</code>
|
||||||
|
(for <code>ins</code> and <code>del</code>), <code>StrictBlockquote</code>
|
||||||
|
and <code>Table</code>.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
The second part specifies either valid elements or a regular expression.
|
||||||
|
Valid elements are separated with horizontal bars (|), i.e.
|
||||||
|
"<code>a | b | c</code>". Use #PCDATA to represent plain text.
|
||||||
|
Regular expressions are based off of DTD's style:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<li>Parentheses () are used for grouping</li>
|
||||||
|
<li>Commas (,) separate elements that should come one after another</li>
|
||||||
|
<li>Horizontal bars (|) indicate one or the other elements should be used</li>
|
||||||
|
<li>Plus signs (+) are used for a one or more match</li>
|
||||||
|
<li>Asterisks (*) are used for a zero or more match</li>
|
||||||
|
<li>Question marks (?) are used for a zero or one match</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
For example, "<code>a, b?, (c | d), e+, f*</code>" means "In this order,
|
||||||
|
one <code>a</code> element, at most one <code>b</code> element,
|
||||||
|
one <code>c</code> or <code>d</code> element (but not both), one or more
|
||||||
|
<code>e</code> elements, and any number of <code>f</code> elements."
|
||||||
|
Regex veterans should be able to jump right in, and those not so savvy
|
||||||
|
can always copy-paste W3C's content model definitions into HTML Purifier
|
||||||
|
and hope for the best.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
A word of warning: while the regex format is extremely flexible on
|
||||||
|
the developer's side, it is
|
||||||
|
quite unforgiving on the user's side. If the user input does not <em>exactly</em>
|
||||||
|
match the specification, the entire contents of the element will
|
||||||
|
be nuked. This is why there are specific content model types like
|
||||||
|
Optional and Required: while they could be implemented as <code>Custom:
|
||||||
|
(valid | elements)*</code>, the custom classes contain special recovery
|
||||||
|
measures that make sure as much of the user's original content gets
|
||||||
|
through. HTML Purifier's core, as a rule, does not use Custom.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
One final note: you can also use Content Sets inside your valid elements
|
||||||
|
lists or regular expressions. In fact, the three shorthand content models
|
||||||
|
mentioned above are just that: abbreviations:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<table class="table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Content model</th>
|
||||||
|
<th>Implementation</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<th>Inline</th>
|
||||||
|
<td>Optional: Inline | #PCDATA</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Flow</th>
|
||||||
|
<td>Optional: Flow | #PCDATA</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
When the definition is compiled, Inline will be replaced with a
|
||||||
|
horizontal-bar separated list of inline elements. Also, notice that
|
||||||
|
it does not contain text: you have to specify that yourself.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h3>Common attributes</h3>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Congratulations: you have just gotten over the proverbial hump (Allowed
|
||||||
|
children). Common attributes is much simpler, and boils down to
|
||||||
|
one question: does your element have the <code>id</code>, <code>style</code>,
|
||||||
|
<code>class</code>, <code>title</code> and <code>lang</code> attributes?
|
||||||
|
If so, you'll want to specify the <code>Common</code> attribute collection,
|
||||||
|
which contains these five attributes that are found on almost every
|
||||||
|
HTML element in the specification.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
There are a few more collections, but they're really edge cases:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<table class="table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Collection</th>
|
||||||
|
<th>Attributes</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<th>I18N</th>
|
||||||
|
<td><code>lang</code>, possibly <code>xml:lang</code></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Core</th>
|
||||||
|
<td><code>style</code>, <code>class</code>, <code>id</code> and <code>title</code></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Common is a combination of the above-mentioned collections.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h3>Attributes</h3>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
If you didn't read the <a href="#addAttribute">previous section on
|
||||||
|
adding attributes</a>, read it now. The last parameter is simply
|
||||||
|
an array of attribute names to attribute implementations, in the exact
|
||||||
|
same format as <code>addAttribute()</code>.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h3>Putting it all together</h3>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
We're going to implement <code>form</code>. Before we embark, let's
|
||||||
|
grab a reference implementation from over at the
|
||||||
|
<a href="http://www.w3.org/TR/html4/sgml/loosedtd.html">transitional DTD</a>:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<pre><!ELEMENT FORM - - (%flow;)* -(FORM) -- interactive form -->
|
||||||
|
<!ATTLIST FORM
|
||||||
|
%attrs; -- %coreattrs, %i18n, %events --
|
||||||
|
action %URI; #REQUIRED -- server-side form handler --
|
||||||
|
method (GET|POST) GET -- HTTP method used to submit the form--
|
||||||
|
enctype %ContentType; "application/x-www-form-urlencoded"
|
||||||
|
accept %ContentTypes; #IMPLIED -- list of MIME types for file upload --
|
||||||
|
name CDATA #IMPLIED -- name of form for scripting --
|
||||||
|
onsubmit %Script; #IMPLIED -- the form was submitted --
|
||||||
|
onreset %Script; #IMPLIED -- the form was reset --
|
||||||
|
target %FrameTarget; #IMPLIED -- render in this frame --
|
||||||
|
accept-charset %Charsets; #IMPLIED -- list of supported charsets --
|
||||||
|
></pre>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Juicy! With just this, we can answer four of our five questions:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<ol>
|
||||||
|
<li>What is the element's name? <strong>form</strong></li>
|
||||||
|
<li>What content set does this element belong to? <strong>Block</strong>
|
||||||
|
(this needs a little sleuthing, I find the easiest way is to search
|
||||||
|
the DTD for <code>FORM</code> and determine which set it is in.)</li>
|
||||||
|
<li>What are the allowed children of this element? <strong>Zero
|
||||||
|
or more flow elements, but no nested <code>form</code>s</strong></li>
|
||||||
|
<li>What attributes does the element allow that are general? <strong>Common</strong></li>
|
||||||
|
<li>What attributes does the element allow that are specific to this element? <strong>A whole bunch, see ATTLIST;
|
||||||
|
we're going to implement the vital ones: <code>action</code>, <code>method</code> and <code>name</code></strong></li>
|
||||||
|
</ol>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Time for some code:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<pre>$config = HTMLPurifier_Config::createDefault();
|
||||||
|
$config->set('HTML', 'DefinitionID', 'enduser-customize.html tutorial');
|
||||||
|
$config->set('HTML', 'DefinitionRev', 1);
|
||||||
|
$config->set('Core', 'DefinitionCache', null); // remove this later!
|
||||||
|
$def =& $config->getHTMLDefinition(true);
|
||||||
|
$def->addAttribute('a', 'target', new HTMLPurifier_AttrDef_Enum(
|
||||||
|
array('_blank','_self','_target','_top')
|
||||||
|
));
|
||||||
|
<strong>$form =& $def->addElement(
|
||||||
|
'form', // name
|
||||||
|
'Block', // content set
|
||||||
|
'Flow', // allowed children
|
||||||
|
'Common', // attribute collection
|
||||||
|
array( // attributes
|
||||||
|
'action*' => 'URI',
|
||||||
|
'method' => 'Enum#get|post',
|
||||||
|
'name' => 'ID'
|
||||||
|
)
|
||||||
|
);
|
||||||
|
$form->excludes = array('form' => true);</strong></pre>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Each of the parameters corresponds to one of the questions we asked.
|
||||||
|
Notice that we added an asterisk to the end of the <code>action</code>
|
||||||
|
attribute to indicate that it is required. If someone specifies a
|
||||||
|
<code>form</code> without that attribute, the tag will be axed.
|
||||||
|
Also, the extra line at the end is a special extra declaration that
|
||||||
|
prevents forms from being nested within each other.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
And that's all there is to it! Implementing the rest of the form
|
||||||
|
module is left as an exercise to the user; to see more examples
|
||||||
|
check the <a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/HTMLModule/"><code>library/HTMLPurifier/HTMLModule/</code></a> directory
|
||||||
|
in your local HTML Purifier installation.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h2>And beyond...</h2>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Perceptive users may have realized that, to a certain extent, we
|
||||||
|
have simply re-implemented the facilities of XML Schema or the
|
||||||
|
Document Type Definition. What you are seeing here, however, is
|
||||||
|
not just an XML Schema or Document Type Definition: it is a fully
|
||||||
|
expressive method of specifying the definition of HTML that is
|
||||||
|
a portable superset of the capabilities of the two above-mentioned schema
|
||||||
|
languages. What makes HTMLDefinition so powerful is the fact that
|
||||||
|
if we don't have an implementation for a content model or an attribute
|
||||||
|
definition, you can supply it yourself by writing a PHP class.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
There are many facets of HTMLDefinition beyond the Advanced API I have
|
||||||
|
walked you through today. To find out more about these, you can
|
||||||
|
check out these source files:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<li><a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/HTMLModule.php"><code>library/HTMLPurifier/HTMLModule.php</code></a></li>
|
||||||
|
<li><a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/ElementDef.php"><code>library/HTMLPurifier/ElementDef.php</code></a></li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<div id="version">$Id: enduser-tidy.html 1158 2007-06-18 19:26:29Z Edward $</div>
|
||||||
|
|
||||||
|
</body></html>
|
@@ -15,6 +15,7 @@
|
|||||||
|
|
||||||
<div id="filing">Filed under End-User</div>
|
<div id="filing">Filed under End-User</div>
|
||||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||||
|
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||||
|
|
||||||
<p>Prior to HTML Purifier 1.2.0, this library blithely accepted user input that
|
<p>Prior to HTML Purifier 1.2.0, this library blithely accepted user input that
|
||||||
looked like this:</p>
|
looked like this:</p>
|
||||||
@@ -143,4 +144,4 @@ anchors is beyond me.</p>
|
|||||||
<div id="version">$Id$</div>
|
<div id="version">$Id$</div>
|
||||||
|
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
@@ -36,7 +36,7 @@ forgiving lexer. You may also be interested in the unit tests located in the
|
|||||||
tests/ folder, which provide a living document on how exactly the filter deals
|
tests/ folder, which provide a living document on how exactly the filter deals
|
||||||
with malformed input.
|
with malformed input.
|
||||||
|
|
||||||
In summary:
|
In summary (see corresponding classes for more details):
|
||||||
|
|
||||||
1. Parse document into an array of tag and text tokens (Lexer)
|
1. Parse document into an array of tag and text tokens (Lexer)
|
||||||
2. Remove all elements not on whitelist and transform certain other elements
|
2. Remove all elements not on whitelist and transform certain other elements
|
||||||
|
@@ -6,43 +6,11 @@ through negligence of people. This class will do its job: no more, no less,
|
|||||||
and it's up to you to provide it the proper information and proper context
|
and it's up to you to provide it the proper information and proper context
|
||||||
to be effective. Things to remember:
|
to be effective. Things to remember:
|
||||||
|
|
||||||
1. Character Encoding: UTF-8.
|
1. Character Encoding: see enduser-utf8.html for more info.
|
||||||
Currently, the parser runs under the assumption that it is dealing
|
|
||||||
with UTF-8. Not ISO-8859-1 or Windows-1252, UTF-8. And definitely not "no
|
|
||||||
character encoding explicitly stated" or UTF-7. If you're not using UTF-8 as
|
|
||||||
your character encoding, make sure you configure HTML Purifier or switch
|
|
||||||
to UTF-8. Now. Also, make sure any input is properly converted to UTF-8, or
|
|
||||||
the parser will mangle it badly (though it won't be a security risk if you're
|
|
||||||
outputting it as UTF-8 though). Character encoding is, in general, a knotty
|
|
||||||
issue, but do yourself a favor and learn about it:
|
|
||||||
<http://www.joelonsoftware.com/articles/Unicode.html>
|
|
||||||
|
|
||||||
2. Doctype: XHTML 1.0 Transitional
|
2. IDs: see enduser-id.html for more info
|
||||||
This is what the parser is outputting. For the most
|
|
||||||
part, it's compatible with HTML 4.01, but XHTML enforces some very nice things
|
|
||||||
that all web developers should use. Regardless, NO DOCTYPE is a NO. Quirks mode
|
|
||||||
has waaaay too many quirks for a little parser to handle. We did not select
|
|
||||||
strict in order to prevent ourselves from being too draconic on users, but
|
|
||||||
this may be configurable in the future. Do you want standards compliance?
|
|
||||||
The doctype is a good place to start.
|
|
||||||
|
|
||||||
3. IDs
|
3. URIs: see enduser-uri-filter.html
|
||||||
They need to be unique, but without some knowledge of the
|
|
||||||
rest of the document, it's difficult to know what's unique. %Attr.IDBlacklist
|
|
||||||
needs to be set: we may want to consider disallowing IDs by default to
|
|
||||||
save lazy programmers.
|
|
||||||
|
|
||||||
4. [PROJECTED] Links
|
4. CSS: document pending
|
||||||
We're not going to try for spam protection (although
|
Explain which CSS styles we blocked and why.
|
||||||
some hooks for such a module might be nice) but we may offer the ability to
|
|
||||||
only accept relative URLs. Pick the one that's right for you.
|
|
||||||
|
|
||||||
5. CSS
|
|
||||||
While we can prevent the most flagrant cases from affecting your
|
|
||||||
layout (such as absolutely positioned elements), no amount of code is going
|
|
||||||
to protect your pages from being attacked by garish colors and plain old
|
|
||||||
bad taste. A neat feature would be the ability to define acceptable colors
|
|
||||||
in a document, but that's not likely to be implemented for a while. In the
|
|
||||||
meantime, be sure to make sure that floated elements (permitted, since they
|
|
||||||
can be quite useful) can't mess up your layout. Once again, we may want to
|
|
||||||
disable this by default to protect lazy developers.
|
|
||||||
|
117
docs/enduser-slow.html
Normal file
117
docs/enduser-slow.html
Normal file
@@ -0,0 +1,117 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
||||||
|
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||||
|
<meta name="description" content="Explains how to speed up HTML Purifier through caching or inbound filtering." />
|
||||||
|
<link rel="stylesheet" type="text/css" href="./style.css" />
|
||||||
|
|
||||||
|
<title>Speeding up HTML Purifier - HTML Purifier</title>
|
||||||
|
|
||||||
|
</head><body>
|
||||||
|
|
||||||
|
<h1 class="subtitled">Speeding up HTML Purifier</h1>
|
||||||
|
<div class="subtitle">...also known as the HELP ME LIBRARY IS TOO SLOW MY PAGE TAKE TOO LONG page</div>
|
||||||
|
|
||||||
|
<div id="filing">Filed under End-User</div>
|
||||||
|
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||||
|
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||||
|
|
||||||
|
<p>HTML Purifier is a very powerful library. But with power comes great
|
||||||
|
responsibility, in the form of longer execution times. Remember, this
|
||||||
|
library isn't lightly grazing over submitted HTML: it's deconstructing
|
||||||
|
the whole thing, rigorously checking the parts, and then putting it back
|
||||||
|
together. </p>
|
||||||
|
|
||||||
|
<p>So, if it so turns out that HTML Purifier is kinda too slow for outbound
|
||||||
|
filtering, you've got a few options: </p>
|
||||||
|
|
||||||
|
<h2>Inbound filtering</h2>
|
||||||
|
|
||||||
|
<p>Perform filtering of HTML when it's submitted by the user. Since the
|
||||||
|
user is already submitting something, an extra half a second tacked on
|
||||||
|
to the load time probably isn't going to be that huge of a problem.
|
||||||
|
Then, displaying the content is simply a matter of outputting it
|
||||||
|
directly from your database/filesystem. The trouble with this method is
|
||||||
|
that your user loses the original text, and when doing edits, will be
|
||||||
|
handling the filtered text. While this may be a good thing, especially
|
||||||
|
if you're using a WYSIWYG editor, it can also result in data-loss if a
|
||||||
|
user makes a typo. </p>
|
||||||
|
|
||||||
|
<p>Example (non-functional):</p>
|
||||||
|
|
||||||
|
<pre><?php
|
||||||
|
/**
|
||||||
|
* FORM SUBMISSION PAGE
|
||||||
|
* display_error($message) : displays nice error page with message
|
||||||
|
* display_success() : displays a nice success page
|
||||||
|
* display_form() : displays the HTML submission form
|
||||||
|
* database_insert($html) : inserts data into database as new row
|
||||||
|
*/
|
||||||
|
if (!empty($_POST)) {
|
||||||
|
require_once '/path/to/library/HTMLPurifier.auto.php';
|
||||||
|
require_once 'HTMLPurifier.func.php';
|
||||||
|
$dirty_html = isset($_POST['html']) ? $_POST['html'] : false;
|
||||||
|
if (!$dirty_html) {
|
||||||
|
display_error('You must write some HTML!');
|
||||||
|
}
|
||||||
|
$html = HTMLPurifier($dirty_html);
|
||||||
|
database_insert($html);
|
||||||
|
display_success();
|
||||||
|
// notice that $dirty_html is *not* saved
|
||||||
|
} else {
|
||||||
|
display_form();
|
||||||
|
}
|
||||||
|
?></pre>
|
||||||
|
|
||||||
|
<h2>Caching the filtered output</h2>
|
||||||
|
|
||||||
|
<p>Accept the submitted text and put it unaltered into the database, but
|
||||||
|
then also generate a filtered version and stash that in the database.
|
||||||
|
Serve the filtered version to readers, and the unaltered version to
|
||||||
|
editors. If need be, you can invalidate the cache and have the cached
|
||||||
|
filtered version be regenerated on the first page view. Pros? Full data
|
||||||
|
retention. Cons? It's more complicated, and opens other editors up to
|
||||||
|
XSS if they are using a WYSIWYG editor (to fix that, they'd have to be
|
||||||
|
able to get their hands on the *really* original text served in
|
||||||
|
plaintext mode). </p>
|
||||||
|
|
||||||
|
<p>Example (non-functional):</p>
|
||||||
|
|
||||||
|
<pre><?php
|
||||||
|
/**
|
||||||
|
* VIEW PAGE
|
||||||
|
* display_error($message) : displays nice error page with message
|
||||||
|
* cache_get($id) : retrieves HTML from fast cache (db or file)
|
||||||
|
* cache_insert($id, $html) : inserts good HTML into cache system
|
||||||
|
* database_get($id) : retrieves raw HTML from database
|
||||||
|
*/
|
||||||
|
$id = isset($_GET['id']) ? (int) $_GET['id'] : false;
|
||||||
|
if (!$id) {
|
||||||
|
display_error('Must specify ID.');
|
||||||
|
exit;
|
||||||
|
}
|
||||||
|
$html = cache_get($id); // filesystem or database
|
||||||
|
if ($html === false) {
|
||||||
|
// cache didn't have the HTML, generate it
|
||||||
|
$raw_html = database_get($id);
|
||||||
|
require_once '/path/to/library/HTMLPurifier.auto.php';
|
||||||
|
require_once 'HTMLPurifier.func.php';
|
||||||
|
$html = HTMLPurifier($raw_html);
|
||||||
|
cache_insert($id, $html);
|
||||||
|
}
|
||||||
|
echo $html;
|
||||||
|
?></pre>
|
||||||
|
|
||||||
|
<h2>Summary</h2>
|
||||||
|
|
||||||
|
<p>In short, inbound filtering is the simple option and caching is the
|
||||||
|
robust option (albeit with bigger storage requirements). </p>
|
||||||
|
|
||||||
|
<p>There is a third option, independent of the two we've discussed: profile
|
||||||
|
and optimize HTMLPurifier yourself. Be sure to report back your results
|
||||||
|
if you decide to do that! Especially if you port HTML Purifier to C++.
|
||||||
|
<tt>;-)</tt></p>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
230
docs/enduser-tidy.html
Normal file
230
docs/enduser-tidy.html
Normal file
@@ -0,0 +1,230 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
||||||
|
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||||
|
<meta name="description" content="Tutorial for tweaking HTML Purifier's Tidy-like behavior." />
|
||||||
|
<link rel="stylesheet" type="text/css" href="style.css" />
|
||||||
|
|
||||||
|
<title>Tidy - HTML Purifier</title>
|
||||||
|
|
||||||
|
</head><body>
|
||||||
|
|
||||||
|
<h1>Tidy</h1>
|
||||||
|
|
||||||
|
<div id="filing">Filed under Development</div>
|
||||||
|
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||||
|
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||||
|
|
||||||
|
<p>You've probably heard of HTML Tidy, Dave Raggett's little piece
|
||||||
|
of software that cleans up poorly written HTML. Let me say it straight
|
||||||
|
out:</p>
|
||||||
|
|
||||||
|
<p class="emphasis">This ain't HTML Tidy!</p>
|
||||||
|
|
||||||
|
<p>Rather, Tidy stands for a cool set of Tidy-inspired features in HTML Purifier
|
||||||
|
that allows users to submit deprecated elements and attributes and get
|
||||||
|
valid strict markup back. For example:</p>
|
||||||
|
|
||||||
|
<pre><center>Centered</center></pre>
|
||||||
|
|
||||||
|
<p>...becomes:</p>
|
||||||
|
|
||||||
|
<pre><div style="text-align:center;">Centered</div></pre>
|
||||||
|
|
||||||
|
<p>...when this particular fix is run on the HTML. This tutorial will give
|
||||||
|
you the lowdown on what exactly HTML Purifier will do when Tidy
|
||||||
|
is on, and how to fine tune this behavior. Once again, <strong>you do
|
||||||
|
not need Tidy installed on your PHP to use these features!</strong></p>
|
||||||
|
|
||||||
|
<h2>What does it do?</h2>
|
||||||
|
|
||||||
|
<p>Tidy will do several things to your HTML:</p>
|
||||||
|
|
||||||
|
<ul>
|
||||||
|
<li>Convert deprecated elements and attributes to standards-compliant
|
||||||
|
alternatives</li>
|
||||||
|
<li>Enforce XHTML compatibility guidelines and other best practices</li>
|
||||||
|
<li>Preserve data that would normally be removed as per W3C</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<h2>What are levels?</h2>
|
||||||
|
|
||||||
|
<p>Levels describe how aggressive the Tidy module should be when
|
||||||
|
cleaning up HTML. There are four levels to pick: none, light, medium
|
||||||
|
and heavy. Each of these levels has a well-defined set of behavior
|
||||||
|
associated with it, although it may change depending on your doctype.</p>
|
||||||
|
|
||||||
|
<dl>
|
||||||
|
<dt>light</dt>
|
||||||
|
<dd>This is the <strong>lenient</strong> level. If a tag or attribute
|
||||||
|
is about to be removed because it isn't supported by the
|
||||||
|
doctype, Tidy will step in and change it into an alternative that
|
||||||
|
is supported.</dd>
|
||||||
|
<dt>medium</dt>
|
||||||
|
<dd>This is the <strong>correctional</strong> level. At this level,
|
||||||
|
all the functions of light are performed, as well as some extra,
|
||||||
|
non-essential best practices enforcement. Changes made on this
|
||||||
|
level are very benign and are unlikely to cause problems.</dd>
|
||||||
|
<dt>heavy</dt>
|
||||||
|
<dd>This is the <strong>aggressive</strong> level. If a tag or
|
||||||
|
attribute is deprecated, it will be converted into a non-deprecated
|
||||||
|
version, no ifs, ands or buts.</dd>
|
||||||
|
</dl>
|
||||||
|
|
||||||
|
<p>By default, Tidy operates on the <strong>medium</strong> level. You can
|
||||||
|
change the level of cleaning by setting the %HTML.TidyLevel configuration
|
||||||
|
directive:</p>
|
||||||
|
|
||||||
|
<pre>$config->set('HTML', 'TidyLevel', 'heavy'); // burn baby burn!</pre>
|
||||||
|
|
||||||
|
<h2>Is the light level really light?</h2>
|
||||||
|
|
||||||
|
<p>It depends on what doctype you're using. If your documents are HTML
|
||||||
|
4.01 <em>Transitional</em>, HTML Purifier will be lazy
|
||||||
|
and won't clean up your <code>center</code>
|
||||||
|
or <code>font</code> tags. But if you're using HTML 4.01 <em>Strict</em>,
|
||||||
|
HTML Purifier has no choice: it has to convert them, or they will
|
||||||
|
be nuked out of existence. So while light on Transitional will result
|
||||||
|
in little to no changes, light on Strict will still result in quite
|
||||||
|
a lot of fixes.</p>
|
||||||
|
|
||||||
|
<p>This is different behavior from 1.6 or before, where deprecated
|
||||||
|
tags in transitional documents would
|
||||||
|
always be cleaned up regardless. This is also better behavior.</p>
|
||||||
|
|
||||||
|
<h2>My pages look different!</h2>
|
||||||
|
|
||||||
|
<p>HTML Purifier is tasked with converting deprecated tags and
|
||||||
|
attributes to standards-compliant alternatives, which usually
|
||||||
|
need copious amounts of CSS. It's also not foolproof: sometimes
|
||||||
|
things do get lost in the translation. This is why when HTML Purifier
|
||||||
|
can get away with not doing cleaning, it won't; this is why
|
||||||
|
the default value is <strong>medium</strong> and not heavy.</p>
|
||||||
|
|
||||||
|
<p>Fortunately, only a few attributes have problems with the switch
|
||||||
|
over. They are described below:</p>
|
||||||
|
|
||||||
|
<table class="table">
|
||||||
|
<thead><tr>
|
||||||
|
<th>Element@Attr</th>
|
||||||
|
<th>Changes</th>
|
||||||
|
</tr></thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td>caption@align</td>
|
||||||
|
<td>Firefox supports stuffing the caption on the
|
||||||
|
left and right side of the table, a feature that
|
||||||
|
Internet Explorer, understandably, does not have.
|
||||||
|
When align equals right or left, the text will simply
|
||||||
|
be aligned on the left or right side.</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>img@align</td>
|
||||||
|
<td>The implementation for align bottom is good, but not
|
||||||
|
perfect. There are a few pixel differences.</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>br@clear</td>
|
||||||
|
<td>Clear both gets a little wonky in Internet Explorer. Haven't
|
||||||
|
really been able to figure out why.</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>hr@noshade</td>
|
||||||
|
<td>All browsers implement this slightly differently: we've
|
||||||
|
chosen to make noshade horizontal rules gray.</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<p>There are a few more minor, although irritating, bugs.
|
||||||
|
Some older browsers support deprecated attributes,
|
||||||
|
but not CSS. Transformed elements and attributes will look unstyled
|
||||||
|
to said browsers. Also, CSS precedence is slightly different for
|
||||||
|
inline styles versus presentational markup. In increasing precedence:</p>
|
||||||
|
|
||||||
|
<ol>
|
||||||
|
<li>Presentational attributes</li>
|
||||||
|
<li>External style sheets</li>
|
||||||
|
<li>Inline styling</li>
|
||||||
|
</ol>
|
||||||
|
|
||||||
|
<p>This means that styling that may have been masked by external CSS
|
||||||
|
declarations will start showing up (a good thing, perhaps). Finally,
|
||||||
|
if you've turned off the style attribute, almost all of
|
||||||
|
these transformations will not work. Sorry mates.</p>
|
||||||
|
|
||||||
|
<p>You can review the rendering before and after of these transformations
|
||||||
|
by consulting the <a
|
||||||
|
href="http://htmlpurifier.org/live/smoketests/attrTransform.php">attrTransform.php
|
||||||
|
smoketest</a>.</p>
|
||||||
|
|
||||||
|
<h2>I like the general idea, but the specifics bug me!</h2>
|
||||||
|
|
||||||
|
<p>So you want HTML Purifier to clean up your HTML, but you're not
|
||||||
|
so happy about the br@clear implementation. That's perfectly fine!
|
||||||
|
HTML Purifier will make accommodations:</p>
|
||||||
|
|
||||||
|
<pre>$config->set('HTML', 'Doctype', 'XHTML 1.0 Transitional');
|
||||||
|
$config->set('HTML', 'TidyLevel', 'heavy'); // all changes, minus...
|
||||||
|
<strong>$config->set('HTML', 'TidyRemove', 'br@clear');</strong></pre>
|
||||||
|
|
||||||
|
<p>That third line does the magic, removing the br@clear fix
|
||||||
|
from the module, ensuring that <code><br clear="both" /></code>
|
||||||
|
will pass through unharmed. The reverse is possible too:</p>
|
||||||
|
|
||||||
|
<pre>$config->set('HTML', 'Doctype', 'XHTML 1.0 Transitional');
|
||||||
|
$config->set('HTML', 'TidyLevel', 'none'); // no changes, plus...
|
||||||
|
<strong>$config->set('HTML', 'TidyAdd', 'p@align');</strong></pre>
|
||||||
|
|
||||||
|
<p>In this case, all transformations are shut off, except for the p@align
|
||||||
|
one, which you found handy.</p>
|
||||||
|
|
||||||
|
<p>To find out what the names of fixes you want to turn on or off are,
|
||||||
|
you'll have to consult the source code, specifically the files in
|
||||||
|
<code>HTMLPurifier/HTMLModule/Tidy/</code>. There is, however, a
|
||||||
|
general syntax:</p>
|
||||||
|
|
||||||
|
<table class="table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th>Name</th>
|
||||||
|
<th>Example</th>
|
||||||
|
<th>Interpretation</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td>element</td>
|
||||||
|
<td>font</td>
|
||||||
|
<td>Tag transform for <em>element</em></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>element@attr</td>
|
||||||
|
<td>br@clear</td>
|
||||||
|
<td>Attribute transform for <em>attr</em> on <em>element</em></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>@attr</td>
|
||||||
|
<td>@lang</td>
|
||||||
|
<td>Global attribute transform for <em>attr</em></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td>e#content_model_type</td>
|
||||||
|
<td>blockquote#content_model_type</td>
|
||||||
|
<td>Change of child processing implementation for <em>e</em></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
|
||||||
|
<h2>So... what's the lowdown?</h2>
|
||||||
|
|
||||||
|
<p>The lowdown is, quite frankly, HTML Purifier's default settings are
|
||||||
|
probably good enough. The next step is to bump the level up to heavy,
|
||||||
|
and if that still doesn't satisfy your appetite, do some fine tuning.
|
||||||
|
Other than that, don't worry about it: this all works silently and
|
||||||
|
effectively in the background.</p>
|
||||||
|
|
||||||
|
<div id="version">$Id$</div>
|
||||||
|
|
||||||
|
</body></html>
|
201
docs/enduser-uri-filter.html
Normal file
201
docs/enduser-uri-filter.html
Normal file
@@ -0,0 +1,201 @@
|
|||||||
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
||||||
|
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
|
||||||
|
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||||
|
<meta name="description" content="Tutorial for creating custom URI filters." />
|
||||||
|
<link rel="stylesheet" type="text/css" href="style.css" />
|
||||||
|
|
||||||
|
<title>URI Filters - HTML Purifier</title>
|
||||||
|
|
||||||
|
</head><body>
|
||||||
|
|
||||||
|
<h1>URI Filters</h1>
|
||||||
|
|
||||||
|
<div id="filing">Filed under End-User</div>
|
||||||
|
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||||
|
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
This is a quick and dirty document to get you on your way to writing
|
||||||
|
custom URI filters for your own URL filtering needs. Why would you
|
||||||
|
want to write a URI filter? If you need URIs your users put into
|
||||||
|
HTML to magically change into a different URI, this is
|
||||||
|
exactly what you need!
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h2>Creating the class</h2>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Any URI filter you make will be a subclass of <code>HTMLPurifier_URIFilter</code>.
|
||||||
|
The scaffolding is thus:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<pre>class HTMLPurifier_URIFilter_<strong>NameOfFilter</strong> extends HTMLPurifier_URIFilter
|
||||||
|
{
|
||||||
|
var $name = '<strong>NameOfFilter</strong>';
|
||||||
|
function prepare($config) {}
|
||||||
|
function filter(&$uri, $config, &$context) {}
|
||||||
|
}</pre>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Fill in the variable <code>$name</code> with the name of your filter, and
|
||||||
|
take a look at the two methods. <code>prepare()</code> is an initialization
|
||||||
|
method that is called only once, before any filtering has been done of the
|
||||||
|
HTML. Use it to perform any costly setup work that only needs to be done
|
||||||
|
once. <code>filter()</code> is the guts and innards of our filter:
|
||||||
|
it takes the URI and does whatever needs to be done to it.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
If you've worked with HTML Purifier, you'll recognize the <code>$config</code>
|
||||||
|
and <code>$context</code> parameters. On the other hand, <code>$uri</code>
|
||||||
|
is something unique to this section of the application: it's a
|
||||||
|
<code>HTMLPurifier_URI</code> object. The interface is thus:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<pre>class HTMLPurifier_URI
|
||||||
|
{
|
||||||
|
var $scheme, $userinfo, $host, $port, $path, $query, $fragment;
|
||||||
|
function HTMLPurifier_URI($scheme, $userinfo, $host, $port, $path, $query, $fragment);
|
||||||
|
function toString();
|
||||||
|
function copy();
|
||||||
|
function getSchemeObj($config, &$context);
|
||||||
|
function validate($config, &$context);
|
||||||
|
}</pre>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
The first three methods are fairly self-explanatory: you have a constructor,
|
||||||
|
a serializer, and a cloner. Generally, you won't be using them when
|
||||||
|
you are manipulating the URI objects themselves.
|
||||||
|
<code>getSchemeObj()</code> is a special purpose method that returns
|
||||||
|
a <code>HTMLPurifier_URIScheme</code> object corresponding to the specific
|
||||||
|
URI at hand. <code>validate()</code> performs general-purpose validation
|
||||||
|
on the internal components of a URI. Once again, you don't need to
|
||||||
|
worry about these: they've already been handled for you.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h2>URI format</h2>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
As a URIFilter, we're interested in the member variables of the URI object.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<table class="quick"><tbody>
|
||||||
|
<tr><th>Scheme</th> <td>The protocol for identifying (and possibly locating) a resource (http, ftp, https)</td></tr>
|
||||||
|
<tr><th>Userinfo</th> <td>User information such as a username (bob)</td></tr>
|
||||||
|
<tr><th>Host</th> <td>Domain name or IP address of the server (example.com, 127.0.0.1)</td></tr>
|
||||||
|
<tr><th>Port</th> <td>Network port number for the server (80, 12345)</td></tr>
|
||||||
|
<tr><th>Path</th> <td>Data that identifies the resource, possibly hierarchical (/path/to, ed@example.com)</td></tr>
|
||||||
|
<tr><th>Query</th> <td>String of information to be interpreted by the resource (?q=search-term)</td></tr>
|
||||||
|
<tr><th>Fragment</th> <td>Additional information for the resource after retrieval (#bookmark)</td></tr>
|
||||||
|
</tbody></table>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Because the URI is presented to us in this form, and not
|
||||||
|
<code>http://bob@example.com:8080/foo.php?q=string#hash</code>, it saves us
|
||||||
|
a lot of trouble in having to parse the URI every time we want to filter
|
||||||
|
it. For the record, the above URI has the following components:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<table class="quick"><tbody>
|
||||||
|
<tr><th>Scheme</th> <td>http</td></tr>
|
||||||
|
<tr><th>Userinfo</th> <td>bob</td></tr>
|
||||||
|
<tr><th>Host</th> <td>example.com</td></tr>
|
||||||
|
<tr><th>Port</th> <td>8080</td></tr>
|
||||||
|
<tr><th>Path</th> <td>/foo.php</td></tr>
|
||||||
|
<tr><th>Query</th> <td>q=string</td></tr>
|
||||||
|
<tr><th>Fragment</th> <td>hash</td></tr>
|
||||||
|
</tbody></table>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Note that there is no question mark or octothorpe in the query or
|
||||||
|
fragment: these get removed during parsing.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
With this information, you can get straight to implementing your
|
||||||
|
<code>filter()</code> method. But one more thing...
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h2>Return value: Boolean, not URI</h2>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
You may have noticed that the URI is being passed in by reference.
|
||||||
|
This means that whatever changes you make to it, those changes will
|
||||||
|
be reflected in the URI object the caller had. <strong>Do not
|
||||||
|
return the URI object: it is unnecessary and will cause bugs.</strong>
|
||||||
|
Instead, return a boolean value, true if the filtering was successful,
|
||||||
|
or false if the URI is beyond repair and needs to be axed.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Let's suppose I wanted to write a filter that de-internationalized domain
|
||||||
|
names by converting them to <a href="http://en.wikipedia.org/wiki/Punycode">Punycode</a>.
|
||||||
|
Assuming that <code>punycode_encode($input)</code> converts <code>$input</code> to
|
||||||
|
Punycode and returns <code>false</code> on failure:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<pre>class HTMLPurifier_URIFilter_ConvertIDNToPunycode extends HTMLPurifier_URIFilter
|
||||||
|
{
|
||||||
|
var $name = 'ConvertIDNToPunycode';
|
||||||
|
function filter(&$uri, $config, &$context) {
|
||||||
|
if (is_null($uri->host)) return true;
|
||||||
|
if ($uri->host == utf8_decode($uri->host)) {
|
||||||
|
// is ASCII, abort
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
$host = punycode_encode($uri->host);
|
||||||
|
if ($host === false) return false;
|
||||||
|
$uri->host = $host;
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}</pre>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Notice I did not <code>return $uri;</code>.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h2>Activating your filter</h2>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Having a filter is all well and good, but you need to tell HTML Purifier
|
||||||
|
to use it. Fortunately, this part's simple:
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<pre>$uri =& $config->getDefinition('URI');
|
||||||
|
$uri->addFilter(new HTMLPurifier_URIFilter_<strong>NameOfFilter</strong>());</pre>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
If you want to be really fancy, you can define a configuration directive
|
||||||
|
for your filter and have HTML Purifier automatically manage whether or
|
||||||
|
not your filter gets loaded (this is how internal filters manage
|
||||||
|
things):
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<pre>HTMLPurifier_ConfigSchema::define(
|
||||||
|
'URI', '<strong>NameOfFilter</strong>', false, 'bool',
|
||||||
|
'<strong>What your filter does.</strong>'
|
||||||
|
);
|
||||||
|
$uri =& $config->getDefinition('URI', true);
|
||||||
|
$uri->registerFilter(new HTMLPurifier_URIFilter_<strong>NameOfFilter</strong>());
|
||||||
|
</pre>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Now, your filter will only be called when %URI.<strong>NameOfFilter</strong>
|
||||||
|
is set to true.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<h2>Examples</h2>
|
||||||
|
|
||||||
|
<p>
|
||||||
|
Check the
|
||||||
|
<a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/URIFilter/">URIFilter</a>
|
||||||
|
directory for more implementation examples, and see <a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/docs/proposal-new-directives.txt">the
|
||||||
|
new directives proposal document</a> for ideas on what could be implemented
|
||||||
|
as a filter.
|
||||||
|
</p>
|
||||||
|
|
||||||
|
<div id="version">$Id$</div>
|
||||||
|
|
||||||
|
</body></html>
|
1047
docs/enduser-utf8.html
Normal file
1047
docs/enduser-utf8.html
Normal file
File diff suppressed because it is too large
Load Diff
@@ -15,6 +15,7 @@
|
|||||||
|
|
||||||
<div id="filing">Filed under End-User</div>
|
<div id="filing">Filed under End-User</div>
|
||||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||||
|
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||||
|
|
||||||
<p>Clients like their YouTube videos. It gives them a warm fuzzy feeling when
|
<p>Clients like their YouTube videos. It gives them a warm fuzzy feeling when
|
||||||
they see a neat little embedded video player on their websites that can play
|
they see a neat little embedded video player on their websites that can play
|
||||||
@@ -36,7 +37,7 @@ from a specific website, it probably is okay. If no amount of pleading will
|
|||||||
convince the people upstairs that they should just settle with just linking
|
convince the people upstairs that they should just settle with just linking
|
||||||
to their movies, you may find this technique very useful.</p>
|
to their movies, you may find this technique very useful.</p>
|
||||||
|
|
||||||
<h2>Sample</h2>
|
<h2>Looking in</h2>
|
||||||
|
|
||||||
<p>Below is custom code that allows users to embed
|
<p>Below is custom code that allows users to embed
|
||||||
YouTube videos. This is not favoritism: this trick can easily be adapted for
|
YouTube videos. This is not favoritism: this trick can easily be adapted for
|
||||||
@@ -68,55 +69,27 @@ into your documents. YouTube's code goes like this:</p>
|
|||||||
<p>What point 2 means is that if we have code like <code><span
|
<p>What point 2 means is that if we have code like <code><span
|
||||||
class="embed-youtube">AyPzM5WK8ys</span></code> your
|
class="embed-youtube">AyPzM5WK8ys</span></code> your
|
||||||
application can reconstruct the full object from this small snippet that
|
application can reconstruct the full object from this small snippet that
|
||||||
passes through HTML Purifier <em>unharmed</em>.</p>
|
passes through HTML Purifier <em>unharmed</em>.
|
||||||
|
<a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/Filter/YouTube.php">Show me the code!</a></p>
|
||||||
|
|
||||||
<pre>
|
<p>And the corresponding usage:</p>
|
||||||
<?php
|
|
||||||
|
|
||||||
class HTMLPurifierX_PreserveYouTube extends HTMLPurifier
|
<pre><?php
|
||||||
{
|
// assuming $purifier is an instance of HTMLPurifier
|
||||||
function purify($html, $config = null) {
|
require_once 'HTMLPurifier/Filter/YouTube.php';
|
||||||
$pre_regex = '#<object[^>]+>.+?'.
|
$purifier->addFilter(new HTMLPurifier_Filter_YouTube());
|
||||||
'http://www.youtube.com/v/([A-Za-z0-9]+).+?</object>#';
|
?></pre>
|
||||||
$pre_replace = '<span class="youtube-embed">\1</span>';
|
|
||||||
$html = preg_replace($pre_regex, $pre_replace, $html);
|
|
||||||
$html = parent::purify($html, $config);
|
|
||||||
$post_regex = '#<span class="youtube-embed">([A-Za-z0-9]+)</span>#';
|
|
||||||
$post_replace = '<object width="425" height="350" '.
|
|
||||||
'data="http://www.youtube.com/v/\1">'.
|
|
||||||
'<param name="movie" value="http://www.youtube.com/v/\1"></param>'.
|
|
||||||
'<param name="wmode" value="transparent"></param>'.
|
|
||||||
'<!--[if IE]>'.
|
|
||||||
'<embed src="http://www.youtube.com/v/\1"'.
|
|
||||||
'type="application/x-shockwave-flash"'.
|
|
||||||
'wmode="transparent" width="425" height="350" />'.
|
|
||||||
'<![endif]-->'.
|
|
||||||
'</object>';
|
|
||||||
$html = preg_replace($post_regex, $post_replace, $html);
|
|
||||||
return $html;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
$purifier = new HTMLPurifierX_PreserveYouTube();
|
<p>There is a bit going on in the two code snippets, so let's explain.</p>
|
||||||
$html_still_with_youtube = $purifier->purify($html_with_youtube);
|
|
||||||
|
|
||||||
?>
|
|
||||||
</pre>
|
|
||||||
|
|
||||||
<p>There is a bit going on here, so let's explain.</p>
|
|
||||||
|
|
||||||
<ol>
|
<ol>
|
||||||
<li>The class uses the prefix <code>HTMLPurifierX</code> because it's
|
<li>This is a Filter object, which intercepts the HTML that is
|
||||||
userspace code. Don't use <code>HTMLPurifier</code> in front of your
|
coming into and out of the purifier. You can add as many
|
||||||
class, since it might clobber another class in the library.</li>
|
filter objects as you like. <code>preFilter()</code>
|
||||||
<li>In order to keep the interface compatible, we've extended HTMLPurifier
|
processes the code before it gets purified, and <code>postFilter()</code>
|
||||||
into a new class that preserves the YouTube videos. This means that
|
processes the code afterwards. So, we'll use <code>preFilter()</code> to
|
||||||
all you have to do is replace all instances of
|
replace the object tag with a <code>span</code>, and <code>postFilter()</code>
|
||||||
<code>new HTMLPurifier</code> to <code>new
|
to restore it.</li>
|
||||||
HTMLPurifierX_PreserveYouTube</code>. There's other ways to go about
|
|
||||||
doing this: if you were calling a function that wrapped HTML Purifier,
|
|
||||||
you could paste the PHP right there. If you wanted to be really
|
|
||||||
fancy, you could make a decorator for HTMLPurifier.</li>
|
|
||||||
<li>The first preg_replace call replaces any YouTube code users may have
|
<li>The first preg_replace call replaces any YouTube code users may have
|
||||||
embedded into the benign span tag. Span is used because it is inline,
|
embedded into the benign span tag. Span is used because it is inline,
|
||||||
and objects are inline too. We are very careful to be extremely
|
and objects are inline too. We are very careful to be extremely
|
||||||
@@ -164,16 +137,16 @@ it is important that you are cognizant of the risk.</p>
|
|||||||
|
|
||||||
<p>This should go without saying, but if you're going to adapt this code
|
<p>This should go without saying, but if you're going to adapt this code
|
||||||
for Google Video or the like, make sure you do it <em>right</em>. It's
|
for Google Video or the like, make sure you do it <em>right</em>. It's
|
||||||
extremely easy to allow a character too many in the final section and
|
extremely easy to allow a character too many in <code>postFilter()</code> and
|
||||||
suddenly you're introducing XSS into HTML Purifier's XSS free output. HTML
|
suddenly you're introducing XSS into HTML Purifier's XSS free output. HTML
|
||||||
Purifier may be well written, but it cannot guard against vulnerabilities
|
Purifier may be well written, but it cannot guard against vulnerabilities
|
||||||
introduced after it has finished.</p>
|
introduced after it has finished.</p>
|
||||||
|
|
||||||
<h2>Future plans</h2>
|
<h2>Help out!</h2>
|
||||||
|
|
||||||
<p>It would probably be a good idea if this code was added to the core
|
<p>If you write a filter for your favorite video destination (or anything
|
||||||
library. Look out for the inclusion of this into the core as a decorator
|
like that, for that matter), send it over and it might get included
|
||||||
or the like.</p>
|
with the core!</p>
|
||||||
|
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
@@ -2,14 +2,21 @@
|
|||||||
|
|
||||||
// This file demonstrates basic usage of HTMLPurifier.
|
// This file demonstrates basic usage of HTMLPurifier.
|
||||||
|
|
||||||
exit; // not to be called directly, it will fail fantastically!
|
// replace this with the path to the HTML Purifier library
|
||||||
|
require_once '../../library/HTMLPurifier.auto.php';
|
||||||
|
|
||||||
set_include_path('/path/to/htmlpurifier/library' . PATH_SEPARATOR . get_include_path());
|
$config = HTMLPurifier_Config::createDefault();
|
||||||
require_once 'HTMLPurifier.php';
|
|
||||||
|
|
||||||
$purifier = new HTMLPurifier();
|
// configuration goes here:
|
||||||
|
$config->set('Core', 'Encoding', 'UTF-8'); // replace with your encoding
|
||||||
|
$config->set('HTML', 'Doctype', 'XHTML 1.0 Transitional'); // replace with your doctype
|
||||||
|
|
||||||
|
$purifier = new HTMLPurifier($config);
|
||||||
|
|
||||||
|
// untrusted input HTML
|
||||||
$html = '<b>Simple and short';
|
$html = '<b>Simple and short';
|
||||||
|
|
||||||
$pure_html = $purifier->purify($html);
|
$pure_html = $purifier->purify($html);
|
||||||
|
|
||||||
?>
|
echo '<pre>' . htmlspecialchars($pure_html) . '</pre>';
|
||||||
|
|
||||||
|
@@ -1,136 +0,0 @@
|
|||||||
<?php
|
|
||||||
|
|
||||||
// using _REQUEST because we accept GET and POST requests
|
|
||||||
|
|
||||||
$content = empty($_REQUEST['xml']) ? 'text/html' : 'application/xhtml+xml';
|
|
||||||
header("Content-type:$content;charset=UTF-8");
|
|
||||||
|
|
||||||
// prevent PHP versions with shorttags from barfing
|
|
||||||
echo '<?xml version="1.0" encoding="UTF-8" ?>
|
|
||||||
';
|
|
||||||
|
|
||||||
function getFormMethod() {
|
|
||||||
return (isset($_REQUEST['post'])) ? 'post' : 'get';
|
|
||||||
}
|
|
||||||
|
|
||||||
if (empty($_REQUEST['strict'])) {
|
|
||||||
?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
|
||||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
|
||||||
<?php
|
|
||||||
} else {
|
|
||||||
?>
|
|
||||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
|
||||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
|
||||||
<?php
|
|
||||||
}
|
|
||||||
?>
|
|
||||||
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
|
|
||||||
<head>
|
|
||||||
<title>HTML Purifier Live Demo</title>
|
|
||||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
|
||||||
</head>
|
|
||||||
<body>
|
|
||||||
<h1>HTML Purifier Live Demo</h1>
|
|
||||||
<?php
|
|
||||||
|
|
||||||
require_once '../../library/HTMLPurifier.auto.php';
|
|
||||||
|
|
||||||
if (!empty($_REQUEST['html'])) { // start result
|
|
||||||
|
|
||||||
if (strlen($_REQUEST['html']) > 50000) {
|
|
||||||
?>
|
|
||||||
<p>Request exceeds maximum allowed text size of 50kb.</p>
|
|
||||||
<?php
|
|
||||||
} else { // start main processing
|
|
||||||
|
|
||||||
$html = get_magic_quotes_gpc() ? stripslashes($_REQUEST['html']) : $_REQUEST['html'];
|
|
||||||
|
|
||||||
$config = HTMLPurifier_Config::createDefault();
|
|
||||||
$config->set('Core', 'TidyFormat', !empty($_REQUEST['tidy']));
|
|
||||||
$config->set('HTML', 'Strict', !empty($_REQUEST['strict']));
|
|
||||||
$purifier = new HTMLPurifier($config);
|
|
||||||
$pure_html = $purifier->purify($html);
|
|
||||||
|
|
||||||
?>
|
|
||||||
<p>Here is your purified HTML:</p>
|
|
||||||
<div style="border:5px solid #CCC;margin:0 10%;padding:1em;">
|
|
||||||
<?php if(getFormMethod() == 'get') { ?>
|
|
||||||
<div style="float:right;">
|
|
||||||
<a href="http://validator.w3.org/check?uri=referer"><img
|
|
||||||
src="http://www.w3.org/Icons/valid-xhtml10"
|
|
||||||
alt="Valid XHTML 1.0 Transitional" height="31" width="88" style="border:0;" /></a>
|
|
||||||
</div>
|
|
||||||
<?php } ?>
|
|
||||||
<?php
|
|
||||||
|
|
||||||
echo $pure_html;
|
|
||||||
|
|
||||||
?>
|
|
||||||
<div style="clear:both;"></div>
|
|
||||||
</div>
|
|
||||||
<p>Here is the source code of the purified HTML:</p>
|
|
||||||
<pre><?php
|
|
||||||
|
|
||||||
echo htmlspecialchars($pure_html, ENT_COMPAT, 'UTF-8');
|
|
||||||
|
|
||||||
?></pre>
|
|
||||||
<?php
|
|
||||||
if (getFormMethod() == 'post') { // start POST validation notice
|
|
||||||
?>
|
|
||||||
<p>If you would like to validate the code with
|
|
||||||
<a href="http://validator.w3.org/#validate-by-input">W3C's
|
|
||||||
validator</a>, copy and paste the <em>entire</em> demo page's source.</p>
|
|
||||||
<?php
|
|
||||||
} // end POST validation notice
|
|
||||||
|
|
||||||
} // end main processing
|
|
||||||
|
|
||||||
// end result
|
|
||||||
} else {
|
|
||||||
|
|
||||||
?>
|
|
||||||
<p>Welcome to the live demo. Enter some HTML and see how HTML Purifier
|
|
||||||
will filter it.</p>
|
|
||||||
<?php
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
?>
|
|
||||||
<form id="filter" action="demo.php<?php
|
|
||||||
echo '?' . getFormMethod();
|
|
||||||
if (isset($_REQUEST['profile']) || isset($_REQUEST['XDEBUG_PROFILE'])) {
|
|
||||||
echo '&XDEBUG_PROFILE=1';
|
|
||||||
} ?>" method="<?php echo getFormMethod(); ?>">
|
|
||||||
<fieldset>
|
|
||||||
<legend>HTML Purifier Input (<?php echo getFormMethod(); ?>)</legend>
|
|
||||||
<textarea name="html" cols="60" rows="15"><?php
|
|
||||||
|
|
||||||
if (isset($html)) {
|
|
||||||
echo htmlspecialchars(
|
|
||||||
HTMLPurifier_Encoder::cleanUTF8($html), ENT_COMPAT, 'UTF-8');
|
|
||||||
}
|
|
||||||
?></textarea>
|
|
||||||
<?php if (getFormMethod() == 'get') { ?>
|
|
||||||
<p><strong>Warning:</strong> GET request method can only hold
|
|
||||||
8129 characters (probably less depending on your browser).
|
|
||||||
If you need to test anything
|
|
||||||
larger than that, try the <a href="demo.php?post">POST form</a>.</p>
|
|
||||||
<?php } ?>
|
|
||||||
<?php if (extension_loaded('tidy')) { ?>
|
|
||||||
<div>Nicely format output with Tidy? <input type="checkbox" value="1"
|
|
||||||
name="tidy"<?php if (!empty($_REQUEST['tidy'])) echo ' checked="checked"'; ?> /></div>
|
|
||||||
<?php } ?>
|
|
||||||
<div>XHTML 1.0 Strict output? <input type="checkbox" value="1"
|
|
||||||
name="strict"<?php if (!empty($_REQUEST['strict'])) echo ' checked="checked"'; ?> /></div>
|
|
||||||
<div>Serve as application/xhtml+xml? (not for IE) <input type="checkbox" value="1"
|
|
||||||
name="xml"<?php if (!empty($_REQUEST['xml'])) echo ' checked="checked"'; ?> /></div>
|
|
||||||
<div>
|
|
||||||
<input type="submit" value="Submit" name="submit" class="button" />
|
|
||||||
</div>
|
|
||||||
</fieldset>
|
|
||||||
</form>
|
|
||||||
<p>Return to <a href="http://hp.jpsband.org/">HTML Purifier's home page</a>.
|
|
||||||
Try the form in <a href="demo.php?get">GET</a> and <a href="demo.php?post">POST</a> request
|
|
||||||
flavors (GET is easy to validate with W3C, but POST allows larger inputs).</p>
|
|
||||||
</body>
|
|
||||||
</html>
|
|
6
docs/fixquotes.htc
Normal file
6
docs/fixquotes.htc
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
<public:attach event="oncontentready" onevent="init();" />
|
||||||
|
<script>
|
||||||
|
function init() {
|
||||||
|
element.innerHTML = '“'+element.innerHTML+'”';
|
||||||
|
}
|
||||||
|
</script>
|
@@ -13,7 +13,7 @@
|
|||||||
|
|
||||||
<h1>Documentation</h1>
|
<h1>Documentation</h1>
|
||||||
|
|
||||||
<p><strong>HTML Purifier</strong> has documentation for all types of people.
|
<p><strong><a href="http://htmlpurifier.org/">HTML Purifier</a></strong> has documentation for all types of people.
|
||||||
Here is an index of all of them.</p>
|
Here is an index of all of them.</p>
|
||||||
|
|
||||||
<h2>End-user</h2>
|
<h2>End-user</h2>
|
||||||
@@ -28,6 +28,21 @@ information for casual developers using HTML Purifier.</p>
|
|||||||
<dt><a href="enduser-youtube.html">Embedding YouTube videos</a></dt>
|
<dt><a href="enduser-youtube.html">Embedding YouTube videos</a></dt>
|
||||||
<dd>Explains how to safely allow the embedding of flash from trusted sites.</dd>
|
<dd>Explains how to safely allow the embedding of flash from trusted sites.</dd>
|
||||||
|
|
||||||
|
<dt><a href="enduser-slow.html">Speeding up HTML Purifier</a></dt>
|
||||||
|
<dd>Explains how to speed up HTML Purifier through caching or inbound filtering.</dd>
|
||||||
|
|
||||||
|
<dt><a href="enduser-utf8.html">UTF-8: The Secret of Character Encoding</a></dt>
|
||||||
|
<dd>Describes the rationale for using UTF-8, the ramifications otherwise, and how to make the switch.</dd>
|
||||||
|
|
||||||
|
<dt><a href="enduser-tidy.html">Tidy</a></dt>
|
||||||
|
<dd>Tutorial for tweaking HTML Purifier's Tidy-like behavior.</dd>
|
||||||
|
|
||||||
|
<dt><a href="enduser-customize.html">Customize</a></dt>
|
||||||
|
<dd>Tutorial for customizing HTML Purifier's tag and attribute sets.</dd>
|
||||||
|
|
||||||
|
<dt><a href="enduser-uri-filter.html">URI Filters</a></dt>
|
||||||
|
<dd>Tutorial for creating custom URI filters.</dd>
|
||||||
|
|
||||||
</dl>
|
</dl>
|
||||||
|
|
||||||
<h2>Development</h2>
|
<h2>Development</h2>
|
||||||
@@ -36,9 +51,6 @@ conventions.</p>
|
|||||||
|
|
||||||
<dl>
|
<dl>
|
||||||
|
|
||||||
<dt><a href="dev-code-quality.html">Code Quality Issues</a></dt>
|
|
||||||
<dd>Discusses code quality issues and places that need to be refactored.</dd>
|
|
||||||
|
|
||||||
<dt><a href="dev-progress.html">Implementation Progress</a></dt>
|
<dt><a href="dev-progress.html">Implementation Progress</a></dt>
|
||||||
<dd>Tables detailing HTML element and CSS property implementation coverage.</dd>
|
<dd>Tables detailing HTML element and CSS property implementation coverage.</dd>
|
||||||
|
|
||||||
@@ -48,6 +60,10 @@ conventions.</p>
|
|||||||
<dt><a href="dev-optimization.html">Optimization</a></dt>
|
<dt><a href="dev-optimization.html">Optimization</a></dt>
|
||||||
<dd>Discusses possible methods of optimizing HTML Purifier.</dd>
|
<dd>Discusses possible methods of optimizing HTML Purifier.</dd>
|
||||||
|
|
||||||
|
<dt><a href="dev-advanced-api.html">Advanced API</a></dt>
|
||||||
|
<dd>Functional specification for HTML Purifier's advanced API for defining
|
||||||
|
custom filtering behavior.</dd>
|
||||||
|
|
||||||
</dl>
|
</dl>
|
||||||
|
|
||||||
<h2>Proposals</h2>
|
<h2>Proposals</h2>
|
||||||
@@ -95,6 +111,12 @@ the code. They may be upgraded to HTML files or stay as TXT scratchpads.</p>
|
|||||||
<td>Common security issues that may still arise (half-baked).</td>
|
<td>Common security issues that may still arise (half-baked).</td>
|
||||||
</tr>
|
</tr>
|
||||||
|
|
||||||
|
<tr>
|
||||||
|
<td>Development</td>
|
||||||
|
<td><a href="enduser-code-quality.txt">Code Quality Issues</a></td>
|
||||||
|
<td>Enumerates code quality issues and places that need to be refactored.</td>
|
||||||
|
</tr>
|
||||||
|
|
||||||
<tr>
|
<tr>
|
||||||
<td>Proposal</td>
|
<td>Proposal</td>
|
||||||
<td><a href="proposal-filter-levels.txt">Filter levels</a></td>
|
<td><a href="proposal-filter-levels.txt">Filter levels</a></td>
|
||||||
@@ -115,8 +137,8 @@ the code. They may be upgraded to HTML files or stay as TXT scratchpads.</p>
|
|||||||
|
|
||||||
<tr>
|
<tr>
|
||||||
<td>Reference</td>
|
<td>Reference</td>
|
||||||
<td><a href="ref-loose-vs-strict.txt">Loose vs.Strict</a></td>
|
<td><a href="ref-content-models.txt">Handling Content Model Changes</a></td>
|
||||||
<td>Differences between HTML Strict and Transitional versions.</td>
|
<td>Discusses how to tidy up content model changes using custom ChildDef classes.</td>
|
||||||
</tr>
|
</tr>
|
||||||
|
|
||||||
<tr>
|
<tr>
|
||||||
@@ -127,14 +149,8 @@ the code. They may be upgraded to HTML files or stay as TXT scratchpads.</p>
|
|||||||
|
|
||||||
<tr>
|
<tr>
|
||||||
<td>Reference</td>
|
<td>Reference</td>
|
||||||
<td><a href="ref-strictness.txt">Strictness</a></td>
|
<td><a href="ref-html-modularization.txt">Modularization of HTMLDefinition</a></td>
|
||||||
<td>Short essay on how loose definition isn't really loose.</td>
|
<td>Provides a high-level overview of the concepts behind HTMLModules.</td>
|
||||||
</tr>
|
|
||||||
|
|
||||||
<tr>
|
|
||||||
<td>Reference</td>
|
|
||||||
<td><a href="ref-xhtml-1.1.txt">XHTML 1.1</a></td>
|
|
||||||
<td>What we'd have to do to support XHTML 1.1.</td>
|
|
||||||
</tr>
|
</tr>
|
||||||
|
|
||||||
<tr>
|
<tr>
|
||||||
@@ -149,4 +165,4 @@ the code. They may be upgraded to HTML files or stay as TXT scratchpads.</p>
|
|||||||
|
|
||||||
<div id="version">$Id$</div>
|
<div id="version">$Id$</div>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
@@ -15,6 +15,7 @@
|
|||||||
|
|
||||||
<div id="filing">Filed under Proposals</div>
|
<div id="filing">Filed under Proposals</div>
|
||||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||||
|
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||||
|
|
||||||
<p>Your website probably has a color-scheme.
|
<p>Your website probably has a color-scheme.
|
||||||
<span style="color:#090; background:#FFF;">Green on white</span>,
|
<span style="color:#090; background:#FFF;">Green on white</span>,
|
||||||
|
@@ -7,29 +7,15 @@ value is used for. This means decentralized configuration declarations that
|
|||||||
are nevertheless error checking and a centralized configuration object.
|
are nevertheless error checking and a centralized configuration object.
|
||||||
|
|
||||||
Directives are divided into namespaces, indicating the major portion of
|
Directives are divided into namespaces, indicating the major portion of
|
||||||
functionality they cover (although there may be overlaps. Please consult
|
functionality they cover (although there may be overlaps). Please consult
|
||||||
the documentation in ConfigDef for more information on these namespaces.
|
the documentation in ConfigDef for more information on these namespaces.
|
||||||
|
|
||||||
Since configuration is dependent on context, internal classes require a
|
Since configuration is dependent on context, internal classes require a
|
||||||
configuration object to be passed as a parameter. (They also require a
|
configuration object to be passed as a parameter. (They also require a
|
||||||
Context object).
|
Context object). A majority of classes do not need the config object,
|
||||||
|
but for those who do, it is a lifesaver.
|
||||||
|
|
||||||
In relation to HTMLDefinition and CSSDefinition, there is a special class
|
Definition objects are complex datatypes influenced by their respective
|
||||||
of directives that influence the *construction* of the Definition object.
|
directive namespaces (HTMLDefinition with HTML and CSSDefinition with CSS).
|
||||||
A standard call pattern would look like:
|
If any of these directives is updated, HTML Purifier forces the definition
|
||||||
|
to be regenerated.
|
||||||
1. Client calls Config->getHTMLDefinition()
|
|
||||||
2. Config calls HTMLDefinition->createNew(this)
|
|
||||||
3. HTMLDefinition constructs itself with base configuration
|
|
||||||
4. HTMLDefinition calls Config->get('HTMLDefinition')
|
|
||||||
5. Config returns array of directives that alter construction
|
|
||||||
6. HTMLDefinition performs operations and changes specified by directives
|
|
||||||
7. HTMLPurifier returns constructed definition
|
|
||||||
8. Config caches definition so it doesn't have to be generated again
|
|
||||||
9. Config returns definition
|
|
||||||
|
|
||||||
You could also override Config's copy of the definition with your own
|
|
||||||
custom copy, which OVERRIDES all directives. Only the base, vanilla copy
|
|
||||||
is the Singleton, the object actually interfaced with is an operated-upon
|
|
||||||
clone of that object. Also, if an update to the directives would update
|
|
||||||
the definition, you'd have to force reconstruction.
|
|
||||||
|
@@ -2,20 +2,16 @@
|
|||||||
Filter Levels
|
Filter Levels
|
||||||
When one size *does not* fit all
|
When one size *does not* fit all
|
||||||
|
|
||||||
The more I think about it, the less sense it makes for maintaining one huge
|
It makes little sense to constrain users to one set of HTML elements and
|
||||||
monolithic HTMLDefinition class. There's simply so much variation that
|
attributes and tell them that they are not allowed to mold this in
|
||||||
could go into this definition: the set of HTML good for blog entries is
|
any fashion. Many users demand to be able to custom-select which elements
|
||||||
definitely too large for HTML that would be allowed in blog comments. Going
|
and attributes they want. This is fine: because HTML Purifier keeps close
|
||||||
from Transitional to Strict requires changes to the definition.
|
track of what elements are safe to use, there is no way for them to
|
||||||
|
accidentally allow an XSS-able tag.
|
||||||
|
|
||||||
Allowing users to specify their own whitelists is one step (implemented, btw),
|
However, combing through the HTML spec to make your own whitelist can
|
||||||
but I have doubts on only doing this. Simply put, the typical programmer is too
|
be a daunting task. HTML Purifier ought to offer pre-canned filter levels
|
||||||
lazy to actually go through the trouble of investigating which tags, attributes
|
that amateur users can select based on what they think is their use-case.
|
||||||
and properties to allow. HTMLDefinition makes a big part of what HTMLPurifier
|
|
||||||
is.
|
|
||||||
|
|
||||||
The idea, then, is to setup fundamentally different set of definitions, which
|
|
||||||
can further be customized using simpler configuration options.
|
|
||||||
|
|
||||||
Here are some fuzzy levels you could set:
|
Here are some fuzzy levels you could set:
|
||||||
|
|
||||||
@@ -36,13 +32,17 @@ Here are some fuzzy levels you could set:
|
|||||||
|
|
||||||
One final note: when you start axing tags that are more commonly used, you
|
One final note: when you start axing tags that are more commonly used, you
|
||||||
run the risk of accidentally destroying user data, especially if the data
|
run the risk of accidentally destroying user data, especially if the data
|
||||||
is incoming from a WYSIWYG eidtor that hasn't been synced accordingly. This may
|
is incoming from a WYSIWYG editor that hasn't been synced accordingly. This may
|
||||||
make forbidden element to text transformations desirable (for example, images).
|
make forbidden element to text transformations desirable (for example, images).
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
== Element Risk Analysis ==
|
== Element Risk Analysis ==
|
||||||
|
|
||||||
|
Although none of the currently supported elements presents a security
|
||||||
|
threat per se, some can cause problems for page layouts or be
|
||||||
|
extremely complicated.
|
||||||
|
|
||||||
Legend:
|
Legend:
|
||||||
[danger level] - regular tags / uncommon tags ~ deprecated tags
|
[danger level] - regular tags / uncommon tags ~ deprecated tags
|
||||||
[danger level]* - rare tags
|
[danger level]* - rare tags
|
||||||
@@ -111,6 +111,10 @@ Partially presentational - table.cellpadding, table.cellspacing,
|
|||||||
|
|
||||||
== CSS Risk Analysis ==
|
== CSS Risk Analysis ==
|
||||||
|
|
||||||
|
Currently, there is no support for fine-grained "allowed CSS" specification,
|
||||||
|
mainly because I'm lazy, partially because no one has asked for it. However,
|
||||||
|
this will be added eventually.
|
||||||
|
|
||||||
There are certain CSS elements that are extremely useful inline, but then
|
There are certain CSS elements that are extremely useful inline, but then
|
||||||
as you get to more presentation oriented styling it may not always be
|
as you get to more presentation oriented styling it may not always be
|
||||||
appropriate to inline them.
|
appropriate to inline them.
|
||||||
@@ -123,6 +127,7 @@ any CSS properties that are not currently implemented (such as position).
|
|||||||
Dangerous, can go outside container - float
|
Dangerous, can go outside container - float
|
||||||
Easy to abuse - font-size, font-family (font), width
|
Easy to abuse - font-size, font-family (font), width
|
||||||
Colored - background-color (background), border-color (border), color
|
Colored - background-color (background), border-color (border), color
|
||||||
|
(see proposal-colors.html)
|
||||||
Dramatic - border, list-style-position (list-style), margin, padding,
|
Dramatic - border, list-style-position (list-style), margin, padding,
|
||||||
text-align, text-indent, text-transform, vertical-align, line-height
|
text-align, text-indent, text-transform, vertical-align, line-height
|
||||||
|
|
||||||
|
@@ -1,42 +1,6 @@
|
|||||||
We are going to model our I18N/L10N off of MediaWiki's system. Theirs is
|
We are going to model our I18N/L10N off of MediaWiki's system. Theirs is
|
||||||
obviously quite complicated, so we're going to simplify it a bit for our needs.
|
obviously quite complicated, so we're going to simplify it a bit for our needs.
|
||||||
|
|
||||||
== Structure ==
|
|
||||||
|
|
||||||
First, you have a Language object. This object contains all the localisable
|
|
||||||
message strings, as well as other important language-specific settings and
|
|
||||||
custom behavior (uppercasing, lowercasing, printing dates, formatting
|
|
||||||
numbers, etc.)
|
|
||||||
|
|
||||||
The object is constructed from two sources: subclassed versions of itself
|
|
||||||
(classes) and Message files (messages).
|
|
||||||
|
|
||||||
== General use ==
|
|
||||||
|
|
||||||
You load a language object by calling the Language::factory() function.
|
|
||||||
This function loads the class file for the object (taking in account fallback
|
|
||||||
languages by using the fallback language's object but overloading the
|
|
||||||
language key) and returns that object. Nothing else happens.
|
|
||||||
|
|
||||||
When a message/etc is requested, a lazy load initializor is called. Now the
|
|
||||||
real work starts. We're first going to take the scenario that the language
|
|
||||||
is not cached. The system loads the Messages file by:
|
|
||||||
|
|
||||||
require( $filename );
|
|
||||||
$cache = compact( self::$mLocalisationKeys );
|
|
||||||
|
|
||||||
...where self::$mLocalisationKeys is the name of variables that could be used
|
|
||||||
in the localization file. This lets you use things like:
|
|
||||||
|
|
||||||
$fallback = false;
|
|
||||||
$rtl = false;
|
|
||||||
|
|
||||||
...and easily siphon them into arrays.
|
|
||||||
|
|
||||||
Then, we load the $fallback language (if not set, English) to fill in the gaps in
|
|
||||||
the messages. There is specialized behavior for certain keys, as they can be
|
|
||||||
mergeable maps, lists or alias lists (not sure what the last one is).
|
|
||||||
|
|
||||||
== Caching ==
|
== Caching ==
|
||||||
|
|
||||||
MediaWiki has lots of caching mechanisms built in, which make the code somewhat
|
MediaWiki has lots of caching mechanisms built in, which make the code somewhat
|
||||||
|
@@ -2,9 +2,8 @@
|
|||||||
Configuration Ideas
|
Configuration Ideas
|
||||||
|
|
||||||
Here are some theoretical configuration ideas that we could implement some
|
Here are some theoretical configuration ideas that we could implement some
|
||||||
time. Note the naming convention: %Namespace.Directive
|
time. Note the naming convention: %Namespace.Directive. If you want one
|
||||||
|
implemented, give us a ring, and we'll move it up the priority chain.
|
||||||
%Attr.IDPrefix - prefix all ids with this
|
|
||||||
|
|
||||||
%Attr.RewriteFragments - if there's %Attr.IDPrefix we may want to transparently
|
%Attr.RewriteFragments - if there's %Attr.IDPrefix we may want to transparently
|
||||||
rewrite the URLs we parse too. However, we can only do it when it's a pure
|
rewrite the URLs we parse too. However, we can only do it when it's a pure
|
||||||
@@ -24,8 +23,6 @@ time. Note the naming convention: %Namespace.Directive
|
|||||||
%URI.AddRelNofollow - will add rel="nofollow" to all links, preventing the
|
%URI.AddRelNofollow - will add rel="nofollow" to all links, preventing the
|
||||||
spread of ill-gotten pagerank
|
spread of ill-gotten pagerank
|
||||||
|
|
||||||
%URI.RelativeToAbsolute - transforms all relative URIs to absolute form
|
|
||||||
|
|
||||||
%URI.HostBlacklistRegex - regexes that if matching the host are disallowed
|
%URI.HostBlacklistRegex - regexes that if matching the host are disallowed
|
||||||
%URI.HostWhitelist - domain names that are excluded from the host blacklist
|
%URI.HostWhitelist - domain names that are excluded from the host blacklist
|
||||||
%URI.HostPolicy - determines whether or not it's reject all and then whitelist
|
%URI.HostPolicy - determines whether or not it's reject all and then whitelist
|
||||||
|
48
docs/ref-content-models.txt
Normal file
48
docs/ref-content-models.txt
Normal file
@@ -0,0 +1,48 @@
|
|||||||
|
|
||||||
|
Handling Content Model Changes
|
||||||
|
|
||||||
|
|
||||||
|
1. Context
|
||||||
|
|
||||||
|
The distinction between Transitional and Strict document types is somewhat
|
||||||
|
of an anomaly in the lineage of XHTML document types (following 1.0,
|
||||||
|
doctypes do not have flavors: instead, modularization is used to let
|
||||||
|
document authors vary their elements). This transition is usually quite
|
||||||
|
straight-forward, as W3C usually deprecates attributes or elements, which
|
||||||
|
are quite easily handled using tag and attribute transforms.
|
||||||
|
|
||||||
|
However, for three elements, <blockquote>, <body> and <address>, W3C elected
|
||||||
|
to also change the content model. <blockquote> and <body> originally
|
||||||
|
accepted both inline and block elements, but in the strict doctype they
|
||||||
|
only allow block elements. With <address>, the situation is inverted:
|
||||||
|
<p> tags were now forbidden from appearing within this tag.
|
||||||
|
|
||||||
|
|
||||||
|
2. Current situation
|
||||||
|
|
||||||
|
Currently, HTML Purifier treats <blockquote> specially during Tidy mode
|
||||||
|
using a custom ChildDef class StrictBlockquote. StrictBlockquote
|
||||||
|
operates similarly to Required, except that when it encounters an inline
|
||||||
|
element, it will wrap it in a block tag (as specified by
|
||||||
|
%HTML.BlockWrapper, the default is <p>). The naming suggests it can
|
||||||
|
only be used for <blockquote>s, although it may be possible to
|
||||||
|
genericize it to work on other cases of this nature (this would be of
|
||||||
|
little practical application, as no other element in XHTML 1.1 or earlier
|
||||||
|
has a block-only content model).
|
||||||
|
|
||||||
|
Tidy currently contains no custom, lenient implementation for <address>.
|
||||||
|
If one were to be written, it would likely operate on the principle that,
|
||||||
|
when a <p> tag were to be encountered, it would be replaced with a
|
||||||
|
leading and trailing <br /> tag (the contents of <p>, being inline, are
|
||||||
|
not an issue). There is no prior work with this sort of operation.
|
||||||
|
|
||||||
|
|
||||||
|
3. Outside applicability
|
||||||
|
|
||||||
|
There are a number of other elements that contain restrictive content
|
||||||
|
models, such as <ul> or <span> (the latter is restrictive in that it
|
||||||
|
does not allow block elements). In the former case, an errant node
|
||||||
|
is eliminated completely, in the latter case, the text of the node
|
||||||
|
is preserved (as the parent node does allow PCDATA). Custom
|
||||||
|
content model implementations probably are not the best way of handling
|
||||||
|
these cases; node bubbling should be implemented instead.
|
28
docs/ref-css-length.txt
Normal file
28
docs/ref-css-length.txt
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
|
||||||
|
CSS Length Reference
|
||||||
|
To bound, or not to bound, that is the question
|
||||||
|
|
||||||
|
It's quite a reasonable request, really, and it's already been implemented
|
||||||
|
for HTML. That is, length bounding. It makes little sense to let users
|
||||||
|
define text blocks that have a font-size of 63,360 inches (that's a mile,
|
||||||
|
by the way) or a width of forty-fold the parent container.
|
||||||
|
|
||||||
|
But it's a little more complicated than that. There are multiple units
|
||||||
|
one can use, and we have to do a little unit conversion to get things working.
|
||||||
|
Here's what we have:
|
||||||
|
|
||||||
|
Absolute:
|
||||||
|
1 in ~= 2.54 cm
|
||||||
|
1 cm = 10 mm
|
||||||
|
1 pt = 1/72 in
|
||||||
|
1 pc = 12 pt
|
||||||
|
|
||||||
|
Relative:
|
||||||
|
1 em ~= 10.0667 px
|
||||||
|
1 ex ~= 0.5 em, though Mozilla Firefox says 1 ex = 6px
|
||||||
|
1 px ~= 1 pt
|
||||||
|
|
||||||
|
Watch out: font-sizes can also be nested to get successively larger
|
||||||
|
(although I do not relish having to keep track of context font-sizes,
|
||||||
|
this may be necessary, especially for some of the more advanced features
|
||||||
|
for preventing things like white on white).
|
@@ -15,6 +15,7 @@
|
|||||||
|
|
||||||
<div id="filing">Filed under Reference</div>
|
<div id="filing">Filed under Reference</div>
|
||||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||||
|
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||||
|
|
||||||
<p>Many thanks to the DevNetwork community for answering questions,
|
<p>Many thanks to the DevNetwork community for answering questions,
|
||||||
theorizing about design, and offering encouragement during
|
theorizing about design, and offering encouragement during
|
||||||
@@ -41,4 +42,4 @@ the development of this library in these forum threads:</p>
|
|||||||
|
|
||||||
<div id="version">$Id$</div>
|
<div id="version">$Id$</div>
|
||||||
</body>
|
</body>
|
||||||
</html>
|
</html>
|
||||||
|
164
docs/ref-html-modularization.txt
Normal file
164
docs/ref-html-modularization.txt
Normal file
@@ -0,0 +1,164 @@
|
|||||||
|
|
||||||
|
The Modularization of HTMLDefinition in HTML Purifier
|
||||||
|
|
||||||
|
Todo for XHTML 1.1 support <http://www.w3.org/TR/xhtml11/changes.html>
|
||||||
|
1. Support Ruby <http://www.w3.org/TR/2001/REC-ruby-20010531/>
|
||||||
|
|
||||||
|
HTML Purifier uses the modularization of XHTML
|
||||||
|
<http://www.w3.org/TR/xhtml-modularization/> to organize the internals
|
||||||
|
of HTMLDefinition into a more manageable and extensible fashion. Rather
|
||||||
|
than have one super-object, HTMLDefinition is split into HTMLModules,
|
||||||
|
each of which is responsible for defining elements, their attributes,
|
||||||
|
and other properties (for a more in-depth coverage, see
|
||||||
|
/library/HTMLPurifier/HTMLModule.php's docblock comments). These modules
|
||||||
|
are managed by HTMLModuleManager.
|
||||||
|
|
||||||
|
Modules that we don't support but could support are:
|
||||||
|
|
||||||
|
* 5.6. Table Modules
|
||||||
|
o 5.6.1. Basic Tables Module [?]
|
||||||
|
* 5.8. Client-side Image Map Module [?]
|
||||||
|
* 5.9. Server-side Image Map Module [?]
|
||||||
|
* 5.12. Target Module [?]
|
||||||
|
* 5.21. Name Identification Module [deprecated]
|
||||||
|
|
||||||
|
These modules would be implemented as "unsafe":
|
||||||
|
|
||||||
|
* 5.2. Core Modules
|
||||||
|
o 5.2.1. Structure Module
|
||||||
|
* 5.3. Applet Module
|
||||||
|
* 5.5. Forms Modules
|
||||||
|
o 5.5.1. Basic Forms Module
|
||||||
|
o 5.5.2. Forms Module
|
||||||
|
* 5.10. Object Module
|
||||||
|
* 5.11. Frames Module
|
||||||
|
* 5.13. Iframe Module
|
||||||
|
* 5.14. Intrinsic Events Module
|
||||||
|
* 5.15. Metainformation Module
|
||||||
|
* 5.16. Scripting Module
|
||||||
|
* 5.17. Style Sheet Module
|
||||||
|
* 5.19. Link Module
|
||||||
|
* 5.20. Base Module
|
||||||
|
|
||||||
|
We will not be using W3C's XML Schemas or DTDs directly due to the lack
|
||||||
|
of robust tools for handling them (the main problem is that all the
|
||||||
|
current parsers are usually PHP 5 only and solely-validating, not
|
||||||
|
correcting).
|
||||||
|
|
||||||
|
This system may be generalized and ported over for CSS.
|
||||||
|
|
||||||
|
== General Use-Case ==
|
||||||
|
|
||||||
|
The outwards API of HTMLDefinition has been largely preserved, not
|
||||||
|
only for backwards-compatibility but also by design. Instead,
|
||||||
|
HTMLDefinition can be retrieved "raw", in which it loads a structure
|
||||||
|
that closely resembles the modules of XHTML 1.1. This structure is very
|
||||||
|
dynamic, making it easy to make cascading changes to global content
|
||||||
|
sets or remove elements in bulk.
|
||||||
|
|
||||||
|
However, once HTML Purifier needs the actual definition, it retrieves
|
||||||
|
a finalized version of HTMLDefinition. The finalized definition involves
|
||||||
|
processing the modules into a form that is optimized for multiple
|
||||||
|
calls. This final version is immutable and, even if editable, would
|
||||||
|
be extremely hard to change.
|
||||||
|
|
||||||
|
So, some code taking advantage of the XHTML modularization may look
|
||||||
|
like this:
|
||||||
|
|
||||||
|
<?php
|
||||||
|
$config = HTMLPurifier_Config::createDefault();
|
||||||
|
$def =& $config->getHTMLDefinition(true); // reference to raw
|
||||||
|
$def->addElement('marquee', 'Block', 'Flow', 'Common');
|
||||||
|
$purifier = new HTMLPurifier($config);
|
||||||
|
$purifier->purify($html); // now the definition is finalized
|
||||||
|
?>
|
||||||
|
|
||||||
|
== Inclusions ==
|
||||||
|
|
||||||
|
One of the nice features of HTMLDefinition is that piggy-backing off
|
||||||
|
of global attribute and content sets is extremely easy to do.
|
||||||
|
|
||||||
|
=== Attributes ===
|
||||||
|
|
||||||
|
HTMLModule->elements[$element]->attr stores attribute information for the
|
||||||
|
specific attributes of $element. This is quite close to the final
|
||||||
|
API that HTML Purifier interfaces with, but there's an important
|
||||||
|
extra feature: attr may also contain an array with a member index zero.
|
||||||
|
|
||||||
|
<?php
|
||||||
|
HTMLModule->elements[$element]->attr[0] = array('AttrSet');
|
||||||
|
?>
|
||||||
|
|
||||||
|
Rather than map the attribute key 0 to an array (which should be
|
||||||
|
an AttrDef), it defines a number of attribute collections that should
|
||||||
|
be merged into this element's attribute array.
|
||||||
|
|
||||||
|
Furthermore, the value of an attribute key, attribute value pair need
|
||||||
|
not be a fully fledged AttrDef object. They can also be a string, which
|
||||||
|
signifies an AttrDef that is looked up from a centralized registry
|
||||||
|
AttrTypes. This allows more concise attribute definitions that look
|
||||||
|
more like W3C's declarations, as well as offering a centralized point
|
||||||
|
for modifying the behavior of one attribute type. And, of course, the
|
||||||
|
old method of manually instantiating an AttrDef still works.
|
||||||
|
|
||||||
|
=== Attribute Collections ===
|
||||||
|
|
||||||
|
Attribute collections are stored and processed in the AttrCollections
|
||||||
|
object, which is responsible for performing the inclusions signified
|
||||||
|
by the 0 index. These attribute collections, too, are mutable, by
|
||||||
|
using HTMLModule->attr_collections. You may add new attributes
|
||||||
|
to a collection or define an entirely new collection for your module's
|
||||||
|
use. Inclusions can also be cumulative.
|
||||||
|
|
||||||
|
Attribute collections allow us to get rid of so called "global attributes"
|
||||||
|
(which actually aren't so global).
|
||||||
|
|
||||||
|
=== Content Models and ChildDef ===
|
||||||
|
|
||||||
|
An implementation of the above-mentioned attributes and attribute
|
||||||
|
collections was applied to the ChildDef system. HTML Purifier uses
|
||||||
|
a proprietary system called ChildDef for performance and flexibility
|
||||||
|
reasons, but this does not line up very well with W3C's notion of
|
||||||
|
regexps for defining the allowed children of an element.
|
||||||
|
|
||||||
|
HTMLPurifier->elements[$element]->content_model and
|
||||||
|
HTMLPurifier->elements[$element]->content_model_type store information
|
||||||
|
about the final ChildDef that will be stored in
|
||||||
|
HTMLPurifier->elements[$element]->child (we use a different variable
|
||||||
|
because the two forms are sufficiently different).
|
||||||
|
|
||||||
|
$content_model is an abstract, string representation of the internal
|
||||||
|
state of ChildDef, while $content_model_type is a string identifier
|
||||||
|
of which ChildDef subclass to instantiate. $content_model is processed
|
||||||
|
by substituting all content set identifiers (capitalized element names)
|
||||||
|
with their contents. It is then parsed and passed into the appropriate
|
||||||
|
ChildDef class, as defined by the ContentSets->getChildDef() or the
|
||||||
|
custom fallback HTMLModule->getChildDef() for custom child definitions
|
||||||
|
not in the core.
|
||||||
|
|
||||||
|
You'll need to use these facilities if you plan on referencing a content
|
||||||
|
set like "Inline" or "Block", and using them is recommended even if you're
|
||||||
|
not due to their conciseness.
|
||||||
|
|
||||||
|
A few notes on $content_model: its structure can be as complicated
|
||||||
|
as you want, but the pipe symbol (|) is reserved for defining possible
|
||||||
|
choices, due to the content sets implementation. For example, a content
|
||||||
|
model that looks like:
|
||||||
|
|
||||||
|
"Inline -> Block -> a"
|
||||||
|
|
||||||
|
...when the Inline content set is defined as "span | b" and the Block
|
||||||
|
content set is defined as "div | blockquote", will expand into:
|
||||||
|
|
||||||
|
"span | b -> div | blockquote -> a"
|
||||||
|
|
||||||
|
The custom HTMLModule->getChildDef() function will need to be able to
|
||||||
|
then feed this information to ChildDef in a usable manner.
|
||||||
|
|
||||||
|
=== Content Sets ===
|
||||||
|
|
||||||
|
Content sets can be altered using HTMLModule->content_sets, an associative
|
||||||
|
array of content set names to content set contents. If the content set
|
||||||
|
already exists, your values are appended on to it (great for, say,
|
||||||
|
registering the font tag as an inline element), otherwise it is
|
||||||
|
created. They are substituted into content_model.
|
@@ -1,37 +0,0 @@
|
|||||||
|
|
||||||
Loose versus Strict
|
|
||||||
Changes from one doctype to another
|
|
||||||
|
|
||||||
There are changes. Wow, how insightful. Not everything changed is relevant
|
|
||||||
to HTML Purifier, though, so let's take a look:
|
|
||||||
|
|
||||||
== Major incompatibilities ==
|
|
||||||
|
|
||||||
[done] BLOCKQUOTE changes from 'flow' to 'block'
|
|
||||||
current behavior: inline inner contents should not be nuked, block-ify as necessary
|
|
||||||
[partially-done] U, S, STRIKE cut
|
|
||||||
current behavior: removed completely
|
|
||||||
projected behavior: replace with appropriate inline span + CSS
|
|
||||||
[done] ADDRESS from potpourri to Inline (removes p tags)
|
|
||||||
current behavior: block tags silently dropped
|
|
||||||
ideal behavior: replace tags with something like <br>. (not high priority)
|
|
||||||
|
|
||||||
== Things we can loosen up ==
|
|
||||||
|
|
||||||
Tags DIR, MENU, CENTER, ISINDEX, FONT, BASEFONT? allowed in loose
|
|
||||||
current behavior: transform to strict-valid forms
|
|
||||||
Attributes allowed in loose (see attribute transforms in 'dev-progress.html')
|
|
||||||
current behavior: projected to transform into strict-valid forms
|
|
||||||
|
|
||||||
== Periphery issues ==
|
|
||||||
|
|
||||||
A tag's attribute 'target' (for selecting frames) cut
|
|
||||||
current behavior: not allowed at all
|
|
||||||
projected behavior: use loose doctype if needed, needs valid values
|
|
||||||
[done] OL/LI tag's attribute 'start'/'value' (for renumbering lists) cut
|
|
||||||
current behavior: no substitute, just delete when in strict, allow in loose
|
|
||||||
Attribute 'name' deprecated in favor of 'id'
|
|
||||||
current behavior: dropped silently
|
|
||||||
projected behavior: create proper AttrTransform (currently not allowed at all)
|
|
||||||
[done] PRE tag allows SUB/SUP? (strict dtd comment vs syntax, loose disallows)
|
|
||||||
current behavior: disallow as usual
|
|
@@ -18,5 +18,7 @@ HTML Purifier context.
|
|||||||
|
|
||||||
<listing>, monospace pre-variant (extremely rare)
|
<listing>, monospace pre-variant (extremely rare)
|
||||||
<plaintext>, escapes all tags to the end of document
|
<plaintext>, escapes all tags to the end of document
|
||||||
<ruby> and friends, (more research needed, appears to be XHTML 1.1 markup)
|
|
||||||
<xmp>, monospace, replace with pre
|
<xmp>, monospace, replace with pre
|
||||||
|
|
||||||
|
These should be put into their own Tidy module, not loaded by default(?). These
|
||||||
|
all qualify as "lenient" transforms.
|
||||||
|
@@ -1,36 +0,0 @@
|
|||||||
|
|
||||||
Is HTML Purifier Strict or Transitional?
|
|
||||||
A little bit of helpful guidance
|
|
||||||
|
|
||||||
Despite the fact that HTML Purifier professes only to support transitional
|
|
||||||
HTML, it rejects a lot of attributes and elements that are actually, indeed,
|
|
||||||
valid. You can investigate progress.html to find out precisely what we
|
|
||||||
are doing to these *deprecated* attributes.
|
|
||||||
|
|
||||||
However, users have found that Strict HTML imposes some quite unreasonable
|
|
||||||
restrictions on certain things. The start and value attributes in ol and
|
|
||||||
li (respectively) perhaps are the most contested. There is currently no
|
|
||||||
widely supported browser method short of JavaScript that can replace these
|
|
||||||
two deprecated elements. HTML Purifier does not currently support them, but
|
|
||||||
it might behoove us to do so while our output is still transitional.
|
|
||||||
|
|
||||||
Fortunately, that's the only real bugger case. The others have near-perfect
|
|
||||||
CSS equivalents, and were presentational anyway. However, the other question
|
|
||||||
pops up: should we always convert these to the CSS forms when 1. the spec
|
|
||||||
allows them anyway and 2. older browsers support them better? After all, the
|
|
||||||
whole point about CSS is to separate styling from content, so inline styling
|
|
||||||
doesn't solve that problem.
|
|
||||||
|
|
||||||
It's an icky question, and we'll have to deal with it as more and more
|
|
||||||
transforms get implemented. As of right now, however, we currently support
|
|
||||||
these loose-only constructs in loose mode:
|
|
||||||
|
|
||||||
- <ul start="1">, <li value="1"> attributes
|
|
||||||
- <u>, <strike>, <s> tags
|
|
||||||
- flow children in <blockquote>
|
|
||||||
- mixed children in <address>
|
|
||||||
|
|
||||||
The changed child definitions as well as the ul.start li.value are the most
|
|
||||||
compelling reasons why loose should be used. We may want to offer disabling <u>,
|
|
||||||
<strike> and <s> by themselves.
|
|
||||||
|
|
@@ -2,8 +2,23 @@
|
|||||||
Web Hypertext Application Technology Working Group
|
Web Hypertext Application Technology Working Group
|
||||||
WHATWG
|
WHATWG
|
||||||
|
|
||||||
I don't think we need to worry about them. Untrusted users shouldn't be
|
== HTML 5 ==
|
||||||
submitting applications, eh? But if some interesting attribute pops up in
|
|
||||||
their spec, and might be worth supporting, stick it here.
|
|
||||||
|
|
||||||
(none so far, as you can see)
|
URL: http://www.whatwg.org/specs/web-apps/current-work/
|
||||||
|
|
||||||
|
HTML 5 defines a kaboodle of new elements and attributes, as well as
|
||||||
|
some well-defined, "quirks mode" HTML parsing. Although WHATWG professes
|
||||||
|
to be targeted towards web applications, many of their semantic additions
|
||||||
|
would be quite useful in regular documents. Eventually, HTML
|
||||||
|
Purifier will need to audit their lists and figure out what changes need
|
||||||
|
to be made. This process is complicated by the fact that the WHATWG
|
||||||
|
doesn't buy into W3C's modularization of XHTML 1.1: we may need
|
||||||
|
to remodularize HTML 5 (probably done by section name). No sense in
|
||||||
|
committing ourselves till the spec stabilizes, though.
|
||||||
|
|
||||||
|
Of more immediate interest, however, is the well-defined parsing
|
||||||
|
behavior that HTML 5 adds. While I have little interest in writing
|
||||||
|
another DirectLex parser, other parsers like ph5p
|
||||||
|
<http://jero.net/lab/ph5p/> can be adapted to DOMLex to support much more
|
||||||
|
flexible HTML parsing (a cool feature I've seen is how they resolve
|
||||||
|
<b>bold<i>both</b>italic</i>).
|
||||||
|
@@ -1,21 +0,0 @@
|
|||||||
|
|
||||||
Getting XHTML 1.1 Working
|
|
||||||
|
|
||||||
It's quite simple, according to <http://www.w3.org/TR/xhtml11/changes.html>
|
|
||||||
|
|
||||||
1. Scratch lang entirely in favor of xml:lang
|
|
||||||
2. Scratch name entirely in favor of id (partially-done)
|
|
||||||
3. Support Ruby <http://www.w3.org/TR/2001/REC-ruby-20010531/>
|
|
||||||
|
|
||||||
...but that's only an informative section. More things to do:
|
|
||||||
|
|
||||||
1. Scratch style attribute (it's deprecated)
|
|
||||||
2. Be module-aware (this might entail intelligent grouping in the definition
|
|
||||||
and allowing users to specifically remove certain modules (see 5))
|
|
||||||
3. Cross-reference minimal content models with existing DTDs and determine
|
|
||||||
changes (todo)
|
|
||||||
4. Watch out for the Legacy Module
|
|
||||||
<http://www.w3.org/TR/2001/REC-xhtml-modularization-20010410/abstract_modules.html#s_legacymodule>
|
|
||||||
5. Let users specify their own custom modules
|
|
||||||
6. Study Modularization document
|
|
||||||
<http://www.w3.org/TR/2001/REC-xhtml-modularization-20010410/>
|
|
8
docs/specimens/LICENSE
Normal file
8
docs/specimens/LICENSE
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
Licensing of Specimens
|
||||||
|
|
||||||
|
Some files in this directory have different licenses:
|
||||||
|
|
||||||
|
windows-live-mail-desktop-beta.html - donated by laacz, public domain
|
||||||
|
img.png - LGPL, from <http://commons.wikimedia.org/wiki/Image:Pastille_chrome.png>
|
||||||
|
|
||||||
|
All other files are by me, and are licensed under LGPL.
|
165
docs/specimens/html-align-to-css.html
Normal file
165
docs/specimens/html-align-to-css.html
Normal file
@@ -0,0 +1,165 @@
|
|||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||||
|
"http://www.w3.org/TR/html4/loose.dtd">
|
||||||
|
<html>
|
||||||
|
<head>
|
||||||
|
<title>HTML align attribute to CSS - HTML Purifier Specimen</title>
|
||||||
|
<style type="text/css">
|
||||||
|
div.container {position:relative;height:110px;}
|
||||||
|
div.container.legend .test {text-align:center;line-height:100px;}
|
||||||
|
div.test {width:100px;height:100px;border:1px solid black;
|
||||||
|
position:absolute;top:10px;}
|
||||||
|
div.test.html {left:10px;}
|
||||||
|
div.test.css {left:140px;}
|
||||||
|
table {background:#F00;}
|
||||||
|
img {border:1px solid #000;}
|
||||||
|
hr {width:50px;}
|
||||||
|
div.segment {width:250px; float:left; margin-top:1em;}
|
||||||
|
</style>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
|
||||||
|
<h1>HTML align attribute to CSS</h1>
|
||||||
|
|
||||||
|
<p>Inspect source for methodology.</p>
|
||||||
|
|
||||||
|
<div class="container legend">
|
||||||
|
<div class="test html">
|
||||||
|
HTML
|
||||||
|
</div>
|
||||||
|
<div class="test css">
|
||||||
|
CSS
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="segment">
|
||||||
|
|
||||||
|
<h2>table.align</h2>
|
||||||
|
|
||||||
|
<h3>left</h3>
|
||||||
|
<div class="container">
|
||||||
|
<div class="test html">
|
||||||
|
a<table align="left"><tr><td>O</td></tr></table>a
|
||||||
|
</div>
|
||||||
|
<div class="test css">
|
||||||
|
a<table style="float:left;"><tr><td>O</td></tr></table>a
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h3>center</h3>
|
||||||
|
<div class="container">
|
||||||
|
<div class="test html">
|
||||||
|
a<table align="center"><tr><td>O</td></tr></table>a
|
||||||
|
</div>
|
||||||
|
<div class="test css">
|
||||||
|
a<table style="margin-left:auto; margin-right:auto;"><tr><td>O</td></tr></table>a
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h3>right</h3>
|
||||||
|
<div class="container">
|
||||||
|
<div class="test html">
|
||||||
|
a<table align="right"><tr><td>O</td></tr></table>a
|
||||||
|
</div>
|
||||||
|
<div class="test css">
|
||||||
|
a<table style="float:right;"><tr><td>O</td></tr></table>a
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- ################################################################## -->
|
||||||
|
|
||||||
|
<div class="segment">
|
||||||
|
<h2>img.align</h2>
|
||||||
|
<h3>left</h3>
|
||||||
|
<div class="container">
|
||||||
|
<div class="test html">
|
||||||
|
a<img src="img.png" align="left">a
|
||||||
|
</div>
|
||||||
|
<div class="test css">
|
||||||
|
a<img src="img.png" style="float:left;">a
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h3>right</h3>
|
||||||
|
<div class="container">
|
||||||
|
<div class="test html">
|
||||||
|
a<img src="img.png" align="right">a
|
||||||
|
</div>
|
||||||
|
<div class="test css">
|
||||||
|
a<img src="img.png" style="float:right;">a
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h3>bottom</h3>
|
||||||
|
<div class="container">
|
||||||
|
<div class="test html">
|
||||||
|
a<img src="img.png" align="bottom">a
|
||||||
|
</div>
|
||||||
|
<div class="test css">
|
||||||
|
a<img src="img.png" style="vertical-align:baseline;">a
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h3>middle</h3>
|
||||||
|
<div class="container">
|
||||||
|
<div class="test html">
|
||||||
|
a<img src="img.png" align="middle">a
|
||||||
|
</div>
|
||||||
|
<div class="test css">
|
||||||
|
a<img src="img.png" style="vertical-align:middle;">a
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h3>top</h3>
|
||||||
|
<div class="container">
|
||||||
|
<div class="test html">
|
||||||
|
a<img src="img.png" align="top">a
|
||||||
|
</div>
|
||||||
|
<div class="test css">
|
||||||
|
a<img src="img.png" style="vertical-align:top;">a
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<!-- ################################################################## -->
|
||||||
|
|
||||||
|
<div class="segment">
|
||||||
|
|
||||||
|
<h2>hr.align</h2>
|
||||||
|
|
||||||
|
<h3>left</h3>
|
||||||
|
<div class="container">
|
||||||
|
<div class="test html">
|
||||||
|
<hr align="left" />
|
||||||
|
</div>
|
||||||
|
<div class="test css">
|
||||||
|
<hr style="margin-right:auto; margin-left:0; text-align:left;" />
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h3>center</h3>
|
||||||
|
<div class="container">
|
||||||
|
<div class="test html">
|
||||||
|
<hr align="center" />
|
||||||
|
</div>
|
||||||
|
<div class="test css">
|
||||||
|
<hr style="margin-right:auto; margin-left:auto; text-align:center;" />
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h3>right</h3>
|
||||||
|
<div class="container">
|
||||||
|
<div class="test html">
|
||||||
|
<hr align="right" />
|
||||||
|
</div>
|
||||||
|
<div class="test css">
|
||||||
|
<hr style="margin-right:0; margin-left:auto; text-align:right;" />
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</div>
|
||||||
|
|
||||||
|
</body>
|
||||||
|
</html>
|
BIN
docs/specimens/img.png
Normal file
BIN
docs/specimens/img.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 2.1 KiB |
74
docs/specimens/windows-live-mail-desktop-beta.html
Normal file
74
docs/specimens/windows-live-mail-desktop-beta.html
Normal file
@@ -0,0 +1,74 @@
|
|||||||
|
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
|
||||||
|
<HTML ChildAreas="4" xmlns:canvas><HEAD>
|
||||||
|
<META http-equiv=Content-Type content=text/html;charset=windows-1257>
|
||||||
|
<STYLE></STYLE>
|
||||||
|
|
||||||
|
<META content="MSHTML 6.00.6000.16414" name=GENERATOR></HEAD>
|
||||||
|
<BODY id=MailContainerBody
|
||||||
|
style="PADDING-RIGHT: 10px; PADDING-LEFT: 10px; FONT-SIZE: 10pt; COLOR: #000000; PADDING-TOP: 15px; FONT-FAMILY: Arial"
|
||||||
|
bgColor=#ff6600 leftMargin=0 background="" topMargin=0
|
||||||
|
name="Compose message area" acc_role="text" CanvasTabStop="false">
|
||||||
|
<DIV
|
||||||
|
style="BORDER-TOP: #dddddd 1px solid; FONT-SIZE: 10pt; WIDTH: 100%; MARGIN-RIGHT: 10px; PADDING-TOP: 5px; BORDER-BOTTOM: #dddddd 1px solid; FONT-FAMILY: Verdana; HEIGHT: 25px; BACKGROUND-COLOR: #ffffff"><NOBR><SPAN
|
||||||
|
title="View a slideshow of the pictures in this e-mail message."
|
||||||
|
style="PADDING-RIGHT: 20px"><A style="COLOR: #0088e4"
|
||||||
|
href="http://g.msn.com/5meen_us/171?path=/photomail/{6fc0065f-ffdd-4ca6-9a4c-cc5a93dc122f}&image=47D7B182CFEFB10!127&imagehi=47D7B182CFEFB10!125&CID=323550092004883216">Play
|
||||||
|
slideshow </A></SPAN><SPAN style="COLOR: #909090"><SPAN>|</SPAN><SPAN
|
||||||
|
style="PADDING-LEFT: 20px"> Download the highest quality version of a picture by
|
||||||
|
clicking the + above it </SPAN></SPAN></NOBR></DIV>
|
||||||
|
<DIV
|
||||||
|
style="PADDING-RIGHT: 5px; PADDING-LEFT: 7px; PADDING-BOTTOM: 2px; WIDTH: 100%; PADDING-TOP: 2px">
|
||||||
|
<OL>
|
||||||
|
<LI><IMG title="Angry smile emoticon"
|
||||||
|
style="FLOAT: none; MARGIN: 0px; POSITION: static" tabIndex=-1
|
||||||
|
alt="Angry smile emoticon" src="cid:49F0C856199E4D688D2D740680733D74@wc"
|
||||||
|
MSNNonUserImageOrEmoticon="true">Un ka <FONT style="BACKGROUND-COLOR: #800000"
|
||||||
|
color=#cc99ff><STRONG>Tev</STRONG></FONT> iet, un ko tu dari?
|
||||||
|
<LI>Aha!</LI></OL>
|
||||||
|
|
||||||
|
<UL>
|
||||||
|
<LI>Buletets
|
||||||
|
<LI>
|
||||||
|
<DIV align=justify><A title=http://laacz.lv/blog/
|
||||||
|
href="http://laacz.lv/blog/">http://laacz.lv/blog/</A> un <A
|
||||||
|
title=http://google.com/ href="http://google.com/">gugle</A></DIV>
|
||||||
|
<LI>Sarakstucitis</LI></UL></DIV><SPAN><SPAN xmlns:canvas="canvas-namespace-id"
|
||||||
|
layoutEmptyTextWellFont="Tahoma"><SPAN
|
||||||
|
style="MARGIN-BOTTOM: 15px; OVERFLOW: visible; HEIGHT: 16px"></SPAN><SPAN
|
||||||
|
style="MARGIN-BOTTOM: 25px; VERTICAL-ALIGN: top; OVERFLOW: visible; MARGIN-RIGHT: 25px; HEIGHT: 234px">
|
||||||
|
<TABLE style="DISPLAY: inline">
|
||||||
|
<TBODY>
|
||||||
|
<TR>
|
||||||
|
|
||||||
|
<TD>
|
||||||
|
<DIV
|
||||||
|
style="FONT-WEIGHT: bold; FONT-SIZE: 12pt; FONT-FAMILY: arial; TEXT-ALIGN: center"><A
|
||||||
|
id=HiresARef
|
||||||
|
title="Click here to view or download a high resolution version of this picture"
|
||||||
|
style="COLOR: #0088e4; TEXT-DECORATION: none"
|
||||||
|
href="http://byfiles.storage.msn.com/x1pMvt0I80jTgT6DuaCpEMbprX3nk3jNv_vjigxV_EYVSMyM_PKgEvDEUtuNhQC-F-23mTTcKyqx6eGaeK2e_wMJ0ikwpDdFntk4SY7pfJUv2g2Ck6R2S2vAA?download">+</A></DIV>
|
||||||
|
<DIV
|
||||||
|
title="Click here to view the full image using the online photo viewer."
|
||||||
|
style="DISPLAY: inline; OVERFLOW: hidden; WIDTH: 140px; HEIGHT: 140px"><A
|
||||||
|
href="http://g.msn.com/5meen_us/171?path=/photomail/{6fc0065f-ffdd-4ca6-9a4c-cc5a93dc122f}&image=47D7B182CFEFB10!127&imagehi=47D7B182CFEFB10!125&CID=323550092004883216"
|
||||||
|
border="0"><IMG
|
||||||
|
style="MARGIN-TOP: 15px; DISPLAY: inline-block; MARGIN-LEFT: 0px"
|
||||||
|
height=109 src="cid:006A71303B80404E9FB6184E55D6A446@wc" width=140
|
||||||
|
border=0></A></DIV></TD></TR>
|
||||||
|
<TR>
|
||||||
|
<TD>
|
||||||
|
<DIV
|
||||||
|
style="FONT-SIZE: 10pt; WIDTH: 140px; FONT-FAMILY: verdana; TEXT-ALIGN: center"><EM><STRONG>This
|
||||||
|
<U>is </U></STRONG><U>tit</U>le</EM> fo<STRONG>r <FONT
|
||||||
|
face="Arial Black">t<FONT color=#800000 size=7>h<U>i</U></FONT>s
|
||||||
|
</FONT>picture</STRONG></DIV></TD></TR></TBODY></TABLE></SPAN></SPAN></SPAN>
|
||||||
|
|
||||||
|
<DIV
|
||||||
|
style="PADDING-RIGHT: 5px; PADDING-LEFT: 7px; PADDING-BOTTOM: 2px; WIDTH: 100%; PADDING-TOP: 2px; HEIGHT: 50px">
|
||||||
|
<DIV> </DIV></DIV>
|
||||||
|
<DIV
|
||||||
|
style="BORDER-TOP: #dddddd 1px solid; FONT-SIZE: 10pt; MARGIN-BOTTOM: 10px; WIDTH: 100%; COLOR: #909090; MARGIN-RIGHT: 10px; PADDING-TOP: 9px; FONT-FAMILY: Verdana; HEIGHT: 42px; BACKGROUND-COLOR: #ffffff"><NOBR><SPAN
|
||||||
|
title="Join Windows Live to share photos using Windows Live Photo E-mail.">Online
|
||||||
|
pictures are available for 30 days. <A style="COLOR: #0088e4"
|
||||||
|
href="http://g.msn.com/5meen_us/175">Get Windows Live Mail desktop to create
|
||||||
|
your own photo e-mails. </A></SPAN></NOBR></DIV></BODY></HTML>
|
@@ -23,6 +23,9 @@ h4 {font-family:sans-serif; font-size:0.9em; font-weight:bold; }
|
|||||||
|
|
||||||
/* Marks off asides, discussions on why something is the way it is */
|
/* Marks off asides, discussions on why something is the way it is */
|
||||||
.aside {margin-left:2em; font-family:sans-serif; font-size:0.9em; }
|
.aside {margin-left:2em; font-family:sans-serif; font-size:0.9em; }
|
||||||
|
blockquote .label {font-weight:bold; font-size:1em; margin:0 0 .1em;
|
||||||
|
border-bottom:1px solid #CCC;}
|
||||||
|
.emphasis {font-weight:bold; text-align:center; font-size:1.3em;}
|
||||||
|
|
||||||
/* A regular table */
|
/* A regular table */
|
||||||
.table {border-collapse:collapse; border-bottom:2px solid #888; margin-left:2em; }
|
.table {border-collapse:collapse; border-bottom:2px solid #888; margin-left:2em; }
|
||||||
@@ -30,11 +33,42 @@ h4 {font-family:sans-serif; font-size:0.9em; font-weight:bold; }
|
|||||||
.table thead th:first-child {-moz-border-radius-topleft:1em;}
|
.table thead th:first-child {-moz-border-radius-topleft:1em;}
|
||||||
.table tbody td {border-bottom:1px solid #CCC; padding-right:0.6em;padding-left:0.6em;}
|
.table tbody td {border-bottom:1px solid #CCC; padding-right:0.6em;padding-left:0.6em;}
|
||||||
|
|
||||||
|
/* A quick table*/
|
||||||
|
table.quick tbody th {text-align:right; padding-right:1em;}
|
||||||
|
|
||||||
/* Category of the file */
|
/* Category of the file */
|
||||||
#filing {font-weight:bold; font-size:smaller; }
|
#filing {font-weight:bold; font-size:smaller; }
|
||||||
|
|
||||||
/* Contains, without exception, Return to index. */
|
/* Contains, without exception, Return to index. */
|
||||||
#index {font-size:smaller; }
|
#index {font-size:smaller; }
|
||||||
|
|
||||||
|
#home {font-size:smaller;}
|
||||||
|
|
||||||
/* Contains, without exception, $Id$, for SVN version info. */
|
/* Contains, without exception, $Id$, for SVN version info. */
|
||||||
#version {text-align:right; font-style:italic; margin:2em 0;}
|
#version {text-align:right; font-style:italic; margin:2em 0;}
|
||||||
|
|
||||||
|
#toc ol ol {list-style-type:lower-roman;}
|
||||||
|
#toc ol {list-style-type:decimal;}
|
||||||
|
#toc {list-style-type:upper-alpha;}
|
||||||
|
|
||||||
|
q {
|
||||||
|
behavior: url(fixquotes.htc); /* IE fix */
|
||||||
|
quotes: '\201C' '\201D' '\2018' '\2019';
|
||||||
|
}
|
||||||
|
q:before {
|
||||||
|
content: open-quote;
|
||||||
|
}
|
||||||
|
q:after {
|
||||||
|
content: close-quote;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Marks off implementation details interesting only to the person writing
|
||||||
|
the class described in the spec. */
|
||||||
|
.technical {margin-left:2em; }
|
||||||
|
.technical:before {content:"Technical note: "; font-weight:bold; color:#061; }
|
||||||
|
|
||||||
|
/* Marks off sections that are lacking. */
|
||||||
|
.fixme {margin-left:2em; }
|
||||||
|
.fixme:before {content:"Fix me: "; font-weight:bold; color:#C00; }
|
||||||
|
|
||||||
|
#applicability {margin: 1em 5%; font-style:italic;}
|
||||||
|
@@ -7,4 +7,3 @@
|
|||||||
set_include_path(dirname(__FILE__) . PATH_SEPARATOR . get_include_path() );
|
set_include_path(dirname(__FILE__) . PATH_SEPARATOR . get_include_path() );
|
||||||
require_once 'HTMLPurifier.php';
|
require_once 'HTMLPurifier.php';
|
||||||
|
|
||||||
?>
|
|
@@ -6,16 +6,15 @@
|
|||||||
* this is efficient for instances when you only use HTML Purifier
|
* this is efficient for instances when you only use HTML Purifier
|
||||||
* on a few of your pages, it murders bytecode caching. You still
|
* on a few of your pages, it murders bytecode caching. You still
|
||||||
* need to add HTML Purifier to your path.
|
* need to add HTML Purifier to your path.
|
||||||
|
* @note ''HTMLPurifier()'' is NOT the same as ''new HTMLPurifier()''
|
||||||
*/
|
*/
|
||||||
|
|
||||||
function HTMLPurifier($html, $config = null) {
|
function HTMLPurifier($html, $config = null) {
|
||||||
static $purifier = false;
|
static $purifier = false;
|
||||||
if (!$purifier) {
|
if (!$purifier) {
|
||||||
$init = true;
|
|
||||||
require_once 'HTMLPurifier.php';
|
require_once 'HTMLPurifier.php';
|
||||||
$purifier = new HTMLPurifier();
|
$purifier = new HTMLPurifier();
|
||||||
}
|
}
|
||||||
return $purifier->purify($html, $config);
|
return $purifier->purify($html, $config);
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@@ -22,7 +22,7 @@
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
/*
|
/*
|
||||||
HTML Purifier 1.3.2 - Standards Compliant HTML Filtering
|
HTML Purifier 2.1.1 - Standards Compliant HTML Filtering
|
||||||
Copyright (C) 2006 Edward Z. Yang
|
Copyright (C) 2006 Edward Z. Yang
|
||||||
|
|
||||||
This library is free software; you can redistribute it and/or
|
This library is free software; you can redistribute it and/or
|
||||||
@@ -40,9 +40,12 @@
|
|||||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
// constants are slow, but we'll make one exception
|
||||||
|
define('HTMLPURIFIER_PREFIX', dirname(__FILE__));
|
||||||
|
|
||||||
// almost every class has an undocumented dependency to these, so make sure
|
// almost every class has an undocumented dependency to these, so make sure
|
||||||
// they get included
|
// they get included
|
||||||
require_once 'HTMLPurifier/ConfigSchema.php';
|
require_once 'HTMLPurifier/ConfigSchema.php'; // important
|
||||||
require_once 'HTMLPurifier/Config.php';
|
require_once 'HTMLPurifier/Config.php';
|
||||||
require_once 'HTMLPurifier/Context.php';
|
require_once 'HTMLPurifier/Context.php';
|
||||||
|
|
||||||
@@ -51,6 +54,16 @@ require_once 'HTMLPurifier/Generator.php';
|
|||||||
require_once 'HTMLPurifier/Strategy/Core.php';
|
require_once 'HTMLPurifier/Strategy/Core.php';
|
||||||
require_once 'HTMLPurifier/Encoder.php';
|
require_once 'HTMLPurifier/Encoder.php';
|
||||||
|
|
||||||
|
require_once 'HTMLPurifier/ErrorCollector.php';
|
||||||
|
require_once 'HTMLPurifier/LanguageFactory.php';
|
||||||
|
|
||||||
|
HTMLPurifier_ConfigSchema::define(
|
||||||
|
'Core', 'CollectErrors', false, 'bool', '
|
||||||
|
Whether or not to collect errors found while filtering the document. This
|
||||||
|
is a useful way to give feedback to your users. CURRENTLY NOT IMPLEMENTED.
|
||||||
|
This directive has been available since 2.0.0.
|
||||||
|
');
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Main library execution class.
|
* Main library execution class.
|
||||||
*
|
*
|
||||||
@@ -64,11 +77,12 @@ require_once 'HTMLPurifier/Encoder.php';
|
|||||||
class HTMLPurifier
|
class HTMLPurifier
|
||||||
{
|
{
|
||||||
|
|
||||||
var $version = '1.3.2';
|
var $version = '2.1.1';
|
||||||
|
|
||||||
var $config;
|
var $config;
|
||||||
|
var $filters;
|
||||||
|
|
||||||
var $lexer, $strategy, $generator;
|
var $strategy, $generator;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Final HTMLPurifier_Context of last run purification. Might be an array.
|
* Final HTMLPurifier_Context of last run purification. Might be an array.
|
||||||
@@ -88,13 +102,19 @@ class HTMLPurifier
|
|||||||
|
|
||||||
$this->config = HTMLPurifier_Config::create($config);
|
$this->config = HTMLPurifier_Config::create($config);
|
||||||
|
|
||||||
$this->lexer = HTMLPurifier_Lexer::create();
|
|
||||||
$this->strategy = new HTMLPurifier_Strategy_Core();
|
$this->strategy = new HTMLPurifier_Strategy_Core();
|
||||||
$this->generator = new HTMLPurifier_Generator();
|
$this->generator = new HTMLPurifier_Generator();
|
||||||
$this->encoder = new HTMLPurifier_Encoder();
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Adds a filter to process the output. First come, first served.
|
||||||
|
* @param $filter HTMLPurifier_Filter object
|
||||||
|
*/
|
||||||
|
function addFilter($filter) {
|
||||||
|
$this->filters[] = $filter;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Filters an HTML snippet/document to be XSS-free and standards-compliant.
|
* Filters an HTML snippet/document to be XSS-free and standards-compliant.
|
||||||
*
|
*
|
||||||
@@ -109,8 +129,32 @@ class HTMLPurifier
|
|||||||
|
|
||||||
$config = $config ? HTMLPurifier_Config::create($config) : $this->config;
|
$config = $config ? HTMLPurifier_Config::create($config) : $this->config;
|
||||||
|
|
||||||
$context =& new HTMLPurifier_Context();
|
// implementation is partially environment dependent, partially
|
||||||
$html = $this->encoder->convertToUTF8($html, $config, $context);
|
// configuration dependent
|
||||||
|
$lexer = HTMLPurifier_Lexer::create($config);
|
||||||
|
|
||||||
|
$context = new HTMLPurifier_Context();
|
||||||
|
|
||||||
|
// our friendly neighborhood generator, all primed with configuration too!
|
||||||
|
$this->generator->generateFromTokens(array(), $config, $context);
|
||||||
|
$context->register('Generator', $this->generator);
|
||||||
|
|
||||||
|
// set up global context variables
|
||||||
|
if ($config->get('Core', 'CollectErrors')) {
|
||||||
|
// may get moved out if other facilities use it
|
||||||
|
$language_factory = HTMLPurifier_LanguageFactory::instance();
|
||||||
|
$language = $language_factory->create($config, $context);
|
||||||
|
$context->register('Locale', $language);
|
||||||
|
|
||||||
|
$error_collector = new HTMLPurifier_ErrorCollector($context);
|
||||||
|
$context->register('ErrorCollector', $error_collector);
|
||||||
|
}
|
||||||
|
|
||||||
|
$html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);
|
||||||
|
|
||||||
|
for ($i = 0, $size = count($this->filters); $i < $size; $i++) {
|
||||||
|
$html = $this->filters[$i]->preFilter($html, $config, $context);
|
||||||
|
}
|
||||||
|
|
||||||
// purified HTML
|
// purified HTML
|
||||||
$html =
|
$html =
|
||||||
@@ -118,7 +162,7 @@ class HTMLPurifier
|
|||||||
// list of tokens
|
// list of tokens
|
||||||
$this->strategy->execute(
|
$this->strategy->execute(
|
||||||
// list of un-purified tokens
|
// list of un-purified tokens
|
||||||
$this->lexer->tokenizeHTML(
|
$lexer->tokenizeHTML(
|
||||||
// un-purified HTML
|
// un-purified HTML
|
||||||
$html, $config, $context
|
$html, $config, $context
|
||||||
),
|
),
|
||||||
@@ -127,7 +171,11 @@ class HTMLPurifier
|
|||||||
$config, $context
|
$config, $context
|
||||||
);
|
);
|
||||||
|
|
||||||
$html = $this->encoder->convertFromUTF8($html, $config, $context);
|
for ($i = $size - 1; $i >= 0; $i--) {
|
||||||
|
$html = $this->filters[$i]->postFilter($html, $config, $context);
|
||||||
|
}
|
||||||
|
|
||||||
|
$html = HTMLPurifier_Encoder::convertFromUTF8($html, $config, $context);
|
||||||
$this->context =& $context;
|
$this->context =& $context;
|
||||||
return $html;
|
return $html;
|
||||||
}
|
}
|
||||||
@@ -148,7 +196,23 @@ class HTMLPurifier
|
|||||||
return $array_of_html;
|
return $array_of_html;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Singleton for enforcing just one HTML Purifier in your system
|
||||||
|
*/
|
||||||
|
static function &getInstance($prototype = null) {
|
||||||
|
static $htmlpurifier;
|
||||||
|
if (!$htmlpurifier || $prototype) {
|
||||||
|
if ($prototype instanceof HTMLPurifier) {
|
||||||
|
$htmlpurifier = $prototype;
|
||||||
|
} elseif ($prototype) {
|
||||||
|
$htmlpurifier = new HTMLPurifier($prototype);
|
||||||
|
} else {
|
||||||
|
$htmlpurifier = new HTMLPurifier();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return $htmlpurifier;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
129
library/HTMLPurifier/AttrCollections.php
Normal file
129
library/HTMLPurifier/AttrCollections.php
Normal file
@@ -0,0 +1,129 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
require_once 'HTMLPurifier/AttrTypes.php';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Defines common attribute collections that modules reference
|
||||||
|
*/
|
||||||
|
|
||||||
|
class HTMLPurifier_AttrCollections
|
||||||
|
{
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Associative array of attribute collections, indexed by name
|
||||||
|
*/
|
||||||
|
var $info = array();
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Performs all expansions on internal data for use by other inclusions
|
||||||
|
* It also collects all attribute collection extensions from
|
||||||
|
* modules
|
||||||
|
* @param $attr_types HTMLPurifier_AttrTypes instance
|
||||||
|
* @param $modules Hash array of HTMLPurifier_HTMLModule members
|
||||||
|
*/
|
||||||
|
function HTMLPurifier_AttrCollections($attr_types, $modules) {
|
||||||
|
// load extensions from the modules
|
||||||
|
foreach ($modules as $module) {
|
||||||
|
foreach ($module->attr_collections as $coll_i => $coll) {
|
||||||
|
if (!isset($this->info[$coll_i])) {
|
||||||
|
$this->info[$coll_i] = array();
|
||||||
|
}
|
||||||
|
foreach ($coll as $attr_i => $attr) {
|
||||||
|
if ($attr_i === 0 && isset($this->info[$coll_i][$attr_i])) {
|
||||||
|
// merge in includes
|
||||||
|
$this->info[$coll_i][$attr_i] = array_merge(
|
||||||
|
$this->info[$coll_i][$attr_i], $attr);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
$this->info[$coll_i][$attr_i] = $attr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// perform internal expansions and inclusions
|
||||||
|
foreach ($this->info as $name => $attr) {
|
||||||
|
// merge attribute collections that include others
|
||||||
|
$this->performInclusions($this->info[$name]);
|
||||||
|
// replace string identifiers with actual attribute objects
|
||||||
|
$this->expandIdentifiers($this->info[$name], $attr_types);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Takes a reference to an attribute associative array and performs
|
||||||
|
* all inclusions specified by the zero index.
|
||||||
|
* @param &$attr Reference to attribute array
|
||||||
|
*/
|
||||||
|
function performInclusions(&$attr) {
|
||||||
|
if (!isset($attr[0])) return;
|
||||||
|
$merge = $attr[0];
|
||||||
|
$seen = array(); // recursion guard
|
||||||
|
// loop through all the inclusions
|
||||||
|
for ($i = 0; isset($merge[$i]); $i++) {
|
||||||
|
if (isset($seen[$merge[$i]])) continue;
|
||||||
|
$seen[$merge[$i]] = true;
|
||||||
|
// foreach attribute of the inclusion, copy it over
|
||||||
|
if (!isset($this->info[$merge[$i]])) continue;
|
||||||
|
foreach ($this->info[$merge[$i]] as $key => $value) {
|
||||||
|
if (isset($attr[$key])) continue; // also catches more inclusions
|
||||||
|
$attr[$key] = $value;
|
||||||
|
}
|
||||||
|
if (isset($this->info[$merge[$i]][0])) {
|
||||||
|
// recursion
|
||||||
|
$merge = array_merge($merge, $this->info[$merge[$i]][0]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
unset($attr[0]);
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Expands all string identifiers in an attribute array by replacing
|
||||||
|
* them with the appropriate values inside HTMLPurifier_AttrTypes
|
||||||
|
* @param &$attr Reference to attribute array
|
||||||
|
* @param $attr_types HTMLPurifier_AttrTypes instance
|
||||||
|
*/
|
||||||
|
function expandIdentifiers(&$attr, $attr_types) {
|
||||||
|
|
||||||
|
// because foreach will process new elements we add, make sure we
|
||||||
|
// skip duplicates
|
||||||
|
$processed = array();
|
||||||
|
|
||||||
|
foreach ($attr as $def_i => $def) {
|
||||||
|
// skip inclusions
|
||||||
|
if ($def_i === 0) continue;
|
||||||
|
|
||||||
|
if (isset($processed[$def_i])) continue;
|
||||||
|
|
||||||
|
// determine whether or not attribute is required
|
||||||
|
if ($required = (strpos($def_i, '*') !== false)) {
|
||||||
|
// rename the definition
|
||||||
|
unset($attr[$def_i]);
|
||||||
|
$def_i = trim($def_i, '*');
|
||||||
|
$attr[$def_i] = $def;
|
||||||
|
}
|
||||||
|
|
||||||
|
$processed[$def_i] = true;
|
||||||
|
|
||||||
|
// if we've already got a literal object, move on
|
||||||
|
if (is_object($def)) {
|
||||||
|
// preserve previous required
|
||||||
|
$attr[$def_i]->required = ($required || $attr[$def_i]->required);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($def === false) {
|
||||||
|
unset($attr[$def_i]);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ($t = $attr_types->get($def)) {
|
||||||
|
$attr[$def_i] = $t;
|
||||||
|
$attr[$def_i]->required = $required;
|
||||||
|
} else {
|
||||||
|
unset($attr[$def_i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
@@ -14,11 +14,17 @@ class HTMLPurifier_AttrDef
|
|||||||
{
|
{
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Tells us whether or not an HTML attribute is minimized. Only the
|
* Tells us whether or not an HTML attribute is minimized. Has no
|
||||||
* boolean attribute vapourware would use this.
|
* meaning in other contexts.
|
||||||
*/
|
*/
|
||||||
var $minimized = false;
|
var $minimized = false;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Tells us whether or not an HTML attribute is required. Has no
|
||||||
|
* meaning in other contexts
|
||||||
|
*/
|
||||||
|
var $required = false;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Validates and cleans passed string according to a definition.
|
* Validates and cleans passed string according to a definition.
|
||||||
*
|
*
|
||||||
@@ -62,6 +68,19 @@ class HTMLPurifier_AttrDef
|
|||||||
$string = str_replace(array("\r", "\t"), ' ', $string);
|
$string = str_replace(array("\r", "\t"), ' ', $string);
|
||||||
return $string;
|
return $string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Factory method for creating this class from a string.
|
||||||
|
* @param $string String construction info
|
||||||
|
* @return Created AttrDef object corresponding to $string
|
||||||
|
* @public
|
||||||
|
*/
|
||||||
|
function make($string) {
|
||||||
|
// default implementation, return flyweight of this object
|
||||||
|
// if overloaded, it is *necessary* for you to clone the
|
||||||
|
// object (usually by instantiating a new copy) and return that
|
||||||
|
return $this;
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@@ -8,6 +8,11 @@ require_once 'HTMLPurifier/CSSDefinition.php';
|
|||||||
* @note We don't implement the whole CSS specification, so it might be
|
* @note We don't implement the whole CSS specification, so it might be
|
||||||
* difficult to reuse this component in the context of validating
|
* difficult to reuse this component in the context of validating
|
||||||
* actual stylesheet declarations.
|
* actual stylesheet declarations.
|
||||||
|
* @note If we were really serious about validating the CSS, we would
|
||||||
|
* tokenize the styles and then parse the tokens. Obviously, we
|
||||||
|
* are not doing that. Doing that could seriously harm performance,
|
||||||
|
* but would make these components a lot more viable for a CSS
|
||||||
|
* filtering solution.
|
||||||
*/
|
*/
|
||||||
class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
|
class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
|
||||||
{
|
{
|
||||||
@@ -20,6 +25,9 @@ class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
// we're going to break the spec and explode by semicolons.
|
// we're going to break the spec and explode by semicolons.
|
||||||
// This is because semicolon rarely appears in escaped form
|
// This is because semicolon rarely appears in escaped form
|
||||||
|
// Doing this is generally flaky but fast
|
||||||
|
// IT MIGHT APPEAR IN URIs, see HTMLPurifier_AttrDef_CSSURI
|
||||||
|
// for details
|
||||||
|
|
||||||
$declarations = explode(';', $css);
|
$declarations = explode(';', $css);
|
||||||
$propvalues = array();
|
$propvalues = array();
|
||||||
@@ -58,4 +66,3 @@ class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
86
library/HTMLPurifier/AttrDef/CSS/Background.php
Normal file
86
library/HTMLPurifier/AttrDef/CSS/Background.php
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
require_once 'HTMLPurifier/AttrDef.php';
|
||||||
|
require_once 'HTMLPurifier/CSSDefinition.php';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Validates shorthand CSS property background.
|
||||||
|
* @warning Does not support url tokens that have internal spaces.
|
||||||
|
*/
|
||||||
|
class HTMLPurifier_AttrDef_CSS_Background extends HTMLPurifier_AttrDef
{

    /**
     * Local copy of component validators, keyed by the full property
     * name ('background-color', 'background-image', ...).
     * @note See HTMLPurifier_AttrDef_CSS_Font::$info for a similar impl.
     */
    var $info;

    /**
     * @param $config Configuration object; the validators for the five
     *        background-* longhand properties are copied out of its
     *        CSS definition.
     */
    function HTMLPurifier_AttrDef_CSS_Background($config) {
        $def = $config->getCSSDefinition();
        $this->info['background-color']      = $def->info['background-color'];
        $this->info['background-image']      = $def->info['background-image'];
        $this->info['background-repeat']     = $def->info['background-repeat'];
        $this->info['background-attachment'] = $def->info['background-attachment'];
        $this->info['background-position']   = $def->info['background-position'];
    }

    /**
     * Validates a background shorthand value by offering each
     * space-separated token to the component validators in turn.
     * @param $string  Raw shorthand value
     * @param $config  Configuration object, passed through to components
     * @param $context Context object, passed through to components
     * @return Space-joined validated components in the order color,
     *         image, repeat, attachment, position; false if nothing
     *         valid was found.
     */
    function validate($string, $config, &$context) {

        // regular pre-processing
        $string = $this->parseCDATA($string);
        if ($string === '') return false;

        // assumes URI doesn't have spaces in it
        $bits = explode(' ', $string); // bits to process

        $caught = array();
        $caught['color']      = false;
        $caught['image']      = false;
        $caught['repeat']     = false;
        $caught['attachment'] = false;
        $caught['position']   = false;

        $i = 0; // number of catches

        foreach ($bits as $bit) {
            if ($bit === '') continue;

            // components are matched against the lowercased token...
            $lbit = strtolower($bit);

            // ...except that the body of a url() token must keep its
            // case (URI paths are case-sensitive). Only the function
            // name is normalized, so that 'URL(' is still offered as
            // 'url(' exactly as before.
            // NOTE(review): presumably background-image is backed by
            // HTMLPurifier_AttrDef_CSS_URI, which requires a literal
            // lowercase 'url(' prefix -- confirm in CSSDefinition.
            if (strncasecmp($bit, 'url(', 4) === 0) {
                $image_bit = 'url(' . substr($bit, 4);
            } else {
                $image_bit = $lbit;
            }

            foreach ($caught as $key => $status) {
                if ($key != 'position') {
                    // each non-position component may be caught once
                    if ($status !== false) continue;
                    $candidate = ($key == 'image') ? $image_bit : $lbit;
                    $r = $this->info['background-' . $key]->validate($candidate, $config, $context);
                } else {
                    // position is last in $caught, so it collects every
                    // token nothing else accepted; the collected string
                    // is validated as a whole after the loop
                    $r = $lbit;
                }
                if ($r === false) continue;
                if ($key == 'position') {
                    if ($caught[$key] === false) $caught[$key] = '';
                    $caught[$key] .= $r . ' ';
                } else {
                    $caught[$key] = $r;
                }
                $i++;
                break;
            }
        }

        if (!$i) return false;
        if ($caught['position'] !== false) {
            $caught['position'] = $this->info['background-position']->
                validate($caught['position'], $config, $context);
        }

        $ret = array();
        foreach ($caught as $value) {
            if ($value === false) continue;
            $ret[] = $value;
        }

        if (empty($ret)) return false;
        return implode(' ', $ret);

    }

}
|
||||||
|
|
129
library/HTMLPurifier/AttrDef/CSS/BackgroundPosition.php
Normal file
129
library/HTMLPurifier/AttrDef/CSS/BackgroundPosition.php
Normal file
@@ -0,0 +1,129 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
require_once 'HTMLPurifier/AttrDef.php';
|
||||||
|
require_once 'HTMLPurifier/AttrDef/CSS/Length.php';
|
||||||
|
require_once 'HTMLPurifier/AttrDef/CSS/Percentage.php';
|
||||||
|
|
||||||
|
/* W3C says:
|
||||||
|
[ // adjective and number must be in correct order, even if
|
||||||
|
// you could switch them without introducing ambiguity.
|
||||||
|
// some browsers support that syntax
|
||||||
|
[
|
||||||
|
<percentage> | <length> | left | center | right
|
||||||
|
]
|
||||||
|
[
|
||||||
|
<percentage> | <length> | top | center | bottom
|
||||||
|
]?
|
||||||
|
] |
|
||||||
|
[ // this signifies that the vertical and horizontal adjectives
|
||||||
|
// can be arbitrarily ordered, however, there can only be two,
|
||||||
|
// one of each, or none at all
|
||||||
|
[
|
||||||
|
left | center | right
|
||||||
|
] ||
|
||||||
|
[
|
||||||
|
top | center | bottom
|
||||||
|
]
|
||||||
|
]
|
||||||
|
top, left = 0%
|
||||||
|
center, (none) = 50%
|
||||||
|
bottom, right = 100%
|
||||||
|
*/
|
||||||
|
|
||||||
|
/* QuirksMode says:
|
||||||
|
keyword + length/percentage must be ordered correctly, as per W3C
|
||||||
|
|
||||||
|
Internet Explorer and Opera, however, support arbitrary ordering. We
|
||||||
|
should fix it up.
|
||||||
|
|
||||||
|
Minor issue though, not strictly necessary.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// control freaks may appreciate the ability to convert these to
|
||||||
|
// percentages or something, but it's not necessary
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Validates the value of background-position.
|
||||||
|
*/
|
||||||
|
class HTMLPurifier_AttrDef_CSS_BackgroundPosition extends HTMLPurifier_AttrDef
{

    /** Validator used to recognize length tokens. */
    var $length;

    /** Validator used to recognize percentage tokens. */
    var $percentage;

    function HTMLPurifier_AttrDef_CSS_BackgroundPosition() {
        $this->length     = new HTMLPurifier_AttrDef_CSS_Length();
        $this->percentage = new HTMLPurifier_AttrDef_CSS_Percentage();
    }

    /**
     * Sorts the space-separated tokens into keywords and measures and
     * rebuilds the value with the horizontal component first.
     * @param $string  Raw background-position value
     * @param $config  Configuration object, passed to sub-validators
     * @param $context Context object, passed to sub-validators
     * @return Normalized value of one or two components, or false if
     *         no token was recognized.
     */
    function validate($string, $config, &$context) {
        $tokens = explode(' ', $this->parseCDATA($string));

        // keyword => slot: horizontal, vertical or center
        $slot_of = array(
            'top'    => 'v',
            'bottom' => 'v',
            'left'   => 'h',
            'right'  => 'h',
            'center' => 'c'
        );

        // last keyword seen for each slot (false = none seen)
        $found = array('h' => false, 'v' => false, 'c' => false);
        // validated lengths/percentages in order of appearance
        $measures = array();
        // total number of recognitions
        $hits = 0;

        foreach ($tokens as $token) {
            if ($token === '') continue;

            // keyword?
            $lower = strtolower($token);
            if (isset($slot_of[$lower])) {
                $found[$slot_of[$lower]] = $lower;
                $hits++;
            }

            // length? (every token is tried, keyword or not)
            $as_length = $this->length->validate($token, $config, $context);
            if ($as_length !== false) {
                $measures[] = $as_length;
                $hits++;
            }

            // percentage?
            $as_percentage = $this->percentage->validate($token, $config, $context);
            if ($as_percentage !== false) {
                $measures[] = $as_percentage;
                $hits++;
            }
        }

        // no valid values were caught
        if ($hits == 0) return false;

        $ret = array();

        // first component: horizontal keyword, else a measure, else center
        if ($found['h']) {
            $ret[] = $found['h'];
        } else {
            if (count($measures)) {
                $ret[] = array_shift($measures);
            } elseif ($found['c']) {
                $ret[] = $found['c'];
                $found['c'] = false; // prevent re-use: center = center center
            }
        }

        // second component: vertical keyword, else a measure, else center
        if ($found['v']) {
            $ret[] = $found['v'];
        } else {
            if (count($measures)) {
                $ret[] = array_shift($measures);
            } elseif ($found['c']) {
                $ret[] = $found['c'];
            }
        }

        return empty($ret) ? false : implode(' ', $ret);
    }

}
|
||||||
|
|
@@ -5,7 +5,7 @@ require_once 'HTMLPurifier/AttrDef.php';
|
|||||||
/**
|
/**
|
||||||
* Validates the border property as defined by CSS.
|
* Validates the border property as defined by CSS.
|
||||||
*/
|
*/
|
||||||
class HTMLPurifier_AttrDef_Border extends HTMLPurifier_AttrDef
|
class HTMLPurifier_AttrDef_CSS_Border extends HTMLPurifier_AttrDef
|
||||||
{
|
{
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -13,7 +13,7 @@ class HTMLPurifier_AttrDef_Border extends HTMLPurifier_AttrDef
|
|||||||
*/
|
*/
|
||||||
var $info = array();
|
var $info = array();
|
||||||
|
|
||||||
function HTMLPurifier_AttrDef_Border($config) {
|
function HTMLPurifier_AttrDef_CSS_Border($config) {
|
||||||
$def = $config->getCSSDefinition();
|
$def = $config->getCSSDefinition();
|
||||||
$this->info['border-width'] = $def->info['border-width'];
|
$this->info['border-width'] = $def->info['border-width'];
|
||||||
$this->info['border-style'] = $def->info['border-style'];
|
$this->info['border-style'] = $def->info['border-style'];
|
||||||
@@ -42,4 +42,3 @@ class HTMLPurifier_AttrDef_Border extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@@ -2,43 +2,47 @@
|
|||||||
|
|
||||||
require_once 'HTMLPurifier/AttrDef.php';
|
require_once 'HTMLPurifier/AttrDef.php';
|
||||||
|
|
||||||
|
HTMLPurifier_ConfigSchema::define(
|
||||||
|
'Core', 'ColorKeywords', array(
|
||||||
|
'maroon' => '#800000',
|
||||||
|
'red' => '#FF0000',
|
||||||
|
'orange' => '#FFA500',
|
||||||
|
'yellow' => '#FFFF00',
|
||||||
|
'olive' => '#808000',
|
||||||
|
'purple' => '#800080',
|
||||||
|
'fuchsia' => '#FF00FF',
|
||||||
|
'white' => '#FFFFFF',
|
||||||
|
'lime' => '#00FF00',
|
||||||
|
'green' => '#008000',
|
||||||
|
'navy' => '#000080',
|
||||||
|
'blue' => '#0000FF',
|
||||||
|
'aqua' => '#00FFFF',
|
||||||
|
'teal' => '#008080',
|
||||||
|
'black' => '#000000',
|
||||||
|
'silver' => '#C0C0C0',
|
||||||
|
'gray' => '#808080'
|
||||||
|
), 'hash', '
|
||||||
|
Lookup array of color names to six digit hexadecimal number corresponding
|
||||||
|
to color, with preceding hash mark. Used when parsing colors.
|
||||||
|
This directive has been available since 2.0.0.
|
||||||
|
');
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Validates Color as defined by CSS.
|
* Validates Color as defined by CSS.
|
||||||
*/
|
*/
|
||||||
class HTMLPurifier_AttrDef_Color extends HTMLPurifier_AttrDef
|
class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef
|
||||||
{
|
{
|
||||||
|
|
||||||
/**
|
|
||||||
* Color keyword lookup table.
|
|
||||||
* @todo Extend it to include all usually allowed colors.
|
|
||||||
*/
|
|
||||||
var $colors = array(
|
|
||||||
'maroon' => '#800000',
|
|
||||||
'red' => '#F00',
|
|
||||||
'orange' => '#FFA500',
|
|
||||||
'yellow' => '#FF0',
|
|
||||||
'olive' => '#808000',
|
|
||||||
'purple' => '#800080',
|
|
||||||
'fuchsia' => '#F0F',
|
|
||||||
'white' => '#FFF',
|
|
||||||
'lime' => '#0F0',
|
|
||||||
'green' => '#008000',
|
|
||||||
'navy' => '#000080',
|
|
||||||
'blue' => '#00F',
|
|
||||||
'aqua' => '#0FF',
|
|
||||||
'teal' => '#008080',
|
|
||||||
'black' => '#000',
|
|
||||||
'silver' => '#C0C0C0',
|
|
||||||
'gray' => '#808080'
|
|
||||||
);
|
|
||||||
|
|
||||||
function validate($color, $config, &$context) {
|
function validate($color, $config, &$context) {
|
||||||
|
|
||||||
|
static $colors = null;
|
||||||
|
if ($colors === null) $colors = $config->get('Core', 'ColorKeywords');
|
||||||
|
|
||||||
$color = trim($color);
|
$color = trim($color);
|
||||||
if (!$color) return false;
|
if (!$color) return false;
|
||||||
|
|
||||||
$lower = strtolower($color);
|
$lower = strtolower($color);
|
||||||
if (isset($this->colors[$lower])) return $this->colors[$lower];
|
if (isset($colors[$lower])) return $colors[$lower];
|
||||||
|
|
||||||
if ($color[0] === '#') {
|
if ($color[0] === '#') {
|
||||||
// hexadecimal handling
|
// hexadecimal handling
|
||||||
@@ -94,4 +98,3 @@ class HTMLPurifier_AttrDef_Color extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@@ -9,7 +9,7 @@
|
|||||||
* especially useful for CSS values, which often are a choice between
|
* especially useful for CSS values, which often are a choice between
|
||||||
* an enumerated set of predefined values or a flexible data type.
|
* an enumerated set of predefined values or a flexible data type.
|
||||||
*/
|
*/
|
||||||
class HTMLPurifier_AttrDef_Composite extends HTMLPurifier_AttrDef
|
class HTMLPurifier_AttrDef_CSS_Composite extends HTMLPurifier_AttrDef
|
||||||
{
|
{
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -21,7 +21,7 @@ class HTMLPurifier_AttrDef_Composite extends HTMLPurifier_AttrDef
|
|||||||
/**
|
/**
|
||||||
* @param $defs List of HTMLPurifier_AttrDef objects
|
* @param $defs List of HTMLPurifier_AttrDef objects
|
||||||
*/
|
*/
|
||||||
function HTMLPurifier_AttrDef_Composite($defs) {
|
function HTMLPurifier_AttrDef_CSS_Composite($defs) {
|
||||||
$this->defs = $defs;
|
$this->defs = $defs;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -35,4 +35,3 @@ class HTMLPurifier_AttrDef_Composite extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@@ -5,7 +5,7 @@ require_once 'HTMLPurifier/AttrDef.php';
|
|||||||
/**
|
/**
|
||||||
* Validates shorthand CSS property font.
|
* Validates shorthand CSS property font.
|
||||||
*/
|
*/
|
||||||
class HTMLPurifier_AttrDef_Font extends HTMLPurifier_AttrDef
|
class HTMLPurifier_AttrDef_CSS_Font extends HTMLPurifier_AttrDef
|
||||||
{
|
{
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -18,19 +18,7 @@ class HTMLPurifier_AttrDef_Font extends HTMLPurifier_AttrDef
|
|||||||
*/
|
*/
|
||||||
var $info = array();
|
var $info = array();
|
||||||
|
|
||||||
/**
|
function HTMLPurifier_AttrDef_CSS_Font($config) {
|
||||||
* System font keywords.
|
|
||||||
*/
|
|
||||||
var $system_fonts = array(
|
|
||||||
'caption' => true,
|
|
||||||
'icon' => true,
|
|
||||||
'menu' => true,
|
|
||||||
'message-box' => true,
|
|
||||||
'small-caption' => true,
|
|
||||||
'status-bar' => true
|
|
||||||
);
|
|
||||||
|
|
||||||
function HTMLPurifier_AttrDef_Font($config) {
|
|
||||||
$def = $config->getCSSDefinition();
|
$def = $config->getCSSDefinition();
|
||||||
$this->info['font-style'] = $def->info['font-style'];
|
$this->info['font-style'] = $def->info['font-style'];
|
||||||
$this->info['font-variant'] = $def->info['font-variant'];
|
$this->info['font-variant'] = $def->info['font-variant'];
|
||||||
@@ -42,13 +30,22 @@ class HTMLPurifier_AttrDef_Font extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
function validate($string, $config, &$context) {
|
function validate($string, $config, &$context) {
|
||||||
|
|
||||||
|
static $system_fonts = array(
|
||||||
|
'caption' => true,
|
||||||
|
'icon' => true,
|
||||||
|
'menu' => true,
|
||||||
|
'message-box' => true,
|
||||||
|
'small-caption' => true,
|
||||||
|
'status-bar' => true
|
||||||
|
);
|
||||||
|
|
||||||
// regular pre-processing
|
// regular pre-processing
|
||||||
$string = $this->parseCDATA($string);
|
$string = $this->parseCDATA($string);
|
||||||
if ($string === '') return false;
|
if ($string === '') return false;
|
||||||
|
|
||||||
// check if it's one of the keywords
|
// check if it's one of the keywords
|
||||||
$lowercase_string = strtolower($string);
|
$lowercase_string = strtolower($string);
|
||||||
if (isset($this->system_fonts[$lowercase_string])) {
|
if (isset($system_fonts[$lowercase_string])) {
|
||||||
return $lowercase_string;
|
return $lowercase_string;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -151,4 +148,3 @@ class HTMLPurifier_AttrDef_Font extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@@ -7,22 +7,18 @@ require_once 'HTMLPurifier/AttrDef.php';
|
|||||||
/**
|
/**
|
||||||
* Validates a font family list according to CSS spec
|
* Validates a font family list according to CSS spec
|
||||||
*/
|
*/
|
||||||
class HTMLPurifier_AttrDef_FontFamily extends HTMLPurifier_AttrDef
|
class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
|
||||||
{
|
{
|
||||||
|
|
||||||
/**
|
|
||||||
* Generic font family keywords.
|
|
||||||
* @protected
|
|
||||||
*/
|
|
||||||
var $generic_names = array(
|
|
||||||
'serif' => true,
|
|
||||||
'sans-serif' => true,
|
|
||||||
'monospace' => true,
|
|
||||||
'fantasy' => true,
|
|
||||||
'cursive' => true
|
|
||||||
);
|
|
||||||
|
|
||||||
function validate($string, $config, &$context) {
|
function validate($string, $config, &$context) {
|
||||||
|
static $generic_names = array(
|
||||||
|
'serif' => true,
|
||||||
|
'sans-serif' => true,
|
||||||
|
'monospace' => true,
|
||||||
|
'fantasy' => true,
|
||||||
|
'cursive' => true
|
||||||
|
);
|
||||||
|
|
||||||
$string = $this->parseCDATA($string);
|
$string = $this->parseCDATA($string);
|
||||||
// assume that no font names contain commas in them
|
// assume that no font names contain commas in them
|
||||||
$fonts = explode(',', $string);
|
$fonts = explode(',', $string);
|
||||||
@@ -31,7 +27,7 @@ class HTMLPurifier_AttrDef_FontFamily extends HTMLPurifier_AttrDef
|
|||||||
$font = trim($font);
|
$font = trim($font);
|
||||||
if ($font === '') continue;
|
if ($font === '') continue;
|
||||||
// match a generic name
|
// match a generic name
|
||||||
if (isset($this->generic_names[$font])) {
|
if (isset($generic_names[$font])) {
|
||||||
$final .= $font . ', ';
|
$final .= $font . ', ';
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
@@ -42,19 +38,24 @@ class HTMLPurifier_AttrDef_FontFamily extends HTMLPurifier_AttrDef
|
|||||||
$quote = $font[0];
|
$quote = $font[0];
|
||||||
if ($font[$length - 1] !== $quote) continue;
|
if ($font[$length - 1] !== $quote) continue;
|
||||||
$font = substr($font, 1, $length - 2);
|
$font = substr($font, 1, $length - 2);
|
||||||
|
// double-backslash processing is buggy
|
||||||
|
$font = str_replace("\\$quote", $quote, $font); // de-escape quote
|
||||||
|
$font = str_replace("\\\n", "\n", $font); // de-escape newlines
|
||||||
}
|
}
|
||||||
// process font
|
// $font is a pure representation of the font name
|
||||||
|
|
||||||
if (ctype_alnum($font)) {
|
if (ctype_alnum($font)) {
|
||||||
// very simple font, allow it in unharmed
|
// very simple font, allow it in unharmed
|
||||||
$final .= $font . ', ';
|
$final .= $font . ', ';
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
$nospace = str_replace(array(' ', '.', '!'), '', $font);
|
|
||||||
if (ctype_alnum($nospace)) {
|
// complicated font, requires quoting
|
||||||
// font with spaces in it
|
|
||||||
$final .= "'$font', ";
|
// armor single quotes and new lines
|
||||||
continue;
|
$font = str_replace("'", "\\'", $font);
|
||||||
}
|
$font = str_replace("\n", "\\\n", $font);
|
||||||
|
$final .= "'$font', ";
|
||||||
}
|
}
|
||||||
$final = rtrim($final, ', ');
|
$final = rtrim($final, ', ');
|
||||||
if ($final === '') return false;
|
if ($final === '') return false;
|
||||||
@@ -63,4 +64,3 @@ class HTMLPurifier_AttrDef_FontFamily extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@@ -1,13 +1,12 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
require_once 'HTMLPurifier/AttrDef.php';
|
require_once 'HTMLPurifier/AttrDef.php';
|
||||||
require_once 'HTMLPurifier/AttrDef/Number.php';
|
require_once 'HTMLPurifier/AttrDef/CSS/Number.php';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Represents a Length as defined by CSS.
|
* Represents a Length as defined by CSS.
|
||||||
* @warning Be sure not to confuse this with HTMLPurifier_AttrDef_Length!
|
|
||||||
*/
|
*/
|
||||||
class HTMLPurifier_AttrDef_CSSLength extends HTMLPurifier_AttrDef
|
class HTMLPurifier_AttrDef_CSS_Length extends HTMLPurifier_AttrDef
|
||||||
{
|
{
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -26,8 +25,8 @@ class HTMLPurifier_AttrDef_CSSLength extends HTMLPurifier_AttrDef
|
|||||||
* @param $non_negative Bool indication whether or not negative values are
|
* @param $non_negative Bool indication whether or not negative values are
|
||||||
* allowed.
|
* allowed.
|
||||||
*/
|
*/
|
||||||
function HTMLPurifier_AttrDef_CSSLength($non_negative = false) {
|
function HTMLPurifier_AttrDef_CSS_Length($non_negative = false) {
|
||||||
$this->number_def = new HTMLPurifier_AttrDef_Number($non_negative);
|
$this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative);
|
||||||
}
|
}
|
||||||
|
|
||||||
function validate($length, $config, &$context) {
|
function validate($length, $config, &$context) {
|
||||||
@@ -40,6 +39,7 @@ class HTMLPurifier_AttrDef_CSSLength extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
// we assume all units are two characters
|
// we assume all units are two characters
|
||||||
$unit = substr($length, $strlen - 2);
|
$unit = substr($length, $strlen - 2);
|
||||||
|
if (!ctype_lower($unit)) $unit = strtolower($unit);
|
||||||
$number = substr($length, 0, $strlen - 2);
|
$number = substr($length, 0, $strlen - 2);
|
||||||
|
|
||||||
if (!isset($this->units[$unit])) return false;
|
if (!isset($this->units[$unit])) return false;
|
||||||
@@ -53,4 +53,3 @@ class HTMLPurifier_AttrDef_CSSLength extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
79
library/HTMLPurifier/AttrDef/CSS/ListStyle.php
Normal file
79
library/HTMLPurifier/AttrDef/CSS/ListStyle.php
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
require_once 'HTMLPurifier/AttrDef.php';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Validates shorthand CSS property list-style.
|
||||||
|
* @warning Does not support url tokens that have internal spaces.
|
||||||
|
*/
|
||||||
|
class HTMLPurifier_AttrDef_CSS_ListStyle extends HTMLPurifier_AttrDef
{

    /**
     * Local copy of component validators, keyed by the full property
     * name ('list-style-type', 'list-style-position', 'list-style-image').
     * @note See HTMLPurifier_AttrDef_CSS_Font::$info for a similar impl.
     */
    var $info;

    /**
     * @param $config Configuration object; the validators for the three
     *        list-style-* longhand properties are copied out of its
     *        CSS definition.
     */
    function HTMLPurifier_AttrDef_CSS_ListStyle($config) {
        $def = $config->getCSSDefinition();
        $this->info['list-style-type']     = $def->info['list-style-type'];
        $this->info['list-style-position'] = $def->info['list-style-position'];
        $this->info['list-style-image']    = $def->info['list-style-image'];
    }

    /**
     * Validates a list-style shorthand value by offering each
     * space-separated token to the component validators in turn.
     * @param $string  Raw shorthand value
     * @param $config  Configuration object, passed through to components
     * @param $context Context object, passed through to components
     * @return Space-joined validated components in the order type,
     *         image, position; false if nothing valid was found.
     */
    function validate($string, $config, &$context) {

        // regular pre-processing
        $string = $this->parseCDATA($string);
        if ($string === '') return false;

        // assumes URI doesn't have spaces in it
        $bits = explode(' ', $string); // bits to process

        $caught = array();
        $caught['type']     = false;
        $caught['position'] = false;
        $caught['image']    = false;

        $i = 0; // number of catches
        $none = false; // whether a 'none' keyword has been consumed

        foreach ($bits as $bit) {
            // All three components are filled: stop scanning and emit
            // what was caught. (A bare return here would yield NULL,
            // which is not the false that signals failure.)
            if ($i >= 3) break;
            if ($bit === '') continue;

            // components are matched against the lowercased token...
            $lbit = strtolower($bit);

            // ...except that the body of a url() token must keep its
            // case (URI paths are case-sensitive). Only the function
            // name is normalized, so that 'URL(' is still offered as
            // 'url(' exactly as before.
            // NOTE(review): presumably list-style-image is backed by
            // HTMLPurifier_AttrDef_CSS_URI, which requires a literal
            // lowercase 'url(' prefix -- confirm in CSSDefinition.
            if (strncasecmp($bit, 'url(', 4) === 0) {
                $image_bit = 'url(' . substr($bit, 4);
            } else {
                $image_bit = $lbit;
            }

            foreach ($caught as $key => $status) {
                if ($status !== false) continue;
                $candidate = ($key == 'image') ? $image_bit : $lbit;
                $r = $this->info['list-style-' . $key]->validate($candidate, $config, $context);
                if ($r === false) continue;
                if ($r === 'none') {
                    // only the first 'none' is consumed, and it is
                    // never assigned to the image component
                    if ($none) continue;
                    else $none = true;
                    if ($key == 'image') continue;
                }
                $caught[$key] = $r;
                $i++;
                break;
            }
        }

        if (!$i) return false;

        $ret = array();

        // construct type
        if ($caught['type']) $ret[] = $caught['type'];

        // construct image
        if ($caught['image']) $ret[] = $caught['image'];

        // construct position
        if ($caught['position']) $ret[] = $caught['position'];

        if (empty($ret)) return false;
        return implode(' ', $ret);

    }

}
|
||||||
|
|
@@ -13,7 +13,7 @@ require_once 'HTMLPurifier/AttrDef.php';
|
|||||||
* can only be used alone: it will never manifest as part of a multi
|
* can only be used alone: it will never manifest as part of a multi
|
||||||
* shorthand declaration. Thus, this class does not allow inherit.
|
* shorthand declaration. Thus, this class does not allow inherit.
|
||||||
*/
|
*/
|
||||||
class HTMLPurifier_AttrDef_Multiple extends HTMLPurifier_AttrDef
|
class HTMLPurifier_AttrDef_CSS_Multiple extends HTMLPurifier_AttrDef
|
||||||
{
|
{
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -30,7 +30,7 @@ class HTMLPurifier_AttrDef_Multiple extends HTMLPurifier_AttrDef
|
|||||||
* @param $single HTMLPurifier_AttrDef to multiply
|
* @param $single HTMLPurifier_AttrDef to multiply
|
||||||
* @param $max Max number of values allowed (usually four)
|
* @param $max Max number of values allowed (usually four)
|
||||||
*/
|
*/
|
||||||
function HTMLPurifier_AttrDef_Multiple($single, $max = 4) {
|
function HTMLPurifier_AttrDef_CSS_Multiple($single, $max = 4) {
|
||||||
$this->single = $single;
|
$this->single = $single;
|
||||||
$this->max = $max;
|
$this->max = $max;
|
||||||
}
|
}
|
||||||
@@ -55,4 +55,3 @@ class HTMLPurifier_AttrDef_Multiple extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@@ -3,7 +3,7 @@
|
|||||||
/**
|
/**
|
||||||
* Validates a number as defined by the CSS spec.
|
* Validates a number as defined by the CSS spec.
|
||||||
*/
|
*/
|
||||||
class HTMLPurifier_AttrDef_Number extends HTMLPurifier_AttrDef
|
class HTMLPurifier_AttrDef_CSS_Number extends HTMLPurifier_AttrDef
|
||||||
{
|
{
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -14,7 +14,7 @@ class HTMLPurifier_AttrDef_Number extends HTMLPurifier_AttrDef
|
|||||||
/**
|
/**
|
||||||
* @param $non_negative Bool indicating whether negatives are forbidden
|
* @param $non_negative Bool indicating whether negatives are forbidden
|
||||||
*/
|
*/
|
||||||
function HTMLPurifier_AttrDef_Number($non_negative = false) {
|
function HTMLPurifier_AttrDef_CSS_Number($non_negative = false) {
|
||||||
$this->non_negative = $non_negative;
|
$this->non_negative = $non_negative;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -58,4 +58,3 @@ class HTMLPurifier_AttrDef_Number extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@@ -1,25 +1,24 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
require_once 'HTMLPurifier/AttrDef.php';
|
require_once 'HTMLPurifier/AttrDef.php';
|
||||||
require_once 'HTMLPurifier/AttrDef/Number.php';
|
require_once 'HTMLPurifier/AttrDef/CSS/Number.php';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Validates a Percentage as defined by the HTML spec.
|
* Validates a Percentage as defined by the CSS spec.
|
||||||
* @note This also allows integer pixel values.
|
|
||||||
*/
|
*/
|
||||||
class HTMLPurifier_AttrDef_Percentage extends HTMLPurifier_AttrDef
|
class HTMLPurifier_AttrDef_CSS_Percentage extends HTMLPurifier_AttrDef
|
||||||
{
|
{
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Instance of HTMLPurifier_AttrDef_Number to defer pixel validation
|
* Instance of HTMLPurifier_AttrDef_CSS_Number to defer number validation
|
||||||
*/
|
*/
|
||||||
var $number_def;
|
var $number_def;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param Bool indicating whether to forbid negative values
|
* @param Bool indicating whether to forbid negative values
|
||||||
*/
|
*/
|
||||||
function HTMLPurifier_AttrDef_Percentage($non_negative = false) {
|
function HTMLPurifier_AttrDef_CSS_Percentage($non_negative = false) {
|
||||||
$this->number_def = new HTMLPurifier_AttrDef_Number($non_negative);
|
$this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative);
|
||||||
}
|
}
|
||||||
|
|
||||||
function validate($string, $config, &$context) {
|
function validate($string, $config, &$context) {
|
||||||
@@ -41,4 +40,3 @@ class HTMLPurifier_AttrDef_Percentage extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@@ -7,26 +7,22 @@ require_once 'HTMLPurifier/AttrDef.php';
|
|||||||
* @note This class could be generalized into a version that acts sort of
|
* @note This class could be generalized into a version that acts sort of
|
||||||
* like Enum except you can compound the allowed values.
|
* like Enum except you can compound the allowed values.
|
||||||
*/
|
*/
|
||||||
class HTMLPurifier_AttrDef_TextDecoration extends HTMLPurifier_AttrDef
|
class HTMLPurifier_AttrDef_CSS_TextDecoration extends HTMLPurifier_AttrDef
|
||||||
{
|
{
|
||||||
|
|
||||||
/**
|
|
||||||
* Lookup table of allowed values.
|
|
||||||
* @protected
|
|
||||||
*/
|
|
||||||
var $allowed_values = array(
|
|
||||||
'line-through' => true,
|
|
||||||
'overline' => true,
|
|
||||||
'underline' => true
|
|
||||||
);
|
|
||||||
|
|
||||||
function validate($string, $config, &$context) {
|
function validate($string, $config, &$context) {
|
||||||
|
|
||||||
|
static $allowed_values = array(
|
||||||
|
'line-through' => true,
|
||||||
|
'overline' => true,
|
||||||
|
'underline' => true
|
||||||
|
);
|
||||||
|
|
||||||
$string = strtolower($this->parseCDATA($string));
|
$string = strtolower($this->parseCDATA($string));
|
||||||
$parts = explode(' ', $string);
|
$parts = explode(' ', $string);
|
||||||
$final = '';
|
$final = '';
|
||||||
foreach ($parts as $part) {
|
foreach ($parts as $part) {
|
||||||
if (isset($this->allowed_values[$part])) {
|
if (isset($allowed_values[$part])) {
|
||||||
$final .= $part . ' ';
|
$final .= $part . ' ';
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -38,4 +34,3 @@ class HTMLPurifier_AttrDef_TextDecoration extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
57
library/HTMLPurifier/AttrDef/CSS/URI.php
Normal file
57
library/HTMLPurifier/AttrDef/CSS/URI.php
Normal file
@@ -0,0 +1,57 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
require_once 'HTMLPurifier/AttrDef/URI.php';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Validates a URI in CSS syntax, which uses url('http://example.com')
|
||||||
|
* @note While theoretically speaking a URI in a CSS document could
|
||||||
|
* be non-embedded, as of CSS2 there is no such usage so we're
|
||||||
|
* generalizing it. This may need to be changed in the future.
|
||||||
|
* @warning Since HTMLPurifier_AttrDef_CSS blindly uses semicolons as
|
||||||
|
* the separator, you cannot put a literal semicolon in
|
||||||
|
* the URI. Try percent-encoding it in that case.
|
||||||
|
*/
|
||||||
|
class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI
{

    function HTMLPurifier_AttrDef_CSS_URI() {
        parent::HTMLPurifier_AttrDef_URI(true); // always embedded
    }

    /**
     * Unwraps a CSS url(...) token, validates the URI inside it with
     * the parent validator and wraps the result again.
     * @param $uri_string Raw CSS value, expected in the form url(...)
     * @param $config     Configuration object, passed to the parent
     * @param $context    Context object, passed to the parent
     * @return The validated value as url(...), with CSS-special
     *         characters backslash-escaped, or false if the token is
     *         malformed or the parent rejects the URI.
     */
    function validate($uri_string, $config, &$context) {
        // parse the URI out of the string and then pass it onto
        // the parent object

        $uri_string = $this->parseCDATA($uri_string);
        if (strpos($uri_string, 'url(') !== 0) return false;
        $uri_string = substr($uri_string, 4);
        $new_length = strlen($uri_string) - 1;
        // nothing at all after 'url(': reject before indexing the string
        if ($new_length < 0) return false;
        if ($uri_string[$new_length] != ')') return false;
        $uri = trim(substr($uri_string, 0, $new_length));

        if (!empty($uri) && ($uri[0] == "'" || $uri[0] == '"')) {
            $quote = $uri[0];
            $new_length = strlen($uri) - 1;
            // a lone quote character cannot be both the opening and the
            // closing quote
            if ($new_length < 1) return false;
            if ($uri[$new_length] !== $quote) return false;
            $uri = substr($uri, 1, $new_length - 1);
        }

        $keys   = array(  '(',   ')',   ',',   ' ',   '"',   "'");
        $values = array('\\(', '\\)', '\\,', '\\ ', '\\"', "\\'");
        // undo CSS backslash escapes before handing over the bare URI
        $uri = str_replace($values, $keys, $uri);

        $result = parent::validate($uri, $config, $context);

        if ($result === false) return false;

        // escape necessary characters according to CSS spec
        // except for the comma, none of these should appear in the
        // URI at all
        $result = str_replace($keys, $values, $result);

        return "url($result)";

    }

}
|
||||||
|
|
@@ -5,6 +5,9 @@ require_once 'HTMLPurifier/AttrDef.php';
|
|||||||
// Enum = Enumerated
|
// Enum = Enumerated
|
||||||
/**
|
/**
|
||||||
* Validates a keyword against a list of valid values.
|
* Validates a keyword against a list of valid values.
|
||||||
|
* @warning The case-insensitive compare of this function uses PHP's
|
||||||
|
* built-in strtolower and ctype_lower functions, which may
|
||||||
|
* cause problems with international comparisons
|
||||||
*/
|
*/
|
||||||
class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef
|
class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef
|
||||||
{
|
{
|
||||||
@@ -25,8 +28,8 @@ class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef
|
|||||||
* @param $case_sensitive Bool indicating whether or not case sensitive
|
* @param $case_sensitive Bool indicating whether or not case sensitive
|
||||||
*/
|
*/
|
||||||
function HTMLPurifier_AttrDef_Enum(
|
function HTMLPurifier_AttrDef_Enum(
|
||||||
$valid_values = array(), $case_sensitive = false) {
|
$valid_values = array(), $case_sensitive = false
|
||||||
|
) {
|
||||||
$this->valid_values = array_flip($valid_values);
|
$this->valid_values = array_flip($valid_values);
|
||||||
$this->case_sensitive = $case_sensitive;
|
$this->case_sensitive = $case_sensitive;
|
||||||
}
|
}
|
||||||
@@ -34,6 +37,7 @@ class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef
|
|||||||
function validate($string, $config, &$context) {
|
function validate($string, $config, &$context) {
|
||||||
$string = trim($string);
|
$string = trim($string);
|
||||||
if (!$this->case_sensitive) {
|
if (!$this->case_sensitive) {
|
||||||
|
// we may want to do full case-insensitive libraries
|
||||||
$string = ctype_lower($string) ? $string : strtolower($string);
|
$string = ctype_lower($string) ? $string : strtolower($string);
|
||||||
}
|
}
|
||||||
$result = isset($this->valid_values[$string]);
|
$result = isset($this->valid_values[$string]);
|
||||||
@@ -41,6 +45,21 @@ class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef
|
|||||||
return $result ? $string : false;
|
return $result ? $string : false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Builds an Enum validator from a compact string description.
 * @param $string In form of comma-delimited list of case-insensitive
 *      valid values. Example: "foo,bar,baz". Prepend "s:" to make
 *      case sensitive
 * @return New HTMLPurifier_AttrDef_Enum for the listed values
 */
function make($string) {
    // the "s:" marker only counts when at least one character follows it
    $sensitive = (strlen($string) > 2 && $string[0] == 's' && $string[1] == ':');
    if ($sensitive) {
        $string = substr($string, 2);
    }
    return new HTMLPurifier_AttrDef_Enum(explode(',', $string), $sensitive);
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
29
library/HTMLPurifier/AttrDef/HTML/Bool.php
Normal file
29
library/HTMLPurifier/AttrDef/HTML/Bool.php
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
require_once 'HTMLPurifier/AttrDef.php';
|
||||||
|
|
||||||
|
/**
 * Validates a boolean attribute
 */
class HTMLPurifier_AttrDef_HTML_Bool extends HTMLPurifier_AttrDef
{

    /** Attribute name; also the value handed back for any non-empty input */
    var $name;
    // NOTE(review): not read anywhere in this class; presumably consumed
    // by the code that serializes attributes - confirm against the parent
    var $minimized = true;

    /**
     * @param $name Name of attribute, false if not known
     */
    function HTMLPurifier_AttrDef_HTML_Bool($name = false) {
        $this->name = $name;
    }

    /**
     * @param $string Raw attribute value
     * @param $config Config object (unused here)
     * @param $context Context object (unused here)
     * @return The attribute name for any non-empty value, false otherwise
     *      (note that PHP's empty() also treats "0" as empty)
     */
    function validate($string, $config, &$context) {
        return empty($string) ? false : $this->name;
    }

    /**
     * @param $string Name of attribute
     */
    function make($string) {
        $def = new HTMLPurifier_AttrDef_HTML_Bool($string);
        return $def;
    }

}
|
||||||
|
|
34
library/HTMLPurifier/AttrDef/HTML/Color.php
Normal file
34
library/HTMLPurifier/AttrDef/HTML/Color.php
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
require_once 'HTMLPurifier/AttrDef.php';
|
||||||
|
require_once 'HTMLPurifier/AttrDef/CSS/Color.php'; // for %Core.ColorKeywords
|
||||||
|
|
||||||
|
/**
 * Validates a color according to the HTML spec.
 */
class HTMLPurifier_AttrDef_HTML_Color extends HTMLPurifier_AttrDef
{

    /**
     * @param $string Raw attribute value: a color keyword, or a 3- or
     *      6-digit hex triplet with or without the leading '#'
     * @param $config Config object; %Core.ColorKeywords is read from it
     * @param $context Context object (unused here)
     * @return The value mapped to a recognized keyword, or '#' followed by
     *      six hex digits; false if the value is not a recognizable color
     */
    function validate($string, $config, &$context) {

        // fetched once on the first call and reused for every later call
        static $colors = null;
        if ($colors === null) $colors = $config->get('Core', 'ColorKeywords');

        $string = trim($string);

        if (empty($string)) return false;
        if (isset($colors[$string])) return $colors[$string];

        // HTML color names are case-insensitive, so retry the lookup in
        // lowercase (assumes the keyword table is keyed in lowercase -
        // TODO confirm against the %Core.ColorKeywords definition)
        $lower = strtolower($string);
        if (isset($colors[$lower])) return $colors[$lower];

        // the leading '#' is optional on input, always present on output
        if ($string[0] === '#') $hex = substr($string, 1);
        else $hex = $string;

        $length = strlen($hex);
        if ($length !== 3 && $length !== 6) return false;
        if (!ctype_xdigit($hex)) return false;
        // expand shorthand "abc" to "aabbcc"
        if ($length === 3) $hex = $hex[0].$hex[0].$hex[1].$hex[1].$hex[2].$hex[2];

        return "#$hex";

    }

}
|
||||||
|
|
33
library/HTMLPurifier/AttrDef/HTML/FrameTarget.php
Normal file
33
library/HTMLPurifier/AttrDef/HTML/FrameTarget.php
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
HTMLPurifier_ConfigSchema::define(
|
||||||
|
'Attr', 'AllowedFrameTargets', array(), 'lookup',
|
||||||
|
'Lookup table of all allowed link frame targets. Some commonly used '.
|
||||||
|
'link targets include _blank, _self, _parent and _top. Values should '.
|
||||||
|
'be lowercase, as validation will be done in a case-sensitive manner '.
|
||||||
|
'despite W3C\'s recommendation. XHTML 1.0 Strict does not permit '.
|
||||||
|
'the target attribute so this directive will have no effect in that '.
|
||||||
|
'doctype. XHTML 1.1 does not enable the Target module by default, you '.
|
||||||
|
'will have to manually enable it (see the module documentation for more details.)'
|
||||||
|
);
|
||||||
|
|
||||||
|
require_once 'HTMLPurifier/AttrDef/Enum.php';
|
||||||
|
|
||||||
|
/**
 * Special-case enum attribute definition that lazy loads allowed frame targets
 */
class HTMLPurifier_AttrDef_HTML_FrameTarget extends HTMLPurifier_AttrDef_Enum
{

    /** Lookup of allowed targets; false until the first validate() call */
    var $valid_values = false;
    var $case_sensitive = false;

    /** Intentionally empty: the lookup is filled in by validate() */
    function HTMLPurifier_AttrDef_HTML_FrameTarget() {}

    /**
     * Loads %Attr.AllowedFrameTargets on first use, then defers to the
     * parent validate().
     */
    function validate($string, $config, &$context) {
        if ($this->valid_values === false) {
            $this->valid_values = $config->get('Attr', 'AllowedFrameTargets');
        }
        return parent::validate($string, $config, $context);
    }

}
|
||||||
|
|
@@ -3,6 +3,22 @@
|
|||||||
require_once 'HTMLPurifier/AttrDef.php';
|
require_once 'HTMLPurifier/AttrDef.php';
|
||||||
require_once 'HTMLPurifier/IDAccumulator.php';
|
require_once 'HTMLPurifier/IDAccumulator.php';
|
||||||
|
|
||||||
|
HTMLPurifier_ConfigSchema::define(
|
||||||
|
'Attr', 'EnableID', false, 'bool',
|
||||||
|
'Allows the ID attribute in HTML. This is disabled by default '.
|
||||||
|
'due to the fact that without proper configuration user input can '.
|
||||||
|
'easily break the validation of a webpage by specifying an ID that is '.
|
||||||
|
'already on the surrounding HTML. If you don\'t mind throwing caution to '.
|
||||||
|
'the wind, enable this directive, but I strongly recommend you also '.
|
||||||
|
'consider blacklisting IDs you use (%Attr.IDBlacklist) or prefixing all '.
|
||||||
|
'user supplied IDs (%Attr.IDPrefix). This directive has been available '.
|
||||||
|
'since 1.2.0, and when set to true reverts to the behavior of pre-1.2.0 '.
|
||||||
|
'versions.'
|
||||||
|
);
|
||||||
|
HTMLPurifier_ConfigSchema::defineAlias(
|
||||||
|
'HTML', 'EnableAttrID', 'Attr', 'EnableID'
|
||||||
|
);
|
||||||
|
|
||||||
HTMLPurifier_ConfigSchema::define(
|
HTMLPurifier_ConfigSchema::define(
|
||||||
'Attr', 'IDPrefix', '', 'string',
|
'Attr', 'IDPrefix', '', 'string',
|
||||||
'String to prefix to IDs. If you have no idea what IDs your pages '.
|
'String to prefix to IDs. If you have no idea what IDs your pages '.
|
||||||
@@ -27,6 +43,14 @@ HTMLPurifier_ConfigSchema::define(
|
|||||||
'is set to a non-empty value! This directive was available since 1.2.0.'
|
'is set to a non-empty value! This directive was available since 1.2.0.'
|
||||||
);
|
);
|
||||||
|
|
||||||
|
HTMLPurifier_ConfigSchema::define(
|
||||||
|
'Attr', 'IDBlacklistRegexp', null, 'string/null',
|
||||||
|
'PCRE regular expression to be matched against all IDs. If the expression '.
|
||||||
|
'is matches, the ID is rejected. Use this with care: may cause '.
|
||||||
|
'significant degradation. ID matching is done after all other '.
|
||||||
|
'validation. This directive was available since 1.6.0.'
|
||||||
|
);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Validates the HTML attribute ID.
|
* Validates the HTML attribute ID.
|
||||||
* @warning Even though this is the id processor, it
|
* @warning Even though this is the id processor, it
|
||||||
@@ -36,11 +60,16 @@ HTMLPurifier_ConfigSchema::define(
|
|||||||
* blacklist. If you're hacking around, make sure you use load()!
|
* blacklist. If you're hacking around, make sure you use load()!
|
||||||
*/
|
*/
|
||||||
|
|
||||||
class HTMLPurifier_AttrDef_ID extends HTMLPurifier_AttrDef
|
class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef
|
||||||
{
|
{
|
||||||
|
|
||||||
|
// ref functionality disabled, since we also have to verify
|
||||||
|
// whether or not the ID it refers to exists
|
||||||
|
|
||||||
function validate($id, $config, &$context) {
|
function validate($id, $config, &$context) {
|
||||||
|
|
||||||
|
if (!$config->get('Attr', 'EnableID')) return false;
|
||||||
|
|
||||||
$id = trim($id); // trim it first
|
$id = trim($id); // trim it first
|
||||||
|
|
||||||
if ($id === '') return false;
|
if ($id === '') return false;
|
||||||
@@ -55,8 +84,10 @@ class HTMLPurifier_AttrDef_ID extends HTMLPurifier_AttrDef
|
|||||||
'%Attr.IDPrefix is set', E_USER_WARNING);
|
'%Attr.IDPrefix is set', E_USER_WARNING);
|
||||||
}
|
}
|
||||||
|
|
||||||
$id_accumulator =& $context->get('IDAccumulator');
|
//if (!$this->ref) {
|
||||||
if (isset($id_accumulator->ids[$id])) return false;
|
$id_accumulator =& $context->get('IDAccumulator');
|
||||||
|
if (isset($id_accumulator->ids[$id])) return false;
|
||||||
|
//}
|
||||||
|
|
||||||
// we purposely avoid using regex, hopefully this is faster
|
// we purposely avoid using regex, hopefully this is faster
|
||||||
|
|
||||||
@@ -71,7 +102,12 @@ class HTMLPurifier_AttrDef_ID extends HTMLPurifier_AttrDef
|
|||||||
$result = ($trim === '');
|
$result = ($trim === '');
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($result) $id_accumulator->add($id);
|
$regexp = $config->get('Attr', 'IDBlacklistRegexp');
|
||||||
|
if ($regexp && preg_match($regexp, $id)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (/*!$this->ref && */$result) $id_accumulator->add($id);
|
||||||
|
|
||||||
// if no change was made to the ID, return the result
|
// if no change was made to the ID, return the result
|
||||||
// else, return the new id if stripping whitespace made it
|
// else, return the new id if stripping whitespace made it
|
||||||
@@ -82,4 +118,3 @@ class HTMLPurifier_AttrDef_ID extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@@ -1,18 +1,16 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
require_once 'HTMLPurifier/AttrDef.php';
|
require_once 'HTMLPurifier/AttrDef.php';
|
||||||
require_once 'HTMLPurifier/AttrDef/Pixels.php';
|
require_once 'HTMLPurifier/AttrDef/HTML/Pixels.php';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Validates the HTML type length (not to be confused with CSS's length).
|
* Validates the HTML type length (not to be confused with CSS's length).
|
||||||
*
|
*
|
||||||
* This accepts integer pixels or percentages as lengths for certain
|
* This accepts integer pixels or percentages as lengths for certain
|
||||||
* HTML attributes. Don't use this for CSS: that's
|
* HTML attributes.
|
||||||
* HTMLPurifier_AttrDef_CSSLength which requires prefixes and allows a lot
|
|
||||||
* more different types.
|
|
||||||
*/
|
*/
|
||||||
|
|
||||||
class HTMLPurifier_AttrDef_Length extends HTMLPurifier_AttrDef_Pixels
|
class HTMLPurifier_AttrDef_HTML_Length extends HTMLPurifier_AttrDef_HTML_Pixels
|
||||||
{
|
{
|
||||||
|
|
||||||
function validate($string, $config, &$context) {
|
function validate($string, $config, &$context) {
|
||||||
@@ -43,4 +41,3 @@ class HTMLPurifier_AttrDef_Length extends HTMLPurifier_AttrDef_Pixels
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
72
library/HTMLPurifier/AttrDef/HTML/LinkTypes.php
Normal file
72
library/HTMLPurifier/AttrDef/HTML/LinkTypes.php
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
<?php
|
||||||
|
|
||||||
|
require_once 'HTMLPurifier/AttrDef.php';
|
||||||
|
|
||||||
|
HTMLPurifier_ConfigSchema::define(
|
||||||
|
'Attr', 'AllowedRel', array(), 'lookup',
|
||||||
|
'List of allowed forward document relationships in the rel attribute. '.
|
||||||
|
'Common values may be nofollow or print. By default, this is empty, '.
|
||||||
|
'meaning that no document relationships are allowed. This directive '.
|
||||||
|
'was available since 1.6.0.'
|
||||||
|
);
|
||||||
|
|
||||||
|
HTMLPurifier_ConfigSchema::define(
|
||||||
|
'Attr', 'AllowedRev', array(), 'lookup',
|
||||||
|
'List of allowed reverse document relationships in the rev attribute. '.
|
||||||
|
'This attribute is a bit of an edge-case; if you don\'t know what it '.
|
||||||
|
'is for, stay away. This directive was available since 1.6.0.'
|
||||||
|
);
|
||||||
|
|
||||||
|
/**
 * Validates a rel/rev link attribute against a directive of allowed values
 * @note We cannot use Enum because link types allow multiple
 *       values.
 * @note Assumes link types are ASCII text
 */
class HTMLPurifier_AttrDef_HTML_LinkTypes extends HTMLPurifier_AttrDef
{

    /** Name config attribute to pull. */
    var $name;

    /**
     * @param $name Attribute being validated: 'rel' or 'rev'. Any other
     *      value raises E_USER_ERROR and leaves $this->name unset.
     */
    function HTMLPurifier_AttrDef_HTML_LinkTypes($name) {
        $configLookup = array(
            'rel' => 'AllowedRel',
            'rev' => 'AllowedRev'
        );
        if (isset($configLookup[$name])) {
            $this->name = $configLookup[$name];
            return;
        }
        trigger_error('Unrecognized attribute name for link '.
            'relationship.', E_USER_ERROR);
    }

    /**
     * @param $string Space-separated list of link types
     * @param $config Config object; the Attr directive named by
     *      $this->name is read from it
     * @param $context Context object (unused here)
     * @return Allowed link types, lowercased, de-duplicated and joined by
     *      single spaces in first-seen order; false if none are allowed
     */
    function validate($string, $config, &$context) {

        $allowed = $config->get('Attr', $this->name);
        if (empty($allowed)) return false;

        $tokens = explode(' ', $this->parseCDATA($string));

        // keyed by token so that duplicates collapse
        $seen = array();
        foreach ($tokens as $token) {
            $token = strtolower(trim($token));
            if (isset($allowed[$token])) {
                $seen[$token] = true;
            }
        }

        if (empty($seen)) return false;

        return implode(' ', array_keys($seen));

    }

}
|
||||||
|
|
@@ -1,7 +1,7 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
require_once 'HTMLPurifier/AttrDef.php';
|
require_once 'HTMLPurifier/AttrDef.php';
|
||||||
require_once 'HTMLPurifier/AttrDef/Length.php';
|
require_once 'HTMLPurifier/AttrDef/HTML/Length.php';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Validates a MultiLength as defined by the HTML spec.
|
* Validates a MultiLength as defined by the HTML spec.
|
||||||
@@ -9,7 +9,7 @@ require_once 'HTMLPurifier/AttrDef/Length.php';
|
|||||||
* A multilength is either a integer (pixel count), a percentage, or
|
* A multilength is either a integer (pixel count), a percentage, or
|
||||||
* a relative number.
|
* a relative number.
|
||||||
*/
|
*/
|
||||||
class HTMLPurifier_AttrDef_MultiLength extends HTMLPurifier_AttrDef_Length
|
class HTMLPurifier_AttrDef_HTML_MultiLength extends HTMLPurifier_AttrDef_HTML_Length
|
||||||
{
|
{
|
||||||
|
|
||||||
function validate($string, $config, &$context) {
|
function validate($string, $config, &$context) {
|
||||||
@@ -27,16 +27,17 @@ class HTMLPurifier_AttrDef_MultiLength extends HTMLPurifier_AttrDef_Length
|
|||||||
|
|
||||||
$int = substr($string, 0, $length - 1);
|
$int = substr($string, 0, $length - 1);
|
||||||
|
|
||||||
|
if ($int == '') return '*';
|
||||||
if (!is_numeric($int)) return false;
|
if (!is_numeric($int)) return false;
|
||||||
|
|
||||||
$int = (int) $int;
|
$int = (int) $int;
|
||||||
|
|
||||||
if ($int < 0) return '0*';
|
if ($int < 0) return false;
|
||||||
|
if ($int == 0) return '0';
|
||||||
|
if ($int == 1) return '*';
|
||||||
return ((string) $int) . '*';
|
return ((string) $int) . '*';
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@@ -4,9 +4,13 @@ require_once 'HTMLPurifier/AttrDef.php';
|
|||||||
require_once 'HTMLPurifier/Config.php';
|
require_once 'HTMLPurifier/Config.php';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Validates the contents of the global HTML attribute class.
|
* Validates contents based on NMTOKENS attribute type.
|
||||||
|
* @note The only current use for this is the class attribute in HTML
|
||||||
|
* @note Could have some functionality factored out into Nmtoken class
|
||||||
|
* @warning We cannot assume this class will be used only for 'class'
|
||||||
|
* attributes. Not sure how to hook in magic behavior, then.
|
||||||
*/
|
*/
|
||||||
class HTMLPurifier_AttrDef_Class extends HTMLPurifier_AttrDef
|
class HTMLPurifier_AttrDef_HTML_Nmtokens extends HTMLPurifier_AttrDef
|
||||||
{
|
{
|
||||||
|
|
||||||
function validate($string, $config, &$context) {
|
function validate($string, $config, &$context) {
|
||||||
@@ -31,10 +35,10 @@ class HTMLPurifier_AttrDef_Class extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
if (empty($matches[1])) return false;
|
if (empty($matches[1])) return false;
|
||||||
|
|
||||||
// reconstruct class string
|
// reconstruct string
|
||||||
$new_string = '';
|
$new_string = '';
|
||||||
foreach ($matches[1] as $class_names) {
|
foreach ($matches[1] as $token) {
|
||||||
$new_string .= $class_names . ' ';
|
$new_string .= $token . ' ';
|
||||||
}
|
}
|
||||||
$new_string = rtrim($new_string);
|
$new_string = rtrim($new_string);
|
||||||
|
|
||||||
@@ -44,4 +48,3 @@ class HTMLPurifier_AttrDef_Class extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@@ -5,7 +5,7 @@ require_once 'HTMLPurifier/AttrDef.php';
|
|||||||
/**
|
/**
|
||||||
* Validates an integer representation of pixels according to the HTML spec.
|
* Validates an integer representation of pixels according to the HTML spec.
|
||||||
*/
|
*/
|
||||||
class HTMLPurifier_AttrDef_Pixels extends HTMLPurifier_AttrDef
|
class HTMLPurifier_AttrDef_HTML_Pixels extends HTMLPurifier_AttrDef
|
||||||
{
|
{
|
||||||
|
|
||||||
function validate($string, $config, &$context) {
|
function validate($string, $config, &$context) {
|
||||||
@@ -34,4 +34,3 @@ class HTMLPurifier_AttrDef_Pixels extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@@ -72,4 +72,3 @@ class HTMLPurifier_AttrDef_Integer extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@@ -46,7 +46,7 @@ class HTMLPurifier_AttrDef_Lang extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
// process second subtag : $subtags[1]
|
// process second subtag : $subtags[1]
|
||||||
$length = strlen($subtags[1]);
|
$length = strlen($subtags[1]);
|
||||||
if ($length == 0 || $length == 1 || $length > 8 || !ctype_alnum($subtags[1])) {
|
if ($length == 0 || ($length == 1 && $subtags[1] != 'x') || $length > 8 || !ctype_alnum($subtags[1])) {
|
||||||
return $new_string;
|
return $new_string;
|
||||||
}
|
}
|
||||||
if (!ctype_lower($subtags[1])) $subtags[1] = strtolower($subtags[1]);
|
if (!ctype_lower($subtags[1])) $subtags[1] = strtolower($subtags[1]);
|
||||||
@@ -72,4 +72,3 @@ class HTMLPurifier_AttrDef_Lang extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@@ -1,78 +0,0 @@
|
|||||||
<?php
|
|
||||||
|
|
||||||
require_once 'HTMLPurifier/AttrDef.php';
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Validates shorthand CSS property list-style.
|
|
||||||
* @note This currently does not support list-style-image, as that functionality
|
|
||||||
* is not implemented yet elsewhere.
|
|
||||||
*/
|
|
||||||
class HTMLPurifier_AttrDef_ListStyle extends HTMLPurifier_AttrDef
|
|
||||||
{
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Local copy of component validators.
|
|
||||||
* @note See HTMLPurifier_AttrDef_Font::$info for a similar impl.
|
|
||||||
*/
|
|
||||||
var $info;
|
|
||||||
|
|
||||||
function HTMLPurifier_AttrDef_ListStyle($config) {
|
|
||||||
$def = $config->getCSSDefinition();
|
|
||||||
$this->info['list-style-type'] = $def->info['list-style-type'];
|
|
||||||
$this->info['list-style-position'] = $def->info['list-style-position'];
|
|
||||||
}
|
|
||||||
|
|
||||||
function validate($string, $config, &$context) {
|
|
||||||
|
|
||||||
// regular pre-processing
|
|
||||||
$string = $this->parseCDATA($string);
|
|
||||||
if ($string === '') return false;
|
|
||||||
|
|
||||||
$bits = explode(' ', strtolower($string)); // bits to process
|
|
||||||
|
|
||||||
$caught_type = false;
|
|
||||||
$caught_position = false;
|
|
||||||
$caught_none = false; // as in keyword none, which is in all of them
|
|
||||||
|
|
||||||
$ret = '';
|
|
||||||
|
|
||||||
foreach ($bits as $bit) {
|
|
||||||
if ($caught_none && ($caught_type || $caught_position)) break;
|
|
||||||
if ($caught_type && $caught_position) break;
|
|
||||||
|
|
||||||
if ($bit === '') continue;
|
|
||||||
|
|
||||||
if ($bit === 'none') {
|
|
||||||
if ($caught_none) continue;
|
|
||||||
$caught_none = true;
|
|
||||||
$ret .= 'none ';
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// if we add anymore, roll it into a loop
|
|
||||||
|
|
||||||
$r = $this->info['list-style-type']->validate($bit, $config, $context);
|
|
||||||
if ($r !== false) {
|
|
||||||
if ($caught_type) continue;
|
|
||||||
$caught_type = true;
|
|
||||||
$ret .= $r . ' ';
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
$r = $this->info['list-style-position']->validate($bit, $config, $context);
|
|
||||||
if ($r !== false) {
|
|
||||||
if ($caught_position) continue;
|
|
||||||
$caught_position = true;
|
|
||||||
$ret .= $r . ' ';
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
$ret = rtrim($ret);
|
|
||||||
return $ret ? $ret : false;
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
?>
|
|
@@ -14,4 +14,3 @@ class HTMLPurifier_AttrDef_Text extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@@ -1,81 +1,64 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
require_once 'HTMLPurifier/AttrDef.php';
|
require_once 'HTMLPurifier/AttrDef.php';
|
||||||
|
require_once 'HTMLPurifier/URIParser.php';
|
||||||
require_once 'HTMLPurifier/URIScheme.php';
|
require_once 'HTMLPurifier/URIScheme.php';
|
||||||
require_once 'HTMLPurifier/URISchemeRegistry.php';
|
require_once 'HTMLPurifier/URISchemeRegistry.php';
|
||||||
require_once 'HTMLPurifier/AttrDef/Host.php';
|
require_once 'HTMLPurifier/AttrDef/URI/Host.php';
|
||||||
require_once 'HTMLPurifier/PercentEncoder.php';
|
require_once 'HTMLPurifier/PercentEncoder.php';
|
||||||
|
|
||||||
HTMLPurifier_ConfigSchema::define(
|
// special case filtering directives
|
||||||
'URI', 'DefaultScheme', 'http', 'string',
|
|
||||||
'Defines through what scheme the output will be served, in order to '.
|
|
||||||
'select the proper object validator when no scheme information is present.'
|
|
||||||
);
|
|
||||||
|
|
||||||
HTMLPurifier_ConfigSchema::define(
|
HTMLPurifier_ConfigSchema::define(
|
||||||
'URI', 'Host', null, 'string/null',
|
'URI', 'Munge', null, 'string/null', '
|
||||||
'Defines the domain name of the server, so we can determine whether or '.
|
<p>
|
||||||
'an absolute URI is from your website or not. Not strictly necessary, '.
|
Munges all browsable (usually http, https and ftp)
|
||||||
'as users should be using relative URIs to reference resources on your '.
|
absolute URI\'s into another URI, usually a URI redirection service.
|
||||||
'website. It will, however, let you use absolute URIs to link to '.
|
This directive accepts a URI, formatted with a <code>%s</code> where
|
||||||
'subdomains of the domain you post here: i.e. example.com will allow '.
|
the url-encoded original URI should be inserted (sample:
|
||||||
'sub.example.com. However, higher up domains will still be excluded: '.
|
<code>http://www.google.com/url?q=%s</code>).
|
||||||
'if you set %URI.Host to sub.example.com, example.com will be blocked. '.
|
</p>
|
||||||
'This directive has been available since 1.2.0.'
|
<p>
|
||||||
);
|
Uses for this directive:
|
||||||
|
</p>
|
||||||
|
<ul>
|
||||||
|
<li>
|
||||||
|
Prevent PageRank leaks, while being fairly transparent
|
||||||
|
to users (you may also want to add some client side JavaScript to
|
||||||
|
override the text in the statusbar). <strong>Notice</strong>:
|
||||||
|
Many security experts believe that this form of protection does not deter spam-bots.
|
||||||
|
</li>
|
||||||
|
<li>
|
||||||
|
Redirect users to a splash page telling them they are leaving your
|
||||||
|
website. While this is poor usability practice, it is often mandated
|
||||||
|
in corporate environments.
|
||||||
|
</li>
|
||||||
|
</ul>
|
||||||
|
<p>
|
||||||
|
This directive has been available since 1.3.0.
|
||||||
|
</p>
|
||||||
|
');
|
||||||
|
|
||||||
|
// disabling directives
|
||||||
|
|
||||||
HTMLPurifier_ConfigSchema::define(
|
HTMLPurifier_ConfigSchema::define(
|
||||||
'URI', 'DisableExternal', false, 'bool',
|
'URI', 'Disable', false, 'bool', '
|
||||||
'Disables links to external websites. This is a highly effective '.
|
<p>
|
||||||
'anti-spam and anti-pagerank-leech measure, but comes at a hefty price: no'.
|
Disables all URIs in all forms. Not sure why you\'d want to do that
|
||||||
'links or images outside of your domain will be allowed. Non-linkified '.
|
(after all, the Internet\'s founded on the notion of a hyperlink).
|
||||||
'URIs will still be preserved. If you want to be able to link to '.
|
This directive has been available since 1.3.0.
|
||||||
'subdomains or use absolute URIs, specify %URI.Host for your website. '.
|
</p>
|
||||||
'This directive has been available since 1.2.0.'
|
');
|
||||||
);
|
HTMLPurifier_ConfigSchema::defineAlias('Attr', 'DisableURI', 'URI', 'Disable');
|
||||||
|
|
||||||
HTMLPurifier_ConfigSchema::define(
|
HTMLPurifier_ConfigSchema::define(
|
||||||
'URI', 'DisableExternalResources', false, 'bool',
|
'URI', 'DisableResources', false, 'bool', '
|
||||||
'Disables the embedding of external resources, preventing users from '.
|
<p>
|
||||||
'embedding things like images from other hosts. This prevents '.
|
Disables embedding resources, essentially meaning no pictures. You can
|
||||||
'access tracking (good for email viewers), bandwidth leeching, '.
|
still link to them though. See %URI.DisableExternalResources for why
|
||||||
'cross-site request forging, goatse.cx posting, and '.
|
this might be a good idea. This directive has been available since 1.3.0.
|
||||||
'other nasties, but also results in '.
|
</p>
|
||||||
'a loss of end-user functionality (they can\'t directly post a pic '.
|
');
|
||||||
'they posted from Flickr anymore). Use it if you don\'t have a '.
|
|
||||||
'robust user-content moderation team. This directive has been '.
|
|
||||||
'available since 1.3.0.'
|
|
||||||
);
|
|
||||||
|
|
||||||
HTMLPurifier_ConfigSchema::define(
|
|
||||||
'URI', 'DisableResources', false, 'bool',
|
|
||||||
'Disables embedding resources, essentially meaning no pictures. You can '.
|
|
||||||
'still link to them though. See %URI.DisableExternalResources for why '.
|
|
||||||
'this might be a good idea. This directive has been available since 1.3.0.'
|
|
||||||
);
|
|
||||||
|
|
||||||
HTMLPurifier_ConfigSchema::define(
|
|
||||||
'URI', 'Munge', null, 'string/null',
|
|
||||||
'Munges all browsable (usually http, https and ftp) URI\'s into some URL '.
|
|
||||||
'redirection service. Pass this directive a URI, with %s inserted where '.
|
|
||||||
'the url-encoded original URI should be inserted (sample: '.
|
|
||||||
'<code>http://www.google.com/url?q=%s</code>). '.
|
|
||||||
'This prevents PageRank leaks, while being as transparent as possible '.
|
|
||||||
'to users (you may also want to add some client side JavaScript to '.
|
|
||||||
'override the text in the statusbar). Warning: many security experts '.
|
|
||||||
'believe that this form of protection does not deter spam-bots. '.
|
|
||||||
'You can also use this directive to redirect users to a splash page '.
|
|
||||||
'telling them they are leaving your website. '.
|
|
||||||
'This directive has been available since 1.3.0.'
|
|
||||||
);
|
|
||||||
|
|
||||||
HTMLPurifier_ConfigSchema::define(
|
|
||||||
'URI', 'HostBlacklist', array(), 'list',
|
|
||||||
'List of strings that are forbidden in the host of any URI. Use it to '.
|
|
||||||
'kill domain names of spam, etc. Note that it will catch anything in '.
|
|
||||||
'the domain, so <tt>moo.com</tt> will catch <tt>moo.com.example.com</tt>. '.
|
|
||||||
'This directive has been available since 1.3.0.'
|
|
||||||
);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Validates a URI as defined by RFC 3986.
|
* Validates a URI as defined by RFC 3986.
|
||||||
@@ -84,203 +67,83 @@ HTMLPurifier_ConfigSchema::define(
|
|||||||
class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
|
class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
|
||||||
{
|
{
|
||||||
|
|
||||||
var $host;
|
var $parser, $percentEncoder;
|
||||||
var $PercentEncoder;
|
var $embedsResource;
|
||||||
var $embeds_resource;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @param $embeds_resource_resource Does the URI here result in an extra HTTP request?
|
* @param $embeds_resource_resource Does the URI here result in an extra HTTP request?
|
||||||
*/
|
*/
|
||||||
function HTMLPurifier_AttrDef_URI($embeds_resource = false) {
|
function HTMLPurifier_AttrDef_URI($embeds_resource = false) {
|
||||||
$this->host = new HTMLPurifier_AttrDef_Host();
|
$this->parser = new HTMLPurifier_URIParser();
|
||||||
$this->PercentEncoder = new HTMLPurifier_PercentEncoder();
|
$this->percentEncoder = new HTMLPurifier_PercentEncoder();
|
||||||
$this->embeds_resource = (bool) $embeds_resource;
|
$this->embedsResource = (bool) $embeds_resource;
|
||||||
}
|
}
|
||||||
|
|
||||||
function validate($uri, $config, &$context) {
|
function validate($uri, $config, &$context) {
|
||||||
|
|
||||||
// We'll write stack-based parsers later, for now, use regexps to
|
if ($config->get('URI', 'Disable')) return false;
|
||||||
// get things working as fast as possible (irony)
|
|
||||||
|
|
||||||
// parse as CDATA
|
// initial operations
|
||||||
$uri = $this->parseCDATA($uri);
|
$uri = $this->parseCDATA($uri);
|
||||||
|
$uri = $this->percentEncoder->normalize($uri);
|
||||||
|
|
||||||
// fix up percent-encoding
|
// parse the URI
|
||||||
$uri = $this->PercentEncoder->normalize($uri);
|
$uri = $this->parser->parse($uri);
|
||||||
|
if ($uri === false) return false;
|
||||||
|
|
||||||
// while it would be nice to use parse_url(), that's specifically
|
// add embedded flag to context for validators
|
||||||
// for HTTP and thus won't work for our generic URI parsing
|
$context->register('EmbeddedURI', $this->embedsResource);
|
||||||
|
|
||||||
// according to the RFC... (but this cuts corners, i.e. non-validating)
|
$ok = false;
|
||||||
$r_URI = '!'.
|
do {
|
||||||
'(([^:/?#<>\'"]+):)?'. // 2. Scheme
|
|
||||||
'(//([^/?#<>\'"]*))?'. // 4. Authority
|
// generic validation
|
||||||
'([^?#<>\'"]*)'. // 5. Path
|
$result = $uri->validate($config, $context);
|
||||||
'(\?([^#<>\'"]*))?'. // 7. Query
|
if (!$result) break;
|
||||||
'(#([^<>\'"]*))?'. // 8. Fragment
|
|
||||||
'!';
|
// chained validation
|
||||||
|
$uri_def =& $config->getDefinition('URI');
|
||||||
|
$result = $uri_def->filter($uri, $config, $context);
|
||||||
|
if (!$result) break;
|
||||||
|
|
||||||
|
// scheme-specific validation
|
||||||
|
$scheme_obj = $uri->getSchemeObj($config, $context);
|
||||||
|
if (!$scheme_obj) break;
|
||||||
|
if ($this->embedsResource && !$scheme_obj->browsable) break;
|
||||||
|
$result = $scheme_obj->validate($uri, $config, $context);
|
||||||
|
if (!$result) break;
|
||||||
|
|
||||||
|
// survived gauntlet
|
||||||
|
$ok = true;
|
||||||
|
|
||||||
|
} while (false);
|
||||||
|
|
||||||
$matches = array();
|
$context->destroy('EmbeddedURI');
|
||||||
$result = preg_match($r_URI, $uri, $matches);
|
if (!$ok) return false;
|
||||||
|
|
||||||
if (!$result) return false; // invalid URI
|
// munge scheme off if necessary (this must be last)
|
||||||
|
if (!is_null($uri->scheme) && is_null($uri->host)) {
|
||||||
// seperate out parts
|
if ($uri_def->defaultScheme == $uri->scheme) {
|
||||||
$scheme = !empty($matches[1]) ? $matches[2] : null;
|
$uri->scheme = null;
|
||||||
$authority = !empty($matches[3]) ? $matches[4] : null;
|
}
|
||||||
$path = $matches[5]; // always present, can be empty
|
|
||||||
$query = !empty($matches[6]) ? $matches[7] : null;
|
|
||||||
$fragment = !empty($matches[8]) ? $matches[9] : null;
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
$registry =& HTMLPurifier_URISchemeRegistry::instance();
|
|
||||||
if ($scheme !== null) {
|
|
||||||
// no need to validate the scheme's fmt since we do that when we
|
|
||||||
// retrieve the specific scheme object from the registry
|
|
||||||
$scheme = ctype_lower($scheme) ? $scheme : strtolower($scheme);
|
|
||||||
$scheme_obj =& $registry->getScheme($scheme, $config, $context);
|
|
||||||
if (!$scheme_obj) return false; // invalid scheme, clean it out
|
|
||||||
} else {
|
|
||||||
$scheme_obj =& $registry->getScheme(
|
|
||||||
$config->get('URI', 'DefaultScheme'), $config, $context
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// back to string
|
||||||
|
$result = $uri->toString();
|
||||||
|
|
||||||
// the URI we're processing embeds_resource a resource in the page, but the URI
|
// munge entire URI if necessary
|
||||||
// it references cannot be located
|
if (
|
||||||
if ($this->embeds_resource && !$scheme_obj->browsable) {
|
!is_null($uri->host) && // indicator for authority
|
||||||
return false;
|
!empty($scheme_obj->browsable) &&
|
||||||
}
|
!is_null($munge = $config->get('URI', 'Munge'))
|
||||||
|
) {
|
||||||
|
$result = str_replace('%s', rawurlencode($result), $munge);
|
||||||
if ($authority !== null) {
|
|
||||||
|
|
||||||
// remove URI if it's absolute and we disabled externals or
|
|
||||||
// if it's absolute and embedded and we disabled external resources
|
|
||||||
unset($our_host);
|
|
||||||
if (
|
|
||||||
$config->get('URI', 'DisableExternal') ||
|
|
||||||
(
|
|
||||||
$config->get('URI', 'DisableExternalResources') &&
|
|
||||||
$this->embeds_resource
|
|
||||||
)
|
|
||||||
) {
|
|
||||||
$our_host = $config->get('URI', 'Host');
|
|
||||||
if ($our_host === null) return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
$HEXDIG = '[A-Fa-f0-9]';
|
|
||||||
$unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with []
|
|
||||||
$sub_delims = '!$&\'()'; // needs []
|
|
||||||
$pct_encoded = "%$HEXDIG$HEXDIG";
|
|
||||||
$r_userinfo = "(?:[$unreserved$sub_delims:]|$pct_encoded)*";
|
|
||||||
$r_authority = "/^(($r_userinfo)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
|
|
||||||
$matches = array();
|
|
||||||
preg_match($r_authority, $authority, $matches);
|
|
||||||
// overloads regexp!
|
|
||||||
$userinfo = !empty($matches[1]) ? $matches[2] : null;
|
|
||||||
$host = !empty($matches[3]) ? $matches[3] : null;
|
|
||||||
$port = !empty($matches[4]) ? $matches[5] : null;
|
|
||||||
|
|
||||||
// validate port
|
|
||||||
if ($port !== null) {
|
|
||||||
$port = (int) $port;
|
|
||||||
if ($port < 1 || $port > 65535) $port = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
$host = $this->host->validate($host, $config, $context);
|
|
||||||
if ($host === false) $host = null;
|
|
||||||
|
|
||||||
if ($this->checkBlacklist($host, $config, $context)) return false;
|
|
||||||
|
|
||||||
// more lenient absolute checking
|
|
||||||
if (isset($our_host)) {
|
|
||||||
$host_parts = array_reverse(explode('.', $host));
|
|
||||||
// could be cached
|
|
||||||
$our_host_parts = array_reverse(explode('.', $our_host));
|
|
||||||
foreach ($our_host_parts as $i => $discard) {
|
|
||||||
if (!isset($host_parts[$i])) return false;
|
|
||||||
if ($host_parts[$i] != $our_host_parts[$i]) return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// userinfo and host are validated within the regexp
|
|
||||||
|
|
||||||
} else {
|
|
||||||
$port = $host = $userinfo = null;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
// query and fragment are quite simple in terms of definition:
|
|
||||||
// *( pchar / "/" / "?" ), so define their validation routines
|
|
||||||
// when we start fixing percent encoding
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// path gets to be validated against a hodge-podge of rules depending
|
|
||||||
// on the status of authority and scheme, but it's not that important,
|
|
||||||
// esp. since it won't be applicable to everyone
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// okay, now we defer execution to the subobject for more processing
|
|
||||||
// note that $fragment is omitted
|
|
||||||
list($userinfo, $host, $port, $path, $query) =
|
|
||||||
$scheme_obj->validateComponents(
|
|
||||||
$userinfo, $host, $port, $path, $query, $config, $context
|
|
||||||
);
|
|
||||||
|
|
||||||
|
|
||||||
// reconstruct authority
|
|
||||||
$authority = null;
|
|
||||||
if (!is_null($userinfo) || !is_null($host) || !is_null($port)) {
|
|
||||||
$authority = '';
|
|
||||||
if($userinfo !== null) $authority .= $userinfo . '@';
|
|
||||||
$authority .= $host;
|
|
||||||
if($port !== null) $authority .= ':' . $port;
|
|
||||||
}
|
|
||||||
|
|
||||||
// reconstruct the result
|
|
||||||
$result = '';
|
|
||||||
if ($scheme !== null) $result .= "$scheme:";
|
|
||||||
if ($authority !== null) $result .= "//$authority";
|
|
||||||
$result .= $path;
|
|
||||||
if ($query !== null) $result .= "?$query";
|
|
||||||
if ($fragment !== null) $result .= "#$fragment";
|
|
||||||
|
|
||||||
// munge if necessary
|
|
||||||
$munge = $config->get('URI', 'Munge');
|
|
||||||
if (!empty($scheme_obj->browsable) && $munge !== null) {
|
|
||||||
if ($authority !== null) {
|
|
||||||
$result = str_replace('%s', rawurlencode($result), $munge);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return $result;
|
return $result;
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Checks a host against an array blacklist
|
|
||||||
* @param $host Host to check
|
|
||||||
* @param $config HTMLPurifier_Config instance
|
|
||||||
* @param $context HTMLPurifier_Context instance
|
|
||||||
* @return bool Is spam?
|
|
||||||
*/
|
|
||||||
function checkBlacklist($host, &$config, &$context) {
|
|
||||||
$blacklist = $config->get('URI', 'HostBlacklist');
|
|
||||||
if (!empty($blacklist)) {
|
|
||||||
foreach($blacklist as $blacklisted_host_fragment) {
|
|
||||||
if (strpos($host, $blacklisted_host_fragment) !== false) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
||||||
|
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
require_once 'HTMLPurifier/AttrDef.php';
|
require_once 'HTMLPurifier/AttrDef.php';
|
||||||
|
|
||||||
class HTMLPurifier_AttrDef_Email extends HTMLPurifier_AttrDef
|
class HTMLPurifier_AttrDef_URI_Email extends HTMLPurifier_AttrDef
|
||||||
{
|
{
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@@ -14,4 +14,3 @@ class HTMLPurifier_AttrDef_Email extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@@ -1,12 +1,12 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
require_once 'HTMLPurifier/AttrDef/Email.php';
|
require_once 'HTMLPurifier/AttrDef/URI/Email.php';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Primitive email validation class based on the regexp found at
|
* Primitive email validation class based on the regexp found at
|
||||||
* http://www.regular-expressions.info/email.html
|
* http://www.regular-expressions.info/email.html
|
||||||
*/
|
*/
|
||||||
class HTMLPurifier_AttrDef_Email_SimpleCheck extends HTMLPurifier_AttrDef_Email
|
class HTMLPurifier_AttrDef_URI_Email_SimpleCheck extends HTMLPurifier_AttrDef_URI_Email
|
||||||
{
|
{
|
||||||
|
|
||||||
function validate($string, $config, &$context) {
|
function validate($string, $config, &$context) {
|
||||||
@@ -20,4 +20,3 @@ class HTMLPurifier_AttrDef_Email_SimpleCheck extends HTMLPurifier_AttrDef_Email
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
@@ -1,28 +1,28 @@
|
|||||||
<?php
|
<?php
|
||||||
|
|
||||||
require_once 'HTMLPurifier/AttrDef.php';
|
require_once 'HTMLPurifier/AttrDef.php';
|
||||||
require_once 'HTMLPurifier/AttrDef/IPv4.php';
|
require_once 'HTMLPurifier/AttrDef/URI/IPv4.php';
|
||||||
require_once 'HTMLPurifier/AttrDef/IPv6.php';
|
require_once 'HTMLPurifier/AttrDef/URI/IPv6.php';
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Validates a host according to the IPv4, IPv6 and DNS (future) specifications.
|
* Validates a host according to the IPv4, IPv6 and DNS (future) specifications.
|
||||||
*/
|
*/
|
||||||
class HTMLPurifier_AttrDef_Host extends HTMLPurifier_AttrDef
|
class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
|
||||||
{
|
{
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Instance of HTMLPurifier_AttrDef_IPv4 sub-validator
|
* Instance of HTMLPurifier_AttrDef_URI_IPv4 sub-validator
|
||||||
*/
|
*/
|
||||||
var $ipv4;
|
var $ipv4;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Instance of HTMLPurifier_AttrDef_IPv6 sub-validator
|
* Instance of HTMLPurifier_AttrDef_URI_IPv6 sub-validator
|
||||||
*/
|
*/
|
||||||
var $ipv6;
|
var $ipv6;
|
||||||
|
|
||||||
function HTMLPurifier_AttrDef_Host() {
|
function HTMLPurifier_AttrDef_URI_Host() {
|
||||||
$this->ipv4 = new HTMLPurifier_AttrDef_IPv4();
|
$this->ipv4 = new HTMLPurifier_AttrDef_URI_IPv4();
|
||||||
$this->ipv6 = new HTMLPurifier_AttrDef_IPv6();
|
$this->ipv6 = new HTMLPurifier_AttrDef_URI_IPv6();
|
||||||
}
|
}
|
||||||
|
|
||||||
function validate($string, $config, &$context) {
|
function validate($string, $config, &$context) {
|
||||||
@@ -51,4 +51,3 @@ class HTMLPurifier_AttrDef_Host extends HTMLPurifier_AttrDef
|
|||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
?>
|
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user