mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-08-02 20:27:40 +02:00
Compare commits
7 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
48ce521572 | ||
|
728e6c5b44 | ||
|
8104145580 | ||
|
6ef8abd04f | ||
|
bc5871f389 | ||
|
30d75c999d | ||
|
64d8ca9831 |
11
Doxyfile
11
Doxyfile
@@ -4,7 +4,7 @@
|
||||
# Project related configuration options
|
||||
#---------------------------------------------------------------------------
|
||||
PROJECT_NAME = HTML Purifier
|
||||
PROJECT_NUMBER = 1.0.0
|
||||
PROJECT_NUMBER = 1.1.2
|
||||
OUTPUT_DIRECTORY = "C:/Documents and Settings/Edward/My Documents/My Webs/htmlpurifier/docs/doxygen"
|
||||
CREATE_SUBDIRS = NO
|
||||
OUTPUT_LANGUAGE = English
|
||||
@@ -89,9 +89,12 @@ EXCLUDE =
|
||||
EXCLUDE_SYMLINKS = NO
|
||||
EXCLUDE_PATTERNS = */tests/* \
|
||||
*/benchmarks/* \
|
||||
*/docs/phpdoc/* \
|
||||
*/docs/doxygen/* \
|
||||
*/test-settings.php
|
||||
*/docs/* \
|
||||
*/test-settings.php \
|
||||
*/configdoc/* \
|
||||
*/test-settings.php \
|
||||
*/maintenance/* \
|
||||
*/smoketests/*
|
||||
EXAMPLE_PATH =
|
||||
EXAMPLE_PATTERNS = *
|
||||
EXAMPLE_RECURSIVE = NO
|
||||
|
182
INSTALL
182
INSTALL
@@ -2,141 +2,183 @@
|
||||
Install
|
||||
How to install HTML Purifier
|
||||
|
||||
Being a library, there's no fancy GUI that will take you step-by-step through
|
||||
configuring database credentials and other mumbo-jumbo. HTML Purifier is
|
||||
designed to run "out of the box." Regardless, there are still a couple of
|
||||
things you should be mindful of.
|
||||
HTML Purifier is designed to run out of the box, so actually using the library
|
||||
is extremely easy. (Although, if you were looking for a step-by-step
|
||||
installation GUI, you've come to the wrong place!) The impatient can scroll
|
||||
down to the bottom of this INSTALL document to see the code, but you really
|
||||
should make sure a few things are properly done.
|
||||
|
||||
|
||||
|
||||
0. Compatibility
|
||||
1. Compatibility
|
||||
|
||||
HTML Purifier works in both PHP 4 and PHP 5. I have run the test suite on
|
||||
these versions:
|
||||
HTML Purifier works in both PHP 4 and PHP 5, from PHP 4.3.9 and up. It has no
|
||||
core dependencies with other libraries. (Whoopee!)
|
||||
|
||||
- 4.3.9, 4.3.11
|
||||
- 4.4.0, 4.4.4
|
||||
- 5.0.0, 5.0.4
|
||||
- 5.1.0, 5.1.6
|
||||
|
||||
And can confidently say that HTML Purifier should work in all versions
|
||||
between and afterwards. HTML Purifier definitely does not support PHP 4.2,
|
||||
and PHP 4.3 branch support may go further back than that, but I haven't tested
|
||||
any earlier versions.
|
||||
|
||||
I have been unable to get PHP 5.0.5 working on my computer, so if someone
|
||||
wants to test that, be my guest. All tests were done on Windows XP Home,
|
||||
but operating system should not be a major factor in the library.
|
||||
Optional extensions are iconv (usually installed) and tidy (also common).
|
||||
If you use UTF-8 and don't plan on pretty-printing HTML, you can get away with
|
||||
not having either of these extensions.
|
||||
|
||||
|
||||
|
||||
1. Including the proper files
|
||||
2. Including the library
|
||||
|
||||
The library/ directory must be added to your path: HTML Purifier will not be
|
||||
able to find the necessary includes otherwise. This is as simple as:
|
||||
Simply use:
|
||||
|
||||
set_include_path('/path/to/htmlpurifier/library' . PATH_SEPARATOR .
|
||||
get_include_path() );
|
||||
require_once '/path/to/library/HTMLPurifier.auto.php';
|
||||
|
||||
...replacing /path/to/htmlpurifier with the actual location of the folder. Don't
|
||||
worry, HTML Purifier is namespaced so unless you have another file named
|
||||
HTMLPurifier.php, the files won't collide with any of your includes.
|
||||
...and you're good to go. Since HTML Purifier's codebase is fairly
|
||||
large, I recommend only including HTML Purifier when you need it.
|
||||
|
||||
Then, it's a simple matter of including the base file:
|
||||
If you don't like your include_path to be fiddled around with, simply set
|
||||
HTML Purifier's library/ directory to the include path yourself and then:
|
||||
|
||||
require_once 'HTMLPurifier.php';
|
||||
require_once 'HTMLPurifier.php';
|
||||
|
||||
...and you're good to go.
|
||||
Only the contents in the library/ folder are necessary, so you can remove
|
||||
everything else when using HTML Purifier in a production environment.
|
||||
|
||||
|
||||
|
||||
2. Preparing the proper environment
|
||||
3. Preparing the proper output environment
|
||||
|
||||
While no configuration is necessary, you first should take precautions regarding
|
||||
the other output HTML that the filtered content will be going along with. Here
|
||||
is a (short) checklist:
|
||||
HTML Purifier is all about web-standards, so accordingly your webpages should
|
||||
be standards compliant. HTML Purifier can deal with these doctypes:
|
||||
|
||||
* Have I specified XHTML 1.0 Transitional as the doctype?
|
||||
* Have I specified UTF-8 as the character encoding?
|
||||
* XHTML 1.0 Transitional (default)
|
||||
* HTML 4.01 Transitional
|
||||
|
||||
...and these character encodings:
|
||||
|
||||
* UTF-8 (default)
|
||||
* Any encoding iconv supports (support is crippled for i18n though)
|
||||
|
||||
The defaults are there for a reason: they are best-practice choices that
|
||||
should not be changed lightly. For those of you in the dark, you can determine
|
||||
the doctype from this code in your HTML documents:
|
||||
|
||||
To find out what these are, browse to your website and view its source code.
|
||||
You can figure out the doctype from a declaration that looks like
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
or no doctype. You can figure out the character encoding by looking for
|
||||
|
||||
...and the character encoding from this code:
|
||||
|
||||
<meta http-equiv="Content-type" content="text/html;charset=ENCODING">
|
||||
|
||||
I cannot stress the importance of these two bullets enough. Omitting either
|
||||
of them could have dire consequences not only for security but for plain
|
||||
old usability. You can find a more in-depth discussion of why this is needed
|
||||
in docs/security.txt, in the meantime, try to change your output so this is
|
||||
the case.  If you can't, well, we might be able to accommodate you (read
|
||||
section 3).
|
||||
For legacy codebases these declarations may be missing. If that is the case,
|
||||
STOP, and read up on character encodings and doctypes (in that order). Here
|
||||
are some links:
|
||||
|
||||
* http://www.joelonsoftware.com/articles/Unicode.html
|
||||
* http://alistapart.com/stories/doctype/
|
||||
|
||||
You may currently be vulnerable to XSS and other security threats, and HTML
|
||||
Purifier won't be able to fix that.
|
||||
|
||||
|
||||
|
||||
3. Configuring HTML Purifier
|
||||
4. Configuration
|
||||
|
||||
HTML Purifier is designed to run out-of-the-box, but occasionally HTML
|
||||
Purifier needs to be told what to do.
|
||||
Purifier needs to be told what to do. If you answered no to any of these
|
||||
questions, read on, otherwise, you can skip to the next section (or, if you're
|
||||
into configuring things just for the heck of it, skip to 4.3).
|
||||
|
||||
If, for some reason, you are unable to switch to UTF-8 immediately, you can
|
||||
switch HTML Purifier's encoding. Note that the availability of encodings is
|
||||
dependent on iconv, and you'll be missing characters if the charset you
|
||||
choose doesn't have them.
|
||||
* Am I using UTF-8?
|
||||
* Am I using XHTML 1.0 Transitional?
|
||||
|
||||
If you answered no to any of these questions, instantiate a configuration
|
||||
object and read on:
|
||||
|
||||
$config = HTMLPurifier_Config::createDefault();
|
||||
|
||||
|
||||
|
||||
4.1. Setting a different character encoding
|
||||
|
||||
You really shouldn't use any other encoding except UTF-8, especially if you
|
||||
plan to support multilingual websites (read section three for more details).
|
||||
However, switching to UTF-8 is not always immediately feasible, so we can
|
||||
adapt.
|
||||
|
||||
HTML Purifier uses iconv to support other character encodings, as such,
|
||||
any encoding that iconv supports <http://www.gnu.org/software/libiconv/>
|
||||
HTML Purifier supports with this code:
|
||||
|
||||
$config->set('Core', 'Encoding', /* put your encoding here */);
|
||||
|
||||
An example usage for Latin-1 websites:
|
||||
An example usage for Latin-1 websites (the most common encoding for English
|
||||
websites):
|
||||
|
||||
$config->set('Core', 'Encoding', 'ISO-8859-1');
|
||||
|
||||
Note that HTML Purifier's support for non-Unicode encodings is crippled by the
|
||||
fact that any character not supported by that encoding will be silently
|
||||
dropped, EVEN if it is ampersand escaped. This is a current limitation of
|
||||
HTML Purifier that we are NOT actively working to fix. Patches are welcome,
|
||||
but there are so many other gotchas and problems in I18N for non-Unicode
|
||||
encodings that this functionality is low priority. See
|
||||
<http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html> for a more
|
||||
detailed lowdown on the topic.
|
||||
|
||||
|
||||
|
||||
4.2. Setting a different doctype
|
||||
|
||||
For those of you stuck using HTML 4.01 Transitional, you can disable
|
||||
XHTML output like this:
|
||||
|
||||
$config->set('Core', 'XHTML', false);
|
||||
|
||||
However, I strongly recommend that you use XHTML. Currently, we can only
|
||||
guarantee transitional-compliant output, future versions will also allow strict
|
||||
output.
|
||||
I recommend that you use XHTML, although not as much as I recommend UTF-8. If
|
||||
your HTML 4.01 page validates, good for you!
|
||||
|
||||
Currently, we can only guarantee transitional-compliant output, future
|
||||
versions will also allow strict-compliant output.
|
||||
|
||||
|
||||
|
||||
3. Using the code
|
||||
4.3. Other settings
|
||||
|
||||
There are more configuration directives which can be read about
|
||||
here: <http://hp.jpsband.org/live/configdoc/plain.html> They're a bit boring,
|
||||
but they can help out for those of you who like to exert maximum control over
|
||||
your code.
|
||||
|
||||
|
||||
|
||||
5. Using the code
|
||||
|
||||
The interface is mind-numbingly simple:
|
||||
|
||||
$purifier = new HTMLPurifier();
|
||||
$clean_html = $purifier->purify($dirty_html);
|
||||
$clean_html = $purifier->purify( $dirty_html );
|
||||
|
||||
Or, if you're using the configuration object:
|
||||
...or, if you're using the configuration object:
|
||||
|
||||
$purifier = new HTMLPurifier($config);
|
||||
$clean_html = $purifier->purify($dirty_html);
|
||||
$clean_html = $purifier->purify( $dirty_html );
|
||||
|
||||
That's it. For more examples, check out docs/examples/. Also, SLOW gives
|
||||
advice on what to do if HTML Purifier is slowing down your application.
|
||||
That's it! For more examples, check out docs/examples/ (they aren't very
|
||||
different though). Also, SLOW gives advice on what to do if HTML Purifier
|
||||
is slowing down your application.
|
||||
|
||||
|
||||
|
||||
4. Quick install
|
||||
6. Quick install
|
||||
|
||||
If your website is in UTF-8 and XHTML Transitional, use this code:
|
||||
|
||||
<?php
|
||||
set_include_path('/path/to/htmlpurifier/library'
|
||||
. PATH_SEPARATOR . get_include_path() );
|
||||
require_once 'HTMLPurifier.php';
|
||||
$purifier = new HTMLPurifier();
|
||||
require_once '/path/to/htmlpurifier/library/HTMLPurifier.auto.php';
|
||||
|
||||
$purifier = new HTMLPurifier();
|
||||
$clean_html = $purifier->purify($dirty_html);
|
||||
?>
|
||||
|
||||
If your website is in a different encoding or doctype, use this code:
|
||||
|
||||
<?php
|
||||
set_include_path('/path/to/htmlpurifier/library'
|
||||
. PATH_SEPARATOR . get_include_path() );
|
||||
require_once 'HTMLPurifier.php';
|
||||
require_once '/path/to/htmlpurifier/library/HTMLPurifier.auto.php';
|
||||
|
||||
$config = HTMLPurifier_Config::createDefault();
|
||||
$config->set('Core', 'Encoding', 'ISO-8859-1'); //replace with your encoding
|
||||
|
61
NEWS
61
NEWS
@@ -1,20 +1,47 @@
|
||||
NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
||||
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||
|
||||
1.2.0, unknown projected release date
|
||||
(major feature release)
|
||||
= KEY ====================
|
||||
! Feature
|
||||
- Bugfix
|
||||
+ Sub-comment
|
||||
. Internal change
|
||||
==========================
|
||||
|
||||
1.1.1, unknown projected release date
|
||||
(bugfix release)
|
||||
1.1.2, released 2006-09-30
|
||||
! Add HTMLPurifier.auto.php stub file that configures include_path
|
||||
- Documentation updated
|
||||
+ INSTALL document rewritten
|
||||
+ TODO added semi-lossy conversion
|
||||
+ API Doxygen docs' file exclusions updated
|
||||
+ Added notes on HTML versus XML attribute whitespace handling
|
||||
+ Noted that HTMLPurifier_ChildDef_Custom isn't being used
|
||||
+ Noted that config object's definitions are cached versions
|
||||
- Fixed lack of attribute parsing in HTMLPurifier_Lexer_PEARSax3
|
||||
- ftp:// URIs now have their typecodes checked
|
||||
- Hooked up HTMLPurifier_ChildDef_Custom's unit tests (they weren't being run)
|
||||
. Line endings standardized throughout project (svn:eol-style standardized)
|
||||
. Refactored parseData() to general Lexer class
|
||||
. Tester named "HTML Purifier" not "HTMLPurifier"
|
||||
|
||||
1.1.1, released 2006-09-24
|
||||
! Configuration option to optionally Tidy up output for indentation to make up
|
||||
for dropped whitespace by DOMLex (pretty-printing for the entire application
|
||||
should be done by a page-wide Tidy)
|
||||
- Various documentation updates
|
||||
- Fixed parse error in configuration documentation script
|
||||
- Fixed fatal error in benchmark scripts, slightly augmented
|
||||
- As far as possible, whitespace is preserved in-between table children
|
||||
- Sample test-settings.php file included
|
||||
|
||||
1.1.0, released 2006-09-16
|
||||
! Directive documentation generation using XSLT
|
||||
! XHTML can now be turned off, output becomes <br>
|
||||
- Made URI validator more forgiving: will ignore leading and trailing
|
||||
quotes, apostrophes and less than or greater than signs.
|
||||
- Enforce alphanumeric namespace and directive names for configuration.
|
||||
- Directive documentation generation using XSLT
|
||||
- Table child definition made more flexible, will fix up poorly ordered elements
|
||||
- XHTML generation can now be turned off, allowing things like <br>
|
||||
- Renamed ConfigDef to ConfigSchema
|
||||
. Renamed ConfigDef to ConfigSchema
|
||||
|
||||
1.0.1, released 2006-09-04
|
||||
- Fixed slight bug in DOMLex attribute parsing
|
||||
@@ -24,17 +51,17 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
||||
space in them. This manifested in TinyMCE.
|
||||
|
||||
1.0.0, released 2006-09-01
|
||||
! Shorthand CSS properties implemented: font, border, background, list-style
|
||||
! Basic color keywords translated into hexadecimal values
|
||||
! Table CSS properties implemented
|
||||
! Support for charsets other than UTF-8 (defined by iconv)
|
||||
! Malformed UTF-8 and non-SGML character detection and cleaning implemented
|
||||
- Fixed broken numeric entity conversion
|
||||
- Malformed UTF-8 and non-SGML character detection and cleaning implemented
|
||||
- API documentation completed
|
||||
- Shorthand CSS properties implemented: font, border, background, list-style
|
||||
- Basic color keywords translated into hexadecimal values
|
||||
- Table CSS properties implemented
|
||||
- (HTML|CSS)Definition de-singleton-ized
|
||||
- Support for charsets other than UTF-8 (defined by iconv)
|
||||
. (HTML|CSS)Definition de-singleton-ized
|
||||
|
||||
1.0.0beta, released 2006-08-16
|
||||
- First public release, most functionality implemented. Notable omissions are:
|
||||
. Shorthand CSS properties
|
||||
. Table CSS properties
|
||||
. Deprecated attribute transformations
|
||||
! First public release, most functionality implemented. Notable omissions are:
|
||||
+ Shorthand CSS properties
|
||||
+ Table CSS properties
|
||||
+ Deprecated attribute transformations
|
||||
|
24
README
24
README
@@ -1,13 +1,13 @@
|
||||
|
||||
README
|
||||
All about HTMLPurifier
|
||||
|
||||
HTMLPurifier is an HTML filtering solution. It uses a unique combination of
|
||||
robust whitelists and aggressive parsing to ensure that not only are XSS
|
||||
attacks thwarted, but the resulting HTML is standards compliant.
|
||||
|
||||
See INSTALL on how to use the library. See docs/ for more developer-oriented
|
||||
documentation as well as some code examples. Users of TinyMCE or FCKeditor
|
||||
may be especially interested in WYSIWYG.
|
||||
|
||||
HTMLPurifier can be found on the web at: http://hp.jpsband.org/
|
||||
README
|
||||
All about HTMLPurifier
|
||||
|
||||
HTMLPurifier is an HTML filtering solution. It uses a unique combination of
|
||||
robust whitelists and aggressive parsing to ensure that not only are XSS
|
||||
attacks thwarted, but the resulting HTML is standards compliant.
|
||||
|
||||
See INSTALL on how to use the library. See docs/ for more developer-oriented
|
||||
documentation as well as some code examples. Users of TinyMCE or FCKeditor
|
||||
may be especially interested in WYSIWYG.
|
||||
|
||||
HTMLPurifier can be found on the web at: http://hp.jpsband.org/
|
||||
|
13
SLOW
13
SLOW
@@ -17,18 +17,23 @@ second tacked on to the load time probably isn't going to be that huge of
|
||||
a problem. Then, displaying the content is simply a matter of outputting
|
||||
it directly from your database/filesystem. The trouble with this method is
|
||||
that your user loses the original text, and when doing edits, will be
|
||||
handling the filtered text. Of course, maybe that's a good thing. If you
|
||||
don't mind a little extra complexity, you can try...
|
||||
handling the filtered text. While this may be a good thing, especially if
|
||||
you're using a WYSIWYG editor, it can also result in data-loss if a user
|
||||
expects certain markup to be available but it isn't.
|
||||
|
||||
2. Caching the filtered output - accept the submitted text and put it
|
||||
unaltered into the database, but then also generate a filtered version and
|
||||
stash that in the database. Serve the filtered version to readers, and the
|
||||
unaltered version to editors. If need be, you can invalidate the cache and
|
||||
have the cached filtered version be regenerated on the first page view. Pros?
|
||||
Full data retention. Cons? It's more complicated.
|
||||
Full data retention. Cons? It's more complicated, and opens other editors
|
||||
up to XSS if they are using a WYSIWYG editor (to fix that, they'd have to
|
||||
be able to get their hands on the *really* original text served in plaintext
|
||||
mode).
|
||||
|
||||
In short, inbound filtering is almost as simple as outbound filtering, but
|
||||
it has some drawbacks which cannot be fixed unless you save both the original
|
||||
and the filtered versions.
|
||||
|
||||
There is a third option: profile and optimize HTMLPurifier yourself. ;-)
|
||||
There is a third option: profile and optimize HTMLPurifier yourself. Be sure
|
||||
to tell me if you decide to do that! ;-)
|
||||
|
27
TODO
27
TODO
@@ -6,24 +6,29 @@ Ongoing
|
||||
- Plugins for major CMSes (very tricky issue)
|
||||
|
||||
1.2 release
|
||||
- Additional support for poorly written HTML
|
||||
- Implement all non-essential attribute transforms
|
||||
- Microsoft Word HTML cleaning (i.e. MsoNormal)
|
||||
|
||||
1.3 release
|
||||
- Formatters for plaintext
|
||||
- Auto-paragraphing (be sure to leverage fact that we know when things
|
||||
shouldn't be paragraphed, such as lists and tables).
|
||||
- Make URI validation routines tighter (especially mailto)
|
||||
- More extensive URI filtering schemes
|
||||
- Allow for background-image and list-style-image (see above)
|
||||
- Distinguish between different types of URIs, for instance, a mailto URI
|
||||
in IMG SRC is nonsensical
|
||||
- Error logging for filtering/cleanup procedures
|
||||
|
||||
2.0 release
|
||||
1.3 release
|
||||
- Add various "levels" of cleaning
|
||||
- Related: Allow strict (X)HTML
|
||||
|
||||
1.4 release
|
||||
- Additional support for poorly written HTML
|
||||
- Implement all non-essential attribute transforms
|
||||
- Microsoft Word HTML cleaning (i.e. MsoNormal)
|
||||
|
||||
2.0 release
|
||||
- Formatters for plaintext
|
||||
- Auto-paragraphing (be sure to leverage fact that we know when things
|
||||
shouldn't be paragraphed, such as lists and tables).
|
||||
- Linkify URLs
|
||||
- Smileys
|
||||
|
||||
3.0 release
|
||||
- Extended HTML capabilities based on namespacing and tag transforms
|
||||
- Hooks for adding custom processors to custom namespaced tags and
|
||||
@@ -38,9 +43,11 @@ Unknown release (on a scratch-an-itch basis)
|
||||
- Fixes for Firefox's inability to handle COL alignment props (Bug 915)
|
||||
- Automatically add non-breaking spaces to empty table cells when
|
||||
empty-cells:show is applied to have compatibility with Internet Explorer
|
||||
- Pretty-printing HTML (adds dependency of Generator to HTMLDefinition)
|
||||
- Non-lossy dumb alternate character encoding transformations, achieved by
|
||||
numerically encoding all non-ASCII characters
|
||||
- Semi-lossy dumb alternate character encoding transformations, achieved by
|
||||
encoding all characters that have string entity equivalents
|
||||
|
||||
Wontfix
|
||||
- Non-lossy smart alternate character encoding transformations
|
||||
- Pretty-printing HTML, users can use Tidy on the output on entire page
|
||||
|
@@ -3,15 +3,24 @@
|
||||
// emulates inserting a dir called HTMLPurifier into your class dir
|
||||
set_include_path(get_include_path() . PATH_SEPARATOR . '../library/');
|
||||
|
||||
require_once 'HTMLPurifier/ConfigDef.php';
|
||||
require_once 'HTMLPurifier/Config.php';
|
||||
require_once 'HTMLPurifier/Lexer/DirectLex.php';
|
||||
require_once 'HTMLPurifier/Lexer/PEARSax3.php';
|
||||
@include_once '../test-settings.php';
|
||||
|
||||
$LEXERS = array(
|
||||
'DirectLex' => new HTMLPurifier_Lexer_DirectLex(),
|
||||
'PEARSax3' => new HTMLPurifier_Lexer_PEARSax3()
|
||||
);
|
||||
require_once 'HTMLPurifier/ConfigSchema.php';
|
||||
require_once 'HTMLPurifier/Config.php';
|
||||
|
||||
$LEXERS = array();
|
||||
$RUNS = isset($GLOBALS['HTMLPurifierTest']['Runs'])
|
||||
? $GLOBALS['HTMLPurifierTest']['Runs'] : 2;
|
||||
|
||||
require_once 'HTMLPurifier/Lexer/DirectLex.php';
|
||||
$LEXERS['DirectLex'] = new HTMLPurifier_Lexer_DirectLex();
|
||||
|
||||
if (!empty($GLOBALS['HTMLPurifierTest']['PEAR'])) {
|
||||
require_once 'HTMLPurifier/Lexer/PEARSax3.php';
|
||||
$LEXERS['PEARSax3'] = new HTMLPurifier_Lexer_PEARSax3();
|
||||
} else {
|
||||
exit('PEAR required to perform benchmark.');
|
||||
}
|
||||
|
||||
if (version_compare(PHP_VERSION, '5', '>=')) {
|
||||
require_once 'HTMLPurifier/Lexer/DOMLex.php';
|
||||
@@ -56,9 +65,12 @@ class RowTimer extends Benchmark_Timer
|
||||
if ($standard == false) $standard = $v['diff'];
|
||||
|
||||
$perc = $v['diff'] * 100 / $standard;
|
||||
$bad_run = ($v['diff'] < 0);
|
||||
|
||||
$out .= '<td align="right">' . number_format($perc, 2, '.', '') .
|
||||
'%</td>';
|
||||
$out .= '<td align="right"'.
|
||||
($bad_run ? ' style="color:#AAA;"' : '').
|
||||
'>' . number_format($perc, 2, '.', '') .
|
||||
'%</td><td>'.number_format($v['diff'],4,'.','').'</td>';
|
||||
|
||||
}
|
||||
|
||||
@@ -79,13 +91,13 @@ function print_lexers() {
|
||||
}
|
||||
|
||||
function do_benchmark($name, $document) {
|
||||
global $LEXERS;
|
||||
global $LEXERS, $RUNS;
|
||||
|
||||
$timer = new RowTimer($name);
|
||||
$timer->start();
|
||||
|
||||
foreach($LEXERS as $key => $lexer) {
|
||||
$tokens = $lexer->tokenizeHTML($document);
|
||||
for ($i=0; $i<$RUNS; $i++) $tokens = $lexer->tokenizeHTML($document);
|
||||
$timer->setMarker($key);
|
||||
}
|
||||
|
||||
@@ -103,7 +115,7 @@ function do_benchmark($name, $document) {
|
||||
<table border="1">
|
||||
<tr><th>Case</th><?php
|
||||
foreach ($LEXERS as $key => $value) {
|
||||
echo '<th>' . htmlspecialchars($key) . '</th>';
|
||||
echo '<th colspan="2">' . htmlspecialchars($key) . '</th>';
|
||||
}
|
||||
?></tr>
|
||||
<?php
|
||||
|
@@ -2,7 +2,7 @@
|
||||
|
||||
set_include_path(get_include_path() . PATH_SEPARATOR . '../library/');
|
||||
|
||||
require_once 'HTMLPurifier/ConfigDef.php';
|
||||
require_once 'HTMLPurifier/ConfigSchema.php';
|
||||
require_once 'HTMLPurifier/Config.php';
|
||||
require_once 'HTMLPurifier/Lexer/DirectLex.php';
|
||||
|
||||
|
@@ -50,7 +50,7 @@ function appendHTMLDiv($document, $node, $html) {
|
||||
// ---------------------------------------------------------------------------
|
||||
// Load copies of HTMLPurifier_ConfigSchema and HTMLPurifier
|
||||
|
||||
$definition = HTMLPurifier_ConfigDef::instance();
|
||||
$schema = HTMLPurifier_ConfigSchema::instance();
|
||||
$purifier = new HTMLPurifier();
|
||||
|
||||
|
||||
@@ -61,7 +61,7 @@ $types_document = new DOMDocument('1.0', 'UTF-8');
|
||||
$types_root = $types_document->createElement('types');
|
||||
$types_document->appendChild($types_root);
|
||||
$types_document->formatOutput = true;
|
||||
foreach ($definition->types as $name => $expanded_name) {
|
||||
foreach ($schema->types as $name => $expanded_name) {
|
||||
$types_type = $types_document->createElement('type', $expanded_name);
|
||||
$types_type->setAttribute('id', $name);
|
||||
$types_root->appendChild($types_type);
|
||||
@@ -88,7 +88,7 @@ TODO for XML format:
|
||||
- create a definition (DTD or other) once interface stabilizes
|
||||
*/
|
||||
|
||||
foreach($definition->info as $namespace_name => $namespace_info) {
|
||||
foreach($schema->info as $namespace_name => $namespace_info) {
|
||||
|
||||
$dom_namespace = $dom_document->createElement('namespace');
|
||||
$dom_root->appendChild($dom_namespace);
|
||||
@@ -100,7 +100,7 @@ foreach($definition->info as $namespace_name => $namespace_info) {
|
||||
$dom_namespace_description = $dom_document->createElement('description');
|
||||
$dom_namespace->appendChild($dom_namespace_description);
|
||||
appendHTMLDiv($dom_document, $dom_namespace_description,
|
||||
$definition->info_namespace[$namespace_name]->description);
|
||||
$schema->info_namespace[$namespace_name]->description);
|
||||
|
||||
foreach ($namespace_info as $name => $info) {
|
||||
|
||||
@@ -128,14 +128,14 @@ foreach($definition->info as $namespace_name => $namespace_info) {
|
||||
}
|
||||
}
|
||||
|
||||
$raw_default = $definition->defaults[$namespace_name][$name];
|
||||
$raw_default = $schema->defaults[$namespace_name][$name];
|
||||
if (is_bool($raw_default)) {
|
||||
$default = $raw_default ? 'true' : 'false';
|
||||
} elseif (is_string($raw_default)) {
|
||||
$default = "\"$raw_default\"";
|
||||
} else {
|
||||
$default = print_r(
|
||||
$definition->defaults[$namespace_name][$name], true
|
||||
$schema->defaults[$namespace_name][$name], true
|
||||
);
|
||||
}
|
||||
$dom_constraints->appendChild(
|
||||
|
@@ -1,7 +1,7 @@
|
||||
table {border-collapse:collapse;}
|
||||
table td, table th {padding:0.2em;}
|
||||
|
||||
table.constraints {margin:0 0 1em;}
|
||||
table.constraints th {text-align:left;padding-left:0.4em;}
|
||||
table.constraints td {padding-right:0.4em;}
|
||||
table.constraints td pre {margin:0;}
|
||||
table {border-collapse:collapse;}
|
||||
table td, table th {padding:0.2em;}
|
||||
|
||||
table.constraints {margin:0 0 1em;}
|
||||
table.constraints th {text-align:left;padding-left:0.4em;}
|
||||
table.constraints td {padding-right:0.4em;}
|
||||
table.constraints td pre {margin:0;}
|
||||
|
@@ -1,105 +1,105 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<xsl:stylesheet
|
||||
version = "1.0"
|
||||
xmlns = "http://www.w3.org/1999/xhtml"
|
||||
xmlns:xsl = "http://www.w3.org/1999/XSL/Transform"
|
||||
>
|
||||
<xsl:output
|
||||
method = "xml"
|
||||
encoding = "UTF-8"
|
||||
doctype-public = "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
doctype-system = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"
|
||||
indent = "no"
|
||||
media-type = "text/html"
|
||||
/>
|
||||
|
||||
<xsl:variable name="typeLookup" select="document('../types.xml')" />
|
||||
|
||||
<xsl:template match="/">
|
||||
<html lang="en" xml:lang="en">
|
||||
<head>
|
||||
<title><xsl:value-of select="/configdoc/title" /> Configuration Documentation</title>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
|
||||
<link rel="stylesheet" type="text/css" href="styles/plain.css" />
|
||||
</head>
|
||||
<body>
|
||||
<xsl:apply-templates />
|
||||
</body>
|
||||
</html>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="title">
|
||||
<h1><xsl:value-of select="/configdoc/title" /> Configuration Documentation</h1>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="namespace">
|
||||
<xsl:apply-templates />
|
||||
<xsl:if test="count(child::directive)=0">
|
||||
<p>No configuration directives defined for this namespace.</p>
|
||||
</xsl:if>
|
||||
</xsl:template>
|
||||
<xsl:template match="namespace/name">
|
||||
<h2 id="{../@id}"><xsl:value-of select="text()" /></h2>
|
||||
</xsl:template>
|
||||
<xsl:template match="namespace/description">
|
||||
<div class="description">
|
||||
<xsl:copy-of select="div/node()" />
|
||||
</div>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="directive">
|
||||
<xsl:apply-templates />
|
||||
</xsl:template>
|
||||
<xsl:template match="directive/name">
|
||||
<h3 id="{../@id}"><xsl:value-of select="text()" /></h3>
|
||||
</xsl:template>
|
||||
<xsl:template match="directive/constraints">
|
||||
<table class="constraints">
|
||||
<xsl:apply-templates />
|
||||
<!-- Calculated other values -->
|
||||
<tr>
|
||||
<th>Used by:</th>
|
||||
<td>
|
||||
<xsl:for-each select="../descriptions/description">
|
||||
<xsl:if test="position()>1">, </xsl:if>
|
||||
<xsl:value-of select="@file" />
|
||||
</xsl:for-each>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</xsl:template>
|
||||
<xsl:template match="directive//description">
|
||||
<div class="description">
|
||||
<xsl:copy-of select="div/node()" />
|
||||
</div>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="constraints/type">
|
||||
<tr>
|
||||
<th>Type:</th>
|
||||
<td>
|
||||
<xsl:variable name="type" select="text()" />
|
||||
<xsl:attribute name="class">type type-<xsl:value-of select="$type" /></xsl:attribute>
|
||||
<xsl:value-of select="$typeLookup/types/type[@id=$type]/text()" />
|
||||
</td>
|
||||
</tr>
|
||||
</xsl:template>
|
||||
<xsl:template match="constraints/allowed">
|
||||
<tr>
|
||||
<th>Allowed values:</th>
|
||||
<td>
|
||||
<xsl:for-each select="value"><!--
|
||||
--><xsl:if test="position()>1">, </xsl:if>
|
||||
"<xsl:value-of select="." />"<!--
|
||||
--></xsl:for-each>
|
||||
</td>
|
||||
</tr>
|
||||
</xsl:template>
|
||||
<xsl:template match="constraints/default">
|
||||
<tr>
|
||||
<th>Default:</th>
|
||||
<td><pre><xsl:value-of select="." xml:space="preserve" /></pre></td>
|
||||
</tr>
|
||||
</xsl:template>
|
||||
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<xsl:stylesheet
|
||||
version = "1.0"
|
||||
xmlns = "http://www.w3.org/1999/xhtml"
|
||||
xmlns:xsl = "http://www.w3.org/1999/XSL/Transform"
|
||||
>
|
||||
<xsl:output
|
||||
method = "xml"
|
||||
encoding = "UTF-8"
|
||||
doctype-public = "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
doctype-system = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"
|
||||
indent = "no"
|
||||
media-type = "text/html"
|
||||
/>
|
||||
|
||||
<xsl:variable name="typeLookup" select="document('../types.xml')" />
|
||||
|
||||
<xsl:template match="/">
|
||||
<html lang="en" xml:lang="en">
|
||||
<head>
|
||||
<title><xsl:value-of select="/configdoc/title" /> Configuration Documentation</title>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
|
||||
<link rel="stylesheet" type="text/css" href="styles/plain.css" />
|
||||
</head>
|
||||
<body>
|
||||
<xsl:apply-templates />
|
||||
</body>
|
||||
</html>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="title">
|
||||
<h1><xsl:value-of select="/configdoc/title" /> Configuration Documentation</h1>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="namespace">
|
||||
<xsl:apply-templates />
|
||||
<xsl:if test="count(child::directive)=0">
|
||||
<p>No configuration directives defined for this namespace.</p>
|
||||
</xsl:if>
|
||||
</xsl:template>
|
||||
<xsl:template match="namespace/name">
|
||||
<h2 id="{../@id}"><xsl:value-of select="text()" /></h2>
|
||||
</xsl:template>
|
||||
<xsl:template match="namespace/description">
|
||||
<div class="description">
|
||||
<xsl:copy-of select="div/node()" />
|
||||
</div>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="directive">
|
||||
<xsl:apply-templates />
|
||||
</xsl:template>
|
||||
<xsl:template match="directive/name">
|
||||
<h3 id="{../@id}"><xsl:value-of select="text()" /></h3>
|
||||
</xsl:template>
|
||||
<xsl:template match="directive/constraints">
|
||||
<table class="constraints">
|
||||
<xsl:apply-templates />
|
||||
<!-- Calculated other values -->
|
||||
<tr>
|
||||
<th>Used by:</th>
|
||||
<td>
|
||||
<xsl:for-each select="../descriptions/description">
|
||||
<xsl:if test="position()>1">, </xsl:if>
|
||||
<xsl:value-of select="@file" />
|
||||
</xsl:for-each>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</xsl:template>
|
||||
<xsl:template match="directive//description">
|
||||
<div class="description">
|
||||
<xsl:copy-of select="div/node()" />
|
||||
</div>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="constraints/type">
|
||||
<tr>
|
||||
<th>Type:</th>
|
||||
<td>
|
||||
<xsl:variable name="type" select="text()" />
|
||||
<xsl:attribute name="class">type type-<xsl:value-of select="$type" /></xsl:attribute>
|
||||
<xsl:value-of select="$typeLookup/types/type[@id=$type]/text()" />
|
||||
</td>
|
||||
</tr>
|
||||
</xsl:template>
|
||||
<xsl:template match="constraints/allowed">
|
||||
<tr>
|
||||
<th>Allowed values:</th>
|
||||
<td>
|
||||
<xsl:for-each select="value"><!--
|
||||
--><xsl:if test="position()>1">, </xsl:if>
|
||||
"<xsl:value-of select="." />"<!--
|
||||
--></xsl:for-each>
|
||||
</td>
|
||||
</tr>
|
||||
</xsl:template>
|
||||
<xsl:template match="constraints/default">
|
||||
<tr>
|
||||
<th>Default:</th>
|
||||
<td><pre><xsl:value-of select="." xml:space="preserve" /></pre></td>
|
||||
</tr>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
@@ -11,24 +11,24 @@ profiling.
|
||||
Here we go:
|
||||
|
||||
AttrDef
|
||||
Class - doesn't support Unicode characters, uses regular expressions
|
||||
Lang - code duplication, premature optimization, doesn't consult official
|
||||
lists
|
||||
Pixels/Length/MultiLength - implemented according to HTML spec (excludes
|
||||
code reuse in CSS)
|
||||
URI - multiple regular expressions, needs host validation routines factored
|
||||
out for mailto scheme, IPv6 validation is broken (fringe), unintuitive
|
||||
variable overwriting, missing validation for query, fragment and path,
|
||||
Class - doesn't support Unicode characters (fringe); uses regular
|
||||
expressions
|
||||
Lang - code duplication; premature optimization; doesn't consult official
|
||||
lists (fringe)
|
||||
Length - easily mistaken for CSSLength
|
||||
URI - multiple regular expressions; needs host validation routines factored
|
||||
out for mailto scheme; missing validation for query, fragment and path;
|
||||
no percent-encode fixing
|
||||
CSS - parser doesn't accept advanced CSS (fringe)
|
||||
Number - constructor interface is inconsistent with Integer
|
||||
AttrTransform - doesn't accept AttrContext, non-validating
|
||||
ChildDef - not-allowed nodes translated to text, likely invalid handling
|
||||
AttrTransform - doesn't accept AttrContext
|
||||
Config - "load configuration" hooks missing, rich set* accessors missing
|
||||
ConfigSchema - redefinition is a mess
|
||||
Strategy
|
||||
FixNesting - cannot bubble nodes out of structures
|
||||
MakeWellFormed - insufficient automatic closing definitions (check HTML
|
||||
spec for optional end tags).
|
||||
spec for optional end tags, also, closing based on type (block/inline)
|
||||
might be efficient).
|
||||
RemoveForeignElements - should be run in parallel with MakeWellFormed
|
||||
URIScheme - needs to have callable generic checks
|
||||
ftp - missing typecode check
|
||||
|
23
docs/colors.txt
Normal file
23
docs/colors.txt
Normal file
@@ -0,0 +1,23 @@
|
||||
|
||||
Colors
|
||||
Hammering some sense into those content-makers
|
||||
|
||||
Your website probably has a color-scheme. Green on white, purple on yellow,
|
||||
whatever. When you give users the ability to style their content, you may
|
||||
want them to keep in line with your styling. If your website is all
|
||||
about light colors, you don't want a user to come in and vandalize your
|
||||
page with a deep maroon.
|
||||
|
||||
This is an extremely silly feature proposal, but I'm writing it down anyway.
|
||||
|
||||
What if the user could constrain the colors specified in inline styles? You
|
||||
are only allowed to use these shades of dark green for text and these shades
|
||||
of light yellow for the background. At the very least, you could ensure
|
||||
that we did not have pale yellow on white text.
|
||||
|
||||
Implementation issues:
|
||||
1. Requires the color attribute definition to know, currently, what the text
|
||||
and background colors are. This becomes difficult when classes are thrown
|
||||
into the mix.
|
||||
2. The user still has to define the permissible colors, how does one do
|
||||
something like that?
|
@@ -21,7 +21,9 @@ if (!empty($_POST['html'])) {
|
||||
|
||||
$html = get_magic_quotes_gpc() ? stripslashes($_POST['html']) : $_POST['html'];
|
||||
|
||||
$purifier = new HTMLPurifier();
|
||||
$config = HTMLPurifier_Config::createDefault();
|
||||
$config->set('Core', 'TidyFormat', !empty($_POST['tidy']));
|
||||
$purifier = new HTMLPurifier($config);
|
||||
$pure_html = $purifier->purify($html);
|
||||
|
||||
?>
|
||||
@@ -65,6 +67,8 @@ if (isset($html)) {
|
||||
HTMLPurifier_Encoder::cleanUTF8($html), ENT_COMPAT, 'UTF-8');
|
||||
}
|
||||
?></textarea>
|
||||
<div>Nicely format output with Tidy? <input type="checkbox" value="1"
|
||||
name="tidy"<?php if (!empty($_POST['tidy'])) echo ' checked="checked"'; ?> /></div>
|
||||
<div>
|
||||
<input type="submit" value="Submit" name="submit" class="button" />
|
||||
</div>
|
||||
|
@@ -20,15 +20,32 @@ can further be customized using simpler configuration options.
|
||||
Here are some fuzzy levels you could set:
|
||||
|
||||
1. Comments - Wordpress recommends a, abbr, acronym, b, blockquote, cite,
|
||||
code, em, i, strike, strong; however, you could get away with only a, b and
|
||||
i; also having p and pre tags would be helpful.
|
||||
2. Pages - As permissive as possible without allowing XSS. No protection
|
||||
code, em, i, strike, strong; however, you could get away with only a, em and
|
||||
p; also having blockquote and pre tags would be helpful.
|
||||
2. BBCode - Emulate the usual tagset for forums: b, i, img, a, blockquote,
|
||||
pre, div, span and h[2-6] (the last three are for specially formatted
|
||||
posts, div and span require associated classes or inline styling enabled
|
||||
to be useful)
|
||||
3. Pages - As permissive as possible without allowing XSS. No protection
|
||||
against bad design sense, unfortunately. Suitable for wiki and page
|
||||
environments.
|
||||
3. Lint - Accept everything in the spec, a Tidy wannabe.
|
||||
4. Lint - Accept everything in the spec, a Tidy wannabe. (This probably won't
|
||||
get implemented as it would require routines for things like <object>
|
||||
and friends to be implemented, which is a lot of work for not a lot of
|
||||
benefit)
|
||||
|
||||
I've also decomposed tags into risk levels. An asterisk indicates that no one
|
||||
really uses that tag, tilde indicates it's deprecated.
|
||||
One final note: when you start axing tags that are more commonly used, you
|
||||
run the risk of accidentally destroying user data, especially if the data
|
||||
is incoming from a WYSIWYG editor that hasn't been synced accordingly. This may
|
||||
make forbidden element to text transformations desirable (for example, images).
|
||||
|
||||
|
||||
|
||||
== Element Risk Analysis ==
|
||||
|
||||
Legend:
|
||||
[danger level] - regular tags / uncommon tags ~ deprecated tags
|
||||
[danger level]* - rare tags
|
||||
|
||||
1 - blockquote, code, em, i, p, tt / strong, sub, sup
|
||||
1* - abbr, acronym, bdo, cite, dfn, kbd, q, samp
|
||||
@@ -38,30 +55,76 @@ really uses that tag, tilde indicates it's deprecated.
|
||||
5 - a
|
||||
7 - area, map
|
||||
|
||||
These are special use tags, they should be enabled on a blanket basis.
|
||||
|
||||
Lists - dd, dl, dt, li, ol, ul ~ menu, dir
|
||||
Tables - caption, table, td, th, tr / col, colgroup, tbody, tfoot, thead
|
||||
|
||||
Forms - fieldset, form, input, label, legend, optgroup, option, select, textarea
|
||||
XSS - noscript, object, script ~ applet
|
||||
|
||||
Meta - base, basefont, body, head, html, link, meta, style, title
|
||||
Frames - frame, frameset, iframe
|
||||
|
||||
And tag specific notes:
|
||||
|
||||
a - general problems involving linkspam
|
||||
b - too much bold is bad, typographically speaking bold is discouraged
|
||||
br - often misused
|
||||
a - general problems involving linkspam
|
||||
b - too much bold is bad, typographically speaking bold is discouraged
|
||||
br - often misused
|
||||
center - CSS, usually no legit use
|
||||
del - only useful in editing context
|
||||
div - little meaning in certain contexts i.e. blog comment
|
||||
h1 - usually no legit use, as header is already set by application
|
||||
h* - not needed in blog comments
|
||||
hr - usually not necessary in blog comments
|
||||
img - could be extremely undesirable if linking to external pics
|
||||
h1 - usually no legit use, as header is already set by application
|
||||
h* - not needed in blog comments
|
||||
hr - usually not necessary in blog comments
|
||||
img - could be extremely undesirable if linking to external pics (CSRF, goatse)
|
||||
pre - could use formatting, only useful in code contexts
|
||||
q - very little support
|
||||
s - transform into span with styling or del?
|
||||
q - very little support
|
||||
s - transform into span with styling or del?
|
||||
small - technically presentational
|
||||
span - depends on attribute allowances
|
||||
sub, sup - specialized
|
||||
u - little legit use, prefer class with text-decoration
|
||||
u - little legit use, prefer class with text-decoration
|
||||
|
||||
Based on the riskiness of the items, we may want to offer %HTML.DisableImages
|
||||
attribute and put URI filtering higher up on the priority list.
|
||||
|
||||
|
||||
== Attribute Risk Analysis ==
|
||||
|
||||
We actually have a surprisingly small assortment of allowed attributes (the
|
||||
rest are deprecated in strict, and thus we opted not to allow them, even
|
||||
though our output is XHTML Transitional by default.)
|
||||
|
||||
Required URI - img.alt, img.src, a.href
|
||||
Medium risk - *.class, *.dir
|
||||
High risk - img.height, img.width, *.id, *.style
|
||||
|
||||
Table - colgroup/col.span, td/th.rowspan, td/th.colspan
|
||||
Uncommon - *.title, *.lang, *.xml:lang
|
||||
Rare - td/th.abbr, table.summary, {table}.charoff
|
||||
Rare URI - del.cite, ins.cite, blockquote.cite, q.cite, img.longdesc
|
||||
Presentational - {table}.align, {table}.valign, table.frame, table.rules,
|
||||
table.border
|
||||
Partially presentational - table.cellpadding, table.cellspacing,
|
||||
table.width, col.width, colgroup.width
|
||||
|
||||
|
||||
== CSS Risk Analysis ==
|
||||
|
||||
There are certain CSS elements that are extremely useful inline, but then
|
||||
as you get to more presentation oriented styling it may not always be
|
||||
appropriate to inline them.
|
||||
|
||||
Useful - clear, float, border-collapse, caption-side
|
||||
|
||||
These CSS properties can break layouts if used improperly. We have excluded
|
||||
any CSS properties that are not currently implemented (such as position).
|
||||
|
||||
Dangerous, can go outside container - float
|
||||
Easy to abuse - font-size, font-family (font), width
|
||||
Colored - background-color (background), border-color (border), color
|
||||
Dramatic - border, list-style-position (list-style), margin, padding,
|
||||
text-align, text-indent, text-transform, vertical-align, line-height
|
||||
|
||||
Dramatic elements substantially change the look of text in ways that should
|
||||
probably have been reserved to other areas.
|
||||
|
@@ -2,7 +2,8 @@
|
||||
Optimization
|
||||
|
||||
Here are some possible optimization techniques we can apply to code sections if
|
||||
they turn out to be slow. Be sure not to prematurely optimize though!
|
||||
they turn out to be slow. Be sure not to prematurely optimize: if you get
|
||||
that itch, put it here!
|
||||
|
||||
- Make Tokens Flyweights (may prove problematic, probably not worth it)
|
||||
- Rewrite regexps into PHP code
|
||||
|
@@ -86,7 +86,7 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
|
||||
Well-supported values are: disc, circle, square,
|
||||
decimal, lower-roman, upper-roman, lower-alpha and upper-alpha. See also
|
||||
CSS 3. Mostly IE lack of support.</td></tr>
|
||||
<tr class="css1 impl-yes"><td>list-style</td><td>SHORTHAND, target milestone 1.0</td></tr>
|
||||
<tr class="css1 impl-yes"><td>list-style</td><td>SHORTHAND</td></tr>
|
||||
<tr class="css1 impl-yes"><td>margin</td><td>MULTIPLE</td></tr>
|
||||
<tr class="css1 impl-yes"><td>margin-*</td><td>COMPOSITE(<length>,
|
||||
<percentage>, auto)</td></tr>
|
||||
@@ -134,7 +134,7 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
|
||||
|
||||
<tbody>
|
||||
<tr><th colspan="2">Unknown</th></tr>
|
||||
<tr class="danger css1"><td>background-image</td><td>Dangerous, target milestone 1.3</td></tr>
|
||||
<tr class="danger css1"><td>background-image</td><td>Dangerous, target milestone 1.2</td></tr>
|
||||
<tr class="css1"><td>background-attachment</td><td>ENUM(scroll, fixed),
|
||||
Depends on background-image</td></tr>
|
||||
<tr class="css1"><td>background-position</td><td>Depends on background-image</td></tr>
|
||||
@@ -144,7 +144,7 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
|
||||
inline-block has incomplete IE6 support and requires -moz-inline-box
|
||||
for Mozilla. Unknown target milestone.</td></tr>
|
||||
<tr><td class="css1">height</td><td>Interesting, why use it? Unknown target milestone.</td></tr>
|
||||
<tr class="danger css1"><td>list-style-image</td><td>Dangerous? Target milestone 1.3</td></tr>
|
||||
<tr class="danger css1"><td>list-style-image</td><td>Dangerous? Target milestone 1.2</td></tr>
|
||||
<tr class="impl-no"><td>max-height</td><td rowspan="4">No IE 5/6</td></tr>
|
||||
<tr class="impl-no"><td>min-height</td></tr>
|
||||
<tr class="impl-no"><td>max-width</td></tr>
|
||||
@@ -254,7 +254,7 @@ Mozilla on inside and needs -moz-outline, no IE support.</td></tr>
|
||||
</tbody>
|
||||
|
||||
<tbody>
|
||||
<tr><th colspan="3">Transform, target milestone 1.2</th></tr>
|
||||
<tr><th colspan="3">Transform, target milestone 1.4</th></tr>
|
||||
<tr><td rowspan="5">align</td><td>CAPTION</td><td>Near-equiv style 'caption-side', drop left and right</td></tr>
|
||||
<tr><td>IMG</td><td rowspan="2">Margin-left and margin-right = auto or parent div</td></tr>
|
||||
<tr><td>TABLE</td></tr>
|
||||
|
@@ -6,30 +6,39 @@ through negligence of people. This class will do its job: no more, no less,
|
||||
and it's up to you to provide it the proper information and proper context
|
||||
to be effective. Things to remember:
|
||||
|
||||
1. UTF-8. Currently, the parser runs under the assumption that it is dealing
|
||||
1. Character Encoding: UTF-8.
|
||||
Currently, the parser runs under the assumption that it is dealing
|
||||
with UTF-8. Not ISO-8859-1 or Windows-1252, UTF-8. And definitely not "no
|
||||
character encoding explicitly stated" or UTF-7. If you're not using UTF-8 as
|
||||
your character encoding, you should switch. Now. Make sure any input is
|
||||
properly converted to UTF-8, or the parser will mangle it badly
|
||||
(though it won't be a security risk if you're outputting it as UTF-8 though).
|
||||
your character encoding, make sure you configure HTML Purifier or switch
|
||||
to UTF-8. Now. Also, make sure any input is properly converted to UTF-8, or
|
||||
the parser will mangle it badly (though it won't be a security risk if you're
|
||||
outputting it as UTF-8). Character encoding is, in general, a knotty
|
||||
issue, but do yourself a favor and learn about it:
|
||||
<http://www.joelonsoftware.com/articles/Unicode.html>
|
||||
|
||||
2. XHTML 1.0 Transitional. This is what the parser is outputting. For the most
|
||||
2. Doctype: XHTML 1.0 Transitional
|
||||
This is what the parser is outputting. For the most
|
||||
part, it's compatible with HTML 4.01, but XHTML enforces some very nice things
|
||||
that all web developers should use. Regardless, NO DOCTYPE is a NO. Quirks mode
|
||||
has waaaay too many quirks for a little parser to handle. We did not select
|
||||
strict in order to prevent ourselves from being too draconic on users, but
|
||||
this may be configurable in the future.
|
||||
this may be configurable in the future. Do you want standards compliance?
|
||||
The doctype is a good place to start.
|
||||
|
||||
3. IDs. They need to be unique, but without some knowledge of the
|
||||
3. IDs
|
||||
They need to be unique, but without some knowledge of the
|
||||
rest of the document, it's difficult to know what's unique. %Attr.IDBlacklist
|
||||
needs to be set: we may want to consider disallowing IDs by default to
|
||||
save lazy programmers.
|
||||
|
||||
4. [PROJECTED] Links. We're not going to try for spam protection (although
|
||||
4. [PROJECTED] Links
|
||||
We're not going to try for spam protection (although
|
||||
some hooks for such a module might be nice) but we may offer the ability to
|
||||
only accept relative URLs. Pick the one that's right for you.
|
||||
|
||||
5. CSS. While we can prevent the most flagrant cases from affecting your
|
||||
5. CSS
|
||||
While we can prevent the most flagrant cases from affecting your
|
||||
layout (such as absolutely positioned elements), no amount of code is going
|
||||
to protect your pages from being attacked by garish colors and plain old
|
||||
bad taste. A neat feature would be the ability to define acceptable colors
|
||||
|
25
docs/strictness.txt
Normal file
25
docs/strictness.txt
Normal file
@@ -0,0 +1,25 @@
|
||||
|
||||
Is HTML Purifier Strict or Transitional?
|
||||
A little bit of helpful guidance
|
||||
|
||||
Despite the fact that HTML Purifier professes only to support transitional
|
||||
HTML, it rejects a lot of attributes and elements that are actually, indeed,
|
||||
valid. You can investigate progress.html to find out precisely what we
|
||||
are doing to these *deprecated* attributes.
|
||||
|
||||
However, users have found that Strict HTML imposes some quite unreasonable
|
||||
restrictions on certain things. The start and value attributes in ol and
|
||||
li (respectively) perhaps are the most contested. There is currently no
|
||||
widely supported browser method short of JavaScript that can replace these
|
||||
two deprecated attributes. HTML Purifier does not currently support them, but
|
||||
it might behoove us to do so while our output is still transitional.
|
||||
|
||||
Fortunately, that's the only real bugger case. The others have near-perfect
|
||||
CSS equivalents, and were presentational anyway. However, the other question
|
||||
pops up: should we always convert these to the CSS forms when 1. the spec
|
||||
allows them anyway and 2. older browsers support them better? After all, the
|
||||
whole point about CSS is to separate styling from content, so inline styling
|
||||
doesn't solve that problem.
|
||||
|
||||
It's an icky question, and we'll have to deal with it as more and more
|
||||
transforms get implemented.
|
10
library/HTMLPurifier.auto.php
Normal file
10
library/HTMLPurifier.auto.php
Normal file
@@ -0,0 +1,10 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* This is a stub include that automatically configures the include path.
|
||||
*/
|
||||
|
||||
set_include_path(dirname(__FILE__) . PATH_SEPARATOR . get_include_path() );
|
||||
require_once 'HTMLPurifier.php';
|
||||
|
||||
?>
|
@@ -3,7 +3,7 @@
|
||||
/*!
|
||||
* @mainpage
|
||||
*
|
||||
* HTMLPurifier is an HTML filter that will take an arbitrary snippet of
|
||||
* HTML Purifier is an HTML filter that will take an arbitrary snippet of
|
||||
* HTML and rigorously test, validate and filter it into a version that
|
||||
* is safe for output onto webpages. It achieves this by:
|
||||
*
|
||||
@@ -22,7 +22,7 @@
|
||||
*/
|
||||
|
||||
/*
|
||||
HTMLPurifier - Standards Compliant HTML Filtering
|
||||
HTML Purifier - Standards Compliant HTML Filtering
|
||||
Copyright (C) 2006 Edward Z. Yang
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
|
@@ -3,15 +3,15 @@
|
||||
/**
|
||||
* Internal data-structure used in attribute validation to accumulate state.
|
||||
*
|
||||
* All it is is a data-structure that holds objects that accumulate state, like
|
||||
* HTMLPurifier_IDAccumulator.
|
||||
* This is a data-structure that holds objects that accumulate state, like
|
||||
* HTMLPurifier_IDAccumulator. It's better than using globals!
|
||||
*
|
||||
* @param Many functions that accept this object have it as a mandatory
|
||||
* parameter, even when there is no use for it. Though this is
|
||||
* for the same reasons as why HTMLPurifier_Config is a mandatory
|
||||
* parameter, it is also because you cannot assign a default value
|
||||
* to a parameter passed by reference (passing by reference is essential
|
||||
* for context to work in PHP 4).
|
||||
* @note Many functions that accept this object have it as a mandatory
|
||||
* parameter, even when there is no use for it. Though this is
|
||||
* for the same reasons as why HTMLPurifier_Config is a mandatory
|
||||
* parameter, it is also because you cannot assign a default value
|
||||
* to a parameter passed by reference (passing by reference is essential
|
||||
* for context to work in PHP 4).
|
||||
*/
|
||||
|
||||
class HTMLPurifier_AttrContext
|
||||
|
@@ -48,7 +48,16 @@ class HTMLPurifier_AttrDef
|
||||
*
|
||||
* @note This method is not entirely standards compliant, as trim() removes
|
||||
* more types of whitespace than specified in the spec. In practice,
|
||||
* this is rarely a problem.
|
||||
* this is rarely a problem, as those extra characters usually have
|
||||
* already been removed by HTMLPurifier_Encoder.
|
||||
*
|
||||
* @warning This processing is inconsistent with XML's whitespace handling
|
||||
* as specified by section 3.3.3 and referenced XHTML 1.0 section
|
||||
* 4.7. Compliant processing requires all line breaks normalized
|
||||
* to "\n", so the fix is not as simple as fixing it in this
|
||||
* function. Trim and whitespace collapsing are supposed to only
|
||||
* occur in NMTOKENs. However, note that we are NOT necessarily
|
||||
* parsing XML, thus, this behavior may still be correct.
|
||||
*
|
||||
* @public
|
||||
*/
|
||||
|
@@ -11,9 +11,14 @@ class HTMLPurifier_AttrDef_Host extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Instances of HTMLPurifier_AttrDef_IPv4 and HTMLPurifier_AttrDef_IPv6
|
||||
* Instance of HTMLPurifier_AttrDef_IPv4 sub-validator
|
||||
*/
|
||||
var $ipv4, $ipv6;
|
||||
var $ipv4;
|
||||
|
||||
/**
|
||||
* Instance of HTMLPurifier_AttrDef_IPv6 sub-validator
|
||||
*/
|
||||
var $ipv6;
|
||||
|
||||
function HTMLPurifier_AttrDef_Host() {
|
||||
$this->ipv4 = new HTMLPurifier_AttrDef_IPv4();
|
||||
|
@@ -56,6 +56,8 @@ class HTMLPurifier_ChildDef
|
||||
*
|
||||
* @warning Currently this class is an all or nothing proposition, that is,
|
||||
* it will only give a bool return value.
|
||||
* @note This class is currently not used by any code, although it is unit
|
||||
* tested.
|
||||
*/
|
||||
class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
|
||||
{
|
||||
@@ -327,6 +329,8 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
|
||||
$is_collecting = false; // are we globbing together tokens to package
|
||||
// into one of the collectors?
|
||||
$collection = array(); // collected nodes
|
||||
$tag_index = 0; // the first node might be whitespace,
|
||||
// so this tells us where the start tag is
|
||||
|
||||
foreach ($tokens_of_children as $token) {
|
||||
$is_child = ($nesting == 0);
|
||||
@@ -344,7 +348,7 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
|
||||
if ($is_child) {
|
||||
// okay, let's stash the tokens away
|
||||
// first token tells us the type of the collection
|
||||
switch ($collection[0]->name) {
|
||||
switch ($collection[$tag_index]->name) {
|
||||
case 'tr':
|
||||
case 'tbody':
|
||||
$content[] = $collection;
|
||||
@@ -356,13 +360,13 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
|
||||
case 'thead':
|
||||
case 'tfoot':
|
||||
// access the appropriate variable, $thead or $tfoot
|
||||
$var = $collection[0]->name;
|
||||
$var = $collection[$tag_index]->name;
|
||||
if ($$var === false) {
|
||||
$$var = $collection;
|
||||
} else {
|
||||
// transmutate the first and last entries into
|
||||
// tbody tags, and then put into content
|
||||
$collection[0]->name = 'tbody';
|
||||
$collection[$tag_index]->name = 'tbody';
|
||||
$collection[count($collection)-1]->name = 'tbody';
|
||||
$content[] = $collection;
|
||||
}
|
||||
@@ -373,6 +377,7 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
|
||||
}
|
||||
$collection = array();
|
||||
$is_collecting = false;
|
||||
$tag_index = 0;
|
||||
} else {
|
||||
// add the node to the collection
|
||||
$collection[] = $token;
|
||||
@@ -387,7 +392,9 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
|
||||
if ($token->name == 'col') {
|
||||
// the only empty tag in the possie, we can handle it
|
||||
// immediately
|
||||
$cols[] = array($token);
|
||||
$cols[] = array_merge($collection, array($token));
|
||||
$collection = array();
|
||||
$tag_index = 0;
|
||||
continue;
|
||||
}
|
||||
switch($token->name) {
|
||||
@@ -401,7 +408,10 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
|
||||
$collection[] = $token;
|
||||
continue;
|
||||
default:
|
||||
// unrecognized, drop silently
|
||||
if ($token->type == 'text' && $token->is_whitespace) {
|
||||
$collection[] = $token;
|
||||
$tag_index++;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
@@ -415,6 +425,10 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
|
||||
if ($thead !== false) $ret = array_merge($ret, $thead);
|
||||
if ($tfoot !== false) $ret = array_merge($ret, $tfoot);
|
||||
foreach ($content as $token_array) $ret = array_merge($ret, $token_array);
|
||||
if (!empty($collection) && $is_collecting == false){
|
||||
// grab the trailing space
|
||||
$ret = array_merge($ret, $collection);
|
||||
}
|
||||
|
||||
array_pop($tokens_of_children); // remove phantom token
|
||||
|
||||
@@ -423,4 +437,4 @@ class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
|
||||
}
|
||||
}
|
||||
|
||||
?>
|
||||
?>
|
||||
|
@@ -26,12 +26,12 @@ class HTMLPurifier_Config
|
||||
var $def;
|
||||
|
||||
/**
|
||||
* Instance of HTMLPurifier_HTMLDefinition
|
||||
* Cached instance of HTMLPurifier_HTMLDefinition
|
||||
*/
|
||||
var $html_definition;
|
||||
|
||||
/**
|
||||
* Instance of HTMLPurifier_CSSDefinition
|
||||
* Cached instance of HTMLPurifier_CSSDefinition
|
||||
*/
|
||||
var $css_definition;
|
||||
|
||||
|
@@ -2,7 +2,6 @@
|
||||
|
||||
/**
|
||||
* Configuration definition, defines directives and their defaults.
|
||||
* @todo Build documentation generation capabilities.
|
||||
* @todo The ability to define things multiple times is confusing and should
|
||||
* be factored out to its own function named registerDependency() or
|
||||
* addNote(), where only the namespace.name and an extra descriptions
|
||||
@@ -39,7 +38,6 @@ class HTMLPurifier_ConfigSchema {
|
||||
|
||||
/**
|
||||
* Lookup table of allowed types.
|
||||
* @todo Add descriptions
|
||||
*/
|
||||
var $types = array(
|
||||
'string' => 'String',
|
||||
@@ -82,9 +80,6 @@ class HTMLPurifier_ConfigSchema {
|
||||
/**
|
||||
* Defines a directive for configuration
|
||||
* @warning Will fail of directive's namespace is defined
|
||||
* @todo Collect information on description and allow redefinition
|
||||
* so that multiple files can register a dependency on a
|
||||
* configuration directive.
|
||||
* @param $namespace Namespace the directive is in
|
||||
* @param $name Key of directive
|
||||
* @param $default Default value of directive
|
||||
|
@@ -88,7 +88,7 @@ class HTMLPurifier_Encoder
|
||||
if ($iconv && !$force_php) {
|
||||
// do the shortcut way
|
||||
$str = @iconv('UTF-8', 'UTF-8//IGNORE', $str);
|
||||
return strtr($str, $non_sgml_chars);;
|
||||
return strtr($str, $non_sgml_chars);
|
||||
}
|
||||
|
||||
$mState = 0; // cached expected number of octets after the current octet
|
||||
|
@@ -88,7 +88,6 @@ class HTMLPurifier_EntityParser
|
||||
* either index 1, 2 or 3 set with a hex value, dec value,
|
||||
* or string (respectively).
|
||||
* @returns Replacement string.
|
||||
* @todo Implement string translations
|
||||
*/
|
||||
|
||||
// +----------+----------+----------+----------+
|
||||
|
@@ -23,6 +23,21 @@ HTMLPurifier_ConfigSchema::define(
|
||||
'This directive was available since 1.1.'
|
||||
);
|
||||
|
||||
// extension constraints could be factored into ConfigSchema
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'Core', 'TidyFormat', false, 'bool',
|
||||
'<p>Determines whether or not to run Tidy on the final output for pretty '.
|
||||
'formatting reasons, such as indentation and wrap.</p><p>This can greatly '.
|
||||
'improve readability for editors who are hand-editing the HTML, but is '.
|
||||
'by no means necessary as HTML Purifier has already fixed all major '.
|
||||
'errors the HTML may have had. Tidy is a non-default extension, and this directive '.
|
||||
'will silently fail if Tidy is not available.</p><p>If you are looking to make '.
|
||||
'the overall look of your page\'s source better, I recommend running Tidy '.
|
||||
'on the entire page rather than just user-content (after all, the '.
|
||||
'indentation relative to the containing blocks will be incorrect).</p><p>This '.
|
||||
'directive was available since 1.1.1.</p>'
|
||||
);
|
||||
|
||||
/**
|
||||
* Generates HTML from tokens.
|
||||
*/
|
||||
@@ -56,6 +71,30 @@ class HTMLPurifier_Generator
|
||||
foreach ($tokens as $token) {
|
||||
$html .= $this->generateFromToken($token);
|
||||
}
|
||||
if ($config->get('Core', 'TidyFormat') && extension_loaded('tidy')) {
|
||||
|
||||
$tidy_options = array(
|
||||
'indent'=> true,
|
||||
'output-xhtml' => $this->_xhtml,
|
||||
'show-body-only' => true,
|
||||
'indent-spaces' => 2,
|
||||
'wrap' => 68,
|
||||
);
|
||||
if (version_compare(PHP_VERSION, '5', '<')) {
|
||||
tidy_set_encoding('utf8');
|
||||
foreach ($tidy_options as $key => $value) {
|
||||
tidy_setopt($key, $value);
|
||||
}
|
||||
tidy_parse_string($html);
|
||||
tidy_clean_repair();
|
||||
$html = tidy_get_output();
|
||||
} else {
|
||||
$tidy = new Tidy;
|
||||
$tidy->parseString($html, $tidy_options, 'utf8');
|
||||
$tidy->cleanRepair();
|
||||
$html = (string) $tidy;
|
||||
}
|
||||
}
|
||||
return $html;
|
||||
}
|
||||
|
||||
|
@@ -56,6 +56,7 @@ class HTMLPurifier_HTMLDefinition
|
||||
|
||||
/**
|
||||
* String name of parent element HTML will be going into.
|
||||
* @todo Allow this to be overloaded by user config
|
||||
* @public
|
||||
*/
|
||||
var $info_parent = 'div';
|
||||
@@ -111,12 +112,19 @@ class HTMLPurifier_HTMLDefinition
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// info[]->child : defines allowed children for elements
|
||||
|
||||
// entities: prefixed with e_ and _ replaces .
|
||||
// entities: prefixed with e_ and _ replaces . from DTD
|
||||
// double underlines are entities we made up
|
||||
|
||||
// we don't use an array because that complicates interpolation
|
||||
// strings are used instead of arrays because if you use arrays,
|
||||
// you have to do some hideous manipulation with array_merge()
|
||||
|
||||
// todo: determine whether or not having allowed children
|
||||
// that aren't allowed globally affects security (it shouldn't)
|
||||
// if above works out, extend children definitions to include all
|
||||
// possible elements (allowed elements will dictate which ones
|
||||
// get dropped
|
||||
|
||||
$e_special_extra = 'img';
|
||||
$e_special_basic = 'br | span | bdo';
|
||||
$e_special = "$e_special_basic | $e_special_extra";
|
||||
@@ -142,16 +150,18 @@ class HTMLPurifier_HTMLDefinition
|
||||
$e_block = "p | $e_heading | div | $e_lists | $e_blocktext | table";
|
||||
$e__flow = "#PCDATA | $e_block | $e_inline | $e_misc";
|
||||
$e_Flow = new HTMLPurifier_ChildDef_Optional($e__flow);
|
||||
$e_a_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | $e_special".
|
||||
" | $e_fontstyle | $e_phrase | $e_inline_forms | $e_misc_inline");
|
||||
$e_a_content = new HTMLPurifier_ChildDef_Optional("#PCDATA".
|
||||
" | $e_special | $e_fontstyle | $e_phrase | $e_inline_forms".
|
||||
" | $e_misc_inline");
|
||||
$e_pre_content = new HTMLPurifier_ChildDef_Optional("#PCDATA | a".
|
||||
" | $e_special_basic | $e_fontstyle_basic | $e_phrase_basic".
|
||||
" | $e_inline_forms | $e_misc_inline");
|
||||
$e_form_content = new HTMLPurifier_ChildDef_Optional(''); //unused
|
||||
$e_form_button_content = new HTMLPurifier_ChildDef_Optional(''); // unused
|
||||
$e_form_content = new HTMLPurifier_ChildDef_Optional('');//unused
|
||||
$e_form_button_content = new HTMLPurifier_ChildDef_Optional('');//unused
|
||||
|
||||
$this->info['ins']->child =
|
||||
$this->info['del']->child = new HTMLPurifier_ChildDef_Chameleon($e__inline, $e__flow);
|
||||
$this->info['del']->child =
|
||||
new HTMLPurifier_ChildDef_Chameleon($e__inline, $e__flow);
|
||||
|
||||
$this->info['blockquote']->child=
|
||||
$this->info['dd']->child =
|
||||
@@ -225,7 +235,7 @@ class HTMLPurifier_HTMLDefinition
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// info[]->type : defines the type of the element (block or inline)
|
||||
|
||||
// reuses $e_Inline and $e_block
|
||||
// reuses $e_Inline and $e_Block
|
||||
|
||||
foreach ($e_Inline->elements as $name) {
|
||||
$this->info[$name]->type = 'inline';
|
||||
@@ -243,7 +253,7 @@ class HTMLPurifier_HTMLDefinition
|
||||
|
||||
$this->info['a']->excludes = array('a' => true);
|
||||
$this->info['pre']->excludes = array_flip(array('img', 'big', 'small',
|
||||
// technically in spec, but we don't allow em anyway
|
||||
// technically useless, but good to be indepth
|
||||
'object', 'applet', 'font', 'basefont'));
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
@@ -253,6 +263,8 @@ class HTMLPurifier_HTMLDefinition
|
||||
// by the transform classes. It will, however, do simple and slightly
|
||||
// complex attribute value substitution
|
||||
|
||||
// the question of varying allowed attributes is more entangling.
|
||||
|
||||
$e_Text = new HTMLPurifier_AttrDef_Text();
|
||||
|
||||
// attrs, included in almost every single one except for a few,
|
||||
@@ -297,7 +309,8 @@ class HTMLPurifier_HTMLDefinition
|
||||
|
||||
$this->info['table']->attr['summary'] = $e_Text;
|
||||
|
||||
$this->info['table']->attr['border'] = new HTMLPurifier_AttrDef_Pixels();
|
||||
$this->info['table']->attr['border'] =
|
||||
new HTMLPurifier_AttrDef_Pixels();
|
||||
|
||||
$e_Length = new HTMLPurifier_AttrDef_Length();
|
||||
$this->info['table']->attr['cellpadding'] =
|
||||
@@ -329,7 +342,7 @@ class HTMLPurifier_HTMLDefinition
|
||||
$this->info['q']->attr['cite'] = $e_URI;
|
||||
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// UNIMP : info_tag_transform : transformations of tags
|
||||
// info_tag_transform : transformations of tags
|
||||
|
||||
$this->info_tag_transform['font'] = new HTMLPurifier_TagTransform_Font();
|
||||
$this->info_tag_transform['menu'] = new HTMLPurifier_TagTransform_Simple('ul');
|
||||
@@ -339,6 +352,9 @@ class HTMLPurifier_HTMLDefinition
|
||||
//////////////////////////////////////////////////////////////////////
|
||||
// info[]->auto_close : tags that automatically close another
|
||||
|
||||
// todo: determine whether or not SGML-like modeling based on
|
||||
// mandatory/optional end tags would be a better policy
|
||||
|
||||
// make sure you test using isset() not !empty()
|
||||
|
||||
// these are all block elements: blocks aren't allowed in P
|
||||
|
@@ -60,6 +60,60 @@ class HTMLPurifier_Lexer
|
||||
$this->_entity_parser = new HTMLPurifier_EntityParser();
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Most common entity to raw value conversion table for special entities.
|
||||
* @protected
|
||||
*/
|
||||
var $_special_entity2str =
|
||||
array(
|
||||
'"' => '"',
|
||||
'&' => '&',
|
||||
'<' => '<',
|
||||
'>' => '>',
|
||||
''' => "'",
|
||||
''' => "'",
|
||||
''' => "'"
|
||||
);
|
||||
|
||||
/**
|
||||
* Parses special entities into the proper characters.
|
||||
*
|
||||
* This function will translate escaped versions of the special characters
|
||||
* into the correct ones.
|
||||
*
|
||||
* @warning
|
||||
* You should be able to treat the output of this function as
|
||||
* completely parsed, but that's only because all other entities should
|
||||
* have been handled previously in substituteNonSpecialEntities()
|
||||
*
|
||||
* @param $string String character data to be parsed.
|
||||
* @returns Parsed character data.
|
||||
*/
|
||||
function parseData($string) {
|
||||
|
||||
// following functions require at least one character
|
||||
if ($string === '') return '';
|
||||
|
||||
// subtracts amps that cannot possibly be escaped
|
||||
$num_amp = substr_count($string, '&') - substr_count($string, '& ') -
|
||||
($string[strlen($string)-1] === '&' ? 1 : 0);
|
||||
|
||||
if (!$num_amp) return $string; // abort if no entities
|
||||
$num_esc_amp = substr_count($string, '&');
|
||||
$string = strtr($string, $this->_special_entity2str);
|
||||
|
||||
// code duplication for sake of optimization, see above
|
||||
$num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
|
||||
($string[strlen($string)-1] === '&' ? 1 : 0);
|
||||
|
||||
if ($num_amp_2 <= $num_esc_amp) return $string;
|
||||
|
||||
// hmm... now we have some uncommon entities. Use the callback.
|
||||
$string = $this->_entity_parser->substituteSpecialEntities($string);
|
||||
return $string;
|
||||
}
|
||||
|
||||
var $_encoder;
|
||||
|
||||
/**
|
||||
|
@@ -12,15 +12,19 @@ require_once 'HTMLPurifier/TokenFactory.php';
|
||||
* documents, it performs twenty times faster than
|
||||
* HTMLPurifier_Lexer_DirectLex, and is the default choice for PHP 5.
|
||||
*
|
||||
* @notice
|
||||
* Any empty elements will have empty tokens associated with them, even if
|
||||
* @note Any empty elements will have empty tokens associated with them, even if
|
||||
* this is prohibited by the spec. This cannot be fixed until the spec
|
||||
* comes into play.
|
||||
*
|
||||
* @todo Determine DOM's entity parsing behavior, point to local entity files
|
||||
* if necessary.
|
||||
* @todo Make div access less fragile, and refrain from preprocessing when
|
||||
* HTML tag and friends are already present.
|
||||
* @note PHP's DOM extension does not actually parse any entities, we use
|
||||
* our own function to do that.
|
||||
*
|
||||
* @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
|
||||
* If this is a huge problem, due to the fact that HTML is hand
|
||||
* edited and you are unable to get a parser cache that caches
|
||||
* the output of HTML Purifier while keeping the original HTML lying
|
||||
* around, you may want to run Tidy on the resulting output or use
|
||||
* HTMLPurifier_Lexer_DirectLex
|
||||
*/
|
||||
|
||||
class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
||||
|
@@ -12,64 +12,12 @@ require_once 'HTMLPurifier/Lexer.php';
|
||||
* completely eventually.
|
||||
*
|
||||
* @todo Reread XML spec and document differences.
|
||||
* @todo Add support for CDATA sections.
|
||||
* @todo Determine correct behavior in outputting comment data. (preserve dashes?)
|
||||
* @todo Optimize main function tokenizeHTML().
|
||||
* @todo Less than sign (<) being prohibited (even as entity) in attr-values?
|
||||
*
|
||||
* @todo Determine correct behavior in transforming comment data. (preserve dashes?)
|
||||
*/
|
||||
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
{
|
||||
|
||||
/**
|
||||
* Most common entity to raw value conversion table for special entities.
|
||||
* @protected
|
||||
*/
|
||||
var $_special_entity2str =
|
||||
array(
|
||||
'"' => '"',
|
||||
'&' => '&',
|
||||
'<' => '<',
|
||||
'>' => '>',
|
||||
''' => "'",
|
||||
''' => "'",
|
||||
''' => "'"
|
||||
);
|
||||
|
||||
/**
|
||||
* Parses special entities into the proper characters.
|
||||
*
|
||||
* This string will translate escaped versions of the special characters
|
||||
* into the correct ones.
|
||||
*
|
||||
* @warning
|
||||
* You should be able to treat the output of this function as
|
||||
* completely parsed, but that's only because all other entities should
|
||||
* have been handled previously in substituteNonSpecialEntities()
|
||||
*
|
||||
* @param $string String character data to be parsed.
|
||||
* @returns Parsed character data.
|
||||
*/
|
||||
function parseData($string) {
|
||||
|
||||
// subtracts amps that cannot possibly be escaped
|
||||
$num_amp = substr_count($string, '&') - substr_count($string, '& ') -
|
||||
($string[strlen($string)-1] === '&' ? 1 : 0);
|
||||
|
||||
if (!$num_amp) return $string; // abort if no entities
|
||||
$num_esc_amp = substr_count($string, '&');
|
||||
$string = strtr($string, $this->_special_entity2str);
|
||||
|
||||
// code duplication for sake of optimization, see above
|
||||
$num_amp_2 = substr_count($string, '&') - substr_count($string, '& ') -
|
||||
($string[strlen($string)-1] === '&' ? 1 : 0);
|
||||
|
||||
if ($num_amp_2 <= $num_esc_amp) return $string;
|
||||
|
||||
// hmm... now we have some uncommon entities. Use the callback.
|
||||
$string = $this->_entity_parser->substituteSpecialEntities($string);
|
||||
return $string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whitespace characters for str(c)spn.
|
||||
* @protected
|
||||
|
@@ -18,6 +18,8 @@ require_once 'HTMLPurifier/Lexer.php';
|
||||
* whatever it does for poorly formed HTML is up to it.
|
||||
*
|
||||
* @todo Generalize so that XML_HTMLSax is also supported.
|
||||
*
|
||||
* @warning Entity-resolution inside attributes is broken.
|
||||
*/
|
||||
|
||||
class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
|
||||
@@ -41,6 +43,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
|
||||
$parser->set_element_handler('openHandler','closeHandler');
|
||||
$parser->set_data_handler('dataHandler');
|
||||
$parser->set_escape_handler('escapeHandler');
|
||||
|
||||
// doesn't seem to work correctly for attributes
|
||||
$parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
|
||||
|
||||
$parser->parse($string);
|
||||
@@ -53,6 +57,10 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
|
||||
* Open tag event handler, interface is defined by PEAR package.
|
||||
*/
|
||||
function openHandler(&$parser, $name, $attrs, $closed) {
|
||||
// entities are not resolved in attrs
|
||||
foreach ($attrs as $key => $attr) {
|
||||
$attrs[$key] = $this->parseData($attr);
|
||||
}
|
||||
if ($closed) {
|
||||
$this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
|
||||
} else {
|
||||
|
@@ -4,7 +4,6 @@ require_once 'HTMLPurifier/URIScheme.php';
|
||||
|
||||
/**
|
||||
* Validates ftp (File Transfer Protocol) URIs as defined by generic RFC 1738.
|
||||
* @todo Typecode check on path
|
||||
*/
|
||||
class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {
|
||||
|
||||
@@ -16,7 +15,27 @@ class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {
|
||||
list($userinfo, $host, $port, $path, $query) =
|
||||
parent::validateComponents(
|
||||
$userinfo, $host, $port, $path, $query, $config );
|
||||
// typecode check needed on path
|
||||
$semicolon_pos = strrpos($path, ';'); // reverse
|
||||
if ($semicolon_pos !== false) {
|
||||
// typecode check
|
||||
$type = substr($path, $semicolon_pos + 1); // no semicolon
|
||||
$path = substr($path, 0, $semicolon_pos);
|
||||
$type_ret = '';
|
||||
if (strpos($type, '=') !== false) {
|
||||
// figure out whether or not the declaration is correct
|
||||
list($key, $typecode) = explode('=', $type, 2);
|
||||
if ($key !== 'type') {
|
||||
// invalid key, tack it back on encoded
|
||||
$path .= '%3B' . $type;
|
||||
} elseif ($typecode === 'a' || $typecode === 'i' || $typecode === 'd') {
|
||||
$type_ret = ";type=$typecode";
|
||||
}
|
||||
} else {
|
||||
$path .= '%3B' . $type;
|
||||
}
|
||||
$path = str_replace(';', '%3B', $path);
|
||||
$path .= $type_ret;
|
||||
}
|
||||
return array($userinfo, $host, $port, $path, null);
|
||||
}
|
||||
|
||||
|
@@ -1 +1 @@
|
||||
Deny from all
|
||||
Deny from all
|
||||
|
198
phpdoc.ini
198
phpdoc.ini
@@ -1,100 +1,100 @@
|
||||
;; phpDocumentor parse configuration file
|
||||
;;
|
||||
;; This file is designed to cut down on repetitive typing on the command-line or web interface
|
||||
;; You can copy this file to create a number of configuration files that can be used with the
|
||||
;; command-line switch -c, as in phpdoc -c default.ini or phpdoc -c myini.ini. The web
|
||||
;; interface will automatically generate a list of .ini files that can be used.
|
||||
;;
|
||||
;; default.ini is used to generate the online manual at http://www.phpdoc.org/docs
|
||||
;;
|
||||
;; ALL .ini files must be in the user subdirectory of phpDocumentor with an extension of .ini
|
||||
;;
|
||||
;; Copyright 2002, Greg Beaver <cellog@users.sourceforge.net>
|
||||
;;
|
||||
;; WARNING: do not change the name of any command-line parameters, phpDocumentor will ignore them
|
||||
|
||||
[Parse Data]
|
||||
;; title of all the documentation
|
||||
;; legal values: any string
|
||||
title = HTML Purifier API Documentation
|
||||
|
||||
;; parse files that start with a . like .bash_profile
|
||||
;; legal values: true, false
|
||||
hidden = false
|
||||
|
||||
;; show elements marked @access private in documentation by setting this to on
|
||||
;; legal values: on, off
|
||||
parseprivate = off
|
||||
|
||||
;; parse with javadoc-like description (first sentence is always the short description)
|
||||
;; legal values: on, off
|
||||
javadocdesc = on
|
||||
|
||||
;; add any custom @tags separated by commas here
|
||||
;; legal values: any legal tagname separated by commas.
|
||||
;customtags = mytag1,mytag2
|
||||
|
||||
;; This is only used by the XML:DocBook/peardoc2 converter
|
||||
defaultcategoryname = Documentation
|
||||
|
||||
;; what is the main package?
|
||||
;; legal values: alphanumeric string plus - and _
|
||||
defaultpackagename = HTMLPurifier
|
||||
|
||||
;; output any parsing information? set to on for cron jobs
|
||||
;; legal values: on
|
||||
;quiet = on
|
||||
|
||||
;; parse a PEAR-style repository. Do not turn this on if your project does
|
||||
;; not have a parent directory named "pear"
|
||||
;; legal values: on/off
|
||||
;pear = on
|
||||
|
||||
;; where should the documentation be written?
|
||||
;; legal values: a legal path
|
||||
target = docs/phpdoc
|
||||
|
||||
;; Which files should be parsed out as special documentation files, such as README,
|
||||
;; INSTALL and CHANGELOG? This overrides the default files found in
|
||||
;; phpDocumentor.ini (this file is not a user .ini file, but the global file)
|
||||
readmeinstallchangelog = README, INSTALL, NEWS, WYSIWYG, SLOW, LICENSE, CREDITS
|
||||
|
||||
;; limit output to the specified packages, even if others are parsed
|
||||
;; legal values: package names separated by commas
|
||||
;packageoutput = package1,package2
|
||||
|
||||
;; comma-separated list of files to parse
|
||||
;; legal values: paths separated by commas
|
||||
;filename = /path/to/file1,/path/to/file2,fileincurrentdirectory
|
||||
|
||||
;; comma-separated list of directories to parse
|
||||
;; legal values: directory paths separated by commas
|
||||
;directory = /path1,/path2,.,..,subdirectory
|
||||
;directory = /home/jeichorn/cvs/pear
|
||||
directory = ./
|
||||
|
||||
;; template base directory (the equivalent directory of <installdir>/phpDocumentor)
|
||||
;templatebase = /path/to/my/templates
|
||||
|
||||
;; directory to find any example files in through @example and {@example} tags
|
||||
;examplesdir = /path/to/my/templates
|
||||
|
||||
;; comma-separated list of files, directories or wildcards ? and * (any wildcard) to ignore
|
||||
;; legal values: any wildcard strings separated by commas
|
||||
;ignore = /path/to/ignore*,*list.php,myfile.php,subdirectory/
|
||||
ignore = pear-*,templates/,Documentation/,test*.php,Lexer.inc
|
||||
|
||||
sourcecode = on
|
||||
|
||||
;; comma-separated list of Converters to use in outputformat:Convertername:templatedirectory format
|
||||
;; legal values: HTML:frames:default,HTML:frames:l0l33t,HTML:frames:phpdoc.de,HTML:frames:phphtmllib,
|
||||
;; HTML:frames:earthli,
|
||||
;; HTML:frames:DOM/default,HTML:frames:DOM/l0l33t,HTML:frames:DOM/phpdoc.de,
|
||||
;; HTML:frames:DOM/phphtmllib,HTML:frames:DOM/earthli
|
||||
;; HTML:Smarty:default,HTML:Smarty:PHP,HTML:Smarty:HandS
|
||||
;; PDF:default:default,CHM:default:default,XML:DocBook/peardoc2:default
|
||||
output=HTML:frames:default
|
||||
|
||||
;; turn this option on if you want highlighted source code for every file
|
||||
;; legal values: on/off
|
||||
;; phpDocumentor parse configuration file
|
||||
;;
|
||||
;; This file is designed to cut down on repetitive typing on the command-line or web interface
|
||||
;; You can copy this file to create a number of configuration files that can be used with the
|
||||
;; command-line switch -c, as in phpdoc -c default.ini or phpdoc -c myini.ini. The web
|
||||
;; interface will automatically generate a list of .ini files that can be used.
|
||||
;;
|
||||
;; default.ini is used to generate the online manual at http://www.phpdoc.org/docs
|
||||
;;
|
||||
;; ALL .ini files must be in the user subdirectory of phpDocumentor with an extension of .ini
|
||||
;;
|
||||
;; Copyright 2002, Greg Beaver <cellog@users.sourceforge.net>
|
||||
;;
|
||||
;; WARNING: do not change the name of any command-line parameters, phpDocumentor will ignore them
|
||||
|
||||
[Parse Data]
|
||||
;; title of all the documentation
|
||||
;; legal values: any string
|
||||
title = HTML Purifier API Documentation
|
||||
|
||||
;; parse files that start with a . like .bash_profile
|
||||
;; legal values: true, false
|
||||
hidden = false
|
||||
|
||||
;; show elements marked @access private in documentation by setting this to on
|
||||
;; legal values: on, off
|
||||
parseprivate = off
|
||||
|
||||
;; parse with javadoc-like description (first sentence is always the short description)
|
||||
;; legal values: on, off
|
||||
javadocdesc = on
|
||||
|
||||
;; add any custom @tags separated by commas here
|
||||
;; legal values: any legal tagname separated by commas.
|
||||
;customtags = mytag1,mytag2
|
||||
|
||||
;; This is only used by the XML:DocBook/peardoc2 converter
|
||||
defaultcategoryname = Documentation
|
||||
|
||||
;; what is the main package?
|
||||
;; legal values: alphanumeric string plus - and _
|
||||
defaultpackagename = HTMLPurifier
|
||||
|
||||
;; output any parsing information? set to on for cron jobs
|
||||
;; legal values: on
|
||||
;quiet = on
|
||||
|
||||
;; parse a PEAR-style repository. Do not turn this on if your project does
|
||||
;; not have a parent directory named "pear"
|
||||
;; legal values: on/off
|
||||
;pear = on
|
||||
|
||||
;; where should the documentation be written?
|
||||
;; legal values: a legal path
|
||||
target = docs/phpdoc
|
||||
|
||||
;; Which files should be parsed out as special documentation files, such as README,
|
||||
;; INSTALL and CHANGELOG? This overrides the default files found in
|
||||
;; phpDocumentor.ini (this file is not a user .ini file, but the global file)
|
||||
readmeinstallchangelog = README, INSTALL, NEWS, WYSIWYG, SLOW, LICENSE, CREDITS
|
||||
|
||||
;; limit output to the specified packages, even if others are parsed
|
||||
;; legal values: package names separated by commas
|
||||
;packageoutput = package1,package2
|
||||
|
||||
;; comma-separated list of files to parse
|
||||
;; legal values: paths separated by commas
|
||||
;filename = /path/to/file1,/path/to/file2,fileincurrentdirectory
|
||||
|
||||
;; comma-separated list of directories to parse
|
||||
;; legal values: directory paths separated by commas
|
||||
;directory = /path1,/path2,.,..,subdirectory
|
||||
;directory = /home/jeichorn/cvs/pear
|
||||
directory = ./
|
||||
|
||||
;; template base directory (the equivalent directory of <installdir>/phpDocumentor)
|
||||
;templatebase = /path/to/my/templates
|
||||
|
||||
;; directory to find any example files in through @example and {@example} tags
|
||||
;examplesdir = /path/to/my/templates
|
||||
|
||||
;; comma-separated list of files, directories or wildcards ? and * (any wildcard) to ignore
|
||||
;; legal values: any wildcard strings separated by commas
|
||||
;ignore = /path/to/ignore*,*list.php,myfile.php,subdirectory/
|
||||
ignore = pear-*,templates/,Documentation/,test*.php,Lexer.inc
|
||||
|
||||
sourcecode = on
|
||||
|
||||
;; comma-separated list of Converters to use in outputformat:Convertername:templatedirectory format
|
||||
;; legal values: HTML:frames:default,HTML:frames:l0l33t,HTML:frames:phpdoc.de,HTML:frames:phphtmllib,
|
||||
;; HTML:frames:earthli,
|
||||
;; HTML:frames:DOM/default,HTML:frames:DOM/l0l33t,HTML:frames:DOM/phpdoc.de,
|
||||
;; HTML:frames:DOM/phphtmllib,HTML:frames:DOM/earthli
|
||||
;; HTML:Smarty:default,HTML:Smarty:PHP,HTML:Smarty:HandS
|
||||
;; PDF:default:default,CHM:default:default,XML:DocBook/peardoc2:default
|
||||
output=HTML:frames:default
|
||||
|
||||
;; turn this option on if you want highlighted source code for every file
|
||||
;; legal values: on/off
|
||||
sourcecode = on
|
17
test-settings.sample.php
Normal file
17
test-settings.sample.php
Normal file
@@ -0,0 +1,17 @@
|
||||
<?php
|
||||
|
||||
// This file is necessary to run the unit tests and profiling
|
||||
// scripts.
|
||||
|
||||
// Is PEAR available on your system? If it isn't, set to false. If PEAR
|
||||
// is not part of the default include_path, add it.
|
||||
$GLOBALS['HTMLPurifierTest']['PEAR'] = true;
|
||||
|
||||
// How many times should profiling scripts iterate over the function? More runs
|
||||
// means more accurate results, but they'll take longer to perform.
|
||||
$GLOBALS['HTMLPurifierTest']['Runs'] = 2;
|
||||
|
||||
// Where is SimpleTest located?
|
||||
$simpletest_location = '/path/to/simpletest/';
|
||||
|
||||
?>
|
@@ -1,7 +1,7 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/ChildDef.php';
|
||||
require_once 'HTMLPurifier/Lexer.php';
|
||||
require_once 'HTMLPurifier/Lexer/DirectLex.php';
|
||||
require_once 'HTMLPurifier/Generator.php';
|
||||
|
||||
class HTMLPurifier_ChildDefTest extends UnitTestCase
|
||||
@@ -12,7 +12,8 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase
|
||||
var $gen;
|
||||
|
||||
function HTMLPurifier_ChildDefTest() {
|
||||
$this->lex = HTMLPurifier_Lexer::create();
|
||||
// it is vital that the tags be treated as literally as possible
|
||||
$this->lex = new HTMLPurifier_Lexer_DirectLex();
|
||||
$this->gen = new HTMLPurifier_Generator();
|
||||
parent::UnitTestCase();
|
||||
}
|
||||
@@ -45,18 +46,23 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase
|
||||
$this->def = new HTMLPurifier_ChildDef_Custom(
|
||||
'(a, b?, c*, d+, (a, b)*)');
|
||||
|
||||
$inputs = array();
|
||||
$expect = array();
|
||||
$config = array();
|
||||
|
||||
$inputs[0] = '';
|
||||
$expect[0] = false;
|
||||
|
||||
$inputs[1] = '<a /><b /><c /><d /><a /><b />';
|
||||
$expect[1] = true;
|
||||
|
||||
$inputs[2] = '<a /><d>Dob</d><a /><b>foo</b><a href="moo"><b>foo</b>';
|
||||
$inputs[2] = '<a /><d>Dob</d><a /><b>foo</b><a href="moo" /><b>foo</b>';
|
||||
$expect[2] = true;
|
||||
|
||||
$inputs[3] = '<a /><a />';
|
||||
$expect[3] = false;
|
||||
|
||||
$this->assertSeries($inputs, $expect, $config);
|
||||
}
|
||||
|
||||
function test_table() {
|
||||
@@ -98,6 +104,14 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase
|
||||
$inputs[6] = 'foo';
|
||||
$expect[6] = false;
|
||||
|
||||
// whitespace sticks to the previous element, last whitespace is
|
||||
// stationary
|
||||
$inputs[7] = "\n <tr />\n <tr />\n ";
|
||||
$expect[7] = true;
|
||||
|
||||
$inputs[8] = "\n\t<tbody />\n\t\t<tfoot />\n\t\t\t";
|
||||
$expect[8] = "\n\t\t<tfoot />\n\t<tbody />\n\t\t\t";
|
||||
|
||||
$this->assertSeries($inputs, $expect, $config);
|
||||
|
||||
}
|
||||
@@ -209,4 +223,4 @@ class HTMLPurifier_ChildDefTest extends UnitTestCase
|
||||
|
||||
}
|
||||
|
||||
?>
|
||||
?>
|
||||
|
@@ -8,6 +8,7 @@ class HTMLPurifier_ConfigTest extends UnitTestCase
|
||||
var $our_copy, $old_copy;
|
||||
|
||||
function setUp() {
|
||||
// set up a dummy schema object for testing
|
||||
$our_copy = new HTMLPurifier_ConfigSchema();
|
||||
$this->old_copy = HTMLPurifier_ConfigSchema::instance();
|
||||
$this->our_copy =& HTMLPurifier_ConfigSchema::instance($our_copy);
|
||||
@@ -93,6 +94,17 @@ class HTMLPurifier_ConfigTest extends UnitTestCase
|
||||
|
||||
}
|
||||
|
||||
function test_getDefinition() {
|
||||
|
||||
$config = HTMLPurifier_Config::createDefault();
|
||||
$def = $config->getHTMLDefinition();
|
||||
$this->assertIsA($def, 'HTMLPurifier_HTMLDefinition');
|
||||
|
||||
$def = $config->getCSSDefinition();
|
||||
$this->assertIsA($def, 'HTMLPurifier_CSSDefinition');
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -123,6 +123,9 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase
|
||||
var $config;
|
||||
function assertGeneration($tokens, $expect) {
|
||||
$result = $this->gen->generateFromTokens($tokens, $this->config);
|
||||
// normalized newlines, this probably should be put somewhere else
|
||||
$result = str_replace("\r\n", "\n", $result);
|
||||
$result = str_replace("\r", "\n", $result);
|
||||
$this->assertEqual($expect, $result);
|
||||
}
|
||||
|
||||
@@ -148,6 +151,25 @@ class HTMLPurifier_GeneratorTest extends UnitTestCase
|
||||
|
||||
}
|
||||
|
||||
function test_generateFromTokens_TidyFormat() {
|
||||
// abort test if tidy isn't loaded
|
||||
if (!extension_loaded('tidy')) return;
|
||||
|
||||
$this->config = HTMLPurifier_Config::createDefault();
|
||||
$this->config->set('Core', 'TidyFormat', true);
|
||||
|
||||
// nice wrapping please
|
||||
$this->assertGeneration(
|
||||
array(
|
||||
new HTMLPurifier_Token_Start('div'),
|
||||
new HTMLPurifier_Token_Text('Text'),
|
||||
new HTMLPurifier_Token_End('div')
|
||||
),
|
||||
"<div>\n Text\n</div>\n"
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -11,24 +11,6 @@ class HTMLPurifier_Lexer_DirectLexTest extends UnitTestCase
|
||||
$this->DirectLex = new HTMLPurifier_Lexer_DirectLex();
|
||||
}
|
||||
|
||||
function test_parseData() {
|
||||
$HP =& $this->DirectLex;
|
||||
|
||||
$this->assertIdentical('asdf', $HP->parseData('asdf'));
|
||||
$this->assertIdentical('&', $HP->parseData('&'));
|
||||
$this->assertIdentical('"', $HP->parseData('"'));
|
||||
$this->assertIdentical("'", $HP->parseData('''));
|
||||
$this->assertIdentical("'", $HP->parseData('''));
|
||||
$this->assertIdentical('&&&', $HP->parseData('&&&'));
|
||||
$this->assertIdentical('&&', $HP->parseData('&&')); // [INVALID]
|
||||
$this->assertIdentical('Procter & Gamble',
|
||||
$HP->parseData('Procter & Gamble')); // [INVALID]
|
||||
|
||||
// This is not special, thus not converted. Test of fault tolerance,
|
||||
// realistically speaking, this should never happen
|
||||
$this->assertIdentical('-', $HP->parseData('-'));
|
||||
}
|
||||
|
||||
// internals testing
|
||||
function test_parseAttributeString() {
|
||||
|
||||
|
@@ -38,6 +38,25 @@ class HTMLPurifier_LexerTest extends UnitTestCase
|
||||
$this->assertIdentical($extract, $result);
|
||||
}
|
||||
|
||||
function test_parseData() {
|
||||
$HP =& $this->Lexer;
|
||||
|
||||
$this->assertIdentical('asdf', $HP->parseData('asdf'));
|
||||
$this->assertIdentical('&', $HP->parseData('&'));
|
||||
$this->assertIdentical('"', $HP->parseData('"'));
|
||||
$this->assertIdentical("'", $HP->parseData('''));
|
||||
$this->assertIdentical("'", $HP->parseData('''));
|
||||
$this->assertIdentical('&&&', $HP->parseData('&&&'));
|
||||
$this->assertIdentical('&&', $HP->parseData('&&')); // [INVALID]
|
||||
$this->assertIdentical('Procter & Gamble',
|
||||
$HP->parseData('Procter & Gamble')); // [INVALID]
|
||||
|
||||
// This is not special, thus not converted. Test of fault tolerance,
|
||||
// realistically speaking, this should never happen
|
||||
$this->assertIdentical('-', $HP->parseData('-'));
|
||||
}
|
||||
|
||||
|
||||
function test_extractBody() {
|
||||
$this->assertExtractBody('<b>Bold</b>');
|
||||
$this->assertExtractBody('<html><body><b>Bold</b></body></html>', '<b>Bold</b>');
|
||||
@@ -249,13 +268,16 @@ class HTMLPurifier_LexerTest extends UnitTestCase
|
||||
,new HTMLPurifier_Token_Text('Link')
|
||||
,new HTMLPurifier_Token_End('a')
|
||||
);
|
||||
$sax_expect[16] = false; // PEARSax doesn't support it!
|
||||
|
||||
// test that UTF-8 is preserved
|
||||
$char_hearts = $this->_entity_lookup->table['hearts'];
|
||||
$input[17] = $char_hearts;
|
||||
$expect[17] = array( new HTMLPurifier_Token_Text($char_hearts) );
|
||||
|
||||
// test weird characters in attributes
|
||||
$input[18] = '<br test="x < 6" />';
|
||||
$expect[18] = array( new HTMLPurifier_Token_Empty('br', array('test' => 'x < 6')) );
|
||||
|
||||
$default_config = HTMLPurifier_Config::createDefault();
|
||||
foreach($input as $i => $discard) {
|
||||
if (!isset($config[$i])) $config[$i] = $default_config;
|
||||
|
@@ -54,12 +54,34 @@ class HTMLPurifier_URISchemeTest extends UnitTestCase
|
||||
|
||||
$scheme = new HTMLPurifier_URIScheme_ftp();
|
||||
$config = HTMLPurifier_Config::createDefault();
|
||||
|
||||
$this->assertIdentical(
|
||||
$scheme->validateComponents(
|
||||
'user', 'www.example.com', 21, '/', 's=foobar', $config),
|
||||
array('user', 'www.example.com', null, '/', null)
|
||||
);
|
||||
|
||||
// valid typecode
|
||||
$this->assertIdentical(
|
||||
$scheme->validateComponents(
|
||||
null, 'www.example.com', null, '/file.txt;type=a', null, $config),
|
||||
array(null, 'www.example.com', null, '/file.txt;type=a', null)
|
||||
);
|
||||
|
||||
// remove invalid typecode
|
||||
$this->assertIdentical(
|
||||
$scheme->validateComponents(
|
||||
null, 'www.example.com', null, '/file.txt;type=z', null, $config),
|
||||
array(null, 'www.example.com', null, '/file.txt', null)
|
||||
);
|
||||
|
||||
// encode errant semicolons
|
||||
$this->assertIdentical(
|
||||
$scheme->validateComponents(
|
||||
null, 'www.example.com', null, '/too;many;semicolons=1', null, $config),
|
||||
array(null, 'www.example.com', null, '/too%3Bmany%3Bsemicolons=1', null)
|
||||
);
|
||||
|
||||
}
|
||||
|
||||
function test_news() {
|
||||
|
@@ -114,14 +114,14 @@ if (isset($_GET['file']) && isset($test_file_lookup[$_GET['file']])) {
|
||||
// execute only one test
|
||||
$test_file = $_GET['file'];
|
||||
|
||||
$test = new GroupTest('HTMLPurifier - ' . $test_file);
|
||||
$test = new GroupTest('HTML Purifier - ' . $test_file);
|
||||
$path = 'HTMLPurifier/' . $test_file;
|
||||
require_once $path;
|
||||
$test->addTestClass(htmlpurifier_path2class($path));
|
||||
|
||||
} else {
|
||||
|
||||
$test = new GroupTest('HTMLPurifier');
|
||||
$test = new GroupTest('HTML Purifier');
|
||||
|
||||
foreach ($test_files as $test_file) {
|
||||
$path = 'HTMLPurifier/' . $test_file;
|
||||
|
Reference in New Issue
Block a user