mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-08-03 12:47:56 +02:00
Compare commits
103 Commits
v1.0.0beta
...
v1.1.1
Author | SHA1 | Date | |
---|---|---|---|
|
6ef8abd04f | ||
|
bc5871f389 | ||
|
30d75c999d | ||
|
64d8ca9831 | ||
|
d7642b8c70 | ||
|
3b30c2ca5b | ||
|
f43616f72d | ||
|
6740ba61af | ||
|
6a33945499 | ||
|
4660791682 | ||
|
b5c69d8ca5 | ||
|
e440f25bce | ||
|
665e80d223 | ||
|
69747ede8a | ||
|
49b3832ebf | ||
|
a365d4c688 | ||
|
7038fad788 | ||
|
50b272d75e | ||
|
bfb642d32c | ||
|
edb39601c7 | ||
|
694139d3bb | ||
|
81721ded5c | ||
|
371fb7c3d2 | ||
|
9e6953e619 | ||
|
2299f0c831 | ||
|
9dd4dcb27a | ||
|
aa0838492e | ||
|
df075c96e0 | ||
|
fbaa909d25 | ||
|
967f40fc11 | ||
|
5ee6ffe20f | ||
|
10d41d7130 | ||
|
65a628bcb7 | ||
|
a5b4ed2126 | ||
|
d20bbd8db3 | ||
|
b99573223d | ||
|
c6cfb68713 | ||
|
2259bfa40e | ||
|
de3b2b70fb | ||
|
4f0a5c0e22 | ||
|
fdd583253c | ||
|
a4be6ffe4d | ||
|
6de42d8d1d | ||
|
e9a519e589 | ||
|
709a17a504 | ||
|
47a6c9eb75 | ||
|
eddf474351 | ||
|
0e715bdda6 | ||
|
478fab1ad1 | ||
|
f4f636a09c | ||
|
b621602ac1 | ||
|
14aeafcf22 | ||
|
90279eaee2 | ||
|
0ac97774d4 | ||
|
89376a11e3 | ||
|
1de3088276 | ||
|
55503744ee | ||
|
670d298a87 | ||
|
784e7356d1 | ||
|
fbb0c486ec | ||
|
a1b60ad70f | ||
|
3e8b1d1148 | ||
|
dd1b911183 | ||
|
4e08389427 | ||
|
dc19ac9a2a | ||
|
15988980db | ||
|
7588068b7b | ||
|
24cde9c891 | ||
|
0d4ee2ba37 | ||
|
78414abafd | ||
|
692a9abc0f | ||
|
ffe39d7f30 | ||
|
5169fc7a3b | ||
|
80e79d906a | ||
|
a43a2730bc | ||
|
ca1453401f | ||
|
dcec92e7b3 | ||
|
fb0003a608 | ||
|
1b8f4fa5bc | ||
|
f46b15cb82 | ||
|
6af60425b8 | ||
|
f8839d56a0 | ||
|
fb08b9c89b | ||
|
f7760c8cb6 | ||
|
7813e79bda | ||
|
314a48373c | ||
|
ca0914789c | ||
|
2605257723 | ||
|
679302b161 | ||
|
37cbdc25b1 | ||
|
973cc43b64 | ||
|
53808ee34a | ||
|
2eef708557 | ||
|
42ba96e2de | ||
|
a33cd12f1a | ||
|
c393ef8a81 | ||
|
ad83ab430d | ||
|
4c52e42189 | ||
|
50d5179dbd | ||
|
af0de616ae | ||
|
66ddc4cc5a | ||
|
252c5afae0 | ||
|
04c0953af0 |
9
Doxyfile
9
Doxyfile
@@ -3,8 +3,8 @@
|
||||
#---------------------------------------------------------------------------
|
||||
# Project related configuration options
|
||||
#---------------------------------------------------------------------------
|
||||
PROJECT_NAME = HTMLPurifier
|
||||
PROJECT_NUMBER = trunk
|
||||
PROJECT_NAME = HTML Purifier
|
||||
PROJECT_NUMBER = 1.0.0
|
||||
OUTPUT_DIRECTORY = "C:/Documents and Settings/Edward/My Documents/My Webs/htmlpurifier/docs/doxygen"
|
||||
CREATE_SUBDIRS = NO
|
||||
OUTPUT_LANGUAGE = English
|
||||
@@ -88,7 +88,10 @@ RECURSIVE = YES
|
||||
EXCLUDE =
|
||||
EXCLUDE_SYMLINKS = NO
|
||||
EXCLUDE_PATTERNS = */tests/* \
|
||||
*/benchmarks/*
|
||||
*/benchmarks/* \
|
||||
*/docs/phpdoc/* \
|
||||
*/docs/doxygen/* \
|
||||
*/test-settings.php
|
||||
EXAMPLE_PATH =
|
||||
EXAMPLE_PATTERNS = *
|
||||
EXAMPLE_RECURSIVE = NO
|
||||
|
120
INSTALL
120
INSTALL
@@ -1,30 +1,54 @@
|
||||
|
||||
Install
|
||||
How to install HTMLPurifier
|
||||
How to install HTML Purifier
|
||||
|
||||
Being a library, there's no fancy GUI that will take you step-by-step through
|
||||
configuring database credentials and other mumbo-jumbo. HTMLPurifier is
|
||||
configuring database credentials and other mumbo-jumbo. HTML Purifier is
|
||||
designed to run "out of the box." Regardless, there are still a couple of
|
||||
things you should be mindful of.
|
||||
|
||||
|
||||
|
||||
0. Compatibility
|
||||
|
||||
HTML Purifier works in both PHP 4 and PHP 5. I have run the test suite on
|
||||
these versions:
|
||||
|
||||
- 4.3.9, 4.3.11
|
||||
- 4.4.0, 4.4.4
|
||||
- 5.0.0, 5.0.4
|
||||
- 5.1.0, 5.1.6
|
||||
|
||||
And can confidently say that HTML Purifier should work in all versions
|
||||
between and afterwards. HTML Purifier definitely does not support PHP 4.2,
|
||||
and PHP 4.3 branch support may go further back than that, but I haven't tested
|
||||
any earlier versions.
|
||||
|
||||
I have been unable to get PHP 5.0.5 working on my computer, so if someone
|
||||
wants to test that, be my guest. All tests were done on Windows XP Home,
|
||||
but operating system should not be a major factor in the library.
|
||||
|
||||
|
||||
|
||||
1. Including the proper files
|
||||
|
||||
The library/ directory must be added to your path: HTMLPurifier will not be
|
||||
The library/ directory must be added to your path: HTML Purifier will not be
|
||||
able to find the necessary includes otherwise. This is as simple as:
|
||||
|
||||
set_include_path('/path/to/htmlpurifier/library' . PATH_SEPARATOR . get_include_path());
|
||||
set_include_path('/path/to/htmlpurifier/library' . PATH_SEPARATOR .
|
||||
get_include_path() );
|
||||
|
||||
...replacing /path/to/htmlpurifier with the actual location of the folder. Don't
|
||||
worry, HTMLPurifier is namespaced so unless you have another file named
|
||||
worry, HTML Purifier is namespaced so unless you have another file named
|
||||
HTMLPurifier.php, the files won't collide with any of your includes.
|
||||
|
||||
Then, it's a simple matter of including the base file:
|
||||
|
||||
require_once 'HTMLPurifier.php';
|
||||
require_once 'HTMLPurifier.php';
|
||||
|
||||
...and you're good to go.
|
||||
...and you're good to go. The library/ folder contains all the files you need,
|
||||
so you can get rid of most of everything else when using the library in a
|
||||
production environment.
|
||||
|
||||
|
||||
|
||||
@@ -37,25 +61,91 @@ is a (short) checklist:
|
||||
* Have I specified XHTML 1.0 Transitional as the doctype?
|
||||
* Have I specified UTF-8 as the character encoding?
|
||||
|
||||
To find out what these are, browse to your website and view its source code.
|
||||
You can figure out the doctype from the a declaration that looks like
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
or no doctype. You can figure out the character encoding by looking for
|
||||
<meta http-equiv="Content-type" content="text/html;charset=ENCODING">
|
||||
|
||||
I cannot stress the importance of these two bullets enough. Omitting either
|
||||
of them could have dire consequences not only for security but for plain
|
||||
old usability. You can find a more in-depth discussion of why this is needed
|
||||
in docs/security.txt, in the meantime, try to change your output so this is
|
||||
the case.
|
||||
the case. If you can't, well, we might be able to accomodate you (read
|
||||
section 3).
|
||||
|
||||
|
||||
|
||||
3. Configuring HTML Purifier
|
||||
|
||||
HTML Purifier is designed to run out-of-the-box, but occasionally HTML
|
||||
Purifier needs to be told what to do.
|
||||
|
||||
If, for some reason, you are unable to switch to UTF-8 immediately, you can
|
||||
use iconv to convert the output of HTMLPurifier to your desired encoding.
|
||||
We may integrate support for other encodings in later releases, but for now,
|
||||
UTF-8 is all you should need. (If you're not using UTF-8, switch now!)
|
||||
switch HTML Purifier's encoding. Note that the availability of encodings is
|
||||
dependent on iconv, and you'll be missing characters if the charset you
|
||||
choose doesn't have them.
|
||||
|
||||
$config->set('Core', 'Encoding', /* put your encoding here */);
|
||||
|
||||
An example usage for Latin-1 websites:
|
||||
|
||||
$config->set('Core', 'Encoding', 'ISO-8859-1');
|
||||
|
||||
For those of you stuck using HTML 4.01 Transitional, you can disable
|
||||
XHTML output like this:
|
||||
|
||||
$config->set('Core', 'XHTML', false);
|
||||
|
||||
However, I strongly recommend that you use XHTML. Currently, we can only
|
||||
guarantee transitional-complaint output, future versions will also allow strict
|
||||
output. There are more configuration directives which can be read about
|
||||
here: http://hp.jpsband.org/live/configdoc/plain.html
|
||||
|
||||
|
||||
|
||||
3. Using the code
|
||||
|
||||
The interface is mind-numbingly simple.
|
||||
The interface is mind-numbingly simple:
|
||||
|
||||
$purifier = new HTMLPurifier();
|
||||
$clean_html = $purifier->purify($dirty_html);
|
||||
$purifier = new HTMLPurifier();
|
||||
$clean_html = $purifier->purify($dirty_html);
|
||||
|
||||
Or, if you're using the configuration object:
|
||||
|
||||
$purifier = new HTMLPurifier($config);
|
||||
$clean_html = $purifier->purify($dirty_html);
|
||||
|
||||
That's it. For more examples, check out docs/examples/. Also, SLOW gives
|
||||
advice on what to do if HTMLPurifier is slowing down your application.
|
||||
advice on what to do if HTML Purifier is slowing down your application.
|
||||
|
||||
|
||||
|
||||
4. Quick install
|
||||
|
||||
If your website is in UTF-8 and XHTML Transitional, use this code:
|
||||
|
||||
<?php
|
||||
set_include_path('/path/to/htmlpurifier/library'
|
||||
. PATH_SEPARATOR . get_include_path() );
|
||||
require_once 'HTMLPurifier.php';
|
||||
$purifier = new HTMLPurifier();
|
||||
|
||||
$clean_html = $purifier->purify($dirty_html);
|
||||
?>
|
||||
|
||||
If your website is in a different encoding or doctype, use this code:
|
||||
|
||||
<?php
|
||||
set_include_path('/path/to/htmlpurifier/library'
|
||||
. PATH_SEPARATOR . get_include_path() );
|
||||
require_once 'HTMLPurifier.php';
|
||||
|
||||
$config = HTMLPurifier_Config::createDefault();
|
||||
$config->set('Core', 'Encoding', 'ISO-8859-1'); //replace with your encoding
|
||||
$config->set('Core', 'XHTML', true); //replace with false if HTML 4.01
|
||||
$purifier = new HTMLPurifier($config);
|
||||
|
||||
$clean_html = $purifier->purify($dirty_html);
|
||||
?>
|
37
NEWS
37
NEWS
@@ -1,9 +1,44 @@
|
||||
NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
||||
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
|
||||
|
||||
1.1.1, released 2006-09-24
|
||||
- Various documentation updates
|
||||
- Fixed parse error in configuration documentation script
|
||||
- Fixed fatal error in benchmark scripts, slightly augmented
|
||||
- As far as possible, whitespace is preserved in-between table children
|
||||
- Configuration option to optionally Tidy up output for indentation to make up
|
||||
for dropped whitespace by DOMLex (pretty-printing for the entire application
|
||||
should be done by a page-wide Tidy)
|
||||
- Sample test-settings.php file included
|
||||
|
||||
1.1.0, released 2006-09-16
|
||||
- Made URI validator more forgiving: will ignore leading and trailing
|
||||
quotes, apostrophes and less than or greater than signs.
|
||||
- Enforce alphanumeric namespace and directive names for configuration.
|
||||
- Directive documentation generation using XSLT
|
||||
- Table child definition made more flexible, will fix up poorly ordered elements
|
||||
- XHTML generation can now be turned off, allowing things like <br>
|
||||
- Renamed ConfigDef to ConfigSchema
|
||||
|
||||
1.0.1, released 2006-09-04
|
||||
- Fixed slight bug in DOMLex attribute parsing
|
||||
- Fixed rejection of case-insensitive configuration values when there is a
|
||||
set of allowed values. This manifested in %Core.Encoding.
|
||||
- Fixed rejection of inline style declarations that had lots of extra
|
||||
space in them. This manifested in TinyMCE.
|
||||
|
||||
1.0.0, released 2006-09-01
|
||||
- Fixed broken numeric entity conversion
|
||||
- Malformed UTF-8 and non-SGML character detection and cleaning implemented
|
||||
- API documentation completed
|
||||
- Shorthand CSS properties implemented: font, border, background, list-style
|
||||
- Basic color keywords translated into hexadecimal values
|
||||
- Table CSS properties implemented
|
||||
- (HTML|CSS)Definition de-singleton-ized
|
||||
- Support for charsets other than UTF-8 (defined by iconv)
|
||||
|
||||
1.0.0beta, released 2006-08-16
|
||||
- First public release, most functionality implemented. Notable omissions are:
|
||||
. Shorthand CSS properties
|
||||
. Table CSS properties
|
||||
. IPv6 validation
|
||||
. Deprecated attribute transformations
|
||||
|
13
SLOW
13
SLOW
@@ -17,18 +17,23 @@ second tacked on to the load time probably isn't going to be that huge of
|
||||
a problem. Then, displaying the content is a simple a manner of outputting
|
||||
it directly from your database/filesystem. The trouble with this method is
|
||||
that your user loses the original text, and when doing edits, will be
|
||||
handling the filtered text. Of course, maybe that's a good thing. If you
|
||||
don't mind a little extra complexity, you can try...
|
||||
handling the filtered text. While this may be a good thing, especially if
|
||||
you're using a WYSIWYG editor, it can also result in data-loss if a user
|
||||
expects a certain to be available but it doesn't.
|
||||
|
||||
2. Caching the filtered output - accept the submitted text and put it
|
||||
unaltered into the database, but then also generate a filtered version and
|
||||
stash that in the database. Serve the filtered version to readers, and the
|
||||
unaltered version to editors. If need be, you can invalidate the cache and
|
||||
have the cached filtered version be regenerated on the first page view. Pros?
|
||||
Full data retention. Cons? It's more complicated.
|
||||
Full data retention. Cons? It's more complicated, and opens other editors
|
||||
up to XSS if they are using a WYSIWYG editor (to fix that, they'd have to
|
||||
be able to get their hands on the *really* original text served in plaintext
|
||||
mode).
|
||||
|
||||
In short, inbound filtering is almost as simple as outbound filtering, but
|
||||
it has some drawbacks which cannot be fixed unless you save both the original
|
||||
and the filtered versions.
|
||||
|
||||
There is a third option: profile and optimize HTMLPurifier yourself. ;-)
|
||||
There is a third option: profile and optimize HTMLPurifier yourself. Be sure
|
||||
to tell me if you decide to do that! ;-)
|
||||
|
58
TODO
58
TODO
@@ -1,19 +1,51 @@
|
||||
Todo List
|
||||
|
||||
Core:
|
||||
- Finish table and shorthand CSS attributes
|
||||
- Implement all non-essential attribute transforms
|
||||
TODO List
|
||||
|
||||
Code issues:
|
||||
- Massive profiling, make it faster!
|
||||
- Fix IPv6 issues
|
||||
Ongoing
|
||||
- Lots of profiling, make it faster!
|
||||
- Plugins for major CMSes (very tricky issue)
|
||||
|
||||
1.2 release
|
||||
- Make URI validation routines tighter (especially mailto)
|
||||
- More extensive URI filtering schemes
|
||||
- Allow for background-image and list-style-image (see above)
|
||||
- Distinguish between different types of URIs, for instance, a mailto URI
|
||||
in IMG SRC is nonsensical
|
||||
- Factor out Host validation to its own AttrDef
|
||||
- Rewrite table's child definition
|
||||
- Silently drop content inbetween SCRIPT tags
|
||||
- Error logging for filtering/cleanup procedures
|
||||
|
||||
Enhancements:
|
||||
- Do fixes for Firefox's inability to handle COL alignment props (Bug 915)
|
||||
- Pretty-printing HTML
|
||||
1.3 release
|
||||
- Add various "levels" of cleaning
|
||||
- Related: Allow strict (X)HTML
|
||||
|
||||
1.4 release
|
||||
- Additional support for poorly written HTML
|
||||
- Implement all non-essential attribute transforms
|
||||
- Microsoft Word HTML cleaning (i.e. MsoNormal)
|
||||
|
||||
2.0 release
|
||||
- Formatters for plaintext
|
||||
- Auto-paragraphing (be sure to leverage fact that we know when things
|
||||
shouldn't be paragraphed, such as lists and tables).
|
||||
- Linkify URLs
|
||||
- Smileys
|
||||
|
||||
3.0 release
|
||||
- Extended HTML capabilities based on namespacing and tag transforms
|
||||
- Hooks for adding custom processors to custom namespaced tags and
|
||||
attributes, offer default implementation
|
||||
- Lots of documentation and samples
|
||||
|
||||
Unknown release (on a scratch-an-itch basis)
|
||||
- Silently drop content inbetween SCRIPT tags (can be generalized to allow
|
||||
specification of elements that, when detected as foreign, trigger removal
|
||||
of children, although unbalanced tags could wreck havoc (or at least delete
|
||||
the rest of the document)).
|
||||
- Fixes for Firefox's inability to handle COL alignment props (Bug 915)
|
||||
- Automatically add non-breaking spaces to empty table cells when
|
||||
empty-cells:show is applied to have compatibility with Internet Explorer
|
||||
- Non-lossy dumb alternate character encoding transformations, achieved by
|
||||
numerically encoding all non-ASCII characters
|
||||
|
||||
Wontfix
|
||||
- Non-lossy smart alternate character encoding transformations
|
||||
- Pretty-printing HTML, users can use Tidy on the output on entire page
|
||||
|
7
WYSIWYG
7
WYSIWYG
@@ -1,6 +1,6 @@
|
||||
|
||||
WYSIWYG - What You See Is What You Get
|
||||
HTMLPurifier: A Pretty Good Fit for TinyMCE and FCKeditor
|
||||
HTML Purifier: A Pretty Good Fit for TinyMCE and FCKeditor
|
||||
|
||||
Javascript-based WYSIWYG editors, simply stated, are quite amazing. But I've
|
||||
always been wary about using them due to security issues: they handle the
|
||||
@@ -13,6 +13,9 @@ other markup languages still reign supreme. Put simply: filtering HTML is
|
||||
hard work, and these WYSIWYG authors don't offer anything to alleviate that
|
||||
trouble. Therein lies the solution:
|
||||
|
||||
HTMLPurifier is perfect for filtering pure-HTML input from WYSIWYG editors.
|
||||
HTML Purifier is perfect for filtering pure-HTML input from WYSIWYG editors.
|
||||
|
||||
Enough said.
|
||||
|
||||
There is a proof-of-concept integration of HTML Purifier with the Mantis
|
||||
bugtracker at http://hp.jpsband.org/mantis/
|
||||
|
@@ -3,15 +3,24 @@
|
||||
// emulates inserting a dir called HTMLPurifier into your class dir
|
||||
set_include_path(get_include_path() . PATH_SEPARATOR . '../library/');
|
||||
|
||||
require_once 'HTMLPurifier/ConfigDef.php';
|
||||
require_once 'HTMLPurifier/Config.php';
|
||||
require_once 'HTMLPurifier/Lexer/DirectLex.php';
|
||||
require_once 'HTMLPurifier/Lexer/PEARSax3.php';
|
||||
@include_once '../test-settings.php';
|
||||
|
||||
$LEXERS = array(
|
||||
'DirectLex' => new HTMLPurifier_Lexer_DirectLex(),
|
||||
'PEARSax3' => new HTMLPurifier_Lexer_PEARSax3()
|
||||
);
|
||||
require_once 'HTMLPurifier/ConfigSchema.php';
|
||||
require_once 'HTMLPurifier/Config.php';
|
||||
|
||||
$LEXERS = array();
|
||||
$RUNS = isset($GLOBALS['HTMLPurifierTest']['Runs'])
|
||||
? $GLOBALS['HTMLPurifierTest']['Runs'] : 2;
|
||||
|
||||
require_once 'HTMLPurifier/Lexer/DirectLex.php';
|
||||
$LEXERS['DirectLex'] = new HTMLPurifier_Lexer_DirectLex();
|
||||
|
||||
if (!empty($GLOBALS['HTMLPurifierTest']['PEAR'])) {
|
||||
require_once 'HTMLPurifier/Lexer/PEARSax3.php';
|
||||
$LEXERS['PEARSax3'] = new HTMLPurifier_Lexer_PEARSax3();
|
||||
} else {
|
||||
exit('PEAR required to perform benchmark.');
|
||||
}
|
||||
|
||||
if (version_compare(PHP_VERSION, '5', '>=')) {
|
||||
require_once 'HTMLPurifier/Lexer/DOMLex.php';
|
||||
@@ -56,9 +65,12 @@ class RowTimer extends Benchmark_Timer
|
||||
if ($standard == false) $standard = $v['diff'];
|
||||
|
||||
$perc = $v['diff'] * 100 / $standard;
|
||||
$bad_run = ($v['diff'] < 0);
|
||||
|
||||
$out .= '<td align="right">' . number_format($perc, 2, '.', '') .
|
||||
'%</td>';
|
||||
$out .= '<td align="right"'.
|
||||
($bad_run ? ' style="color:#AAA;"' : '').
|
||||
'>' . number_format($perc, 2, '.', '') .
|
||||
'%</td><td>'.number_format($v['diff'],4,'.','').'</td>';
|
||||
|
||||
}
|
||||
|
||||
@@ -79,13 +91,13 @@ function print_lexers() {
|
||||
}
|
||||
|
||||
function do_benchmark($name, $document) {
|
||||
global $LEXERS;
|
||||
global $LEXERS, $RUNS;
|
||||
|
||||
$timer = new RowTimer($name);
|
||||
$timer->start();
|
||||
|
||||
foreach($LEXERS as $key => $lexer) {
|
||||
$tokens = $lexer->tokenizeHTML($document);
|
||||
for ($i=0; $i<$RUNS; $i++) $tokens = $lexer->tokenizeHTML($document);
|
||||
$timer->setMarker($key);
|
||||
}
|
||||
|
||||
@@ -103,7 +115,7 @@ function do_benchmark($name, $document) {
|
||||
<table border="1">
|
||||
<tr><th>Case</th><?php
|
||||
foreach ($LEXERS as $key => $value) {
|
||||
echo '<th>' . htmlspecialchars($key) . '</th>';
|
||||
echo '<th colspan="2">' . htmlspecialchars($key) . '</th>';
|
||||
}
|
||||
?></tr>
|
||||
<?php
|
||||
|
@@ -2,7 +2,7 @@
|
||||
|
||||
set_include_path(get_include_path() . PATH_SEPARATOR . '../library/');
|
||||
|
||||
require_once 'HTMLPurifier/ConfigDef.php';
|
||||
require_once 'HTMLPurifier/ConfigSchema.php';
|
||||
require_once 'HTMLPurifier/Config.php';
|
||||
require_once 'HTMLPurifier/Lexer/DirectLex.php';
|
||||
|
||||
|
214
configdoc/generate.php
Normal file
214
configdoc/generate.php
Normal file
@@ -0,0 +1,214 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Generates XML and HTML documents describing configuration.
|
||||
*/
|
||||
|
||||
/*
|
||||
TODO:
|
||||
- make XML format richer (see below)
|
||||
- extend XSLT transformation (see the corresponding XSLT file)
|
||||
- allow generation of packaged docs that can be easily moved
|
||||
- multipage documentation
|
||||
- determine how to multilingualize
|
||||
- factor out code into classes
|
||||
- generate a table of contents
|
||||
*/
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Check and configure environment
|
||||
|
||||
if (version_compare('5', PHP_VERSION, '>')) exit('Requires PHP 5 or higher.');
|
||||
error_reporting(E_ALL);
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Include HTML Purifier library
|
||||
|
||||
set_include_path('../library' . PATH_SEPARATOR . get_include_path());
|
||||
require_once 'HTMLPurifier.php';
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Setup convenience functions
|
||||
|
||||
function appendHTMLDiv($document, $node, $html) {
|
||||
global $purifier;
|
||||
$html = $purifier->purify($html);
|
||||
$dom_html = $document->createDocumentFragment();
|
||||
$dom_html->appendXML($html);
|
||||
|
||||
$dom_div = $document->createElement('div');
|
||||
$dom_div->setAttribute('xmlns', 'http://www.w3.org/1999/xhtml');
|
||||
$dom_div->appendChild($dom_html);
|
||||
|
||||
$node->appendChild($dom_div);
|
||||
}
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Load copies of HTMLPurifier_ConfigDef and HTMLPurifier
|
||||
|
||||
$schema = HTMLPurifier_ConfigSchema::instance();
|
||||
$purifier = new HTMLPurifier();
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Generate types.xml, a document describing the constraint "type"
|
||||
|
||||
$types_document = new DOMDocument('1.0', 'UTF-8');
|
||||
$types_root = $types_document->createElement('types');
|
||||
$types_document->appendChild($types_root);
|
||||
$types_document->formatOutput = true;
|
||||
foreach ($schema->types as $name => $expanded_name) {
|
||||
$types_type = $types_document->createElement('type', $expanded_name);
|
||||
$types_type->setAttribute('id', $name);
|
||||
$types_root->appendChild($types_type);
|
||||
}
|
||||
$types_document->save('types.xml');
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Generate configdoc.xml, a document documenting configuration directives
|
||||
|
||||
$dom_document = new DOMDocument('1.0', 'UTF-8');
|
||||
$dom_root = $dom_document->createElement('configdoc');
|
||||
$dom_document->appendChild($dom_root);
|
||||
$dom_document->formatOutput = true;
|
||||
|
||||
// add the name of the application
|
||||
$dom_root->appendChild($dom_document->createElement('title', 'HTML Purifier'));
|
||||
|
||||
/*
|
||||
TODO for XML format:
|
||||
- namespace descriptions
|
||||
- enumerated values
|
||||
- default values
|
||||
- create a definition (DTD or other) once interface stabilizes
|
||||
*/
|
||||
|
||||
foreach($schema->info as $namespace_name => $namespace_info) {
|
||||
|
||||
$dom_namespace = $dom_document->createElement('namespace');
|
||||
$dom_root->appendChild($dom_namespace);
|
||||
|
||||
$dom_namespace->setAttribute('id', $namespace_name);
|
||||
$dom_namespace->appendChild(
|
||||
$dom_document->createElement('name', $namespace_name)
|
||||
);
|
||||
$dom_namespace_description = $dom_document->createElement('description');
|
||||
$dom_namespace->appendChild($dom_namespace_description);
|
||||
appendHTMLDiv($dom_document, $dom_namespace_description,
|
||||
$schema->info_namespace[$namespace_name]->description);
|
||||
|
||||
foreach ($namespace_info as $name => $info) {
|
||||
|
||||
$dom_directive = $dom_document->createElement('directive');
|
||||
$dom_namespace->appendChild($dom_directive);
|
||||
|
||||
$dom_directive->setAttribute('id', $namespace_name . '.' . $name);
|
||||
$dom_directive->appendChild(
|
||||
$dom_document->createElement('name', $name)
|
||||
);
|
||||
|
||||
$dom_constraints = $dom_document->createElement('constraints');
|
||||
$dom_directive->appendChild($dom_constraints);
|
||||
|
||||
$dom_constraints->appendChild(
|
||||
$dom_document->createElement('type', $info->type)
|
||||
);
|
||||
if ($info->allowed !== true) {
|
||||
$dom_allowed = $dom_document->createElement('allowed');
|
||||
$dom_constraints->appendChild($dom_allowed);
|
||||
foreach ($info->allowed as $allowed => $bool) {
|
||||
$dom_allowed->appendChild(
|
||||
$dom_document->createElement('value', $allowed)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
$raw_default = $schema->defaults[$namespace_name][$name];
|
||||
if (is_bool($raw_default)) {
|
||||
$default = $raw_default ? 'true' : 'false';
|
||||
} elseif (is_string($raw_default)) {
|
||||
$default = "\"$raw_default\"";
|
||||
} else {
|
||||
$default = print_r(
|
||||
$schema->defaults[$namespace_name][$name], true
|
||||
);
|
||||
}
|
||||
$dom_constraints->appendChild(
|
||||
$dom_document->createElement('default', $default)
|
||||
);
|
||||
|
||||
$dom_descriptions = $dom_document->createElement('descriptions');
|
||||
$dom_directive->appendChild($dom_descriptions);
|
||||
|
||||
foreach ($info->descriptions as $file => $file_descriptions) {
|
||||
foreach ($file_descriptions as $line => $description) {
|
||||
$dom_description = $dom_document->createElement('description');
|
||||
$dom_description->setAttribute('file', $file);
|
||||
$dom_description->setAttribute('line', $line);
|
||||
appendHTMLDiv($dom_document, $dom_description, $description);
|
||||
$dom_descriptions->appendChild($dom_description);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// print_r($dom_document->saveXML());
|
||||
|
||||
// save a copy of the raw XML
|
||||
$dom_document->save('configdoc.xml');
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Generate final output using XSLT
|
||||
|
||||
// load the stylesheet
|
||||
$xsl_stylesheet_name = 'plain';
|
||||
$xsl_stylesheet = "styles/$xsl_stylesheet_name.xsl";
|
||||
$xsl_dom_stylesheet = new DOMDocument();
|
||||
$xsl_dom_stylesheet->load($xsl_stylesheet);
|
||||
|
||||
// setup the XSLT processor
|
||||
$xsl_processor = new XSLTProcessor();
|
||||
|
||||
// perform the transformation
|
||||
$xsl_processor->importStylesheet($xsl_dom_stylesheet);
|
||||
$html_output = $xsl_processor->transformToXML($dom_document);
|
||||
|
||||
// some slight fudges to preserve backwards compatibility
|
||||
$html_output = str_replace('/>', ' />', $html_output); // <br /> not <br>
|
||||
$html_output = str_replace(' xmlns=""', '', $html_output); // rm unnecessary xmlns
|
||||
|
||||
if (class_exists('Tidy')) {
|
||||
// cleanup output
|
||||
$config = array(
|
||||
'indent' => true,
|
||||
'output-xhtml' => true,
|
||||
'wrap' => 80
|
||||
);
|
||||
$tidy = new Tidy;
|
||||
$tidy->parseString($html_output, $config, 'utf8');
|
||||
$tidy->cleanRepair();
|
||||
$html_output = (string) $tidy;
|
||||
}
|
||||
|
||||
// write it to a file (todo: parse into seperate pages)
|
||||
file_put_contents("$xsl_stylesheet_name.html", $html_output);
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Output for instant feedback
|
||||
|
||||
if (php_sapi_name() != 'cli') {
|
||||
echo $html_output;
|
||||
} else {
|
||||
echo 'Files generated successfully.';
|
||||
}
|
||||
|
||||
?>
|
7
configdoc/styles/plain.css
Normal file
7
configdoc/styles/plain.css
Normal file
@@ -0,0 +1,7 @@
|
||||
table {border-collapse:collapse;}
|
||||
table td, table th {padding:0.2em;}
|
||||
|
||||
table.constraints {margin:0 0 1em;}
|
||||
table.constraints th {text-align:left;padding-left:0.4em;}
|
||||
table.constraints td {padding-right:0.4em;}
|
||||
table.constraints td pre {margin:0;}
|
105
configdoc/styles/plain.xsl
Normal file
105
configdoc/styles/plain.xsl
Normal file
@@ -0,0 +1,105 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<xsl:stylesheet
|
||||
version = "1.0"
|
||||
xmlns = "http://www.w3.org/1999/xhtml"
|
||||
xmlns:xsl = "http://www.w3.org/1999/XSL/Transform"
|
||||
>
|
||||
<xsl:output
|
||||
method = "xml"
|
||||
encoding = "UTF-8"
|
||||
doctype-public = "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
doctype-system = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"
|
||||
indent = "no"
|
||||
media-type = "text/html"
|
||||
/>
|
||||
|
||||
<xsl:variable name="typeLookup" select="document('../types.xml')" />
|
||||
|
||||
<xsl:template match="/">
|
||||
<html lang="en" xml:lang="en">
|
||||
<head>
|
||||
<title><xsl:value-of select="/configdoc/title" /> Configuration Documentation</title>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
|
||||
<link rel="stylesheet" type="text/css" href="styles/plain.css" />
|
||||
</head>
|
||||
<body>
|
||||
<xsl:apply-templates />
|
||||
</body>
|
||||
</html>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="title">
|
||||
<h1><xsl:value-of select="/configdoc/title" /> Configuration Documentation</h1>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="namespace">
|
||||
<xsl:apply-templates />
|
||||
<xsl:if test="count(child::directive)=0">
|
||||
<p>No configuration directives defined for this namespace.</p>
|
||||
</xsl:if>
|
||||
</xsl:template>
|
||||
<xsl:template match="namespace/name">
|
||||
<h2 id="{../@id}"><xsl:value-of select="text()" /></h2>
|
||||
</xsl:template>
|
||||
<xsl:template match="namespace/description">
|
||||
<div class="description">
|
||||
<xsl:copy-of select="div/node()" />
|
||||
</div>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="directive">
|
||||
<xsl:apply-templates />
|
||||
</xsl:template>
|
||||
<xsl:template match="directive/name">
|
||||
<h3 id="{../@id}"><xsl:value-of select="text()" /></h3>
|
||||
</xsl:template>
|
||||
<xsl:template match="directive/constraints">
|
||||
<table class="constraints">
|
||||
<xsl:apply-templates />
|
||||
<!-- Calculated other values -->
|
||||
<tr>
|
||||
<th>Used by:</th>
|
||||
<td>
|
||||
<xsl:for-each select="../descriptions/description">
|
||||
<xsl:if test="position()>1">, </xsl:if>
|
||||
<xsl:value-of select="@file" />
|
||||
</xsl:for-each>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</xsl:template>
|
||||
<xsl:template match="directive//description">
|
||||
<div class="description">
|
||||
<xsl:copy-of select="div/node()" />
|
||||
</div>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="constraints/type">
|
||||
<tr>
|
||||
<th>Type:</th>
|
||||
<td>
|
||||
<xsl:variable name="type" select="text()" />
|
||||
<xsl:attribute name="class">type type-<xsl:value-of select="$type" /></xsl:attribute>
|
||||
<xsl:value-of select="$typeLookup/types/type[@id=$type]/text()" />
|
||||
</td>
|
||||
</tr>
|
||||
</xsl:template>
|
||||
<xsl:template match="constraints/allowed">
|
||||
<tr>
|
||||
<th>Allowed values:</th>
|
||||
<td>
|
||||
<xsl:for-each select="value"><!--
|
||||
--><xsl:if test="position()>1">, </xsl:if>
|
||||
"<xsl:value-of select="." />"<!--
|
||||
--></xsl:for-each>
|
||||
</td>
|
||||
</tr>
|
||||
</xsl:template>
|
||||
<xsl:template match="constraints/default">
|
||||
<tr>
|
||||
<th>Default:</th>
|
||||
<td><pre><xsl:value-of select="." xml:space="preserve" /></pre></td>
|
||||
</tr>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
@@ -11,23 +11,24 @@ profiling.
|
||||
Here we go:
|
||||
|
||||
AttrDef
|
||||
Class - doesn't support Unicode characters, uses regular expressions
|
||||
Lang - code duplication, premature optimization, doesn't consult official
|
||||
lists
|
||||
Pixels/Length/MultiLength - implemented according to HTML spec (excludes
|
||||
code reuse in CSS)
|
||||
URI - multiple regular expressions, needs host validation routines factored
|
||||
out for mailto scheme, IPv6 validation is broken (fringe), unintuitive
|
||||
variable overwriting, missing validation for query, fragment and path,
|
||||
Class - doesn't support Unicode characters (fringe); uses regular
|
||||
expressions
|
||||
Lang - code duplication; premature optimization; doesn't consult official
|
||||
lists (fringe)
|
||||
Length - easily mistaken for CSSLength
|
||||
URI - multiple regular expressions; needs host validation routines factored
|
||||
out for mailto scheme; missing validation for query; fragment and path,
|
||||
no percent-encode fixing
|
||||
CSS - parser doesn't accept advanced CSS (fringe)
|
||||
AttrTransform - doesn't accept AttrContext, non-validating
|
||||
Lang - invalid xml:lang value can overwrite valid lang value (fringe)
|
||||
ChildDef - not-allowed nodes translated to text, likely invalid handling
|
||||
Number - constructor interface is inconsistent with Integer
|
||||
AttrTransform - doesn't accept AttrContext
|
||||
Config - "load configuration" hooks missing, rich set* accessors missing
|
||||
ConfigSchema - redefinition is a mess
|
||||
Strategy
|
||||
FixNesting - cannot bubble nodes out of structures
|
||||
MakeWellFormed - insufficient automatic closing definitions
|
||||
MakeWellFormed - insufficient automatic closing definitions (check HTML
|
||||
spec for optional end tags, also, closing based on type (block/inline)
|
||||
might be efficient).
|
||||
RemoveForeignElements - should be run in parallel with MakeWellFormed
|
||||
URIScheme - needs to have callable generic checks
|
||||
ftp - missing typecode check
|
||||
|
@@ -28,6 +28,7 @@ time. Note the naming convention: %Namespace.Directive
|
||||
|
||||
%Attr.MaxWidth,
|
||||
%Attr.MaxHeight - caps for width and height related checks.
|
||||
(a hack in Pixels for an image crashing attack could be replaced by this)
|
||||
|
||||
%URI.Munge - will munge all URIs to a different URI, which should redirect
|
||||
the user to the applicable page. A urlencoded version of the URI
|
||||
|
@@ -16,3 +16,23 @@ make this optional: they will supply a default configuration object if none
|
||||
are passed. These classes are: HTMLPurifier::*, Generator::generateFromTokens
|
||||
and Lexer::tokenizeHTML. However, whenever a valid configuration object
|
||||
is defined, that object should be used.
|
||||
|
||||
In relation to HTMLDefinition and CSSDefinition, there is a special class
|
||||
of directives that influence the *construction* of the Definition object.
|
||||
A standard call pattern would look like:
|
||||
|
||||
1. Client calls Config->getHTMLDefinition()
|
||||
2. Config calls HTMLDefinition->createNew(this)
|
||||
3. HTMLDefinition constructs itself with base configuration
|
||||
4. HTMLDefinition calls Config->get('HTMLDefinition')
|
||||
5. Config returns array of directives that later construction
|
||||
6. HTMLDefinition performs operations and changes specified by directives
|
||||
7. HTMLPurifier returns constructed definition
|
||||
8. Config caches definition so it doesn't have to be generated again
|
||||
9. Config returns definition
|
||||
|
||||
You could also override Config's copy of the definition with your own
|
||||
custom copy, which OVERRIDES all directives. Only the base, vanilla copy
|
||||
is the Singleton, the object actually interfaced with is a operated-upon
|
||||
clone of that object. Also, if an update to the directives would update
|
||||
the definition, you'd have to force reconstruction.
|
||||
|
@@ -22,7 +22,10 @@ the development of this library in these forum threads:</p>
|
||||
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53539">http and ftp versus news and mailto</a></li>
|
||||
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53579">HTMLPurifier - Take your best shot</a></li>
|
||||
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53664">Need help optimizing a block of code</a>
|
||||
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53861">Non-SGML characters</a>
|
||||
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=54283">Wordpress makes me cry</a>
|
||||
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=54478">Parameter Object vs. Parameter Array vs. Parameter Functions</a>
|
||||
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=54521">Convert encoding where output cannot represent characters</a>
|
||||
</ul>
|
||||
|
||||
</body>
|
||||
</html>
|
@@ -1,272 +0,0 @@
|
||||
<!-- Transform %TextAlign to align:value in style -->
|
||||
|
||||
<!-- text alignment for p, div, h1-h6. The default is
|
||||
align="left" for ltr headings, "right" for rtl
|
||||
|
||||
Move to style! -->
|
||||
<!ENTITY % TextAlign "DEPRECATED align (left|center|right|justify) #IMPLIED">
|
||||
|
||||
<!-- type and start should have CSS equivalents, but they'll need to
|
||||
be translated intelligently -->
|
||||
<!ENTITY % ULStyle "(disc|square|circle)">
|
||||
<!-- Ordered list numbering style
|
||||
|
||||
1 arabic numbers 1, 2, 3, ...
|
||||
a lower alpha a, b, c, ...
|
||||
A upper alpha A, B, C, ...
|
||||
i lower roman i, ii, iii, ...
|
||||
I upper roman I, II, III, ...
|
||||
|
||||
The style is applied to the sequence number which by default
|
||||
is reset to 1 for the first list item in an ordered list.
|
||||
-->
|
||||
<!ENTITY % OLStyle "CDATA">
|
||||
<!-- LIStyle is constrained to: "(%ULStyle;|%OLStyle;)" -->
|
||||
<!ENTITY % LIStyle "CDATA">
|
||||
|
||||
<!ATTLIST ol
|
||||
%attrs;
|
||||
DEPRECATED type %OLStyle; #IMPLIED
|
||||
DEPRECATED start %Number; #IMPLIED
|
||||
>
|
||||
|
||||
<!ATTLIST li
|
||||
%attrs;
|
||||
DEPRECATED type %LIStyle; #IMPLIED
|
||||
DEPRECATED value %Number; #IMPLIED
|
||||
>
|
||||
|
||||
<!ATTLIST hr
|
||||
%attrs;
|
||||
DEPRECATED align (left|center|right) #IMPLIED
|
||||
DEPRECATED size %Pixels; #IMPLIED
|
||||
DEPRECATED width %Length; #IMPLIED
|
||||
>
|
||||
|
||||
<!ATTLIST pre
|
||||
%attrs;
|
||||
DEPRECATED width %Number; #IMPLIED
|
||||
>
|
||||
|
||||
<!ATTLIST blockquote
|
||||
%attrs;
|
||||
cite %URI; #IMPLIED
|
||||
>
|
||||
|
||||
<!ATTLIST ins
|
||||
%attrs;
|
||||
cite %URI; #IMPLIED
|
||||
datetime %Datetime; #IMPLIED
|
||||
>
|
||||
<!ATTLIST del
|
||||
%attrs;
|
||||
cite %URI; #IMPLIED
|
||||
datetime %Datetime; #IMPLIED
|
||||
>
|
||||
|
||||
<!ATTLIST a
|
||||
%attrs;
|
||||
name NMTOKEN #IMPLIED // ID
|
||||
href %URI; #IMPLIED
|
||||
rel %LinkTypes; #IMPLIED // needs policing
|
||||
rev %LinkTypes; #IMPLIED // see rel
|
||||
target %FrameTarget; #IMPLIED // usually not used, but might be
|
||||
>
|
||||
|
||||
<!ATTLIST bdo
|
||||
%coreattrs; // !#!
|
||||
lang %LanguageCode; #IMPLIED
|
||||
xml:lang %LanguageCode; #IMPLIED
|
||||
dir (ltr|rtl) #REQUIRED
|
||||
>
|
||||
|
||||
<!ATTLIST br
|
||||
%coreattrs; // !#!
|
||||
DEPRECATED clear (left|all|right|none) "none"
|
||||
>
|
||||
|
||||
<!ELEMENT q %Inline;> <!-- inlined quote -->
|
||||
<!ATTLIST q
|
||||
%attrs;
|
||||
cite %URI; #IMPLIED
|
||||
>
|
||||
|
||||
<!ATTLIST img
|
||||
%attrs;
|
||||
src %URI; #REQUIRED
|
||||
alt %Text; #REQUIRED
|
||||
DEPRECATED name NMTOKEN #IMPLIED // ID
|
||||
longdesc %URI; #IMPLIED
|
||||
height %Length; #IMPLIED // dubious, but we'll allow
|
||||
width %Length; #IMPLIED //
|
||||
DEPRECATED align %ImgAlign; #IMPLIED
|
||||
DEPRECATED border %Length; #IMPLIED
|
||||
DEPRECATED hspace %Pixels; #IMPLIED // left/right margin
|
||||
DEPRECATED vspace %Pixels; #IMPLIED // up/down margin
|
||||
>
|
||||
|
||||
<!--
|
||||
The border attribute sets the thickness of the frame around the
|
||||
table. The default units are screen pixels.
|
||||
|
||||
The frame attribute specifies which parts of the frame around
|
||||
the table should be rendered. The values are not the same as
|
||||
CALS to avoid a name clash with the valign attribute.
|
||||
-->
|
||||
<!ENTITY % TFrame "(void|above|below|hsides|lhs|rhs|vsides|box|border)">
|
||||
|
||||
<!--
|
||||
The rules attribute defines which rules to draw between cells:
|
||||
|
||||
If rules is absent then assume:
|
||||
"none" if border is absent or border="0" otherwise "all"
|
||||
-->
|
||||
|
||||
<!ENTITY % TRules "(none | groups | rows | cols | all)">
|
||||
|
||||
<!-- horizontal placement of table relative to document -->
|
||||
<!ENTITY % TAlign "(left|center|right)">
|
||||
|
||||
<!-- horizontal alignment attributes for cell contents
|
||||
|
||||
char alignment char, e.g. char=':'
|
||||
charoff offset for alignment char
|
||||
-->
|
||||
<!ENTITY % cellhalign
|
||||
"align (left|center|right|justify|char) #IMPLIED
|
||||
char %Character; #IMPLIED
|
||||
charoff %Length; #IMPLIED"
|
||||
>
|
||||
|
||||
<!-- vertical alignment attributes for cell contents -->
|
||||
<!ENTITY % cellvalign
|
||||
"valign (top|middle|bottom|baseline) #IMPLIED"
|
||||
>
|
||||
|
||||
<!-- we may want to convert some of these nonetheless -->
|
||||
<!ATTLIST table
|
||||
%attrs;
|
||||
summary %Text; #IMPLIED
|
||||
width %Length; #IMPLIED
|
||||
border %Pixels; #IMPLIED
|
||||
frame %TFrame; #IMPLIED
|
||||
rules %TRules; #IMPLIED
|
||||
cellspacing %Length; #IMPLIED
|
||||
cellpadding %Length; #IMPLIED
|
||||
DEPRECATED align %TAlign; #IMPLIED
|
||||
DEPRECATED bgcolor %Color; #IMPLIED
|
||||
>
|
||||
|
||||
<!ENTITY % CAlign "(top|bottom|left|right)">
|
||||
|
||||
<!ATTLIST caption
|
||||
%attrs;
|
||||
DEPRECATED align %CAlign; #IMPLIED // watch, it's a special set
|
||||
>
|
||||
|
||||
<!--
|
||||
colgroup groups a set of col elements. It allows you to group
|
||||
several semantically related columns together.
|
||||
-->
|
||||
<!ATTLIST colgroup
|
||||
%attrs;
|
||||
span %Number; "1"
|
||||
width %MultiLength; #IMPLIED
|
||||
%cellhalign; // very interesting
|
||||
%cellvalign;
|
||||
>
|
||||
|
||||
<!--
|
||||
col elements define the alignment properties for cells in
|
||||
one or more columns.
|
||||
|
||||
The width attribute specifies the width of the columns, e.g.
|
||||
|
||||
width=64 width in screen pixels
|
||||
width=0.5* relative width of 0.5
|
||||
|
||||
The span attribute causes the attributes of one
|
||||
col element to apply to more than one column.
|
||||
-->
|
||||
<!ATTLIST col
|
||||
%attrs;
|
||||
span %Number; "1"
|
||||
width %MultiLength; #IMPLIED
|
||||
%cellhalign;
|
||||
%cellvalign;
|
||||
>
|
||||
|
||||
<!--
|
||||
Use thead to duplicate headers when breaking table
|
||||
across page boundaries, or for static headers when
|
||||
tbody sections are rendered in scrolling panel.
|
||||
|
||||
Use tfoot to duplicate footers when breaking table
|
||||
across page boundaries, or for static footers when
|
||||
tbody sections are rendered in scrolling panel.
|
||||
|
||||
Use multiple tbody sections when rules are needed
|
||||
between groups of table rows.
|
||||
-->
|
||||
<!ATTLIST thead
|
||||
%attrs;
|
||||
%cellhalign;
|
||||
%cellvalign;
|
||||
>
|
||||
|
||||
<!ATTLIST tfoot
|
||||
%attrs;
|
||||
%cellhalign;
|
||||
%cellvalign;
|
||||
>
|
||||
|
||||
<!ATTLIST tbody
|
||||
%attrs;
|
||||
%cellhalign;
|
||||
%cellvalign;
|
||||
>
|
||||
|
||||
<!ATTLIST tr
|
||||
%attrs;
|
||||
%cellhalign;
|
||||
%cellvalign;
|
||||
DEPRECATED bgcolor %Color; #IMPLIED
|
||||
>
|
||||
|
||||
<!-- Scope is simpler than headers attribute for common tables -->
|
||||
<!ENTITY % Scope "(row|col|rowgroup|colgroup)">
|
||||
|
||||
<!-- th is for headers, td for data and for cells acting as both -->
|
||||
|
||||
<!ATTLIST th
|
||||
%attrs;
|
||||
abbr %Text; #IMPLIED
|
||||
axis CDATA #IMPLIED
|
||||
headers IDREFS #IMPLIED
|
||||
scope %Scope; #IMPLIED
|
||||
rowspan %Number; "1"
|
||||
colspan %Number; "1"
|
||||
%cellhalign;
|
||||
%cellvalign;
|
||||
DEPRECATED nowrap (nowrap) #IMPLIED
|
||||
DEPRECATED bgcolor %Color; #IMPLIED
|
||||
DEPRECATED width %Length; #IMPLIED
|
||||
DEPRECATED height %Length; #IMPLIED
|
||||
>
|
||||
|
||||
<!ATTLIST td
|
||||
%attrs;
|
||||
abbr %Text; #IMPLIED
|
||||
axis CDATA #IMPLIED
|
||||
headers IDREFS #IMPLIED
|
||||
scope %Scope; #IMPLIED
|
||||
rowspan %Number; "1"
|
||||
colspan %Number; "1"
|
||||
%cellhalign;
|
||||
%cellvalign;
|
||||
DEPRECATED nowrap (nowrap) #IMPLIED
|
||||
DEPRECATED bgcolor %Color; #IMPLIED
|
||||
DEPRECATED width %Length; #IMPLIED
|
||||
DEPRECATED height %Length; #IMPLIED
|
||||
>
|
||||
|
@@ -1,4 +1,8 @@
|
||||
<!DOCTYPE html
|
||||
<?php
|
||||
|
||||
header('Content-type:text/html;charset=UTF-8');
|
||||
|
||||
?><!DOCTYPE html
|
||||
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
<html>
|
||||
@@ -10,8 +14,6 @@
|
||||
<h1>HTMLPurifier Live Demo</h1>
|
||||
<?php
|
||||
|
||||
set_time_limit(120);
|
||||
|
||||
set_include_path('../../library' . PATH_SEPARATOR . get_include_path());
|
||||
require_once 'HTMLPurifier.php';
|
||||
|
||||
@@ -19,7 +21,9 @@ if (!empty($_POST['html'])) {
|
||||
|
||||
$html = get_magic_quotes_gpc() ? stripslashes($_POST['html']) : $_POST['html'];
|
||||
|
||||
$purifier = new HTMLPurifier();
|
||||
$config = HTMLPurifier_Config::createDefault();
|
||||
$config->set('Core', 'TidyFormat', !empty($_POST['tidy']));
|
||||
$purifier = new HTMLPurifier($config);
|
||||
$pure_html = $purifier->purify($html);
|
||||
|
||||
?>
|
||||
@@ -58,13 +62,18 @@ if (isset($_GET['profile']) || isset($_GET['XDEBUG_PROFILE'])) {
|
||||
<legend>HTML</legend>
|
||||
<textarea name="html" cols="60" rows="15"><?php
|
||||
|
||||
if (isset($html)) echo htmlspecialchars($html, ENT_COMPAT, 'UTF-8');
|
||||
|
||||
if (isset($html)) {
|
||||
echo htmlspecialchars(
|
||||
HTMLPurifier_Encoder::cleanUTF8($html), ENT_COMPAT, 'UTF-8');
|
||||
}
|
||||
?></textarea>
|
||||
<div>Nicely format output with Tidy? <input type="checkbox" value="1"
|
||||
name="tidy"<?php if (!empty($_POST['tidy'])) echo ' checked="checked"'; ?> /></div>
|
||||
<div>
|
||||
<input type="submit" value="Submit" name="submit" class="button" />
|
||||
</div>
|
||||
</fieldset>
|
||||
</form>
|
||||
<p>Return to <a href="http://hp.jpsband.org/">HTMLPurifier's home page</a>.</p>
|
||||
</body>
|
||||
</html>
|
67
docs/filter-levels.txt
Normal file
67
docs/filter-levels.txt
Normal file
@@ -0,0 +1,67 @@
|
||||
|
||||
Filter Levels
|
||||
When one size *does not* fit all
|
||||
|
||||
The more I think about it, the less sense it makes for maintaining one huge
|
||||
monolithic HTMLDefinition class. There's simply so much variation that
|
||||
could go into this definition: the set of HTML good for blog entries is
|
||||
definitely too large for HTML that would be allowed in blog comments. Going
|
||||
from Transitional to Strict requires changes to the definition.
|
||||
|
||||
However, allowing users to specify their own whitelists was an idea I
|
||||
rejected from the start. Simply put, the typical programmer is too lazy
|
||||
to actually go through the trouble of investigating which tags, attributes
|
||||
and properties to allow. HTMLDefinition makes a big part of what HTMLPurifier
|
||||
is.
|
||||
|
||||
The idea, then, is to setup fundamentally different set of definitions, which
|
||||
can further be customized using simpler configuration options.
|
||||
|
||||
Here are some fuzzy levels you could set:
|
||||
|
||||
1. Comments - Wordpress recommends a, abbr, acronym, b, blockquote, cite,
|
||||
code, em, i, strike, strong; however, you could get away with only a, b and
|
||||
i; also having p and pre tags would be helpful.
|
||||
2. Pages - As permissive as possible without allowing XSS. No protection
|
||||
against bad design sense, unfortunantely. Suitable for wiki and page
|
||||
environments.
|
||||
3. Lint - Accept everything in the spec, a Tidy wannabe.
|
||||
|
||||
I've also decomposed tags into risk levels. An asterisk indicates that no one
|
||||
really uses that tag, tilde indicates it's deprecated.
|
||||
|
||||
1 - blockquote, code, em, i, p, tt / strong, sub, sup
|
||||
1* - abbr, acronym, bdo, cite, dfn, kbd, q, samp
|
||||
2 - b, br, del, div, pre, span / ins, s, strike ~ u
|
||||
3 - h2, h3, h4, h5, h6 ~ center
|
||||
4 - h1, big ~ font
|
||||
5 - a
|
||||
7 - area, map
|
||||
|
||||
Lists - dd, dl, dt, li, ol, ul ~ menu, dir
|
||||
Tables - caption, table, td, th, tr / col, colgroup, tbody, tfoot, thead
|
||||
Forms - fieldset, form, input, lable, legend, optgroup, option, select, textarea
|
||||
XSS - noscript, object, script ~ applet
|
||||
|
||||
Meta - base, basefont, body, head, html, link, meta, style, title
|
||||
Frames - frame, frameset, iframe
|
||||
|
||||
And tag specific notes:
|
||||
|
||||
a - general problems involving linkspam
|
||||
b - too much bold is bad, typographically speaking bold is discouraged
|
||||
br - often misused
|
||||
center - CSS, usually no legit use
|
||||
del - only useful in editing context
|
||||
div - little meaning in certain contexts i.e. blog comment
|
||||
h1 - usually no legit use, as header is already set by application
|
||||
h* - not needed in blog comments
|
||||
hr - usually not necessary in blog comments
|
||||
img - could be extremely undesirable if linking to external pics
|
||||
pre - could use formatting, only useful in code contexts
|
||||
q - very little support
|
||||
s - transform into span with styling or del?
|
||||
small - technically presentational
|
||||
span - depends on attribute allowances
|
||||
sub, sup - specialized
|
||||
u - little legit use, prefer class with text-decoration
|
@@ -6,8 +6,9 @@ help you find the correct functionality more quickly. Here they are:
|
||||
|
||||
All classes occupy the HTMLPurifier pseudo-namespace.
|
||||
This means that all classes are prefixed with HTMLPurifier_. As such, all
|
||||
names under HTMLPurifier_ are reserved, and userspace extensions should
|
||||
be registered in a different namespace (or the main namespace).
|
||||
names under HTMLPurifier_ are reserved. I recommend that you use the name
|
||||
HTMLPurifierX_YourName_ClassName, especially if you want to take advantage
|
||||
of HTMLPurifier_ConfigDef.
|
||||
|
||||
All classes correspond to their path if library/ was in the include path
|
||||
HTMLPurifier_AttrDef is located at HTMLPurifier/AttrDef.php; replace
|
||||
|
@@ -2,9 +2,10 @@
|
||||
Optimization
|
||||
|
||||
Here are some possible optimization techniques we can apply to code sections if
|
||||
they turn out to be slow. Be sure not to prematurely optimize though!
|
||||
they turn out to be slow. Be sure not to prematurely optimize: if you get
|
||||
that itch, put it here!
|
||||
|
||||
- Make Tokens Flyweights
|
||||
- Make Tokens Flyweights (may prove problematic, probably not worth it)
|
||||
- Rewrite regexps into PHP code
|
||||
- Serialize the Definition object
|
||||
- Batch regexp validation (do as many per function call as possible)
|
||||
|
@@ -12,13 +12,14 @@ th {text-align:left;padding-top:1.4em;font-size:13pt;
|
||||
border-bottom:2px solid #000;background:#FFF;}
|
||||
thead th {text-align:left;padding:0.1em;background-color:#EEE;}
|
||||
|
||||
.impl-yes {background:#AFA;}
|
||||
.impl-yes {background:#9D9;}
|
||||
.impl-partial {background:#FFA;}
|
||||
.impl-no {background:#FAA;}
|
||||
.impl-no {background:#CCC;}
|
||||
|
||||
.danger {background:#FEE;}
|
||||
.danger {color:#600;}
|
||||
.css1 {color:#060;}
|
||||
.required {font-weight:bold;}
|
||||
.feature {color:#999;}
|
||||
|
||||
</style>
|
||||
|
||||
@@ -34,91 +35,9 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
|
||||
<tr><td class="impl-no">Will not implement</td></tr>
|
||||
<tr><td class="danger">Dangerous attribute/property</td></tr>
|
||||
<tr><td class="css1">Present in CSS1</td></tr>
|
||||
<tr><td class="feature">Feature, requires extra work</td></tr>
|
||||
</tbody></table>
|
||||
|
||||
<h2>Interesting Attributes</h2>
|
||||
|
||||
<table cellspacing="0">
|
||||
|
||||
<thead>
|
||||
<tr><th>Attribute</th><th>Tags</th><th>Notes</th></tr>
|
||||
</thead>
|
||||
|
||||
<!--
|
||||
<tr><th></th></tr>
|
||||
<tbody>
|
||||
<tr><td>-</td><td>-</td><td>-</td></tr>
|
||||
</tbody>
|
||||
-->
|
||||
|
||||
<tbody>
|
||||
<tr><th colspan="3">CSS</th></tr>
|
||||
<tr class="impl-partial"><td>style</td><td>All</td><td>Needs CSS parser</td></tr>
|
||||
</tbody>
|
||||
|
||||
<tbody>
|
||||
<tr><th colspan="3">Questionable</th></tr>
|
||||
<tr class="impl-no"><td>accesskey</td><td>A</td><td>May interfere with main interface</td></tr>
|
||||
<tr class="impl-no"><td>tabindex</td><td>A</td><td>May interfere with main interface</td></tr>
|
||||
<tr><td>target</td><td>A</td><td>Config enabled, only useful for frame layouts</td></tr>
|
||||
</tbody>
|
||||
|
||||
<tbody>
|
||||
<tr><th colspan="3">Miscellaneous</th></tr>
|
||||
<tr><td>datetime</td><td>DEL, INS</td><td>No visible effect, ISO format</td></tr>
|
||||
<tr><td>rel</td><td>A</td><td>Largely user-defined: nofollow, tag (see microformats)</td></tr>
|
||||
<tr><td>rev</td><td>A</td><td>Largely user-defined: vote-*</td></tr>
|
||||
<tr class="impl-no"><td>axis</td><td>TD, TH</td><td>W3C only: No browser implementation</td></tr>
|
||||
<tr class="impl-no"><td>char</td><td>COL, COLGROUP, TBODY, TD, TFOOT, TH, THEAD, TR</td><td>W3C only: No browser implementation</td></tr>
|
||||
<tr class="impl-no"><td>headers</td><td>TD, TH</td><td>W3C only: No browser implementation</td></tr>
|
||||
<tr class="impl-no"><td>scope</td><td>TD, TH</td><td>W3C only: No browser implementation</td></tr>
|
||||
</tbody>
|
||||
|
||||
<tbody class="impl-yes">
|
||||
<tr><th colspan="3">URI</th></tr>
|
||||
<tr><td rowspan="2">cite</td><td>BLOCKQUOTE, Q</td><td>For attribution</td></tr>
|
||||
<tr><td>DEL, INS</td><td>Link to explanation why it changed</td></tr>
|
||||
<tr><td>href</td><td>A</td><td>-</td></tr>
|
||||
<tr><td>longdesc</td><td>IMG</td><td>-</td></tr>
|
||||
<tr class="required"><td>src</td><td>IMG</td><td>Required</td></tr>
|
||||
</tbody>
|
||||
|
||||
<tbody>
|
||||
<tr><th colspan="3">Transform</th></tr>
|
||||
<tr><td rowspan="5">align</td><td>CAPTION</td><td>Near-equiv style 'caption-side', drop left and right</td></tr>
|
||||
<tr><td>IMG</td><td rowspan="2">Margin-left and margin-right = auto or parent div</td></tr>
|
||||
<tr><td>TABLE</td></tr>
|
||||
<tr><td>HR</td><td>Equivalent style 'text-align' (IE tested)</td></tr>
|
||||
<tr class="impl-yes"><td>H1, H2, H3, H4, H5, H6, P</td><td>Equivalent style 'text-align'</td></tr>
|
||||
<tr class="required impl-yes"><td>alt</td><td>IMG</td><td>Required, insert image filename if src is present or default invalid image text</td></tr>
|
||||
<tr><td rowspan="3">bgcolor</td><td>TABLE</td><td>Equivalent style 'background-color' (IE tested)</td></tr>
|
||||
<tr><td>TR</td><td>Equivalent style 'background-color' (IE tested)</td></tr>
|
||||
<tr><td>TD, TH</td><td>Equivalent style 'background-color'</td></tr>
|
||||
<tr><td>border</td><td>IMG</td><td>Equivalent style 'border-width', only applies when link present</td></tr>
|
||||
<tr><td>clear</td><td>BR</td><td>Near-equiv style 'clear', transform 'all' into 'both'</td></tr>
|
||||
<tr class="impl-no"><td>compact</td><td>DL, OL, UL</td><td>Boolean, needs custom CSS class</td></tr>
|
||||
<tr class="required impl-yes"><td>dir</td><td>BDO</td><td>Required, insert ltr (or configuration value) if none</td></tr>
|
||||
<tr><td>height</td><td>TD, TH</td><td>Near-equiv style 'height', needs px suffix if original was in pixels</td></tr>
|
||||
<tr><td>hspace</td><td>IMG</td><td>Near-equiv styles 'margin-top' and 'margin-bottom', needs px suffix</td></tr>
|
||||
<tr class="impl-yes"><td>lang</td><td>*</td><td>Copy value to xml:lang</td></tr>
|
||||
<tr><td rowspan="2">name</td><td>IMG</td><td>Turn into ID</td></tr>
|
||||
<tr><td>A</td><td>Turn into ID? (not deprecated, though in which specs?)</td></tr>
|
||||
<tr><td>noshade</td><td>HR</td><td>Boolean, style 'border-style:solid;'</td></tr>
|
||||
<tr><td>nowrap</td><td>TD, TH</td><td>Boolean, style 'white-space:nowrap;' (not compat with IE5)</td></tr>
|
||||
<tr><td>size</td><td>HR</td><td>Near-equiv 'width', needs px suffix if original was pixels</td></tr>
|
||||
<tr class="required impl-yes"><td>src</td><td>IMG</td><td>Required, insert blank or default img if not set</td></tr>
|
||||
<tr><td>start</td><td>OL</td><td>Poorly supported 'counter-reset', transform may not be desirable</td></tr>
|
||||
<tr><td rowspan="3">type</td><td>LI</td><td rowspan="3">Equivalent style 'list-style-type', different allowed values though. (needs testing)</td></tr>
|
||||
<tr><td>OL</td></tr>
|
||||
<tr><td>UL</td></tr>
|
||||
<tr><td>value</td><td>LI</td><td>Poorly supported 'counter-reset', transform may not be desirable, see ol.start. Configurable.</td></tr>
|
||||
<tr><td>vspace</td><td>IMG</td><td>Near-equiv styles 'margin-left' and 'margin-right', needs px suffix, see hspace</td></tr>
|
||||
<tr><td rowspan="2">width</td><td>HR</td><td rowspan="2">Near-equiv style 'width', needs px suffix if original was pixels</td></tr>
|
||||
<tr><td>TD, TH</td></tr>
|
||||
</tbody>
|
||||
|
||||
</table>
|
||||
|
||||
<h3>CSS</h3>
|
||||
|
||||
<table cellspacing="0">
|
||||
@@ -134,12 +53,12 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
|
||||
<tbody>
|
||||
<tr><th colspan="2">Standard</th></tr>
|
||||
<tr class="css1 impl-yes"><td>background-color</td><td>COMPOSITE(<color>, transparent)</td></tr>
|
||||
<tr class="css1"><td>background</td><td>SHORTHAND</td></tr>
|
||||
<tr class="css1"><td>border</td><td>SHORTHAND, MULTIPLE</td></tr>
|
||||
<tr class="css1 impl-yes"><td>background</td><td>SHORTHAND, only for color, see below for info on background-image and friends</td></tr>
|
||||
<tr class="css1 impl-yes"><td>border</td><td>SHORTHAND, MULTIPLE</td></tr>
|
||||
<tr class="css1 impl-yes"><td>border-color</td><td>MULTIPLE</td></tr>
|
||||
<tr class="css1 impl-yes"><td>border-style</td><td>MULTIPLE</td></tr>
|
||||
<tr class="css1 impl-yes"><td>border-width</td><td>MULTIPLE</td></tr>
|
||||
<tr class="css1"><td>border-*</td><td>SHORTHAND</td></tr>
|
||||
<tr class="css1 impl-yes"><td>border-*</td><td>SHORTHAND</td></tr>
|
||||
<tr class="impl-yes"><td>border-*-color</td><td>COMPOSITE(<color>, transparent)</td></tr>
|
||||
<tr class="impl-yes"><td>border-*-style</td><td>ENUM(none, hidden, dotted, dashed,
|
||||
solid, double, groove, ridge, inset, outset)</td></tr>
|
||||
@@ -148,7 +67,7 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
|
||||
<tr class="css1 impl-yes"><td>color</td><td><color></td></tr>
|
||||
<tr class="css1 impl-yes"><td>float</td><td>ENUM(left, right, none), May require layout
|
||||
precautions with clear</td></tr>
|
||||
<tr class="css1"><td>font</td><td>SHORTHAND</td></tr>
|
||||
<tr class="css1 impl-yes"><td>font</td><td>SHORTHAND</td></tr>
|
||||
<tr class="css1 impl-yes"><td>font-family</td><td>CSS validator may complain if fallback font
|
||||
family not specified</td></tr>
|
||||
<tr class="css1 impl-yes"><td>font-size</td><td>COMPOSITE(<absolute-size>,
|
||||
@@ -167,7 +86,7 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
|
||||
Well-supported values are: disc, circle, square,
|
||||
decimal, lower-roman, upper-roman, lower-alpha and upper-alpha. See also
|
||||
CSS 3. Mostly IE lack of support.</td></tr>
|
||||
<tr class="css1"><td>list-style</td><td>SHORTHAND</td></tr>
|
||||
<tr class="css1 impl-yes"><td>list-style</td><td>SHORTHAND</td></tr>
|
||||
<tr class="css1 impl-yes"><td>margin</td><td>MULTIPLE</td></tr>
|
||||
<tr class="css1 impl-yes"><td>margin-*</td><td>COMPOSITE(<length>,
|
||||
<percentage>, auto)</td></tr>
|
||||
@@ -191,18 +110,18 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
|
||||
|
||||
<tbody>
|
||||
<tr><th colspan="2">Table</th></tr>
|
||||
<tr><td>border-collapse</td><td>ENUM(collapse, seperate)</td></tr>
|
||||
<tr><td>caption-side</td><td>ENUM(top, bottom)</td></tr>
|
||||
<tr><td>empty-cells</td><td>ENUM(show, hide), No IE support, possible fix
|
||||
with &nbsp;?</td></tr>
|
||||
<tr><td>table-layout</td><td>ENUM(auto, fixed)</td></tr>
|
||||
<tr class="css1"><td>vertical-align</td><td>COMPOSITE(ENUM(baseline, sub,
|
||||
<tr class="impl-yes"><td>border-collapse</td><td>ENUM(collapse, seperate)</td></tr>
|
||||
<tr class="impl-yes"><td>caption-side</td><td>ENUM(top, bottom)</td></tr>
|
||||
<tr class="feature"><td>empty-cells</td><td>ENUM(show, hide), No IE support makes this useless,
|
||||
possible fix with &nbsp;? Unknown release milestone.</td></tr>
|
||||
<tr class="impl-yes"><td>table-layout</td><td>ENUM(auto, fixed)</td></tr>
|
||||
<tr class="impl-yes css1"><td>vertical-align</td><td>COMPOSITE(ENUM(baseline, sub,
|
||||
super, top, text-top, middle, bottom, text-bottom), <percentage>,
|
||||
<length>) Also applies to others with explicit height</td></tr>
|
||||
</tbody>
|
||||
|
||||
<tbody>
|
||||
<tr><th colspan="2">Absolute positioning</th></tr>
|
||||
<tr><th colspan="2">Absolute positioning, unknown release milestone</th></tr>
|
||||
<tr class="danger"><td>bottom</td><td rowspan="4">Dangerous, must be non-negative</td></tr>
|
||||
<tr class="danger"><td>left</td></tr>
|
||||
<tr class="danger"><td>right</td></tr>
|
||||
@@ -215,36 +134,36 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
|
||||
|
||||
<tbody>
|
||||
<tr><th colspan="2">Unknown</th></tr>
|
||||
<tr class="danger css1"><td>background-image</td><td>Dangerous</td></tr>
|
||||
<tr class="danger css1"><td>background-image</td><td>Dangerous, target milestone 1.2</td></tr>
|
||||
<tr class="css1"><td>background-attachment</td><td>ENUM(scroll, fixed),
|
||||
Depends on background-image</td></tr>
|
||||
<tr class="css1"><td>background-position</td><td>Depends on background-image</td></tr>
|
||||
<tr class="danger"><td>cursor</td><td>Dangerous but fluffy</td></tr>
|
||||
<tr class="danger impl-no"><td>cursor</td><td>Dangerous but fluffy</td></tr>
|
||||
<tr class="danger css1"><td>display</td><td>ENUM(...), Dangerous but interesting;
|
||||
will not implement list-item, run-in (Opera only) or table (no IE);
|
||||
inline-block has incomplete IE6 support and requires -moz-inline-box
|
||||
for Mozilla.</td></tr>
|
||||
<tr><td class="css1">height</td><td>Interesting, why use it?</td></tr>
|
||||
<tr class="danger css1"><td>list-style-image</td><td>Dangerous?</td></tr>
|
||||
for Mozilla. Unknown target milestone.</td></tr>
|
||||
<tr><td class="css1">height</td><td>Interesting, why use it? Unknown target milestone.</td></tr>
|
||||
<tr class="danger css1"><td>list-style-image</td><td>Dangerous? Target milestone 1.2</td></tr>
|
||||
<tr class="impl-no"><td>max-height</td><td rowspan="4">No IE 5/6</td></tr>
|
||||
<tr class="impl-no"><td>min-height</td></tr>
|
||||
<tr class="impl-no"><td>max-width</td></tr>
|
||||
<tr class="impl-no"><td>min-width</td></tr>
|
||||
<tr class="impl-no"><td>orphans</td><td>No IE support</td></tr>
|
||||
<tr class="impl-no"><td>widows</td><td>No IE support</td></tr>
|
||||
<tr><td>overflow</td><td>ENUM, IE 5/6 almost (remove visible if set)</td></tr>
|
||||
<tr><td>overflow</td><td>ENUM, IE 5/6 almost (remove visible if set). Unknown target milestone.</td></tr>
|
||||
<tr><td>page-break-after</td><td>ENUM(auto, always, avoid, left, right),
|
||||
IE 5.5/6 and Opera</td></tr>
|
||||
IE 5.5/6 and Opera. Unknown target milestone.</td></tr>
|
||||
<tr><td>page-break-before</td><td>ENUM(auto, always, avoid, left, right),
|
||||
Mostly supported</td></tr>
|
||||
<tr><td>page-break-inside</td><td>ENUM(avoid, auto), Opera only</td></tr>
|
||||
<tr class="impl-no"><td>quotes</td><td>May be dropped from CSS2</td></tr>
|
||||
Mostly supported. Unknown target milestone.</td></tr>
|
||||
<tr><td>page-break-inside</td><td>ENUM(avoid, auto), Opera only. Unknown target milestone.</td></tr>
|
||||
<tr class="impl-no"><td>quotes</td><td>May be dropped from CSS2, fairly useless for inline context</td></tr>
|
||||
<tr class="impl-no"><td>visibility</td><td>ENUM(visible, hidden, collapse),
|
||||
Dangerous</td></tr>
|
||||
<tr><td class="css1">white-space</td><td>ENUM(normal, pre, nowrap, pre-wrap,
|
||||
<tr class="css1 feature"><td>white-space</td><td>ENUM(normal, pre, nowrap, pre-wrap,
|
||||
pre-line), Spotty implementation:
|
||||
pre (no IE 5/6), nowrap (no IE 5),
|
||||
pre-wrap (only Opera), pre-line (no support). Fixable?</td></tr>
|
||||
pre-wrap (only Opera), pre-line (no support). Fixable? Unknown target milestone.</td></tr>
|
||||
</tbody>
|
||||
|
||||
<tbody class="impl-no">
|
||||
@@ -287,4 +206,87 @@ Mozilla on inside and needs -moz-outline, no IE support.</td></tr>
|
||||
|
||||
</table>
|
||||
|
||||
<h2>Interesting Attributes</h2>
|
||||
|
||||
<table cellspacing="0">
|
||||
|
||||
<thead>
|
||||
<tr><th>Attribute</th><th>Tags</th><th>Notes</th></tr>
|
||||
</thead>
|
||||
|
||||
<!--
|
||||
<tr><th></th></tr>
|
||||
<tbody>
|
||||
<tr><td>-</td><td>-</td><td>-</td></tr>
|
||||
</tbody>
|
||||
-->
|
||||
|
||||
<tbody>
|
||||
<tr><th colspan="3">CSS</th></tr>
|
||||
<tr class="impl-yes"><td>style</td><td>All</td><td>Not all properties may be implemented, parser is good though.</td></tr>
|
||||
</tbody>
|
||||
|
||||
<tbody>
|
||||
<tr><th colspan="3">Questionable</th></tr>
|
||||
<tr class="impl-no"><td>accesskey</td><td>A</td><td>May interfere with main interface</td></tr>
|
||||
<tr class="impl-no"><td>tabindex</td><td>A</td><td>May interfere with main interface</td></tr>
|
||||
<tr><td>target</td><td>A</td><td>Config enabled, only useful for frame layouts</td></tr>
|
||||
</tbody>
|
||||
|
||||
<tbody>
|
||||
<tr><th colspan="3">Miscellaneous</th></tr>
|
||||
<tr><td>datetime</td><td>DEL, INS</td><td>No visible effect, ISO format</td></tr>
|
||||
<tr><td>rel</td><td>A</td><td>Largely user-defined: nofollow, tag (see microformats)</td></tr>
|
||||
<tr><td>rev</td><td>A</td><td>Largely user-defined: vote-*</td></tr>
|
||||
<tr class="feature"><td>axis</td><td>TD, TH</td><td>W3C only: No browser implementation</td></tr>
|
||||
<tr class="feature"><td>char</td><td>COL, COLGROUP, TBODY, TD, TFOOT, TH, THEAD, TR</td><td>W3C only: No browser implementation</td></tr>
|
||||
<tr class="feature"><td>headers</td><td>TD, TH</td><td>W3C only: No browser implementation</td></tr>
|
||||
<tr class="feature"><td>scope</td><td>TD, TH</td><td>W3C only: No browser implementation</td></tr>
|
||||
</tbody>
|
||||
|
||||
<tbody class="impl-yes">
|
||||
<tr><th colspan="3">URI</th></tr>
|
||||
<tr><td rowspan="2">cite</td><td>BLOCKQUOTE, Q</td><td>For attribution</td></tr>
|
||||
<tr><td>DEL, INS</td><td>Link to explanation why it changed</td></tr>
|
||||
<tr><td>href</td><td>A</td><td>-</td></tr>
|
||||
<tr><td>longdesc</td><td>IMG</td><td>-</td></tr>
|
||||
<tr class="required"><td>src</td><td>IMG</td><td>Required</td></tr>
|
||||
</tbody>
|
||||
|
||||
<tbody>
|
||||
<tr><th colspan="3">Transform, target milestone 1.4</th></tr>
|
||||
<tr><td rowspan="5">align</td><td>CAPTION</td><td>Near-equiv style 'caption-side', drop left and right</td></tr>
|
||||
<tr><td>IMG</td><td rowspan="2">Margin-left and margin-right = auto or parent div</td></tr>
|
||||
<tr><td>TABLE</td></tr>
|
||||
<tr><td>HR</td><td>Equivalent style 'text-align' (IE tested)</td></tr>
|
||||
<tr class="impl-yes"><td>H1, H2, H3, H4, H5, H6, P</td><td>Equivalent style 'text-align'</td></tr>
|
||||
<tr class="required impl-yes"><td>alt</td><td>IMG</td><td>Required, insert image filename if src is present or default invalid image text</td></tr>
|
||||
<tr><td rowspan="3">bgcolor</td><td>TABLE</td><td>Equivalent style 'background-color' (IE tested)</td></tr>
|
||||
<tr><td>TR</td><td>Equivalent style 'background-color' (IE tested)</td></tr>
|
||||
<tr><td>TD, TH</td><td>Equivalent style 'background-color'</td></tr>
|
||||
<tr><td>border</td><td>IMG</td><td>Equivalent style 'border-width', only applies when link present</td></tr>
|
||||
<tr><td>clear</td><td>BR</td><td>Near-equiv style 'clear', transform 'all' into 'both'</td></tr>
|
||||
<tr class="impl-no"><td>compact</td><td>DL, OL, UL</td><td>Boolean, needs custom CSS class; rarely used anyway</td></tr>
|
||||
<tr class="required impl-yes"><td>dir</td><td>BDO</td><td>Required, insert ltr (or configuration value) if none</td></tr>
|
||||
<tr><td>height</td><td>TD, TH</td><td>Near-equiv style 'height', needs px suffix if original was in pixels</td></tr>
|
||||
<tr><td>hspace</td><td>IMG</td><td>Near-equiv styles 'margin-top' and 'margin-bottom', needs px suffix</td></tr>
|
||||
<tr class="impl-yes"><td>lang</td><td>*</td><td>Copy value to xml:lang</td></tr>
|
||||
<tr><td rowspan="2">name</td><td>IMG</td><td>Turn into ID</td></tr>
|
||||
<tr><td>A</td><td>Turn into ID? (not deprecated, though in which specs?)</td></tr>
|
||||
<tr><td>noshade</td><td>HR</td><td>Boolean, style 'border-style:solid;'</td></tr>
|
||||
<tr><td>nowrap</td><td>TD, TH</td><td>Boolean, style 'white-space:nowrap;' (not compat with IE5)</td></tr>
|
||||
<tr><td>size</td><td>HR</td><td>Near-equiv 'width', needs px suffix if original was pixels</td></tr>
|
||||
<tr class="required impl-yes"><td>src</td><td>IMG</td><td>Required, insert blank or default img if not set</td></tr>
|
||||
<tr><td>start</td><td>OL</td><td>Poorly supported 'counter-reset', transform may not be desirable</td></tr>
|
||||
<tr><td rowspan="3">type</td><td>LI</td><td rowspan="3">Equivalent style 'list-style-type', different allowed values though. (needs testing)</td></tr>
|
||||
<tr><td>OL</td></tr>
|
||||
<tr><td>UL</td></tr>
|
||||
<tr><td>value</td><td>LI</td><td>Poorly supported 'counter-reset', transform may not be desirable, see ol.start. Configurable.</td></tr>
|
||||
<tr><td>vspace</td><td>IMG</td><td>Near-equiv styles 'margin-left' and 'margin-right', needs px suffix, see hspace</td></tr>
|
||||
<tr><td rowspan="2">width</td><td>HR</td><td rowspan="2">Near-equiv style 'width', needs px suffix if original was pixels</td></tr>
|
||||
<tr><td>TD, TH</td></tr>
|
||||
</tbody>
|
||||
|
||||
</table>
|
||||
|
||||
</body></html>
|
@@ -6,34 +6,43 @@ through negligence of people. This class will do its job: no more, no less,
|
||||
and it's up to you to provide it the proper information and proper context
|
||||
to be effective. Things to remember:
|
||||
|
||||
1. UTF-8. Currently, the parser runs under the assumption that it is dealing
|
||||
1. Character Encoding: UTF-8.
|
||||
Currently, the parser runs under the assumption that it is dealing
|
||||
with UTF-8. Not ISO-8859-1 or Windows-1252, UTF-8. And definitely not "no
|
||||
character encoding explicitly stated" or UTF-7. If you're not using UTF-8 as
|
||||
your character encoding, you should switch. Now. (in future versions, however,
|
||||
I may make the character encoding configurable, but there's only so much I
|
||||
can do). Make sure any input is properly converted to UTF-8, or the parser
|
||||
will mangle it badly (though it won't be a security risk if you're outputting
|
||||
it as UTF-8 though).
|
||||
your character encoding, make sure you configure HTML Purifier or switch
|
||||
to UTF-8. Now. Also, make sure any input is properly converted to UTF-8, or
|
||||
the parser will mangle it badly (though it won't be a security risk if you're
|
||||
outputting it as UTF-8 though). Character encoding is, in general, a knotty
|
||||
issue, but do yourself a favor and learn about it:
|
||||
<http://www.joelonsoftware.com/articles/Unicode.html>
|
||||
|
||||
2. XHTML 1.0 Transitional. This is what the parser is outputting. For the most
|
||||
2. Doctype: XHTML 1.0 Transitional
|
||||
This is what the parser is outputting. For the most
|
||||
part, it's compatible with HTML 4.01, but XHTML enforces some very nice things
|
||||
that all web developers should use. Regardless, NO DOCTYPE is a NO. Quirks mode
|
||||
has waaaay too many quirks for a little parser to handle. We did not select
|
||||
strict in order to prevent ourselves from being too draconic on users, but
|
||||
this may be configurable in the future.
|
||||
this may be configurable in the future. Do you want standards compliance?
|
||||
The doctype is a good place to start.
|
||||
|
||||
3. IDs. They need to be unique, but without some knowledge of the
|
||||
rest of the document, it's difficult to know what's unique. Without setting
|
||||
%Attr.IDBlacklist to the proper
|
||||
3. IDs
|
||||
They need to be unique, but without some knowledge of the
|
||||
rest of the document, it's difficult to know what's unique. %Attr.IDBlacklist
|
||||
needs to be set: we may want to consider disallowing IDs by default to
|
||||
save lazy programmers.
|
||||
|
||||
4. [PROJECTED] Links. We're not going to try for spam protection (although
|
||||
4. [PROJECTED] Links
|
||||
We're not going to try for spam protection (although
|
||||
some hooks for such a module might be nice) but we may offer the ability to
|
||||
only accept relative URLs. Pick the one that's right for you.
|
||||
|
||||
5. CSS. While we can prevent the most flagrant cases from affecting your
|
||||
5. CSS
|
||||
While we can prevent the most flagrant cases from affecting your
|
||||
layout (such as absolutely positioned elements), no amount of code is going
|
||||
to protect your pages from being attacked by garish colors and plain old
|
||||
bad taste. A neat feature would be the ability to define acceptable colors
|
||||
in a document, but that's not likely to be implemented for a while. In the
|
||||
meantime, be sure to make sure that floated elements (permitted, since they
|
||||
can be quite useful) cna't mess up your layout.
|
||||
can be quite useful) can't mess up your layout. Once again, we may want to
|
||||
disable this by default to protect lazy developers.
|
||||
|
@@ -29,7 +29,8 @@ output is valid XHTML or send the HTML through a draconic XML parser (and yet
|
||||
still get the nesting wrong: SafeHtmlChecker.class.php does not prevent <a>
|
||||
tags from being nested within each other).
|
||||
|
||||
This document seeks to detail the inner workings of HTML Purifier. The first
|
||||
This document no longer is a detailed description of how HTMLPurifier works,
|
||||
as those descriptions have been moved to the appropriate code. The first
|
||||
draft was drawn up after two rough code sketches and the implementation of a
|
||||
forgiving lexer. You may also be interested in the unit tests located in the
|
||||
tests/ folder, which provide a living document on how exactly the filter deals
|
||||
@@ -52,4 +53,5 @@ In summary:
|
||||
HTML Purifier is best suited for documents that require a rich array of
|
||||
HTML tags. Things like blog comments are, in all likelihood, most appropriately
|
||||
written in an extremely restrictive set of markup that doesn't require
|
||||
all this functionality (or not written in HTML at all).
|
||||
all this functionality (or not written in HTML at all), although this may
|
||||
be changing in the future with the addition of levels of filtering.
|
||||
|
@@ -3,7 +3,7 @@
|
||||
/*!
|
||||
* @mainpage
|
||||
*
|
||||
* HTMLPurifier is a purification class that will take an arbitrary snippet of
|
||||
* HTMLPurifier is an HTML filter that will take an arbitrary snippet of
|
||||
* HTML and rigorously test, validate and filter it into a version that
|
||||
* is safe for output onto webpages. It achieves this by:
|
||||
*
|
||||
@@ -15,15 +15,40 @@
|
||||
* -# Validating attributes of the nodes; and
|
||||
* -# Generating HTML from the purified tokens.
|
||||
*
|
||||
* See /docs/spec.txt for more details.
|
||||
* However, most users will only need to interface with the HTMLPurifier
|
||||
* class, so this massive amount of infrastructure is usually concealed.
|
||||
* If you plan on working with the internals, be sure to include
|
||||
* HTMLPurifier_ConfigSchema and HTMLPurifier_Config.
|
||||
*/
|
||||
|
||||
require_once 'HTMLPurifier/ConfigDef.php';
|
||||
/*
|
||||
HTMLPurifier - Standards Compliant HTML Filtering
|
||||
Copyright (C) 2006 Edward Z. Yang
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free Software
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
// almost every class has an undocumented dependency to these, so make sure
|
||||
// they get included
|
||||
require_once 'HTMLPurifier/ConfigSchema.php';
|
||||
require_once 'HTMLPurifier/Config.php';
|
||||
|
||||
require_once 'HTMLPurifier/Lexer.php';
|
||||
require_once 'HTMLPurifier/HTMLDefinition.php';
|
||||
require_once 'HTMLPurifier/Generator.php';
|
||||
require_once 'HTMLPurifier/Strategy/Core.php';
|
||||
require_once 'HTMLPurifier/Encoder.php';
|
||||
|
||||
/**
|
||||
* Main library execution class.
|
||||
@@ -31,39 +56,56 @@ require_once 'HTMLPurifier/Strategy/Core.php';
|
||||
* Facade that performs calls to the HTMLPurifier_Lexer,
|
||||
* HTMLPurifier_Strategy and HTMLPurifier_Generator subsystems in order to
|
||||
* purify HTML.
|
||||
*
|
||||
* @todo We need an easier way to inject strategies, it'll probably end
|
||||
* up getting done through config though.
|
||||
*/
|
||||
class HTMLPurifier
|
||||
{
|
||||
|
||||
var $config;
|
||||
|
||||
var $lexer, $strategy, $generator;
|
||||
|
||||
/**
|
||||
* Initializes the purifier.
|
||||
* @param $config Configuration for all instances of the purifier
|
||||
* @param $config Optional HTMLPurifier_Config object for all instances of
|
||||
* the purifier, if omitted, a default configuration is
|
||||
* supplied (which can be overridden on a per-use basis).
|
||||
*/
|
||||
function HTMLPurifier($config = null) {
|
||||
|
||||
$this->config = $config ? $config : HTMLPurifier_Config::createDefault();
|
||||
|
||||
$this->lexer = HTMLPurifier_Lexer::create();
|
||||
$this->strategy = new HTMLPurifier_Strategy_Core();
|
||||
$this->generator = new HTMLPurifier_Generator();
|
||||
$this->encoder = new HTMLPurifier_Encoder();
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Purifies HTML.
|
||||
* Filters an HTML snippet/document to be XSS-free and standards-compliant.
|
||||
*
|
||||
* @param $html String of HTML to purify
|
||||
* @param $config HTMLPurifier_Config object for this specific round
|
||||
* @param $config HTMLPurifier_Config object for this operation, if omitted,
|
||||
* defaults to the config object specified during this
|
||||
* object's construction.
|
||||
* @return Purified HTML
|
||||
*/
|
||||
function purify($html, $config = null) {
|
||||
$config = $config ? $config : $this->config;
|
||||
$lexer = HTMLPurifier_Lexer::create();
|
||||
$strategy = new HTMLPurifier_Strategy_Core();
|
||||
$generator = new HTMLPurifier_Generator();
|
||||
return $generator->generateFromTokens(
|
||||
$strategy->execute(
|
||||
$lexer->tokenizeHTML($html, $config),
|
||||
$html = $this->encoder->convertToUTF8($html, $config);
|
||||
$html =
|
||||
$this->generator->generateFromTokens(
|
||||
$this->strategy->execute(
|
||||
$this->lexer->tokenizeHTML($html, $config),
|
||||
$config
|
||||
),
|
||||
$config
|
||||
),
|
||||
$config
|
||||
);
|
||||
);
|
||||
$html = $this->encoder->convertFromUTF8($html, $config);
|
||||
return $html;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -3,12 +3,23 @@
|
||||
/**
|
||||
* Internal data-structure used in attribute validation to accumulate state.
|
||||
*
|
||||
* All it is is a data-structure that holds objects that accumulate state, like
|
||||
* HTMLPurifier_IDAccumulator.
|
||||
* This is a data-structure that holds objects that accumulate state, like
|
||||
* HTMLPurifier_IDAccumulator. It's better than using globals!
|
||||
*
|
||||
* @note Many functions that accept this object have it as a mandatory
|
||||
* parameter, even when there is no use for it. Though this is
|
||||
* for the same reasons as why HTMLPurifier_Config is a mandatory
|
||||
* parameter, it is also because you cannot assign a default value
|
||||
* to a parameter passed by reference (passing by reference is essential
|
||||
* for context to work in PHP 4).
|
||||
*/
|
||||
|
||||
class HTMLPurifier_AttrContext
|
||||
{
|
||||
/**
|
||||
* Contains an HTMLPurifier_IDAccumulator, which keeps track of used IDs.
|
||||
* @public
|
||||
*/
|
||||
var $id_accumulator;
|
||||
}
|
||||
|
||||
|
@@ -2,15 +2,56 @@
|
||||
|
||||
require_once 'HTMLPurifier/AttrContext.php';
|
||||
|
||||
// AttrDef = Attribute Definition
|
||||
/**
|
||||
* Base class for all validating attribute definitions.
|
||||
*
|
||||
* This family of classes forms the core for not only HTML attribute validation,
|
||||
* but also any sort of string that needs to be validated or cleaned (which
|
||||
* means CSS properties and composite definitions are defined here too).
|
||||
* Besides defining (through code) what precisely makes the string valid,
|
||||
* subclasses are also responsible for cleaning the code if possible.
|
||||
*/
|
||||
|
||||
class HTMLPurifier_AttrDef
|
||||
{
|
||||
function HTMLPurifier_AttrDef() {}
|
||||
|
||||
/**
|
||||
* Tells us whether or not an HTML attribute is minimized. Only the
|
||||
* boolean attribute vapourware would use this.
|
||||
*/
|
||||
var $minimized = false;
|
||||
|
||||
/**
|
||||
* Abstract function defined for functions that validate and clean strings.
|
||||
*
|
||||
* This function forms the basis for all the subclasses: they must
|
||||
* define this method.
|
||||
*
|
||||
* @public
|
||||
* @param $string String to be validated and cleaned.
|
||||
* @param $config Mandatory HTMLPurifier_Config object.
|
||||
* @param $context Mandatory HTMLPurifier_AttrContext object.
|
||||
*/
|
||||
function validate($string, $config, &$context) {
|
||||
trigger_error('Cannot call abstract function', E_USER_ERROR);
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience method that parses a string as if it were CDATA.
|
||||
*
|
||||
* This method process a string in the manner specified at
|
||||
* <http://www.w3.org/TR/html4/types.html#h-6.2> by removing
|
||||
* leading and trailing whitespace, ignoring line feeds, and replacing
|
||||
* carriage returns and tabs with spaces. While most useful for HTML
|
||||
* attributes specified as CDATA, it can also be applied to most CSS
|
||||
* values.
|
||||
*
|
||||
* @note This method is not entirely standards compliant, as trim() removes
|
||||
* more types of whitespace than specified in the spec. In practice,
|
||||
* this is rarely a problem.
|
||||
*
|
||||
* @public
|
||||
*/
|
||||
function parseCDATA($string) {
|
||||
$string = trim($string);
|
||||
$string = str_replace("\n", '', $string);
|
||||
|
45
library/HTMLPurifier/AttrDef/Border.php
Normal file
45
library/HTMLPurifier/AttrDef/Border.php
Normal file
@@ -0,0 +1,45 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
/**
|
||||
* Validates the border property as defined by CSS.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Border extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Local copy of properties this property is shorthand for.
|
||||
*/
|
||||
var $info = array();
|
||||
|
||||
function HTMLPurifier_AttrDef_Border($config) {
|
||||
$def = $config->getCSSDefinition();
|
||||
$this->info['border-width'] = $def->info['border-width'];
|
||||
$this->info['border-style'] = $def->info['border-style'];
|
||||
$this->info['border-top-color'] = $def->info['border-top-color'];
|
||||
}
|
||||
|
||||
function validate($string, $config, &$context) {
|
||||
$string = $this->parseCDATA($string);
|
||||
// we specifically will not support rgb() syntax with spaces
|
||||
$bits = explode(' ', $string);
|
||||
$done = array(); // segments we've finished
|
||||
$ret = ''; // return value
|
||||
foreach ($bits as $bit) {
|
||||
foreach ($this->info as $propname => $validator) {
|
||||
if (isset($done[$propname])) continue;
|
||||
$r = $validator->validate($bit, $config, $context);
|
||||
if ($r !== false) {
|
||||
$ret .= $r . ' ';
|
||||
$done[$propname] = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return rtrim($ret);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -3,6 +3,12 @@
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
require_once 'HTMLPurifier/CSSDefinition.php';
|
||||
|
||||
/**
|
||||
* Validates the HTML attribute style, otherwise known as CSS.
|
||||
* @note We don't implement the whole CSS specification, so it might be
|
||||
* difficult to reuse this component in the context of validating
|
||||
* actual stylesheet declarations.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
@@ -10,7 +16,7 @@ class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
|
||||
|
||||
$css = $this->parseCDATA($css);
|
||||
|
||||
$definition = HTMLPurifier_CSSDefinition::instance();
|
||||
$definition = $config->getCSSDefinition();
|
||||
|
||||
// we're going to break the spec and explode by semicolons.
|
||||
// This is because semicolon rarely appears in escaped form
|
||||
@@ -22,10 +28,12 @@ class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
|
||||
if (!$declaration) continue;
|
||||
if (!strpos($declaration, ':')) continue;
|
||||
list($property, $value) = explode(':', $declaration, 2);
|
||||
$property = trim($property);
|
||||
$value = trim($value);
|
||||
if (!isset($definition->info[$property])) continue;
|
||||
// inefficient call, since the validator will do this again
|
||||
// inherit works for everything
|
||||
if (strtolower(trim($value)) !== 'inherit') {
|
||||
// inherit works for everything (but only on the base property)
|
||||
$result = $definition->info[$property]->validate(
|
||||
$value, $config, $context );
|
||||
} else {
|
||||
|
@@ -3,13 +3,29 @@
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
require_once 'HTMLPurifier/AttrDef/Number.php';
|
||||
|
||||
/**
|
||||
* Represents a Length as defined by CSS.
|
||||
* @warning Be sure not to confuse this with HTMLPurifier_AttrDef_Length!
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_CSSLength extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Valid unit lookup table.
|
||||
* @warning The code assumes all units are two characters long. Be careful
|
||||
* if we have to change this behavior!
|
||||
*/
|
||||
var $units = array('em' => true, 'ex' => true, 'px' => true, 'in' => true,
|
||||
'cm' => true, 'mm' => true, 'pt' => true, 'pc' => true);
|
||||
/**
|
||||
* Instance of HTMLPurifier_AttrDef_Number to defer number validation to
|
||||
*/
|
||||
var $number_def;
|
||||
|
||||
/**
|
||||
* @param $non_negative Bool indication whether or not negative values are
|
||||
* allowed.
|
||||
*/
|
||||
function HTMLPurifier_AttrDef_CSSLength($non_negative = false) {
|
||||
$this->number_def = new HTMLPurifier_AttrDef_Number($non_negative);
|
||||
}
|
||||
|
@@ -3,6 +3,9 @@
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
require_once 'HTMLPurifier/Config.php';
|
||||
|
||||
/**
|
||||
* Validates the contents of the global HTML attribute class.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Class extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
|
@@ -2,14 +2,44 @@
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
class HTMLPurifier_AttrDef_Color
|
||||
/**
|
||||
* Validates Color as defined by CSS.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Color extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Color keyword lookup table.
|
||||
* @todo Extend it to include all usually allowed colors.
|
||||
*/
|
||||
var $colors = array(
|
||||
'maroon' => '#800000',
|
||||
'red' => '#F00',
|
||||
'orange' => '#FFA500',
|
||||
'yellow' => '#FF0',
|
||||
'olive' => '#808000',
|
||||
'purple' => '#800080',
|
||||
'fuchsia' => '#F0F',
|
||||
'white' => '#FFF',
|
||||
'lime' => '#0F0',
|
||||
'green' => '#008000',
|
||||
'navy' => '#000080',
|
||||
'blue' => '#00F',
|
||||
'aqua' => '#0FF',
|
||||
'teal' => '#008080',
|
||||
'black' => '#000',
|
||||
'silver' => '#C0C0C0',
|
||||
'gray' => '#808080'
|
||||
);
|
||||
|
||||
function validate($color, $config, &$context) {
|
||||
|
||||
$color = trim($color);
|
||||
if (!$color) return false;
|
||||
|
||||
$lower = strtolower($color);
|
||||
if (isset($this->colors[$lower])) return $this->colors[$lower];
|
||||
|
||||
if ($color[0] === '#') {
|
||||
// hexadecimal handling
|
||||
$hex = substr($color, 1);
|
||||
|
@@ -1,10 +1,26 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Allows multiple validators to attempt to validate attribute.
|
||||
*
|
||||
* Composite is just what it sounds like: a composite of many validators.
|
||||
* This means that multiple HTMLPurifier_AttrDef objects will have a whack
|
||||
* at the string. If one of them passes, that's what is returned. This is
|
||||
* especially useful for CSS values, which often are a choice between
|
||||
* an enumerated set of predefined values or a flexible data type.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Composite extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* List of HTMLPurifier_AttrDef objects that may process strings
|
||||
* @protected
|
||||
*/
|
||||
var $defs;
|
||||
|
||||
/**
|
||||
* @param $defs List of HTMLPurifier_AttrDef objects
|
||||
*/
|
||||
function HTMLPurifier_AttrDef_Composite($defs) {
|
||||
$this->defs = $defs;
|
||||
}
|
||||
|
@@ -3,12 +3,27 @@
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
// Enum = Enumerated
|
||||
/**
|
||||
* Validates a keyword against a list of valid values.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Lookup table of valid values.
|
||||
*/
|
||||
var $valid_values = array();
|
||||
|
||||
/**
|
||||
* Bool indicating whether or not enumeration is case sensitive.
|
||||
* @note In general this is always case insensitive.
|
||||
*/
|
||||
var $case_sensitive = false; // values according to W3C spec
|
||||
|
||||
/**
|
||||
* @param $valid_values List of valid values
|
||||
* @param $case_sensitive Bool indicating whether or not case sensitive
|
||||
*/
|
||||
function HTMLPurifier_AttrDef_Enum(
|
||||
$valid_values = array(), $case_sensitive = false) {
|
||||
|
||||
|
154
library/HTMLPurifier/AttrDef/Font.php
Normal file
154
library/HTMLPurifier/AttrDef/Font.php
Normal file
@@ -0,0 +1,154 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
/**
|
||||
* Validates shorthand CSS property font.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Font extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Local copy of component validators.
|
||||
*
|
||||
* @note If we moved specific CSS property definitions to their own
|
||||
* classes instead of having them be assembled at run time by
|
||||
* CSSDefinition, this wouldn't be necessary. We'd instantiate
|
||||
* our own copies.
|
||||
*/
|
||||
var $info = array();
|
||||
|
||||
/**
|
||||
* System font keywords.
|
||||
*/
|
||||
var $system_fonts = array(
|
||||
'caption' => true,
|
||||
'icon' => true,
|
||||
'menu' => true,
|
||||
'message-box' => true,
|
||||
'small-caption' => true,
|
||||
'status-bar' => true
|
||||
);
|
||||
|
||||
function HTMLPurifier_AttrDef_Font($config) {
|
||||
$def = $config->getCSSDefinition();
|
||||
$this->info['font-style'] = $def->info['font-style'];
|
||||
$this->info['font-variant'] = $def->info['font-variant'];
|
||||
$this->info['font-weight'] = $def->info['font-weight'];
|
||||
$this->info['font-size'] = $def->info['font-size'];
|
||||
$this->info['line-height'] = $def->info['line-height'];
|
||||
$this->info['font-family'] = $def->info['font-family'];
|
||||
}
|
||||
|
||||
function validate($string, $config, &$context) {
|
||||
|
||||
// regular pre-processing
|
||||
$string = $this->parseCDATA($string);
|
||||
if ($string === '') return false;
|
||||
|
||||
// check if it's one of the keywords
|
||||
$lowercase_string = strtolower($string);
|
||||
if (isset($this->system_fonts[$lowercase_string])) {
|
||||
return $lowercase_string;
|
||||
}
|
||||
|
||||
$bits = explode(' ', $string); // bits to process
|
||||
$stage = 0; // this indicates what we're looking for
|
||||
$caught = array(); // which stage 0 properties have we caught?
|
||||
$stage_1 = array('font-style', 'font-variant', 'font-weight');
|
||||
$final = ''; // output
|
||||
|
||||
for ($i = 0, $size = count($bits); $i < $size; $i++) {
|
||||
if ($bits[$i] === '') continue;
|
||||
switch ($stage) {
|
||||
|
||||
// attempting to catch font-style, font-variant or font-weight
|
||||
case 0:
|
||||
foreach ($stage_1 as $validator_name) {
|
||||
if (isset($caught[$validator_name])) continue;
|
||||
$r = $this->info[$validator_name]->validate(
|
||||
$bits[$i], $config, $context);
|
||||
if ($r !== false) {
|
||||
$final .= $r . ' ';
|
||||
$caught[$validator_name] = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
// all three caught, continue on
|
||||
if (count($caught) >= 3) $stage = 1;
|
||||
if ($r !== false) break;
|
||||
|
||||
// attempting to catch font-size and perhaps line-height
|
||||
case 1:
|
||||
$found_slash = false;
|
||||
if (strpos($bits[$i], '/') !== false) {
|
||||
list($font_size, $line_height) =
|
||||
explode('/', $bits[$i]);
|
||||
if ($line_height === '') {
|
||||
// ooh, there's a space after the slash!
|
||||
$line_height = false;
|
||||
$found_slash = true;
|
||||
}
|
||||
} else {
|
||||
$font_size = $bits[$i];
|
||||
$line_height = false;
|
||||
}
|
||||
$r = $this->info['font-size']->validate(
|
||||
$font_size, $config, $context);
|
||||
if ($r !== false) {
|
||||
$final .= $r;
|
||||
// attempt to catch line-height
|
||||
if ($line_height === false) {
|
||||
// we need to scroll forward
|
||||
for ($j = $i + 1; $j < $size; $j++) {
|
||||
if ($bits[$j] === '') continue;
|
||||
if ($bits[$j] === '/') {
|
||||
if ($found_slash) {
|
||||
return false;
|
||||
} else {
|
||||
$found_slash = true;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
$line_height = $bits[$j];
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// slash already found
|
||||
$found_slash = true;
|
||||
$j = $i;
|
||||
}
|
||||
if ($found_slash) {
|
||||
$i = $j;
|
||||
$r = $this->info['line-height']->validate(
|
||||
$line_height, $config, $context);
|
||||
if ($r !== false) {
|
||||
$final .= '/' . $r;
|
||||
}
|
||||
}
|
||||
$final .= ' ';
|
||||
$stage = 2;
|
||||
break;
|
||||
}
|
||||
return false;
|
||||
|
||||
// attempting to catch font-family
|
||||
case 2:
|
||||
$font_family =
|
||||
implode(' ', array_slice($bits, $i, $size - $i));
|
||||
$r = $this->info['font-family']->validate(
|
||||
$font_family, $config, $context);
|
||||
if ($r !== false) {
|
||||
$final .= $r . ' ';
|
||||
// processing completed successfully
|
||||
return rtrim($final);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -4,9 +4,16 @@ require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
// whitelisting allowed fonts would be nice
|
||||
|
||||
/**
|
||||
* Validates a font family list according to CSS spec
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_FontFamily extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Generic font family keywords.
|
||||
* @protected
|
||||
*/
|
||||
var $generic_names = array(
|
||||
'serif' => true,
|
||||
'sans-serif' => true,
|
||||
|
@@ -4,10 +4,21 @@ require_once 'HTMLPurifier/AttrDef.php';
|
||||
require_once 'HTMLPurifier/AttrDef/IPv4.php';
|
||||
require_once 'HTMLPurifier/AttrDef/IPv6.php';
|
||||
|
||||
/**
|
||||
* Validates a host according to the IPv4, IPv6 and DNS specifications.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Host extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
var $ipv4, $ipv6;
|
||||
/**
|
||||
* Instance of HTMLPurifier_AttrDef_IPv4 sub-validator
|
||||
*/
|
||||
var $ipv4;
|
||||
|
||||
/**
|
||||
* Instance of HTMLPurifier_AttrDef_IPv6 sub-validator
|
||||
*/
|
||||
var $ipv6;
|
||||
|
||||
function HTMLPurifier_AttrDef_Host() {
|
||||
$this->ipv4 = new HTMLPurifier_AttrDef_IPv4();
|
||||
|
@@ -2,12 +2,15 @@
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
require_once 'HTMLPurifier/IDAccumulator.php';
|
||||
|
||||
// NOTE QUIRKY BEHAVIOR: even though this is the id processor, it
|
||||
// will ignore directive Attr:IDBlacklist, since it will only
|
||||
// go according to the ID accumulator. Since the accumulator is
|
||||
// automatically generated, it will have already absorbed the
|
||||
// blacklist. If you're hacking around, make sure you use load()!
|
||||
|
||||
/**
|
||||
* Validates the HTML attribute ID.
|
||||
* @warning Even though this is the id processor, it
|
||||
* will ignore the directive Attr:IDBlacklist, since it will only
|
||||
* go according to the ID accumulator. Since the accumulator is
|
||||
* automatically generated, it will have already absorbed the
|
||||
* blacklist. If you're hacking around, make sure you use load()!
|
||||
*/
|
||||
|
||||
class HTMLPurifier_AttrDef_ID extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
@@ -2,12 +2,17 @@
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
// spliced from Feyd's IPv6 function (pd)
|
||||
|
||||
/**
|
||||
* Validates an IPv4 address
|
||||
* @author Feyd @ forums.devnetwork.net (public domain)
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_IPv4 extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
// regex is public so that IPv6 can reuse it
|
||||
/**
|
||||
* IPv4 regex, protected so that IPv6 can reuse it
|
||||
* @protected
|
||||
*/
|
||||
var $ip4;
|
||||
|
||||
function HTMLPurifier_AttrDef_IPv4() {
|
||||
|
@@ -2,11 +2,12 @@
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef/IPv4.php';
|
||||
|
||||
// IPv6 by Feyd, source is in public domain
|
||||
|
||||
// note that this expects the brackets to be removed from IPv6 addresses
|
||||
// extends from the IPv4 impl. so we can borrow its regex
|
||||
|
||||
/**
|
||||
* Validates an IPv6 address.
|
||||
* @author Feyd @ forums.devnetwork.net (public domain)
|
||||
* @note This function requires brackets to have been removed from address
|
||||
* in URI.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_IPv6 extends HTMLPurifier_AttrDef_IPv4
|
||||
{
|
||||
|
||||
|
@@ -2,16 +2,42 @@
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
// appears to be a dud class: no currently allowed CSS uses this type
|
||||
// Uses this: widows, orphans, z-index, counter-increment, counter-reset
|
||||
|
||||
/**
|
||||
* Validates an integer.
|
||||
* @note While this class was modeled off the CSS definition, no currently
|
||||
* allowed CSS uses this type. The properties that do are: widows,
|
||||
* orphans, z-index, counter-increment, counter-reset. Some of the
|
||||
* HTML attributes, however, find use for a non-negative version of this.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Integer extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
var $non_negative = false;
|
||||
/**
|
||||
* Bool indicating whether or not negative values are allowed
|
||||
*/
|
||||
var $negative = true;
|
||||
|
||||
function HTMLPurifier_AttrDef_Integer($non_negative = false) {
|
||||
$this->non_negative = $non_negative;
|
||||
/**
|
||||
* Bool indicating whether or not zero is allowed
|
||||
*/
|
||||
var $zero = true;
|
||||
|
||||
/**
|
||||
* Bool indicating whether or not positive values are allowed
|
||||
*/
|
||||
var $positive = true;
|
||||
|
||||
/**
|
||||
* @param $negative Bool indicating whether or not negative values are allowed
|
||||
* @param $zero Bool indicating whether or not zero is allowed
|
||||
* @param $positive Bool indicating whether or not positive values are allowed
|
||||
*/
|
||||
function HTMLPurifier_AttrDef_Integer(
|
||||
$negative = true, $zero = true, $positive = true
|
||||
) {
|
||||
$this->negative = $negative;
|
||||
$this->zero = $zero;
|
||||
$this->positive = $positive;
|
||||
}
|
||||
|
||||
function validate($integer, $config, &$context) {
|
||||
@@ -19,15 +45,27 @@ class HTMLPurifier_AttrDef_Integer extends HTMLPurifier_AttrDef
|
||||
$integer = $this->parseCDATA($integer);
|
||||
if ($integer === '') return false;
|
||||
|
||||
if ( !$this->non_negative && $integer[0] === '-' ) {
|
||||
// we could possibly simply typecast it to integer, but there are
|
||||
// certain fringe cases that must not return an integer.
|
||||
|
||||
// clip leading sign
|
||||
if ( $this->negative && $integer[0] === '-' ) {
|
||||
$digits = substr($integer, 1);
|
||||
} elseif( $integer[0] === '+' ) {
|
||||
$digits = $integer = substr($integer, 1);
|
||||
if ($digits === '0') $integer = '0'; // rm minus sign for zero
|
||||
} elseif( $this->positive && $integer[0] === '+' ) {
|
||||
$digits = $integer = substr($integer, 1); // rm unnecessary plus
|
||||
} else {
|
||||
$digits = $integer;
|
||||
}
|
||||
|
||||
// test if it's numeric
|
||||
if (!ctype_digit($digits)) return false;
|
||||
|
||||
// perform scope tests
|
||||
if (!$this->zero && $integer == 0) return false;
|
||||
if (!$this->positive && $integer > 0) return false;
|
||||
if (!$this->negative && $integer < 0) return false;
|
||||
|
||||
return $integer;
|
||||
|
||||
}
|
||||
|
@@ -2,8 +2,10 @@
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
// built according to RFC 3066, which obsoleted RFC 1766
|
||||
|
||||
/**
|
||||
* Validates the HTML attribute lang, effectively a language code.
|
||||
* @note Built according to RFC 3066, which obsoleted RFC 1766
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Lang extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
|
78
library/HTMLPurifier/AttrDef/ListStyle.php
Normal file
78
library/HTMLPurifier/AttrDef/ListStyle.php
Normal file
@@ -0,0 +1,78 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
/**
|
||||
* Validates shorthand CSS property list-style.
|
||||
* @note This currently does not support list-style-image, as that functionality
|
||||
* is not implemented yet elsewhere.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_ListStyle extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Local copy of component validators.
|
||||
* @note See HTMLPurifier_AttrDef_Font::$info for a similar impl.
|
||||
*/
|
||||
var $info;
|
||||
|
||||
function HTMLPurifier_AttrDef_ListStyle($config) {
|
||||
$def = $config->getCSSDefinition();
|
||||
$this->info['list-style-type'] = $def->info['list-style-type'];
|
||||
$this->info['list-style-position'] = $def->info['list-style-position'];
|
||||
}
|
||||
|
||||
function validate($string, $config, &$context) {
|
||||
|
||||
// regular pre-processing
|
||||
$string = $this->parseCDATA($string);
|
||||
if ($string === '') return false;
|
||||
|
||||
$bits = explode(' ', strtolower($string)); // bits to process
|
||||
|
||||
$caught_type = false;
|
||||
$caught_position = false;
|
||||
$caught_none = false; // as in keyword none, which is in all of them
|
||||
|
||||
$ret = '';
|
||||
|
||||
foreach ($bits as $bit) {
|
||||
if ($caught_none && ($caught_type || $caught_position)) break;
|
||||
if ($caught_type && $caught_position) break;
|
||||
|
||||
if ($bit === '') continue;
|
||||
|
||||
if ($bit === 'none') {
|
||||
if ($caught_none) continue;
|
||||
$caught_none = true;
|
||||
$ret .= 'none ';
|
||||
continue;
|
||||
}
|
||||
|
||||
// if we add anymore, roll it into a loop
|
||||
|
||||
$r = $this->info['list-style-type']->validate($bit, $config, $context);
|
||||
if ($r !== false) {
|
||||
if ($caught_type) continue;
|
||||
$caught_type = true;
|
||||
$ret .= $r . ' ';
|
||||
continue;
|
||||
}
|
||||
|
||||
$r = $this->info['list-style-position']->validate($bit, $config, $context);
|
||||
if ($r !== false) {
|
||||
if ($caught_position) continue;
|
||||
$caught_position = true;
|
||||
$ret .= $r . ' ';
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
$ret = rtrim($ret);
|
||||
return $ret ? $ret : false;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -3,6 +3,12 @@
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
require_once 'HTMLPurifier/AttrDef/Length.php';
|
||||
|
||||
/**
|
||||
* Validates a MultiLength as defined by the HTML spec.
|
||||
*
|
||||
* A multilength is either a integer (pixel count), a percentage, or
|
||||
* a relative number.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_MultiLength extends HTMLPurifier_AttrDef_Length
|
||||
{
|
||||
|
||||
|
@@ -2,12 +2,34 @@
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
/**
|
||||
* Framework class for strings that involve multiple values.
|
||||
*
|
||||
* Certain CSS properties such as border-width and margin allow multiple
|
||||
* lengths to be specified. This class can take a vanilla border-width
|
||||
* definition and multiply it, usually into a max of four.
|
||||
*
|
||||
* @note Even though the CSS specification isn't clear about it, inherit
|
||||
* can only be used alone: it will never manifest as part of a multi
|
||||
* shorthand declaration. Thus, this class does not allow inherit.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Multiple extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Instance of component definition to defer validation to.
|
||||
*/
|
||||
var $single;
|
||||
|
||||
/**
|
||||
* Max number of values allowed.
|
||||
*/
|
||||
var $max;
|
||||
|
||||
/**
|
||||
* @param $single HTMLPurifier_AttrDef to multiply
|
||||
* @param $max Max number of values allowed (usually four)
|
||||
*/
|
||||
function HTMLPurifier_AttrDef_Multiple($single, $max = 4) {
|
||||
$this->single = $single;
|
||||
$this->max = $max;
|
||||
|
@@ -1,10 +1,19 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Validates a number as defined by the CSS spec.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Number extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Bool indicating whether or not only positive values allowed.
|
||||
*/
|
||||
var $non_negative = false;
|
||||
|
||||
/**
|
||||
* @param $non_negative Bool indicating whether negatives are forbidden
|
||||
*/
|
||||
function HTMLPurifier_AttrDef_Number($non_negative = false) {
|
||||
$this->non_negative = $non_negative;
|
||||
}
|
||||
|
@@ -1,23 +0,0 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
// for col and row spans, essentially, a positive integer
|
||||
class HTMLPurifier_AttrDef_NumberSpan extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
function validate($string, $config, &$context) {
|
||||
|
||||
$string = trim($string);
|
||||
if ($string === '') return false;
|
||||
if ($string === '1') return false; // this is the default value
|
||||
if (!is_numeric($string)) return false;
|
||||
$int = (int) $string;
|
||||
if ($int <= 0) return false;
|
||||
return (string) $int;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -3,11 +3,21 @@
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
require_once 'HTMLPurifier/AttrDef/Number.php';
|
||||
|
||||
/**
|
||||
* Validates a Percentage as defined by the HTML spec.
|
||||
* @note This also allows integer pixel values.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Percentage extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Instance of HTMLPurifier_AttrDef_Number to defer pixel validation
|
||||
*/
|
||||
var $number_def;
|
||||
|
||||
/**
|
||||
* @param Bool indicating whether to forbid negative values
|
||||
*/
|
||||
function HTMLPurifier_AttrDef_Percentage($non_negative = false) {
|
||||
$this->number_def = new HTMLPurifier_AttrDef_Number($non_negative);
|
||||
}
|
||||
|
@@ -2,6 +2,9 @@
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
/**
|
||||
* Validates an integer representation of pixels according to the HTML spec.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Pixels extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
|
@@ -2,6 +2,9 @@
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
/**
|
||||
* Validates arbitrary text according to the HTML spec.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Text extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
|
@@ -2,9 +2,18 @@
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
/**
|
||||
* Validates the value for the CSS property text-decoration
|
||||
* @note This class could be generalized into a version that acts sort of
|
||||
* like Enum except you can compound the allowed values.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_TextDecoration extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Lookup table of allowed values.
|
||||
* @protected
|
||||
*/
|
||||
var $allowed_values = array(
|
||||
'line-through' => true,
|
||||
'overline' => true,
|
||||
|
@@ -5,12 +5,16 @@ require_once 'HTMLPurifier/URIScheme.php';
|
||||
require_once 'HTMLPurifier/URISchemeRegistry.php';
|
||||
require_once 'HTMLPurifier/AttrDef/Host.php';
|
||||
|
||||
HTMLPurifier_ConfigDef::define(
|
||||
'URI', 'DefaultScheme', 'http',
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'URI', 'DefaultScheme', 'http', 'string',
|
||||
'Defines through what scheme the output will be served, in order to '.
|
||||
'select the proper object validator when no scheme information is present.'
|
||||
);
|
||||
|
||||
/**
|
||||
* Validates a URI as defined by RFC 3986.
|
||||
* @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
@@ -32,13 +36,13 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
|
||||
// for HTTP and thus won't work for our generic URI parsing
|
||||
|
||||
// according to the RFC... (but this cuts corners, i.e. non-validating)
|
||||
$r_URI = '!^'.
|
||||
'(([^:/?#<>]+):)?'. // 2. Scheme
|
||||
'(//([^/?#<>]*))?'. // 4. Authority
|
||||
'([^?#<>]*)'. // 5. Path
|
||||
'(\?([^#<>]*))?'. // 7. Query
|
||||
'(#([^<>]*))?'. // 8. Fragment
|
||||
'$!';
|
||||
$r_URI = '!'.
|
||||
'(([^:/?#<>\'"]+):)?'. // 2. Scheme
|
||||
'(//([^/?#<>\'"]*))?'. // 4. Authority
|
||||
'([^?#<>\'"]*)'. // 5. Path
|
||||
'(\?([^#<>\'"]*))?'. // 7. Query
|
||||
'(#([^<>\'"]*))?'. // 8. Fragment
|
||||
'!';
|
||||
|
||||
$matches = array();
|
||||
$result = preg_match($r_URI, $uri, $matches);
|
||||
|
@@ -1,12 +1,31 @@
|
||||
<?php
|
||||
|
||||
// AttrTransform = Attribute Transformation, when handling one attribute
|
||||
// isn't enough
|
||||
/**
|
||||
* Processes an entire attribute array for corrections needing multiple values.
|
||||
*
|
||||
* Occasionally, a certain attribute will need to be removed and popped onto
|
||||
* another value. Instead of creating a complex return syntax for
|
||||
* HTMLPurifier_AttrDef, we just pass the whole attribute array to a
|
||||
* specialized object and have that do the special work. That is the
|
||||
* family of HTMLPurifier_AttrTransform.
|
||||
*
|
||||
* An attribute transformation can be assigned to run before or after
|
||||
* HTMLPurifier_AttrDef validation. See HTMLPurifier_HTMLDefinition for
|
||||
* more details.
|
||||
*/
|
||||
|
||||
class HTMLPurifier_AttrTransform
|
||||
{
|
||||
function HTMLPurifier_AttrTransform() {}
|
||||
|
||||
function transform($token, $config = null) {
|
||||
/**
|
||||
* Abstract: makes changes to the attributes dependent on multiple values.
|
||||
*
|
||||
* @param $attr Assoc array of attributes, usually from
|
||||
* HTMLPurifier_Token_Tag::$attributes
|
||||
* @param $config Mandatory HTMLPurifier_Config object.
|
||||
* @returns Processed attribute array.
|
||||
*/
|
||||
function transform($attr, $config) {
|
||||
trigger_error('Cannot call abstract function', E_USER_ERROR);
|
||||
}
|
||||
}
|
||||
|
@@ -4,20 +4,26 @@ require_once 'HTMLPurifier/AttrTransform.php';
|
||||
|
||||
// this MUST be placed in post, as it assumes that any value in dir is valid
|
||||
|
||||
HTMLPurifier_ConfigDef::define(
|
||||
'Attr', 'DefaultTextDir', 'ltr',
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'Attr', 'DefaultTextDir', 'ltr', 'string',
|
||||
'Defines the default text direction (ltr or rtl) of the document '.
|
||||
'being parsed. This generally is the same as the value of the dir '.
|
||||
'attribute in HTML, or ltr if that is not specified.'
|
||||
);
|
||||
HTMLPurifier_ConfigSchema::defineAllowedValues(
|
||||
'Attr', 'DefaultTextDir', array( 'ltr', 'rtl' )
|
||||
);
|
||||
|
||||
/**
|
||||
* Post-trasnform that ensures that bdo tags have the dir attribute set.
|
||||
*/
|
||||
class HTMLPurifier_AttrTransform_BdoDir extends HTMLPurifier_AttrTransform
|
||||
{
|
||||
|
||||
function transform($attributes, $config) {
|
||||
if (isset($attributes['dir'])) return $attributes;
|
||||
$attributes['dir'] = $config->get('Attr', 'DefaultTextDir');
|
||||
return $attributes;
|
||||
function transform($attr, $config) {
|
||||
if (isset($attr['dir'])) return $attr;
|
||||
$attr['dir'] = $config->get('Attr', 'DefaultTextDir');
|
||||
return $attr;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -4,41 +4,44 @@ require_once 'HTMLPurifier/AttrTransform.php';
|
||||
|
||||
// must be called POST validation
|
||||
|
||||
HTMLPurifier_ConfigDef::define(
|
||||
'Attr', 'DefaultInvalidImage', '',
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'Attr', 'DefaultInvalidImage', '', 'string',
|
||||
'This is the default image an img tag will be pointed to if it does '.
|
||||
'not have a valid src attribute. In future versions, we may allow the '.
|
||||
'image tag to be removed completely, but due to design issues, this is '.
|
||||
'not possible right now.'
|
||||
);
|
||||
|
||||
HTMLPurifier_ConfigDef::define(
|
||||
'Attr', 'DefaultInvalidImageAlt', 'Invalid image',
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'Attr', 'DefaultInvalidImageAlt', 'Invalid image', 'string',
|
||||
'This is the content of the alt tag of an invalid image if the user '.
|
||||
'had not previously specified an alt attribute. It has no effect when the '.
|
||||
'image is valid but there was no alt attribute present.'
|
||||
);
|
||||
|
||||
/**
|
||||
* Post-transform that ensures the required attrs of img (alt and src) are set
|
||||
*/
|
||||
class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform
|
||||
{
|
||||
|
||||
function transform($attributes, $config) {
|
||||
function transform($attr, $config) {
|
||||
|
||||
$src = true;
|
||||
if (!isset($attributes['src'])) {
|
||||
$attributes['src'] = $config->get('Attr', 'DefaultInvalidImage');
|
||||
if (!isset($attr['src'])) {
|
||||
$attr['src'] = $config->get('Attr', 'DefaultInvalidImage');
|
||||
$src = false;
|
||||
}
|
||||
|
||||
if (!isset($attributes['alt'])) {
|
||||
if (!isset($attr['alt'])) {
|
||||
if ($src) {
|
||||
$attributes['alt'] = basename($attributes['src']);
|
||||
$attr['alt'] = basename($attr['src']);
|
||||
} else {
|
||||
$attributes['alt'] = $config->get('Attr', 'DefaultInvalidImageAlt');
|
||||
$attr['alt'] = $config->get('Attr', 'DefaultInvalidImageAlt');
|
||||
}
|
||||
}
|
||||
|
||||
return $attributes;
|
||||
return $attr;
|
||||
|
||||
}
|
||||
|
||||
|
@@ -2,9 +2,11 @@
|
||||
|
||||
require_once 'HTMLPurifier/AttrTransform.php';
|
||||
|
||||
// this transformation may be done pre or post validation, but post is
|
||||
// preferred, since invalid languages then will have been dropped.
|
||||
|
||||
/**
|
||||
* Post-transform that copies lang's value to xml:lang (and vice-versa)
|
||||
* @note Theoretically speaking, this could be a pre-transform, but putting
|
||||
* post is more efficient.
|
||||
*/
|
||||
class HTMLPurifier_AttrTransform_Lang extends HTMLPurifier_AttrTransform
|
||||
{
|
||||
|
||||
|
@@ -2,6 +2,9 @@
|
||||
|
||||
require_once 'HTMLPurifier/AttrTransform.php';
|
||||
|
||||
/**
|
||||
* Pre-transform that changes deprecated align attribute to text-align.
|
||||
*/
|
||||
class HTMLPurifier_AttrTransform_TextAlign
|
||||
extends HTMLPurifier_AttrTransform {
|
||||
|
||||
|
@@ -8,26 +8,26 @@ require_once 'HTMLPurifier/AttrDef/Percentage.php';
|
||||
require_once 'HTMLPurifier/AttrDef/Multiple.php';
|
||||
require_once 'HTMLPurifier/AttrDef/TextDecoration.php';
|
||||
require_once 'HTMLPurifier/AttrDef/FontFamily.php';
|
||||
require_once 'HTMLPurifier/AttrDef/Font.php';
|
||||
require_once 'HTMLPurifier/AttrDef/Border.php';
|
||||
require_once 'HTMLPurifier/AttrDef/ListStyle.php';
|
||||
|
||||
/**
|
||||
* Defines allowed CSS attributes and what their values are.
|
||||
* @see HTMLPurifier_HTMLDefinition
|
||||
*/
|
||||
class HTMLPurifier_CSSDefinition
|
||||
{
|
||||
|
||||
/**
|
||||
* Assoc array of attribute name to definition object.
|
||||
*/
|
||||
var $info = array();
|
||||
|
||||
function &instance($prototype = null) {
|
||||
static $instance = null;
|
||||
if ($prototype) {
|
||||
$instance = $prototype;
|
||||
} elseif (!$instance) {
|
||||
$instance = new HTMLPurifier_CSSDefinition();
|
||||
$instance->setup();
|
||||
}
|
||||
return $instance;
|
||||
}
|
||||
|
||||
function HTMLPurifier_CSSDefinition() {}
|
||||
|
||||
function setup() {
|
||||
/**
|
||||
* Constructs the info array. The meat of this class.
|
||||
*/
|
||||
function setup($config) {
|
||||
|
||||
$this->info['text-align'] = new HTMLPurifier_AttrDef_Enum(
|
||||
array('left', 'right', 'center', 'justify'), false);
|
||||
@@ -50,15 +50,28 @@ class HTMLPurifier_CSSDefinition
|
||||
array('normal', 'italic', 'oblique'), false);
|
||||
$this->info['font-variant'] = new HTMLPurifier_AttrDef_Enum(
|
||||
array('normal', 'small-caps'), false);
|
||||
|
||||
$this->info['list-style-position'] = new HTMLPurifier_AttrDef_Enum(
|
||||
array('inside', 'outside'), false);
|
||||
$this->info['list-style-type'] = new HTMLPurifier_AttrDef_Enum(
|
||||
array('disc', 'circle', 'square', 'decimal', 'lower-roman',
|
||||
'upper-roman', 'lower-alpha', 'upper-alpha'), false);
|
||||
|
||||
$this->info['list-style'] = new HTMLPurifier_AttrDef_ListStyle($config);
|
||||
|
||||
$this->info['text-transform'] = new HTMLPurifier_AttrDef_Enum(
|
||||
array('capitalize', 'uppercase', 'lowercase', 'none'), false);
|
||||
$this->info['color'] = new HTMLPurifier_AttrDef_Color();
|
||||
|
||||
// technically speaking, this one should get its own validator, but
|
||||
// since we don't support background images, it effectively is
|
||||
// equivalent to color. The only trouble is that if the author
|
||||
// specifies an image and a color, they'll both end up getting dropped,
|
||||
// even though we ought to implement it and just discard the image
|
||||
// info. This will be fixed in a later version (see TODO) when
|
||||
// better URI filtering is implemented.
|
||||
$this->info['background'] =
|
||||
|
||||
$border_color =
|
||||
$this->info['border-top-color'] =
|
||||
$this->info['border-bottom-color'] =
|
||||
@@ -151,6 +164,33 @@ class HTMLPurifier_CSSDefinition
|
||||
array('normal', 'bold', 'bolder', 'lighter', '100', '200', '300',
|
||||
'400', '500', '600', '700', '800', '900'), false);
|
||||
|
||||
// MUST be called after other font properties, as it references
|
||||
// a CSSDefinition object
|
||||
$this->info['font'] = new HTMLPurifier_AttrDef_Font($config);
|
||||
|
||||
// same here
|
||||
$this->info['border'] =
|
||||
$this->info['border-bottom'] =
|
||||
$this->info['border-top'] =
|
||||
$this->info['border-left'] =
|
||||
$this->info['border-right'] = new HTMLPurifier_AttrDef_Border($config);
|
||||
|
||||
$this->info['border-collapse'] = new HTMLPurifier_AttrDef_Enum(array(
|
||||
'collapse', 'seperate'));
|
||||
|
||||
$this->info['caption-side'] = new HTMLPurifier_AttrDef_Enum(array(
|
||||
'top', 'bottom'));
|
||||
|
||||
$this->info['table-layout'] = new HTMLPurifier_AttrDef_Enum(array(
|
||||
'auto', 'fixed'));
|
||||
|
||||
$this->info['vertical-align'] = new HTMLPurifier_AttrDef_Composite(array(
|
||||
new HTMLPurifier_AttrDef_Enum(array('baseline', 'sub', 'super',
|
||||
'top', 'text-top', 'middle', 'bottom', 'text-bottom')),
|
||||
new HTMLPurifier_AttrDef_CSSLength(),
|
||||
new HTMLPurifier_AttrDef_Percentage()
|
||||
));
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -5,15 +5,8 @@
|
||||
// false = delete parent node and all children
|
||||
// array(...) = replace children nodes with these
|
||||
|
||||
// this is the hardest one to implement. We'll use fancy regexp tricks
|
||||
// right now, we only expect it to return TRUE or FALSE (it won't attempt
|
||||
// to fix the tree)
|
||||
|
||||
// we may end up writing custom code for each HTML case
|
||||
// in order to make it self correcting
|
||||
|
||||
HTMLPurifier_ConfigDef::define(
|
||||
'Core', 'EscapeInvalidChildren', false,
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'Core', 'EscapeInvalidChildren', false, 'bool',
|
||||
'When true, a child is found that is not allowed in the context of the '.
|
||||
'parent element will be transformed into text as if it were ASCII. When '.
|
||||
'false, that element and all internal tags will be dropped, though text '.
|
||||
@@ -21,25 +14,72 @@ HTMLPurifier_ConfigDef::define(
|
||||
'preserving child nodes.'
|
||||
);
|
||||
|
||||
/**
|
||||
* Defines allowed child nodes and validates tokens against it.
|
||||
*/
|
||||
class HTMLPurifier_ChildDef
|
||||
{
|
||||
/**
|
||||
* Type of child definition, usually right-most part of class name lowercase
|
||||
*
|
||||
* Used occasionally in terms of context. Possible values include
|
||||
* custom, required, optional and empty.
|
||||
*/
|
||||
var $type;
|
||||
|
||||
/**
|
||||
* Bool that indicates whether or not an empty array of children is okay
|
||||
*
|
||||
* This is necessary for redundant checking when changes affecting
|
||||
* a child node may cause a parent node to now be disallowed.
|
||||
*/
|
||||
var $allow_empty;
|
||||
function validateChildren($tokens_of_children) {
|
||||
|
||||
/**
|
||||
* Validates nodes according to definition and returns modification.
|
||||
*
|
||||
* @warning $context is NOT HTMLPurifier_AttrContext
|
||||
* @param $tokens_of_children Array of HTMLPurifier_Token
|
||||
* @param $config HTMLPurifier_Config object
|
||||
* @param $context String context indicating inline, block or unknown
|
||||
* @return bool true to leave nodes as is
|
||||
* @return bool false to remove parent node
|
||||
* @return array of replacement child tokens
|
||||
*/
|
||||
function validateChildren($tokens_of_children, $config, $context) {
|
||||
trigger_error('Call to abstract function', E_USER_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Custom validation class, accepts DTD child definitions
|
||||
*
|
||||
* @warning Currently this class is an all or nothing proposition, that is,
|
||||
* it will only give a bool return value.
|
||||
*/
|
||||
class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
|
||||
{
|
||||
var $type = 'custom';
|
||||
var $allow_empty = false;
|
||||
/**
|
||||
* Allowed child pattern as defined by the DTD
|
||||
*/
|
||||
var $dtd_regex;
|
||||
/**
|
||||
* PCRE regex derived from $dtd_regex
|
||||
* @private
|
||||
*/
|
||||
var $_pcre_regex;
|
||||
/**
|
||||
* @param $dtd_regex Allowed child pattern from the DTD
|
||||
*/
|
||||
function HTMLPurifier_ChildDef_Custom($dtd_regex) {
|
||||
$this->dtd_regex = $dtd_regex;
|
||||
$this->_compileRegex();
|
||||
}
|
||||
/**
|
||||
* Compiles the PCRE regex from a DTD regex ($dtd_regex to $_pcre_regex)
|
||||
*/
|
||||
function _compileRegex() {
|
||||
$raw = str_replace(' ', '', $this->dtd_regex);
|
||||
if ($raw{0} != '(') {
|
||||
@@ -79,9 +119,18 @@ class HTMLPurifier_ChildDef_Custom extends HTMLPurifier_ChildDef
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Definition that allows a set of elements, but disallows empty children.
|
||||
*/
|
||||
class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
|
||||
{
|
||||
/**
|
||||
* Lookup table of allowed elements.
|
||||
*/
|
||||
var $elements = array();
|
||||
/**
|
||||
* @param $elements List of allowed element names (lowercase).
|
||||
*/
|
||||
function HTMLPurifier_ChildDef_Required($elements) {
|
||||
if (is_string($elements)) {
|
||||
$elements = str_replace(' ', '', $elements);
|
||||
@@ -165,8 +214,13 @@ class HTMLPurifier_ChildDef_Required extends HTMLPurifier_ChildDef
|
||||
}
|
||||
}
|
||||
|
||||
// only altered behavior is that it returns an empty array
|
||||
// instead of a false (to delete the node)
|
||||
/**
|
||||
* Definition that allows a set of elements, and allows no children.
|
||||
* @note This is a hack to reuse code from HTMLPurifier_ChildDef_Required,
|
||||
* really, one shouldn't inherit from the other. Only altered behavior
|
||||
* is to overload a returned false with an array. Thus, it will never
|
||||
* return false.
|
||||
*/
|
||||
class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
|
||||
{
|
||||
var $allow_empty = true;
|
||||
@@ -178,23 +232,48 @@ class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
|
||||
}
|
||||
}
|
||||
|
||||
// placeholder
|
||||
/**
|
||||
* Definition that disallows all elements.
|
||||
* @warning validateChildren() in this class is actually never called, because
|
||||
* empty elements are corrected in HTMLPurifier_Strategy_MakeWellFormed
|
||||
* before child definitions are parsed in earnest by
|
||||
* HTMLPurifier_Strategy_FixNesting.
|
||||
*/
|
||||
class HTMLPurifier_ChildDef_Empty extends HTMLPurifier_ChildDef
|
||||
{
|
||||
var $allow_empty = true;
|
||||
var $type = 'empty';
|
||||
function HTMLPurifier_ChildDef_Empty() {}
|
||||
function validateChildren($tokens_of_children, $config, $context) {
|
||||
return false;
|
||||
return array();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Definition that uses different definitions depending on context.
|
||||
*
|
||||
* The del and ins tags are notable because they allow different types of
|
||||
* elements depending on whether or not they're in a block or inline context.
|
||||
* Chameleon allows this behavior to happen by using two different
|
||||
* definitions depending on context. While this somewhat generalized,
|
||||
* it is specifically intended for those two tags.
|
||||
*/
|
||||
class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Instance of the definition object to use when inline. Usually stricter.
|
||||
*/
|
||||
var $inline;
|
||||
/**
|
||||
* Instance of the definition object to use when block.
|
||||
*/
|
||||
var $block;
|
||||
|
||||
/**
|
||||
* @param $inline List of elements to allow when inline.
|
||||
* @param $block List of elements to allow when block.
|
||||
*/
|
||||
function HTMLPurifier_ChildDef_Chameleon($inline, $block) {
|
||||
$this->inline = new HTMLPurifier_ChildDef_Optional($inline);
|
||||
$this->block = new HTMLPurifier_ChildDef_Optional($block);
|
||||
@@ -219,4 +298,141 @@ class HTMLPurifier_ChildDef_Chameleon extends HTMLPurifier_ChildDef
|
||||
}
|
||||
}
|
||||
|
||||
?>
|
||||
/**
|
||||
* Definition for tables
|
||||
*/
|
||||
class HTMLPurifier_ChildDef_Table extends HTMLPurifier_ChildDef
|
||||
{
|
||||
var $allow_empty = false;
|
||||
var $type = 'table';
|
||||
function HTMLPurifier_ChildDef_Table() {}
|
||||
function validateChildren($tokens_of_children, $config, $context) {
|
||||
if (empty($tokens_of_children)) return false;
|
||||
|
||||
// this ensures that the loop gets run one last time before closing
|
||||
// up. It's a little bit of a hack, but it works! Just make sure you
|
||||
// get rid of the token later.
|
||||
$tokens_of_children[] = false;
|
||||
|
||||
// only one of these elements is allowed in a table
|
||||
$caption = false;
|
||||
$thead = false;
|
||||
$tfoot = false;
|
||||
|
||||
// as many of these as you want
|
||||
$cols = array();
|
||||
$content = array();
|
||||
|
||||
$nesting = 0; // current depth so we can determine nodes
|
||||
$is_collecting = false; // are we globbing together tokens to package
|
||||
// into one of the collectors?
|
||||
$collection = array(); // collected nodes
|
||||
$tag_index = 0; // the first node might be whitespace,
|
||||
// so this tells us where the start tag is
|
||||
|
||||
foreach ($tokens_of_children as $token) {
|
||||
$is_child = ($nesting == 0);
|
||||
|
||||
if ($token === false) {
|
||||
// terminating sequence started
|
||||
} elseif ($token->type == 'start') {
|
||||
$nesting++;
|
||||
} elseif ($token->type == 'end') {
|
||||
$nesting--;
|
||||
}
|
||||
|
||||
// handle node collection
|
||||
if ($is_collecting) {
|
||||
if ($is_child) {
|
||||
// okay, let's stash the tokens away
|
||||
// first token tells us the type of the collection
|
||||
switch ($collection[$tag_index]->name) {
|
||||
case 'tr':
|
||||
case 'tbody':
|
||||
$content[] = $collection;
|
||||
break;
|
||||
case 'caption':
|
||||
if ($caption !== false) break;
|
||||
$caption = $collection;
|
||||
break;
|
||||
case 'thead':
|
||||
case 'tfoot':
|
||||
// access the appropriate variable, $thead or $tfoot
|
||||
$var = $collection[$tag_index]->name;
|
||||
if ($$var === false) {
|
||||
$$var = $collection;
|
||||
} else {
|
||||
// transmutate the first and less entries into
|
||||
// tbody tags, and then put into content
|
||||
$collection[$tag_index]->name = 'tbody';
|
||||
$collection[count($collection)-1]->name = 'tbody';
|
||||
$content[] = $collection;
|
||||
}
|
||||
break;
|
||||
case 'colgroup':
|
||||
$cols[] = $collection;
|
||||
break;
|
||||
}
|
||||
$collection = array();
|
||||
$is_collecting = false;
|
||||
$tag_index = 0;
|
||||
} else {
|
||||
// add the node to the collection
|
||||
$collection[] = $token;
|
||||
}
|
||||
}
|
||||
|
||||
// terminate
|
||||
if ($token === false) break;
|
||||
|
||||
if ($is_child) {
|
||||
// determine what we're dealing with
|
||||
if ($token->name == 'col') {
|
||||
// the only empty tag in the possie, we can handle it
|
||||
// immediately
|
||||
$cols[] = array_merge($collection, array($token));
|
||||
$collection = array();
|
||||
$tag_index = 0;
|
||||
continue;
|
||||
}
|
||||
switch($token->name) {
|
||||
case 'caption':
|
||||
case 'colgroup':
|
||||
case 'thead':
|
||||
case 'tfoot':
|
||||
case 'tbody':
|
||||
case 'tr':
|
||||
$is_collecting = true;
|
||||
$collection[] = $token;
|
||||
continue;
|
||||
default:
|
||||
if ($token->type == 'text' && $token->is_whitespace) {
|
||||
$collection[] = $token;
|
||||
$tag_index++;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (empty($content)) return false;
|
||||
|
||||
$ret = array();
|
||||
if ($caption !== false) $ret = array_merge($ret, $caption);
|
||||
if ($cols !== false) foreach ($cols as $token_array) $ret = array_merge($ret, $token_array);
|
||||
if ($thead !== false) $ret = array_merge($ret, $thead);
|
||||
if ($tfoot !== false) $ret = array_merge($ret, $tfoot);
|
||||
foreach ($content as $token_array) $ret = array_merge($ret, $token_array);
|
||||
if (!empty($collection) && $is_collecting == false){
|
||||
// grab the trailing space
|
||||
$ret = array_merge($ret, $collection);
|
||||
}
|
||||
|
||||
array_pop($tokens_of_children); // remove phantom token
|
||||
|
||||
return ($ret === $tokens_of_children) ? true : $ret;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
?>
|
||||
|
@@ -1,39 +1,129 @@
|
||||
<?php
|
||||
|
||||
// subclass this to add custom settings
|
||||
/**
|
||||
* Configuration object that triggers customizable behavior.
|
||||
*
|
||||
* @warning This class is strongly defined: that means that the class
|
||||
* will fail if an undefined directive is retrieved or set.
|
||||
*
|
||||
* @note Many classes that could (although many times don't) use the
|
||||
* configuration object make it a mandatory parameter. This is
|
||||
* because a configuration object should always be forwarded,
|
||||
* otherwise, you run the risk of missing a parameter and then
|
||||
* being stumped when a configuration directive doesn't work.
|
||||
*/
|
||||
class HTMLPurifier_Config
|
||||
{
|
||||
|
||||
/**
|
||||
* Two-level associative array of configuration directives
|
||||
*/
|
||||
var $conf;
|
||||
|
||||
/**
|
||||
* Reference HTMLPurifier_ConfigSchema for value checking
|
||||
*/
|
||||
var $def;
|
||||
|
||||
/**
|
||||
* Instance of HTMLPurifier_HTMLDefinition
|
||||
*/
|
||||
var $html_definition;
|
||||
|
||||
/**
|
||||
* Instance of HTMLPurifier_CSSDefinition
|
||||
*/
|
||||
var $css_definition;
|
||||
|
||||
/**
|
||||
* @param $definition HTMLPurifier_ConfigSchema that defines what directives
|
||||
* are allowed.
|
||||
*/
|
||||
function HTMLPurifier_Config(&$definition) {
|
||||
$this->conf = $definition->info; // set up the defaults
|
||||
$this->conf = $definition->defaults; // set up, copy in defaults
|
||||
$this->def = $definition; // keep a copy around for checking
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience constructor that creates a default configuration object.
|
||||
* @return Default HTMLPurifier_Config object.
|
||||
*/
|
||||
function createDefault() {
|
||||
$definition =& HTMLPurifier_ConfigDef::instance();
|
||||
$definition =& HTMLPurifier_ConfigSchema::instance();
|
||||
$config = new HTMLPurifier_Config($definition);
|
||||
return $config;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retreives a value from the configuration.
|
||||
* @param $namespace String namespace
|
||||
* @param $key String key
|
||||
*/
|
||||
function get($namespace, $key) {
|
||||
if (!isset($this->conf[$namespace][$key])) {
|
||||
trigger_error('Cannot retrieve value of undefined directive',
|
||||
E_USER_ERROR);
|
||||
E_USER_WARNING);
|
||||
return;
|
||||
}
|
||||
return $this->conf[$namespace][$key];
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets a value to configuration.
|
||||
* @param $namespace String namespace
|
||||
* @param $key String key
|
||||
* @param $value Mixed value
|
||||
*/
|
||||
function set($namespace, $key, $value) {
|
||||
if (!isset($this->conf[$namespace][$key])) {
|
||||
trigger_error('Cannot set undefined directive to value',
|
||||
E_USER_ERROR);
|
||||
E_USER_WARNING);
|
||||
return;
|
||||
}
|
||||
$value = $this->def->validate($value,
|
||||
$this->def->info[$namespace][$key]->type);
|
||||
if (is_string($value)) {
|
||||
// resolve value alias if defined
|
||||
if (isset($this->def->info[$namespace][$key]->aliases[$value])) {
|
||||
$value = $this->def->info[$namespace][$key]->aliases[$value];
|
||||
}
|
||||
if ($this->def->info[$namespace][$key]->allowed !== true) {
|
||||
// check to see if the value is allowed
|
||||
if (!isset($this->def->info[$namespace][$key]->allowed[$value])) {
|
||||
trigger_error('Value not supported', E_USER_WARNING);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
if ($value === null) {
|
||||
trigger_error('Value is of invalid type', E_USER_WARNING);
|
||||
return;
|
||||
}
|
||||
$this->conf[$namespace][$key] = $value;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves a copy of the HTML definition.
|
||||
*/
|
||||
function getHTMLDefinition() {
|
||||
if ($this->html_definition === null) {
|
||||
$this->html_definition = new HTMLPurifier_HTMLDefinition();
|
||||
$this->html_definition->setup($this);
|
||||
}
|
||||
return $this->html_definition;
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves a copy of the CSS definition
|
||||
*/
|
||||
function getCSSDefinition() {
|
||||
if ($this->css_definition === null) {
|
||||
$this->css_definition = new HTMLPurifier_CSSDefinition();
|
||||
$this->css_definition->setup($this);
|
||||
}
|
||||
return $this->css_definition;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -1,50 +0,0 @@
|
||||
<?php
|
||||
|
||||
class HTMLPurifier_ConfigDef {
|
||||
|
||||
var $info = array();
|
||||
|
||||
function initialize() {
|
||||
$this->defineNamespace('Core', 'Core features that are always available.');
|
||||
$this->defineNamespace('Attr', 'Features regarding attribute validation.');
|
||||
$this->defineNamespace('URI', 'Features regarding Uniform Resource Identifiers.');
|
||||
}
|
||||
|
||||
function &instance($prototype = null) {
|
||||
static $instance;
|
||||
if ($prototype !== null) {
|
||||
$instance = $prototype;
|
||||
} elseif ($instance === null || $prototype === true) {
|
||||
$instance = new HTMLPurifier_ConfigDef();
|
||||
$instance->initialize();
|
||||
}
|
||||
return $instance;
|
||||
}
|
||||
|
||||
function define($namespace, $name, $default, $description) {
|
||||
$def =& HTMLPurifier_ConfigDef::instance();
|
||||
if (!isset($def->info[$namespace])) {
|
||||
trigger_error('Cannot define directive for undefined namespace',
|
||||
E_USER_ERROR);
|
||||
return;
|
||||
}
|
||||
if (isset($def->info[$namespace][$name])) {
|
||||
// this behavior is at risk of change
|
||||
trigger_error('Cannot redefine directive', E_USER_ERROR);
|
||||
return;
|
||||
}
|
||||
$def->info[$namespace][$name] = $default;
|
||||
}
|
||||
|
||||
function defineNamespace($namespace, $description) {
|
||||
$def =& HTMLPurifier_ConfigDef::instance();
|
||||
if (isset($def->info[$namespace])) {
|
||||
trigger_error('Cannot redefine namespace', E_USER_ERROR);
|
||||
return;
|
||||
}
|
||||
$def->info[$namespace] = array();
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
?>
|
337
library/HTMLPurifier/ConfigSchema.php
Normal file
337
library/HTMLPurifier/ConfigSchema.php
Normal file
@@ -0,0 +1,337 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Configuration definition, defines directives and their defaults.
|
||||
* @todo The ability to define things multiple times is confusing and should
|
||||
* be factored out to its own function named registerDependency() or
|
||||
* addNote(), where only the namespace.name and an extra descriptions
|
||||
* documenting the nature of the dependency are needed. Since it's
|
||||
* possible that the dependency is registered before the configuration
|
||||
* is defined, deferring it to some sort of cache until it actually
|
||||
* gets defined would be wise, keeping it opaque until it does get
|
||||
* defined. We could add a finalize() method which would cause it to
|
||||
* error out if we get a dangling dependency. It's difficult, however,
|
||||
* to know whether or not it's a dependency, or a codependency, that is
|
||||
* neither of them fully depends on it. Where does the configuration go
|
||||
* then? This could be partially resolved by allowing blanket definitions
|
||||
* and then splitting them up into finer-grained versions, however, there
|
||||
* might be implementation difficulties in ini files regarding order of
|
||||
* execution.
|
||||
*/
|
||||
class HTMLPurifier_ConfigSchema {
|
||||
|
||||
/**
|
||||
* Defaults of the directives and namespaces.
|
||||
* @note This shares the exact same structure as HTMLPurifier_Config::$conf
|
||||
*/
|
||||
var $defaults = array();
|
||||
|
||||
/**
|
||||
* Definition of the directives.
|
||||
*/
|
||||
var $info = array();
|
||||
|
||||
/**
|
||||
* Definition of namespaces.
|
||||
*/
|
||||
var $info_namespace = array();
|
||||
|
||||
/**
|
||||
* Lookup table of allowed types.
|
||||
*/
|
||||
var $types = array(
|
||||
'string' => 'String',
|
||||
'istring' => 'Case-insensitive string',
|
||||
'int' => 'Integer',
|
||||
'float' => 'Float',
|
||||
'bool' => 'Boolean',
|
||||
'lookup' => 'Lookup array',
|
||||
'list' => 'Array list',
|
||||
'hash' => 'Associative array',
|
||||
'mixed' => 'Mixed'
|
||||
);
|
||||
|
||||
/**
|
||||
* Initializes the default namespaces.
|
||||
*/
|
||||
function initialize() {
|
||||
$this->defineNamespace('Core', 'Core features that are always available.');
|
||||
$this->defineNamespace('Attr', 'Features regarding attribute validation.');
|
||||
$this->defineNamespace('URI', 'Features regarding Uniform Resource Identifiers.');
|
||||
$this->defineNamespace('HTML', 'Configuration regarding allowed HTML.');
|
||||
$this->defineNamespace('CSS', 'Configuration regarding allowed CSS.');
|
||||
$this->defineNamespace('Test', 'Developer testing configuration for our unit tests.');
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves an instance of the application-wide configuration definition.
|
||||
*/
|
||||
function &instance($prototype = null) {
|
||||
static $instance;
|
||||
if ($prototype !== null) {
|
||||
$instance = $prototype;
|
||||
} elseif ($instance === null || $prototype === true) {
|
||||
$instance = new HTMLPurifier_ConfigSchema();
|
||||
$instance->initialize();
|
||||
}
|
||||
return $instance;
|
||||
}
|
||||
|
||||
/**
|
||||
* Defines a directive for configuration
|
||||
* @warning Will fail of directive's namespace is defined
|
||||
* @param $namespace Namespace the directive is in
|
||||
* @param $name Key of directive
|
||||
* @param $default Default value of directive
|
||||
* @param $type Allowed type of the directive. See
|
||||
* HTMLPurifier_DirectiveDef::$type for allowed values
|
||||
* @param $description Description of directive for documentation
|
||||
*/
|
||||
function define(
|
||||
$namespace, $name, $default, $type,
|
||||
$description
|
||||
) {
|
||||
$def =& HTMLPurifier_ConfigSchema::instance();
|
||||
if (!isset($def->info[$namespace])) {
|
||||
trigger_error('Cannot define directive for undefined namespace',
|
||||
E_USER_ERROR);
|
||||
return;
|
||||
}
|
||||
if (!ctype_alnum($name)) {
|
||||
trigger_error('Directive name must be alphanumeric',
|
||||
E_USER_ERROR);
|
||||
return;
|
||||
}
|
||||
if (isset($def->info[$namespace][$name])) {
|
||||
if (
|
||||
$def->info[$namespace][$name]->type !== $type ||
|
||||
$def->defaults[$namespace][$name] !== $default
|
||||
) {
|
||||
trigger_error('Inconsistent default or type, cannot redefine');
|
||||
return;
|
||||
}
|
||||
} else {
|
||||
if (!isset($def->types[$type])) {
|
||||
trigger_error('Invalid type for configuration directive',
|
||||
E_USER_ERROR);
|
||||
return;
|
||||
}
|
||||
if ($def->validate($default, $type) === null) {
|
||||
trigger_error('Default value does not match directive type',
|
||||
E_USER_ERROR);
|
||||
return;
|
||||
}
|
||||
$def->info[$namespace][$name] =
|
||||
new HTMLPurifier_ConfigEntity_Directive();
|
||||
$def->info[$namespace][$name]->type = $type;
|
||||
$def->defaults[$namespace][$name] = $default;
|
||||
}
|
||||
$backtrace = debug_backtrace();
|
||||
$file = $def->mungeFilename($backtrace[0]['file']);
|
||||
$line = $backtrace[0]['line'];
|
||||
$def->info[$namespace][$name]->addDescription($file,$line,$description);
|
||||
}
|
||||
|
||||
/**
|
||||
* Defines a namespace for directives to be put into.
|
||||
* @param $namespace Namespace's name
|
||||
* @param $description Description of the namespace
|
||||
*/
|
||||
function defineNamespace($namespace, $description) {
|
||||
$def =& HTMLPurifier_ConfigSchema::instance();
|
||||
if (isset($def->info[$namespace])) {
|
||||
trigger_error('Cannot redefine namespace', E_USER_ERROR);
|
||||
return;
|
||||
}
|
||||
if (!ctype_alnum($namespace)) {
|
||||
trigger_error('Namespace name must be alphanumeric',
|
||||
E_USER_ERROR);
|
||||
return;
|
||||
}
|
||||
$def->info[$namespace] = array();
|
||||
$def->info_namespace[$namespace] = new HTMLPurifier_ConfigEntity_Namespace();
|
||||
$def->info_namespace[$namespace]->description = $description;
|
||||
$def->defaults[$namespace] = array();
|
||||
}
|
||||
|
||||
/**
|
||||
* Defines a directive value alias.
|
||||
*
|
||||
* Directive value aliases are convenient for developers because it lets
|
||||
* them set a directive to several values and get the same result.
|
||||
* @param $namespace Directive's namespace
|
||||
* @param $name Name of Directive
|
||||
* @param $alias Name of aliased value
|
||||
* @param $real Value aliased value will be converted into
|
||||
*/
|
||||
function defineValueAliases($namespace, $name, $aliases) {
|
||||
$def =& HTMLPurifier_ConfigSchema::instance();
|
||||
if (!isset($def->info[$namespace][$name])) {
|
||||
trigger_error('Cannot set value alias for non-existant directive',
|
||||
E_USER_ERROR);
|
||||
return;
|
||||
}
|
||||
foreach ($aliases as $alias => $real) {
|
||||
if (!$def->info[$namespace][$name] !== true &&
|
||||
!isset($def->info[$namespace][$name]->allowed[$real])
|
||||
) {
|
||||
trigger_error('Cannot define alias to value that is not allowed',
|
||||
E_USER_ERROR);
|
||||
return;
|
||||
}
|
||||
if (isset($def->info[$namespace][$name]->allowed[$alias])) {
|
||||
trigger_error('Cannot define alias over allowed value',
|
||||
E_USER_ERROR);
|
||||
return;
|
||||
}
|
||||
$def->info[$namespace][$name]->aliases[$alias] = $real;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Defines a set of allowed values for a directive.
|
||||
* @param $namespace Namespace of directive
|
||||
* @param $name Name of directive
|
||||
* @param $allowed_values Arraylist of allowed values
|
||||
*/
|
||||
function defineAllowedValues($namespace, $name, $allowed_values) {
|
||||
$def =& HTMLPurifier_ConfigSchema::instance();
|
||||
if (!isset($def->info[$namespace][$name])) {
|
||||
trigger_error('Cannot define allowed values for undefined directive',
|
||||
E_USER_ERROR);
|
||||
return;
|
||||
}
|
||||
if ($def->info[$namespace][$name]->allowed === true) {
|
||||
$def->info[$namespace][$name]->allowed = array();
|
||||
}
|
||||
foreach ($allowed_values as $value) {
|
||||
$def->info[$namespace][$name]->allowed[$value] = true;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate a variable according to type. Return null if invalid.
|
||||
*/
|
||||
function validate($var, $type) {
|
||||
if (!isset($this->types[$type])) {
|
||||
trigger_error('Invalid type', E_USER_ERROR);
|
||||
return;
|
||||
}
|
||||
switch ($type) {
|
||||
case 'mixed':
|
||||
return $var;
|
||||
case 'istring':
|
||||
case 'string':
|
||||
if (!is_string($var)) return;
|
||||
if ($type === 'istring') $var = strtolower($var);
|
||||
return $var;
|
||||
case 'int':
|
||||
if (is_string($var) && ctype_digit($var)) $var = (int) $var;
|
||||
elseif (!is_int($var)) return;
|
||||
return $var;
|
||||
case 'float':
|
||||
if (is_string($var) && is_numeric($var)) $var = (float) $var;
|
||||
elseif (!is_float($var)) return;
|
||||
return $var;
|
||||
case 'bool':
|
||||
if (is_int($var) && ($var === 0 || $var === 1)) {
|
||||
$var = (bool) $var;
|
||||
} elseif (!is_bool($var)) return;
|
||||
return $var;
|
||||
case 'list':
|
||||
case 'hash':
|
||||
case 'lookup':
|
||||
if (!is_array($var)) return;
|
||||
$keys = array_keys($var);
|
||||
if ($keys === array_keys($keys)) {
|
||||
if ($type == 'list') return $var;
|
||||
elseif ($type == 'lookup') {
|
||||
$new = array();
|
||||
foreach ($var as $key) {
|
||||
$new[$key] = true;
|
||||
}
|
||||
return $new;
|
||||
} else return;
|
||||
}
|
||||
if ($type === 'lookup') {
|
||||
foreach ($var as $key => $value) {
|
||||
$var[$key] = true;
|
||||
}
|
||||
}
|
||||
return $var;
|
||||
}
|
||||
}
|
||||
|
||||
function mungeFilename($filename) {
|
||||
$offset = strrpos($filename, 'HTMLPurifier');
|
||||
$filename = substr($filename, $offset);
|
||||
$filename = str_replace('\\', '/', $filename);
|
||||
return $filename;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Base class for configuration entity
|
||||
*/
|
||||
class HTMLPurifier_ConfigEntity {}
|
||||
|
||||
/**
|
||||
* Structure object describing of a namespace
|
||||
*/
|
||||
class HTMLPurifier_ConfigEntity_Namespace extends HTMLPurifier_ConfigEntity {
|
||||
|
||||
/**
|
||||
* String description of what kinds of directives go in this namespace.
|
||||
*/
|
||||
var $description;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Structure object containing definition of a directive.
|
||||
* @note This structure does not contain default values
|
||||
*/
|
||||
class HTMLPurifier_ConfigEntity_Directive extends HTMLPurifier_ConfigEntity
|
||||
{
|
||||
|
||||
/**
|
||||
* Hash of value aliases, i.e. values that are equivalent.
|
||||
*/
|
||||
var $aliases = array();
|
||||
|
||||
/**
|
||||
* Lookup table of allowed values of the element, bool true if all allowed.
|
||||
*/
|
||||
var $allowed = true;
|
||||
|
||||
/**
|
||||
* Allowed type of the directive. Values are:
|
||||
* - string
|
||||
* - istring (case insensitive string)
|
||||
* - int
|
||||
* - float
|
||||
* - bool
|
||||
* - lookup (array of value => true)
|
||||
* - list (regular numbered index array)
|
||||
* - hash (array of key => value)
|
||||
* - mixed (anything goes)
|
||||
*/
|
||||
var $type = 'mixed';
|
||||
/**
|
||||
* Plaintext descriptions of the configuration entity is. Organized by
|
||||
* file and line number, so multiple descriptions are allowed.
|
||||
*/
|
||||
var $descriptions = array();
|
||||
|
||||
/**
|
||||
* Adds a description to the array
|
||||
*/
|
||||
function addDescription($file, $line, $description) {
|
||||
if (!isset($this->descriptions[$file])) $this->descriptions[$file] = array();
|
||||
$this->descriptions[$file][$line] = $description;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
?>
|
301
library/HTMLPurifier/Encoder.php
Normal file
301
library/HTMLPurifier/Encoder.php
Normal file
@@ -0,0 +1,301 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/EntityLookup.php';
|
||||
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'Core', 'Encoding', 'utf-8', 'istring',
|
||||
'If for some reason you are unable to convert all webpages to UTF-8, '.
|
||||
'you can use this directive as a stop-gap compatibility change to '.
|
||||
'let HTMLPurifier deal with non UTF-8 input. This technique has '.
|
||||
'notable deficiencies: absolutely no characters outside of the selected '.
|
||||
'character encoding will be preserved, not even the ones that have '.
|
||||
'been ampersand escaped (this is due to a UTF-8 specific <em>feature</em> '.
|
||||
'that automatically resolves all entities), making it pretty useless '.
|
||||
'for anything except the most I18N-blind applications. This directive '.
|
||||
'only accepts ISO-8859-1 if iconv is not enabled.'
|
||||
);
|
||||
|
||||
if ( !function_exists('iconv') ) {
|
||||
// only encodings with native PHP support
|
||||
HTMLPurifier_ConfigSchema::defineAllowedValues(
|
||||
'Core', 'Encoding', array(
|
||||
'utf-8',
|
||||
'iso-8859-1'
|
||||
)
|
||||
);
|
||||
HTMLPurifier_ConfigSchema::defineValueAliases(
|
||||
'Core', 'Encoding', array(
|
||||
'iso8859-1' => 'iso-8859-1'
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'Test', 'ForceNoIconv', false, 'bool',
|
||||
'When set to true, HTMLPurifier_Encoder will act as if iconv does not '.
|
||||
'exist and use only pure PHP implementations.'
|
||||
);
|
||||
|
||||
/**
|
||||
* A UTF-8 specific character encoder that handles cleaning and transforming.
|
||||
*/
|
||||
class HTMLPurifier_Encoder
|
||||
{
|
||||
|
||||
/**
|
||||
* Cleans a UTF-8 string for well-formedness and SGML validity
|
||||
*
|
||||
* It will parse according to UTF-8 and return a valid UTF8 string, with
|
||||
* non-SGML codepoints excluded.
|
||||
*
|
||||
* @note Just for reference, the non-SGML code points are 0 to 31 and
|
||||
* 127 to 159, inclusive. However, we allow code points 9, 10
|
||||
* and 13, which are the tab, line feed and carriage return
|
||||
* respectively. 128 and above the code points map to multibyte
|
||||
* UTF-8 representations.
|
||||
*
|
||||
* @note Fallback code adapted from utf8ToUnicode by Henri Sivonen and
|
||||
* hsivonen@iki.fi at <http://iki.fi/hsivonen/php-utf8/> under the
|
||||
* LGPL license. Notes on what changed are inside, but in general,
|
||||
* the original code transformed UTF-8 text into an array of integer
|
||||
* Unicode codepoints. Understandably, transforming that back to
|
||||
* a string would be somewhat expensive, so the function was modded to
|
||||
* directly operate on the string. However, this discourages code
|
||||
* reuse, and the logic enumerated here would be useful for any
|
||||
* function that needs to be able to understand UTF-8 characters.
|
||||
* As of right now, only smart lossless character encoding converters
|
||||
* would need that, and I'm probably not going to implement them.
|
||||
* Once again, PHP 6 should solve all our problems.
|
||||
*/
|
||||
function cleanUTF8($str, $force_php = false) {
|
||||
|
||||
static $non_sgml_chars = array();
|
||||
if (empty($non_sgml_chars)) {
|
||||
for ($i = 0; $i <= 31; $i++) {
|
||||
// non-SGML ASCII chars
|
||||
// save \r, \t and \n
|
||||
if ($i == 9 || $i == 13 || $i == 10) continue;
|
||||
$non_sgml_chars[chr($i)] = '';
|
||||
}
|
||||
for ($i = 127; $i <= 159; $i++) {
|
||||
$non_sgml_chars[HTMLPurifier_Encoder::unichr($i)] = '';
|
||||
}
|
||||
}
|
||||
|
||||
static $iconv = null;
|
||||
if ($iconv === null) $iconv = function_exists('iconv');
|
||||
|
||||
if ($iconv && !$force_php) {
|
||||
// do the shortcut way
|
||||
$str = @iconv('UTF-8', 'UTF-8//IGNORE', $str);
|
||||
return strtr($str, $non_sgml_chars);
|
||||
}
|
||||
|
||||
$mState = 0; // cached expected number of octets after the current octet
|
||||
// until the beginning of the next UTF8 character sequence
|
||||
$mUcs4 = 0; // cached Unicode character
|
||||
$mBytes = 1; // cached expected number of octets in the current sequence
|
||||
|
||||
// original code involved an $out that was an array of Unicode
|
||||
// codepoints. Instead of having to convert back into UTF-8, we've
|
||||
// decided to directly append valid UTF-8 characters onto a string
|
||||
// $out once they're done. $char accumulates raw bytes, while $mUcs4
|
||||
// turns into the Unicode code point, so there's some redundancy.
|
||||
|
||||
$out = '';
|
||||
$char = '';
|
||||
|
||||
$len = strlen($str);
|
||||
for($i = 0; $i < $len; $i++) {
|
||||
$in = ord($str{$i});
|
||||
$char .= $str[$i]; // append byte to char
|
||||
if (0 == $mState) {
|
||||
// When mState is zero we expect either a US-ASCII character
|
||||
// or a multi-octet sequence.
|
||||
if (0 == (0x80 & ($in))) {
|
||||
// US-ASCII, pass straight through.
|
||||
if (($in <= 31 || $in == 127) &&
|
||||
!($in == 9 || $in == 13 || $in == 10) // save \r\t\n
|
||||
) {
|
||||
// control characters, remove
|
||||
} else {
|
||||
$out .= $char;
|
||||
}
|
||||
// reset
|
||||
$char = '';
|
||||
$mBytes = 1;
|
||||
} elseif (0xC0 == (0xE0 & ($in))) {
|
||||
// First octet of 2 octet sequence
|
||||
$mUcs4 = ($in);
|
||||
$mUcs4 = ($mUcs4 & 0x1F) << 6;
|
||||
$mState = 1;
|
||||
$mBytes = 2;
|
||||
} elseif (0xE0 == (0xF0 & ($in))) {
|
||||
// First octet of 3 octet sequence
|
||||
$mUcs4 = ($in);
|
||||
$mUcs4 = ($mUcs4 & 0x0F) << 12;
|
||||
$mState = 2;
|
||||
$mBytes = 3;
|
||||
} elseif (0xF0 == (0xF8 & ($in))) {
|
||||
// First octet of 4 octet sequence
|
||||
$mUcs4 = ($in);
|
||||
$mUcs4 = ($mUcs4 & 0x07) << 18;
|
||||
$mState = 3;
|
||||
$mBytes = 4;
|
||||
} elseif (0xF8 == (0xFC & ($in))) {
|
||||
// First octet of 5 octet sequence.
|
||||
//
|
||||
// This is illegal because the encoded codepoint must be
|
||||
// either:
|
||||
// (a) not the shortest form or
|
||||
// (b) outside the Unicode range of 0-0x10FFFF.
|
||||
// Rather than trying to resynchronize, we will carry on
|
||||
// until the end of the sequence and let the later error
|
||||
// handling code catch it.
|
||||
$mUcs4 = ($in);
|
||||
$mUcs4 = ($mUcs4 & 0x03) << 24;
|
||||
$mState = 4;
|
||||
$mBytes = 5;
|
||||
} elseif (0xFC == (0xFE & ($in))) {
|
||||
// First octet of 6 octet sequence, see comments for 5
|
||||
// octet sequence.
|
||||
$mUcs4 = ($in);
|
||||
$mUcs4 = ($mUcs4 & 1) << 30;
|
||||
$mState = 5;
|
||||
$mBytes = 6;
|
||||
} else {
|
||||
// Current octet is neither in the US-ASCII range nor a
|
||||
// legal first octet of a multi-octet sequence.
|
||||
$mState = 0;
|
||||
$mUcs4 = 0;
|
||||
$mBytes = 1;
|
||||
$char = '';
|
||||
}
|
||||
} else {
|
||||
// When mState is non-zero, we expect a continuation of the
|
||||
// multi-octet sequence
|
||||
if (0x80 == (0xC0 & ($in))) {
|
||||
// Legal continuation.
|
||||
$shift = ($mState - 1) * 6;
|
||||
$tmp = $in;
|
||||
$tmp = ($tmp & 0x0000003F) << $shift;
|
||||
$mUcs4 |= $tmp;
|
||||
|
||||
if (0 == --$mState) {
|
||||
// End of the multi-octet sequence. mUcs4 now contains
|
||||
// the final Unicode codepoint to be output
|
||||
|
||||
// Check for illegal sequences and codepoints.
|
||||
|
||||
// From Unicode 3.1, non-shortest form is illegal
|
||||
if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
|
||||
((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
|
||||
((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
|
||||
(4 < $mBytes) ||
|
||||
// From Unicode 3.2, surrogate characters = illegal
|
||||
(($mUcs4 & 0xFFFFF800) == 0xD800) ||
|
||||
// Codepoints outside the Unicode range are illegal
|
||||
($mUcs4 > 0x10FFFF)
|
||||
) {
|
||||
|
||||
} elseif (0xFEFF != $mUcs4 && // omit BOM
|
||||
!($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML
|
||||
) {
|
||||
$out .= $char;
|
||||
}
|
||||
// initialize UTF8 cache (reset)
|
||||
$mState = 0;
|
||||
$mUcs4 = 0;
|
||||
$mBytes = 1;
|
||||
$char = '';
|
||||
}
|
||||
} else {
|
||||
// ((0xC0 & (*in) != 0x80) && (mState != 0))
|
||||
// Incomplete multi-octet sequence.
|
||||
// used to result in complete fail, but we'll reset
|
||||
$mState = 0;
|
||||
$mUcs4 = 0;
|
||||
$mBytes = 1;
|
||||
$char ='';
|
||||
}
|
||||
}
|
||||
}
|
||||
return $out;
|
||||
}
|
||||
|
||||
/**
|
||||
* Translates a Unicode codepoint into its corresponding UTF-8 character.
|
||||
*/
|
||||
function unichr($code) {
|
||||
if($code > 1114111 or $code < 0 or
|
||||
($code >= 55296 and $code <= 57343) ) {
|
||||
// bits are set outside the "valid" range as defined
|
||||
// by UNICODE 4.1.0
|
||||
return '';
|
||||
}
|
||||
|
||||
$x = $y = $z = $w = 0;
|
||||
if ($code < 128) {
|
||||
// regular ASCII character
|
||||
$x = $code;
|
||||
} else {
|
||||
// set up bits for UTF-8
|
||||
$x = ($code & 63) | 128;
|
||||
if ($code < 2048) {
|
||||
$y = (($code & 2047) >> 6) | 192;
|
||||
} else {
|
||||
$y = (($code & 4032) >> 6) | 128;
|
||||
if($code < 65536) {
|
||||
$z = (($code >> 12) & 15) | 224;
|
||||
} else {
|
||||
$z = (($code >> 12) & 63) | 128;
|
||||
$w = (($code >> 18) & 7) | 240;
|
||||
}
|
||||
}
|
||||
}
|
||||
// set up the actual character
|
||||
$ret = '';
|
||||
if($w) $ret .= chr($w);
|
||||
if($z) $ret .= chr($z);
|
||||
if($y) $ret .= chr($y);
|
||||
$ret .= chr($x);
|
||||
|
||||
return $ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a string to UTF-8 based on configuration.
|
||||
*/
|
||||
function convertToUTF8($str, $config) {
|
||||
static $iconv = null;
|
||||
if ($iconv === null) $iconv = function_exists('iconv');
|
||||
$encoding = $config->get('Core', 'Encoding');
|
||||
if ($encoding === 'utf-8') return $str;
|
||||
if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
|
||||
return @iconv($encoding, 'utf-8//IGNORE', $str);
|
||||
} elseif ($encoding === 'iso-8859-1') {
|
||||
return @utf8_encode($str);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts a string from UTF-8 based on configuration.
|
||||
* @note Currently, this is a lossy conversion, with unexpressable
|
||||
* characters being omitted.
|
||||
*/
|
||||
function convertFromUTF8($str, $config) {
|
||||
static $iconv = null;
|
||||
if ($iconv === null) $iconv = function_exists('iconv');
|
||||
$encoding = $config->get('Core', 'Encoding');
|
||||
if ($encoding === 'utf-8') return $str;
|
||||
if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
|
||||
return @iconv('utf-8', $encoding . '//IGNORE', $str);
|
||||
} elseif ($encoding === 'iso-8859-1') {
|
||||
return @utf8_decode($str);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -1,12 +1,22 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Object that provides entity lookup table from entity name to character
|
||||
*/
|
||||
class HTMLPurifier_EntityLookup {
|
||||
|
||||
/**
|
||||
* Assoc array of entity name to character represented.
|
||||
* @public
|
||||
*/
|
||||
var $table;
|
||||
|
||||
function HTMLPurifier_EntityLookup() {}
|
||||
|
||||
// to enforce Singleton-ness
|
||||
/**
|
||||
* Sets up the entity lookup table from the serialized file contents.
|
||||
* @note The serialized contents are versioned, but were generated
|
||||
* using the maintenance script generate_entity_file.php
|
||||
* @warning This is not in constructor to help enforce the Singleton
|
||||
*/
|
||||
function setup($file = false) {
|
||||
if (!$file) {
|
||||
$file = dirname(__FILE__) . '/EntityLookup/data.txt';
|
||||
@@ -14,6 +24,10 @@ class HTMLPurifier_EntityLookup {
|
||||
$this->table = unserialize(file_get_contents($file));
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves sole instance of the object.
|
||||
* @param Optional prototype of custom lookup table to overload with.
|
||||
*/
|
||||
function instance($prototype = false) {
|
||||
// no references, since PHP doesn't copy unless modified
|
||||
static $instance = null;
|
||||
|
179
library/HTMLPurifier/EntityParser.php
Normal file
179
library/HTMLPurifier/EntityParser.php
Normal file
@@ -0,0 +1,179 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/EntityLookup.php';
|
||||
require_once 'HTMLPurifier/Encoder.php';
|
||||
|
||||
/**
|
||||
* Handles referencing and derefencing character entities
|
||||
*/
|
||||
class HTMLPurifier_EntityParser
|
||||
{
|
||||
|
||||
/**
|
||||
* Reference to entity lookup table.
|
||||
* @protected
|
||||
*/
|
||||
var $_entity_lookup;
|
||||
|
||||
/**
|
||||
* Callback regex string for parsing entities.
|
||||
* @protected
|
||||
*/
|
||||
var $_substituteEntitiesRegex =
|
||||
'/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z]+));?/';
|
||||
// 1. hex 2. dec 3. string
|
||||
|
||||
|
||||
/**
|
||||
* Decimal to parsed string conversion table for special entities.
|
||||
* @protected
|
||||
*/
|
||||
var $_special_dec2str =
|
||||
array(
|
||||
34 => '"',
|
||||
38 => '&',
|
||||
39 => "'",
|
||||
60 => '<',
|
||||
62 => '>'
|
||||
);
|
||||
|
||||
/**
|
||||
* Stripped entity names to decimal conversion table for special entities.
|
||||
* @protected
|
||||
*/
|
||||
var $_special_ent2dec =
|
||||
array(
|
||||
'quot' => 34,
|
||||
'amp' => 38,
|
||||
'lt' => 60,
|
||||
'gt' => 62
|
||||
);
|
||||
|
||||
/**
|
||||
* Substitutes non-special entities with their parsed equivalents. Since
|
||||
* running this whenever you have parsed character is t3h 5uck, we run
|
||||
* it before everything else.
|
||||
*
|
||||
* @protected
|
||||
* @param $string String to have non-special entities parsed.
|
||||
* @returns Parsed string.
|
||||
*/
|
||||
function substituteNonSpecialEntities($string) {
|
||||
// it will try to detect missing semicolons, but don't rely on it
|
||||
return preg_replace_callback(
|
||||
$this->_substituteEntitiesRegex,
|
||||
array($this, 'nonSpecialEntityCallback'),
|
||||
$string
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Callback function for substituteNonSpecialEntities() that does the work.
|
||||
*
|
||||
* @warning Though this is public in order to let the callback happen,
|
||||
* calling it directly is not recommended.
|
||||
* @note Based on Feyd's function at
|
||||
* <http://forums.devnetwork.net/viewtopic.php?p=191404#191404>,
|
||||
* which is in public domain.
|
||||
* @note While we're going to do code point parsing anyway, a good
|
||||
* optimization would be to refuse to translate code points that
|
||||
* are non-SGML characters. However, this could lead to duplication.
|
||||
* @note This function is heavily intimate with the inner workings of
|
||||
* UTF-8 and would also be well suited in the Encoder class (or at
|
||||
* least deferring some processing to it). This is also very
|
||||
* similar to the unichr function in
|
||||
* maintenance/generate-entity-file.php (although this is superior,
|
||||
* due to its sanity checks).
|
||||
* @param $matches PCRE matches array, with 0 the entire match, and
|
||||
* either index 1, 2 or 3 set with a hex value, dec value,
|
||||
* or string (respectively).
|
||||
* @returns Replacement string.
|
||||
*/
|
||||
|
||||
// +----------+----------+----------+----------+
|
||||
// | 33222222 | 22221111 | 111111 | |
|
||||
// | 10987654 | 32109876 | 54321098 | 76543210 | bit
|
||||
// +----------+----------+----------+----------+
|
||||
// | | | | 0xxxxxxx | 1 byte 0x00000000..0x0000007F
|
||||
// | | | 110yyyyy | 10xxxxxx | 2 byte 0x00000080..0x000007FF
|
||||
// | | 1110zzzz | 10yyyyyy | 10xxxxxx | 3 byte 0x00000800..0x0000FFFF
|
||||
// | 11110www | 10wwzzzz | 10yyyyyy | 10xxxxxx | 4 byte 0x00010000..0x0010FFFF
|
||||
// +----------+----------+----------+----------+
|
||||
// | 00000000 | 00011111 | 11111111 | 11111111 | Theoretical upper limit of legal scalars: 2097151 (0x001FFFFF)
|
||||
// | 00000000 | 00010000 | 11111111 | 11111111 | Defined upper limit of legal scalar codes
|
||||
// +----------+----------+----------+----------+
|
||||
|
||||
function nonSpecialEntityCallback($matches) {
|
||||
// replaces all but big five
|
||||
$entity = $matches[0];
|
||||
$is_num = (@$matches[0][1] === '#');
|
||||
if ($is_num) {
|
||||
$is_hex = (@$entity[2] === 'x');
|
||||
$code = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
|
||||
|
||||
// abort for special characters
|
||||
if (isset($this->_special_dec2str[$code])) return $entity;
|
||||
|
||||
return HTMLPurifier_Encoder::unichr($code);
|
||||
} else {
|
||||
if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
|
||||
if (!$this->_entity_lookup) {
|
||||
require_once 'HTMLPurifier/EntityLookup.php';
|
||||
$this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
|
||||
}
|
||||
if (isset($this->_entity_lookup->table[$matches[3]])) {
|
||||
return $this->_entity_lookup->table[$matches[3]];
|
||||
} else {
|
||||
return $entity;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Substitutes only special entities with their parsed equivalents.
|
||||
*
|
||||
* @notice We try to avoid calling this function because otherwise, it
|
||||
* would have to be called a lot (for every parsed section).
|
||||
*
|
||||
* @protected
|
||||
* @param $string String to have non-special entities parsed.
|
||||
* @returns Parsed string.
|
||||
*/
|
||||
function substituteSpecialEntities($string) {
|
||||
return preg_replace_callback(
|
||||
$this->_substituteEntitiesRegex,
|
||||
array($this, 'specialEntityCallback'),
|
||||
$string);
|
||||
}
|
||||
|
||||
/**
|
||||
* Callback function for substituteSpecialEntities() that does the work.
|
||||
*
|
||||
* This callback has same syntax as nonSpecialEntityCallback().
|
||||
*
|
||||
* @warning Though this is public in order to let the callback happen,
|
||||
* calling it directly is not recommended.
|
||||
* @param $matches PCRE-style matches array, with 0 the entire match, and
|
||||
* either index 1, 2 or 3 set with a hex value, dec value,
|
||||
* or string (respectively).
|
||||
* @returns Replacement string.
|
||||
*/
|
||||
function specialEntityCallback($matches) {
|
||||
$entity = $matches[0];
|
||||
$is_num = (@$matches[0][1] === '#');
|
||||
if ($is_num) {
|
||||
$is_hex = (@$entity[2] === 'x');
|
||||
$int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
|
||||
return isset($this->_special_dec2str[$int]) ?
|
||||
$this->_special_dec2str[$int] :
|
||||
$entity;
|
||||
} else {
|
||||
return isset($this->_special_ent2dec[$matches[3]]) ?
|
||||
$this->_special_ent2dec[$matches[3]] :
|
||||
$entity;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -2,35 +2,124 @@
|
||||
|
||||
// pretty-printing with indentation would be pretty cool
|
||||
|
||||
require_once 'HTMLPurifier/Lexer.php';
|
||||
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'Core', 'CleanUTF8DuringGeneration', false, 'bool',
|
||||
'When true, HTMLPurifier_Generator will also check all strings it '.
|
||||
'escapes for UTF-8 well-formedness as a defense in depth measure. '.
|
||||
'This could cause a considerable performance impact, and is not '.
|
||||
'strictly necessary due to the fact that the Lexers should have '.
|
||||
'ensured that all the UTF-8 strings were well-formed. Note that '.
|
||||
'the configuration value is only read at the beginning of '.
|
||||
'generateFromTokens.'
|
||||
);
|
||||
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'Core', 'XHTML', true, 'bool',
|
||||
'Determines whether or not output is XHTML or not. When disabled, HTML '.
|
||||
'Purifier goes into HTML 4.01 removes XHTML-specific markup constructs, '.
|
||||
'such as boolean attribute expansion and trailing slashes in empty tags. '.
|
||||
'This directive was available since 1.1.'
|
||||
);
|
||||
|
||||
// extension constraints could be factored into ConfigSchema
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'Core', 'TidyFormat', false, 'bool',
|
||||
'<p>Determines whether or not to run Tidy on the final output for pretty '.
|
||||
'formatting reasons, such as indentation and wrap.</p><p>This can greatly '.
|
||||
'improve readability for editors who are hand-editing the HTML, but is '.
|
||||
'by no means necessary as HTML Purifier has already fixed all major '.
|
||||
'errors the HTML may have had. Tidy is a non-default extension, and this directive '.
|
||||
'will silently fail if Tidy is not available.</p><p>If you are looking to make '.
|
||||
'the overall look of your page\'s source better, I recommend running Tidy '.
|
||||
'on the entire page rather than just user-content (after all, the '.
|
||||
'indentation relative to the containing blocks will be incorrect).</p><p>This '.
|
||||
'directive was available since 1.1.1.</p>'
|
||||
);
|
||||
|
||||
/**
|
||||
* Generates HTML from tokens.
|
||||
*/
|
||||
class HTMLPurifier_Generator
|
||||
{
|
||||
|
||||
// only unit tests may omit configuration: internals MUST pass config
|
||||
/**
|
||||
* Bool cache of %Core.CleanUTF8DuringGeneration
|
||||
* @private
|
||||
*/
|
||||
var $_clean_utf8 = false;
|
||||
|
||||
/**
|
||||
* Bool cache of %Core.XHTML
|
||||
*/
|
||||
var $_xhtml = true;
|
||||
|
||||
/**
|
||||
* Generates HTML from an array of tokens.
|
||||
* @param $tokens Array of HTMLPurifier_Token
|
||||
* @param $config HTMLPurifier_Config object
|
||||
* @return Generated HTML
|
||||
* @note Only unit tests may omit configuration: internals MUST pass config
|
||||
*/
|
||||
function generateFromTokens($tokens, $config = null) {
|
||||
$html = '';
|
||||
if (!$config) $config = HTMLPurifier_Config::createDefault();
|
||||
$this->_clean_utf8 = $config->get('Core', 'CleanUTF8DuringGeneration');
|
||||
$this->_xhtml = $config->get('Core', 'XHTML');
|
||||
if (!$tokens) return '';
|
||||
foreach ($tokens as $token) {
|
||||
$html .= $this->generateFromToken($token, $config);
|
||||
$html .= $this->generateFromToken($token);
|
||||
}
|
||||
if ($config->get('Core', 'TidyFormat') && extension_loaded('tidy')) {
|
||||
|
||||
$tidy_options = array(
|
||||
'indent'=> true,
|
||||
'output-xhtml' => $this->_xhtml,
|
||||
'show-body-only' => true,
|
||||
'indent-spaces' => 2,
|
||||
'wrap' => 68,
|
||||
);
|
||||
if (version_compare(PHP_VERSION, '5', '<')) {
|
||||
tidy_set_encoding('utf8');
|
||||
foreach ($tidy_options as $key => $value) {
|
||||
tidy_setopt($key, $value);
|
||||
}
|
||||
tidy_parse_string($html);
|
||||
tidy_clean_repair();
|
||||
$html = tidy_get_output();
|
||||
} else {
|
||||
$tidy = new Tidy;
|
||||
$tidy->parseString($html, $tidy_options, 'utf8');
|
||||
$tidy->cleanRepair();
|
||||
$html = (string) $tidy;
|
||||
}
|
||||
}
|
||||
return $html;
|
||||
}
|
||||
|
||||
function generateFromToken($token, $config) {
|
||||
/**
|
||||
* Generates HTML from a single token.
|
||||
* @param $token HTMLPurifier_Token object.
|
||||
* @return Generated HTML
|
||||
*/
|
||||
function generateFromToken($token) {
|
||||
if (!isset($token->type)) return '';
|
||||
if ($token->type == 'start') {
|
||||
$attr = $this->generateAttributes($token->attributes, $config);
|
||||
$attr = $this->generateAttributes($token->attributes);
|
||||
return '<' . $token->name . ($attr ? ' ' : '') . $attr . '>';
|
||||
|
||||
} elseif ($token->type == 'end') {
|
||||
return '</' . $token->name . '>';
|
||||
|
||||
} elseif ($token->type == 'empty') {
|
||||
$attr = $this->generateAttributes($token->attributes, $config);
|
||||
return '<' . $token->name . ($attr ? ' ' : '') . $attr . ' />';
|
||||
$attr = $this->generateAttributes($token->attributes);
|
||||
return '<' . $token->name . ($attr ? ' ' : '') . $attr .
|
||||
( $this->_xhtml ? ' /': '' )
|
||||
. '>';
|
||||
|
||||
} elseif ($token->type == 'text') {
|
||||
return htmlspecialchars($token->data, ENT_COMPAT, 'UTF-8');
|
||||
return $this->escape($token->data);
|
||||
|
||||
} else {
|
||||
return '';
|
||||
@@ -38,14 +127,34 @@ class HTMLPurifier_Generator
|
||||
}
|
||||
}
|
||||
|
||||
function generateAttributes($assoc_array_of_attributes, $config) {
|
||||
/**
|
||||
* Generates attribute declarations from attribute array.
|
||||
* @param $assoc_array_of_attributes Attribute array
|
||||
* @return Generate HTML fragment for insertion.
|
||||
*/
|
||||
function generateAttributes($assoc_array_of_attributes) {
|
||||
$html = '';
|
||||
foreach ($assoc_array_of_attributes as $key => $value) {
|
||||
$html .= $key.'="'.htmlspecialchars($value, ENT_COMPAT, 'UTF-8').'" ';
|
||||
if (!$this->_xhtml) {
|
||||
// remove namespaced attributes
|
||||
if (strpos($key, ':') !== false) continue;
|
||||
// also needed: check for attribute minimization
|
||||
}
|
||||
$html .= $key.'="'.$this->escape($value).'" ';
|
||||
}
|
||||
return rtrim($html);
|
||||
}
|
||||
|
||||
/**
|
||||
* Escapes raw text data.
|
||||
* @param $string String data to escape for HTML.
|
||||
* @return String escaped data.
|
||||
*/
|
||||
function escape($string) {
|
||||
if ($this->_clean_utf8) $string = HTMLPurifier_Lexer::cleanUTF8($string);
|
||||
return htmlspecialchars($string, ENT_COMPAT, 'UTF-8');
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -9,7 +9,7 @@ require_once 'HTMLPurifier/AttrDef.php';
|
||||
require_once 'HTMLPurifier/AttrDef/Pixels.php';
|
||||
require_once 'HTMLPurifier/AttrDef/Length.php';
|
||||
require_once 'HTMLPurifier/AttrDef/MultiLength.php';
|
||||
require_once 'HTMLPurifier/AttrDef/NumberSpan.php';
|
||||
require_once 'HTMLPurifier/AttrDef/Integer.php';
|
||||
require_once 'HTMLPurifier/AttrDef/URI.php';
|
||||
require_once 'HTMLPurifier/AttrDef/CSS.php';
|
||||
require_once 'HTMLPurifier/AttrTransform.php';
|
||||
@@ -30,12 +30,8 @@ require_once 'HTMLPurifier/TagTransform.php';
|
||||
* each allowed element. It also contains special use information (always
|
||||
* prefixed by info) for intelligent tag closing and global attributes.
|
||||
*
|
||||
* Planned improvements include attribute transformation objects as well as
|
||||
* migration of auto-tag-closing from HTMLPurifier_Strategy_MakeWellFormed
|
||||
* (these can likely just be extensions of ElementDef).
|
||||
*
|
||||
* After development drops off, the definition generation will be moved to
|
||||
* a maintenance script and we will stipulate that definition be created
|
||||
* For optimization, the definition generation may be moved to
|
||||
* a maintenance script and stipulate that definition be created
|
||||
* by a factory method that unserializes a serialized version of Definition.
|
||||
* Customization would entail copying the maintenance script, making the
|
||||
* necessary changes, generating the serialized object, and then hooking it
|
||||
@@ -46,40 +42,46 @@ require_once 'HTMLPurifier/TagTransform.php';
|
||||
class HTMLPurifier_HTMLDefinition
|
||||
{
|
||||
|
||||
/**
|
||||
* Associative array of element names to HTMLPurifier_ElementDef
|
||||
* @public
|
||||
*/
|
||||
var $info = array();
|
||||
|
||||
// used solely by HTMLPurifier_Strategy_ValidateAttributes
|
||||
/**
|
||||
* Associative array of global attribute name to attribute definition.
|
||||
* @public
|
||||
*/
|
||||
var $info_global_attr = array();
|
||||
|
||||
// used solely by HTMLPurifier_Strategy_FixNesting
|
||||
/**
|
||||
* String name of parent element HTML will be going into.
|
||||
* @public
|
||||
*/
|
||||
var $info_parent = 'div';
|
||||
|
||||
// used solely by HTMLPurifier_Strategy_RemoveForeignElements
|
||||
/**
|
||||
* Associative array of deprecated tag name to HTMLPurifier_TagTransform
|
||||
* @public
|
||||
*/
|
||||
var $info_tag_transform = array();
|
||||
|
||||
// used solely by HTMLPurifier_Strategy_ValidateAttributes
|
||||
/**
|
||||
* List of HTMLPurifier_AttrTransform to be performed before validation.
|
||||
* @public
|
||||
*/
|
||||
var $info_attr_transform_pre = array();
|
||||
|
||||
/**
|
||||
* List of HTMLPurifier_AttrTransform to be performed after validation/
|
||||
* @public
|
||||
*/
|
||||
var $info_attr_transform_post = array();
|
||||
|
||||
// WARNING! Prototype is not passed by reference, so in order to get
|
||||
// a copy of the real one, you'll have to destroy your copy and
|
||||
// use instance() to get it.
|
||||
// Usually, however, modifying the returned definition (reference) should be
|
||||
// sufficient
|
||||
function &instance($prototype = null) {
|
||||
static $instance = null;
|
||||
if ($prototype) {
|
||||
$instance = $prototype;
|
||||
} elseif (!$instance) {
|
||||
$instance = new HTMLPurifier_HTMLDefinition();
|
||||
$instance->setup();
|
||||
}
|
||||
return $instance;
|
||||
}
|
||||
|
||||
function HTMLPurifier_HTMLDefinition() {}
|
||||
|
||||
function setup() {
|
||||
/**
|
||||
* Initializes the definition, the meat of the class.
|
||||
*/
|
||||
function setup($config) {
|
||||
|
||||
// emulates the structure of the DTD
|
||||
// these are condensed, however, with bad stuff taken out
|
||||
@@ -130,6 +132,7 @@ class HTMLPurifier_HTMLDefinition
|
||||
$e_misc = "$e_misc_inline";
|
||||
$e_inline = "a | $e_special | $e_fontstyle | $e_phrase".
|
||||
" | $e_inline_forms";
|
||||
// pseudo-property we created for convenience, see later on
|
||||
$e__inline = "#PCDATA | $e_inline | $e_misc_inline";
|
||||
// note the casing
|
||||
$e_Inline = new HTMLPurifier_ChildDef_Optional($e__inline);
|
||||
@@ -206,8 +209,7 @@ class HTMLPurifier_HTMLDefinition
|
||||
|
||||
$this->info['a']->child = $e_a_content;
|
||||
|
||||
$this->info['table']->child = new HTMLPurifier_ChildDef_Custom(
|
||||
'(caption?, (col*|colgroup*), thead?, tfoot?, (tbody+|tr+))');
|
||||
$this->info['table']->child = new HTMLPurifier_ChildDef_Table();
|
||||
|
||||
// not a real entity, watch the double underscore
|
||||
$e__row = new HTMLPurifier_ChildDef_Required('tr');
|
||||
@@ -309,7 +311,7 @@ class HTMLPurifier_HTMLDefinition
|
||||
$this->info['col']->attr['width'] =
|
||||
$this->info['colgroup']->attr['width'] = $e_MultiLength;
|
||||
|
||||
$e__NumberSpan = new HTMLPurifier_AttrDef_NumberSpan();
|
||||
$e__NumberSpan = new HTMLPurifier_AttrDef_Integer(false, false, true);
|
||||
$this->info['colgroup']->attr['span'] =
|
||||
$this->info['col']->attr['span'] =
|
||||
$this->info['td']->attr['rowspan'] =
|
||||
@@ -394,15 +396,52 @@ class HTMLPurifier_HTMLDefinition
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Structure that stores an element definition.
|
||||
*/
|
||||
class HTMLPurifier_ElementDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Associative array of attribute name to HTMLPurifier_AttrDef
|
||||
* @public
|
||||
*/
|
||||
var $attr = array();
|
||||
|
||||
/**
|
||||
* List of tag's HTMLPurifier_AttrTransform to be done before validation
|
||||
* @public
|
||||
*/
|
||||
var $attr_transform_pre = array();
|
||||
|
||||
/**
|
||||
* List of tag's HTMLPurifier_AttrTransform to be done after validation
|
||||
* @public
|
||||
*/
|
||||
var $attr_transform_post = array();
|
||||
|
||||
/**
|
||||
* Lookup table of tags that close this tag.
|
||||
* @public
|
||||
*/
|
||||
var $auto_close = array();
|
||||
|
||||
/**
|
||||
* HTMLPurifier_ChildDef of this tag.
|
||||
* @public
|
||||
*/
|
||||
var $child;
|
||||
|
||||
/**
|
||||
* Type of the tag: inline or block or unknown?
|
||||
* @public
|
||||
*/
|
||||
var $type = 'unknown';
|
||||
|
||||
/**
|
||||
* Lookup table of tags excluded from all descendants of this tag.
|
||||
* @public
|
||||
*/
|
||||
var $excludes = array();
|
||||
|
||||
}
|
||||
|
@@ -1,15 +1,33 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Component of HTMLPurifier_AttrContext that accumulates IDs to prevent dupes
|
||||
* @note In Slashdot-speak, dupe means duplicate.
|
||||
*/
|
||||
class HTMLPurifier_IDAccumulator
|
||||
{
|
||||
|
||||
/**
|
||||
* Lookup table of IDs we've accumulated.
|
||||
* @public
|
||||
*/
|
||||
var $ids = array();
|
||||
|
||||
/**
|
||||
* Add an ID to the lookup table.
|
||||
* @param $id ID to be added.
|
||||
* @return Bool status, true if success, false if there's a dupe
|
||||
*/
|
||||
function add($id) {
|
||||
if (isset($this->ids[$id])) return false;
|
||||
return $this->ids[$id] = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Load a list of IDs into the lookup table
|
||||
* @param $array_of_ids Array of IDs to load
|
||||
* @note This function doesn't care about duplicates
|
||||
*/
|
||||
function load($array_of_ids) {
|
||||
foreach ($array_of_ids as $id) {
|
||||
$this->ids[$id] = true;
|
||||
|
@@ -1,13 +1,14 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/Token.php';
|
||||
require_once 'HTMLPurifier/Encoder.php';
|
||||
require_once 'HTMLPurifier/EntityParser.php';
|
||||
|
||||
HTMLPurifier_ConfigDef::define(
|
||||
'Core', 'AcceptFullDocuments', true,
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'Core', 'AcceptFullDocuments', true, 'bool',
|
||||
'This parameter determines whether or not the filter should accept full '.
|
||||
'HTML documents, not just HTML fragments. When on, it will '.
|
||||
'drop all sections except the content between body. Depending on '.
|
||||
'the implementation in use, this may speed up document parse times.'
|
||||
'drop all sections except the content between body.'
|
||||
);
|
||||
|
||||
/**
|
||||
@@ -54,6 +55,13 @@ HTMLPurifier_ConfigDef::define(
|
||||
class HTMLPurifier_Lexer
|
||||
{
|
||||
|
||||
function HTMLPurifier_Lexer() {
|
||||
$this->_encoder = new HTMLPurifier_Encoder();
|
||||
$this->_entity_parser = new HTMLPurifier_EntityParser();
|
||||
}
|
||||
|
||||
var $_encoder;
|
||||
|
||||
/**
|
||||
* Lexes an HTML string into tokens.
|
||||
*
|
||||
@@ -101,112 +109,6 @@ class HTMLPurifier_Lexer
|
||||
return $lexer;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decimal to parsed string conversion table for special entities.
|
||||
* @protected
|
||||
*/
|
||||
var $_special_dec2str =
|
||||
array(
|
||||
34 => '"',
|
||||
38 => '&',
|
||||
39 => "'",
|
||||
60 => '<',
|
||||
62 => '>'
|
||||
);
|
||||
|
||||
/**
|
||||
* Stripped entity names to decimal conversion table for special entities.
|
||||
* @protected
|
||||
*/
|
||||
var $_special_ent2dec =
|
||||
array(
|
||||
'quot' => 34,
|
||||
'amp' => 38,
|
||||
'lt' => 60,
|
||||
'gt' => 62
|
||||
);
|
||||
|
||||
/**
|
||||
* Most common entity to raw value conversion table for special entities.
|
||||
* @protected
|
||||
*/
|
||||
var $_special_entity2str =
|
||||
array(
|
||||
'"' => '"',
|
||||
'&' => '&',
|
||||
'<' => '<',
|
||||
'>' => '>',
|
||||
''' => "'",
|
||||
''' => "'",
|
||||
''' => "'"
|
||||
);
|
||||
|
||||
/**
|
||||
* Callback regex string for parsing entities.
|
||||
* @protected
|
||||
*/
|
||||
var $_substituteEntitiesRegex =
|
||||
'/&(?:[#]x([a-fA-F0-9]+)|[#]0*(\d+)|([A-Za-z]+));?/';
|
||||
// 1. hex 2. dec 3. string
|
||||
|
||||
/**
|
||||
* Substitutes non-special entities with their parsed equivalents. Since
|
||||
* running this whenever you have parsed character is t3h 5uck, we run
|
||||
* it before everything else.
|
||||
*
|
||||
* @protected
|
||||
* @param $string String to have non-special entities parsed.
|
||||
* @returns Parsed string.
|
||||
*/
|
||||
function substituteNonSpecialEntities($string) {
|
||||
// it will try to detect missing semicolons, but don't rely on it
|
||||
return preg_replace_callback(
|
||||
$this->_substituteEntitiesRegex,
|
||||
array($this, 'nonSpecialEntityCallback'),
|
||||
$string
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Callback function for substituteNonSpecialEntities() that does the work.
|
||||
*
|
||||
* @warning Though this is public in order to let the callback happen,
|
||||
* calling it directly is not recommended.
|
||||
* @param $matches PCRE matches array, with 0 the entire match, and
|
||||
* either index 1, 2 or 3 set with a hex value, dec value,
|
||||
* or string (respectively).
|
||||
* @returns Replacement string.
|
||||
* @todo Implement string translations
|
||||
*/
|
||||
function nonSpecialEntityCallback($matches) {
|
||||
// replaces all but big five
|
||||
$entity = $matches[0];
|
||||
$is_num = (@$matches[0][1] === '#');
|
||||
if ($is_num) {
|
||||
$is_hex = (@$entity[2] === 'x');
|
||||
$int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
|
||||
if (isset($this->_special_dec2str[$int])) return $entity;
|
||||
return chr($int);
|
||||
} else {
|
||||
if (isset($this->_special_ent2dec[$matches[3]])) return $entity;
|
||||
if (!$this->_entity_lookup) {
|
||||
require_once 'HTMLPurifier/EntityLookup.php';
|
||||
$this->_entity_lookup = HTMLPurifier_EntityLookup::instance();
|
||||
}
|
||||
if (isset($this->_entity_lookup->table[$matches[3]])) {
|
||||
return $this->_entity_lookup->table[$matches[3]];
|
||||
} else {
|
||||
return $entity;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Contains a copy of the EntityLookup table.
|
||||
* @protected
|
||||
*/
|
||||
var $_entity_lookup;
|
||||
|
||||
/**
|
||||
* Translates CDATA sections into regular sections (through escaping).
|
||||
*
|
||||
@@ -236,13 +138,37 @@ class HTMLPurifier_Lexer
|
||||
return htmlspecialchars($matches[1], ENT_COMPAT, 'UTF-8');
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes a piece of HTML and normalizes it by converting entities, fixing
|
||||
* encoding, extracting bits, and other good stuff.
|
||||
*/
|
||||
function normalize($html, $config) {
|
||||
|
||||
// extract body from document if applicable
|
||||
if ($config->get('Core', 'AcceptFullDocuments')) {
|
||||
$html = $this->extractBody($html);
|
||||
}
|
||||
|
||||
// escape CDATA
|
||||
$html = $this->escapeCDATA($html);
|
||||
|
||||
// expand entities that aren't the big five
|
||||
$html = $this->_entity_parser->substituteNonSpecialEntities($html);
|
||||
|
||||
// clean into wellformed UTF-8 string for an SGML context: this has
|
||||
// to be done after entity expansion because the entities sometimes
|
||||
// represent non-SGML characters (horror, horror!)
|
||||
$html = $this->_encoder->cleanUTF8($html);
|
||||
|
||||
return $html;
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes a string of HTML (fragment or document) and returns the content
|
||||
*/
|
||||
function extractBody($html, $return_bool = false) {
|
||||
function extractBody($html) {
|
||||
$matches = array();
|
||||
$result = preg_match('!<body[^>]*>(.+?)</body>!is', $html, $matches);
|
||||
if ($return_bool) return $result;
|
||||
if ($result) {
|
||||
return $matches[1];
|
||||
} else {
|
||||
|
@@ -12,15 +12,19 @@ require_once 'HTMLPurifier/TokenFactory.php';
|
||||
* documents, it performs twenty times faster than
|
||||
* HTMLPurifier_Lexer_DirectLex,and is the default choice for PHP 5.
|
||||
*
|
||||
* @notice
|
||||
* Any empty elements will have empty tokens associated with them, even if
|
||||
* @note Any empty elements will have empty tokens associated with them, even if
|
||||
* this is prohibited by the spec. This is cannot be fixed until the spec
|
||||
* comes into play.
|
||||
*
|
||||
* @todo Determine DOM's entity parsing behavior, point to local entity files
|
||||
* if necessary.
|
||||
* @todo Make div access less fragile, and refrain from preprocessing when
|
||||
* HTML tag and friends are already present.
|
||||
* @note PHP's DOM extension does not actually parse any entities, we use
|
||||
* our own function to do that.
|
||||
*
|
||||
* @warning DOM tends to drop whitespace, which may wreak havoc on indenting.
|
||||
* If this is a huge problem, due to the fact that HTML is hand
|
||||
* edited and youa re unable to get a parser cache that caches the
|
||||
* the output of HTML Purifier while keeping the original HTML lying
|
||||
* around, you may want to run Tidy on the resulting output or use
|
||||
* HTMLPurifier_DirectLex
|
||||
*/
|
||||
|
||||
class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
||||
@@ -30,38 +34,34 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
||||
|
||||
public function __construct() {
|
||||
// setup the factory
|
||||
parent::HTMLPurifier_Lexer();
|
||||
$this->factory = new HTMLPurifier_TokenFactory();
|
||||
}
|
||||
|
||||
public function tokenizeHTML($string, $config = null) {
|
||||
if (!$config) $config = HTMLPurifier_Config::createDefault();
|
||||
|
||||
if ($config->get('Core', 'AcceptFullDocuments')) {
|
||||
$is_full = $this->extractBody($string, true);
|
||||
}
|
||||
$string = $this->normalize($string, $config);
|
||||
|
||||
$doc = new DOMDocument();
|
||||
$doc->encoding = 'UTF-8'; // technically does nothing, but whatever
|
||||
|
||||
// replace and escape the CDATA sections, since parsing under HTML
|
||||
// mode won't get 'em.
|
||||
$string = $this->escapeCDATA($string);
|
||||
|
||||
if (!$is_full) {
|
||||
// preprocess string, essential for UTF-8
|
||||
$string =
|
||||
'<!DOCTYPE html '.
|
||||
'PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"'.
|
||||
'"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'.
|
||||
'<html><head>'.
|
||||
'<meta http-equiv="Content-Type" content="text/html;'.
|
||||
' charset=utf-8" />'.
|
||||
'</head><body>'.$string.'</body></html>';
|
||||
}
|
||||
'</head><body><div>'.$string.'</div></body></html>';
|
||||
|
||||
$doc = new DOMDocument();
|
||||
$doc->encoding = 'UTF-8'; // technically does nothing, but whatever
|
||||
@$doc->loadHTML($string); // mute all errors, handle it transparently
|
||||
|
||||
$tokens = array();
|
||||
$this->tokenizeDOM(
|
||||
$doc->childNodes->item(1)-> // html
|
||||
getElementsByTagName('body')->item(0) // body
|
||||
$doc->getElementsByTagName('html')->item(0)-> // html
|
||||
getElementsByTagName('body')->item(0)-> // body
|
||||
getElementsByTagName('div')->item(0) // div
|
||||
, $tokens);
|
||||
return $tokens;
|
||||
}
|
||||
@@ -79,32 +79,32 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
|
||||
protected function tokenizeDOM($node, &$tokens, $collect = false) {
|
||||
// recursive goodness!
|
||||
|
||||
// intercept non element nodes
|
||||
|
||||
if ( isset($node->data) ) {
|
||||
if ($node->nodeType === XML_TEXT_NODE ||
|
||||
$node->nodeType === XML_CDATA_SECTION_NODE) {
|
||||
$tokens[] = $this->factory->createText($node->data);
|
||||
} elseif ($node->nodeType === XML_COMMENT_NODE) {
|
||||
$tokens[] = $this->factory->createComment($node->data);
|
||||
}
|
||||
// quite possibly, the object wasn't handled, that's fine
|
||||
// intercept non element nodes. WE MUST catch all of them,
|
||||
// but we're not getting the character reference nodes because
|
||||
// those should have been preprocessed
|
||||
if ($node->nodeType === XML_TEXT_NODE ||
|
||||
$node->nodeType === XML_CDATA_SECTION_NODE) {
|
||||
$tokens[] = $this->factory->createText($node->data);
|
||||
return;
|
||||
} elseif ($node->nodeType === XML_COMMENT_NODE) {
|
||||
$tokens[] = $this->factory->createComment($node->data);
|
||||
return;
|
||||
}
|
||||
|
||||
$attr = $node->hasAttributes() ?
|
||||
$this->transformAttrToAssoc($node->attributes) :
|
||||
array();
|
||||
|
||||
// We still have to make sure that the element actually IS empty
|
||||
if (!$node->childNodes->length) {
|
||||
if ($collect) {
|
||||
$tokens[] = $this->factory->createEmpty(
|
||||
$node->tagName,
|
||||
$this->transformAttrToAssoc($node->attributes)
|
||||
);
|
||||
$tokens[] = $this->factory->createEmpty($node->tagName, $attr);
|
||||
}
|
||||
} else {
|
||||
if ($collect) { // don't wrap on first iteration
|
||||
$tokens[] = $this->factory->createStart(
|
||||
$tag_name = $node->tagName, // somehow, it get's dropped
|
||||
$this->transformAttrToAssoc($node->attributes)
|
||||
$attr
|
||||
);
|
||||
}
|
||||
foreach ($node->childNodes as $node) {
|
||||
|
@@ -20,6 +20,21 @@ require_once 'HTMLPurifier/Lexer.php';
|
||||
class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
{
|
||||
|
||||
/**
|
||||
* Most common entity to raw value conversion table for special entities.
|
||||
* @protected
|
||||
*/
|
||||
var $_special_entity2str =
|
||||
array(
|
||||
'"' => '"',
|
||||
'&' => '&',
|
||||
'<' => '<',
|
||||
'>' => '>',
|
||||
''' => "'",
|
||||
''' => "'",
|
||||
''' => "'"
|
||||
);
|
||||
|
||||
/**
|
||||
* Parses special entities into the proper characters.
|
||||
*
|
||||
@@ -51,7 +66,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
if ($num_amp_2 <= $num_esc_amp) return $string;
|
||||
|
||||
// hmm... now we have some uncommon entities. Use the callback.
|
||||
$string = $this->substituteSpecialEntities($string);
|
||||
$string = $this->_entity_parser->substituteSpecialEntities($string);
|
||||
return $string;
|
||||
}
|
||||
|
||||
@@ -61,73 +76,16 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
*/
|
||||
var $_whitespace = "\x20\x09\x0D\x0A";
|
||||
|
||||
/**
|
||||
* Substitutes only special entities with their parsed equivalents.
|
||||
*
|
||||
* @notice We try to avoid calling this function because otherwise, it
|
||||
* would have to be called a lot (for every parsed section).
|
||||
*
|
||||
* @protected
|
||||
* @param $string String to have non-special entities parsed.
|
||||
* @returns Parsed string.
|
||||
*/
|
||||
function substituteSpecialEntities($string) {
|
||||
return preg_replace_callback(
|
||||
$this->_substituteEntitiesRegex,
|
||||
array('HTMLPurifier_Lexer_DirectLex', 'specialEntityCallback'),
|
||||
$string);
|
||||
}
|
||||
|
||||
/**
|
||||
* Callback function for substituteSpecialEntities() that does the work.
|
||||
*
|
||||
* This callback has same syntax as nonSpecialEntityCallback().
|
||||
*
|
||||
* @warning Though this is public in order to let the callback happen,
|
||||
* calling it directly is not recommended.
|
||||
* @param $matches PCRE-style matches array, with 0 the entire match, and
|
||||
* either index 1, 2 or 3 set with a hex value, dec value,
|
||||
* or string (respectively).
|
||||
* @returns Replacement string.
|
||||
*/
|
||||
function specialEntityCallback($matches) {
|
||||
$entity = $matches[0];
|
||||
$is_num = (@$matches[0][1] === '#');
|
||||
if ($is_num) {
|
||||
$is_hex = (@$entity[2] === 'x');
|
||||
$int = $is_hex ? hexdec($matches[1]) : (int) $matches[2];
|
||||
return isset($this->_special_dec2str[$int]) ?
|
||||
$this->_special_dec2str[$int] :
|
||||
$entity;
|
||||
} else {
|
||||
return isset($this->_special_ent2dec[$matches[3]]) ?
|
||||
$this->_special_ent2dec[$matches[3]] :
|
||||
$entity;
|
||||
}
|
||||
}
|
||||
|
||||
function tokenizeHTML($string, $config = null) {
|
||||
function tokenizeHTML($html, $config = null) {
|
||||
|
||||
if (!$config) $config = HTMLPurifier_Config::createDefault();
|
||||
|
||||
// some quick checking (if empty, return empty)
|
||||
$string = @ (string) $string;
|
||||
if ($string == '') return array();
|
||||
|
||||
if ($config->get('Core', 'AcceptFullDocuments')) {
|
||||
$string = $this->extractBody($string);
|
||||
}
|
||||
$html = $this->normalize($html, $config);
|
||||
|
||||
$cursor = 0; // our location in the text
|
||||
$inside_tag = false; // whether or not we're parsing the inside of a tag
|
||||
$array = array(); // result array
|
||||
|
||||
// escape CDATA
|
||||
$string = $this->escapeCDATA($string);
|
||||
|
||||
// expand entities THAT AREN'T THE BIG FIVE
|
||||
$string = $this->substituteNonSpecialEntities($string);
|
||||
|
||||
// infinite loop protection
|
||||
// has to be pretty big, since html docs can be big
|
||||
// we're allow two hundred thousand tags... more than enough?
|
||||
@@ -138,8 +96,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
// infinite loop protection
|
||||
if (++$loops > 200000) return array();
|
||||
|
||||
$position_next_lt = strpos($string, '<', $cursor);
|
||||
$position_next_gt = strpos($string, '>', $cursor);
|
||||
$position_next_lt = strpos($html, '<', $cursor);
|
||||
$position_next_gt = strpos($html, '>', $cursor);
|
||||
|
||||
// triggers on "<b>asdf</b>" but not "asdf <b></b>"
|
||||
if ($position_next_lt === $cursor) {
|
||||
@@ -153,7 +111,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
HTMLPurifier_Token_Text(
|
||||
$this->parseData(
|
||||
substr(
|
||||
$string, $cursor, $position_next_lt - $cursor
|
||||
$html, $cursor, $position_next_lt - $cursor
|
||||
)
|
||||
)
|
||||
);
|
||||
@@ -163,13 +121,13 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
} elseif (!$inside_tag) {
|
||||
// We are not inside tag but there are no more tags
|
||||
// If we're already at the end, break
|
||||
if ($cursor === strlen($string)) break;
|
||||
if ($cursor === strlen($html)) break;
|
||||
// Create Text of rest of string
|
||||
$array[] = new
|
||||
HTMLPurifier_Token_Text(
|
||||
$this->parseData(
|
||||
substr(
|
||||
$string, $cursor
|
||||
$html, $cursor
|
||||
)
|
||||
)
|
||||
);
|
||||
@@ -178,7 +136,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
// We are in tag and it is well formed
|
||||
// Grab the internals of the tag
|
||||
$strlen_segment = $position_next_gt - $cursor;
|
||||
$segment = substr($string, $cursor, $strlen_segment);
|
||||
$segment = substr($html, $cursor, $strlen_segment);
|
||||
|
||||
// Check if it's a comment
|
||||
if (
|
||||
@@ -259,7 +217,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
|
||||
HTMLPurifier_Token_Text(
|
||||
'<' .
|
||||
$this->parseData(
|
||||
substr($string, $cursor)
|
||||
substr($html, $cursor)
|
||||
)
|
||||
);
|
||||
break;
|
||||
|
@@ -29,23 +29,24 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
|
||||
*/
|
||||
var $tokens = array();
|
||||
|
||||
function tokenizeHTML($html, $config = null) {
|
||||
function tokenizeHTML($string, $config = null) {
|
||||
|
||||
$this->tokens = array();
|
||||
|
||||
if (!$config) $config = HTMLPurifier_Config::createDefault();
|
||||
$html = $this->escapeCDATA($html);
|
||||
if ($config->get('Core', 'AcceptFullDocuments')) {
|
||||
$html = $this->extractBody($html);
|
||||
}
|
||||
$html = $this->substituteNonSpecialEntities($html);
|
||||
$string = $this->normalize($string, $config);
|
||||
|
||||
$parser=& new XML_HTMLSax3();
|
||||
$parser->set_object($this);
|
||||
$parser->set_element_handler('openHandler','closeHandler');
|
||||
$parser->set_data_handler('dataHandler');
|
||||
$parser->set_escape_handler('escapeHandler');
|
||||
$parser->set_option('XML_OPTION_ENTITIES_PARSED', 1);
|
||||
$parser->parse($html);
|
||||
$tokens = $this->tokens;
|
||||
$this->tokens = array();
|
||||
return $tokens;
|
||||
|
||||
$parser->parse($string);
|
||||
|
||||
return $this->tokens;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
|
@@ -8,8 +8,8 @@
|
||||
* features, such as custom tags, custom parsing of text, etc.
|
||||
*/
|
||||
|
||||
HTMLPurifier_ConfigDef::define(
|
||||
'Core', 'EscapeInvalidTags', false,
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'Core', 'EscapeInvalidTags', false, 'bool',
|
||||
'When true, invalid tags will be written back to the document as plain '.
|
||||
'text. Otherwise, they are silently dropped.'
|
||||
);
|
||||
|
@@ -3,9 +3,15 @@
|
||||
require_once 'HTMLPurifier/Strategy.php';
|
||||
require_once 'HTMLPurifier/Config.php';
|
||||
|
||||
class HTMLPurifier_Strategy_Composite
|
||||
/**
|
||||
* Composite strategy that runs multiple strategies on tokens.
|
||||
*/
|
||||
class HTMLPurifier_Strategy_Composite extends HTMLPurifier_Strategy
|
||||
{
|
||||
|
||||
/**
|
||||
* List of strategies to run tokens through.
|
||||
*/
|
||||
var $strategies = array();
|
||||
|
||||
function HTMLPurifier_Strategy_Composite() {
|
||||
|
@@ -7,6 +7,9 @@ require_once 'HTMLPurifier/Strategy/MakeWellFormed.php';
|
||||
require_once 'HTMLPurifier/Strategy/FixNesting.php';
|
||||
require_once 'HTMLPurifier/Strategy/ValidateAttributes.php';
|
||||
|
||||
/**
|
||||
* Core strategy composed of the big four strategies.
|
||||
*/
|
||||
class HTMLPurifier_Strategy_Core extends HTMLPurifier_Strategy_Composite
|
||||
{
|
||||
|
||||
|
@@ -34,21 +34,18 @@ require_once 'HTMLPurifier/HTMLDefinition.php';
|
||||
class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
|
||||
{
|
||||
|
||||
var $definition;
|
||||
|
||||
function HTMLPurifier_Strategy_FixNesting() {
|
||||
$this->definition = HTMLPurifier_HTMLDefinition::instance();
|
||||
}
|
||||
|
||||
function execute($tokens, $config) {
|
||||
|
||||
//####################################################################//
|
||||
// Pre-processing
|
||||
|
||||
// get a copy of the HTML definition
|
||||
$definition = $config->getHTMLDefinition();
|
||||
|
||||
// insert implicit "parent" node, will be removed at end.
|
||||
// ! we might want to move this to configuration
|
||||
// DEFINITION CALL
|
||||
$parent_name = $this->definition->info_parent;
|
||||
$parent_name = $definition->info_parent;
|
||||
array_unshift($tokens, new HTMLPurifier_Token_Start($parent_name));
|
||||
$tokens[] = new HTMLPurifier_Token_End($parent_name);
|
||||
|
||||
@@ -104,7 +101,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
|
||||
if ($count = count($stack)) {
|
||||
$parent_index = $stack[$count-1];
|
||||
$parent_name = $tokens[$parent_index]->name;
|
||||
$parent_def = $this->definition->info[$parent_name];
|
||||
$parent_def = $definition->info[$parent_name];
|
||||
} else {
|
||||
// unknown info, it won't be used anyway
|
||||
$parent_index = $parent_name = $parent_def = null;
|
||||
@@ -143,7 +140,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
|
||||
$result = false;
|
||||
} else {
|
||||
// DEFINITION CALL
|
||||
$def = $this->definition->info[$tokens[$i]->name];
|
||||
$def = $definition->info[$tokens[$i]->name];
|
||||
$child_def = $def->child;
|
||||
|
||||
// have DTD child def validate children
|
||||
@@ -190,6 +187,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
|
||||
if (!$parent_def->child->allow_empty) {
|
||||
// we need to do a double-check
|
||||
$i = $parent_index;
|
||||
array_pop($stack);
|
||||
}
|
||||
|
||||
// PROJECTED OPTIMIZATION: Process all children elements before
|
||||
@@ -233,7 +231,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
|
||||
array_pop($stack);
|
||||
// pop an exclusion lookup off exclusion stack if
|
||||
// we ended node and that node had exclusions
|
||||
if ($this->definition->info[$tokens[$i]->name]->excludes) {
|
||||
if ($definition->info[$tokens[$i]->name]->excludes) {
|
||||
array_pop($exclude_stack);
|
||||
}
|
||||
}
|
||||
@@ -258,4 +256,4 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
|
||||
|
||||
}
|
||||
|
||||
?>
|
||||
?>
|
||||
|
@@ -4,18 +4,15 @@ require_once 'HTMLPurifier/Strategy.php';
|
||||
require_once 'HTMLPurifier/HTMLDefinition.php';
|
||||
require_once 'HTMLPurifier/Generator.php';
|
||||
|
||||
/**
|
||||
* Takes tokens makes them well-formed (balance end tags, etc.)
|
||||
*/
|
||||
class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
|
||||
{
|
||||
|
||||
var $generator;
|
||||
var $definition;
|
||||
|
||||
function HTMLPurifier_Strategy_MakeWellFormed() {
|
||||
$this->generator = new HTMLPurifier_Generator();
|
||||
$this->definition = HTMLPurifier_HTMLDefinition::instance();
|
||||
}
|
||||
|
||||
function execute($tokens, $config) {
|
||||
$definition = $config->getHTMLDefinition();
|
||||
$generator = new HTMLPurifier_Generator();
|
||||
$result = array();
|
||||
$current_nesting = array();
|
||||
$escape_invalid_tags = $config->get('Core', 'EscapeInvalidTags');
|
||||
@@ -26,7 +23,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
|
||||
}
|
||||
|
||||
// DEFINITION CALL
|
||||
$info = $this->definition->info[$token->name]->child;
|
||||
$info = $definition->info[$token->name]->child;
|
||||
|
||||
// test if it claims to be a start tag but is empty
|
||||
if ($info->type == 'empty' &&
|
||||
@@ -63,7 +60,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
|
||||
|
||||
$parent = array_pop($current_nesting);
|
||||
$parent_name = $parent->name;
|
||||
$parent_info = $this->definition->info[$parent_name];
|
||||
$parent_info = $definition->info[$parent_name];
|
||||
|
||||
if (isset($parent_info->auto_close[$token->name])) {
|
||||
$result[] = new HTMLPurifier_Token_End($parent_name);
|
||||
@@ -89,7 +86,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
|
||||
if (empty($current_nesting)) {
|
||||
if ($escape_invalid_tags) {
|
||||
$result[] = new HTMLPurifier_Token_Text(
|
||||
$this->generator->generateFromToken($token, $config)
|
||||
$generator->generateFromToken($token, $config)
|
||||
);
|
||||
}
|
||||
continue;
|
||||
@@ -126,7 +123,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
|
||||
if ($skipped_tags === false) {
|
||||
if ($escape_invalid_tags) {
|
||||
$result[] = new HTMLPurifier_Token_Text(
|
||||
$this->generator->generateFromToken($token, $config)
|
||||
$generator->generateFromToken($token, $config)
|
||||
);
|
||||
}
|
||||
continue;
|
||||
|
@@ -16,35 +16,28 @@ require_once 'HTMLPurifier/TagTransform.php';
|
||||
class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
|
||||
{
|
||||
|
||||
var $generator;
|
||||
var $definition;
|
||||
|
||||
function HTMLPurifier_Strategy_RemoveForeignElements() {
|
||||
$this->generator = new HTMLPurifier_Generator();
|
||||
$this->definition = HTMLPurifier_HTMLDefinition::instance();
|
||||
}
|
||||
|
||||
function execute($tokens, $config) {
|
||||
$definition = $config->getHTMLDefinition();
|
||||
$generator = new HTMLPurifier_Generator();
|
||||
$result = array();
|
||||
$escape_invalid_tags = $config->get('Core', 'EscapeInvalidTags');
|
||||
foreach($tokens as $token) {
|
||||
if (!empty( $token->is_tag )) {
|
||||
// DEFINITION CALL
|
||||
if (isset($this->definition->info[$token->name])) {
|
||||
if (isset($definition->info[$token->name])) {
|
||||
// leave untouched
|
||||
} elseif (
|
||||
isset($this->definition->info_tag_transform[$token->name])
|
||||
isset($definition->info_tag_transform[$token->name])
|
||||
) {
|
||||
// there is a transformation for this tag
|
||||
// DEFINITION CALL
|
||||
$token = $this->
|
||||
definition->
|
||||
info_tag_transform[$token->name]->
|
||||
transform($token);
|
||||
$token = $definition->
|
||||
info_tag_transform[$token->name]->
|
||||
transform($token);
|
||||
} elseif ($escape_invalid_tags) {
|
||||
// invalid tag, generate HTML and insert in
|
||||
$token = new HTMLPurifier_Token_Text(
|
||||
$this->generator->generateFromToken($token, $config)
|
||||
$generator->generateFromToken($token, $config)
|
||||
);
|
||||
} else {
|
||||
continue;
|
||||
|
@@ -3,11 +3,11 @@
|
||||
require_once 'HTMLPurifier/Strategy.php';
|
||||
require_once 'HTMLPurifier/HTMLDefinition.php';
|
||||
require_once 'HTMLPurifier/IDAccumulator.php';
|
||||
require_once 'HTMLPurifier/ConfigDef.php';
|
||||
require_once 'HTMLPurifier/ConfigSchema.php';
|
||||
require_once 'HTMLPurifier/AttrContext.php';
|
||||
|
||||
HTMLPurifier_ConfigDef::define(
|
||||
'Attr', 'IDBlacklist', array(),
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'Attr', 'IDBlacklist', array(), 'list',
|
||||
'Array of IDs not allowed in the document.');
|
||||
|
||||
/**
|
||||
@@ -17,14 +17,10 @@ HTMLPurifier_ConfigDef::define(
|
||||
class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
|
||||
{
|
||||
|
||||
var $definition;
|
||||
|
||||
function HTMLPurifier_Strategy_ValidateAttributes() {
|
||||
$this->definition = HTMLPurifier_HTMLDefinition::instance();
|
||||
}
|
||||
|
||||
function execute($tokens, $config) {
|
||||
|
||||
$definition = $config->getHTMLDefinition();
|
||||
|
||||
// setup StrategyContext
|
||||
$context = new HTMLPurifier_AttrContext();
|
||||
|
||||
@@ -36,7 +32,7 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
|
||||
|
||||
// create alias to global definition array, see also $defs
|
||||
// DEFINITION CALL
|
||||
$d_defs = $this->definition->info_global_attr;
|
||||
$d_defs = $definition->info_global_attr;
|
||||
|
||||
foreach ($tokens as $key => $token) {
|
||||
|
||||
@@ -50,14 +46,14 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
|
||||
// do global transformations (pre)
|
||||
// ex. <ELEMENT lang="fr"> to <ELEMENT lang="fr" xml:lang="fr">
|
||||
// DEFINITION CALL
|
||||
foreach ($this->definition->info_attr_transform_pre as $transform) {
|
||||
foreach ($definition->info_attr_transform_pre as $transform) {
|
||||
$attr = $transform->transform($attr, $config);
|
||||
}
|
||||
|
||||
// do local transformations only applicable to this element (pre)
|
||||
// ex. <p align="right"> to <p style="text-align:right;">
|
||||
// DEFINITION CALL
|
||||
foreach ($this->definition->info[$token->name]->attr_transform_pre
|
||||
foreach ($definition->info[$token->name]->attr_transform_pre
|
||||
as $transform
|
||||
) {
|
||||
$attr = $transform->transform($attr, $config);
|
||||
@@ -66,7 +62,7 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
|
||||
// create alias to this element's attribute definition array, see
|
||||
// also $d_defs (global attribute definition array)
|
||||
// DEFINITION CALL
|
||||
$defs = $this->definition->info[$token->name]->attr;
|
||||
$defs = $definition->info[$token->name]->attr;
|
||||
|
||||
// iterate through all the attribute keypairs
|
||||
// Watch out for name collisions: $key has previously been used
|
||||
@@ -116,10 +112,10 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
|
||||
}
|
||||
|
||||
// post transforms
|
||||
foreach ($this->definition->info_attr_transform_post as $transform) {
|
||||
foreach ($definition->info_attr_transform_post as $transform) {
|
||||
$attr = $transform->transform($attr, $config);
|
||||
}
|
||||
foreach ($this->definition->info[$token->name]->attr_transform_post as $transform) {
|
||||
foreach ($definition->info[$token->name]->attr_transform_post as $transform) {
|
||||
$attr = $transform->transform($attr, $config);
|
||||
}
|
||||
|
||||
|
@@ -2,20 +2,37 @@
|
||||
|
||||
require_once('HTMLPurifier/Token.php');
|
||||
|
||||
/**
|
||||
* Defines a mutation of an obsolete tag into a valid tag.
|
||||
*/
|
||||
class HTMLPurifier_TagTransform
|
||||
{
|
||||
|
||||
/**
|
||||
* Tag name to transform the tag to.
|
||||
* @public
|
||||
*/
|
||||
var $transform_to;
|
||||
|
||||
/**
|
||||
* Transforms the obsolete tag into the valid tag.
|
||||
* @param $tag Tag to be transformed.
|
||||
*/
|
||||
function transform($tag) {
|
||||
trigger_error('Call to abstract function', E_USER_ERROR);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Simple transformation, just change tag name to something else.
|
||||
*/
|
||||
class HTMLPurifier_TagTransform_Simple extends HTMLPurifier_TagTransform
|
||||
{
|
||||
|
||||
var $transform_to;
|
||||
|
||||
/**
|
||||
* @param $transform_to Tag name to transform to.
|
||||
*/
|
||||
function HTMLPurifier_TagTransform_Simple($transform_to) {
|
||||
$this->transform_to = $transform_to;
|
||||
}
|
||||
@@ -28,6 +45,12 @@ class HTMLPurifier_TagTransform_Simple extends HTMLPurifier_TagTransform
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Transforms CENTER tags into proper version (DIV with text-align CSS)
|
||||
*
|
||||
* Takes a CENTER tag, parses the align attribute, and then if it's valid
|
||||
* assigns it to the CSS property text-align.
|
||||
*/
|
||||
class HTMLPurifier_TagTransform_Center extends HTMLPurifier_TagTransform
|
||||
{
|
||||
var $transform_to = 'div';
|
||||
@@ -51,6 +74,18 @@ class HTMLPurifier_TagTransform_Center extends HTMLPurifier_TagTransform
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Transforms FONT tags to the proper form (SPAN with CSS styling)
|
||||
*
|
||||
* This transformation takes the three proprietary attributes of FONT and
|
||||
* transforms them into their corresponding CSS attributes. These are color,
|
||||
* face, and size.
|
||||
*
|
||||
* @note Size is an interesting case because it doesn't map cleanly to CSS.
|
||||
* Thanks to
|
||||
* http://style.cleverchimp.com/font_size_intervals/altintervals.html
|
||||
* for reasonable mappings.
|
||||
*/
|
||||
class HTMLPurifier_TagTransform_Font extends HTMLPurifier_TagTransform
|
||||
{
|
||||
|
||||
@@ -78,8 +113,6 @@ class HTMLPurifier_TagTransform_Font extends HTMLPurifier_TagTransform
|
||||
return $new_tag;
|
||||
}
|
||||
|
||||
// font size lookup table based off of:
|
||||
// http://style.cleverchimp.com/font_size_intervals/altintervals.html
|
||||
$attributes = $tag->attributes;
|
||||
$prepend_style = '';
|
||||
|
||||
|
@@ -11,6 +11,14 @@
|
||||
*/
|
||||
class HTMLPurifier_Token {
|
||||
var $type; /**< Type of node to bypass <tt>is_a()</tt>. @public */
|
||||
|
||||
/**
|
||||
* Copies the tag into a new one (clone substitute).
|
||||
* @return Copied token
|
||||
*/
|
||||
function copy() {
|
||||
trigger_error('Cannot copy abstract class', E_USER_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@@ -2,12 +2,30 @@
|
||||
|
||||
require_once 'HTMLPurifier/Token.php';
|
||||
|
||||
/**
|
||||
* Factory for token generation (PHP 5 only).
|
||||
*
|
||||
* @note Doing some benchmarking indicates that the new operator is much
|
||||
* slower than the clone operator (even discounting the cost of the
|
||||
* constructor). This class is for that optimization. We may want to
|
||||
* consider porting this to PHP 4 by virtue of the fact it makes the code
|
||||
* easier to read. Other then that, there's not much point as we don't
|
||||
* maintain parallel HTMLPurifier_Token hierarchies (the main reason why
|
||||
* you'd want to use an abstract factory).
|
||||
*/
|
||||
class HTMLPurifier_TokenFactory
|
||||
{
|
||||
|
||||
/**
|
||||
* Prototypes that will be cloned.
|
||||
* @private
|
||||
*/
|
||||
// p stands for prototype
|
||||
private $p_start, $p_end, $p_empty, $p_text, $p_comment;
|
||||
|
||||
/**
|
||||
* Generates blank prototypes for cloning.
|
||||
*/
|
||||
public function __construct() {
|
||||
$this->p_start = new HTMLPurifier_Token_Start('', array());
|
||||
$this->p_end = new HTMLPurifier_Token_End('');
|
||||
@@ -16,30 +34,57 @@ class HTMLPurifier_TokenFactory
|
||||
$this->p_comment= new HTMLPurifier_Token_Comment('');
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a HTMLPurifier_Token_Start.
|
||||
* @param $name Tag name
|
||||
* @param $attribute Associative array of attributes
|
||||
* @return Generated HTMLPurifier_Token_Start
|
||||
*/
|
||||
public function createStart($name, $attributes = array()) {
|
||||
$p = clone $this->p_start;
|
||||
$p->HTMLPurifier_Token_Tag($name, $attributes);
|
||||
return $p;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a HTMLPurifier_Token_End.
|
||||
* @param $name Tag name
|
||||
* @return Generated HTMLPurifier_Token_End
|
||||
*/
|
||||
public function createEnd($name) {
|
||||
$p = clone $this->p_end;
|
||||
$p->HTMLPurifier_Token_Tag($name);
|
||||
return $p;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a HTMLPurifier_Token_Empty.
|
||||
* @param $name Tag name
|
||||
* @param $attribute Associative array of attributes
|
||||
* @return Generated HTMLPurifier_Token_Empty
|
||||
*/
|
||||
public function createEmpty($name, $attributes = array()) {
|
||||
$p = clone $this->p_empty;
|
||||
$p->HTMLPurifier_Token_Tag($name, $attributes);
|
||||
return $p;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a HTMLPurifier_Token_Text.
|
||||
* @param $data Data of text token
|
||||
* @return Generated HTMLPurifier_Token_Text
|
||||
*/
|
||||
public function createText($data) {
|
||||
$p = clone $this->p_text;
|
||||
$p->HTMLPurifier_Token_Text($data);
|
||||
return $p;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a HTMLPurifier_Token_Comment.
|
||||
* @param $data Data of comment token
|
||||
* @return Generated HTMLPurifier_Token_Comment
|
||||
*/
|
||||
public function createComment($data) {
|
||||
$p = clone $this->p_comment;
|
||||
$p->HTMLPurifier_Token_Comment($data);
|
||||
|
@@ -1,10 +1,29 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Validator for the components of a URI for a specific scheme
|
||||
*/
|
||||
class HTMLPurifier_URIScheme
|
||||
{
|
||||
|
||||
/**
|
||||
* Scheme's default port (integer)
|
||||
* @public
|
||||
*/
|
||||
var $default_port = null;
|
||||
|
||||
/**
|
||||
* Validates the components of a URI
|
||||
* @note This implementation should be called by children if they define
|
||||
* a default port, as it does port processing.
|
||||
* @note Fragment is omitted as that is scheme independent
|
||||
* @param $userinfo User info found before at sign in authority
|
||||
* @param $host Hostname in authority
|
||||
* @param $port Port found after colon in authority
|
||||
* @param $path Path of URI
|
||||
* @param $query Query of URI, found after question mark
|
||||
* @param $config HTMLPurifier_Config object
|
||||
*/
|
||||
function validateComponents(
|
||||
$userinfo, $host, $port, $path, $query, $config
|
||||
) {
|
||||
|
@@ -2,6 +2,10 @@
|
||||
|
||||
require_once 'HTMLPurifier/URIScheme.php';
|
||||
|
||||
/**
|
||||
* Validates ftp (File Transfer Protocol) URIs as defined by generic RFC 1738.
|
||||
* @todo Typecode check on path
|
||||
*/
|
||||
class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {
|
||||
|
||||
var $default_port = 21;
|
||||
|
@@ -2,6 +2,9 @@
|
||||
|
||||
require_once 'HTMLPurifier/URIScheme.php';
|
||||
|
||||
/**
|
||||
* Validates http (HyperText Transfer Protocol) as defined by RFC 2616
|
||||
*/
|
||||
class HTMLPurifier_URIScheme_http extends HTMLPurifier_URIScheme {
|
||||
|
||||
var $default_port = 80;
|
||||
|
@@ -2,6 +2,9 @@
|
||||
|
||||
require_once 'HTMLPurifier/URIScheme/http.php';
|
||||
|
||||
/**
|
||||
* Validates https (Secure HTTP) according to http scheme.
|
||||
*/
|
||||
class HTMLPurifier_URIScheme_https extends HTMLPurifier_URIScheme_http {
|
||||
|
||||
var $default_port = 443;
|
||||
|
@@ -5,6 +5,12 @@ require_once 'HTMLPurifier/URIScheme.php';
|
||||
// VERY RELAXED! Shouldn't cause problems, not even Firefox checks if the
|
||||
// email is valid, but be careful!
|
||||
|
||||
/**
|
||||
* Validates mailto (for E-mail) according to RFC 2368
|
||||
* @todo Validate the email address
|
||||
* @todo Filter allowed query parameters
|
||||
*/
|
||||
|
||||
class HTMLPurifier_URIScheme_mailto extends HTMLPurifier_URIScheme {
|
||||
|
||||
function validateComponents(
|
||||
|
@@ -2,6 +2,9 @@
|
||||
|
||||
require_once 'HTMLPurifier/URIScheme.php';
|
||||
|
||||
/**
|
||||
* Validates news (Usenet) as defined by generic RFC 1738
|
||||
*/
|
||||
class HTMLPurifier_URIScheme_news extends HTMLPurifier_URIScheme {
|
||||
|
||||
function validateComponents(
|
||||
|
@@ -2,6 +2,9 @@
|
||||
|
||||
require_once 'HTMLPurifier/URIScheme.php';
|
||||
|
||||
/**
|
||||
* Validates nntp (Network News Transfer Protocol) as defined by generic RFC 1738
|
||||
*/
|
||||
class HTMLPurifier_URIScheme_nntp extends HTMLPurifier_URIScheme {
|
||||
|
||||
var $default_port = 119;
|
||||
|
@@ -1,6 +1,6 @@
|
||||
<?php
|
||||
|
||||
HTMLPurifier_ConfigDef::define(
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'URI', 'AllowedSchemes', array(
|
||||
'http' => true, // "Hypertext Transfer Protocol", nuf' said
|
||||
'https' => true, // HTTP over SSL (Secure Socket Layer)
|
||||
@@ -11,25 +11,32 @@ HTMLPurifier_ConfigDef::define(
|
||||
// for Usenet, these two are similar, but distinct
|
||||
'nntp' => true, // individual Netnews articles
|
||||
'news' => true // newsgroup or individual Netnews articles),
|
||||
),
|
||||
), 'lookup',
|
||||
'Whitelist that defines the schemes that a URI is allowed to have. This '.
|
||||
'prevents XSS attacks from using pseudo-schemes like javascript or mocha.'
|
||||
);
|
||||
|
||||
HTMLPurifier_ConfigDef::define(
|
||||
'URI', 'OverrideAllowedSchemes', true,
|
||||
HTMLPurifier_ConfigSchema::define(
|
||||
'URI', 'OverrideAllowedSchemes', true, 'bool',
|
||||
'If this is set to true (which it is by default), you can override '.
|
||||
'%URI.AllowedSchemes by simply registering a HTMLPurifier_URIScheme '.
|
||||
'to the registry. If false, you will also have to update that directive '.
|
||||
'in order to add more schemes.'
|
||||
);
|
||||
|
||||
/**
|
||||
* Registry for retrieving specific URI scheme validator objects.
|
||||
*/
|
||||
class HTMLPurifier_URISchemeRegistry
|
||||
{
|
||||
|
||||
// pass a registry object $prototype with a compatible interface and
|
||||
// the function will copy it and return it all further times.
|
||||
// pass bool true to reset to the default registry
|
||||
/**
|
||||
* Retrieve sole instance of the registry.
|
||||
* @param $prototype Optional prototype to overload sole instance with,
|
||||
* or bool true to reset to default registry.
|
||||
* @note Pass a registry object $prototype with a compatible interface and
|
||||
* the function will copy it and return it all further times.
|
||||
*/
|
||||
function &instance($prototype = null) {
|
||||
static $instance = null;
|
||||
if ($prototype !== null) {
|
||||
@@ -40,9 +47,23 @@ class HTMLPurifier_URISchemeRegistry
|
||||
return $instance;
|
||||
}
|
||||
|
||||
/**
|
||||
* Cache of retrieved schemes.
|
||||
* @protected
|
||||
*/
|
||||
var $schemes = array();
|
||||
|
||||
/**
|
||||
* Directory where scheme objects can be found
|
||||
* @private
|
||||
*/
|
||||
var $_scheme_dir = null;
|
||||
|
||||
/**
|
||||
* Retrieves a scheme validator object
|
||||
* @param $scheme String scheme name like http or mailto
|
||||
* @param $config HTMLPurifier_Config object
|
||||
*/
|
||||
function &getScheme($scheme, $config = null) {
|
||||
if (!$config) $config = HTMLPurifier_Config::createDefault();
|
||||
$null = null; // for the sake of passing by reference
|
||||
@@ -67,6 +88,11 @@ class HTMLPurifier_URISchemeRegistry
|
||||
return $this->schemes[$scheme];
|
||||
}
|
||||
|
||||
/**
|
||||
* Registers a custom scheme to the cache.
|
||||
* @param $scheme Scheme name
|
||||
* @param $scheme_obj HTMLPurifier_URIScheme object
|
||||
*/
|
||||
function register($scheme, &$scheme_obj) {
|
||||
$this->schemes[$scheme] =& $scheme_obj;
|
||||
}
|
||||
|
100
phpdoc.ini
Normal file
100
phpdoc.ini
Normal file
@@ -0,0 +1,100 @@
|
||||
;; phpDocumentor parse configuration file
|
||||
;;
|
||||
;; This file is designed to cut down on repetitive typing on the command-line or web interface
|
||||
;; You can copy this file to create a number of configuration files that can be used with the
|
||||
;; command-line switch -c, as in phpdoc -c default.ini or phpdoc -c myini.ini. The web
|
||||
;; interface will automatically generate a list of .ini files that can be used.
|
||||
;;
|
||||
;; default.ini is used to generate the online manual at http://www.phpdoc.org/docs
|
||||
;;
|
||||
;; ALL .ini files must be in the user subdirectory of phpDocumentor with an extension of .ini
|
||||
;;
|
||||
;; Copyright 2002, Greg Beaver <cellog@users.sourceforge.net>
|
||||
;;
|
||||
;; WARNING: do not change the name of any command-line parameters, phpDocumentor will ignore them
|
||||
|
||||
[Parse Data]
|
||||
;; title of all the documentation
|
||||
;; legal values: any string
|
||||
title = HTML Purifier API Documentation
|
||||
|
||||
;; parse files that start with a . like .bash_profile
|
||||
;; legal values: true, false
|
||||
hidden = false
|
||||
|
||||
;; show elements marked @access private in documentation by setting this to on
|
||||
;; legal values: on, off
|
||||
parseprivate = off
|
||||
|
||||
;; parse with javadoc-like description (first sentence is always the short description)
|
||||
;; legal values: on, off
|
||||
javadocdesc = on
|
||||
|
||||
;; add any custom @tags separated by commas here
|
||||
;; legal values: any legal tagname separated by commas.
|
||||
;customtags = mytag1,mytag2
|
||||
|
||||
;; This is only used by the XML:DocBook/peardoc2 converter
|
||||
defaultcategoryname = Documentation
|
||||
|
||||
;; what is the main package?
|
||||
;; legal values: alphanumeric string plus - and _
|
||||
defaultpackagename = HTMLPurifier
|
||||
|
||||
;; output any parsing information? set to on for cron jobs
|
||||
;; legal values: on
|
||||
;quiet = on
|
||||
|
||||
;; parse a PEAR-style repository. Do not turn this on if your project does
|
||||
;; not have a parent directory named "pear"
|
||||
;; legal values: on/off
|
||||
;pear = on
|
||||
|
||||
;; where should the documentation be written?
|
||||
;; legal values: a legal path
|
||||
target = docs/phpdoc
|
||||
|
||||
;; Which files should be parsed out as special documentation files, such as README,
|
||||
;; INSTALL and CHANGELOG? This overrides the default files found in
|
||||
;; phpDocumentor.ini (this file is not a user .ini file, but the global file)
|
||||
readmeinstallchangelog = README, INSTALL, NEWS, WYSIWYG, SLOW, LICENSE, CREDITS
|
||||
|
||||
;; limit output to the specified packages, even if others are parsed
|
||||
;; legal values: package names separated by commas
|
||||
;packageoutput = package1,package2
|
||||
|
||||
;; comma-separated list of files to parse
|
||||
;; legal values: paths separated by commas
|
||||
;filename = /path/to/file1,/path/to/file2,fileincurrentdirectory
|
||||
|
||||
;; comma-separated list of directories to parse
|
||||
;; legal values: directory paths separated by commas
|
||||
;directory = /path1,/path2,.,..,subdirectory
|
||||
;directory = /home/jeichorn/cvs/pear
|
||||
directory = ./
|
||||
|
||||
;; template base directory (the equivalent directory of <installdir>/phpDocumentor)
|
||||
;templatebase = /path/to/my/templates
|
||||
|
||||
;; directory to find any example files in through @example and {@example} tags
|
||||
;examplesdir = /path/to/my/templates
|
||||
|
||||
;; comma-separated list of files, directories or wildcards ? and * (any wildcard) to ignore
|
||||
;; legal values: any wildcard strings separated by commas
|
||||
;ignore = /path/to/ignore*,*list.php,myfile.php,subdirectory/
|
||||
ignore = pear-*,templates/,Documentation/,test*.php,Lexer.inc
|
||||
|
||||
sourcecode = on
|
||||
|
||||
;; comma-separated list of Converters to use in outputformat:Convertername:templatedirectory format
|
||||
;; legal values: HTML:frames:default,HTML:frames:l0l33t,HTML:frames:phpdoc.de,HTML:frames:phphtmllib,
|
||||
;; HTML:frames:earthli,
|
||||
;; HTML:frames:DOM/default,HTML:frames:DOM/l0l33t,HTML:frames:DOM/phpdoc.de,
|
||||
;; HTML:frames:DOM/phphtmllib,HTML:frames:DOM/earthli
|
||||
;; HTML:Smarty:default,HTML:Smarty:PHP,HTML:Smarty:HandS
|
||||
;; PDF:default:default,CHM:default:default,XML:DocBook/peardoc2:default
|
||||
output=HTML:frames:default
|
||||
|
||||
;; turn this option on if you want highlighted source code for every file
|
||||
;; legal values: on/off
|
||||
sourcecode = on
|
14
smoketests/common.php
Normal file
14
smoketests/common.php
Normal file
@@ -0,0 +1,14 @@
|
||||
<?php
|
||||
|
||||
header('Content-type: text/html; charset=UTF-8');
|
||||
|
||||
set_include_path('../library' . PATH_SEPARATOR . get_include_path());
|
||||
require_once 'HTMLPurifier.php';
|
||||
|
||||
function escapeHTML($string) {
|
||||
$string = HTMLPurifier_Encoder::cleanUTF8($string);
|
||||
$string = htmlspecialchars($string, ENT_COMPAT, 'UTF-8');
|
||||
return $string;
|
||||
}
|
||||
|
||||
?>
|
@@ -1,4 +1,8 @@
|
||||
<!DOCTYPE html
|
||||
<?php
|
||||
|
||||
require_once 'common.php';
|
||||
|
||||
?><!DOCTYPE html
|
||||
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
<html>
|
||||
@@ -10,9 +14,6 @@
|
||||
<h1>HTMLPurifier UTF-8 Smoketest</h1>
|
||||
<?php
|
||||
|
||||
set_include_path('../library' . PATH_SEPARATOR . get_include_path());
|
||||
require_once 'HTMLPurifier.php';
|
||||
|
||||
$purifier = new HTMLPurifier();
|
||||
$string = '
|
||||
<ul>
|
||||
@@ -27,5 +28,10 @@ $string = '
|
||||
<?php echo $string; ?>
|
||||
<h2>Purified</h2>
|
||||
<?php echo $purifier->purify($string); ?>
|
||||
<h2>Analysis</h2>
|
||||
<p>The content in <strong>Raw</strong> should be equivalent to the content
|
||||
in <strong>Purified</strong>. If <strong>Purified</strong> is mangled, there
|
||||
is likely trouble a-brewing in the library. If
|
||||
both are mangled, check to see that this file was not corrupted.</p>
|
||||
</body>
|
||||
</html>
|
53
smoketests/variableWidthAttack.php
Normal file
53
smoketests/variableWidthAttack.php
Normal file
@@ -0,0 +1,53 @@
|
||||
<?php
|
||||
|
||||
require_once 'common.php';
|
||||
|
||||
?><!DOCTYPE html
|
||||
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<title>HTMLPurifier Variable Width Attack Smoketest</title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||
</head>
|
||||
<body>
|
||||
<h1>HTMLPurifier Variable Width Attack Smoketest</h1>
|
||||
<p>For more information, see
|
||||
<a href="http://applesoup.googlepages.com/bypass_filter.txt">Cheng Peng Su's
|
||||
original advisory.</a> This particular exploit code appears only to work
|
||||
in Internet Explorer, if it works at all.</p>
|
||||
<h2>Test</h2>
|
||||
<?php
|
||||
|
||||
$purifier = new HTMLPurifier();
|
||||
|
||||
?>
|
||||
<table>
|
||||
<thead><tr><th>ASCII</th><th width="30%">Raw</th><th>Output</th><th>Render</th></tr></thead>
|
||||
<tbody>
|
||||
<?php
|
||||
|
||||
for ($i = 0; $i < 256; $i++) {
|
||||
$c = chr($i);
|
||||
$html = '<img src="" alt="X' . $c . '"';
|
||||
$html .= '>A"'; // in our out the attribute? ;-)
|
||||
$html .= "onerror=alert('$i')>O";
|
||||
$pure_html = $purifier->purify($html);
|
||||
?>
|
||||
<tr>
|
||||
<td><?php echo $i; ?></td>
|
||||
<td style="font-size:8pt;"><?php echo escapeHTML($html); ?></td>
|
||||
<td style="font-size:8pt;"><?php echo escapeHTML($pure_html); ?></td>
|
||||
<td><?php echo $pure_html; ?></td>
|
||||
</tr>
|
||||
<?php } ?>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<h2>Analysis</h2>
|
||||
|
||||
<p>By making sure that UTF-8 is well formed and non-SGML codepoints are
|
||||
removed, as well as escaping quotes outside of tags, this is a non-threat.</p>
|
||||
|
||||
</body>
|
||||
</html>
|
@@ -1,10 +1,14 @@
|
||||
<!DOCTYPE html
|
||||
<?php
|
||||
|
||||
require_once('common.php');
|
||||
|
||||
?><!DOCTYPE html
|
||||
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<title>HTMLPurifier XSS Attacks Smoketest</title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||
<title>HTMLPurifier XSS Attacks Smoketest</title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||
</head>
|
||||
<body>
|
||||
<h1>HTMLPurifier XSS Attacks Smoketest</h1>
|
||||
@@ -13,17 +17,17 @@
|
||||
<p>The last segment of tests regarding blacklisted websites is not
|
||||
applicable at the moment, but when we add that functionality they'll be
|
||||
relevant.</p>
|
||||
<p>Most of the XSS broadcasts its presence by spawning an alert dialogue.</p>
|
||||
<h2>Test</h2>
|
||||
<?php
|
||||
|
||||
if (version_compare(PHP_VERSION, '5', '<')) exit('<p>Requires PHP 5.</p>');
|
||||
|
||||
set_include_path('../library' . PATH_SEPARATOR . get_include_path());
|
||||
require_once 'HTMLPurifier.php';
|
||||
|
||||
$xml = simplexml_load_file('xssAttacks.xml');
|
||||
$purifier = new HTMLPurifier();
|
||||
|
||||
?>
|
||||
<!-- form is used so that we can use textareas and stay valid -->
|
||||
<form method="post" action="xssAttacks.php">
|
||||
<table>
|
||||
<thead><tr><th>Name</th><th width="30%">Raw</th><th>Output</th><th>Render</th></tr></thead>
|
||||
@@ -36,10 +40,10 @@ foreach ($xml->attack as $attack) {
|
||||
if ($attack->name == 'US-ASCII encoding') $code = urldecode($code);
|
||||
?>
|
||||
<tr>
|
||||
<td><?php echo htmlspecialchars($attack->name); ?></td>
|
||||
<td><textarea readonly="readonly" cols="20" rows="2"><?php echo htmlspecialchars($code); ?></textarea></td>
|
||||
<td><?php echo escapeHTML($attack->name); ?></td>
|
||||
<td><textarea readonly="readonly" cols="20" rows="2"><?php echo escapeHTML($code); ?></textarea></td>
|
||||
<?php $pure_html = $purifier->purify($code); ?>
|
||||
<td><textarea readonly="readonly" cols="20" rows="2"><?php echo htmlspecialchars($pure_html); ?></textarea></td>
|
||||
<td><textarea readonly="readonly" cols="20" rows="2"><?php echo escapeHTML($pure_html); ?></textarea></td>
|
||||
<td><?php echo $pure_html ?></td>
|
||||
</tr>
|
||||
<?php
|
||||
|
17
test-settings.sample.php
Normal file
17
test-settings.sample.php
Normal file
@@ -0,0 +1,17 @@
|
||||
<?php
|
||||
|
||||
// This file is necessary to run the unit tests and profiling
|
||||
// scripts.
|
||||
|
||||
// Is PEAR available on your system? If it isn't, set to false. If PEAR
|
||||
// is not part of the default include_path, add it.
|
||||
$GLOBALS['HTMLPurifierTest']['PEAR'] = true;
|
||||
|
||||
// How many times should profiling scripts iterate over the function? More runs
|
||||
// means more accurate results, but they'll take longer to perform.
|
||||
$GLOBALS['HTMLPurifierTest']['Runs'] = 2;
|
||||
|
||||
// Where is SimpleTest located?
|
||||
$simpletest_location = '/path/to/simpletest/';
|
||||
|
||||
?>
|
21
tests/HTMLPurifier/AttrDef/BorderTest.php
Normal file
21
tests/HTMLPurifier/AttrDef/BorderTest.php
Normal file
@@ -0,0 +1,21 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef/Border.php';
|
||||
|
||||
class HTMLPurifier_AttrDef_BorderTest extends HTMLPurifier_AttrDef_PixelsTest
|
||||
{
|
||||
|
||||
function test() {
|
||||
|
||||
$this->def = new HTMLPurifier_AttrDef_Border(HTMLPurifier_Config::createDefault());
|
||||
|
||||
$this->assertDef('thick solid red', 'thick solid #F00');
|
||||
$this->assertDef('thick solid');
|
||||
$this->assertDef('solid red', 'solid #F00');
|
||||
$this->assertDef('1px solid #000');
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -20,9 +20,11 @@ class HTMLPurifier_AttrDef_CSSTest extends HTMLPurifier_AttrDefHarness
|
||||
$this->assertDef('font-weight:bold;');
|
||||
$this->assertDef('list-style-position:outside;');
|
||||
$this->assertDef('list-style-type:upper-roman;');
|
||||
$this->assertDef('list-style:upper-roman inside;');
|
||||
$this->assertDef('text-transform:capitalize;');
|
||||
$this->assertDef('background-color:rgb(0,0,255);');
|
||||
$this->assertDef('background-color:transparent;');
|
||||
$this->assertDef('background:#FF9;');
|
||||
$this->assertDef('color:#F00;');
|
||||
$this->assertDef('border-top-color:#F00;');
|
||||
$this->assertDef('border-color:#F00 #FF0;');
|
||||
@@ -60,6 +62,15 @@ class HTMLPurifier_AttrDef_CSSTest extends HTMLPurifier_AttrDefHarness
|
||||
$this->assertDef('text-decoration:underline;');
|
||||
$this->assertDef('font-family:sans-serif;');
|
||||
$this->assertDef('font-family:Gill, \'Times New Roman\', sans-serif;');
|
||||
$this->assertDef('font:12px serif;');
|
||||
$this->assertDef('border:1px solid #000;');
|
||||
$this->assertDef('border-bottom:2em double #FF00FA;');
|
||||
$this->assertDef('border-collapse:collapse;');
|
||||
$this->assertDef('caption-side:top;');
|
||||
$this->assertDef('vertical-align:middle;');
|
||||
$this->assertDef('vertical-align:12px;');
|
||||
$this->assertDef('vertical-align:50%;');
|
||||
$this->assertDef('table-layout:fixed;');
|
||||
|
||||
// duplicates
|
||||
$this->assertDef('text-align:right;text-align:left;',
|
||||
@@ -81,6 +92,10 @@ class HTMLPurifier_AttrDef_CSSTest extends HTMLPurifier_AttrDefHarness
|
||||
$this->assertDef('position:absolute;', false);
|
||||
$this->assertDef('background-image:url(javascript:alert\(\));', false);
|
||||
|
||||
// airy input
|
||||
$this->assertDef(' font-weight : bold; color : #ff0000',
|
||||
'font-weight:bold;color:#ff0000;');
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -23,6 +23,9 @@ class HTMLPurifier_AttrDef_ColorTest extends HTMLPurifier_AttrDefHarness
|
||||
$this->assertDef('rgb(200%, -10%, 0%)', 'rgb(100%,0%,0%)');
|
||||
$this->assertDef('rgb(256,-23,34)', 'rgb(255,0,34)');
|
||||
|
||||
// color keywords, of course
|
||||
$this->assertDef('red', '#F00');
|
||||
|
||||
// maybe hex transformations would be another nice feature
|
||||
// at the very least transform rgb percent to rgb integer
|
||||
|
||||
|
36
tests/HTMLPurifier/AttrDef/FontTest.php
Normal file
36
tests/HTMLPurifier/AttrDef/FontTest.php
Normal file
@@ -0,0 +1,36 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrDefHarness.php';
|
||||
require_once 'HTMLPurifier/AttrDef/Font.php';
|
||||
|
||||
class HTMLPurifier_AttrDef_FontTest extends HTMLPurifier_AttrDefHarness
|
||||
{
|
||||
|
||||
function test() {
|
||||
|
||||
$this->def = new HTMLPurifier_AttrDef_Font(HTMLPurifier_Config::createDefault());
|
||||
|
||||
// hodgepodge of usage cases from W3C spec, but " -> '
|
||||
$this->assertDef('12px/14px sans-serif');
|
||||
$this->assertDef('80% sans-serif');
|
||||
$this->assertDef('x-large/110% \'New Century Schoolbook\', serif');
|
||||
$this->assertDef('bold italic large Palatino, serif');
|
||||
$this->assertDef('normal small-caps 120%/120% fantasy');
|
||||
$this->assertDef('300 italic 1.3em/1.7em \'FB Armada\', sans-serif');
|
||||
$this->assertDef('600 9px Charcoal');
|
||||
$this->assertDef('600 9px/ 12px Charcoal', '600 9px/12px Charcoal');
|
||||
|
||||
// spacing
|
||||
$this->assertDef('12px / 14px sans-serif', '12px/14px sans-serif');
|
||||
|
||||
// system fonts
|
||||
$this->assertDef('menu');
|
||||
|
||||
$this->assertDef('800', false);
|
||||
$this->assertDef('600 9px//12px Charcoal', false);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -16,20 +16,45 @@ class HTMLPurifier_AttrDef_IntegerTest extends HTMLPurifier_AttrDefHarness
|
||||
$this->assertDef('14');
|
||||
$this->assertDef('+24', '24');
|
||||
$this->assertDef(' 14 ', '14');
|
||||
$this->assertDef('-0', '0');
|
||||
|
||||
$this->assertDef('-1.4', false);
|
||||
$this->assertDef('3.4', false);
|
||||
$this->assertDef('asdf', false);
|
||||
$this->assertDef('asdf', false); // must not return zero
|
||||
$this->assertDef('2in', false); // must not return zero
|
||||
|
||||
}
|
||||
|
||||
function testNonNegative() {
|
||||
function assertRange($negative, $zero, $positive) {
|
||||
$this->assertDef('-100', $negative);
|
||||
$this->assertDef('-1', $negative);
|
||||
$this->assertDef('0', $zero);
|
||||
$this->assertDef('1', $positive);
|
||||
$this->assertDef('42', $positive);
|
||||
}
|
||||
|
||||
function testRange() {
|
||||
|
||||
$this->def = new HTMLPurifier_AttrDef_Integer(true);
|
||||
$this->def = new HTMLPurifier_AttrDef_Integer(false);
|
||||
$this->assertRange(false, true, true); // non-negative
|
||||
|
||||
$this->assertDef('0');
|
||||
$this->assertDef('1');
|
||||
$this->assertDef('-1', false);
|
||||
$this->def = new HTMLPurifier_AttrDef_Integer(false, false);
|
||||
$this->assertRange(false, false, true); // positive
|
||||
|
||||
|
||||
// fringe cases
|
||||
|
||||
$this->def = new HTMLPurifier_AttrDef_Integer(false, false, false);
|
||||
$this->assertRange(false, false, false); // allow none
|
||||
|
||||
$this->def = new HTMLPurifier_AttrDef_Integer(true, false, false);
|
||||
$this->assertRange(true, false, false); // negative
|
||||
|
||||
$this->def = new HTMLPurifier_AttrDef_Integer(false, true, false);
|
||||
$this->assertRange(false, true, false); // zero
|
||||
|
||||
$this->def = new HTMLPurifier_AttrDef_Integer(true, true, false);
|
||||
$this->assertRange(true, true, false); // non-positive
|
||||
|
||||
}
|
||||
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user