1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-08-06 06:07:26 +02:00

Compare commits

..

140 Commits

Author SHA1 Message Date
Edward Z. Yang
f38e81785f Release 2.1.5
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1814 48356398-32a2-884e-a903-53898d9a118a
2008-06-19 22:57:15 +00:00
Edward Z. Yang
2cc829a8cf Fix PHP 4.3.9/10 bug with float handling
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1806 48356398-32a2-884e-a903-53898d9a118a
2008-06-19 21:13:56 +00:00
Edward Z. Yang
e80a54a7c9 Add missing include.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1805 48356398-32a2-884e-a903-53898d9a118a
2008-06-19 19:58:53 +00:00
Edward Z. Yang
6f71e65661 [2.1.5] [MFH] Fix text-decoration: none bug
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1800 48356398-32a2-884e-a903-53898d9a118a
2008-06-17 03:18:23 +00:00
Edward Z. Yang
6f25c39c3e [2.1.5] [MFH] Fix Shift_JIS bug.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1793 48356398-32a2-884e-a903-53898d9a118a
2008-06-11 19:01:22 +00:00
Edward Z. Yang
b8b1ac283d [2.1.5] [MFH] Fix regression in FontFamily
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1792 48356398-32a2-884e-a903-53898d9a118a
2008-06-11 18:54:19 +00:00
Edward Z. Yang
450fc6649d [2.1.5] [MFH] Fix Shift_JIS encoding wonkiness with yen symbols and whatnot, as well as other patches
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1791 48356398-32a2-884e-a903-53898d9a118a
2008-06-11 18:49:56 +00:00
Edward Z. Yang
369a69d533 [2.1.5] [MFH] Fix stray backslashes in font-family.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1790 48356398-32a2-884e-a903-53898d9a118a
2008-06-11 17:43:48 +00:00
Edward Z. Yang
72f5819ef6 [2.1.5] [MFH] Round up imagecrash support with HTML.MaxImgLength
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1789 48356398-32a2-884e-a903-53898d9a118a
2008-06-11 17:38:25 +00:00
Edward Z. Yang
3540ea7fce [2.1.5] [MFH] Make modules use setup($config) instead of constructor
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1788 48356398-32a2-884e-a903-53898d9a118a
2008-06-11 17:10:39 +00:00
Edward Z. Yang
c03953f85e [2.1.5] [MFH] Percent encode query and hash, and lazy update with attr validator
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1787 48356398-32a2-884e-a903-53898d9a118a
2008-06-11 04:00:06 +00:00
Edward Z. Yang
0d262b3a1d Add missing bits from previous commit.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1786 48356398-32a2-884e-a903-53898d9a118a
2008-06-11 01:56:22 +00:00
Edward Z. Yang
234cd2196f [2.1.5] [MFH] Complete the imagecrash added protection fixes
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1785 48356398-32a2-884e-a903-53898d9a118a
2008-06-11 01:53:31 +00:00
Edward Z. Yang
0dbe87bbc7 [2.1.5] [MFH] Disable Tidy tests
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1784 48356398-32a2-884e-a903-53898d9a118a
2008-06-11 01:25:05 +00:00
Edward Z. Yang
245b5bdb27 Merged r1746: Length and UnitConverter implementation.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1783 48356398-32a2-884e-a903-53898d9a118a
2008-06-11 01:21:36 +00:00
Edward Z. Yang
864cb9e136 - Fix tagging script to work off of php4
- Fix svn.php to not clobber svn extension
- Update NEWS

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1743 48356398-32a2-884e-a903-53898d9a118a
2008-05-18 20:12:17 +00:00
Edward Z. Yang
487fcd55ea Release 2.1.4
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1736 48356398-32a2-884e-a903-53898d9a118a
2008-05-18 18:56:27 +00:00
Edward Z. Yang
ec6b6821cf [2.1.4] Add information about PHP 5.0.5 or earlier.
- Fix segfault in 5.0.x with IDAccumulator test.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1726 48356398-32a2-884e-a903-53898d9a118a
2008-05-16 01:25:22 +00:00
Edward Z. Yang
f26eb7551a [2.1.4] [MFH] Fixed bug with fallback languages in LanguageFactory
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1724 48356398-32a2-884e-a903-53898d9a118a
2008-05-15 23:20:21 +00:00
Edward Z. Yang
a2aca4819d [2.1.4] [MFH] Revamp URI handling of percent encoding and validation from r1709
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1721 48356398-32a2-884e-a903-53898d9a118a
2008-05-15 05:30:20 +00:00
Edward Z. Yang
a75e4c6b7c [2.1.4] [MFH] getInstance -> instance from r1689
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1720 48356398-32a2-884e-a903-53898d9a118a
2008-05-15 05:24:34 +00:00
Edward Z. Yang
e7fa8cbdd5 [2.1.4] [MFH] Add protection against imagecrash attack with CSS height/width from r1684
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1719 48356398-32a2-884e-a903-53898d9a118a
2008-05-15 05:21:37 +00:00
Edward Z. Yang
5fa575f8ac [2.1.4] [MFH] Encoder optimization and shut-up operator bugfix from r1680
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1718 48356398-32a2-884e-a903-53898d9a118a
2008-05-15 05:16:36 +00:00
Edward Z. Yang
9f23bc005b [2.1.4] [MFH] addAttribute() can be called multiple times, from r1634
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1717 48356398-32a2-884e-a903-53898d9a118a
2008-05-15 05:13:11 +00:00
Edward Z. Yang
957a840f54 [2.1.4] [MFH] Fix bug with rgb(0, 1, 2) color syntax with spaces inside shorthand syntax from r1612
- Also, repair botched comment patch

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1716 48356398-32a2-884e-a903-53898d9a118a
2008-05-15 05:04:39 +00:00
Edward Z. Yang
a7762c5137 [2.1.4] [MFH] Fix bug in comment parsing with DirectLex from r1570
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1715 48356398-32a2-884e-a903-53898d9a118a
2008-05-15 04:43:52 +00:00
Edward Z. Yang
aca9d725ed [2.1.4] [MFH] Fix bug with trusted script handling in libxml versions later than 2.6.28 from r1553.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1714 48356398-32a2-884e-a903-53898d9a118a
2008-05-15 04:40:13 +00:00
Edward Z. Yang
4ce3deba26 [2.1.4] [MFH] Recursive auto-close with <span><span><div> from r1492
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1713 48356398-32a2-884e-a903-53898d9a118a
2008-05-15 04:32:05 +00:00
Edward Z. Yang
d4da02ba95 [2.1.4] [MFH] Case-insensitive CSS from r1461
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1712 48356398-32a2-884e-a903-53898d9a118a
2008-05-15 04:26:30 +00:00
Edward Z. Yang
97d3c8509c [2.1.4] [MFH] register() for DefinitionCacheFactory from r1464
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1711 48356398-32a2-884e-a903-53898d9a118a
2008-05-15 04:21:23 +00:00
Edward Z. Yang
21c6803401 [2.1.4] [MFH] Color and CSS bugfixes from r1473
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1710 48356398-32a2-884e-a903-53898d9a118a
2008-05-15 04:01:45 +00:00
Edward Z. Yang
36badb06f6 Branch out PHP 4 development: we're going PHP 5!
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/branches/php4@1455 48356398-32a2-884e-a903-53898d9a118a
2007-11-23 21:18:32 +00:00
Edward Z. Yang
4066416160 Slight clarification of where ElementDef's required_attr property gets populated
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1454 48356398-32a2-884e-a903-53898d9a118a
2007-11-13 02:49:47 +00:00
Edward Z. Yang
fad6aa45fa Make phpdoc more efficient, ignore the conf directory
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1450 48356398-32a2-884e-a903-53898d9a118a
2007-11-06 17:50:30 +00:00
Edward Z. Yang
a7e6d85f6d Update PEAR packager
- Ignore standalone directories
- Normalize base directory with realpath

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1447 48356398-32a2-884e-a903-53898d9a118a
2007-11-06 16:37:25 +00:00
Edward Z. Yang
c330860606 Release 2.1.3.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1443 48356398-32a2-884e-a903-53898d9a118a
2007-11-06 03:39:59 +00:00
Edward Z. Yang
0ea53e5a3d Make multitest.php also manage standalone version testing.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1442 48356398-32a2-884e-a903-53898d9a118a
2007-11-06 03:34:45 +00:00
Edward Z. Yang
68167176dc [2.1.3]
- Officially support 4.3.7 and up
- Modify PH5P to remove incompatible parameter type def
- Add more versions to multitest

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1441 48356398-32a2-884e-a903-53898d9a118a
2007-11-05 05:25:59 +00:00
Edward Z. Yang
bb08f679f0 [2.1.3]
- Work around unnecessary DOMElement type-cast in PH5P that caused errors in PHP 5.1
- Work around PHP 4 SimpleTest lack-of-error complaining for one-time-only HTMLDefinition errors, this may indicate problems with error-collecting facilities in PHP 5
- Make ErrorCollectorEMock work in both PHP 4 and PHP 5
. tests/multitest.php allows you to test multiple versions by running tests/index.php through multiple interpreters using `phpv` shell script (you must provide this script!)
. Minor cosmetic change to flush-definition-cache.php: trailing newline is outputted
. Maintenance script for generating PH5P patch added, original PH5P source file also added under version control
. Full unit test runner script title made more descriptive with PHP version

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1440 48356398-32a2-884e-a903-53898d9a118a
2007-11-05 05:01:51 +00:00
Edward Z. Yang
8cd1806ec8 Update INSTALL file with better instructions. Translation needs updating.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1439 48356398-32a2-884e-a903-53898d9a118a
2007-11-05 03:40:32 +00:00
Edward Z. Yang
1274cfed49 [2.1.3] Fix possible error in DirectLex reported by Nate Abele
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1438 48356398-32a2-884e-a903-53898d9a118a
2007-11-05 03:22:22 +00:00
Edward Z. Yang
1ab47ba949 Update NEWS.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1436 48356398-32a2-884e-a903-53898d9a118a
2007-11-02 03:20:55 +00:00
Edward Z. Yang
da95ee096a Beef up HTML Purifier help message. Todo: make it hideable.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1435 48356398-32a2-884e-a903-53898d9a118a
2007-11-02 01:55:45 +00:00
Edward Z. Yang
6d7250c309 Update Doxygen file after doxygen -u command
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1429 48356398-32a2-884e-a903-53898d9a118a
2007-10-30 03:08:06 +00:00
Edward Z. Yang
df55df1083 Update Doxyfile with new paths, also exclude standalone directory
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1428 48356398-32a2-884e-a903-53898d9a118a
2007-10-30 02:46:26 +00:00
Edward Z. Yang
1a8d864a42 Have tests also check for test-settings in conf file, this allows for configuration files to be separately versioned
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1427 48356398-32a2-884e-a903-53898d9a118a
2007-10-30 02:26:11 +00:00
Edward Z. Yang
552102f7f2 [2.1.3]
- HTMLDefinition->addElement now returns a reference to the created element object, as implied by the documentation
. Extend Injector hooks to allow for more powerful injector routines
. HTMLDefinition->addBlankElement created, as according to the HTMLModule method

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1425 48356398-32a2-884e-a903-53898d9a118a
2007-10-02 22:50:59 +00:00
Edward Z. Yang
f5371bbad4 [2.1.3]
- Buggy treatment of end tags of elements that have required attributes fixed (does not manifest on default tag-set)
- Spurious internal content reorganization error suppressed
. Error unit tests can now specify the expectation of no errors. Future iterations of the harness will be extremely strict about what errors are allowed

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1424 48356398-32a2-884e-a903-53898d9a118a
2007-10-02 01:19:46 +00:00
Edward Z. Yang
c8b020879d [2.1.3] Refine injector algorithm regarding behavior inside nodes that allow paragraphs inside them
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1423 48356398-32a2-884e-a903-53898d9a118a
2007-09-27 00:39:05 +00:00
Edward Z. Yang
094b20f58f [2.1.3] Fix PHP warning from MakeAbsolute, also improve URIFilter documentation
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1422 48356398-32a2-884e-a903-53898d9a118a
2007-09-27 00:07:27 +00:00
Edward Z. Yang
f2df669eec Refactor IDAccumulator so that unit tests now work, and initialization is inside the class.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1421 48356398-32a2-884e-a903-53898d9a118a
2007-09-26 23:36:37 +00:00
Edward Z. Yang
ca43df9fdd [2.1.3] Fatal error when <img> tag (or any other element with required attributes) has 'id' attribute fixed, thanks NykO18 for reporting
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1420 48356398-32a2-884e-a903-53898d9a118a
2007-09-26 23:18:24 +00:00
Edward Z. Yang
5f76796e14 Some small doc updates
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1419 48356398-32a2-884e-a903-53898d9a118a
2007-09-25 02:42:35 +00:00
Edward Z. Yang
1f9a6ba30e [2.1.3] Activate strict blockquote functionality for HTML 4.01 Strict.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1417 48356398-32a2-884e-a903-53898d9a118a
2007-09-09 01:46:59 +00:00
Edward Z. Yang
ccca8cc34f [2.1.3] Rename configuration directive
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1416 48356398-32a2-884e-a903-53898d9a118a
2007-09-09 01:35:50 +00:00
Edward Z. Yang
28c29656af [2.1.3] Fix off-by-one bug in injector functionality for dormant injectors
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1415 48356398-32a2-884e-a903-53898d9a118a
2007-09-09 01:27:09 +00:00
Edward Z. Yang
88f4f57a47 [2.1.3] Fix poor include ordering.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1414 48356398-32a2-884e-a903-53898d9a118a
2007-09-06 19:38:12 +00:00
Edward Z. Yang
43a98de909 Fix up some comments, reduce code duplication.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1409 48356398-32a2-884e-a903-53898d9a118a
2007-09-04 00:15:07 +00:00
Edward Z. Yang
b9d886d53b Release 2.1.2.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1402 48356398-32a2-884e-a903-53898d9a118a
2007-09-03 15:30:12 +00:00
Edward Z. Yang
5b3c8c5534 [2.1.2] Implement border-spacing
- Fix PH5P testing in PHP4

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1401 48356398-32a2-884e-a903-53898d9a118a
2007-09-03 15:16:33 +00:00
Edward Z. Yang
dd40d41bc3 Refactor TODO list.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1400 48356398-32a2-884e-a903-53898d9a118a
2007-09-02 17:22:31 +00:00
Edward Z. Yang
37a80f1295 Fix typo in sample PHP code.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1399 48356398-32a2-884e-a903-53898d9a118a
2007-08-26 18:42:55 +00:00
Edward Z. Yang
fb367dc871 [2.1.2] Correct usage of entity -> character entity reference.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1398 48356398-32a2-884e-a903-53898d9a118a
2007-08-26 18:29:37 +00:00
Edward Z. Yang
29c3c21b34 [2.1.2] Merge in Brett Zamir's patches.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1397 48356398-32a2-884e-a903-53898d9a118a
2007-08-26 18:20:46 +00:00
Edward Z. Yang
e45cc503a2 [2.1.2] Refactory merge-library.php script
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1396 48356398-32a2-884e-a903-53898d9a118a
2007-08-26 17:04:31 +00:00
Edward Z. Yang
85cdea0120 [2.1.2] Remove inclusion reflection from URISchemeRegistry
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1395 48356398-32a2-884e-a903-53898d9a118a
2007-08-26 15:43:17 +00:00
Edward Z. Yang
c7676afb0d Ignore out/ directory.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1391 48356398-32a2-884e-a903-53898d9a118a
2007-08-26 03:49:11 +00:00
Edward Z. Yang
d75c695994 [2.1.2] Fix problems with standalone distribution, change smoketests so that it's easier to test the standalone.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1388 48356398-32a2-884e-a903-53898d9a118a
2007-08-19 21:38:19 +00:00
Edward Z. Yang
6f6fcbc354 Add install script for PEAR installs.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1385 48356398-32a2-884e-a903-53898d9a118a
2007-08-19 19:52:45 +00:00
Edward Z. Yang
c31d6ec80e Add that PH5P is PHP5 only.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1384 48356398-32a2-884e-a903-53898d9a118a
2007-08-19 19:37:34 +00:00
Edward Z. Yang
cb92a57e4e [2.1.2] Implement experimental HTML5 parsing using PH5P
- Fix debugger so that tokens can be printed without an index
- Fix some broken PEAR unit tests

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1383 48356398-32a2-884e-a903-53898d9a118a
2007-08-19 18:49:35 +00:00
Edward Z. Yang
423afedbf4 [2.1.2] Fix validation errors in configuration form
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1382 48356398-32a2-884e-a903-53898d9a118a
2007-08-19 16:24:55 +00:00
Edward Z. Yang
7827a95273 [2.1.2] Fix some validation problems in printDefinition.php's output
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1381 48356398-32a2-884e-a903-53898d9a118a
2007-08-19 15:38:37 +00:00
Edward Z. Yang
9881a34712 More unit test refactoring into seperate methods.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1380 48356398-32a2-884e-a903-53898d9a118a
2007-08-16 06:48:24 +00:00
Edward Z. Yang
a19f30fdcf [2.1.2] Fix silly little typo with border-collapse:separate
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1379 48356398-32a2-884e-a903-53898d9a118a
2007-08-11 06:52:26 +00:00
Edward Z. Yang
8f58c7f49e [2.1.2?] Final migration for Injectors, deprecate the config and context parameters in assertResult
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1378 48356398-32a2-884e-a903-53898d9a118a
2007-08-08 05:38:52 +00:00
Edward Z. Yang
71301b36eb [2.1.2?] Implemented Object module for trusted users.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1377 48356398-32a2-884e-a903-53898d9a118a
2007-08-08 05:16:15 +00:00
Edward Z. Yang
4f0d012dfa [2.1.2?] Add missing sub-tests for strategy, fix error message typo.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1376 48356398-32a2-884e-a903-53898d9a118a
2007-08-08 05:08:59 +00:00
Edward Z. Yang
24a4dfdf83 [2.1.2?] Fix invisible DirectLex parsing error with empty elements that have attributes containing slashes
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1375 48356398-32a2-884e-a903-53898d9a118a
2007-08-08 05:05:30 +00:00
Edward Z. Yang
f922285383 More unit test refactoring; remove unnecessary periods from HTMLDefinition error messages
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1374 48356398-32a2-884e-a903-53898d9a118a
2007-08-07 05:38:22 +00:00
Edward Z. Yang
3af6457801 Refactor unit tests to have one logical assertion per method.
- Support executing a single unit tests using __only prefix
- Hook in Email classes to main code, even if they're unused


git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1373 48356398-32a2-884e-a903-53898d9a118a
2007-08-06 06:22:23 +00:00
Edward Z. Yang
d51d3c127b Release 2.1.1.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1366 48356398-32a2-884e-a903-53898d9a118a
2007-08-05 01:20:55 +00:00
Edward Z. Yang
4f92c0377f [2.1.1] Fix syntax error in standalone library
- fix faulty PHP4 test
- remove unnecessary HTMLPurifier_Config::create() call

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1365 48356398-32a2-884e-a903-53898d9a118a
2007-08-05 01:15:23 +00:00
Edward Z. Yang
c3efafb07d [2.1.1] Fix *another* but with addFilter() Jeez!
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1364 48356398-32a2-884e-a903-53898d9a118a
2007-08-04 22:46:17 +00:00
Edward Z. Yang
79c18eb781 [2.1.1] Single test methods can be invoked by prefixing them with __only
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1363 48356398-32a2-884e-a903-53898d9a118a
2007-08-04 14:51:06 +00:00
Edward Z. Yang
7b64bc37e2 [2.1.1] Fix show-stopping bug in URIDefinition.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1361 48356398-32a2-884e-a903-53898d9a118a
2007-08-03 21:17:15 +00:00
Edward Z. Yang
b3aa5fa0dc [2.1.1] Add prefix directory to include path in standalone: this prevents PEAR from clobbering our unit tests
- Add missing include to unit test harness
- Add missing unit test for HTMLPurifier::getInstance

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1358 48356398-32a2-884e-a903-53898d9a118a
2007-08-03 15:11:08 +00:00
Edward Z. Yang
350d8301dd Release 2.1.0.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1351 48356398-32a2-884e-a903-53898d9a118a
2007-08-03 03:04:40 +00:00
Edward Z. Yang
a40e16dd2e [2.1.0] Allow i18n font names
- Minor typos fixed; we're release ready!

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1350 48356398-32a2-884e-a903-53898d9a118a
2007-08-03 02:48:52 +00:00
Edward Z. Yang
ee388e86c0 Fix code typo in URI Filter documentation.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1349 48356398-32a2-884e-a903-53898d9a118a
2007-08-03 00:08:45 +00:00
Edward Z. Yang
79df79b2fd [2.1.0] Add tutorial for creating URI Filters
- Update NEWS

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1348 48356398-32a2-884e-a903-53898d9a118a
2007-08-02 23:34:30 +00:00
Edward Z. Yang
f5b72c623c [2.1.0] Implement Ruby.
- Destroy some zombie context variables
- Reorganize some TODO items

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1347 48356398-32a2-884e-a903-53898d9a118a
2007-08-02 22:44:42 +00:00
Edward Z. Yang
7bccc24977 [2.1.0] Implement MakeAbsolute URI filter
- Move some directives with complex dependencies to URIDefinition
- Fix a missing extends
- Add hierarchical information to URI schemes
- Fix bug in URIHarness.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1346 48356398-32a2-884e-a903-53898d9a118a
2007-08-02 21:47:24 +00:00
Edward Z. Yang
25fe416ab2 Add test-case for blank TinyMCE allowed list.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1345 48356398-32a2-884e-a903-53898d9a118a
2007-08-02 15:13:12 +00:00
Edward Z. Yang
a9012f4387 Guard merge-library against non-cli execution.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1344 48356398-32a2-884e-a903-53898d9a118a
2007-08-02 12:51:52 +00:00
Edward Z. Yang
82f8561123 Factor out cli execution guard to common.php
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1343 48356398-32a2-884e-a903-53898d9a118a
2007-08-02 12:45:15 +00:00
Edward Z. Yang
0b743fb2db Update maintenance files with cgi-fcgi compiled PHP executable workaround.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1342 48356398-32a2-884e-a903-53898d9a118a
2007-08-02 12:40:54 +00:00
Edward Z. Yang
08e32597df Fix flush-definition-cache to clear everything, and make it accept a parameter specifying which cache to flush. Also, set svn:executable to CLI scripts.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1340 48356398-32a2-884e-a903-53898d9a118a
2007-08-02 12:24:50 +00:00
Edward Z. Yang
2b82fbacad Minor re-prioritization of TODO.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1339 48356398-32a2-884e-a903-53898d9a118a
2007-08-02 01:53:46 +00:00
Edward Z. Yang
710820cbe9 [2.1.0] Repair minor PHP4 regression due to undefined configuration directive
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1338 48356398-32a2-884e-a903-53898d9a118a
2007-08-02 01:48:43 +00:00
Edward Z. Yang
22ef52a7f6 [2.1.0] Migrate host blacklist functionality to URIFilter.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1336 48356398-32a2-884e-a903-53898d9a118a
2007-08-02 01:41:37 +00:00
Edward Z. Yang
4919187fc6 [2.1.0] Further refactoring of AttrDef_URI, creation of new URIFilter and URIDefinition subsystems.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1335 48356398-32a2-884e-a903-53898d9a118a
2007-08-02 01:12:27 +00:00
Edward Z. Yang
797b899305 [2.1.0] Create new URI object and migrate URI validation systems to use it. URIScheme interface changed.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1334 48356398-32a2-884e-a903-53898d9a118a
2007-08-01 18:34:46 +00:00
Edward Z. Yang
8c9dbe142d [2.1.0] Refactor AttrDef_URI: removed URIParser functionality
- Genericized flush-definition-cache script

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1333 48356398-32a2-884e-a903-53898d9a118a
2007-08-01 14:55:09 +00:00
Edward Z. Yang
2a002857ce [2.1.0] All unit tests inherit from HTMLPurifier_Harness, not UnitTestCase. prepareCommon() refactored to global test-case.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1332 48356398-32a2-884e-a903-53898d9a118a
2007-08-01 14:06:59 +00:00
Edward Z. Yang
9d98b45dea Fix typo in news file.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1331 48356398-32a2-884e-a903-53898d9a118a
2007-08-01 13:16:49 +00:00
Edward Z. Yang
b0f3116b9e [2.1.0] URI scheme is munged off if there is no authority and the scheme is the default one
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1330 48356398-32a2-884e-a903-53898d9a118a
2007-08-01 13:15:33 +00:00
Edward Z. Yang
b03a44abff Remove expectations from assertOutput in URITest.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1329 48356398-32a2-884e-a903-53898d9a118a
2007-08-01 02:19:43 +00:00
Edward Z. Yang
cf257cabde [2.1.0]
- AttrDef_URI unit tests refactored
- Block access to benchmarks: they should be called via command line

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1328 48356398-32a2-884e-a903-53898d9a118a
2007-08-01 01:48:51 +00:00
Edward Z. Yang
ab950a1909 [2.1.0] Fix fairly major bug introduced when logic was reorganized.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1324 48356398-32a2-884e-a903-53898d9a118a
2007-07-31 02:39:49 +00:00
Edward Z. Yang
a12ea4bb3b [2.1.0] Fix bug in mkdir_deep that would prevent absolute paths in Unix systems from being created properly
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1321 48356398-32a2-884e-a903-53898d9a118a
2007-07-31 02:04:32 +00:00
Edward Z. Yang
f80de908bd [2.1.0] Optimize ConfigSchema to only perform safety checks when HTMLPURIFIER_SCHEMA_STRICT is true
- Remove useless ->revision check in Config.php
- Add simple trace file to benchmarks folder

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1319 48356398-32a2-884e-a903-53898d9a118a
2007-07-31 01:04:38 +00:00
Edward Z. Yang
349c4de75b [2.1.0] Standalone file now can be generated using maintenance/merge-library.php. Also:
- HTMLPURIFIER_PREFIX constant added, and relevant files transitioned over
- Custom ChildDef added to default include list
- Tester accepts ?standalone parameter

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1316 48356398-32a2-884e-a903-53898d9a118a
2007-07-30 16:56:50 +00:00
Edward Z. Yang
89622c964e [2.1.0] Genericize element contents removal. This is done in a slightly hacky way since ElementDef is not available, but should be sufficient.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1313 48356398-32a2-884e-a903-53898d9a118a
2007-07-11 20:42:58 +00:00
Edward Z. Yang
732fe5cad7 [2.1.0] Two tiny bugfixes:
- Remove contents of <style> tags
- Use XHTMLStrict Tidy routines for XHTML 1.1

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1311 48356398-32a2-884e-a903-53898d9a118a
2007-07-11 20:06:15 +00:00
Edward Z. Yang
e7e81c0a5b [2.1.0] Fix some minor DirectLex bugs that may lead to PHP errors
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1310 48356398-32a2-884e-a903-53898d9a118a
2007-07-05 21:29:07 +00:00
Edward Z. Yang
626b2a13c8 Typographical and linkrot fixes for UTF-8 doc.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1308 48356398-32a2-884e-a903-53898d9a118a
2007-07-05 16:50:48 +00:00
Edward Z. Yang
35487c02ae Update test settings template.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1303 48356398-32a2-884e-a903-53898d9a118a
2007-06-30 16:13:10 +00:00
Edward Z. Yang
4bc1761b12 Update test settings file with more options.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1302 48356398-32a2-884e-a903-53898d9a118a
2007-06-30 05:02:27 +00:00
Edward Z. Yang
63f5414f2e [Phorum] Refactor settings.php into different files.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1298 48356398-32a2-884e-a903-53898d9a118a
2007-06-29 20:34:19 +00:00
Edward Z. Yang
88d014706b [Phorum] Double-reverse control.php's double-escaping
- Implement signature migration

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1297 48356398-32a2-884e-a903-53898d9a118a
2007-06-29 20:00:38 +00:00
Edward Z. Yang
f6de73d7e7 [Phorum] Deal more gracefully with signatures and edit messages. More improvements.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1296 48356398-32a2-884e-a903-53898d9a118a
2007-06-29 18:25:13 +00:00
Edward Z. Yang
733868a76d [2.1.0] Fix another AutoParagraph edge-case.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1295 48356398-32a2-884e-a903-53898d9a118a
2007-06-29 17:48:56 +00:00
Edward Z. Yang
fab6a212c8 Turn off WYSIWYG.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1286 48356398-32a2-884e-a903-53898d9a118a
2007-06-29 17:03:55 +00:00
Edward Z. Yang
ea1362ce5c [Phorum] Minor enhancements: add cache purge support and give a friendly HTML is on message above editor.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1281 48356398-32a2-884e-a903-53898d9a118a
2007-06-29 15:43:23 +00:00
Edward Z. Yang
cff498ef67 [2.1.0] Refine autoparagraphing algorithm.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1278 48356398-32a2-884e-a903-53898d9a118a
2007-06-29 03:57:14 +00:00
Edward Z. Yang
1765a7537a [Phorum] Fix cross-platform mutilation of cache data, remove excess newlines.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1277 48356398-32a2-884e-a903-53898d9a118a
2007-06-29 03:41:21 +00:00
Edward Z. Yang
d7157d0ccd Tweak to make more conducive to our new textareas.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1274 48356398-32a2-884e-a903-53898d9a118a
2007-06-29 02:17:17 +00:00
Edward Z. Yang
ed44b5c5ba [2.1.0] ConfigForm generates textareas instead of text inputs for lists, hashes, lookups, text and itext fields
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1273 48356398-32a2-884e-a903-53898d9a118a
2007-06-29 02:16:47 +00:00
Edward Z. Yang
5e5c0f3aa4 [2.1.0]
. Introduce new text/itext configuration directive values: these represent longer strings that would be more appropriately edited with a textarea
. Allow newlines to act as separators for lists, hashes, lookups and %HTML.Allowed

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1272 48356398-32a2-884e-a903-53898d9a118a
2007-06-29 01:54:48 +00:00
Edward Z. Yang
b2ed0aff01 [Phorum] Final polishing: Have default config auto-detect character encoding; add WYSIWYG hook; update error message to be more friendly
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1271 48356398-32a2-884e-a903-53898d9a118a
2007-06-29 00:48:55 +00:00
Edward Z. Yang
148681d1b0 Tidy up Phorum extension. Add svn:ignore, add img to the default list of allowed tags (for smileys), improve naming convention, move loading code out of main namespace, and add reset. Probably the last thing to do before this is feature complete is to have a WYSIWYG flag that turns on escaping for edit posts.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1270 48356398-32a2-884e-a903-53898d9a118a
2007-06-29 00:28:07 +00:00
Edward Z. Yang
2e7e411491 [2.1.0] Fix bug in auto-paragraphing: empty tags should be treated like start tags too.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1269 48356398-32a2-884e-a903-53898d9a118a
2007-06-29 00:24:59 +00:00
Edward Z. Yang
02051e465c [2.1.0] Phorum mod implemented for HTML Purifier. Some other code adjustments were made, they need to be cleaned up.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1267 48356398-32a2-884e-a903-53898d9a118a
2007-06-28 23:01:27 +00:00
Edward Z. Yang
a96b5bf612 [2.1.0] Friendly error messages for when injector needs a tag that's not allowed added
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1265 48356398-32a2-884e-a903-53898d9a118a
2007-06-28 13:06:15 +00:00
Edward Z. Yang
9dd7c8c7dd Add reference document on CSS lengths.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1264 48356398-32a2-884e-a903-53898d9a118a
2007-06-27 23:49:50 +00:00
Edward Z. Yang
0c59db1da3 Bring Null's flush() interface inline with parent.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1263 48356398-32a2-884e-a903-53898d9a118a
2007-06-27 21:03:07 +00:00
Edward Z. Yang
584a1abd15 [2.1.0] Standardize interface for Injector (we actually got it wrong in the 2.0.1-strict version, but this'll fix it)
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1262 48356398-32a2-884e-a903-53898d9a118a
2007-06-27 19:01:09 +00:00
Edward Z. Yang
a6ede3804e [2.1.0] True emoticon < fix.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1260 48356398-32a2-884e-a903-53898d9a118a
2007-06-27 16:40:18 +00:00
Edward Z. Yang
4476745003 Add new entries for 2.1.0 and 2.0.2
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1258 48356398-32a2-884e-a903-53898d9a118a
2007-06-27 15:16:27 +00:00
250 changed files with 17464 additions and 2890 deletions

1101
Doxyfile

File diff suppressed because it is too large Load Diff

211
INSTALL
View File

@@ -2,62 +2,57 @@
Install
How to install HTML Purifier
HTML Purifier is designed to run out of the box, so actually using the library
is extremely easy. (Although, if you were looking for a step-by-step
installation GUI, you've come to the wrong place!) The impatient can scroll
down to the bottom of this INSTALL document to see the code, but you really
should make sure a few things are properly done.
HTML Purifier is designed to run out of the box, so actually using the
library is extremely easy. (Although... if you were looking for a
step-by-step installation GUI, you've downloaded the wrong software!)
While the impatient can get going immediately with some of the sample
code at the bottom of this library, it's well worth performing some
basic sanity checks to get the most out of this library.
---------------------------------------------------------------------------
1. Compatibility
HTML Purifier works in both PHP 4 and PHP 5, from PHP 4.3.2 and up. It has no
core dependencies with other libraries.
THIS IS A DEPRECATED PHP4 VERSION OF HTML PURIFIER.
Optional extensions are iconv (usually installed) and tidy (also common).
If you use UTF-8 and don't plan on pretty-printing HTML, you can get away with
not having either of these extensions.
If you are running PHP5, please go to http://htmlpurifier.org to download
the latest version. This version of HTML Purifier is only actively tested
from PHP 4.3.7 to PHP 5.0.5. Essential security will be released for this branch
fixes will be issued for the PHP 4 version until August 8, 2008.
These optional extensions can enhance the capabilities of HTML Purifier:
* iconv : Converts text to and from non-UTF-8 encodings
* bcmath : Used for unit conversion and imagecrash protection
* tidy : Used for pretty-printing HTML
---------------------------------------------------------------------------
2. Reconnaissance
2. Including the library
A big plus of HTML Purifier is its inerrant support of standards, so
your web-pages should be standards-compliant. (They should also use
semantic markup, but that's another issue altogether, one HTML Purifier
cannot fix without reading your mind.)
Simply use:
require_once '/path/to/library/HTMLPurifier.auto.php';
...and you're good to go. Since HTML Purifier's codebase is fairly
large, I recommend only including HTML Purifier when you need it.
If you don't like your include_path to be fiddled around with, simply set
HTML Purifier's library/ directory to the include path yourself and then:
require_once 'HTMLPurifier.php';
Only the contents in the library/ folder are necessary, so you can remove
everything else when using HTML Purifier in a production environment.
3. Preparing the proper output environment
HTML Purifier is all about web-standards, so accordingly your webpages should
be standards compliant. HTML Purifier can deal with these doctypes:
HTML Purifier can process these doctypes:
* XHTML 1.0 Transitional (default)
* XHTML 1.0 Strict
* HTML 4.01 Transitional
* HTML 4.01 Strict
* XHTML 1.1 (sans Ruby)
* XHTML 1.1
...and these character encodings:
* UTF-8 (default)
* Any encoding iconv supports (support is crippled for i18n though)
* Any encoding iconv supports (with crippled internationalization support)
The defaults are there for a reason: they are best-practice choices that
should not be changed lightly. For those of you in the dark, you can determine
the doctype from this code in your HTML documents:
These defaults reflect what my choices where be if I were authoring an
HTML document, however, what you choose depends on the nature of your
codebase. If you don't know what doctype you are using, you can determine
the doctype from this identifier at the top of your source code:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
@@ -66,14 +61,34 @@ the doctype from this code in your HTML documents:
<meta http-equiv="Content-type" content="text/html;charset=ENCODING">
For legacy codebases these declarations may be missing. If that is the case,
STOP, and read docs/enduser-utf8.html
You may currently be vulnerable to XSS and other security threats, and HTML
Purifier won't be able to fix that.
If the character encoding declaration is missing, STOP NOW, and
read 'docs/enduser-utf8.html' (web accessible at
http://htmlpurifier.org/docs/enduser-utf8.html). In fact, even if it is
present, read this document anyway, as most websites specify character
encoding incorrectly.
---------------------------------------------------------------------------
3. Including the library
The procedure is quite simple:
require_once '/path/to/library/HTMLPurifier.auto.php';
I recommend only including HTML Purifier when you need it, because that
call represents the inclusion of a lot of PHP files which constitute
the bulk of HTML Purifier's memory usage.
If you don't like your include_path to be fiddled around with, simply set
HTML Purifier's library/ directory to the include path yourself and then:
require_once 'HTMLPurifier.php';
Only the contents in the library/ folder are necessary, so you can remove
everything else when using HTML Purifier in a production environment.
---------------------------------------------------------------------------
4. Configuration
HTML Purifier is designed to run out-of-the-box, but occasionally HTML
@@ -90,7 +105,6 @@ object and read on:
$config = HTMLPurifier_Config::createDefault();
4.1. Setting a different character encoding
You really shouldn't use any other encoding except UTF-8, especially if you
@@ -117,7 +131,6 @@ but please be cognizant of the issues the "solution" creates (for this
reason, I do not include the solution in this document).
4.2. Setting a different doctype
For those of you using HTML 4.01 Transitional, you can disable
@@ -134,7 +147,6 @@ Other supported doctypes include:
* XHTML 1.1
4.3. Other settings
There are more configuration directives which can be read about
@@ -144,55 +156,24 @@ your code. Some of the more interesting ones are configurable at the
demo <http://htmlpurifier.org/demo.php> and are well worth looking into
for your own system.
For example, you can fine tune allowed elements and attributes, convert
relative URLs to absolute ones, and even autoparagraph input text! These
are, respectively, %HTML.Allowed, %URI.MakeAbsolute and %URI.Base, and
%AutoFormat.AutoParagraph. The %Namespace.Directive naming convention
translates to:
$config->set('Namespace', 'Directive', $value);
E.g.
$config->set('HTML', 'Allowed', 'p,b,a[href],i');
$config->set('URI', 'Base', 'http://www.example.com');
$config->set('URI', 'MakeAbsolute', true);
$config->set('AutoFormat', 'AutoParagraph', true);
5. Using the code
The interface is mind-numbingly simple:
$purifier = new HTMLPurifier();
$clean_html = $purifier->purify( $dirty_html );
...or, if you're using the configuration object:
$purifier = new HTMLPurifier($config);
$clean_html = $purifier->purify( $dirty_html );
That's it! For more examples, check out docs/examples/ (they aren't very
different though). Also, docs/enduser-slow.html gives advice on what to
do if HTML Purifier is slowing down your application.
6. Quick install
First, make sure library/HTMLPurifier/DefinitionCache/Serializer is
writable by the webserver (see Section 7: Caching below for details).
If your website is in UTF-8 and XHTML Transitional, use this code:
<?php
require_once '/path/to/htmlpurifier/library/HTMLPurifier.auto.php';
$purifier = new HTMLPurifier();
$clean_html = $purifier->purify($dirty_html);
?>
If your website is in a different encoding or doctype, use this code:
<?php
require_once '/path/to/htmlpurifier/library/HTMLPurifier.auto.php';
$config = HTMLPurifier_Config::createDefault();
$config->set('Core', 'Encoding', 'ISO-8859-1'); // replace with your encoding
$config->set('HTML', 'Doctype', 'HTML 4.01 Transitional'); // replace with your doctype
$purifier = new HTMLPurifier($config);
$clean_html = $purifier->purify($dirty_html);
?>
7. Caching
---------------------------------------------------------------------------
5. Caching
HTML Purifier generates some cache files (generally one or two) to speed up
its execution. For maximum performance, make sure that
@@ -228,3 +209,49 @@ Or move the cache directory somewhere else (no trailing slash):
$config->set('Cache', 'SerializerPath', '/home/user/absolute/path');
---------------------------------------------------------------------------
6. Using the code
The interface is mind-numbingly simple:
$purifier = new HTMLPurifier();
$clean_html = $purifier->purify( $dirty_html );
...or, if you're using the configuration object:
$purifier = new HTMLPurifier($config);
$clean_html = $purifier->purify( $dirty_html );
That's it! For more examples, check out docs/examples/ (they aren't very
different though). Also, docs/enduser-slow.html gives advice on what to
do if HTML Purifier is slowing down your application.
---------------------------------------------------------------------------
7. Quick install
First, make sure library/HTMLPurifier/DefinitionCache/Serializer is
writable by the webserver (see Section 5: Caching above for details).
If your website is in UTF-8 and XHTML Transitional, use this code:
<?php
require_once '/path/to/htmlpurifier/library/HTMLPurifier.auto.php';
$purifier = new HTMLPurifier();
$clean_html = $purifier->purify($dirty_html);
?>
If your website is in a different encoding or doctype, use this code:
<?php
require_once '/path/to/htmlpurifier/library/HTMLPurifier.auto.php';
$config = HTMLPurifier_Config::createDefault();
$config->set('Core', 'Encoding', 'ISO-8859-1'); // replace with your encoding
$config->set('HTML', 'Doctype', 'HTML 4.01 Transitional'); // replace with your doctype
$purifier = new HTMLPurifier($config);
$clean_html = $purifier->purify($dirty_html);
?>

197
NEWS
View File

@@ -9,6 +9,203 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
. Internal change
==========================
ERRATA
- PH5P is seriously broken here; it can result in fatal errors and exceptions.
If you desire to use it, please use it with the latest, PHP5-only version of
HTML Purifier.
2.1.5, released 2008-06-19
! More robust imagecrash protection with height/width CSS with %CSS.MaxImgLength,
and height/width HTML with %HTML.MaxImgLength.
- AttrValidator operations are now atomic; updates to attributes are not
manifest in token until end of operations. This prevents naughty internal
code from directly modifying CurrentToken when they're not supposed to.
- Percent encoding checks enabled for URI query and fragment
- Disable percent height/width attributes for img
- Fix stray backslashes in font-family; CSS Unicode character escapes are
now properly resolved (although *only* in font-family).
- Improve parseCDATA algorithm to take into account newline normalization
- Account for browser confusion between Yen character and backslash in
Shift_JIS encoding. This fix generalizes to any other encoding which is not
a strict superset of printable ASCII.
- Improved adherence to Unicode by checking for non-character codepoints.
Thanks Geoffrey Sneddon for reporting. This may result in degraded
performance for extremely large inputs.
- Allow CSS property-value pair ''text-decoration: none''
. Added HTMLPurifier_UnitConverter and HTMLPurifier_Length for convenient
handling of CSS-style lengths. HTMLPurifier_AttrDef_CSS_Length now uses
this class.
. API of HTMLPurifier_AttrDef_CSS_Length changed from __construct($disable_negative)
to __construct($min, $max). __construct(true) is equivalent to
__construct('0'). (replace __construct with HTMLPurifier_AttrDef_CSS_Length)
. Added HTMLPurifier_AttrDef_Switch class
. Rename HTMLPurifier_HTMLModule_Tidy->construct() to setup() and bubble method
up inheritance hierarchy to HTMLPurifier_HTMLModule. All HTMLModules
get this called with the configuration object. All modules now
use this rather than __construct(), although legacy code using constructors
will still work--the new format, however, lets modules access the
configuration object for HTML namespace dependant tweaks.
. AttrDef_HTML_Pixels now takes a single construction parameter, pixels.
2.1.4, released 2008-05-18
! DefinitionCacheFactory now can register new implementations
! CSS properties are now case-insensitive
! Encoder optimized with valid UTF-8 input
! HTML Purifier's URI handling is a lot more robust, with much stricter
validation checks and better percent encoding handling.
- Colors missing # but in hex form will be corrected
- CSS Number algorithm improved
- Autoclose now operates iteratively, i.e. <span><span><div> now has
both span tags closed.
- Fix bug with trusted script handling in libxml versions later than 2.6.28.
- Fix bug in comment parsing with DirectLex
- Fix bug with rgb(0, 1, 2) color syntax with spaces inside shorthand syntax
- HTMLPurifier_HTMLDefinition->addAttribute can now be called multiple times
on the same element without emitting errors.
- Iconv uses set_error_handler instead of shut-up operator
- Add protection against imagecrash attack with CSS height/width
- HTMLPurifier::getInstance() renamed to HTMLPurifier::instance() for consistency
- Fixed bug with fallback languages in LanguageFactory
2.1.3, released 2007-11-05
! tests/multitest.php allows you to test multiple versions by running
tests/index.php through multiple interpreters using `phpv` shell
script (you must provide this script!)
- Fixed poor include ordering for Email URI AttrDefs, causes fatal errors
on some systems.
- Injector algorithm further refined: off-by-one error regarding skip
counts for dormant injectors fixed
- Corrective blockquote definition now enabled for HTML 4.01 Strict
- Fatal error when <img> tag (or any other element with required attributes)
has 'id' attribute fixed, thanks NykO18 for reporting
- Fix warning emitted when a non-supported URI scheme is passed to the
MakeAbsolute URIFilter, thanks NykO18 (again)
- Further refine AutoParagraph injector. Behavior inside of elements
allowing paragraph tags clarified: only inline content delimeted by
double newlines (not block elements) are paragraphed.
- Buggy treatment of end tags of elements that have required attributes
fixed (does not manifest on default tag-set)
- Spurious internal content reorganization error suppressed
- HTMLDefinition->addElement now returns a reference to the created
element object, as implied by the documentation
- Phorum mod's HTML Purifier help message expanded (unreleased elsewhere)
- Fix a theoretical class of infinite loops from DirectLex reported
by Nate Abele
- Work around unnecessary DOMElement type-cast in PH5P that caused errors
in PHP 5.1
- Work around PHP 4 SimpleTest lack-of-error complaining for one-time-only
HTMLDefinition errors, this may indicate problems with error-collecting
facilities in PHP 5
- Make ErrorCollectorEMock work in both PHP 4 and PHP 5
- Make PH5P work with PHP 5.0 by removing unnecessary array parameter typedef
. %Core.AcceptFullDocuments renamed to %Core.ConvertDocumentToFragment
to better communicate its purpose
. Error unit tests can now specify the expectation of no errors. Future
iterations of the harness will be extremely strict about what errors
are allowed
. Extend Injector hooks to allow for more powerful injector routines
. HTMLDefinition->addBlankElement created, as according to the HTMLModule
method
. Doxygen configuration file updated, with minor improvements
. Test runner now checks for similarly named files in conf/ directory too.
. Minor cosmetic change to flush-definition-cache.php: trailing newline is
outputted
. Maintenance script for generating PH5P patch added, original PH5P source
file also added under version control
. Full unit test runner script title made more descriptive with PHP version
. Updated INSTALL file to state that 4.3.7 is the earliest version we
are actively testing
2.1.2, released 2007-09-03
! Implemented Object module for trusted users
! Implemented experimental HTML5 parsing mode using PH5P. To use, add
this to your code:
require_once 'HTMLPurifier/Lexer/PH5P.php';
$config->set('Core', 'LexerImpl', 'PH5P');
Note that this Lexer introduces some classes not in the HTMLPurifier
namespace. Also, this is PHP5 only.
! CSS property border-spacing implemented
- Fix non-visible parsing error in DirectLex with empty tags that have
slashes inside attribute values.
- Fix typo in CSS definition: border-collapse:seperate; was incorrectly
accepted as valid CSS. Usually non-visible, because this styling is the
default for tables in most browsers. Thanks Brett Zamir for pointing
this out.
- Fix validation errors in configuration form
- Hammer out a bunch of edge-case bugs in the standalone distribution
- Inclusion reflection removed from URISchemeRegistry; you must manually
include any new schema files you wish to use
- Numerous typo fixes in documentation thanks to Brett Zamir
. Unit test refactoring for one logical test per test function
. Config and context parameters in ComplexHarness deprecated: instead, edit
the $config and $context member variables
. HTML wrapper in DOMLex now takes DTD identifiers into account; doesn't
really make a difference, but is good for completeness sake
. merge-library.php script refactored for greater code reusability and
PHP4 compatibility
2.1.1, released 2007-08-04
- Fix show-stopper bug in %URI.MakeAbsolute functionality
- Fix PHP4 syntax error in standalone version
. Add prefix directory to include path for standalone, this prevents
other installations from clobbering the standalone's URI schemes
. Single test methods can be invoked by prefixing with __only
2.1.0, released 2007-08-02
# flush-htmldefinition-cache.php superseded in favor of a generic
flush-definition-cache.php script, you can clear a specific cache
by passing its name as a parameter to the script
! Phorum mod implemented for HTML Purifier
! With %Core.AggressivelyFixLt, <3 and similar emoticons no longer
trigger HTML removal in PHP5 (DOMLex). This directive is not necessary
for PHP4 (DirectLex).
! Standalone file now available, which greatly reduces the amount of
includes (although there are still a few files that reside in the
standalone folder)
! Relative URIs can now be transformed into their absolute equivalents
using %URI.Base and %URI.MakeAbsolute
! Ruby implemented for XHTML 1.1
! You can now define custom URI filtering behavior, see enduser-uri-filter.html
for more details
! UTF-8 font names now supported in CSS
- AutoFormatters emit friendly error messages if tags or attributes they
need are not allowed
- ConfigForm's compactification of directive names is now configurable
- AutoParagraph autoformatter algorithm refined after field-testing
- XHTML 1.1 now applies XHTML 1.0 Strict cleanup routines, namely
blockquote wrapping
- Contents of <style> tags removed by default when tags are removed
. HTMLPurifier_Config->getSerial() implemented, this is extremely useful
for output cache invalidation
. ConfigForm printer now can retrieve CSS and JS files as strings, in
case HTML Purifier's directory is not publically accessible
. Introduce new text/itext configuration directive values: these represent
longer strings that would be more appropriately edited with a textarea
. Allow newlines to act as separators for lists, hashes, lookups and
%HTML.Allowed
. ConfigForm generates textareas instead of text inputs for lists, hashes,
lookups, text and itext fields
. Hidden element content removal genericized: %Core.HiddenElements can
be used to customize this behavior, by default <script> and <style> are
hidden
. Added HTMLPURIFIER_PREFIX constant, should be used instead of dirname(__FILE__)
. Custom ChildDef added to default include list
. URIScheme reflection improved: will not attempt to include file if class
already exists. May clobber autoload, so I need to keep an eye on it
. ConfigSchema heavily optimized, will only collect information and validate
definitions when HTMLPURIFIER_SCHEMA_STRICT is true.
. AttrDef_URI unit tests and implementation refactored
. benchmarks/ directory now protected from public view with .htaccess file;
run the tests via command line
. URI scheme is munged off if there is no authority and the scheme is the
default one
. All unit tests inherit from HTMLPurifier_Harness, not UnitTestCase
. Interface for URIScheme changed
. Generic URI object to hold components of URI added, most systems involved
in URI validation have been migrated to use it
. Custom filtering for URIs factored out to URIDefinition interface for
maximum extensibility
2.0.1, released 2007-06-27
! Tag auto-closing now based on a ChildDef heuristic rather than a
manually set auto_close array; some behavior may change

36
TODO
View File

@@ -7,14 +7,9 @@ TODO List
? Maybe I'll Do It
==========================
2.1 release [Refactor, refactor!]
# URI validation routines tighter (see docs/dev-code-quality.html) (COMPLEX)
# Advanced URI filtering schemes (see docs/proposal-new-directives.txt)
# Ruby support
- Configuration profiles: predefined directives set with one func call
- Implement IDREF support (harder than it seems, since you cannot have
IDREFs to non-existent IDs)
- Allow non-ASCII characters in font names
If no interest is expressed for a feature that may required a considerable
amount of effort to implement, it may get endlessly delayed. Do not be
afraid to cast your vote for the next feature to be implemented!
2.2 release [Error'ed]
# Error logging for filtering/cleanup procedures
@@ -34,21 +29,22 @@ TODO List
- Remove empty inline tags<i></i>
- Append something to duplicate IDs so they're still usable (impl. note: the
dupe detector would also need to detect the suffix as well)
- Externalize inline CSS to promote clean HTML
2.4 release [It's All About Trust] (floating)
# Implement untrusted, dangerous elements/attributes
# Implement IDREF support (harder than it seems, since you cannot have
IDREFs to non-existent IDs)
# Frameset XHTML 1.0 and HTML 4.01 doctypes
3.0 release [Beyond HTML]
# Legit token based CSS parsing (will require revamping almost every
AttrDef class)
AttrDef class). Probably will use CSSTidy class
# More control over allowed CSS properties (maybe modularize it in the
same fashion!)
# Formatters for plaintext
- Smileys
- Standardize token armor for all areas of processing
- Fixes for Firefox's inability to handle COL alignment props (Bug 915)
- Automatically add non-breaking spaces to empty table cells when
empty-cells:show is applied to have compatibility with Internet Explorer
- Convert RTL/LTR override characters to <bdo> tags, or vice versa on demand.
Also, enable disabling of directionality
@@ -61,31 +57,33 @@ TODO List
Ongoing
- Lots of profiling, make it faster!
- Plugins for major CMSes (COMPLEX)
- WordPress (mostly written, needs beta-testing)
- phpBB
- Phorum
- eFiction
- more! (look for ones that use WYSIWYGs)
- Complete basic smoketests
Unknown release (on a scratch-an-itch basis)
? Semi-lossy dumb alternate character encoding transfor
# CHMOD install script for PEAR installs
? Have 'lang' attribute be checked against official lists, achieved by
encoding all characters that have string entity equivalents
- Explain how to use HTML Purifier in non-PHP languages / create
a simple command line stub
- Abstract ChildDef_BlockQuote to work with all elements that only
allow blocks in them, required or optional
- Reorganize Unit Tests
- Refactor loop tests (esp. AttrDef_URI)
- Reorganize configuration directives (Create more namespaces! Get messy!)
- Advanced URI filtering schemes (see docs/proposal-new-directives.txt)
- Implement lenient <ruby> child validation
- Explain how to use HTML Purifier in non-PHP languages / create
a simple command line stub (or complicated?)
- Fixes for Firefox's inability to handle COL alignment props (Bug 915)
- Automatically add non-breaking spaces to empty table cells when
empty-cells:show is applied to have compatibility with Internet Explorer
Requested
Wontfix
- Non-lossy smart alternate character encoding transformations (unless
patch provided)
- Pretty-printing HTML, users can use Tidy on the output on entire page
- Pretty-printing HTML: users can use Tidy on the output on entire page
- Native content compression, whitespace stripping (don't rely on Tidy, make
sure we don't remove from <pre> or related tags): use gzip if this is
really important

View File

@@ -1 +1 @@
2.0.1
2.1.5

View File

@@ -1,12 +1,7 @@
The 2.0.1 release introduces a number of stability and usability fixes,
as well as a number of (disabled by default) experimental features. The
security-minded should note that a reflected XSS vulnerability was patched
in smoketests/configForm.php; if you cannot upgrade immediately, please
delete that file (if that directory is not publically accessible, there
is no security risk). The maintenance changes include more helpful file
permissions errors, internal newline normalization, reordered includes
to prevent a missing class definition in some setups, and better cache
revision and id handling. The two experimental features are auto-formatting
(auto-paragraphing and linkification) and error collection, these can
be enabled with %AutoFormat.AutoParagraph, %AutoFormat.Linkify and
%Core.CollectErrors respectively.
Security and bugfix release 2.1.5 is a backport that fixes two vulnerabilities
related to CSS, one of which only occurs under Shift_JIS. It also improves
imagecrash protection (percent CSS width and height is now disabled for
images, and you can control the bounds with %CSS.MaxImgLength and
%HTML.MaxImgLength). Finally, there are number of bug fixes, most notably
support for text-decoration: none, improved adherence to Unicode and increased
percent encoding checks.

1
benchmarks/.htaccess Normal file
View File

@@ -0,0 +1 @@
Deny from all

12
benchmarks/Trace.php Normal file
View File

@@ -0,0 +1,12 @@
<?php
ini_set('xdebug.trace_format', 1);
ini_set('xdebug.show_mem_delta', true);
xdebug_start_trace(dirname(__FILE__) . '/Trace');
require_once '../library/HTMLPurifier.auto.php';
$purifier = new HTMLPurifier();
$data = $purifier->purify(file_get_contents('samples/Lexer/4.html'));
xdebug_stop_trace();

View File

@@ -18,6 +18,8 @@ TODO:
if (version_compare('5', PHP_VERSION, '>')) exit('Requires PHP 5 or higher.');
error_reporting(E_ALL); // probably not possible to use E_STRICT
define('HTMLPURIFIER_SCHEMA_STRICT', true); // description data needs to be collected
// load dual-libraries
require_once '../library/HTMLPurifier.auto.php';
require_once 'library/ConfigDoc.auto.php';

View File

@@ -11,8 +11,7 @@ docs/examples/demo.php - ad hoc HTML/PHP soup to the extreme
AttrDef - a lot of duplication, more generic classes need to be created;
a lot of strtolower() calls, no legit casing
Class - doesn't support Unicode characters (fringe); uses regular
expressions
Class - doesn't support Unicode characters (fringe); uses regular expressions
Lang - code duplication; premature optimization
Length - easily mistaken for CSSLength
URI - multiple regular expressions; missing validation for parts (?)
@@ -22,9 +21,6 @@ ConfigSchema - redefinition is a mess
Strategy
FixNesting - cannot bubble nodes out of structures, duplicated checks
for special-case parent node
MakeWellFormed - insufficient automatic closing definitions (check HTML
spec for optional end tags, also, closing based on type (block/inline)
might be efficient).
RemoveForeignElements - should be run in parallel with MakeWellFormed
URIScheme - needs to have callable generic checks
mailto - doesn't validate emails, doesn't validate querystring

View File

@@ -39,7 +39,7 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
<table cellspacing="0"><tbody>
<tr><td class="impl-yes">Implemented</td></tr>
<tr><td class="impl-partial">Partially implemented</td></tr>
<tr><td class="impl-no">Will not implement</td></tr>
<tr><td class="impl-no">Not priority to implement</td></tr>
<tr><td class="danger">Dangerous attribute/property</td></tr>
<tr><td class="css1">Present in CSS1</td></tr>
<tr><td class="feature">Feature, requires extra work</td></tr>
@@ -118,6 +118,7 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
<tbody>
<tr><th colspan="2">Table</th></tr>
<tr class="impl-yes"><td>border-collapse</td><td>ENUM(collapse, seperate)</td></tr>
<tr class="impl-yes"><td>border-space</td><td>MULTIPLE</td></tr>
<tr class="impl-yes"><td>caption-side</td><td>ENUM(top, bottom)</td></tr>
<tr class="feature"><td>empty-cells</td><td>ENUM(show, hide), No IE support makes this useless,
possible fix with &amp;nbsp;? Unknown release milestone.</td></tr>

View File

@@ -32,7 +32,7 @@
Before we even write any code, it is paramount to consider whether or
not the code we're writing is necessary or not. HTML Purifier, by default,
contains a large set of elements and attributes: large enough so that
<em>any</em> element or attribute in XHTML 1.0 (and its HTML variant)
<em>any</em> element or attribute in XHTML 1.0 or 1.1 (and its HTML variants)
that can be safely used by the general public is implemented.
</p>
@@ -76,11 +76,12 @@
<h3>XHTML 1.1</h3>
<p>
We have not implemented the
As of HTMLPurifier 2.1.0, we have implemented the
<a href="http://www.w3.org/TR/2001/REC-ruby-20010531/">Ruby module</a>,
which defines a set of tags
for publishing short annotations for text, used mostly in Japanese
and Chinese school texts.
and Chinese school texts, but applicable for positioning any text (not
limited to translations) above or below other corresponding text.
</p>
<h3>XHTML 2.0</h3>
@@ -492,10 +493,11 @@ $def =& $config->getHTMLDefinition(true);
<p>
The <code>(%flow;)*</code> indicates the allowed children of the
<code>li</code> tag: <code>li</code> allows any number of flow
elements as its children. In HTML Purifier, we'd write it like
<code>Flow</code> (here's where the content sets we were
discussing earlier come into play). There are three shorthand content models you
can specify:
elements as its children. (The <code>- O</code> allows the closing tag to be
omitted, though in XML this is not allowed.) In HTML Purifier,
we'd write it like <code>Flow</code> (here's where the content sets
we were discussing earlier come into play). There are three shorthand
content models you can specify:
</p>
<table class="table">
@@ -668,12 +670,22 @@ $def =& $config->getHTMLDefinition(true);
Common is a combination of the above-mentioned collections.
</p>
<p class="aside">
Readers familiar with the modularization may have noticed that the Core
attribute collection differs from that specified by the <a
href="http://www.w3.org/TR/xhtml-modularization/abstract_modules.html#s_commonatts">abstract
modules of the XHTML Modularization 1.1</a>. We believe this section
to be in error, as <code>br</code> permits the use of the <code>style</code>
attribute even though it uses the <code>Core</code> collection, and
the DTD and XML Schemas supplied by W3C support our interpretation.
</p>
<h3>Attributes</h3>
<p>
If you didn't read the <a href="#addAttribute">previous section on
If you didn't read the <a href="#addAttribute">earlier section on
adding attributes</a>, read it now. The last parameter is simply
array of attribute names to attribute implementations, in the exact
an array of attribute names to attribute implementations, in the exact
same format as <code>addAttribute()</code>.
</p>

View File

@@ -58,7 +58,7 @@ appear elsewhere on the document. The method is simple:</p>
<pre>$config->set('HTML', 'EnableAttrID', true);
$config->set('Attr', 'IDBlacklist' array(
'list', 'of', 'attributes', 'that', 'are', 'forbidden'
'list', 'of', 'attribute', 'values', 'that', 'are', 'forbidden'
));</pre>
<p>That being said, there are some notable drawbacks. First of all, you have to
@@ -71,9 +71,9 @@ to possible standards-compliance issues.</p>
<p>Furthermore, this position becomes untenable when a single web page must hold
multiple portions of user-submitted content. Since there's obviously no way
to find out before-hand what IDs users will use, the blacklist is helpless.
And even since HTML Purifier validates each segment seperately, perhaps doing
And since HTML Purifier validates each segment separately, perhaps doing
so at different times, it would be extremely difficult to dynamically update
the blacklist inbetween runs.</p>
the blacklist in between runs.</p>
<p>Finally, simply destroying the ID is extremely un-userfriendly behavior: after
all, they might have simply specified a duplicate ID by accident.</p>

View File

@@ -10,9 +10,7 @@ to be effective. Things to remember:
2. IDs: see enduser-id.html for more info
3. Links: document pending feature completion
Rudimentary blacklisting, we should also allow only relative URIs. We
need a doc to explain the stuff.
3. URIs: see enduser-uri-filter.html
4. CSS: document pending
Explain which CSS styles we blocked and why.

View File

@@ -22,7 +22,7 @@ out:</p>
<p class="emphasis">This ain't HTML Tidy!</p>
<p>Rather, Tidy stands for a cool set of Tidy-inspired in HTML Purifier
<p>Rather, Tidy stands for a cool set of Tidy-inspired features in HTML Purifier
that allows users to submit deprecated elements and attributes and get
valid strict markup back. For example:</p>
@@ -33,8 +33,8 @@ valid strict markup back. For example:</p>
<pre>&lt;div style=&quot;text-align:center;&quot;&gt;Centered&lt;/div&gt;</pre>
<p>...when this particular fix is run on the HTML. This tutorial will give
you down the lowdown of what exactly HTML Purifier will do when Tidy
is on, and how to fine tune this behavior. Once again, <strong>you do
you the lowdown of what exactly HTML Purifier will do when Tidy
is on, and how to fine-tune this behavior. Once again, <strong>you do
not need Tidy installed on your PHP to use these features!</strong></p>
<h2>What does it do?</h2>
@@ -221,7 +221,7 @@ general syntax:</p>
<p>The lowdown is, quite frankly, HTML Purifier's default settings are
probably good enough. The next step is to bump the level up to heavy,
and if that still doesn't satisfy your appetite, do some fine tuning.
and if that still doesn't satisfy your appetite, do some fine-tuning.
Other than that, don't worry about it: this all works silently and
effectively in the background.</p>

View File

@@ -0,0 +1,201 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="description" content="Tutorial for creating custom URI filters." />
<link rel="stylesheet" type="text/css" href="style.css" />
<title>URI Filters - HTML Purifier</title>
</head><body>
<h1>URI Filters</h1>
<div id="filing">Filed under End-User</div>
<div id="index">Return to the <a href="index.html">index</a>.</div>
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
<p>
This is a quick and dirty document to get you on your way to writing
custom URI filters for your own URL filtering needs. Why would you
want to write a URI filter? If you need URIs your users put into
HTML to magically change into a different URI, this is
exactly what you need!
</p>
<h2>Creating the class</h2>
<p>
Any URI filter you make will be a subclass of <code>HTMLPurifier_URIFilter</code>.
The scaffolding is thus:
</p>
<pre>class HTMLPurifier_URIFilter_<strong>NameOfFilter</strong> extends HTMLPurifier_URIFilter
{
var $name = '<strong>NameOfFilter</strong>';
function prepare($config) {}
function filter(&$uri, $config, &$context) {}
}</pre>
<p>
Fill in the variable <code>$name</code> with the name of your filter, and
take a look at the two methods. <code>prepare()</code> is an initialization
method that is called only once, before any filtering has been done of the
HTML. Use it to perform any costly setup work that only needs to be done
once. <code>filter()</code> is the guts and innards of our filter:
it takes the URI and does whatever needs to be done to it.
</p>
<p>
If you've worked with HTML Purifier, you'll recognize the <code>$config</code>
and <code>$context</code> parameters. On the other hand, <code>$uri</code>
is something unique to this section of the application: it's a
<code>HTMLPurifier_URI</code> object. The interface is thus:
</p>
<pre>class HTMLPurifier_URI
{
var $scheme, $userinfo, $host, $port, $path, $query, $fragment;
function HTMLPurifier_URI($scheme, $userinfo, $host, $port, $path, $query, $fragment);
function toString();
function copy();
function getSchemeObj($config, &$context);
function validate($config, &$context);
}</pre>
<p>
The first three methods are fairly self-explanatory: you have a constructor,
a serializer, and a cloner. Generally, you won't be using them when
you are manipulating the URI objects themselves.
<code>getSchemeObj()</code> is a special purpose method that returns
a <code>HTMLPurifier_URIScheme</code> object corresponding to the specific
URI at hand. <code>validate()</code> performs general-purpose validation
on the internal components of a URI. Once again, you don't need to
worry about these: they've already been handled for you.
</p>
<h2>URI format</h2>
<p>
As a URIFilter, we're interested in the member variables of the URI object.
</p>
<table class="quick"><tbody>
<tr><th>Scheme</th> <td>The protocol for identifying (and possibly locating) a resource (http, ftp, https)</td></tr>
<tr><th>Userinfo</th> <td>User information such as a username (bob)</td></tr>
<tr><th>Host</th> <td>Domain name or IP address of the server (example.com, 127.0.0.1)</td></tr>
<tr><th>Port</th> <td>Network port number for the server (80, 12345)</td></tr>
<tr><th>Path</th> <td>Data that identifies the resource, possibly hierarchical (/path/to, ed@example.com)</td></tr>
<tr><th>Query</th> <td>String of information to be interpreted by the resource (?q=search-term)</td></tr>
<tr><th>Fragment</th> <td>Additional information for the resource after retrieval (#bookmark)</td></tr>
</tbody></table>
<p>
Because the URI is presented to us in this form, and not
<code>http://bob@example.com:8080/foo.php?q=string#hash</code>, it saves us
a lot of trouble in having to parse the URI every time we want to filter
it. For the record, the above URI has the following components:
</p>
<table class="quick"><tbody>
<tr><th>Scheme</th> <td>http</td></tr>
<tr><th>Userinfo</th> <td>bob</td></tr>
<tr><th>Host</th> <td>example.com</td></tr>
<tr><th>Port</th> <td>8080</td></tr>
<tr><th>Path</th> <td>/foo.php</td></tr>
<tr><th>Query</th> <td>q=string</td></tr>
<tr><th>Fragment</th> <td>hash</td></tr>
</tbody></table>
<p>
Note that there is no question mark or octothorpe in the query or
fragment: these get removed during parsing.
</p>
<p>
With this information, you can get straight to implementing your
<code>filter()</code> method. But one more thing...
</p>
<h2>Return value: Boolean, not URI</h2>
<p>
You may have noticed that the URI is being passed in by reference.
This means that whatever changes you make to it, those changes will
be reflected in the URI object the callee had. <strong>Do not
return the URI object: it is unnecessary and will cause bugs.</strong>
Instead, return a boolean value, true if the filtering was successful,
or false if the URI is beyond repair and needs to be axed.
</p>
<p>
Let's suppose I wanted to write a filter that de-internationalized domain
names by converting them to <a href="http://en.wikipedia.org/wiki/Punycode">Punycode</a>.
Assuming that <code>punycode_encode($input)</code> converts <code>$input</code> to
Punycode and returns <code>false</code> on failure:
</p>
<pre>class HTMLPurifier_URIFilter_ConvertIDNToPunycode extends HTMLPurifier_URIFilter
{
var $name = 'ConvertIDNToPunycode';
function filter(&$uri, $config, &$context) {
if (is_null($uri->host)) return true;
if ($uri->host == utf8_decode($uri->host)) {
// is ASCII, abort
return true;
}
$host = punycode_encode($uri->host);
if ($host === false) return false;
$uri->host = $host;
return true;
}
}</pre>
<p>
Notice I did not <code>return $uri;</code>.
</p>
<h2>Activating your filter</h2>
<p>
Having a filter is all well and good, but you need to tell HTML Purifier
to use it. Fortunately, this part's simple:
</p>
<pre>$uri =& $config->getDefinition('URI');
$uri->addFilter(new HTMLPurifier_URIFilter_<strong>NameOfFilter</strong>());</pre>
<p>
If you want to be really fancy, you can define a configuration directive
for your filter and have HTML Purifier automatically manage whether or
not your filter gets loaded or not (this is how internal filters manage
things):
</p>
<pre>HTMLPurifier_ConfigSchema::define(
'URI', '<strong>NameOfFilter</strong>', false, 'bool',
'<strong>What your filter does.</strong>'
);
$uri =& $config->getDefinition('URI', true);
$uri->registerFilter(new HTMLPurifier_URIFilter_<strong>NameOfFilter</strong>());
</pre>
<p>
Now, your filter will only be called when %URI.<strong>NameOfFilter</strong>
is set to true.
</p>
<h2>Examples</h2>
<p>
Check the
<a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/URIFilter/">URIFilter</a>
directory for more implementation examples, and see <a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/docs/proposal-new-directives.txt">the
new directives proposal document</a> for ideas on what could be implemented
as a filter.
</p>
<div id="version">$Id$</div>
</body></html>

View File

@@ -96,7 +96,7 @@ which can be a rewarding (but difficult) task.</p>
<h2 id="findcharset">Finding the real encoding</h2>
<p>In the beginning, there was ASCII, and things were simple. But they
weren't good, for no one could write in Cryllic or Thai. So there
weren't good, for no one could write in Cyrillic or Thai. So there
exploded a proliferation of character encodings to remedy the problem
by extending the characters ASCII could express. This ridiculously
simplified version of the history of character encodings shows us that
@@ -138,7 +138,7 @@ browser:</p>
<dd>View &gt; Encoding: bulleted item is unofficial name</dd>
</dl>
<p>Internet Explorer won't give you the mime (i.e. useful/real) name of the
<p>Internet Explorer won't give you the MIME (i.e. useful/real) name of the
character encoding, so you'll have to look it up using their description.
Some common ones:</p>
@@ -216,6 +216,12 @@ if your <code>META</code> tag claims that either:</p>
<h2 id="fixcharset">Fixing the encoding</h2>
<p class="aside">The advice given here is for pages being served as
vanilla <code>text/html</code>. Different practices must be used
for <code>application/xml</code> or <code>application/xml+xhtml</code>, see
<a href="http://www.w3.org/TR/2002/NOTE-xhtml-media-types-20020430/">W3C's
document on XHTML media types</a> for more information.</p>
<p>If your <code>META</code> encoding and your real encoding match,
savvy! You can skip this section. If they don't...</p>
@@ -231,7 +237,7 @@ of your real encoding.</p>
why the character encoding should be explicitly stated. When the
browser isn't told what the character encoding of a text is, it
has to guess: and sometimes the guess is wrong. Hackers can manipulate
this guess in order to slip XSS pass filters and then fool the
this guess in order to slip XSS past filters and then fool the
browser into executing it as active code. A great example of this
is the <a href="http://shiflett.org/archive/177">Google UTF-7
exploit</a>.</p>
@@ -302,7 +308,8 @@ languages</a>. The appropriate code is:</p>
<p>...replacing UTF-8 with whatever your embedded encoding is.
This code must come before any output, so be careful about
stray whitespace in your application.</p>
stray whitespace in your application (i.e., any whitespace before
output excluding whitespace within &lt;?php ?&gt; tags).</p>
<h4 id="fixcharset-server-phpini">PHP ini directive</h4>
@@ -313,8 +320,8 @@ header call: <code><a href="http://php.net/ini.core#ini.default-charset">default
<p>...will also do the trick. If PHP is running as an Apache module (and
not as FastCGI, consult
<a href="http://php.net/phpinfo">phpinfo</a>() for details), you can even use htaccess do apply this property
globally:</p>
<a href="http://php.net/phpinfo">phpinfo</a>() for details), you can even use htaccess to apply this property
across many PHP files:</p>
<pre><a href="http://php.net/configuration.changes#configuration.changes.apache">php_value</a> default_charset &quot;UTF-8&quot;</pre>
@@ -360,10 +367,11 @@ to send anything at all:</p>
<pre><a href="http://httpd.apache.org/docs/1.3/mod/core.html#adddefaultcharset">AddDefaultCharset</a> Off</pre>
<p>...making your <code>META</code> tags the sole source of
character encoding information. In these cases, it is
<em>especially</em> important to make sure you have valid <code>META</code>
tags on your pages and all the text before them is ASCII.</p>
<p>...making your internal charset declaration (usually the <code>META</code> tags)
the sole source of character encoding
information. In these cases, it is <em>especially</em> important to make
sure you have valid <code>META</code> tags on your pages and all the
text before them is ASCII.</p>
<blockquote class="aside"><p>These directives can also be
placed in httpd.conf file for Apache, but
@@ -428,28 +436,30 @@ IIS to change character encodings, I'd be grateful.</p>
<p><code>META</code> tags are the most common source of embedded
encodings, but they can also come from somewhere else: XML
processing instructions. They look like:</p>
Declarations. They look like:</p>
<pre>&lt;?xml version=&quot;1.0&quot; encoding=&quot;UTF-8&quot;?&gt;</pre>
<p>...and are most often found in XML documents (including XHTML).</p>
<p>For XHTML, this processing instruction theoretically
<p>For XHTML, this XML Declaration theoretically
overrides the <code>META</code> tag. In reality, this happens only when the
XHTML is actually served as legit XML and not HTML, which is almost always
never due to Internet Explorer's lack of support for
<code>application/xhtml+xml</code> (even though doing so is often
argued to be <a href="http://www.hixie.ch/advocacy/xhtml">good practice</a>).</p>
argued to be <a href="http://www.hixie.ch/advocacy/xhtml">good
practice</a> and is required by the XHTML 1.1 specification).</p>
<p>For XML, however, this processing instruction is extremely important.
<p>For XML, however, this XML Declaration is extremely important.
Since most webservers are not configured to send charsets for .xml files,
this is the only thing a parser has to go on. Furthermore, the default
for XML files is UTF-8, which often butts heads with more common
ISO-8859-1 encoding (you see this in garbled RSS feeds).</p>
<p>In short, if you use XHTML and have gone through the
trouble of adding the XML header, make sure it jives
with your <code>META</code> tags and HTTP headers.</p>
trouble of adding the XML Declaration, make sure it jives
with your <code>META</code> tags (which should only be present
if served in text/html) and HTTP headers.</p>
<h3 id="fixcharset-internals">Inside the process</h3>
@@ -506,7 +516,7 @@ usage in one language sometimes requires the occasional special character
that, without surprise, is not available in your character set. Sometimes
developers get around this by adding support for multiple encodings: when
using Chinese, use Big5, when using Japanese, use Shift-JIS, when
using Greek, etc. Other times, they use character entities with great
using Greek, etc. Other times, they use character references with great
zeal.</p>
<p>UTF-8, however, obviates the need for any of these complicated
@@ -520,14 +530,14 @@ you don't have to use those user-unfriendly entities.</p>
<p>Websites encoded in Latin-1 (ISO-8859-1) which ocassionally need
a special character outside of their scope often will use a character
entity to achieve the desired effect. For instance, &theta; can be
entity reference to achieve the desired effect. For instance, &theta; can be
written <code>&amp;theta;</code>, regardless of the character encoding's
support of Greek letters.</p>
<p>This works nicely for limited use of special characters, but
say you wanted this sentence of Chinese text: &#28608;&#20809;,
&#36889;&#20841;&#20491;&#23383;&#26159;&#29978;&#40636;&#24847;&#24605;.
The entity-ized version would look like this:</p>
The ampersand encoded version would look like this:</p>
<pre>&amp;#28608;&amp;#20809;, &amp;#36889;&amp;#20841;&amp;#20491;&amp;#23383;&amp;#26159;&amp;#29978;&amp;#40636;&amp;#24847;&amp;#24605;</pre>
@@ -545,7 +555,7 @@ an application that originally used ISO-8859-1 but switched to UTF-8
when it became far to cumbersome to support foreign languages. Bots
will now actually go through articles and convert character entities
to their corresponding real characters for the sake of user-friendliness
and searcheability. See
and searchability. See
<a href="http://meta.wikimedia.org/wiki/Help:Special_characters">Meta's
page on special characters</a> for more details.
</p></blockquote>
@@ -567,10 +577,11 @@ which may be used by POST, and is required when you want to upload
files.</p>
<p>The following is a summarization of notes from
<a href="http://ppewww.physics.gla.ac.uk/~flavell/charset/form-i18n.html">
<a href="http://web.archive.org/web/20060427015200/ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html">
<code>FORM</code> submission and i18n</a>. That document contains lots
of useful information, but is written in a rambly manner, so
here I try to get right to the point.</p>
here I try to get right to the point. (Note: the original has
disappeared off the web, so I am linking to the Web Archive copy.)</p>
<h4 id="whyutf8-forms-urlencoded"><code>application/x-www-form-urlencoded</code></h4>
@@ -592,7 +603,7 @@ browser you're using, they might:</p>
<ul>
<li>Replace the unsupported characters with useless question marks,</li>
<li>Attempt to fix the characters (example: smart quotes to regular quotes),</li>
<li>Replace the character with a character entity, or</li>
<li>Replace the character with a character entity reference, or</li>
<li>Send it anyway as a different character encoding mixed in
with the original encoding (usually Windows-1252 rather than
iso-8859-1 or UTF-8 interspersed in 8-bit)</li>
@@ -608,7 +619,7 @@ since UTF-8 supports every character.</p>
<h4 id="whyutf8-forms-multipart"><code>multipart/form-data</code></h4>
<p>Multipart form submission takes a way a lot of the ambiguity
<p>Multipart form submission takes away a lot of the ambiguity
that percent-encoding had: the server now can explicitly ask for
certain encodings, and the client can explicitly tell the server
during the form submission what encoding the fields are in.</p>
@@ -621,9 +632,9 @@ Each method has deficiencies, especially the former.</p>
<p>If you tell the browser to send the form in the same encoding as
the page, you still have the trouble of what to do with characters
that are outside of the character encoding's range. The behavior, once
again, varies: Firefox 2.0 entity-izes them while Internet Explorer
7.0 mangles them beyond intelligibility. For serious internationalization purposes,
this is not an option.</p>
again, varies: Firefox 2.0 converts them to character entity references
while Internet Explorer 7.0 mangles them beyond intelligibility. For
serious internationalization purposes, this is not an option.</p>
<p>The other possibility is to set Accept-Encoding to UTF-8, which
begs the question: Why aren't you using UTF-8 for everything then?
@@ -663,12 +674,12 @@ it up to the module iconv to do the dirty work.</p>
<p>This approach, however, is not perfect. iconv is blithely unaware
of HTML character entities. HTML Purifier, in order to
protect against sophisticated escaping schemes, normalizes all character
and numeric entities before processing the text. This leads to
and numeric entitie references before processing the text. This leads to
one important ramification:</p>
<p><strong>Any character that is not supported by the target character
set, regardless of whether or not it is in the form of a character
entity or a raw character, will be silently ignored.</strong></p>
entity reference or a raw character, will be silently ignored.</strong></p>
<p>Example of this principle at work: say you have <code>&amp;theta;</code>
in your HTML, but the output is in Latin-1 (which, understandably,
@@ -677,7 +688,7 @@ set the encoding correctly using %Core.Encoding):</p>
<ul>
<li>The <code>Encoder</code> will transform the text from ISO 8859-1 to UTF-8
(note that theta is preserved since it doesn't actually use
(note that theta is preserved here since it doesn't actually use
any non-ASCII characters): <code>&amp;theta;</code></li>
<li>The <code>EntityParser</code> will transform all named and numeric
character entities to their corresponding raw UTF-8 equivalents:
@@ -700,7 +711,7 @@ Purifier has provided a slightly more palatable workaround using
<li>The <code>EntityParser</code> transforms entities: <code>&theta;</code></li>
<li>HTML Purifier processes the code: <code>&theta;</code></li>
<li>The <code>Encoder</code> replaces all non-ASCII characters
with numeric entities: <code>&amp;#952;</code></li>
with numeric entity reference: <code>&amp;#952;</code></li>
<li>For good measure, <code>Encoder</code> transforms encoding back to
original (which is strictly unnecessary for 99% of encodings
out there): <code>&amp;#952;</code> (remember, it's all ASCII!)</li>
@@ -710,19 +721,19 @@ Purifier has provided a slightly more palatable workaround using
the land of Unicode characters, and is totally unacceptable for Chinese
or Japanese texts. The even bigger kicker is that, supposing the
input encoding was actually ISO-8859-7, which <em>does</em> support
theta, the character would get entity-ized anyway! (The Encoder does
not discriminate).</p>
theta, the character would get converted into a character entity reference
anyway! (The Encoder does not discriminate).</p>
<p>The current functionality is about where HTML Purifier will be for
the rest of eternity. HTML Purifier could attempt to preserve the original
form of the entities so that they could be substituted back in, only the
form of the character references so that they could be substituted back in, only the
DOM extension kills them off irreversibly. HTML Purifier could also attempt
to be smart and only convert non-ASCII characters that weren't supported
by the target encoding, but that would require reimplementing iconv
with HTML awareness, something I will not do.</p>
<p>So there: either it's UTF-8 or crippled international support. Your pick! (and I'm
not being sarcastic here: some people could care less about other languages)</p>
not being sarcastic here: some people could care less about other languages).</p>
<h2 id="migrate">Migrate to UTF-8</h2>
@@ -984,7 +995,7 @@ and yes, it is variable width. Other traits:</p>
in different ways. It is beyond the scope of this document to explain
what precisely these implications are. PHPWact provides
a very good <a href="http://www.phpwact.org/php/i18n/utf-8">reference document</a>
on what to expect from each functions, although coverage is spotty in
on what to expect from each function, although coverage is spotty in
some areas. Their more general notes on
<a href="http://www.phpwact.org/php/i18n/charsets">character sets</a>
are also worth looking at for information on UTF-8. Some rules of thumb
@@ -998,7 +1009,7 @@ when dealing with Unicode text:</p>
<li>Think twice before using functions that:<ul>
<li>...count characters (strlen will return bytes, not characters;
str_split and word_wrap may corrupt)</li>
<li>...entity-ize things (UTF-8 doesn't need entities)</li>
<li>...convert characters to entity references (UTF-8 doesn't need entities)</li>
<li>...do very complex string processing (*printf)</li>
</ul></li>
</ul>

View File

@@ -40,6 +40,9 @@ information for casual developers using HTML Purifier.</p>
<dt><a href="enduser-customize.html">Customize</a></dt>
<dd>Tutorial for customizing HTML Purifier's tag and attribute sets.</dd>
<dt><a href="enduser-uri-filter.html">URI Filters</a></dt>
<dd>Tutorial for creating custom URI filters.</dd>
</dl>
<h2>Development</h2>

View File

@@ -32,7 +32,7 @@ Here are some fuzzy levels you could set:
One final note: when you start axing tags that are more commonly used, you
run the risk of accidentally destroying user data, especially if the data
is incoming from a WYSIWYG eidtor that hasn't been synced accordingly. This may
is incoming from a WYSIWYG editor that hasn't been synced accordingly. This may
make forbidden element to text transformations desirable (for example, images).

View File

@@ -2,7 +2,8 @@
Configuration Ideas
Here are some theoretical configuration ideas that we could implement some
time. Note the naming convention: %Namespace.Directive
time. Note the naming convention: %Namespace.Directive. If you want one
implemented, give us a ring, and we'll move it up the priority chain.
%Attr.RewriteFragments - if there's %Attr.IDPrefix we may want to transparently
rewrite the URLs we parse too. However, we can only do it when it's a pure
@@ -22,8 +23,6 @@ time. Note the naming convention: %Namespace.Directive
%URI.AddRelNofollow - will add rel="nofollow" to all links, preventing the
spread of ill-gotten pagerank
%URI.RelativeToAbsolute - transforms all relative URIs to absolute form
%URI.HostBlacklistRegex - regexes that if matching the host are disallowed
%URI.HostWhitelist - domain names that are excluded from the host blacklist
%URI.HostPolicy - determines whether or not its reject all and then whitelist

28
docs/ref-css-length.txt Normal file
View File

@@ -0,0 +1,28 @@
CSS Length Reference
To bound, or not to bound, that is the question
It's quite a reasonable request, really, and it's already been implemented
for HTML. That is, length bounding. It makes little sense to let users
define text blocks that have a font-size of 63,360 inches (that's a mile,
by the way) or a width of forty-fold the parent container.
But it's a little more complicated then that. There are multiple units
one can use, and we have to a little unit conversion to get things working.
Here's what we have:
Absolute:
1 in ~= 2.54 cm
1 cm = 10 mm
1 pt = 1/72 in
1 pc = 12 pt
Relative:
1 em ~= 10.0667 px
1 ex ~= 0.5 em, though Mozilla Firefox says 1 ex = 6px
1 px ~= 1 pt
Watch out: font-sizes can also be nested to get successively larger
(although I do not relish having to keep track of context font-sizes,
this may be necessary, especially for some of the more advanced features
for preventing things like white on white).

View File

@@ -33,6 +33,9 @@ blockquote .label {font-weight:bold; font-size:1em; margin:0 0 .1em;
.table thead th:first-child {-moz-border-radius-topleft:1em;}
.table tbody td {border-bottom:1px solid #CCC; padding-right:0.6em;padding-left:0.6em;}
/* A quick table*/
table.quick tbody th {text-align:right; padding-right:1em;}
/* Category of the file */
#filing {font-weight:bold; font-size:smaller; }

View File

@@ -22,8 +22,8 @@
*/
/*
HTML Purifier 2.0.1 - Standards Compliant HTML Filtering
Copyright (C) 2006 Edward Z. Yang
HTML Purifier 2.1.5 - Standards Compliant HTML Filtering
Copyright (C) 2006-2007 Edward Z. Yang
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
@@ -40,9 +40,11 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
// almost every class has an undocumented dependency to these, so make sure
// they get included
require_once 'HTMLPurifier/ConfigSchema.php'; // important
// constants are slow, but we'll make one exception
define('HTMLPURIFIER_PREFIX', dirname(__FILE__));
// every class has an undocumented dependency to these, must be included!
require_once 'HTMLPurifier/ConfigSchema.php'; // fatal errors if not included
require_once 'HTMLPurifier/Config.php';
require_once 'HTMLPurifier/Context.php';
@@ -57,16 +59,23 @@ require_once 'HTMLPurifier/LanguageFactory.php';
HTMLPurifier_ConfigSchema::define(
'Core', 'CollectErrors', false, 'bool', '
Whether or not to collect errors found while filtering the document. This
is a useful way to give feedback to your users. CURRENTLY NOT IMPLEMENTED.
This directive has been available since 2.0.0.
is a useful way to give feedback to your users. <strong>Warning:</strong>
Currently this feature is very patchy and experimental, with lots of
possible error messages not yet implemented. It will not cause any problems,
but it may not help your users either. This directive has been available
since 2.0.0.
');
/**
* Main library execution class.
* Facade that coordinates HTML Purifier's subsystems in order to purify HTML.
*
* Facade that performs calls to the HTMLPurifier_Lexer,
* HTMLPurifier_Strategy and HTMLPurifier_Generator subsystems in order to
* purify HTML.
* @note There are several points in which configuration can be specified
* for HTML Purifier. The precedence of these (from lowest to
* highest) is as follows:
* -# Instance: new HTMLPurifier($config)
* -# Invocation: purify($html, $config)
* These configurations are entirely independent of each other and
* are *not* merged.
*
* @todo We need an easier way to inject strategies, it'll probably end
* up getting done through config though.
@@ -74,15 +83,16 @@ This directive has been available since 2.0.0.
class HTMLPurifier
{
var $version = '2.0.1';
var $version = '2.1.5';
var $config;
var $filters;
var $filters = array();
var $strategy, $generator;
/**
* Final HTMLPurifier_Context of last run purification. Might be an array.
* Resultant HTMLPurifier_Context of last run purification. Is an array
* of contexts if the last called method was purifyArray().
* @public
*/
var $context;
@@ -147,6 +157,11 @@ class HTMLPurifier
$context->register('ErrorCollector', $error_collector);
}
// setup id_accumulator context, necessary due to the fact that
// AttrValidator can be called from many places
$id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
$context->register('IDAccumulator', $id_accumulator);
$html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);
for ($i = 0, $size = count($this->filters); $i < $size; $i++) {
@@ -195,14 +210,16 @@ class HTMLPurifier
/**
* Singleton for enforcing just one HTML Purifier in your system
* @param $prototype Optional prototype HTMLPurifier instance to
* overload singleton with.
*/
function &getInstance($prototype = null) {
function &instance($prototype = null) {
static $htmlpurifier;
if (!$htmlpurifier || $prototype) {
if (is_a($prototype, 'HTMLPurifier')) {
$htmlpurifier = $prototype;
} elseif ($prototype) {
$htmlpurifier = new HTMLPurifier(HTMLPurifier_Config::create($prototype));
$htmlpurifier = new HTMLPurifier($prototype);
} else {
$htmlpurifier = new HTMLPurifier();
}
@@ -210,6 +227,9 @@ class HTMLPurifier
return $htmlpurifier;
}
function &getInstance($prototype = null) {
return HTMLPurifier::instance($prototype);
}
}

View File

@@ -54,18 +54,15 @@ class HTMLPurifier_AttrDef
*
* @warning This processing is inconsistent with XML's whitespace handling
* as specified by section 3.3.3 and referenced XHTML 1.0 section
* 4.7. Compliant processing requires all line breaks normalized
* to "\n", so the fix is not as simple as fixing it in this
* function. Trim and whitespace collapsing are supposed to only
* occur in NMTOKENs. However, note that we are NOT necessarily
* parsing XML, thus, this behavior may still be correct.
* 4.7. However, note that we are NOT necessarily
* parsing XML, thus, this behavior may still be correct. We
* assume that newlines have been normalized.
*
* @public
*/
function parseCDATA($string) {
$string = trim($string);
$string = str_replace("\n", '', $string);
$string = str_replace(array("\r", "\t"), ' ', $string);
$string = str_replace(array("\n", "\t", "\r"), ' ', $string);
return $string;
}
@@ -82,5 +79,13 @@ class HTMLPurifier_AttrDef
return $this;
}
/**
* Removes spaces from rgb(0, 0, 0) so that shorthand CSS properties work
* properly. THIS IS A HACK!
*/
function mungeRgb($string) {
return preg_replace('/rgb\((\d+)\s*,\s*(\d+)\s*,\s*(\d+)\)/', 'rgb(\1,\2,\3)', $string);
}
}

View File

@@ -38,7 +38,20 @@ class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
list($property, $value) = explode(':', $declaration, 2);
$property = trim($property);
$value = trim($value);
if (!isset($definition->info[$property])) continue;
$ok = false;
do {
if (isset($definition->info[$property])) {
$ok = true;
break;
}
if (ctype_lower($property)) break;
$property = strtolower($property);
if (isset($definition->info[$property])) {
$ok = true;
break;
}
} while(0);
if (!$ok) continue;
// inefficient call, since the validator will do this again
if (strtolower(trim($value)) !== 'inherit') {
// inherit works for everything (but only on the base property)

View File

@@ -31,6 +31,9 @@ class HTMLPurifier_AttrDef_CSS_Background extends HTMLPurifier_AttrDef
$string = $this->parseCDATA($string);
if ($string === '') return false;
// munge rgb() decl if necessary
$string = $this->mungeRgb($string);
// assumes URI doesn't have spaces in it
$bits = explode(' ', strtolower($string)); // bits to process

View File

@@ -22,7 +22,7 @@ class HTMLPurifier_AttrDef_CSS_Border extends HTMLPurifier_AttrDef
function validate($string, $config, &$context) {
$string = $this->parseCDATA($string);
// we specifically will not support rgb() syntax with spaces
$string = $this->mungeRgb($string);
$bits = explode(' ', $string);
$done = array(); // segments we've finished
$ret = ''; // return value

View File

@@ -39,20 +39,13 @@ class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef
if ($colors === null) $colors = $config->get('Core', 'ColorKeywords');
$color = trim($color);
if (!$color) return false;
if ($color === '') return false;
$lower = strtolower($color);
if (isset($colors[$lower])) return $colors[$lower];
if ($color[0] === '#') {
// hexadecimal handling
$hex = substr($color, 1);
$length = strlen($hex);
if ($length !== 3 && $length !== 6) return false;
if (!ctype_xdigit($hex)) return false;
} else {
if (strpos($color, 'rgb(') !== false) {
// rgb literal handling
if (strpos($color, 'rgb(')) return false;
$length = strlen($color);
if (strpos($color, ')') !== $length - 1) return false;
$triad = substr($color, 4, $length - 4 - 1);
@@ -90,6 +83,17 @@ class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef
}
$new_triad = implode(',', $new_parts);
$color = "rgb($new_triad)";
} else {
// hexadecimal handling
if ($color[0] === '#') {
$hex = substr($color, 1);
} else {
$hex = $color;
$color = '#' . $color;
}
$length = strlen($hex);
if ($length !== 3 && $length !== 6) return false;
if (!ctype_xdigit($hex)) return false;
}
return $color;

View File

@@ -0,0 +1,26 @@
<?php
/**
* Decorator which enables CSS properties to be disabled for specific elements.
*/
class HTMLPurifier_AttrDef_CSS_DenyElementDecorator extends HTMLPurifier_AttrDef
{
var $def, $element;
/**
* @param $def Definition to wrap
* @param $element Element to deny
*/
function HTMLPurifier_AttrDef_CSS_DenyElementDecorator(&$def, $element) {
$this->def =& $def;
$this->element = $element;
}
/**
* Checks if CurrentToken is set and equal to $this->element
*/
function validate($string, $config, $context) {
$token = $context->get('CurrentToken', true);
if ($token && $token->name == $this->element) return false;
return $this->def->validate($string, $config, $context);
}
}

View File

@@ -19,7 +19,6 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
'cursive' => true
);
$string = $this->parseCDATA($string);
// assume that no font names contain commas in them
$fonts = explode(',', $string);
$final = '';
@@ -38,19 +37,51 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
$quote = $font[0];
if ($font[$length - 1] !== $quote) continue;
$font = substr($font, 1, $length - 2);
$new_font = '';
for ($i = 0, $c = strlen($font); $i < $c; $i++) {
if ($font[$i] === '\\') {
$i++;
if ($i >= $c) {
$new_font .= '\\';
break;
}
if (ctype_xdigit($font[$i])) {
$code = $font[$i];
for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
if (!ctype_xdigit($font[$i])) break;
$code .= $font[$i];
}
// We have to be extremely careful when adding
// new characters, to make sure we're not breaking
// the encoding.
$char = HTMLPurifier_Encoder::unichr(hexdec($code));
if (HTMLPurifier_Encoder::cleanUTF8($char) === '') continue;
$new_font .= $char;
if ($i < $c && trim($font[$i]) !== '') $i--;
continue;
}
if ($font[$i] === "\n") continue;
}
$new_font .= $font[$i];
}
$font = $new_font;
}
// process font
if (ctype_alnum($font)) {
// $font is a pure representation of the font name
if (ctype_alnum($font) && $font !== '') {
// very simple font, allow it in unharmed
$final .= $font . ', ';
continue;
}
$nospace = str_replace(array(' ', '.', '!'), '', $font);
if (ctype_alnum($nospace)) {
// font with spaces in it
$final .= "'$font', ";
continue;
}
// complicated font, requires quoting
// armor single quotes and new lines
$font = str_replace("\\", "\\\\", $font);
$font = str_replace("'", "\\'", $font);
$final .= "'$font', ";
}
$final = rtrim($final, ', ');
if ($final === '') return false;

View File

@@ -1,7 +1,7 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/AttrDef/CSS/Number.php';
require_once 'HTMLPurifier/Length.php';
require_once 'HTMLPurifier/UnitConverter.php';
/**
* Represents a Length as defined by CSS.
@@ -9,46 +9,40 @@ require_once 'HTMLPurifier/AttrDef/CSS/Number.php';
class HTMLPurifier_AttrDef_CSS_Length extends HTMLPurifier_AttrDef
{
/**
* Valid unit lookup table.
* @warning The code assumes all units are two characters long. Be careful
* if we have to change this behavior!
*/
var $units = array('em' => true, 'ex' => true, 'px' => true, 'in' => true,
'cm' => true, 'mm' => true, 'pt' => true, 'pc' => true);
/**
* Instance of HTMLPurifier_AttrDef_Number to defer number validation to
*/
var $number_def;
var $min, $max;
/**
* @param $non_negative Bool indication whether or not negative values are
* allowed.
* @param HTMLPurifier_Length $max Minimum length, or null for no bound. String is also acceptable.
* @param HTMLPurifier_Length $max Maximum length, or null for no bound. String is also acceptable.
*/
function HTMLPurifier_AttrDef_CSS_Length($non_negative = false) {
$this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative);
function HTMLPurifier_AttrDef_CSS_Length($min = null, $max = null) {
$this->min = $min !== null ? HTMLPurifier_Length::make($min) : null;
$this->max = $max !== null ? HTMLPurifier_Length::make($max) : null;
}
function validate($length, $config, &$context) {
function validate($string, $config, $context) {
$string = $this->parseCDATA($string);
$length = $this->parseCDATA($length);
if ($length === '') return false;
if ($length === '0') return '0';
$strlen = strlen($length);
if ($strlen === 1) return false; // impossible!
// Optimizations
if ($string === '') return false;
if ($string === '0') return '0';
if (strlen($string) === 1) return false;
// we assume all units are two characters
$unit = substr($length, $strlen - 2);
if (!ctype_lower($unit)) $unit = strtolower($unit);
$number = substr($length, 0, $strlen - 2);
$length = HTMLPurifier_Length::make($string);
if (!$length->isValid()) return false;
if (!isset($this->units[$unit])) return false;
$number = $this->number_def->validate($number, $config, $context);
if ($number === false) return false;
return $number . $unit;
if ($this->min) {
$c = $length->compareTo($this->min);
if ($c === false) return false;
if ($c < 0) return false;
}
if ($this->max) {
$c = $length->compareTo($this->max);
if ($c === false) return false;
if ($c > 0) return false;
}
return $length->toString();
}
}

View File

@@ -18,6 +18,11 @@ class HTMLPurifier_AttrDef_CSS_Number extends HTMLPurifier_AttrDef
$this->non_negative = $non_negative;
}
/**
* @warning Some contexts do not pass $config, $context. These
* variables should not be used without checking HTMLPurifier_Length.
* This might not work properly in PHP4.
*/
function validate($number, $config, &$context) {
$number = $this->parseCDATA($number);

View File

@@ -15,10 +15,13 @@ class HTMLPurifier_AttrDef_CSS_TextDecoration extends HTMLPurifier_AttrDef
static $allowed_values = array(
'line-through' => true,
'overline' => true,
'underline' => true
'underline' => true,
);
$string = strtolower($this->parseCDATA($string));
if ($string === 'none') return $string;
$parts = explode(' ', $string);
$final = '';
foreach ($parts as $part) {

View File

@@ -15,7 +15,7 @@ class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI
{
function HTMLPurifier_AttrDef_CSS_URI() {
$this->HTMLPurifier_AttrDef_URI(true); // always embedded
parent::HTMLPurifier_AttrDef_URI(true); // always embedded
}
function validate($uri_string, $config, &$context) {

View File

@@ -8,6 +8,12 @@ require_once 'HTMLPurifier/AttrDef.php';
class HTMLPurifier_AttrDef_HTML_Pixels extends HTMLPurifier_AttrDef
{
var $max;
function HTMLPurifier_AttrDef_HTML_Pixels($max = null) {
$this->max = $max;
}
function validate($string, $config, &$context) {
$string = trim($string);
@@ -26,11 +32,18 @@ class HTMLPurifier_AttrDef_HTML_Pixels extends HTMLPurifier_AttrDef
// crash operating systems, see <http://ha.ckers.org/imagecrash.html>
// WARNING, above link WILL crash you if you're using Windows
if ($int > 1200) return '1200';
if ($this->max !== null && $int > $this->max) return (string) $this->max;
return (string) $int;
}
function make($string) {
if ($string === '') $max = null;
else $max = (int) $string;
$class = get_class($this);
return new $class($max);
}
}

View File

@@ -0,0 +1,32 @@
<?php
/**
* Decorator that, depending on a token, switches between two definitions.
*/
class HTMLPurifier_AttrDef_Switch
{
var $tag;
var $withTag, $withoutTag;
/**
* @param string $tag Tag name to switch upon
* @param HTMLPurifier_AttrDef $with_tag Call if token matches tag
* @param HTMLPurifier_AttrDef $without_tag Call if token doesn't match, or there is no token
*/
function HTMLPurifier_AttrDef_Switch($tag, $with_tag, $without_tag) {
$this->tag = $tag;
$this->withTag = $with_tag;
$this->withoutTag = $without_tag;
}
function validate($string, $config, $context) {
$token = $context->get('CurrentToken', true);
if (!$token || $token->name !== $this->tag) {
return $this->withoutTag->validate($string, $config, $context);
} else {
return $this->withTag->validate($string, $config, $context);
}
}
}

View File

@@ -1,90 +1,66 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/URIParser.php';
require_once 'HTMLPurifier/URIScheme.php';
require_once 'HTMLPurifier/URISchemeRegistry.php';
require_once 'HTMLPurifier/AttrDef/URI/Host.php';
require_once 'HTMLPurifier/PercentEncoder.php';
require_once 'HTMLPurifier/AttrDef/URI/Email.php';
// special case filtering directives
HTMLPurifier_ConfigSchema::define(
'URI', 'DefaultScheme', 'http', 'string',
'Defines through what scheme the output will be served, in order to '.
'select the proper object validator when no scheme information is present.'
);
'URI', 'Munge', null, 'string/null', '
<p>
Munges all browsable (usually http, https and ftp)
absolute URI\'s into another URI, usually a URI redirection service.
This directive accepts a URI, formatted with a <code>%s</code> where
the url-encoded original URI should be inserted (sample:
<code>http://www.google.com/url?q=%s</code>).
</p>
<p>
Uses for this directive:
</p>
<ul>
<li>
Prevent PageRank leaks, while being fairly transparent
to users (you may also want to add some client side JavaScript to
override the text in the statusbar). <strong>Notice</strong>:
Many security experts believe that this form of protection does not deter spam-bots.
</li>
<li>
Redirect users to a splash page telling them they are leaving your
website. While this is poor usability practice, it is often mandated
in corporate environments.
</li>
</ul>
<p>
This directive has been available since 1.3.0.
</p>
');
// disabling directives
HTMLPurifier_ConfigSchema::define(
'URI', 'Host', null, 'string/null',
'Defines the domain name of the server, so we can determine whether or '.
'an absolute URI is from your website or not. Not strictly necessary, '.
'as users should be using relative URIs to reference resources on your '.
'website. It will, however, let you use absolute URIs to link to '.
'subdomains of the domain you post here: i.e. example.com will allow '.
'sub.example.com. However, higher up domains will still be excluded: '.
'if you set %URI.Host to sub.example.com, example.com will be blocked. '.
'This directive has been available since 1.2.0.'
);
HTMLPurifier_ConfigSchema::define(
'URI', 'DisableExternal', false, 'bool',
'Disables links to external websites. This is a highly effective '.
'anti-spam and anti-pagerank-leech measure, but comes at a hefty price: no'.
'links or images outside of your domain will be allowed. Non-linkified '.
'URIs will still be preserved. If you want to be able to link to '.
'subdomains or use absolute URIs, specify %URI.Host for your website. '.
'This directive has been available since 1.2.0.'
);
HTMLPurifier_ConfigSchema::define(
'URI', 'DisableExternalResources', false, 'bool',
'Disables the embedding of external resources, preventing users from '.
'embedding things like images from other hosts. This prevents '.
'access tracking (good for email viewers), bandwidth leeching, '.
'cross-site request forging, goatse.cx posting, and '.
'other nasties, but also results in '.
'a loss of end-user functionality (they can\'t directly post a pic '.
'they posted from Flickr anymore). Use it if you don\'t have a '.
'robust user-content moderation team. This directive has been '.
'available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'URI', 'DisableResources', false, 'bool',
'Disables embedding resources, essentially meaning no pictures. You can '.
'still link to them though. See %URI.DisableExternalResources for why '.
'this might be a good idea. This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'URI', 'Munge', null, 'string/null',
'Munges all browsable (usually http, https and ftp) URI\'s into some URL '.
'redirection service. Pass this directive a URI, with %s inserted where '.
'the url-encoded original URI should be inserted (sample: '.
'<code>http://www.google.com/url?q=%s</code>). '.
'This prevents PageRank leaks, while being as transparent as possible '.
'to users (you may also want to add some client side JavaScript to '.
'override the text in the statusbar). Warning: many security experts '.
'believe that this form of protection does not deter spam-bots. '.
'You can also use this directive to redirect users to a splash page '.
'telling them they are leaving your website. '.
'This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'URI', 'HostBlacklist', array(), 'list',
'List of strings that are forbidden in the host of any URI. Use it to '.
'kill domain names of spam, etc. Note that it will catch anything in '.
'the domain, so <tt>moo.com</tt> will catch <tt>moo.com.example.com</tt>. '.
'This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'URI', 'Disable', false, 'bool',
'Disables all URIs in all forms. Not sure why you\'d want to do that '.
'(after all, the Internet\'s founded on the notion of a hyperlink). '.
'This directive has been available since 1.3.0.'
);
'URI', 'Disable', false, 'bool', '
<p>
Disables all URIs in all forms. Not sure why you\'d want to do that
(after all, the Internet\'s founded on the notion of a hyperlink).
This directive has been available since 1.3.0.
</p>
');
HTMLPurifier_ConfigSchema::defineAlias('Attr', 'DisableURI', 'URI', 'Disable');
HTMLPurifier_ConfigSchema::define(
'URI', 'DisableResources', false, 'bool', '
<p>
Disables embedding resources, essentially meaning no pictures. You can
still link to them though. See %URI.DisableExternalResources for why
this might be a good idea. This directive has been available since 1.3.0.
</p>
');
/**
* Validates a URI as defined by RFC 3986.
* @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme
@@ -92,214 +68,73 @@ HTMLPurifier_ConfigSchema::defineAlias('Attr', 'DisableURI', 'URI', 'Disable');
class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
{
var $host;
var $embeds_resource;
var $parser;
var $embedsResource;
/**
* @param $embeds_resource_resource Does the URI here result in an extra HTTP request?
*/
function HTMLPurifier_AttrDef_URI($embeds_resource = false) {
$this->host = new HTMLPurifier_AttrDef_URI_Host();
$this->embeds_resource = (bool) $embeds_resource;
$this->parser = new HTMLPurifier_URIParser();
$this->embedsResource = (bool) $embeds_resource;
}
function validate($uri, $config, &$context) {
static $PercentEncoder = null;
if ($PercentEncoder === null) $PercentEncoder = new HTMLPurifier_PercentEncoder();
// We'll write stack-based parsers later, for now, use regexps to
// get things working as fast as possible (irony)
if ($config->get('URI', 'Disable')) return false;
// parse as CDATA
$uri = $this->parseCDATA($uri);
// fix up percent-encoding
$uri = $PercentEncoder->normalize($uri);
// parse the URI
$uri = $this->parser->parse($uri);
if ($uri === false) return false;
// while it would be nice to use parse_url(), that's specifically
// for HTTP and thus won't work for our generic URI parsing
// add embedded flag to context for validators
$context->register('EmbeddedURI', $this->embedsResource);
// according to the RFC... (but this cuts corners, i.e. non-validating)
$r_URI = '!'.
'(([^:/?#<>\'"]+):)?'. // 2. Scheme
'(//([^/?#<>\'"]*))?'. // 4. Authority
'([^?#<>\'"]*)'. // 5. Path
'(\?([^#<>\'"]*))?'. // 7. Query
'(#([^<>\'"]*))?'. // 8. Fragment
'!';
$matches = array();
$result = preg_match($r_URI, $uri, $matches);
if (!$result) return false; // invalid URI
// seperate out parts
$scheme = !empty($matches[1]) ? $matches[2] : null;
$authority = !empty($matches[3]) ? $matches[4] : null;
$path = $matches[5]; // always present, can be empty
$query = !empty($matches[6]) ? $matches[7] : null;
$fragment = !empty($matches[8]) ? $matches[9] : null;
$registry =& HTMLPurifier_URISchemeRegistry::instance();
if ($scheme !== null) {
// no need to validate the scheme's fmt since we do that when we
// retrieve the specific scheme object from the registry
$scheme = ctype_lower($scheme) ? $scheme : strtolower($scheme);
$scheme_obj = $registry->getScheme($scheme, $config, $context);
if (!$scheme_obj) return false; // invalid scheme, clean it out
} else {
$scheme_obj = $registry->getScheme(
$config->get('URI', 'DefaultScheme'), $config, $context
);
}
// something funky weird happened in the registry, abort!
if (!$scheme_obj) {
trigger_error(
'Default scheme object "' . $config->get('URI', 'DefaultScheme') . '" was not readable',
E_USER_WARNING
);
return false;
}
// the URI we're processing embeds_resource a resource in the page, but the URI
// it references cannot be located
if ($this->embeds_resource && !$scheme_obj->browsable) {
return false;
}
if ($authority !== null) {
$ok = false;
do {
// remove URI if it's absolute and we disabled externals or
// if it's absolute and embedded and we disabled external resources
unset($our_host);
if (
$config->get('URI', 'DisableExternal') ||
(
$config->get('URI', 'DisableExternalResources') &&
$this->embeds_resource
)
) {
$our_host = $config->get('URI', 'Host');
if ($our_host === null) return false;
}
// generic validation
$result = $uri->validate($config, $context);
if (!$result) break;
$HEXDIG = '[A-Fa-f0-9]';
$unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with []
$sub_delims = '!$&\'()'; // needs []
$pct_encoded = "%$HEXDIG$HEXDIG";
$r_userinfo = "(?:[$unreserved$sub_delims:]|$pct_encoded)*";
$r_authority = "/^(($r_userinfo)@)?(\[[^\]]+\]|[^:]*)(:(\d*))?/";
$matches = array();
preg_match($r_authority, $authority, $matches);
// overloads regexp!
$userinfo = !empty($matches[1]) ? $matches[2] : null;
$host = !empty($matches[3]) ? $matches[3] : null;
$port = !empty($matches[4]) ? $matches[5] : null;
// chained filtering
$uri_def =& $config->getDefinition('URI');
$result = $uri_def->filter($uri, $config, $context);
if (!$result) break;
// validate port
if ($port !== null) {
$port = (int) $port;
if ($port < 1 || $port > 65535) $port = null;
}
// scheme-specific validation
$scheme_obj = $uri->getSchemeObj($config, $context);
if (!$scheme_obj) break;
if ($this->embedsResource && !$scheme_obj->browsable) break;
$result = $scheme_obj->validate($uri, $config, $context);
if (!$result) break;
$host = $this->host->validate($host, $config, $context);
if ($host === false) $host = null;
// survived gauntlet
$ok = true;
if ($this->checkBlacklist($host, $config, $context)) return false;
// more lenient absolute checking
if (isset($our_host)) {
$host_parts = array_reverse(explode('.', $host));
// could be cached
$our_host_parts = array_reverse(explode('.', $our_host));
foreach ($our_host_parts as $i => $discard) {
if (!isset($host_parts[$i])) return false;
if ($host_parts[$i] != $our_host_parts[$i]) return false;
}
}
// userinfo and host are validated within the regexp
} else {
$port = $host = $userinfo = null;
}
} while (false);
$context->destroy('EmbeddedURI');
if (!$ok) return false;
// query and fragment are quite simple in terms of definition:
// *( pchar / "/" / "?" ), so define their validation routines
// when we start fixing percent encoding
// back to string
$result = $uri->toString();
// path gets to be validated against a hodge-podge of rules depending
// on the status of authority and scheme, but it's not that important,
// esp. since it won't be applicable to everyone
// okay, now we defer execution to the subobject for more processing
// note that $fragment is omitted
list($userinfo, $host, $port, $path, $query) =
$scheme_obj->validateComponents(
$userinfo, $host, $port, $path, $query, $config, $context
);
// reconstruct authority
$authority = null;
if (!is_null($userinfo) || !is_null($host) || !is_null($port)) {
$authority = '';
if($userinfo !== null) $authority .= $userinfo . '@';
$authority .= $host;
if($port !== null) $authority .= ':' . $port;
}
// reconstruct the result
$result = '';
if ($scheme !== null) $result .= "$scheme:";
if ($authority !== null) $result .= "//$authority";
$result .= $path;
if ($query !== null) $result .= "?$query";
if ($fragment !== null) $result .= "#$fragment";
// munge if necessary
$munge = $config->get('URI', 'Munge');
if (!empty($scheme_obj->browsable) && $munge !== null) {
if ($authority !== null) {
$result = str_replace('%s', rawurlencode($result), $munge);
}
// munge entire URI if necessary
if (
!is_null($uri->host) && // indicator for authority
!empty($scheme_obj->browsable) &&
!is_null($munge = $config->get('URI', 'Munge'))
) {
$result = str_replace('%s', rawurlencode($result), $munge);
}
return $result;
}
/**
* Checks a host against an array blacklist
* @param $host Host to check
* @param $config HTMLPurifier_Config instance
* @param $context HTMLPurifier_Context instance
* @return bool Is spam?
*/
function checkBlacklist($host, &$config, &$context) {
$blacklist = $config->get('URI', 'HostBlacklist');
if (!empty($blacklist)) {
foreach($blacklist as $blacklisted_host_fragment) {
if (strpos($host, $blacklisted_host_fragment) !== false) {
return true;
}
}
}
return false;
}
}

View File

@@ -14,3 +14,5 @@ class HTMLPurifier_AttrDef_URI_Email extends HTMLPurifier_AttrDef
}
// sub-implementations
require_once 'HTMLPurifier/AttrDef/URI/Email/SimpleCheck.php';

View File

@@ -40,11 +40,23 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
$ipv4 = $this->ipv4->validate($string, $config, $context);
if ($ipv4 !== false) return $ipv4;
// validate a domain name here, do filtering, etc etc etc
// A regular domain name.
// We could use this, but it would break I18N domain names
//$match = preg_match('/^[a-z0-9][\w\-\.]*[a-z0-9]$/i', $string);
//if (!$match) return false;
// This breaks I18N domain names, but we don't have proper IRI support,
// so force users to insert Punycode. If there's complaining we'll
// try to fix things into an international friendly form.
// The productions describing this are:
$a = '[a-z]'; // alpha
$an = '[a-z0-9]'; // alphanum
$and = '[a-z0-9-]'; // alphanum | "-"
// domainlabel = alphanum | alphanum *( alphanum | "-" ) alphanum
$domainlabel = "$an($and*$an)?";
// toplabel = alpha | alpha *( alphanum | "-" ) alphanum
$toplabel = "$a($and*$an)?";
// hostname = *( domainlabel "." ) toplabel [ "." ]
$match = preg_match("/^($domainlabel\.)*$toplabel\.?$/i", $string);
if (!$match) return false;
return $string;
}

View File

@@ -44,6 +44,9 @@ class HTMLPurifier_AttrTypes
$this->info['LanguageCode'] = new HTMLPurifier_AttrDef_Lang();
$this->info['Color'] = new HTMLPurifier_AttrDef_HTML_Color();
// unimplemented aliases
$this->info['ContentType'] = new HTMLPurifier_AttrDef_Text();
// number is really a positive integer (one or more digits)
// FIXME: ^^ not always, see start and value of list items
$this->info['Number'] = new HTMLPurifier_AttrDef_Integer(false, false, true);

View File

@@ -23,6 +23,13 @@ class HTMLPurifier_AttrValidator
$definition = $config->getHTMLDefinition();
$e =& $context->get('ErrorCollector', true);
// initialize IDAccumulator if necessary
$ok =& $context->get('IDAccumulator', true);
if (!$ok) {
$id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
$context->register('IDAccumulator', $id_accumulator);
}
// initialize CurrentToken if necessary
$current_token =& $context->get('CurrentToken', true);
if (!$current_token) $context->register('CurrentToken', $token);
@@ -33,8 +40,8 @@ class HTMLPurifier_AttrValidator
// DEFINITION CALL
$d_defs = $definition->info_global_attr;
// reference attributes for easy manipulation
$attr =& $token->attr;
// don't update token until the very end, to ensure an atomic update
$attr = $token->attr;
// do global transformations (pre)
// nothing currently utilizes this
@@ -129,6 +136,8 @@ class HTMLPurifier_AttrValidator
if ($e && ($attr != $o)) $e->send(E_NOTICE, 'AttrValidator: Attributes transformed', $o, $attr);
}
$token->attr = $attr;
// destroy CurrentToken if we made it ourselves
if (!$current_token) $context->destroy('CurrentToken');

View File

@@ -7,6 +7,7 @@ require_once 'HTMLPurifier/AttrDef/CSS/BackgroundPosition.php';
require_once 'HTMLPurifier/AttrDef/CSS/Border.php';
require_once 'HTMLPurifier/AttrDef/CSS/Color.php';
require_once 'HTMLPurifier/AttrDef/CSS/Composite.php';
require_once 'HTMLPurifier/AttrDef/CSS/DenyElementDecorator.php';
require_once 'HTMLPurifier/AttrDef/CSS/Font.php';
require_once 'HTMLPurifier/AttrDef/CSS/FontFamily.php';
require_once 'HTMLPurifier/AttrDef/CSS/Length.php';
@@ -16,6 +17,7 @@ require_once 'HTMLPurifier/AttrDef/CSS/Percentage.php';
require_once 'HTMLPurifier/AttrDef/CSS/TextDecoration.php';
require_once 'HTMLPurifier/AttrDef/CSS/URI.php';
require_once 'HTMLPurifier/AttrDef/Enum.php';
require_once 'HTMLPurifier/AttrDef/Switch.php';
HTMLPurifier_ConfigSchema::define(
'CSS', 'DefinitionRev', 1, 'int', '
@@ -26,6 +28,20 @@ HTMLPurifier_ConfigSchema::define(
</p>
');
HTMLPurifier_ConfigSchema::define(
'CSS', 'MaxImgLength', '1200px', 'string/null', '
<p>
This parameter sets the maximum allowed length on <code>img</code> tags,
effectively the <code>width</code> and <code>height</code> properties.
Only absolute units of measurement (in, pt, pc, mm, cm) and pixels (px) are allowed. This is
in place to prevent imagecrash attacks, disable with null at your own risk.
This directive is similar to %HTML.MaxImgLength, and both should be
concurrently edited, although there are
subtle differences in the input format (the CSS max is a number with
a unit).
</p>
');
/**
* Defines allowed CSS attributes and what their values are.
* @see HTMLPurifier_HTMLDefinition
@@ -116,7 +132,7 @@ class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
$this->info['border-left-width'] =
$this->info['border-right-width'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
new HTMLPurifier_AttrDef_Enum(array('thin', 'medium', 'thick')),
new HTMLPurifier_AttrDef_CSS_Length(true) //disallow negative
new HTMLPurifier_AttrDef_CSS_Length('0') //disallow negative
));
$this->info['border-width'] = new HTMLPurifier_AttrDef_CSS_Multiple($border_width);
@@ -142,7 +158,7 @@ class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
$this->info['line-height'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
new HTMLPurifier_AttrDef_Enum(array('normal')),
new HTMLPurifier_AttrDef_CSS_Number(true), // no negatives
new HTMLPurifier_AttrDef_CSS_Length(true),
new HTMLPurifier_AttrDef_CSS_Length('0'),
new HTMLPurifier_AttrDef_CSS_Percentage(true)
));
@@ -164,7 +180,7 @@ class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
$this->info['padding-bottom'] =
$this->info['padding-left'] =
$this->info['padding-right'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
new HTMLPurifier_AttrDef_CSS_Length(true),
new HTMLPurifier_AttrDef_CSS_Length('0'),
new HTMLPurifier_AttrDef_CSS_Percentage(true)
));
@@ -175,13 +191,25 @@ class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
new HTMLPurifier_AttrDef_CSS_Percentage()
));
$this->info['width'] =
$this->info['height'] =
new HTMLPurifier_AttrDef_CSS_Composite(array(
new HTMLPurifier_AttrDef_CSS_Length(true),
$trusted_wh = new HTMLPurifier_AttrDef_CSS_Composite(array(
new HTMLPurifier_AttrDef_CSS_Length('0'),
new HTMLPurifier_AttrDef_CSS_Percentage(true),
new HTMLPurifier_AttrDef_Enum(array('auto'))
));
$max = $config->get('CSS', 'MaxImgLength');
$this->info['width'] =
$this->info['height'] =
$max === null ?
$trusted_wh :
new HTMLPurifier_AttrDef_Switch('img',
// For img tags:
new HTMLPurifier_AttrDef_CSS_Composite(array(
new HTMLPurifier_AttrDef_CSS_Length('0', $max),
new HTMLPurifier_AttrDef_Enum(array('auto'))
)),
// For everyone else:
$trusted_wh
);
$this->info['text-decoration'] = new HTMLPurifier_AttrDef_CSS_TextDecoration();
@@ -204,7 +232,7 @@ class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
$this->info['border-right'] = new HTMLPurifier_AttrDef_CSS_Border($config);
$this->info['border-collapse'] = new HTMLPurifier_AttrDef_Enum(array(
'collapse', 'seperate'));
'collapse', 'separate'));
$this->info['caption-side'] = new HTMLPurifier_AttrDef_Enum(array(
'top', 'bottom'));
@@ -219,6 +247,8 @@ class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
new HTMLPurifier_AttrDef_CSS_Percentage()
));
$this->info['border-spacing'] = new HTMLPurifier_AttrDef_CSS_Multiple(new HTMLPurifier_AttrDef_CSS_Length(), 2);
// partial support
$this->info['white-space'] = new HTMLPurifier_AttrDef_Enum(array('nowrap'));

View File

@@ -15,7 +15,10 @@ class HTMLPurifier_ChildDef_Optional extends HTMLPurifier_ChildDef_Required
var $type = 'optional';
function validateChildren($tokens_of_children, $config, &$context) {
$result = parent::validateChildren($tokens_of_children, $config, $context);
if ($result === false) return array();
if ($result === false) {
if (empty($tokens_of_children)) return true;
else return array();
}
return $result;
}
}

View File

@@ -5,6 +5,7 @@ require_once 'HTMLPurifier/ConfigSchema.php';
// member variables
require_once 'HTMLPurifier/HTMLDefinition.php';
require_once 'HTMLPurifier/CSSDefinition.php';
require_once 'HTMLPurifier/URIDefinition.php';
require_once 'HTMLPurifier/Doctype.php';
require_once 'HTMLPurifier/DefinitionCacheFactory.php';
@@ -41,7 +42,7 @@ class HTMLPurifier_Config
/**
* HTML Purifier's version
*/
var $version = '2.0.1';
var $version = '2.1.5';
/**
* Two-level associative array of configuration directives
@@ -75,6 +76,11 @@ class HTMLPurifier_Config
*/
var $serials = array();
/**
* Serial for entire configuration object
*/
var $serial;
/**
* @param $definition HTMLPurifier_ConfigSchema that defines what directives
* are allowed.
@@ -101,7 +107,6 @@ class HTMLPurifier_Config
$ret = HTMLPurifier_Config::createDefault();
if (is_string($config)) $ret->loadIni($config);
elseif (is_array($config)) $ret->loadArray($config);
if (isset($revision)) $ret->revision = $revision;
return $ret;
}
@@ -168,6 +173,17 @@ class HTMLPurifier_Config
return $this->serials[$namespace];
}
/**
* Returns a md5 signature for the entire configuration object
* that uniquely identifies that particular configuration
*/
function getSerial() {
if (empty($this->serial)) {
$this->serial = md5(serialize($this->getAll()));
}
return $this->serial;
}
/**
* Retrieves all directives, organized by namespace
*/
@@ -298,6 +314,8 @@ class HTMLPurifier_Config
$this->definitions[$type] = new HTMLPurifier_HTMLDefinition();
} elseif ($type == 'CSS') {
$this->definitions[$type] = new HTMLPurifier_CSSDefinition();
} elseif ($type == 'URI') {
$this->definitions[$type] = new HTMLPurifier_URIDefinition();
} else {
trigger_error("Definition of $type type not supported");
$false = false;
@@ -396,6 +414,26 @@ class HTMLPurifier_Config
* @static
*/
function loadArrayFromForm($array, $index, $allowed = true, $mq_fix = true) {
$ret = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix);
$config = HTMLPurifier_Config::create($ret);
return $config;
}
/**
* Merges in configuration values from $_GET/$_POST to object. NOT STATIC.
* @note Same parameters as loadArrayFromForm
*/
function mergeArrayFromForm($array, $index, $allowed = true, $mq_fix = true) {
$ret = HTMLPurifier_Config::prepareArrayFromForm($array, $index, $allowed, $mq_fix);
$this->loadArray($ret);
}
/**
* Prepares an array from a form into something usable for the more
* strict parts of HTMLPurifier_Config
* @static
*/
function prepareArrayFromForm($array, $index, $allowed = true, $mq_fix = true) {
$array = (isset($array[$index]) && is_array($array[$index])) ? $array[$index] : array();
$mq = get_magic_quotes_gpc() && $mq_fix;
@@ -412,9 +450,7 @@ class HTMLPurifier_Config
$value = $mq ? stripslashes($array[$skey]) : $array[$skey];
$ret[$ns][$directive] = $value;
}
$config = HTMLPurifier_Config::create($ret);
return $config;
return $ret;
}
/**

View File

@@ -6,6 +6,8 @@ require_once 'HTMLPurifier/ConfigDef/Namespace.php';
require_once 'HTMLPurifier/ConfigDef/Directive.php';
require_once 'HTMLPurifier/ConfigDef/DirectiveAlias.php';
if (!defined('HTMLPURIFIER_SCHEMA_STRICT')) define('HTMLPURIFIER_SCHEMA_STRICT', false);
/**
* Configuration definition, defines directives and their defaults.
* @note If you update this, please update Printer_ConfigForm
@@ -49,6 +51,8 @@ class HTMLPurifier_ConfigSchema {
var $types = array(
'string' => 'String',
'istring' => 'Case-insensitive string',
'text' => 'Text',
'itext' => 'Case-insensitive text',
'int' => 'Integer',
'float' => 'Float',
'bool' => 'Boolean',
@@ -100,27 +104,30 @@ class HTMLPurifier_ConfigSchema {
* HTMLPurifier_DirectiveDef::$type for allowed values
* @param $description Description of directive for documentation
*/
function define(
$namespace, $name, $default, $type,
$description
) {
function define($namespace, $name, $default, $type, $description) {
$def =& HTMLPurifier_ConfigSchema::instance();
if (!isset($def->info[$namespace])) {
trigger_error('Cannot define directive for undefined namespace',
E_USER_ERROR);
return;
}
if (!ctype_alnum($name)) {
trigger_error('Directive name must be alphanumeric',
E_USER_ERROR);
return;
}
if (empty($description)) {
trigger_error('Description must be non-empty',
E_USER_ERROR);
return;
// basic sanity checks
if (HTMLPURIFIER_SCHEMA_STRICT) {
if (!isset($def->info[$namespace])) {
trigger_error('Cannot define directive for undefined namespace',
E_USER_ERROR);
return;
}
if (!ctype_alnum($name)) {
trigger_error('Directive name must be alphanumeric',
E_USER_ERROR);
return;
}
if (empty($description)) {
trigger_error('Description must be non-empty',
E_USER_ERROR);
return;
}
}
if (isset($def->info[$namespace][$name])) {
// already defined
if (
$def->info[$namespace][$name]->type !== $type ||
$def->defaults[$namespace][$name] !== $default
@@ -129,29 +136,35 @@ class HTMLPurifier_ConfigSchema {
return;
}
} else {
// process modifiers
// needs defining
// process modifiers (OPTIMIZE!)
$type_values = explode('/', $type, 2);
$type = $type_values[0];
$modifier = isset($type_values[1]) ? $type_values[1] : false;
$allow_null = ($modifier === 'null');
if (!isset($def->types[$type])) {
trigger_error('Invalid type for configuration directive',
E_USER_ERROR);
return;
}
$default = $def->validate($default, $type, $allow_null);
if ($def->isError($default)) {
trigger_error('Default value does not match directive type',
E_USER_ERROR);
return;
if (HTMLPURIFIER_SCHEMA_STRICT) {
if (!isset($def->types[$type])) {
trigger_error('Invalid type for configuration directive',
E_USER_ERROR);
return;
}
$default = $def->validate($default, $type, $allow_null);
if ($def->isError($default)) {
trigger_error('Default value does not match directive type',
E_USER_ERROR);
return;
}
}
$def->info[$namespace][$name] =
new HTMLPurifier_ConfigDef_Directive();
$def->info[$namespace][$name]->type = $type;
$def->info[$namespace][$name]->allow_null = $allow_null;
$def->defaults[$namespace][$name] = $default;
}
if (!HTMLPURIFIER_SCHEMA_STRICT) return;
$backtrace = debug_backtrace();
$file = $def->mungeFilename($backtrace[0]['file']);
$line = $backtrace[0]['line'];
@@ -166,19 +179,21 @@ class HTMLPurifier_ConfigSchema {
*/
function defineNamespace($namespace, $description) {
$def =& HTMLPurifier_ConfigSchema::instance();
if (isset($def->info[$namespace])) {
trigger_error('Cannot redefine namespace', E_USER_ERROR);
return;
}
if (!ctype_alnum($namespace)) {
trigger_error('Namespace name must be alphanumeric',
E_USER_ERROR);
return;
}
if (empty($description)) {
trigger_error('Description must be non-empty',
E_USER_ERROR);
return;
if (HTMLPURIFIER_SCHEMA_STRICT) {
if (isset($def->info[$namespace])) {
trigger_error('Cannot redefine namespace', E_USER_ERROR);
return;
}
if (!ctype_alnum($namespace)) {
trigger_error('Namespace name must be alphanumeric',
E_USER_ERROR);
return;
}
if (empty($description)) {
trigger_error('Description must be non-empty',
E_USER_ERROR);
return;
}
}
$def->info[$namespace] = array();
$def->info_namespace[$namespace] = new HTMLPurifier_ConfigDef_Namespace();
@@ -199,23 +214,25 @@ class HTMLPurifier_ConfigSchema {
*/
function defineValueAliases($namespace, $name, $aliases) {
$def =& HTMLPurifier_ConfigSchema::instance();
if (!isset($def->info[$namespace][$name])) {
if (HTMLPURIFIER_SCHEMA_STRICT && !isset($def->info[$namespace][$name])) {
trigger_error('Cannot set value alias for non-existant directive',
E_USER_ERROR);
return;
}
foreach ($aliases as $alias => $real) {
if (!$def->info[$namespace][$name] !== true &&
!isset($def->info[$namespace][$name]->allowed[$real])
) {
trigger_error('Cannot define alias to value that is not allowed',
E_USER_ERROR);
return;
}
if (isset($def->info[$namespace][$name]->allowed[$alias])) {
trigger_error('Cannot define alias over allowed value',
E_USER_ERROR);
return;
if (HTMLPURIFIER_SCHEMA_STRICT) {
if (!$def->info[$namespace][$name] !== true &&
!isset($def->info[$namespace][$name]->allowed[$real])
) {
trigger_error('Cannot define alias to value that is not allowed',
E_USER_ERROR);
return;
}
if (isset($def->info[$namespace][$name]->allowed[$alias])) {
trigger_error('Cannot define alias over allowed value',
E_USER_ERROR);
return;
}
}
$def->info[$namespace][$name]->aliases[$alias] = $real;
}
@@ -230,14 +247,14 @@ class HTMLPurifier_ConfigSchema {
*/
function defineAllowedValues($namespace, $name, $allowed_values) {
$def =& HTMLPurifier_ConfigSchema::instance();
if (!isset($def->info[$namespace][$name])) {
if (HTMLPURIFIER_SCHEMA_STRICT && !isset($def->info[$namespace][$name])) {
trigger_error('Cannot define allowed values for undefined directive',
E_USER_ERROR);
return;
}
$directive =& $def->info[$namespace][$name];
$type = $directive->type;
if ($type != 'string' && $type != 'istring') {
if (HTMLPURIFIER_SCHEMA_STRICT && $type != 'string' && $type != 'istring') {
trigger_error('Cannot define allowed values for directive whose type is not string',
E_USER_ERROR);
return;
@@ -248,8 +265,11 @@ class HTMLPurifier_ConfigSchema {
foreach ($allowed_values as $value) {
$directive->allowed[$value] = true;
}
if ($def->defaults[$namespace][$name] !== null &&
!isset($directive->allowed[$def->defaults[$namespace][$name]])) {
if (
HTMLPURIFIER_SCHEMA_STRICT &&
$def->defaults[$namespace][$name] !== null &&
!isset($directive->allowed[$def->defaults[$namespace][$name]])
) {
trigger_error('Default value must be in allowed range of variables',
E_USER_ERROR);
$directive->allowed = true; // undo undo!
@@ -267,30 +287,32 @@ class HTMLPurifier_ConfigSchema {
*/
function defineAlias($namespace, $name, $new_namespace, $new_name) {
$def =& HTMLPurifier_ConfigSchema::instance();
if (!isset($def->info[$namespace])) {
trigger_error('Cannot define directive alias in undefined namespace',
E_USER_ERROR);
return;
}
if (!ctype_alnum($name)) {
trigger_error('Directive name must be alphanumeric',
E_USER_ERROR);
return;
}
if (isset($def->info[$namespace][$name])) {
trigger_error('Cannot define alias over directive',
E_USER_ERROR);
return;
}
if (!isset($def->info[$new_namespace][$new_name])) {
trigger_error('Cannot define alias to undefined directive',
E_USER_ERROR);
return;
}
if ($def->info[$new_namespace][$new_name]->class == 'alias') {
trigger_error('Cannot define alias to alias',
E_USER_ERROR);
return;
if (HTMLPURIFIER_SCHEMA_STRICT) {
if (!isset($def->info[$namespace])) {
trigger_error('Cannot define directive alias in undefined namespace',
E_USER_ERROR);
return;
}
if (!ctype_alnum($name)) {
trigger_error('Directive name must be alphanumeric',
E_USER_ERROR);
return;
}
if (isset($def->info[$namespace][$name])) {
trigger_error('Cannot define alias over directive',
E_USER_ERROR);
return;
}
if (!isset($def->info[$new_namespace][$new_name])) {
trigger_error('Cannot define alias to undefined directive',
E_USER_ERROR);
return;
}
if ($def->info[$new_namespace][$new_name]->class == 'alias') {
trigger_error('Cannot define alias to alias',
E_USER_ERROR);
return;
}
}
$def->info[$namespace][$name] =
new HTMLPurifier_ConfigDef_DirectiveAlias(
@@ -313,8 +335,10 @@ class HTMLPurifier_ConfigSchema {
return $var;
case 'istring':
case 'string':
case 'text': // no difference, just is longer/multiple line string
case 'itext':
if (!is_string($var)) break;
if ($type === 'istring') $var = strtolower($var);
if ($type === 'istring' || $type === 'itext') $var = strtolower($var);
return $var;
case 'int':
if (is_string($var) && ctype_digit($var)) $var = (int) $var;
@@ -345,9 +369,13 @@ class HTMLPurifier_ConfigSchema {
// a single empty string item, but having an empty
// array is more intuitive
if ($var == '') return array();
// simplistic string to array method that only works
// for simple lists of tag names or alphanumeric characters
$var = explode(',',$var);
if (strpos($var, "\n") === false && strpos($var, "\r") === false) {
// simplistic string to array method that only works
// for simple lists of tag names or alphanumeric characters
$var = explode(',',$var);
} else {
$var = preg_split('/(,|[\n\r]+)/', $var);
}
// remove spaces
foreach ($var as $i => $j) $var[$i] = trim($j);
if ($type === 'hash') {
@@ -388,6 +416,7 @@ class HTMLPurifier_ConfigSchema {
* Takes an absolute path and munges it into a more manageable relative path
*/
function mungeFilename($filename) {
if (!HTMLPURIFIER_SCHEMA_STRICT) return $filename;
$offset = strrpos($filename, 'HTMLPurifier');
$filename = substr($filename, $offset);
$filename = str_replace('\\', '/', $filename);

View File

@@ -5,6 +5,7 @@ require_once 'HTMLPurifier/ChildDef.php';
require_once 'HTMLPurifier/ChildDef/Empty.php';
require_once 'HTMLPurifier/ChildDef/Required.php';
require_once 'HTMLPurifier/ChildDef/Optional.php';
require_once 'HTMLPurifier/ChildDef/Custom.php';
// NOT UNIT TESTED!!!

View File

@@ -120,6 +120,9 @@ class HTMLPurifier_DefinitionCache
/**
* Clears all expired (older version or revision) objects from cache
* @note Be carefuly implementing this method as flush. Flush must
* not interfere with other Definition types, and cleanup()
* should not be repeatedly called by userland code.
*/
function cleanup($config) {
trigger_error('Cannot call abstract method', E_USER_ERROR);

View File

@@ -24,7 +24,7 @@ class HTMLPurifier_DefinitionCache_Null extends HTMLPurifier_DefinitionCache
return false;
}
function flush() {
function flush($config) {
return false;
}

View File

@@ -99,7 +99,7 @@ class HTMLPurifier_DefinitionCache_Serializer extends
*/
function generateBaseDirectoryPath($config) {
$base = $config->get('Cache', 'SerializerPath');
$base = is_null($base) ? dirname(__FILE__) . '/Serializer' : $base;
$base = is_null($base) ? HTMLPURIFIER_PREFIX . '/HTMLPurifier/DefinitionCache/Serializer' : $base;
return $base;
}

View File

@@ -1,6 +1,7 @@
<?php
require_once 'HTMLPurifier/DefinitionCache.php';
require_once 'HTMLPurifier/DefinitionCache/Serializer.php';
HTMLPurifier_ConfigSchema::define(
'Cache', 'DefinitionImpl', 'Serializer', 'string/null', '
@@ -10,10 +11,6 @@ to disable caching (not recommended, as you will see a definite
performance degradation). This directive has been available since 2.0.0.
');
HTMLPurifier_ConfigSchema::defineAllowedValues(
'Cache', 'DefinitionImpl', array('Serializer')
);
HTMLPurifier_ConfigSchema::defineAlias(
'Core', 'DefinitionCache',
'Cache', 'DefinitionImpl'
@@ -27,6 +24,7 @@ class HTMLPurifier_DefinitionCacheFactory
{
var $caches = array('Serializer' => array());
var $implementations = array();
var $decorators = array();
/**
@@ -51,14 +49,21 @@ class HTMLPurifier_DefinitionCacheFactory
return $instance;
}
/**
* Registers a new definition cache object
* @param $short Short name of cache object, for reference
* @param $long Full class name of cache object, for construction
*/
function register($short, $long) {
$this->implementations[$short] = $long;
}
/**
* Factory method that creates a cache object based on configuration
* @param $name Name of definitions handled by cache
* @param $config Instance of HTMLPurifier_Config
*/
function &create($type, $config) {
// only one implementation as for right now, $config will
// be used to determine implementation
$method = $config->get('Cache', 'DefinitionImpl');
if ($method === null) {
$null = new HTMLPurifier_DefinitionCache_Null($type);
@@ -67,7 +72,17 @@ class HTMLPurifier_DefinitionCacheFactory
if (!empty($this->caches[$method][$type])) {
return $this->caches[$method][$type];
}
$cache = new HTMLPurifier_DefinitionCache_Serializer($type);
if (
isset($this->implementations[$method]) &&
class_exists($class = $this->implementations[$method])
) {
$cache = new $class($type);
} else {
if ($method != 'Serializer') {
trigger_error("Unrecognized DefinitionCache $method, using Serializer instead", E_USER_WARNING);
}
$cache = new HTMLPurifier_DefinitionCache_Serializer($type);
}
foreach ($this->decorators as $decorator) {
$new_cache = $decorator->decorate($cache);
// prevent infinite recursion in PHP 4

View File

@@ -82,7 +82,7 @@ class HTMLPurifier_ElementDef
/**
* List of the names of required attributes this element has. Dynamically
* populated.
* populated by HTMLPurifier_HTMLDefinition::getElement
* @public
*/
var $required_attr = array();

View File

@@ -62,6 +62,12 @@ class HTMLPurifier_Encoder
trigger_error('Cannot instantiate encoder, call methods statically', E_USER_ERROR);
}
/**
* Error-handler that mutes errors, alternative to shut-up operator.
*/
function muteErrorHandler() {}
/**
/**
* Cleans a UTF-8 string for well-formedness and SGML validity
*
@@ -90,26 +96,13 @@ class HTMLPurifier_Encoder
*/
function cleanUTF8($str, $force_php = false) {
static $non_sgml_chars = array();
if (empty($non_sgml_chars)) {
for ($i = 0; $i <= 31; $i++) {
// non-SGML ASCII chars
// save \r, \t and \n
if ($i == 9 || $i == 13 || $i == 10) continue;
$non_sgml_chars[chr($i)] = '';
}
for ($i = 127; $i <= 159; $i++) {
$non_sgml_chars[HTMLPurifier_Encoder::unichr($i)] = '';
}
}
static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv');
if ($iconv && !$force_php) {
// do the shortcut way
$str = @iconv('UTF-8', 'UTF-8//IGNORE', $str);
return strtr($str, $non_sgml_chars);
// UTF-8 validity is checked since PHP 4.3.5
// This is an optimization: if the string is already valid UTF-8, no
// need to do PHP stuff. 99% of the time, this will be the case.
// The regexp matches the XML char production, as well as well as excluding
// non-SGML codepoints U+007F to U+009F
if (preg_match('/^[\x{9}\x{A}\x{D}\x{20}-\x{7E}\x{A0}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]*$/Du', $str)) {
return $str;
}
$mState = 0; // cached expected number of octets after the current octet
@@ -220,7 +213,17 @@ class HTMLPurifier_Encoder
) {
} elseif (0xFEFF != $mUcs4 && // omit BOM
!($mUcs4 >= 128 && $mUcs4 <= 159) // omit non-SGML
// check for valid Char unicode codepoints
(
0x9 == $mUcs4 ||
0xA == $mUcs4 ||
0xD == $mUcs4 ||
(0x20 <= $mUcs4 && 0x7E >= $mUcs4) ||
// 7F-9F is not strictly prohibited by XML,
// but it is non-SGML, and thus we don't allow it
(0xA0 <= $mUcs4 && 0xD7FF >= $mUcs4) ||
(0x10000 <= $mUcs4 && 0x10FFFF >= $mUcs4)
)
) {
$out .= $char;
}
@@ -313,14 +316,23 @@ class HTMLPurifier_Encoder
* @static
*/
function convertToUTF8($str, $config, &$context) {
static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv');
$encoding = $config->get('Core', 'Encoding');
if ($encoding === 'utf-8') return $str;
static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv');
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
return @iconv($encoding, 'utf-8//IGNORE', $str);
$str = iconv($encoding, 'utf-8//IGNORE', $str);
// If the string is bjorked by Shift_JIS or a similar encoding
// that doesn't support all of ASCII, convert the naughty
// characters to their true byte-wise ASCII/UTF-8 equivalents.
$str = strtr($str, HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding));
restore_error_handler();
return $str;
} elseif ($encoding === 'iso-8859-1') {
return @utf8_encode($str);
$str = utf8_encode($str);
restore_error_handler();
return $str;
}
trigger_error('Encoding not supported', E_USER_ERROR);
}
@@ -332,17 +344,31 @@ class HTMLPurifier_Encoder
* characters being omitted.
*/
function convertFromUTF8($str, $config, &$context) {
static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv');
$encoding = $config->get('Core', 'Encoding');
if ($encoding === 'utf-8') return $str;
if ($config->get('Core', 'EscapeNonASCIICharacters')) {
static $iconv = null;
if ($iconv === null) $iconv = function_exists('iconv');
if ($escape = $config->get('Core', 'EscapeNonASCIICharacters')) {
$str = HTMLPurifier_Encoder::convertToASCIIDumbLossless($str);
}
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
if ($iconv && !$config->get('Test', 'ForceNoIconv')) {
return @iconv('utf-8', $encoding . '//IGNORE', $str);
// Undo our previous fix in convertToUTF8, otherwise iconv will barf
$ascii_fix = HTMLPurifier_Encoder::testEncodingSupportsASCII($encoding);
if (!$escape && !empty($ascii_fix)) {
$clear_fix = array();
foreach ($ascii_fix as $utf8 => $native) $clear_fix[$utf8] = '';
$str = strtr($str, $clear_fix);
}
$str = strtr($str, array_flip($ascii_fix));
// Normal stuff
$str = iconv('utf-8', $encoding . '//IGNORE', $str);
restore_error_handler();
return $str;
} elseif ($encoding === 'iso-8859-1') {
return @utf8_decode($str);
$str = utf8_decode($str);
restore_error_handler();
return $str;
}
trigger_error('Encoding not supported', E_USER_ERROR);
}
@@ -395,6 +421,47 @@ class HTMLPurifier_Encoder
return $result;
}
/**
* This expensive function tests whether or not a given character
* encoding supports ASCII. 7/8-bit encodings like Shift_JIS will
* fail this test, and require special processing. Variable width
* encodings shouldn't ever fail.
*
* @param string $encoding Encoding name to test, as per iconv format
* @param bool $bypass Whether or not to bypass the precompiled arrays.
* @return Array of UTF-8 characters to their corresponding ASCII,
* which can be used to "undo" any overzealous iconv action.
*/
function testEncodingSupportsASCII($encoding, $bypass = false) {
static $encodings = array();
if (!$bypass) {
if (isset($encodings[$encoding])) return $encodings[$encoding];
$lenc = strtolower($encoding);
switch ($lenc) {
case 'shift_jis':
return array("\xC2\xA5" => '\\', "\xE2\x80\xBE" => '~');
case 'johab':
return array("\xE2\x82\xA9" => '\\');
}
if (strpos($lenc, 'iso-8859-') === 0) return array();
}
$ret = array();
set_error_handler(array('HTMLPurifier_Encoder', 'muteErrorHandler'));
if (iconv('UTF-8', $encoding, 'a') === false) return false;
for ($i = 0x20; $i <= 0x7E; $i++) { // all printable ASCII chars
$c = chr($i);
if (iconv('UTF-8', "$encoding//IGNORE", $c) === '') {
// Reverse engineer: what's the UTF-8 equiv of this byte
// sequence? This assumes that there's no variable width
// encoding that doesn't support ASCII.
$ret[iconv($encoding, 'UTF-8//IGNORE', $c)] = $c;
}
}
restore_error_handler();
$encodings[$encoding] = $ret;
return $ret;
}
}

View File

@@ -19,7 +19,7 @@ class HTMLPurifier_EntityLookup {
*/
function setup($file = false) {
if (!$file) {
$file = dirname(__FILE__) . '/EntityLookup/entities.ser';
$file = HTMLPURIFIER_PREFIX . '/HTMLPurifier/EntityLookup/entities.ser';
}
$this->table = unserialize(file_get_contents($file));
}

View File

@@ -110,12 +110,13 @@ HTMLPurifier_ConfigSchema::define(
');
HTMLPurifier_ConfigSchema::define(
'HTML', 'Allowed', null, 'string/null', '
'HTML', 'Allowed', null, 'itext/null', '
<p>
This is a convenience directive that rolls the functionality of
%HTML.AllowedElements and %HTML.AllowedAttributes into one directive.
Specify elements and attributes that are allowed using:
<code>element1[attr1|attr2],element2...</code>.
<code>element1[attr1|attr2],element2...</code>. You can also use
newlines instead of commas to separate elements.
</p>
<p>
<strong>Warning</strong>:
@@ -221,6 +222,8 @@ class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition
/**
* Adds a custom attribute to a pre-existing element
* @note This is strictly convenience, and does not have a corresponding
* method in HTMLPurifier_HTMLModule
* @param $element_name String element name to add attribute to
* @param $attr_name String name of attribute
* @param $def Attribute definition, can be string or object, see
@@ -228,20 +231,37 @@ class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition
*/
function addAttribute($element_name, $attr_name, $def) {
$module =& $this->getAnonymousModule();
$element =& $module->addBlankElement($element_name);
if (!isset($module->info[$element_name])) {
$element =& $module->addBlankElement($element_name);
} else {
$element =& $module->info[$element_name];
}
$element->attr[$attr_name] = $def;
}
/**
* Adds a custom element to your HTML definition
* @note See HTMLPurifier_HTMLModule::addElement for detailed
* parameter descriptions.
* parameter and return value descriptions.
*/
function addElement($element_name, $type, $contents, $attr_collections, $attributes) {
function &addElement($element_name, $type, $contents, $attr_collections, $attributes) {
$module =& $this->getAnonymousModule();
// assume that if the user is calling this, the element
// is safe. This may not be a good idea
$module->addElement($element_name, true, $type, $contents, $attr_collections, $attributes);
$element =& $module->addElement($element_name, true, $type, $contents, $attr_collections, $attributes);
return $element;
}
/**
* Adds a blank element to your HTML definition, for overriding
* existing behavior
* @note See HTMLPurifier_HTMLModule::addBlankElement for detailed
* parameter and return value descriptions.
*/
function &addBlankElement($element_name) {
$module =& $this->getAnonymousModule();
$element =& $module->addBlankElement($element_name);
return $element;
}
/**
@@ -329,7 +349,7 @@ class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition
if (isset($this->info_content_sets['Block'][$block_wrapper])) {
$this->info_block_wrapper = $block_wrapper;
} else {
trigger_error('Cannot use non-block element as block wrapper.',
trigger_error('Cannot use non-block element as block wrapper',
E_USER_ERROR);
}
@@ -339,7 +359,7 @@ class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition
$this->info_parent = $parent;
$this->info_parent_def = $def;
} else {
trigger_error('Cannot use unrecognized element as parent.',
trigger_error('Cannot use unrecognized element as parent',
E_USER_ERROR);
$this->info_parent_def = $this->manager->getElement($this->info_parent, true);
}
@@ -426,8 +446,9 @@ class HTMLPurifier_HTMLDefinition extends HTMLPurifier_Definition
$elements = array();
$attributes = array();
$chunks = explode(',', $list);
$chunks = preg_split('/(,|[\n\r]+)/', $list);
foreach ($chunks as $chunk) {
if (empty($chunk)) continue;
// remove TinyMCE element control characters
if (!strpos($chunk, '[')) {
$element = $chunk;

View File

@@ -219,5 +219,14 @@ class HTMLPurifier_HTMLModule
}
return $ret;
}
/**
* Lazy load construction of the module after determining whether
* or not it's needed, and also when a finalized configuration object
* is available.
* @param $config Instance of HTMLPurifier_Config
*/
function setup($config) {}
}

View File

@@ -15,7 +15,7 @@ class HTMLPurifier_HTMLModule_Bdo extends HTMLPurifier_HTMLModule
'I18N' => array('dir' => false)
);
function HTMLPurifier_HTMLModule_Bdo() {
function setup($config) {
$bdo =& $this->addElement(
'bdo', true, 'Inline', 'Inline', array('Core', 'Lang'),
array(

View File

@@ -12,7 +12,7 @@ class HTMLPurifier_HTMLModule_Edit extends HTMLPurifier_HTMLModule
var $name = 'Edit';
function HTMLPurifier_HTMLModule_Edit() {
function setup($config) {
$contents = 'Chameleon: #PCDATA | Inline ! #PCDATA | Flow';
$attr = array(
'cite' => 'URI',

View File

@@ -11,7 +11,7 @@ class HTMLPurifier_HTMLModule_Hypertext extends HTMLPurifier_HTMLModule
var $name = 'Hypertext';
function HTMLPurifier_HTMLModule_Hypertext() {
function setup($config) {
$a =& $this->addElement(
'a', true, 'Inline', 'Inline', 'Common',
array(

View File

@@ -5,6 +5,18 @@ require_once 'HTMLPurifier/HTMLModule.php';
require_once 'HTMLPurifier/AttrDef/URI.php';
require_once 'HTMLPurifier/AttrTransform/ImgRequired.php';
HTMLPurifier_ConfigSchema::define(
'HTML', 'MaxImgLength', 1200, 'int/null', '
<p>
This directive controls the maximum number of pixels in the width and
height attributes in <code>img</code> tags. This is
in place to prevent imagecrash attacks, disable with null at your own risk.
This directive is similar to %CSS.MaxImgLength, and both should be
concurrently edited, although there are
subtle differences in the input format (the HTML max is an integer).
</p>
');
/**
* XHTML 1.1 Image Module provides basic image embedding.
* @note There is specialized code for removing empty images in
@@ -15,17 +27,26 @@ class HTMLPurifier_HTMLModule_Image extends HTMLPurifier_HTMLModule
var $name = 'Image';
function HTMLPurifier_HTMLModule_Image() {
function setup($config) {
$max = $config->get('HTML', 'MaxImgLength');
$img =& $this->addElement(
'img', true, 'Inline', 'Empty', 'Common',
array(
'alt*' => 'Text',
'height' => 'Length',
// According to the spec, it's Length, but percents can
// be abused, so we allow only Pixels. A trusted module
// could overload this with the real value.
'height' => 'Pixels#' . $max,
'width' => 'Pixels#' . $max,
'longdesc' => 'URI',
'src*' => new HTMLPurifier_AttrDef_URI(true), // embedded
'width' => 'Length'
)
);
if ($max === null || $config->get('HTML', 'Trusted')) {
$img->attr['height'] =
$img->attr['width'] = 'Length';
}
// kind of strange, but splitting things up would be inefficient
$img->attr_transform_pre[] =
$img->attr_transform_post[] =

View File

@@ -25,7 +25,7 @@ class HTMLPurifier_HTMLModule_Legacy extends HTMLPurifier_HTMLModule
var $name = 'Legacy';
function HTMLPurifier_HTMLModule_Legacy() {
function setup($config) {
$this->addElement('basefont', true, 'Inline', 'Empty', false, array(
'color' => 'Color',

View File

@@ -21,7 +21,7 @@ class HTMLPurifier_HTMLModule_List extends HTMLPurifier_HTMLModule
var $content_sets = array('Flow' => 'List');
function HTMLPurifier_HTMLModule_List() {
function setup($config) {
$this->addElement('ol', true, 'List', 'Required: li', 'Common');
$this->addElement('ul', true, 'List', 'Required: li', 'Common');
$this->addElement('dl', true, 'List', 'Required: dt | dd', 'Common');

View File

@@ -0,0 +1,47 @@
<?php
require_once 'HTMLPurifier/HTMLModule.php';
/**
* XHTML 1.1 Object Module, defines elements for generic object inclusion
* @warning Users will commonly use <embed> to cater to legacy browsers: this
* module does not allow this sort of behavior
*/
class HTMLPurifier_HTMLModule_Object extends HTMLPurifier_HTMLModule
{
var $name = 'Object';
function setup($config) {
$this->addElement('object', false, 'Inline', 'Optional: #PCDATA | Flow | param', 'Common',
array(
'archive' => 'URI',
'classid' => 'URI',
'codebase' => 'URI',
'codetype' => 'Text',
'data' => 'URI',
'declare' => 'Bool#declare',
'height' => 'Length',
'name' => 'CDATA',
'standby' => 'Text',
'tabindex' => 'Number',
'type' => 'ContentType',
'width' => 'Length'
)
);
$this->addElement('param', false, false, 'Empty', false,
array(
'id' => 'ID',
'name*' => 'Text',
'type' => 'Text',
'value' => 'Text',
'valuetype' => 'Enum#data,ref,object'
)
);
}
}

View File

@@ -17,7 +17,7 @@ class HTMLPurifier_HTMLModule_Presentation extends HTMLPurifier_HTMLModule
var $name = 'Presentation';
function HTMLPurifier_HTMLModule_Presentation() {
function setup($config) {
$this->addElement('b', true, 'Inline', 'Inline', 'Common');
$this->addElement('big', true, 'Inline', 'Inline', 'Common');
$this->addElement('hr', true, 'Block', 'Empty', 'Common');

View File

@@ -0,0 +1,28 @@
<?php
require_once 'HTMLPurifier/HTMLModule.php';
/**
* XHTML 1.1 Ruby Annotation Module, defines elements that indicate
* short runs of text alongside base text for annotation or pronounciation.
*/
class HTMLPurifier_HTMLModule_Ruby extends HTMLPurifier_HTMLModule
{
var $name = 'Ruby';
function setup($config) {
$this->addElement('ruby', true, 'Inline',
'Custom: ((rb, (rt | (rp, rt, rp))) | (rbc, rtc, rtc?))',
'Common');
$this->addElement('rbc', true, false, 'Required: rb', 'Common');
$this->addElement('rtc', true, false, 'Required: rt', 'Common');
$rb =& $this->addElement('rb', true, false, 'Inline', 'Common');
$rb->excludes = array('ruby' => true);
$rt =& $this->addElement('rt', true, false, 'Inline', 'Common', array('rbspan' => 'Number'));
$rt->excludes = array('ruby' => true);
$this->addElement('rp', true, false, 'Optional: #PCDATA', 'Common');
}
}

View File

@@ -32,7 +32,7 @@ class HTMLPurifier_HTMLModule_Scripting extends HTMLPurifier_HTMLModule
var $elements = array('script', 'noscript');
var $content_sets = array('Block' => 'script | noscript', 'Inline' => 'script | noscript');
function HTMLPurifier_HTMLModule_Scripting() {
function setup($config) {
// TODO: create custom child-definition for noscript that
// auto-wraps stray #PCDATA in a similar manner to
// blockquote's custom definition (we would use it but

View File

@@ -18,7 +18,7 @@ class HTMLPurifier_HTMLModule_StyleAttribute extends HTMLPurifier_HTMLModule
'Core' => array(0 => array('Style'))
);
function HTMLPurifier_HTMLModule_StyleAttribute() {
function setup($config) {
$this->attr_collections['Style']['style'] = new HTMLPurifier_AttrDef_CSS();
}

View File

@@ -11,7 +11,7 @@ class HTMLPurifier_HTMLModule_Tables extends HTMLPurifier_HTMLModule
var $name = 'Tables';
function HTMLPurifier_HTMLModule_Tables() {
function setup($config) {
$this->addElement('caption', true, false, 'Inline', 'Common');

View File

@@ -10,7 +10,7 @@ class HTMLPurifier_HTMLModule_Target extends HTMLPurifier_HTMLModule
var $name = 'Target';
function HTMLPurifier_HTMLModule_Target() {
function setup($config) {
$elements = array('a');
foreach ($elements as $name) {
$e =& $this->addBlankElement($name);

View File

@@ -22,7 +22,7 @@ class HTMLPurifier_HTMLModule_Text extends HTMLPurifier_HTMLModule
'Flow' => 'Heading | Block | Inline'
);
function HTMLPurifier_HTMLModule_Text() {
function setup($config) {
// Inline Phrasal -------------------------------------------------
$this->addElement('abbr', true, 'Inline', 'Inline', 'Common');

View File

@@ -70,7 +70,7 @@ class HTMLPurifier_HTMLModule_Tidy extends HTMLPurifier_HTMLModule
* @todo Wildcard matching and error reporting when an added or
* subtracted fix has no effect.
*/
function construct($config) {
function setup($config) {
// create fixes, initialize fixesForLevel
$fixes = $this->makeFixes();

View File

@@ -13,6 +13,8 @@ require_once 'HTMLPurifier/AttrTransform/Length.php';
require_once 'HTMLPurifier/AttrTransform/ImgSpace.php';
require_once 'HTMLPurifier/AttrTransform/EnumToCSS.php';
require_once 'HTMLPurifier/ChildDef/StrictBlockquote.php';
class HTMLPurifier_HTMLModule_Tidy_XHTMLAndHTML4 extends
HTMLPurifier_HTMLModule_Tidy
{
@@ -188,5 +190,17 @@ class HTMLPurifier_HTMLModule_Tidy_Strict extends
{
var $name = 'Tidy_Strict';
var $defaultLevel = 'light';
function makeFixes() {
$r = parent::makeFixes();
$r['blockquote#content_model_type'] = 'strictblockquote';
return $r;
}
var $defines_child_def = true;
function getChildDef($def) {
if ($def->content_model_type != 'strictblockquote') return parent::getChildDef($def);
return new HTMLPurifier_ChildDef_StrictBlockquote($def->content_model);
}
}

View File

@@ -1,26 +0,0 @@
<?php
require_once 'HTMLPurifier/HTMLModule/Tidy.php';
require_once 'HTMLPurifier/ChildDef/StrictBlockquote.php';
class HTMLPurifier_HTMLModule_Tidy_XHTMLStrict extends
HTMLPurifier_HTMLModule_Tidy
{
var $name = 'Tidy_XHTMLStrict';
var $defaultLevel = 'light';
function makeFixes() {
$r = array();
$r['blockquote#content_model_type'] = 'strictblockquote';
return $r;
}
var $defines_child_def = true;
function getChildDef($def) {
if ($def->content_model_type != 'strictblockquote') return false;
return new HTMLPurifier_ChildDef_StrictBlockquote($def->content_model);
}
}

View File

@@ -28,12 +28,13 @@ require_once 'HTMLPurifier/HTMLModule/Target.php';
require_once 'HTMLPurifier/HTMLModule/Scripting.php';
require_once 'HTMLPurifier/HTMLModule/XMLCommonAttributes.php';
require_once 'HTMLPurifier/HTMLModule/NonXMLCommonAttributes.php';
require_once 'HTMLPurifier/HTMLModule/Ruby.php';
require_once 'HTMLPurifier/HTMLModule/Object.php';
// tidy modules
require_once 'HTMLPurifier/HTMLModule/Tidy.php';
require_once 'HTMLPurifier/HTMLModule/Tidy/XHTMLAndHTML4.php';
require_once 'HTMLPurifier/HTMLModule/Tidy/XHTML.php';
require_once 'HTMLPurifier/HTMLModule/Tidy/XHTMLStrict.php';
require_once 'HTMLPurifier/HTMLModule/Tidy/Proprietary.php';
HTMLPurifier_ConfigSchema::define(
@@ -171,7 +172,7 @@ class HTMLPurifier_HTMLModuleManager
$common = array(
'CommonAttributes', 'Text', 'Hypertext', 'List',
'Presentation', 'Edit', 'Bdo', 'Tables', 'Image',
'StyleAttribute', 'Scripting'
'StyleAttribute', 'Scripting', 'Object'
);
$transitional = array('Legacy', 'Target');
$xml = array('XMLCommonAttributes');
@@ -207,7 +208,7 @@ class HTMLPurifier_HTMLModuleManager
$this->doctypes->register(
'XHTML 1.0 Strict', true,
array_merge($common, $xml, $non_xml),
array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_XHTMLStrict', 'Tidy_Proprietary'),
array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Strict', 'Tidy_Proprietary'),
array(),
'-//W3C//DTD XHTML 1.0 Strict//EN',
'http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd'
@@ -215,8 +216,8 @@ class HTMLPurifier_HTMLModuleManager
$this->doctypes->register(
'XHTML 1.1', true,
array_merge($common, $xml),
array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary'), // Tidy_XHTML1_1
array_merge($common, $xml, array('Ruby')),
array('Tidy_Strict', 'Tidy_XHTML', 'Tidy_Proprietary', 'Tidy_Strict'), // Tidy_XHTML1_1
array(),
'-//W3C//DTD XHTML 1.1//EN',
'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'
@@ -341,13 +342,12 @@ class HTMLPurifier_HTMLModuleManager
foreach ($modules as $module) {
$this->processModule($module);
$this->modules[$module]->setup($config);
}
foreach ($this->doctype->tidyModules as $module) {
$this->processModule($module);
if (method_exists($this->modules[$module], 'construct')) {
$this->modules[$module]->construct($config);
}
$this->modules[$module]->setup($config);
}
// setup lookup table based on all valid modules

View File

@@ -1,11 +1,15 @@
<?php
HTMLPurifier_ConfigSchema::define(
'Attr', 'IDBlacklist', array(), 'list',
'Array of IDs not allowed in the document.'
);
/**
* Component of HTMLPurifier_AttrContext that accumulates IDs to prevent dupes
* @note In Slashdot-speak, dupe means duplicate.
* @note This class does not accept $config or $context, thus, it is the
* burden of the callee to register the appropriate errors or
* configuration.
* @note The default constructor does not accept $config or $context objects:
* use must use the static build() factory method to perform initialization.
*/
class HTMLPurifier_IDAccumulator
{
@@ -16,6 +20,19 @@ class HTMLPurifier_IDAccumulator
*/
var $ids = array();
/**
* Builds an IDAccumulator, also initializing the default blacklist
* @param $config Instance of HTMLPurifier_Config
* @param $context Instance of HTMLPurifier_Context
* @return Fully initialized HTMLPurifier_IDAccumulator
* @static
*/
function build($config, &$context) {
$acc = new HTMLPurifier_IDAccumulator();
$acc->load($config->get('Attr', 'IDBlacklist'));
return $acc;
}
/**
* Add an ID to the lookup table.
* @param $id ID to be added.

View File

@@ -4,10 +4,18 @@
* Injects tokens into the document while parsing for well-formedness.
* This enables "formatter-like" functionality such as auto-paragraphing,
* smiley-ification and linkification to take place.
*
* @todo Allow injectors to request a re-run on their output. This
* would help if an operation is recursive.
*/
class HTMLPurifier_Injector
{
/**
* Advisory name of injector, this is for friendly error messages
*/
var $name;
/**
* Amount of tokens the injector needs to skip + 1. Because
* the decrement is the first thing that happens, this needs to
@@ -40,16 +48,37 @@ class HTMLPurifier_Injector
var $inputIndex;
/**
* Prepares the injector by giving it the config and context objects,
* so that important variables can be extracted and not passed via
* parameter constantly. Remember: always instantiate a new injector
* when handling a set of HTML.
* Array of elements and attributes this injector creates and therefore
* need to be allowed by the definition. Takes form of
* array('element' => array('attr', 'attr2'), 'element2')
*/
var $needed = array();
/**
* Prepares the injector by giving it the config and context objects:
* this allows references to important variables to be made within
* the injector. This function also checks if the HTML environment
* will work with the Injector: if p tags are not allowed, the
* Auto-Paragraphing injector should not be enabled.
* @param $config Instance of HTMLPurifier_Config
* @param $context Instance of HTMLPurifier_Context
* @return Boolean false if success, string of missing needed element/attribute if failure
*/
function prepare($config, &$context) {
$this->htmlDefinition = $config->getHTMLDefinition();
// perform $needed checks
foreach ($this->needed as $element => $attributes) {
if (is_int($element)) $element = $attributes;
if (!isset($this->htmlDefinition->info[$element])) return $element;
if (!is_array($attributes)) continue;
foreach ($attributes as $name) {
if (!isset($this->htmlDefinition->info[$element]->attr[$name])) return "$element.$name";
}
}
$this->currentNesting =& $context->get('CurrentNesting');
$this->inputTokens =& $context->get('InputTokens');
$this->inputIndex =& $context->get('InputIndex');
return false;
}
/**
@@ -77,9 +106,16 @@ class HTMLPurifier_Injector
function handleText(&$token) {}
/**
* Handler that is called when a start token is processed
* Handler that is called when a start or empty token is processed
*/
function handleStart(&$token) {}
function handleElement(&$token) {}
/**
* Notifier that is called when an end token is processed
* @note This differs from handlers in that the token is read-only
*/
function notifyEnd($token) {}
}

View File

@@ -6,15 +6,28 @@ HTMLPurifier_ConfigSchema::define(
'AutoFormat', 'AutoParagraph', false, 'bool', '
<p>
This directive turns on auto-paragraphing, where double newlines are
converted in to paragraphs whenever possible. Auto-paragraphing
applies when:
converted in to paragraphs whenever possible. Auto-paragraphing:
</p>
<ul>
<li>There are inline elements or text in the root node</li>
<li>There are inline elements or text with double newlines or
block elements in nodes that allow paragraph tags</li>
<li>There are double newlines in paragraph tags</li>
<li>Always applies to inline elements or text in the root node,</li>
<li>Applies to inline elements or text with double newlines in nodes
that allow paragraph tags,</li>
<li>Applies to double newlines in paragraph tags</li>
</ul>
<p>
<code>p</code> tags must be allowed for this directive to take effect.
We do not use <code>br</code> tags for paragraphing, as that is
semantically incorrect.
</p>
<p>
To prevent auto-paragraphing as a content-producer, refrain from using
double-newlines except to specify a new paragraph or in contexts where
it has special meaning (whitespace usually has no meaning except in
tags like <code>pre</code>, so this should not be difficult.) To prevent
the paragraphing of inline text adjacent to block elements, wrap them
in <code>div</code> tags (the behavior is slightly different outside of
the root node.)
</p>
<p>
This directive has been available since 2.0.1.
</p>
@@ -27,6 +40,9 @@ HTMLPurifier_ConfigSchema::define(
class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
{
var $name = 'AutoParagraph';
var $needed = array('p');
function _pStart() {
$par = new HTMLPurifier_Token_Start('p');
$par->armor['MakeWellFormed_TagClosedError'] = true;
@@ -54,19 +70,27 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
$ok = false;
// test if up-coming tokens are either block or have
// a double newline in them
$nesting = 0;
for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) {
if ($this->inputTokens[$i]->type == 'start'){
if (!$this->_isInline($this->inputTokens[$i])) {
$ok = true;
// we haven't found a double-newline, and
// we've hit a block element, so don't paragraph
$ok = false;
break;
}
break;
$nesting++;
}
if ($this->inputTokens[$i]->type == 'end') {
if ($nesting <= 0) break;
$nesting--;
}
if ($this->inputTokens[$i]->type == 'end') break;
if ($this->inputTokens[$i]->type == 'text') {
// found it!
if (strpos($this->inputTokens[$i]->data, "\n\n") !== false) {
$ok = true;
break;
}
if (!$this->inputTokens[$i]->is_whitespace) break;
}
}
if ($ok) {
@@ -79,7 +103,7 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
}
function handleStart(&$token) {
function handleElement(&$token) {
// check if we're inside a tag already
if (!empty($this->currentNesting)) {
if ($this->allowsElement('p')) {
@@ -88,11 +112,19 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
// this token is already paragraph, abort
if ($token->name == 'p') return;
// check if this token is adjacent to the parent
if ($this->inputTokens[$this->inputIndex - 1]->type != 'start') {
// this token is a block level, abort
if (!$this->_isInline($token)) return;
// check if this token is adjacent to the parent token
$prev = $this->inputTokens[$this->inputIndex - 1];
if ($prev->type != 'start') {
// not adjacent, we can abort early
// add lead paragraph tag if our token is inline
if ($this->_isInline($token)) {
// and the previous tag was an end paragraph
if (
$prev->name == 'p' && $prev->type == 'end' &&
$this->_isInline($token)
) {
$token = array($this->_pStart(), $token);
}
return;
@@ -105,8 +137,8 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
$ok = false;
// maintain a mini-nesting counter, this lets us bail out
// early if possible
$j = 2; // current nesting, is two due to parent and this start
for ($i = $this->inputIndex + 1; isset($this->inputTokens[$i]); $i++) {
$j = 1; // current nesting, one is due to parent (we recalculate current token)
for ($i = $this->inputIndex; isset($this->inputTokens[$i]); $i++) {
if ($this->inputTokens[$i]->type == 'start') $j++;
if ($this->inputTokens[$i]->type == 'end') $j--;
if ($this->inputTokens[$i]->type == 'text') {
@@ -150,7 +182,14 @@ class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
$needs_start = false;
$needs_end = false;
for ($i = 0, $c = count($raw_paragraphs); $i < $c; $i++) {
$c = count($raw_paragraphs);
if ($c == 1) {
// there were no double-newlines, abort quickly
$result[] = new HTMLPurifier_Token_Text($data);
return;
}
for ($i = 0; $i < $c; $i++) {
$par = $raw_paragraphs[$i];
if (trim($par) !== '') {
$paragraphs[] = $par;

View File

@@ -6,7 +6,8 @@ HTMLPurifier_ConfigSchema::define(
'AutoFormat', 'Linkify', false, 'bool', '
<p>
This directive turns on linkification, auto-linking http, ftp and
https URLs. This directive has been available since 2.0.1.
https URLs. <code>a</code> tags with the <code>href</code> attribute
must be allowed. This directive has been available since 2.0.1.
</p>
');
@@ -16,7 +17,10 @@ HTMLPurifier_ConfigSchema::define(
class HTMLPurifier_Injector_Linkify extends HTMLPurifier_Injector
{
function handleText(&$token, $config, &$context) {
var $name = 'Linkify';
var $needed = array('a' => array('href'));
function handleText(&$token) {
if (!$this->allowsElement('a')) return;
if (strpos($token->data, '://') === false) {

View File

@@ -6,8 +6,9 @@ HTMLPurifier_ConfigSchema::define(
'AutoFormat', 'PurifierLinkify', false, 'bool', '
<p>
Internal auto-formatter that converts configuration directives in
syntax <a>%Namespace.Directive</a> to links. This directive has been available
since 2.0.1.
syntax <a>%Namespace.Directive</a> to links. <code>a</code> tags
with the <code>href</code> attribute must be allowed.
This directive has been available since 2.0.1.
</p>
');
@@ -27,14 +28,16 @@ HTMLPurifier_ConfigSchema::define(
class HTMLPurifier_Injector_PurifierLinkify extends HTMLPurifier_Injector
{
var $name = 'PurifierLinkify';
var $docURL;
var $needed = array('a' => array('href'));
function prepare($config, &$context) {
parent::prepare($config, $context);
$this->docURL = $config->get('AutoFormatParam', 'PurifierLinkifyDocURL');
return parent::prepare($config, $context);
}
function handleText(&$token, $config, &$context) {
function handleText(&$token) {
if (!$this->allowsElement('a')) return;
if (strpos($token->data, '%') === false) return;

View File

@@ -25,6 +25,13 @@ class HTMLPurifier_Language
*/
var $errorNames = array();
/**
* True if no message file was found for this language, so English
* is being used instead. Check this if you'd like to notify the
* user that they've used a non-supported language.
*/
var $error = false;
/**
* Has the language object been loaded yet?
* @private

View File

@@ -0,0 +1,11 @@
<?php
// private language message file for unit testing purposes
// this language file has no class associated with it
$fallback = 'en';
$messages = array(
'HTMLPurifier' => 'HTML Purifier XNone'
);

View File

@@ -28,7 +28,7 @@ $messages = array(
'Strategy_RemoveForeignElements: Foreign element to text' => 'Unrecognized $CurrentToken.Serialized tag converted to text',
'Strategy_RemoveForeignElements: Foreign element removed' => 'Unrecognized $CurrentToken.Serialized tag removed',
'Strategy_RemoveForeignElements: Comment removed' => 'Comment containing "$CurrentToken.Data" removed',
'Strategy_RemoveForeignElements: Script removed' => 'Script removed',
'Strategy_RemoveForeignElements: Foreign meta element removed' => 'Unrecognized $CurrentToken.Serialized meta tag and all descendants removed',
'Strategy_RemoveForeignElements: Token removed to end' => 'Tags and text starting from $1 element where removed to end',
'Strategy_MakeWellFormed: Unnecessary end tag removed' => 'Unnecessary $CurrentToken.Serialized tag removed',

View File

@@ -16,6 +16,7 @@ This directive has been available since 2.0.0.
* caching and fallbacks.
* @note Thanks to MediaWiki for the general logic, although this version
* has been entirely rewritten
* @todo Serialized cache for languages
*/
class HTMLPurifier_LanguageFactory
{
@@ -82,47 +83,49 @@ class HTMLPurifier_LanguageFactory
*/
function setup() {
$this->validator = new HTMLPurifier_AttrDef_Lang();
$this->dir = dirname(__FILE__);
$this->dir = HTMLPURIFIER_PREFIX . '/HTMLPurifier';
}
/**
* Creates a language object, handles class fallbacks
* @param $config Instance of HTMLPurifier_Config
* @param $context Instance of HTMLPurifier_Context
* @param $code Code to override configuration with. Private parameter.
*/
function create($config, &$context) {
function create($config, &$context, $code = false) {
// validate language code
$code = $this->validator->validate(
$config->get('Core', 'Language'), $config, $context
);
if ($code === false) {
$code = $this->validator->validate(
$config->get('Core', 'Language'), $config, $context
);
} else {
$code = $this->validator->validate($code, $config, $context);
}
if ($code === false) $code = 'en'; // malformed code becomes English
$pcode = str_replace('-', '_', $code); // make valid PHP classname
static $depth = 0; // recursion protection
if ($code == 'en') {
$class = 'HTMLPurifier_Language';
$file = $this->dir . '/Language.php';
$lang = new HTMLPurifier_Language($config, $context);
} else {
$class = 'HTMLPurifier_Language_' . $pcode;
$file = $this->dir . '/Language/classes/' . $code . '.php';
// PHP5/APC deps bug workaround can go here
// you can bypass the conditional include by loading the
// file yourself
if (file_exists($file) && !class_exists($class)) {
include_once $file;
}
}
if (!class_exists($class)) {
// go fallback
$fallback = HTMLPurifier_LanguageFactory::getFallbackFor($code);
$depth++;
$lang = HTMLPurifier_LanguageFactory::factory( $fallback );
$depth--;
} else {
$lang = new $class($config, $context);
if (file_exists($file)) {
include $file;
$lang = new $class($config, $context);
} else {
// Go fallback
$raw_fallback = $this->getFallbackFor($code);
$fallback = $raw_fallback ? $raw_fallback : 'en';
$depth++;
$lang = $this->create($config, $context, $fallback);
if (!$raw_fallback) {
$lang->error = true;
}
$depth--;
}
}
$lang->code = $code;

View File

@@ -0,0 +1,111 @@
<?php
/**
* Represents a measurable length, with a string numeric magnitude
* and a unit. This object is immutable.
*/
class HTMLPurifier_Length
{
/**
* String numeric magnitude.
*/
var $n;
/**
* String unit. False is permitted if $n = 0.
*/
var $unit;
/**
* Whether or not this length is valid. Null if not calculated yet.
*/
var $isValid;
/*
* @param number $n Magnitude
* @param string $u Unit
*/
function HTMLPurifier_Length($n = '0', $u = false) {
$this->n = (string) $n;
$this->unit = $u !== false ? (string) $u : false;
}
/**
* @param string $s Unit string, like '2em' or '3.4in'
* @warning Does not perform validation.
*/
function make($s) {
if (is_a($s, 'HTMLPurifier_Length')) return $s;
$n_length = strspn($s, '1234567890.+-');
$n = substr($s, 0, $n_length);
$unit = substr($s, $n_length);
if ($unit === '') $unit = false;
return new HTMLPurifier_Length($n, $unit);
}
/**
* Validates the number and unit.
*/
function validate() {
// Special case:
static $allowedUnits = array(
'em' => true, 'ex' => true, 'px' => true, 'in' => true,
'cm' => true, 'mm' => true, 'pt' => true, 'pc' => true
);
if ($this->n === '+0' || $this->n === '-0') $this->n = '0';
if ($this->n === '0' && $this->unit === false) return true;
if (!ctype_lower($this->unit)) $this->unit = strtolower($this->unit);
if (!isset($allowedUnits[$this->unit])) return false;
// Hack:
$def = new HTMLPurifier_AttrDef_CSS_Number();
$a = false; // hack hack
$result = $def->validate($this->n, $a, $a);
if ($result === false) return false;
$this->n = $result;
return true;
}
/**
* Returns string representation of number.
*/
function toString() {
if (!$this->isValid()) return false;
return $this->n . $this->unit;
}
/**
* Retrieves string numeric magnitude.
*/
function getN() {return $this->n;}
/**
* Retrieves string unit.
*/
function getUnit() {return $this->unit;}
/**
* Returns true if this length unit is valid.
*/
function isValid() {
if ($this->isValid === null) $this->isValid = $this->validate();
return $this->isValid;
}
/**
* Compares two lengths, and returns 1 if greater, -1 if less and 0 if equal.
* @warning If both values are too large or small, this calculation will
* not work properly
*/
function compareTo($l) {
if ($l === false) return false;
if ($l->unit !== $this->unit) {
$converter = new HTMLPurifier_UnitConverter();
$l = $converter->convert($l, $this->unit);
if ($l === false) return false;
}
return $this->n - $l->n;
}
}

View File

@@ -13,11 +13,14 @@ if (version_compare(PHP_VERSION, "5", ">=")) {
}
HTMLPurifier_ConfigSchema::define(
'Core', 'AcceptFullDocuments', true, 'bool',
'This parameter determines whether or not the filter should accept full '.
'HTML documents, not just HTML fragments. When on, it will '.
'drop all sections except the content between body.'
);
'Core', 'ConvertDocumentToFragment', true, 'bool', '
This parameter determines whether or not the filter should convert
input that is a full document with html and body tags to a fragment
of just the contents of a body tag. This parameter is simply something
HTML Purifier can do during an edge-case: for most inputs, this
processing is not necessary.
');
HTMLPurifier_ConfigSchema::defineAlias('Core', 'AcceptFullDocuments', 'Core', 'ConvertDocumentToFragment');
HTMLPurifier_ConfigSchema::define(
'Core', 'LexerImpl', null, 'mixed/null', '
@@ -66,6 +69,16 @@ HTMLPurifier_ConfigSchema::define(
</p>
');
HTMLPurifier_ConfigSchema::define(
'Core', 'AggressivelyFixLt', false, 'bool', '
This directive enables aggressive pre-filter fixes HTML Purifier can
perform in order to ensure that open angled-brackets do not get killed
during parsing stage. Enabling this will result in two preg_replace_callback
calls and one preg_replace call for every bit of HTML passed through here.
It is not necessary and will have no effect for PHP 4.
This directive has been available since 2.1.0.
');
/**
* Forgivingly lexes HTML (SGML-style) markup into tokens.
*
@@ -179,6 +192,9 @@ class HTMLPurifier_Lexer
return new HTMLPurifier_Lexer_DOMLex();
case 'DirectLex':
return new HTMLPurifier_Lexer_DirectLex();
case 'PH5P':
// experimental Lexer that must be manually included
return new HTMLPurifier_Lexer_PH5P();
default:
trigger_error("Cannot instantiate unrecognized Lexer type " . htmlspecialchars($lexer), E_USER_ERROR);
}
@@ -303,7 +319,7 @@ class HTMLPurifier_Lexer
function normalize($html, $config, &$context) {
// extract body from document if applicable
if ($config->get('Core', 'AcceptFullDocuments')) {
if ($config->get('Core', 'ConvertDocumentToFragment')) {
$html = $this->extractBody($html);
}

View File

@@ -42,15 +42,18 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
$html = $this->normalize($html, $config, $context);
// attempt to armor stray angled brackets that cannot possibly
// form tags and thus are probably being used as emoticons
if ($config->get('Core', 'AggressivelyFixLt')) {
$char = '[^a-z!\/]';
$comment = "/<!--(.*?)(-->|\z)/is";
$html = preg_replace_callback($comment, array('HTMLPurifier_Lexer_DOMLex', 'callbackArmorCommentEntities'), $html);
$html = preg_replace("/<($char)/i", '&lt;\\1', $html);
$html = preg_replace_callback($comment, array('HTMLPurifier_Lexer_DOMLex', 'callbackUndoCommentSubst'), $html); // fix comments
}
// preprocess html, essential for UTF-8
$html =
'<!DOCTYPE html '.
'PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"'.
'"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">'.
'<html><head>'.
'<meta http-equiv="Content-Type" content="text/html;'.
' charset=utf-8" />'.
'</head><body><div>'.$html.'</div></body></html>';
$html = $this->wrapHTML($html, $config, $context);
$doc = new DOMDocument();
$doc->encoding = 'UTF-8'; // theoretically, the above has this covered
@@ -87,10 +90,27 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
$tokens[] = $this->factory->createText($node->data);
return;
} elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
// undo DOM's special treatment of <script> tags
$tokens[] = $this->factory->createText($this->parseData($node->data));
// undo libxml's special treatment of <script> and <style> tags
$last = end($tokens);
$data = $node->data;
// (note $node->tagname is already normalized)
if ($last instanceof HTMLPurifier_Token_Start && $last->name == 'script') {
$new_data = trim($data);
if (substr($new_data, 0, 4) === '<!--') {
$data = substr($new_data, 4);
if (substr($data, -3) === '-->') {
$data = substr($data, 0, -3);
} else {
// Highly suspicious! Not sure what to do...
}
}
}
$tokens[] = $this->factory->createText($this->parseData($data));
return;
} elseif ($node->nodeType === XML_COMMENT_NODE) {
// this is code is only invoked for comments in script/style in versions
// of libxml pre-2.6.28 (regular comments, of course, are still
// handled regularly)
$tokens[] = $this->factory->createComment($node->data);
return;
} elseif (
@@ -151,5 +171,41 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
*/
public function muteErrorHandler($errno, $errstr) {}
/**
* Callback function for undoing escaping of stray angled brackets
* in comments
*/
function callbackUndoCommentSubst($matches) {
return '<!--' . strtr($matches[1], array('&amp;'=>'&','&lt;'=>'<')) . $matches[2];
}
/**
* Callback function that entity-izes ampersands in comments so that
* callbackUndoCommentSubst doesn't clobber them
*/
function callbackArmorCommentEntities($matches) {
return '<!--' . str_replace('&', '&amp;', $matches[1]) . $matches[2];
}
/**
* Wraps an HTML fragment in the necessary HTML
*/
function wrapHTML($html, $config, &$context) {
$def = $config->getDefinition('HTML');
$ret = '';
if (!empty($def->doctype->dtdPublic) || !empty($def->doctype->dtdSystem)) {
$ret .= '<!DOCTYPE html ';
if (!empty($def->doctype->dtdPublic)) $ret .= 'PUBLIC "' . $def->doctype->dtdPublic . '" ';
if (!empty($def->doctype->dtdSystem)) $ret .= '"' . $def->doctype->dtdSystem . '" ';
$ret .= '>';
}
$ret .= '<html><head>';
$ret .= '<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />';
$ret .= '</head><body><div>'.$html.'</div></body></html>';
return $ret;
}
}

View File

@@ -150,11 +150,25 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// We are in tag and it is well formed
// Grab the internals of the tag
$strlen_segment = $position_next_gt - $cursor;
if ($strlen_segment < 1) {
// there's nothing to process!
$token = new HTMLPurifier_Token_Text('<');
$cursor++;
continue;
}
$segment = substr($html, $cursor, $strlen_segment);
if ($segment === false) {
// somehow, we attempted to access beyond the end of
// the string, defense-in-depth, reported by Nate Abele
break;
}
// Check if it's a comment
if (
substr($segment, 0, 3) == '!--'
strncmp('!--', $segment, 3) === 0
) {
// re-determine segment length, looking for -->
$position_comment_end = strpos($html, '-->', $cursor);
@@ -170,12 +184,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
}
$strlen_segment = $position_comment_end - $cursor;
$segment = substr($html, $cursor, $strlen_segment);
$token = new
HTMLPurifier_Token_Comment(
substr(
$segment, 3, $strlen_segment - 3
)
);
$token = new HTMLPurifier_Token_Comment(substr($segment, 3));
if ($maintain_line_numbers) {
$token->line = $current_line;
$current_line += $this->substrCount($html, $nl, $cursor, $strlen_segment);
@@ -204,7 +213,8 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// Check leading character is alnum, if not, we may
// have accidently grabbed an emoticon. Translate into
// text and go our merry way
if (!ctype_alnum($segment[0])) {
if (!ctype_alpha($segment[0])) {
// XML: $segment[0] !== '_' && $segment[0] !== ':'
if ($e) $e->send(E_NOTICE, 'Lexer: Unescaped lt');
$token = new
HTMLPurifier_Token_Text(
@@ -228,7 +238,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// trailing slash. Remember, we could have a tag like <br>, so
// any later token processing scripts must convert improperly
// classified EmptyTags from StartTags.
$is_self_closing= (strpos($segment,'/') === $strlen_segment-1);
$is_self_closing = (strrpos($segment,'/') === $strlen_segment-1);
if ($is_self_closing) {
$strlen_segment--;
$segment = substr($segment, 0, $strlen_segment);
@@ -371,6 +381,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
$value = $quoted_value;
}
}
if ($value === false) $value = '';
return array($key => $value);
}
@@ -385,7 +396,6 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
// infinite loop protection
$loops = 0;
while(true) {
// infinite loop protection
@@ -399,7 +409,6 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
}
$cursor += ($value = strspn($string, $this->_whitespace, $cursor));
// grab the key
$key_begin = $cursor; //we're currently at the start of the key
@@ -435,6 +444,11 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
$cursor++;
$cursor += strspn($string, $this->_whitespace, $cursor);
if ($cursor === false) {
$array[$key] = '';
break;
}
// we might be in front of a quote right now
$char = @$string[$cursor];
@@ -452,7 +466,14 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
$value_end = $cursor;
}
// we reached a premature end
if ($cursor === false) {
$cursor = $size;
$value_end = $cursor;
}
$value = substr($string, $value_begin, $value_end - $value_begin);
if ($value === false) $value = '';
$array[$key] = $this->parseData($value);
$cursor++;

File diff suppressed because it is too large Load Diff

View File

@@ -2,12 +2,68 @@
/**
* Class that handles operations involving percent-encoding in URIs.
*
* @warning
* Be careful when reusing instances of PercentEncoder. The object
* you use for normalize() SHOULD NOT be used for encode(), or
* vice-versa.
*/
class HTMLPurifier_PercentEncoder
{
/**
* Fix up percent-encoding by decoding unreserved characters and normalizing
* Reserved characters to preserve when using encode().
*/
var $preserve = array();
/**
* String of characters that should be preserved while using encode().
*/
function HTMLPurifier_PercentEncoder($preserve = false) {
// unreserved letters, ought to const-ify
for ($i = 48; $i <= 57; $i++) $this->preserve[$i] = true; // digits
for ($i = 65; $i <= 90; $i++) $this->preserve[$i] = true; // upper-case
for ($i = 97; $i <= 122; $i++) $this->preserve[$i] = true; // lower-case
$this->preserve[45] = true; // Dash -
$this->preserve[46] = true; // Period .
$this->preserve[95] = true; // Underscore _
$this->preserve[126]= true; // Tilde ~
// extra letters not to escape
if ($preserve !== false) {
for ($i = 0, $c = strlen($preserve); $i < $c; $i++) {
$this->preserve[ord($preserve[$i])] = true;
}
}
}
/**
* Our replacement for urlencode, it encodes all non-reserved characters,
* as well as any extra characters that were instructed to be preserved.
* @note
* Assumes that the string has already been normalized, making any
* and all percent escape sequences valid. Percents will not be
* re-escaped, regardless of their status in $preserve
* @param $string String to be encoded
* @return Encoded string.
*/
function encode($string) {
$ret = '';
for ($i = 0, $c = strlen($string); $i < $c; $i++) {
if ($string[$i] !== '%' && !isset($this->preserve[$int = ord($string[$i])]) ) {
$ret .= '%' . sprintf('%02X', $int);
} else {
$ret .= $string[$i];
}
}
return $ret;
}
/**
* Fix up percent-encoding by decoding unreserved characters and normalizing.
* @warning This function is affected by $preserve, even though the
* usual desired behavior is for this not to preserve those
* characters. Be careful when reusing instances of PercentEncoder!
* @param $string String to normalize
*/
function normalize($string) {
@@ -27,12 +83,7 @@ class HTMLPurifier_PercentEncoder
continue;
}
$int = hexdec($encoding);
if (
($int >= 48 && $int <= 57) || // digits
($int >= 65 && $int <= 90) || // uppercase letters
($int >= 97 && $int <= 122) || // lowercase letters
$int == 126 || $int == 45 || $int == 46 || $int == 95 // ~-._
) {
if (isset($this->preserve[$int])) {
$ret .= chr($int) . $text;
continue;
}

View File

@@ -1,7 +1,7 @@
.hp-config {}
.hp-config tbody th {text-align:right;}
.hp-config tbody th {text-align:right; padding-right:0.5em;}
.hp-config thead, .hp-config .namespace {background:#3C578C; color:#FFF;}
.hp-config .namespace th {text-align:center;}
.hp-config .verbose {display:none;}

View File

@@ -23,18 +23,55 @@ class HTMLPurifier_Printer_ConfigForm extends HTMLPurifier_Printer
*/
var $name;
/**
* Whether or not to compress directive names, clipping them off
* after a certain amount of letters. False to disable or integer letters
* before clipping.
* @protected
*/
var $compress = false;
/**
* @param $name Form element name for directives to be stuffed into
* @param $doc_url String documentation URL, will have fragment tagged on
* @param $compress Integer max length before compressing a directive name, set to false to turn off
*/
function HTMLPurifier_Printer_ConfigForm($name, $doc_url = null) {
function HTMLPurifier_Printer_ConfigForm(
$name, $doc_url = null, $compress = false
) {
parent::HTMLPurifier_Printer();
$this->docURL = $doc_url;
$this->name = $name;
$this->compress = $compress;
// initialize sub-printers
$this->fields['default'] = new HTMLPurifier_Printer_ConfigForm_default();
$this->fields['bool'] = new HTMLPurifier_Printer_ConfigForm_bool();
}
/**
* Sets default column and row size for textareas in sub-printers
* @param $cols Integer columns of textarea, null to use default
* @param $rows Integer rows of textarea, null to use default
*/
function setTextareaDimensions($cols = null, $rows = null) {
if ($cols) $this->fields['default']->cols = $cols;
if ($rows) $this->fields['default']->rows = $rows;
}
/**
* Retrieves styling, in case it is not accessible by webserver
*/
function getCSS() {
return file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/Printer/ConfigForm.css');
}
/**
* Retrieves JavaScript, in case it is not accessible by webserver
*/
function getJavaScript() {
return file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/Printer/ConfigForm.js');
}
/**
* Returns HTML output for a configuration form
* @param $config Configuration object of current form state
@@ -63,14 +100,14 @@ class HTMLPurifier_Printer_ConfigForm extends HTMLPurifier_Printer
$ret .= $this->renderNamespace($ns, $directives);
}
if ($render_controls) {
$ret .= $this->start('tfoot');
$ret .= $this->start('tbody');
$ret .= $this->start('tr');
$ret .= $this->start('td', array('colspan' => 2, 'class' => 'controls'));
$ret .= $this->elementEmpty('input', array('type' => 'Submit', 'value' => 'Submit'));
$ret .= $this->elementEmpty('input', array('type' => 'submit', 'value' => 'Submit'));
$ret .= '[<a href="?">Reset</a>]';
$ret .= $this->end('td');
$ret .= $this->end('tr');
$ret .= $this->end('tfoot');
$ret .= $this->end('tbody');
}
$ret .= $this->end('table');
return $ret;
@@ -98,11 +135,12 @@ class HTMLPurifier_Printer_ConfigForm extends HTMLPurifier_Printer
$ret .= $this->start('a', array('href' => $url));
}
$attr = array('for' => "{$this->name}:$ns.$directive");
// crop directive name if it's too long
if (strlen($directive) < 14) {
if (!$this->compress || (strlen($directive) < $this->compress)) {
$directive_disp = $directive;
} else {
$directive_disp = substr($directive, 0, 12) . '...';
$directive_disp = substr($directive, 0, $this->compress - 2) . '...';
$attr['title'] = $directive;
}
@@ -176,6 +214,8 @@ class HTMLPurifier_Printer_ConfigForm_NullDecorator extends HTMLPurifier_Printer
* Swiss-army knife configuration form field printer
*/
class HTMLPurifier_Printer_ConfigForm_default extends HTMLPurifier_Printer {
var $cols = 18;
var $rows = 5;
function render($ns, $directive, $value, $name, $config) {
$this->prepareGenerator($config);
// this should probably be split up a little
@@ -190,12 +230,12 @@ class HTMLPurifier_Printer_ConfigForm_default extends HTMLPurifier_Printer {
$value[] = $val;
}
case 'list':
$value = implode(',', $value);
$value = implode(PHP_EOL, $value);
break;
case 'hash':
$nvalue = '';
foreach ($value as $i => $v) {
$nvalue .= "$i:$v,";
$nvalue .= "$i:$v" . PHP_EOL;
}
$value = $nvalue;
break;
@@ -220,6 +260,15 @@ class HTMLPurifier_Printer_ConfigForm_default extends HTMLPurifier_Printer {
$ret .= $this->element('option', $val, $attr);
}
$ret .= $this->end('select');
} elseif (
$def->type == 'text' || $def->type == 'itext' ||
$def->type == 'list' || $def->type == 'hash' || $def->type == 'lookup'
) {
$attr['cols'] = $this->cols;
$attr['rows'] = $this->rows;
$ret .= $this->start('textarea', $attr);
$ret .= $this->text($value);
$ret .= $this->end('textarea');
} else {
$attr['value'] = $value;
$attr['type'] = 'text';

View File

@@ -102,6 +102,7 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer
$ret .= $this->element('td', $this->listifyTagLookup($lookup));
$ret .= $this->end('tr');
}
$ret .= $this->end('table');
return $ret;
}
@@ -179,7 +180,8 @@ class HTMLPurifier_Printer_HTMLDefinition extends HTMLPurifier_Printer
$def->validateChildren(array(), $this->config, $context);
}
$elements = $def->elements;
} elseif ($def->type == 'chameleon') {
}
if ($def->type == 'chameleon') {
$attr['rowspan'] = 2;
} elseif ($def->type == 'empty') {
$elements = array();

View File

@@ -195,7 +195,7 @@ class HTMLPurifier_Strategy_FixNesting extends HTMLPurifier_Strategy
//################################################################//
// Process result by interpreting $result
if ($result === true) {
if ($result === true || $child_tokens === $result) {
// leave the node as is
// register start token as a parental node start

View File

@@ -36,28 +36,23 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
$definition = $config->getHTMLDefinition();
// CurrentNesting
$this->currentNesting = array();
$context->register('CurrentNesting', $this->currentNesting);
// InputIndex
$this->inputIndex = false;
$context->register('InputIndex', $this->inputIndex);
// InputTokens
$context->register('InputTokens', $tokens);
$this->inputTokens =& $tokens;
// OutputTokens
// local variables
$result = array();
$this->outputTokens =& $result;
// %Core.EscapeInvalidTags
$escape_invalid_tags = $config->get('Core', 'EscapeInvalidTags');
$generator = new HTMLPurifier_Generator();
$escape_invalid_tags = $config->get('Core', 'EscapeInvalidTags');
$e =& $context->get('ErrorCollector', true);
// member variables
$this->currentNesting = array();
$this->inputIndex = false;
$this->inputTokens =& $tokens;
$this->outputTokens =& $result;
// context variables
$context->register('CurrentNesting', $this->currentNesting);
$context->register('InputIndex', $this->inputIndex);
$context->register('InputTokens', $tokens);
// -- begin INJECTOR --
$this->injectors = array();
@@ -67,7 +62,8 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
unset($injectors['Custom']); // special case
foreach ($injectors as $injector => $b) {
$injector = "HTMLPurifier_Injector_$injector";
if ($b) $this->injectors[] = new $injector;
if (!$b) continue;
$this->injectors[] = new $injector;
}
foreach ($custom_injectors as $injector) {
if (is_string($injector)) {
@@ -87,9 +83,17 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// give the injectors references to the definition and context
// variables for performance reasons
foreach ($this->injectors as $i => $x) {
$this->injectors[$i]->prepare($config, $context);
$error = $this->injectors[$i]->prepare($config, $context);
if (!$error) continue;
list($injector) = array_splice($this->injectors, $i, 1);
$name = $injector->name;
trigger_error("Cannot enable $name injector because $error is not allowed", E_USER_WARNING);
}
// warning: most foreach loops follow the convention $i => $x.
// be sure, for PHP4 compatibility, to only perform write operations
// directly referencing the object using $i: $x is only safe for reads
// -- end INJECTOR --
$token = false;
@@ -100,6 +104,8 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// if all goes well, this token will be passed through unharmed
$token = $tokens[$this->inputIndex];
//printTokens($tokens, $this->inputIndex);
foreach ($this->injectors as $i => $x) {
if ($x->skip > 0) $this->injectors[$i]->skip--;
}
@@ -109,7 +115,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
if ($token->type === 'text') {
// injector handler code; duplicated for performance reasons
foreach ($this->injectors as $i => $x) {
if (!$x->skip) $x->handleText($token, $config, $context);
if (!$x->skip) $this->injectors[$i]->handleText($token);
if (is_array($token)) {
$this->currentInjector = $i;
break;
@@ -122,26 +128,24 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
$info = $definition->info[$token->name]->child;
// quick checks:
// test if it claims to be a start tag but is empty
// quick tag checks: anything that's *not* an end tag
$ok = false;
if ($info->type == 'empty' && $token->type == 'start') {
$result[] = new HTMLPurifier_Token_Empty($token->name, $token->attr);
continue;
}
// test if it claims to be empty but really is a start tag
if ($info->type != 'empty' && $token->type == 'empty' ) {
$result[] = new HTMLPurifier_Token_Start($token->name, $token->attr);
$result[] = new HTMLPurifier_Token_End($token->name);
continue;
}
// automatically insert empty tags
if ($token->type == 'empty') {
$result[] = $token;
continue;
}
// start tags have precedence, so they get passed through...
if ($token->type == 'start') {
// test if it claims to be a start tag but is empty
$token = new HTMLPurifier_Token_Empty($token->name, $token->attr);
$ok = true;
} elseif ($info->type != 'empty' && $token->type == 'empty' ) {
// claims to be empty but really is a start tag
$token = array(
new HTMLPurifier_Token_Start($token->name, $token->attr),
new HTMLPurifier_Token_End($token->name)
);
$ok = true;
} elseif ($token->type == 'empty') {
// real empty token
$ok = true;
} elseif ($token->type == 'start') {
// start tag
// ...unless they also have to close their parent
if (!empty($this->currentNesting)) {
@@ -154,25 +158,26 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// the parent
if (!isset($parent_info->child->elements[$token->name])) {
if ($e) $e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', $parent);
// close the parent, then append the token
// close the parent, then re-loop to reprocess token
$result[] = new HTMLPurifier_Token_End($parent->name);
$result[] = $token;
$this->currentNesting[] = $token;
$this->inputIndex--;
continue;
}
$this->currentNesting[] = $parent; // undo the pop
}
// injector handler code; duplicated for performance reasons
$ok = true;
}
// injector handler code; duplicated for performance reasons
if ($ok) {
foreach ($this->injectors as $i => $x) {
if (!$x->skip) $x->handleStart($token, $config, $context);
if (!$x->skip) $this->injectors[$i]->handleElement($token);
if (is_array($token)) {
$this->currentInjector = $i;
break;
}
}
$this->processToken($token, $config, $context);
continue;
}
@@ -197,6 +202,9 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
$current_parent = array_pop($this->currentNesting);
if ($current_parent->name == $token->name) {
$result[] = $token;
foreach ($this->injectors as $i => $x) {
$this->injectors[$i]->notifyEnd($token);
}
continue;
}
@@ -233,16 +241,16 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// okay, we found it, close all the skipped tags
// note that skipped tags contains the element we need closed
$size = count($skipped_tags);
for ($i = $size - 1; $i > 0; $i--) {
if ($e && !isset($skipped_tags[$i]->armor['MakeWellFormed_TagClosedError'])) {
for ($i = count($skipped_tags) - 1; $i >= 0; $i--) {
if ($i && $e && !isset($skipped_tags[$i]->armor['MakeWellFormed_TagClosedError'])) {
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$i]);
}
$result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name);
$result[] = $new_token = new HTMLPurifier_Token_End($skipped_tags[$i]->name);
foreach ($this->injectors as $j => $x) { // $j, not $i!!!
$this->injectors[$j]->notifyEnd($new_token);
}
}
$result[] = new HTMLPurifier_Token_End($skipped_tags[$i]->name);
}
$context->destroy('CurrentNesting');
@@ -250,17 +258,18 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
$context->destroy('InputIndex');
$context->destroy('CurrentToken');
// we're at the end now, fix all still unclosed tags
// not using processToken() because at this point we don't
// care about current nesting
// we're at the end now, fix all still unclosed tags (this is
// duplicated from the end of the loop with some slight modifications)
// not using $skipped_tags since it would invariably be all of them
if (!empty($this->currentNesting)) {
$size = count($this->currentNesting);
for ($i = $size - 1; $i >= 0; $i--) {
for ($i = count($this->currentNesting) - 1; $i >= 0; $i--) {
if ($e && !isset($this->currentNesting[$i]->armor['MakeWellFormed_TagClosedError'])) {
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $this->currentNesting[$i]);
}
$result[] =
new HTMLPurifier_Token_End($this->currentNesting[$i]->name);
$result[] = $new_token = new HTMLPurifier_Token_End($this->currentNesting[$i]->name);
foreach ($this->injectors as $j => $x) { // $j, not $i!!!
$this->injectors[$j]->notifyEnd($new_token);
}
}
}
@@ -280,9 +289,17 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
array_splice($this->inputTokens, $this->inputIndex--, 1, $token);
// adjust the injector skips based on the array substitution
$offset = count($token) + 1;
for ($i = 0; $i <= $this->currentInjector; $i++) {
$this->injectors[$i]->skip += $offset;
if ($this->injectors) {
$offset = count($token);
for ($i = 0; $i <= $this->currentInjector; $i++) {
// because of the skip back, we need to add one more
// for uninitialized injectors. I'm not exactly
// sure why this is the case, but I think it has to
// do with the fact that we're decrementing skips
// before re-checking text
if (!$this->injectors[$i]->skip) $this->injectors[$i]->skip++;
$this->injectors[$i]->skip += $offset;
}
}
} elseif ($token) {
// regular case

View File

@@ -8,19 +8,38 @@ require_once 'HTMLPurifier/TagTransform.php';
require_once 'HTMLPurifier/AttrValidator.php';
HTMLPurifier_ConfigSchema::define(
'Core', 'RemoveInvalidImg', true, 'bool',
'This directive enables pre-emptive URI checking in <code>img</code> '.
'tags, as the attribute validation strategy is not authorized to '.
'remove elements from the document. This directive has been available '.
'since 1.3.0, revert to pre-1.3.0 behavior by setting to false.'
'Core', 'RemoveInvalidImg', true, 'bool', '
<p>
This directive enables pre-emptive URI checking in <code>img</code>
tags, as the attribute validation strategy is not authorized to
remove elements from the document. This directive has been available
since 1.3.0, revert to pre-1.3.0 behavior by setting to false.
</p>
'
);
HTMLPurifier_ConfigSchema::define(
'Core', 'RemoveScriptContents', true, 'bool', '
'Core', 'RemoveScriptContents', null, 'bool/null', '
<p>
This directive enables HTML Purifier to remove not only script tags
but all of their contents. This directive has been available since 2.0.0,
revert to pre-2.0.0 behavior by setting to false.
but all of their contents. This directive has been deprecated since 2.1.0,
and when not set the value of %Core.HiddenElements will take
precedence. This directive has been available since 2.0.0, and can be used to
revert to pre-2.0.0 behavior by setting it to false.
</p>
'
);
HTMLPurifier_ConfigSchema::define(
'Core', 'HiddenElements', array('script' => true, 'style' => true), 'lookup', '
<p>
This directive is a lookup array of elements which should have their
contents removed when they are not allowed by the HTML definition.
For example, the contents of a <code>script</code> tag are not
normally shown in a document, so if script tags are to be removed,
their contents should be removed to. This is opposed to a <code>b</code>
tag, which defines some presentational changes but does not hide its
contents.
</p>
'
);
@@ -43,7 +62,16 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
$escape_invalid_tags = $config->get('Core', 'EscapeInvalidTags');
$remove_invalid_img = $config->get('Core', 'RemoveInvalidImg');
$remove_script_contents = $config->get('Core', 'RemoveScriptContents');
$hidden_elements = $config->get('Core', 'HiddenElements');
// remove script contents compatibility
if ($remove_script_contents === true) {
$hidden_elements['script'] = true;
} elseif ($remove_script_contents === false && isset($hidden_elements['script'])) {
unset($hidden_elements['script']);
}
$attr_validator = new HTMLPurifier_AttrValidator();
@@ -88,6 +116,7 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
// mostly everything's good, but
// we need to make sure required attributes are in order
if (
($token->type === 'start' || $token->type === 'empty') &&
$definition->info[$token->name]->required_attr &&
($token->name != 'img' || $remove_invalid_img) // ensure config option still works
) {
@@ -106,8 +135,7 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
$token->armor['ValidateAttributes'] = true;
}
// CAN BE GENERICIZED
if ($token->name == 'script' && $token->type == 'start') {
if (isset($hidden_elements[$token->name]) && $token->type == 'start') {
$textify_comments = $token->name;
} elseif ($token->name === $textify_comments && $token->type == 'end') {
$textify_comments = false;
@@ -122,7 +150,7 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
} else {
// check if we need to destroy all of the tag's children
// CAN BE GENERICIZED
if ($token->name == 'script' && $remove_script_contents) {
if (isset($hidden_elements[$token->name])) {
if ($token->type == 'start') {
$remove_until = $token->name;
} elseif ($token->type == 'empty') {
@@ -130,7 +158,7 @@ class HTMLPurifier_Strategy_RemoveForeignElements extends HTMLPurifier_Strategy
} else {
$remove_until = false;
}
if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Script removed');
if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Foreign meta element removed');
} else {
if ($e) $e->send(E_ERROR, 'Strategy_RemoveForeignElements: Foreign element removed');
}

View File

@@ -6,10 +6,6 @@ require_once 'HTMLPurifier/IDAccumulator.php';
require_once 'HTMLPurifier/AttrValidator.php';
HTMLPurifier_ConfigSchema::define(
'Attr', 'IDBlacklist', array(), 'list',
'Array of IDs not allowed in the document.');
/**
* Validate all attributes in the tokens.
*/
@@ -19,11 +15,6 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
function execute($tokens, $config, &$context) {
// setup id_accumulator context
$id_accumulator = new HTMLPurifier_IDAccumulator();
$id_accumulator->load($config->get('Attr', 'IDBlacklist'));
$context->register('IDAccumulator', $id_accumulator);
// setup validator
$validator = new HTMLPurifier_AttrValidator();
@@ -44,8 +35,7 @@ class HTMLPurifier_Strategy_ValidateAttributes extends HTMLPurifier_Strategy
$tokens[$key] = $token; // for PHP 4
}
$context->destroy('IDAccumulator');
$context->destroy('CurrentToken');
return $tokens;
}

View File

@@ -0,0 +1,182 @@
<?php
require_once 'HTMLPurifier/URIParser.php';
require_once 'HTMLPurifier/URIFilter.php';
/**
* HTML Purifier's internal representation of a URI.
* @note
* Internal data-structures are completely escaped. If the data needs
* to be used in a non-URI context (which is very unlikely), be sure
* to decode it first. The URI may not necessarily be well-formed until
* validate() is called.
*/
class HTMLPurifier_URI
{
var $scheme, $userinfo, $host, $port, $path, $query, $fragment;
/**
* @note Automatically normalizes scheme and port
*/
function HTMLPurifier_URI($scheme, $userinfo, $host, $port, $path, $query, $fragment) {
$this->scheme = is_null($scheme) || ctype_lower($scheme) ? $scheme : strtolower($scheme);
$this->userinfo = $userinfo;
$this->host = $host;
$this->port = is_null($port) ? $port : (int) $port;
$this->path = $path;
$this->query = $query;
$this->fragment = $fragment;
}
/**
* Retrieves a scheme object corresponding to the URI's scheme/default
* @param $config Instance of HTMLPurifier_Config
* @param $context Instance of HTMLPurifier_Context
* @return Scheme object appropriate for validating this URI
*/
function getSchemeObj($config, &$context) {
$registry =& HTMLPurifier_URISchemeRegistry::instance();
if ($this->scheme !== null) {
$scheme_obj = $registry->getScheme($this->scheme, $config, $context);
if (!$scheme_obj) return false; // invalid scheme, clean it out
} else {
// no scheme: retrieve the default one
$def = $config->getDefinition('URI');
$scheme_obj = $registry->getScheme($def->defaultScheme, $config, $context);
if (!$scheme_obj) {
// something funky happened to the default scheme object
trigger_error(
'Default scheme object "' . $def->defaultScheme . '" was not readable',
E_USER_WARNING
);
return false;
}
}
return $scheme_obj;
}
/**
* Generic validation method applicable for all schemes. May modify
* this URI in order to get it into a compliant form.
* @param $config Instance of HTMLPurifier_Config
* @param $context Instance of HTMLPurifier_Context
* @return True if validation/filtering succeeds, false if failure
*/
function validate($config, &$context) {
// ABNF definitions from RFC 3986
$chars_sub_delims = '!$&\'()*+,;=';
$chars_gen_delims = ':/?#[]@';
$chars_pchar = $chars_sub_delims . ':@';
// validate scheme (MUST BE FIRST!)
if (!is_null($this->scheme) && is_null($this->host)) {
$def = $config->getDefinition('URI');
if ($def->defaultScheme === $this->scheme) {
$this->scheme = null;
}
}
// validate host
if (!is_null($this->host)) {
$host_def = new HTMLPurifier_AttrDef_URI_Host();
$this->host = $host_def->validate($this->host, $config, $context);
if ($this->host === false) $this->host = null;
}
// validate username
if (!is_null($this->userinfo)) {
$encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
$this->userinfo = $encoder->encode($this->userinfo);
}
// validate port
if (!is_null($this->port)) {
if ($this->port < 1 || $this->port > 65535) $this->port = null;
}
// validate path
$path_parts = array();
$segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
if (!is_null($this->host)) {
// path-abempty (hier and relative)
$this->path = $segments_encoder->encode($this->path);
} elseif ($this->path !== '' && $this->path[0] === '/') {
// path-absolute (hier and relative)
if (strlen($this->path) >= 2 && $this->path[1] === '/') {
// This shouldn't ever happen!
$this->path = '';
} else {
$this->path = $segments_encoder->encode($this->path);
}
} elseif (!is_null($this->scheme) && $this->path !== '') {
// path-rootless (hier)
// Short circuit evaluation means we don't need to check nz
$this->path = $segments_encoder->encode($this->path);
} elseif (is_null($this->scheme) && $this->path !== '') {
// path-noscheme (relative)
// (once again, not checking nz)
$segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
$c = strpos($this->path, '/');
if ($c !== false) {
$this->path =
$segment_nc_encoder->encode(substr($this->path, 0, $c)) .
$segments_encoder->encode(substr($this->path, $c));
} else {
$this->path = $segment_nc_encoder->encode($this->path);
}
} else {
// path-empty (hier and relative)
$this->path = ''; // just to be safe
}
// qf = query and fragment
$qf_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/?');
if (!is_null($this->query)) {
$this->query = $qf_encoder->encode($this->query);
}
if (!is_null($this->fragment)) {
$this->fragment = $qf_encoder->encode($this->fragment);
}
return true;
}
/**
* Convert URI back to string
* @return String URI appropriate for output
*/
function toString() {
// reconstruct authority
$authority = null;
if (!is_null($this->host)) {
$authority = '';
if(!is_null($this->userinfo)) $authority .= $this->userinfo . '@';
$authority .= $this->host;
if(!is_null($this->port)) $authority .= ':' . $this->port;
}
// reconstruct the result
$result = '';
if (!is_null($this->scheme)) $result .= $this->scheme . ':';
if (!is_null($authority)) $result .= '//' . $authority;
$result .= $this->path;
if (!is_null($this->query)) $result .= '?' . $this->query;
if (!is_null($this->fragment)) $result .= '#' . $this->fragment;
return $result;
}
/**
* Returns a copy of the URI object
*/
function copy() {
return unserialize(serialize($this));
}
}

View File

@@ -0,0 +1,145 @@
<?php
require_once 'HTMLPurifier/Definition.php';
require_once 'HTMLPurifier/URIFilter.php';
require_once 'HTMLPurifier/URIParser.php';
require_once 'HTMLPurifier/URIFilter/DisableExternal.php';
require_once 'HTMLPurifier/URIFilter/DisableExternalResources.php';
require_once 'HTMLPurifier/URIFilter/HostBlacklist.php';
require_once 'HTMLPurifier/URIFilter/MakeAbsolute.php';
HTMLPurifier_ConfigSchema::define(
'URI', 'DefinitionID', null, 'string/null', '
<p>
Unique identifier for a custom-built URI definition. If you want
to add custom URIFilters, you must specify this value.
This directive has been available since 2.1.0.
</p>
');
HTMLPurifier_ConfigSchema::define(
'URI', 'DefinitionRev', 1, 'int', '
<p>
Revision identifier for your custom definition. See
%HTML.DefinitionRev for details. This directive has been available
since 2.1.0.
</p>
');
// informative URI directives
HTMLPurifier_ConfigSchema::define(
'URI', 'DefaultScheme', 'http', 'string', '
<p>
Defines through what scheme the output will be served, in order to
select the proper object validator when no scheme information is present.
</p>
');
HTMLPurifier_ConfigSchema::define(
'URI', 'Host', null, 'string/null', '
<p>
Defines the domain name of the server, so we can determine whether or
an absolute URI is from your website or not. Not strictly necessary,
as users should be using relative URIs to reference resources on your
website. It will, however, let you use absolute URIs to link to
subdomains of the domain you post here: i.e. example.com will allow
sub.example.com. However, higher up domains will still be excluded:
if you set %URI.Host to sub.example.com, example.com will be blocked.
<strong>Note:</strong> This directive overrides %URI.Base because
a given page may be on a sub-domain, but you wish HTML Purifier to be
more relaxed and allow some of the parent domains too.
This directive has been available since 1.2.0.
</p>
');
HTMLPurifier_ConfigSchema::define(
'URI', 'Base', null, 'string/null', '
<p>
The base URI is the URI of the document this purified HTML will be
inserted into. This information is important if HTML Purifier needs
to calculate absolute URIs from relative URIs, such as when %URI.MakeAbsolute
is on. You may use a non-absolute URI for this value, but behavior
may vary (%URI.MakeAbsolute deals nicely with both absolute and
relative paths, but forwards-compatibility is not guaranteed).
<strong>Warning:</strong> If set, the scheme on this URI
overrides the one specified by %URI.DefaultScheme. This directive has
been available since 2.1.0.
</p>
');
class HTMLPurifier_URIDefinition extends HTMLPurifier_Definition
{
var $type = 'URI';
var $filters = array();
var $registeredFilters = array();
/**
* HTMLPurifier_URI object of the base specified at %URI.Base
*/
var $base;
/**
* String host to consider "home" base
*/
var $host;
/**
* Name of default scheme based on %URI.DefaultScheme and %URI.Base
*/
var $defaultScheme;
function HTMLPurifier_URIDefinition() {
$this->registerFilter(new HTMLPurifier_URIFilter_DisableExternal());
$this->registerFilter(new HTMLPurifier_URIFilter_DisableExternalResources());
$this->registerFilter(new HTMLPurifier_URIFilter_HostBlacklist());
$this->registerFilter(new HTMLPurifier_URIFilter_MakeAbsolute());
}
function registerFilter($filter) {
$this->registeredFilters[$filter->name] = $filter;
}
function addFilter($filter, $config) {
$filter->prepare($config);
$this->filters[$filter->name] = $filter;
}
function doSetup($config) {
$this->setupMemberVariables($config);
$this->setupFilters($config);
}
function setupFilters($config) {
foreach ($this->registeredFilters as $name => $filter) {
$conf = $config->get('URI', $name);
if ($conf !== false && $conf !== null) {
$this->addFilter($filter, $config);
}
}
unset($this->registeredFilters);
}
function setupMemberVariables($config) {
$this->host = $config->get('URI', 'Host');
$base_uri = $config->get('URI', 'Base');
if (!is_null($base_uri)) {
$parser = new HTMLPurifier_URIParser();
$this->base = $parser->parse($base_uri);
$this->defaultScheme = $this->base->scheme;
if (is_null($this->host)) $this->host = $this->base->host;
}
if (is_null($this->defaultScheme)) $this->defaultScheme = $config->get('URI', 'DefaultScheme');
}
function filter(&$uri, $config, &$context) {
foreach ($this->filters as $name => $x) {
$result = $this->filters[$name]->filter($uri, $config, $context);
if (!$result) return false;
}
return true;
}
}

View File

@@ -0,0 +1,40 @@
<?php
/**
* Chainable filters for custom URI processing.
*
* These filters can perform custom actions on a URI filter object,
* including transformation or blacklisting.
*
* @warning This filter is called before scheme object validation occurs.
* Make sure, if you require a specific scheme object, you
* you check that it exists. This allows filters to convert
* proprietary URI schemes into regular ones.
*/
class HTMLPurifier_URIFilter
{
/**
* Unique identifier of filter
*/
var $name;
/**
* Performs initialization for the filter
*/
function prepare($config) {}
/**
* Filter a URI object
* @param &$uri Reference to URI object
* @param $config Instance of HTMLPurifier_Config
* @param &$context Instance of HTMLPurifier_Context
* @return bool Whether or not to continue processing: false indicates
* URL is no good, true indicates continue processing. Note that
* all changes are committed directly on the URI object
*/
function filter(&$uri, $config, &$context) {
trigger_error('Cannot call abstract function', E_USER_ERROR);
}
}

View File

@@ -0,0 +1,34 @@
<?php
require_once 'HTMLPurifier/URIFilter.php';
HTMLPurifier_ConfigSchema::define(
'URI', 'DisableExternal', false, 'bool',
'Disables links to external websites. This is a highly effective '.
'anti-spam and anti-pagerank-leech measure, but comes at a hefty price: no'.
'links or images outside of your domain will be allowed. Non-linkified '.
'URIs will still be preserved. If you want to be able to link to '.
'subdomains or use absolute URIs, specify %URI.Host for your website. '.
'This directive has been available since 1.2.0.'
);
class HTMLPurifier_URIFilter_DisableExternal extends HTMLPurifier_URIFilter
{
var $name = 'DisableExternal';
var $ourHostParts = false;
function prepare($config) {
$our_host = $config->get('URI', 'Host');
if ($our_host !== null) $this->ourHostParts = array_reverse(explode('.', $our_host));
}
function filter(&$uri, $config, &$context) {
if (is_null($uri->host)) return true;
if ($this->ourHostParts === false) return false;
$host_parts = array_reverse(explode('.', $uri->host));
foreach ($this->ourHostParts as $i => $x) {
if (!isset($host_parts[$i])) return false;
if ($host_parts[$i] != $this->ourHostParts[$i]) return false;
}
return true;
}
}

View File

@@ -0,0 +1,26 @@
<?php
require_once 'HTMLPurifier/URIFilter/DisableExternal.php';
HTMLPurifier_ConfigSchema::define(
'URI', 'DisableExternalResources', false, 'bool',
'Disables the embedding of external resources, preventing users from '.
'embedding things like images from other hosts. This prevents '.
'access tracking (good for email viewers), bandwidth leeching, '.
'cross-site request forging, goatse.cx posting, and '.
'other nasties, but also results in '.
'a loss of end-user functionality (they can\'t directly post a pic '.
'they posted from Flickr anymore). Use it if you don\'t have a '.
'robust user-content moderation team. This directive has been '.
'available since 1.3.0.'
);
class HTMLPurifier_URIFilter_DisableExternalResources extends HTMLPurifier_URIFilter_DisableExternal
{
var $name = 'DisableExternalResources';
function filter(&$uri, $config, &$context) {
if (!$context->get('EmbeddedURI', true)) return true;
return parent::filter($uri, $config, $context);
}
}

Some files were not shown because too many files have changed in this diff Show More