1
0
mirror of https://github.com/ezyang/htmlpurifier.git synced 2025-08-02 20:27:40 +02:00

Compare commits

...

304 Commits

Author SHA1 Message Date
Edward Z. Yang
f4e4c1556d Release 1.6.1.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1025 48356398-32a2-884e-a903-53898d9a118a
2007-05-05 20:26:42 +00:00
Edward Z. Yang
c5e33416d3 [1.6.1] Unit tests now use exclusively assertIdentical
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1024 48356398-32a2-884e-a903-53898d9a118a
2007-05-05 20:17:04 +00:00
Edward Z. Yang
6c08ca4c16 [1.6.1] Fix bug (== v. ===) that caused merged in attribute definitions to be messed up
- Make our modified class_exists() check to work in both PHP 4 and 5
(todo: we need some unit tests for ElementDef)

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1023 48356398-32a2-884e-a903-53898d9a118a
2007-05-05 20:04:34 +00:00
Edward Z. Yang
b1822bb04f [1.6.1] Implement AttrTransform for type in ul, ol and li
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1022 48356398-32a2-884e-a903-53898d9a118a
2007-05-05 19:13:52 +00:00
Edward Z. Yang
893e962890 [1.6.1] Update unit tests for font transformation
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1021 48356398-32a2-884e-a903-53898d9a118a
2007-05-05 18:59:24 +00:00
Edward Z. Yang
bd6071cb3b [1.6.1] Transformation of font's size attribute now handles super-large numbers
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1020 48356398-32a2-884e-a903-53898d9a118a
2007-05-05 18:56:45 +00:00
Edward Z. Yang
92ea74cba2 [1.6.1] Add attribute transformation smoketests
- Repair broken noshade implementation
- Add lots of advisory comments to TransformToStrict.php

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1019 48356398-32a2-884e-a903-53898d9a118a
2007-05-05 18:41:53 +00:00
Edward Z. Yang
a01459c87a [1.6.1] Implement clear in br and align in caption, table, img and hr
- Refactored ValidateAttributesTest.php

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1018 48356398-32a2-884e-a903-53898d9a118a
2007-05-05 16:18:04 +00:00
Edward Z. Yang
fd35c43643 [1.6.1] Implement generic EnumToCSS attribute transformation, migrate text alignment to it
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1017 48356398-32a2-884e-a903-53898d9a118a
2007-05-05 15:48:41 +00:00
Edward Z. Yang
0426985c81 [1.6.1] Refactor AttrTransform to reduce duplication.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1016 48356398-32a2-884e-a903-53898d9a118a
2007-05-05 02:25:55 +00:00
Edward Z. Yang
bbea02f55c Rewrite docs on align attribute, complete with smoketest-case and licensing info.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1014 48356398-32a2-884e-a903-53898d9a118a
2007-05-04 01:29:06 +00:00
Edward Z. Yang
4e77a1adbd [1.6.1] Fix fatal error with XHTML 1.1 validation.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1013 48356398-32a2-884e-a903-53898d9a118a
2007-05-04 01:17:00 +00:00
Edward Z. Yang
bd58a7ba77 [1.6.1] Implement BoolToCSS attribute transformations for td,th.nowrap and hr.noshade
- Implement CSS property white-space:nowrap;
- Update TODO with more ambitious goal: all transforms by 1.6.1

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1012 48356398-32a2-884e-a903-53898d9a118a
2007-05-03 04:07:47 +00:00
Edward Z. Yang
a3ed9196b9 Downgrade code-quality back to a txt scratchpad, add more items for AttrDef
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1010 48356398-32a2-884e-a903-53898d9a118a
2007-05-03 03:15:29 +00:00
Edward Z. Yang
2646f5ea57 Add experimental and dangerous Scripting module. This is NOT mentioned in the NEWS items, and will be officially released with 1.7.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1008 48356398-32a2-884e-a903-53898d9a118a
2007-05-01 21:43:24 +00:00
Edward Z. Yang
424c7ad2e3 Update target milestones, add Windows live mail specimen.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1007 48356398-32a2-884e-a903-53898d9a118a
2007-05-01 21:37:35 +00:00
Edward Z. Yang
234b3085d7 [1.6.1] Activate transform for hr.size
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1006 48356398-32a2-884e-a903-53898d9a118a
2007-05-01 21:36:19 +00:00
Edward Z. Yang
3d978c961d [1.6.1] Implement target module/attribute.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1002 48356398-32a2-884e-a903-53898d9a118a
2007-04-30 21:19:15 +00:00
Edward Z. Yang
72254cd77a [1.6.1] Implement vspace and hspace transformations in img.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1001 48356398-32a2-884e-a903-53898d9a118a
2007-04-30 19:39:42 +00:00
Edward Z. Yang
d8a6361244 [1.6.1] Empty strings get converted to empty arrays instead of arrays with an empty string in them.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1000 48356398-32a2-884e-a903-53898d9a118a
2007-04-30 01:14:21 +00:00
Edward Z. Yang
968dfa2feb [1.6.1] Fix broken configuration directive %Core.RemoveInvalidImg, also make basic demo operational out-of-the-box
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@999 48356398-32a2-884e-a903-53898d9a118a
2007-04-30 00:53:13 +00:00
Edward Z. Yang
114d6841ab Update TODO: rename release and add HTML configuration interface
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@998 48356398-32a2-884e-a903-53898d9a118a
2007-04-30 00:48:22 +00:00
Edward Z. Yang
1c68d769b5 Fix bug in packager: force all files to be "php"
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@996 48356398-32a2-884e-a903-53898d9a118a
2007-04-29 04:06:40 +00:00
Edward Z. Yang
ac0ca3f15c Miscellaneous URL updates.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@993 48356398-32a2-884e-a903-53898d9a118a
2007-04-22 22:26:20 +00:00
Edward Z. Yang
2d5498b8aa Update URLs from hp.jpsband.org to htmlpurifier.org
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@992 48356398-32a2-884e-a903-53898d9a118a
2007-04-22 22:22:48 +00:00
Edward Z. Yang
71ccae1a3a [1.6.0] Add news item on how demo script was removed
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@991 48356398-32a2-884e-a903-53898d9a118a
2007-04-22 22:11:35 +00:00
Edward Z. Yang
cb186dddc4 Compactify HTML Purifier library inclusion
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@987 48356398-32a2-884e-a903-53898d9a118a
2007-04-22 21:01:48 +00:00
Edward Z. Yang
2ceccc0969 Moved remotely to website.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@984 48356398-32a2-884e-a903-53898d9a118a
2007-04-22 20:55:52 +00:00
Edward Z. Yang
93aa98ad01 Update package.php with new URLs from migration.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@979 48356398-32a2-884e-a903-53898d9a118a
2007-04-22 02:56:05 +00:00
Edward Z. Yang
c0b38bab85 [1.6.1] Invert HTMLModuleManager->addModule() processing order to check prefixes first and then the literal module
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@971 48356398-32a2-884e-a903-53898d9a118a
2007-04-21 02:31:38 +00:00
Edward Z. Yang
d6c4473a12 [1.6.1] Possibly fatal bug with __autoload() fixed in module manager
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@969 48356398-32a2-884e-a903-53898d9a118a
2007-04-21 02:19:18 +00:00
Edward Z. Yang
fc06f221d5 Remove redundant $info member variable.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@960 48356398-32a2-884e-a903-53898d9a118a
2007-04-11 21:44:26 +00:00
Edward Z. Yang
ac3ab2a556 [1.6.1] DirectLex now preserves text in which a < bracket is followed by a non-alphanumeric character. This means that certain emoticons are now preserved.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@939 48356398-32a2-884e-a903-53898d9a118a
2007-04-04 02:22:27 +00:00
Edward Z. Yang
2c330cac73 Add 1.6.1 TODO stuff.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@937 48356398-32a2-884e-a903-53898d9a118a
2007-04-02 13:28:49 +00:00
Edward Z. Yang
a0d6543b84 Some packaging fixes:
- Add VERSION file, which contains just the version number of the release
- Add WHATSNEW, which is a short summary of the new release
- Add release.php which bumps all the necessary version numbers in files
- Update package.php so that the version numbers aren't hardcoded
- Add news entry for 1.7.0

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@935 48356398-32a2-884e-a903-53898d9a118a
2007-04-02 03:58:59 +00:00
Edward Z. Yang
e223490a78 Release 1.6.0.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@930 48356398-32a2-884e-a903-53898d9a118a
2007-04-01 22:31:16 +00:00
Edward Z. Yang
2666f067cc Add partial French install file.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@929 48356398-32a2-884e-a903-53898d9a118a
2007-04-01 21:38:10 +00:00
Edward Z. Yang
826a57a04a Update Advanced API with various edits and Customization section.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@928 48356398-32a2-884e-a903-53898d9a118a
2007-04-01 18:21:43 +00:00
Edward Z. Yang
e08b5aaa70 [1.6.0] Add error messages for when user attempts to "allow" elements or attributes HTML Purifier does not support.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@927 48356398-32a2-884e-a903-53898d9a118a
2007-03-31 03:41:22 +00:00
Edward Z. Yang
b15e8c344e [1.6.0] Implement ID regexp matching.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@926 48356398-32a2-884e-a903-53898d9a118a
2007-03-31 03:25:10 +00:00
Edward Z. Yang
2c9e041b4c Update TODO and progress document.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@925 48356398-32a2-884e-a903-53898d9a118a
2007-03-31 03:09:46 +00:00
Edward Z. Yang
e2c3394d70 [1.6.0] Add support for LinkTypes, used for rel and rev attributes.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@924 48356398-32a2-884e-a903-53898d9a118a
2007-03-31 02:58:16 +00:00
Edward Z. Yang
1532fe703a Update docs:
- Progress hr.size was changed from width to height
- UTF-8 rules of thumb were clarified to make clear this is only necessary for UTF-8 text.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@923 48356398-32a2-884e-a903-53898d9a118a
2007-03-30 00:01:35 +00:00
Edward Z. Yang
058f1eba7d [1.6.0] Implement width/height attribute transforms with Length.php
- Also, enabled 'height' CSS attribute

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@922 48356398-32a2-884e-a903-53898d9a118a
2007-03-29 23:48:54 +00:00
Edward Z. Yang
1102dc6e27 [1.6.0] Add support for name transformation to id
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@921 48356398-32a2-884e-a903-53898d9a118a
2007-03-29 23:19:53 +00:00
Edward Z. Yang
85374d330f [1.6.0] Add support for border attribute transform
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@920 48356398-32a2-884e-a903-53898d9a118a
2007-03-29 21:41:17 +00:00
Edward Z. Yang
a16d6c4342 [1.6.0] Add support for bgcolor attribute transform.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@919 48356398-32a2-884e-a903-53898d9a118a
2007-03-29 21:20:44 +00:00
Edward Z. Yang
9b5e2978ad Add ID regexps to the TODO list.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@913 48356398-32a2-884e-a903-53898d9a118a
2007-03-29 00:13:12 +00:00
Edward Z. Yang
06468a4157 [1.5.1] Add segfault fix to news log.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@912 48356398-32a2-884e-a903-53898d9a118a
2007-03-27 23:29:10 +00:00
Edward Z. Yang
0167f8aa84 [1.5.1] Try separating out declarations, might stop segfaulting.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@911 48356398-32a2-884e-a903-53898d9a118a
2007-03-27 23:15:01 +00:00
Edward Z. Yang
f1a90e684b [1.5.1] Separate out trouble area that's having segfaults. (note: this commit actually inadvertently let us discover a fix for the segfault, applied in the next revision).
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@910 48356398-32a2-884e-a903-53898d9a118a
2007-03-27 23:07:21 +00:00
Edward Z. Yang
14d98413fd Update advanced API with more details on selection interface.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@908 48356398-32a2-884e-a903-53898d9a118a
2007-03-27 01:26:26 +00:00
Edward Z. Yang
97a4ec7598 Add in terracc's suggestions to TODO file.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@886 48356398-32a2-884e-a903-53898d9a118a
2007-03-25 00:40:13 +00:00
Edward Z. Yang
71ed725c5c Complete PEAR packager that actually works!
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@885 48356398-32a2-884e-a903-53898d9a118a
2007-03-25 00:23:35 +00:00
Edward Z. Yang
d4bf41288a Add package2.xml
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@884 48356398-32a2-884e-a903-53898d9a118a
2007-03-24 20:43:16 +00:00
Edward Z. Yang
365bd78c20 Commit PEAR package stuffs.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@883 48356398-32a2-884e-a903-53898d9a118a
2007-03-24 20:39:00 +00:00
Edward Z. Yang
52fa958fb2 Release 1.5.0 (bumped HTMLPurifier.php version number).
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@874 48356398-32a2-884e-a903-53898d9a118a
2007-03-24 02:10:33 +00:00
Edward Z. Yang
17d32bac7f Almost release 1.5.0. Merged in a few strict changes.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@870 48356398-32a2-884e-a903-53898d9a118a
2007-03-24 01:24:38 +00:00
Edward Z. Yang
e2babe5308 Almost release 1.5.0.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@867 48356398-32a2-884e-a903-53898d9a118a
2007-03-24 00:35:53 +00:00
Edward Z. Yang
5f1a6b883f Update NEWS with a few old items I missed. We may yet have a 1.4.2 interim release.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@811 48356398-32a2-884e-a903-53898d9a118a
2007-03-14 21:34:37 +00:00
Edward Z. Yang
c5e3796202 Update advanced API docs, link to it from index.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@808 48356398-32a2-884e-a903-53898d9a118a
2007-03-14 04:56:44 +00:00
Edward Z. Yang
72f1984229 Add notes on "mode" to advanced API.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@801 48356398-32a2-884e-a903-53898d9a118a
2007-03-12 03:53:09 +00:00
Edward Z. Yang
918081b372 [1.4.x?] Make regex multiline.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@794 48356398-32a2-884e-a903-53898d9a118a
2007-03-04 02:55:44 +00:00
Edward Z. Yang
6c56dd070f Updated Advanced API docs.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@769 48356398-32a2-884e-a903-53898d9a118a
2007-03-01 03:56:08 +00:00
Edward Z. Yang
299f93f8f0 Add initial version of advanced API specification, also add <q> tag fix.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@768 48356398-32a2-884e-a903-53898d9a118a
2007-02-28 04:42:08 +00:00
Edward Z. Yang
4169846c57 Modules are not passed by reference, so in PHP 4 we cannot guarantee same module that went in will be used.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@767 48356398-32a2-884e-a903-53898d9a118a
2007-02-27 23:57:54 +00:00
Edward Z. Yang
aff4957531 [1.4.x?] Alright, have both PHP5 and DOMDocument requirements for DOMLex checked.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@766 48356398-32a2-884e-a903-53898d9a118a
2007-02-27 23:54:29 +00:00
Edward Z. Yang
e4bdf472a6 Fix typo.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@764 48356398-32a2-884e-a903-53898d9a118a
2007-02-20 03:05:03 +00:00
Edward Z. Yang
9a99750474 - Setup doctypes, auto properties, and work on making the interface more user-friendly
- Yet even more unit test for HTMLModuleManager
- Sample code in printDefinition for defining a new element
- Downgraded importances of HTMLModule->elements

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@762 48356398-32a2-884e-a903-53898d9a118a
2007-02-18 05:29:19 +00:00
Edward Z. Yang
7eb751b5f5 More refactoring: for interest of unit testing, default doctypes were moved to an initialize() method which could optionally be omitted. Disable collection aliases in favor of doctype aliases.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@761 48356398-32a2-884e-a903-53898d9a118a
2007-02-17 22:17:14 +00:00
Edward Z. Yang
0d0173eb6e Implement unit tests for very public interfaces of HTMLModuleManager, also added lots of error checking. tally_errors now requires unit test to be passed in as parameter.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@760 48356398-32a2-884e-a903-53898d9a118a
2007-02-17 19:37:48 +00:00
Edward Z. Yang
556ed4ea90 - Shuffle around includes to the right places
- Fix error in unit test

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@759 48356398-32a2-884e-a903-53898d9a118a
2007-02-17 17:43:44 +00:00
Edward Z. Yang
cf445a6107 - Revamp ordering scheme: onus in on collections, conflict resolution based on module load order.
- Miscellaneous refactoring and documentation

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@758 48356398-32a2-884e-a903-53898d9a118a
2007-02-17 17:10:28 +00:00
Edward Z. Yang
243ad45e59 Add some clarifying comments on what belongs in activeModules and validModules.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@757 48356398-32a2-884e-a903-53898d9a118a
2007-02-16 03:48:25 +00:00
Edward Z. Yang
31d0c621f5 Create two more module sets: activeModules and validModules to supplant the getModules() method.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@756 48356398-32a2-884e-a903-53898d9a118a
2007-02-16 03:33:29 +00:00
Edward Z. Yang
0870974a25 Have processCollections() perform name to instance indexing at the get-go.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@755 48356398-32a2-884e-a903-53898d9a118a
2007-02-16 03:16:17 +00:00
Edward Z. Yang
5c4a0a6785 Migrate default attribute collections to their own module, do late-loading of the attribute collection.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@754 48356398-32a2-884e-a903-53898d9a118a
2007-02-16 03:07:47 +00:00
Edward Z. Yang
e55babdc53 Move order to module itself, as member variable type.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@753 48356398-32a2-884e-a903-53898d9a118a
2007-02-16 03:01:23 +00:00
Edward Z. Yang
6e1b540d99 Remove missing include.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@752 48356398-32a2-884e-a903-53898d9a118a
2007-02-15 14:02:01 +00:00
Edward Z. Yang
edf20018f0 Add an HTMLModuleManager.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@751 48356398-32a2-884e-a903-53898d9a118a
2007-02-15 14:00:18 +00:00
Edward Z. Yang
c09432e171 Add command line support for loading a single test file.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@750 48356398-32a2-884e-a903-53898d9a118a
2007-02-15 00:17:23 +00:00
Edward Z. Yang
9c031b5c1e Add name class member variable to modules.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@749 48356398-32a2-884e-a903-53898d9a118a
2007-02-14 22:30:17 +00:00
Edward Z. Yang
a827cbc3ba Slight formatting change.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@748 48356398-32a2-884e-a903-53898d9a118a
2007-02-14 22:21:07 +00:00
Edward Z. Yang
c05eebee15 [1.5.0] AttrDef partitioned into HTML, CSS and URI segments. Also, some minor bugs with MultiLength fixed.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@747 48356398-32a2-884e-a903-53898d9a118a
2007-02-14 20:38:51 +00:00
Edward Z. Yang
93a69d020a Fix typo.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@746 48356398-32a2-884e-a903-53898d9a118a
2007-02-14 16:22:28 +00:00
Edward Z. Yang
f3fa9c01ba Add IDREF support to TODO list.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@744 48356398-32a2-884e-a903-53898d9a118a
2007-02-14 03:59:25 +00:00
Edward Z. Yang
bae5b0c022 Move out SetParent and TweakSubtractiveWhitelist. Move out some other configurations, disable ID references.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@743 48356398-32a2-884e-a903-53898d9a118a
2007-02-14 02:54:41 +00:00
Edward Z. Yang
67befbc8a8 [1.5.0] Rename %Attr.DisableURI to %URI.Disable and move it over to the AttrDef.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@742 48356398-32a2-884e-a903-53898d9a118a
2007-02-14 01:57:06 +00:00
Edward Z. Yang
cac22f01cf [1.5.0]
- More framework work (modules now are treated first class)
- Config will regenerate definitions when appropriate entries are set
- Add HTMLModule->setup for pre-processing stuff
- Constructor receives $definition not $config
- Config rolled inside $definition->config until end of setup()

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@741 48356398-32a2-884e-a903-53898d9a118a
2007-02-14 01:44:06 +00:00
Edward Z. Yang
94d2dbaa74 Fix broken benchmark code.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@739 48356398-32a2-884e-a903-53898d9a118a
2007-02-13 20:51:47 +00:00
Edward Z. Yang
6add828bc8 Update UTF-8 title.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@735 48356398-32a2-884e-a903-53898d9a118a
2007-02-13 03:09:34 +00:00
Edward Z. Yang
800b67ed65 Add preProcess and postProcess infrastructure to HTMLModule and HTMLDefinition so that almost all functionality that does not involve merging the modules together can be factored into modules.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@733 48356398-32a2-884e-a903-53898d9a118a
2007-02-12 03:02:26 +00:00
Edward Z. Yang
71e4ddd222 [1.5.0] Implement Legacy module.
- Yet another test EnableAttrID
- ElementDef now is mindful of attr inclusion merges

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@732 48356398-32a2-884e-a903-53898d9a118a
2007-02-11 01:52:56 +00:00
Edward Z. Yang
54a68a1713 [1.5.0] Implement TransformToStrict proprietary module
- Factored out strictblockquote from the common definition
- Text module now follows "strict" rules by default
- attr_transform_* now are indexed with string keys, to allow overloading
- Implement ElementDef mergin, and add standalone class variable to ElementDef to prevent half-baked element definitions from masquerading as full ones
- Implement merging global attributes from modules, namely info_attr_transform_post, info_attr_transform_pre and info_tag_transform
- Rename setupInfo() to processModules()
- Fix typo in HTMLModule/Bdo.php

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@731 48356398-32a2-884e-a903-53898d9a118a
2007-02-10 23:35:21 +00:00
Edward Z. Yang
bd544ad038 Formatting and documentation fixes.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@730 48356398-32a2-884e-a903-53898d9a118a
2007-02-09 03:19:43 +00:00
Edward Z. Yang
d5491da77f [1.5.0] Rewrite XHTML 1.1 document to describe HTMLDefinition's modularization
- Use ElementDef->child to define a literal ChildDef object, rather than ElementDef->content_model.
- Add notes on transforms, HTMLModule will be able to write those too
- Fix some misc typos.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@729 48356398-32a2-884e-a903-53898d9a118a
2007-02-08 23:10:49 +00:00
Edward Z. Yang
591fc0ae28 Divvy up TagTransform library files into their own separate files. Similar action needs to be taken for the tests.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@728 48356398-32a2-884e-a903-53898d9a118a
2007-02-06 01:33:28 +00:00
Edward Z. Yang
dac7ac1eae Add documentation.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@727 48356398-32a2-884e-a903-53898d9a118a
2007-02-05 05:23:20 +00:00
Edward Z. Yang
64ee756b7a Rename ConfigEntity to ConfigDef and factor into its own classes.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@726 48356398-32a2-884e-a903-53898d9a118a
2007-02-05 03:22:32 +00:00
Edward Z. Yang
e2103ce0f2 Factor out content set and childdef functionality to ContentSets. Remove redundant info suffix from attr_collections.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@725 48356398-32a2-884e-a903-53898d9a118a
2007-02-05 03:05:46 +00:00
Edward Z. Yang
219902ebff Revert back to pre XHTMLDefinition testing state.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@724 48356398-32a2-884e-a903-53898d9a118a
2007-02-04 23:18:53 +00:00
Edward Z. Yang
21116373a7 [1.5.0] Implemented new HTMLDefinition based on XHTML 1.1 Modularization
- Well, not really, but it's now official. Some gunky prototype code left, but it's pretty much all done.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@723 48356398-32a2-884e-a903-53898d9a118a
2007-02-04 23:17:13 +00:00
Edward Z. Yang
5ed88809f3 Add a bunch of compatibility gunk to XHTMLDefinition for modules we've not implemented yet and replace HTMLDefinition with it.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@722 48356398-32a2-884e-a903-53898d9a118a
2007-02-04 23:10:10 +00:00
Edward Z. Yang
bb8b38b1e0 Rename attr_collection to attr_collections, which is more accurate. HTMLModule now has attr_collections_info rather than attr_collections which implied an object. Further clarified naming conventions.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@721 48356398-32a2-884e-a903-53898d9a118a
2007-02-04 22:26:56 +00:00
Edward Z. Yang
236159242f Enforce info_ prefix convention for data that is accessed by HTML Purifier internals.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@720 48356398-32a2-884e-a903-53898d9a118a
2007-02-04 22:08:51 +00:00
Edward Z. Yang
9d8f839bf2 Add empty template HTMLModule for legacy-related processing.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@719 48356398-32a2-884e-a903-53898d9a118a
2007-02-04 21:58:38 +00:00
Edward Z. Yang
882148f9ad Add nested test for del/ins inline support.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@718 48356398-32a2-884e-a903-53898d9a118a
2007-02-04 21:02:35 +00:00
Edward Z. Yang
a863f62489 Add full documentation. Implement deferred ChildDef to HTMLModule. Add missing attributes for table, switched some to Number. Add necessary includes to module files. Add pre exclusions. Printer now ksorts arrays before output. Exclude ins/del from descendants_are_inline flagging.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@717 48356398-32a2-884e-a903-53898d9a118a
2007-02-04 20:09:35 +00:00
Edward Z. Yang
6478c7c2df Implement Style Attribute Module, cleanup some attribute collections and add some documentation.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@716 48356398-32a2-884e-a903-53898d9a118a
2007-02-04 18:27:59 +00:00
Edward Z. Yang
129a4ea506 Implement Image Module.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@715 48356398-32a2-884e-a903-53898d9a118a
2007-02-04 16:35:40 +00:00
Edward Z. Yang
a122243a89 Implement Tables Module.
- Fix HTMLDefinition rendering of table children

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@714 48356398-32a2-884e-a903-53898d9a118a
2007-02-04 16:23:26 +00:00
Edward Z. Yang
315c55eeb1 Implement Bdo module. Also added some documentation and missing values, as well as support for attr_collection additions.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@713 48356398-32a2-884e-a903-53898d9a118a
2007-02-04 15:28:47 +00:00
Edward Z. Yang
cfe50ff8ae Implement Edit module.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@712 48356398-32a2-884e-a903-53898d9a118a
2007-02-04 14:56:55 +00:00
Edward Z. Yang
d0018a2696 Implement Presentation module.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@711 48356398-32a2-884e-a903-53898d9a118a
2007-02-04 04:41:34 +00:00
Edward Z. Yang
77d9e05a07 [1.5.0] Massive refactoring for Blockquote and Chameleon to be more extensible and accommodating of XHTMLDefinition.
- Fixed buggy chameleon-support for ins and del
. Removed context variable ParentType, replaced with IsInline, which
  is false when you're not inline and an integer of the parent that
  caused you to become inline when you are (so possibly zero)
. Removed ElementDef->type in favor of ElementDef->descendants_are_inline
  and HTMLDefinition->content_sets
. StrictBlockquote now reports what elements its supposed to allow,
  rather than what it does allow
. Removed HTMLDefinition->info_flow_elements in favor of
  HTMLDefinition->content_sets['Flow']
. Removed redundant "exclusionary" definitions from DTD roster
. StrictBlockquote now requires a construction parameter as if it
  were an Required ChildDef, this is the "real" set of allowed elements

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@710 48356398-32a2-884e-a903-53898d9a118a
2007-02-04 03:53:57 +00:00
Edward Z. Yang
80243f377c Implement List module.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@709 48356398-32a2-884e-a903-53898d9a118a
2007-02-04 01:52:13 +00:00
Edward Z. Yang
43b157cf4d Add Hypertext module.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@708 48356398-32a2-884e-a903-53898d9a118a
2007-02-04 01:01:27 +00:00
Edward Z. Yang
f6b50d4bfd Initial implementation of XHTMLDefinition, you can see it in action at the smoketest printDefinition.php?x (add the x at the end).
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@707 48356398-32a2-884e-a903-53898d9a118a
2007-02-04 00:07:52 +00:00
Edward Z. Yang
806901cfd2 [1.5.0] Rename Class to Nmtokens (more accurate)
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@706 48356398-32a2-884e-a903-53898d9a118a
2007-02-03 20:15:33 +00:00
Edward Z. Yang
f90eef7f1f Update docs. Delineate XHTML 1.1 revamping of HTMLDefinition.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@705 48356398-32a2-884e-a903-53898d9a118a
2007-02-03 17:03:04 +00:00
Edward Z. Yang
06867e14b6 Increase child definition sets to all elements to facilitate later expansion. Currently has no perceptible effect.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@704 48356398-32a2-884e-a903-53898d9a118a
2007-02-03 03:45:13 +00:00
Edward Z. Yang
bda2615b30 [1.5.0] Add support for IDREF
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@703 48356398-32a2-884e-a903-53898d9a118a
2007-02-02 22:03:09 +00:00
Edward Z. Yang
e1a5d10e75 Fix typo in comment.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@702 48356398-32a2-884e-a903-53898d9a118a
2007-01-30 00:34:23 +00:00
Edward Z. Yang
98fd6b7d82 [1.5.0] Add rudimentary I18N and L10N support based off MediaWiki
- Also: allow 'x' subtag in language codes

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@701 48356398-32a2-884e-a903-53898d9a118a
2007-01-29 20:11:00 +00:00
Edward Z. Yang
be264a4b20 Update docs.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@700 48356398-32a2-884e-a903-53898d9a118a
2007-01-29 17:53:54 +00:00
Edward Z. Yang
01c85b71d2 Fix minor typo.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@699 48356398-32a2-884e-a903-53898d9a118a
2007-01-28 22:19:05 +00:00
Edward Z. Yang
2d22c0aa55 [1.4.x?] Completed enduser-utf8.html
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@697 48356398-32a2-884e-a903-53898d9a118a
2007-01-24 23:48:35 +00:00
Edward Z. Yang
6e061f5184 I18N -> International/internationalization
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@696 48356398-32a2-884e-a903-53898d9a118a
2007-01-24 21:24:54 +00:00
Edward Z. Yang
44b988f1f6 Fix some editing mistakes.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@695 48356398-32a2-884e-a903-53898d9a118a
2007-01-24 03:00:48 +00:00
Edward Z. Yang
0ead9558b4 Finish up to BOM.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@694 48356398-32a2-884e-a903-53898d9a118a
2007-01-24 01:29:25 +00:00
Edward Z. Yang
159a1cced1 Complete HTML Purifier segment.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@693 48356398-32a2-884e-a903-53898d9a118a
2007-01-23 03:27:10 +00:00
Edward Z. Yang
6871a54d64 Release 1.4.1.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@687 48356398-32a2-884e-a903-53898d9a118a
2007-01-21 21:47:18 +00:00
Edward Z. Yang
96ac7e8797 [1.4.1] docs/enduser-youtube.html updated according to new functionality and YouTube IDs can have underscores and dashes
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@686 48356398-32a2-884e-a903-53898d9a118a
2007-01-21 21:45:14 +00:00
Edward Z. Yang
2d49299621 Release 1.4.0.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@680 48356398-32a2-884e-a903-53898d9a118a
2007-01-21 17:24:18 +00:00
Edward Z. Yang
ab5c782c77 Actually fix it.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@679 48356398-32a2-884e-a903-53898d9a118a
2007-01-21 17:12:45 +00:00
Edward Z. Yang
8893b87e04 Fix call-time pass-by-reference.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@678 48356398-32a2-884e-a903-53898d9a118a
2007-01-21 17:10:24 +00:00
Edward Z. Yang
aeef746060 Add 1000 passes image.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@677 48356398-32a2-884e-a903-53898d9a118a
2007-01-21 16:45:01 +00:00
Edward Z. Yang
da13c6ac87 Further update TODO.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@676 48356398-32a2-884e-a903-53898d9a118a
2007-01-21 16:17:34 +00:00
Edward Z. Yang
ccae73c25a Update TODO, remove caching: it won't help our memory usage or speed.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@674 48356398-32a2-884e-a903-53898d9a118a
2007-01-21 15:23:42 +00:00
Edward Z. Yang
8d6bfa4037 [1.4.0] YouTube preservation code added to the core by adding HTMLPurifier_Filter hierarchy.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@673 48356398-32a2-884e-a903-53898d9a118a
2007-01-21 15:09:07 +00:00
Edward Z. Yang
712d81ebea [1.4.0] Config object can now be instantiated from ini files. Also updated TODO.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@672 48356398-32a2-884e-a903-53898d9a118a
2007-01-21 14:29:46 +00:00
Edward Z. Yang
f7f6fed86a [1.4.0] Revamp ConfigTest.php. Factor out tallyErrors() to its own function.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@671 48356398-32a2-884e-a903-53898d9a118a
2007-01-21 04:37:02 +00:00
Edward Z. Yang
2293c67eec [1.4.0] Revamp ConfigSchema tests, add more error checking.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@670 48356398-32a2-884e-a903-53898d9a118a
2007-01-20 22:59:20 +00:00
Edward Z. Yang
108df87824 Migrate from assertError to expectError, removed all assertNoErrors()
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@669 48356398-32a2-884e-a903-53898d9a118a
2007-01-20 19:22:55 +00:00
Edward Z. Yang
5e366b25f8 [1.4.0] Support for configuration directive aliases added.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@668 48356398-32a2-884e-a903-53898d9a118a
2007-01-20 18:43:58 +00:00
Edward Z. Yang
2e16c4a968 Replaced version check with functionality check for DOM
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@667 48356398-32a2-884e-a903-53898d9a118a
2007-01-20 15:07:48 +00:00
Edward Z. Yang
a8db22dfff Update docs, esp in context of soon to be added tag transforms.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@666 48356398-32a2-884e-a903-53898d9a118a
2007-01-20 03:59:07 +00:00
Edward Z. Yang
fbe2c25f8a Update progress doc with more info. Fix some comments.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@665 48356398-32a2-884e-a903-53898d9a118a
2007-01-20 03:48:39 +00:00
Edward Z. Yang
158be61def Update TODO.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@664 48356398-32a2-884e-a903-53898d9a118a
2007-01-20 02:28:51 +00:00
Edward Z. Yang
d693c4ea09 [1.4.0] Implement 'background' shorthand CSS property.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@663 48356398-32a2-884e-a903-53898d9a118a
2007-01-20 02:21:43 +00:00
Edward Z. Yang
c24916e1d6 Update progress doc.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@662 48356398-32a2-884e-a903-53898d9a118a
2007-01-20 01:46:37 +00:00
Edward Z. Yang
a68b6afda1 [1.4.0] CSS property background-position implemented. Also:
- Fixed some misinformation in Percentage
- Add support for lowercase CSS length units

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@661 48356398-32a2-884e-a903-53898d9a118a
2007-01-20 01:40:56 +00:00
Edward Z. Yang
78cf7db82e Refactor index.php test runner, it was getting too big.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@660 48356398-32a2-884e-a903-53898d9a118a
2007-01-19 23:26:15 +00:00
Edward Z. Yang
9b375fdfb8 [1.4.0] Added convenient single test selector form on test runner
- Also fixed includes in test files
- Updated TODO with CSS plans

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@659 48356398-32a2-884e-a903-53898d9a118a
2007-01-19 23:02:28 +00:00
Edward Z. Yang
0dd866cc15 [1.4.0]
- Added %Core.EscapeNonASCIICharacters to workaround %Core.Encoding misbehavior
- Add "All Tests" to test runner title and reorder subfile names
- Specific file is now called with ?f=
- Link to UTF-8 docs, even though they're not done
- 1000th unit test passed! W00t! (that's a third as many as SimpleTest has for itself.)

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@658 48356398-32a2-884e-a903-53898d9a118a
2007-01-19 03:54:55 +00:00
Edward Z. Yang
ad1169c711 [1.4.0] Make all functions in Encoder static. Affects branches/strict
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@656 48356398-32a2-884e-a903-53898d9a118a
2007-01-18 22:55:44 +00:00
Edward Z. Yang
2816ae535f Link docs back to HTML Purifier home page.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@655 48356398-32a2-884e-a903-53898d9a118a
2007-01-18 22:38:40 +00:00
Edward Z. Yang
462d3ab72f [1.4.0] Add some nicer styling to the configuration documentation. Also update NEWS.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@654 48356398-32a2-884e-a903-53898d9a118a
2007-01-18 02:52:20 +00:00
Edward Z. Yang
cf1d868782 [1.4.0] Add some docs to printDefinition smoketest.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@653 48356398-32a2-884e-a903-53898d9a118a
2007-01-18 02:05:39 +00:00
Edward Z. Yang
c705e17a58 + Well Supported
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@652 48356398-32a2-884e-a903-53898d9a118a
2007-01-17 03:07:51 +00:00
Edward Z. Yang
1cce367950 [1.4.0] Add all smoketest file using iframes.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@651 48356398-32a2-884e-a903-53898d9a118a
2007-01-16 22:29:11 +00:00
Edward Z. Yang
61f852d429 Merge in PHP5 strict changes that are applicable to PHP4.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@650 48356398-32a2-884e-a903-53898d9a118a
2007-01-16 22:22:08 +00:00
Edward Z. Yang
3a73c2cf04 Fix some XHTML 1.0 conformance issues.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@646 48356398-32a2-884e-a903-53898d9a118a
2007-01-15 20:06:35 +00:00
Edward Z. Yang
e75b676656 Done up to Forms.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@645 48356398-32a2-884e-a903-53898d9a118a
2007-01-15 19:18:17 +00:00
Edward Z. Yang
b53370efbf Update progress doc.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@644 48356398-32a2-884e-a903-53898d9a118a
2007-01-15 01:16:25 +00:00
Edward Z. Yang
d60f345cab [1.4.0] Implemented background-image, background-repeat and background-attachment CSS properties. background shorthand property HAS NOT been extended to allow these, and background-position IS NOT implemented yet.
- Also: fixed up some flaky behavior in list-style shorthand, introduced uri_or_none

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@643 48356398-32a2-884e-a903-53898d9a118a
2007-01-15 01:14:24 +00:00
Edward Z. Yang
aefda60696 [1.4.0] Refactored ListStyle, since list-style-image was sort of tacked on and didn't really conform with the standard. Implementation is still a little flaky but conforms with W3C's validation service.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@642 48356398-32a2-884e-a903-53898d9a118a
2007-01-15 00:48:54 +00:00
Edward Z. Yang
2ffa5d3135 Update progress doc.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@641 48356398-32a2-884e-a903-53898d9a118a
2007-01-14 16:26:47 +00:00
Edward Z. Yang
23d3490d49 [1.4.0] Implemented list-style-image, URIs now allowed in list-style
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@640 48356398-32a2-884e-a903-53898d9a118a
2007-01-14 16:24:02 +00:00
Edward Z. Yang
582ffc4143 [1.4.0] Implemented AttrDef_CSSURI for url(http://google.com) style declarations
- 1.3.3 release downgraded to "not likely"

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@639 48356398-32a2-884e-a903-53898d9a118a
2007-01-14 15:54:05 +00:00
Edward Z. Yang
d52189a19d Complete info on fixing embedded encodings. Will discuss UTF-8 next.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@638 48356398-32a2-884e-a903-53898d9a118a
2007-01-14 02:31:54 +00:00
Edward Z. Yang
02006d6e64 Commit initial draft of UTF-8 document. Incomplete.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@637 48356398-32a2-884e-a903-53898d9a118a
2007-01-13 03:58:02 +00:00
Edward Z. Yang
dcaa374dae [1.3.3] Random miscellaneous housekeeping
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@636 48356398-32a2-884e-a903-53898d9a118a
2007-01-11 22:37:54 +00:00
Edward Z. Yang
e2cc37724b [1.3.3]
- Move SLOW to docs/enduser-slow.html and add code examples
- Update README and WYSIWYG
- Add warning to HTMLPurifier.func.php about naming similarities

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@635 48356398-32a2-884e-a903-53898d9a118a
2007-01-11 22:28:44 +00:00
Edward Z. Yang
3ad6239dc3 Remove useless $init assignment.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@634 48356398-32a2-884e-a903-53898d9a118a
2007-01-07 23:55:55 +00:00
Edward Z. Yang
663fb4e1b2 Add TODO item to INSTALL for new array syntax for configuration.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@627 48356398-32a2-884e-a903-53898d9a118a
2006-12-26 17:40:05 +00:00
Edward Z. Yang
a909632d2d Release 1.3.2.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@621 48356398-32a2-884e-a903-53898d9a118a
2006-12-26 04:31:48 +00:00
Edward Z. Yang
18f7c85ebe Update XHTML 1.1 reference document with a few more notes on modularization.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@620 48356398-32a2-884e-a903-53898d9a118a
2006-12-26 04:11:51 +00:00
Edward Z. Yang
4c54283642 Fix a few things in the YouTube documentation as mentioned by Everah and kuza55 (sorry kuza55, still haven't acted completely on your requests).
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@619 48356398-32a2-884e-a903-53898d9a118a
2006-12-26 04:09:23 +00:00
Edward Z. Yang
688b1833f5 Fix typos in AttrDef/Lang.php involving lowercasing uppercased language strings.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@618 48356398-32a2-884e-a903-53898d9a118a
2006-12-26 03:56:53 +00:00
Edward Z. Yang
ceb1b9ccdb Update TODO.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@617 48356398-32a2-884e-a903-53898d9a118a
2006-12-21 21:42:21 +00:00
Edward Z. Yang
52fb35b0bb TODO + hooks
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@616 48356398-32a2-884e-a903-53898d9a118a
2006-12-20 23:52:28 +00:00
Edward Z. Yang
b6e222cbc2 [1.3.2] Added purifyArray(), which takes a list of HTML and purifies it all
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@615 48356398-32a2-884e-a903-53898d9a118a
2006-12-20 23:51:09 +00:00
Edward Z. Yang
dcfd8f5641 Update index with YouTube link.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@613 48356398-32a2-884e-a903-53898d9a118a
2006-12-20 03:03:03 +00:00
Edward Z. Yang
48da08ab78 [1.3.2] Added enduser-youtube.html, explains how to embed YouTube videos. See also corresponding smoketest preserveYouTube.php.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@612 48356398-32a2-884e-a903-53898d9a118a
2006-12-20 02:59:19 +00:00
Edward Z. Yang
360f984f63 [1.3.2]
! HTMLPurifier object now accepts configuration arrays, no need to manually instantiate a configuration object
! Context object now accessible to outside
. HTMLPurifier_Config::create() added, takes mixed variable and converts into a HTMLPurifier_Config object.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@611 48356398-32a2-884e-a903-53898d9a118a
2006-12-15 02:12:03 +00:00
Edward Z. Yang
41a25cb6b8 [1.3.2] printDefinition.php: added labels, added better clarification
- Updated TODO

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@609 48356398-32a2-884e-a903-53898d9a118a
2006-12-13 04:14:30 +00:00
Edward Z. Yang
a0fd6a9f5c Release 1.3.1.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@603 48356398-32a2-884e-a903-53898d9a118a
2006-12-06 22:52:22 +00:00
Edward Z. Yang
66e1d2732a [1.3.1] Add credit to bug report.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@602 48356398-32a2-884e-a903-53898d9a118a
2006-12-06 22:41:40 +00:00
Edward Z. Yang
b73b5100fd [1.3.1] Add defense in depth measure: reject entire node if there is no child definition for the element.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@601 48356398-32a2-884e-a903-53898d9a118a
2006-12-06 22:38:25 +00:00
Edward Z. Yang
d886ed59fd [1.3.1] Standardized all attribute handling variables to attr, made it plural
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@600 48356398-32a2-884e-a903-53898d9a118a
2006-12-06 22:29:08 +00:00
Edward Z. Yang
cbb492c52c [1.3.1] Fixed bug in RemoveInvalidImg code that caused all images to be dropped
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@599 48356398-32a2-884e-a903-53898d9a118a
2006-12-06 22:12:44 +00:00
Edward Z. Yang
4f8f022eac [1.3.1] Added HTMLPurifier.func.php stub for a convenient function to call the library
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@598 48356398-32a2-884e-a903-53898d9a118a
2006-12-06 22:04:16 +00:00
Edward Z. Yang
301b2585ae Add TODO: allow array input.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@596 48356398-32a2-884e-a903-53898d9a118a
2006-12-02 02:18:46 +00:00
Edward Z. Yang
8e733a52fb Update change-log with new version numbers.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@595 48356398-32a2-884e-a903-53898d9a118a
2006-11-27 00:15:43 +00:00
Edward Z. Yang
2a01cf786e Release 1.3.0 (bumped TODO items)
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@588 48356398-32a2-884e-a903-53898d9a118a
2006-11-26 23:21:19 +00:00
Edward Z. Yang
825b0671b5 [1.3.0] Bump version numbers.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@587 48356398-32a2-884e-a903-53898d9a118a
2006-11-26 23:18:32 +00:00
Edward Z. Yang
4bdc0446de [1.3.0] New directive %URI.HostBlacklist for blocking links to bad hosts. xssAttacks.php smoketest updated accordingly.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@586 48356398-32a2-884e-a903-53898d9a118a
2006-11-26 23:14:12 +00:00
Edward Z. Yang
45a70e8ae4 [1.3.0] Update xssAttacks.xml.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@585 48356398-32a2-884e-a903-53898d9a118a
2006-11-26 00:46:57 +00:00
Edward Z. Yang
1fe60c9b9d [1.3.0] Clarify docs on what printDefinition is for
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@584 48356398-32a2-884e-a903-53898d9a118a
2006-11-26 00:14:03 +00:00
Edward Z. Yang
dc0e2c6b3e Revise character estimate upwards.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@582 48356398-32a2-884e-a903-53898d9a118a
2006-11-25 21:18:20 +00:00
Edward Z. Yang
9bbbb87ffa [1.3.0] Add Printer_CSSDefinition.
- Added @public identifiers to properties that the Printers are using.
- Augmented Printer::getClass() to include meta-info about the object (contained inside parentheses). Currently supports: enum, composite and multiple.
- Remove all linebreaks from Printer output
- Document Printer_HTMLDefinition's methods.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@581 48356398-32a2-884e-a903-53898d9a118a
2006-11-25 05:05:32 +00:00
Edward Z. Yang
b63b0be21f [1.3.0] Some housekeeping after the last commit
- Add a few missing unit tests
- Allow for spaces between comma separated strings to be transformed into arrays
- smoketests/printDefinition.php now has documentation, links to more documentation and a friendly user-interface

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@579 48356398-32a2-884e-a903-53898d9a118a
2006-11-24 07:12:16 +00:00
Edward Z. Yang
73a1e31fad [1.3.0] Added spiffy new smoketest printDefinition.php, which lets you twiddle with the configuration settings and see how the internal rules are affected. (currently only complete for HTMLDefinition).
- HTMLPurifier -> HTML Purifier
. HTMLPurifier_Config->getBatch($namespace) added
. More lenient casting to bool from string in HTMLPurifier_ConfigSchema
. <?xml ... tags added to all smoketests

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@578 48356398-32a2-884e-a903-53898d9a118a
2006-11-24 06:26:02 +00:00
Edward Z. Yang
775763c583 [1.3.0] New directive %URI.Munge, munges URI so you can use some sort of redirector service to avoid PageRank leaks or warn users that they are exiting your site.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@576 48356398-32a2-884e-a903-53898d9a118a
2006-11-24 00:29:16 +00:00
Edward Z. Yang
49cb2a4a7c [1.3.0] More control of URIs granted
# Invalid images are now removed, rather than replaced with a dud <img src="" alt="Invalid image" />. Previous behavior can be restored with new directive %Core.RemoveInvalidImg set to false.
! New directives %URI.DisableExternalResources and %URI.DisableResources
! New directive %Attr.DisableURI, which eliminates all hyperlinking
- Missing "Available since" documentation added

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@575 48356398-32a2-884e-a903-53898d9a118a
2006-11-23 23:59:20 +00:00
Edward Z. Yang
61b6ee7183 Update filter levels document in light of fact that user can now specify tags. We may want to upgrade this to HTML so users can be helped out in choosing things to allow.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@574 48356398-32a2-884e-a903-53898d9a118a
2006-11-23 22:40:59 +00:00
Edward Z. Yang
d7ce6b4587 Add code quality advisory about demo.php.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@573 48356398-32a2-884e-a903-53898d9a118a
2006-11-23 22:34:41 +00:00
Edward Z. Yang
f67ee19f31 [1.3.0] Add some forward thinking documents.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@572 48356398-32a2-884e-a903-53898d9a118a
2006-11-23 22:33:07 +00:00
Edward Z. Yang
92b3f0e817 [1.3.0] <li value="4"> and <ul start="2"> now allowed in loose mode
- Updated progress with some more impl-no decisions
 - Loose vs. Strict now has better tallying on current behavior
 - Document what we're not allowing in loose
 - Strict boolean indicator added to HTMLDefinition
 - Added XHTML 1.1 to TODO.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@571 48356398-32a2-884e-a903-53898d9a118a
2006-11-23 22:15:35 +00:00
Edward Z. Yang
3c4da9666f - Update TODO: Caching and Configuration profiles
- Added another code-quality issue

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@570 48356398-32a2-884e-a903-53898d9a118a
2006-11-23 21:36:17 +00:00
Edward Z. Yang
925a07b828 [1.3.0] New directives %HTML.AllowedElements and %HTML.AllowedAttributes to let users narrow the set of allowed tags
. Added HTMLPurifier->info_parent_def, parent child processing made special

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@565 48356398-32a2-884e-a903-53898d9a118a
2006-11-23 13:51:19 +00:00
Edward Z. Yang
94db380271 [1.3.0] Remove Tidy option from demo if there is not Tidy available
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@563 48356398-32a2-884e-a903-53898d9a118a
2006-11-23 03:49:19 +00:00
Edward Z. Yang
b9e7ba6a2f [1.3.0] Move valid XHTML 1.0 button link to better spot.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@562 48356398-32a2-884e-a903-53898d9a118a
2006-11-23 03:39:55 +00:00
Edward Z. Yang
b1b3377b9c [1.3.0] Huge upgrade, (X)HTML Strict now supported
+ Transparently handles inline elements in block context (blockquote)
! Added GET method to demo for easier validation, added 50kb max input size
! New directive %HTML.BlockWrapper, for block-ifying inline elements
! New directive %HTML.Parent, allows you to only allow inline content
- Added missing type to ChildDef_Chameleon
. ChildDef_Required guards against empty tags
. Lookup table HTMLDefinition->info_flow_elements added
. Added peace-of-mind variable initialization to Strategy_FixNesting

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@560 48356398-32a2-884e-a903-53898d9a118a
2006-11-23 03:23:35 +00:00
Edward Z. Yang
d8673539ab - Add more documentation about proprietary tags
- Link to all text memos

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@559 48356398-32a2-884e-a903-53898d9a118a
2006-11-23 00:45:43 +00:00
Edward Z. Yang
3b26e5dc5b [1.3.0] Refactored ChildDef classes into their own files
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@558 48356398-32a2-884e-a903-53898d9a118a
2006-11-22 18:55:15 +00:00
Edward Z. Yang
c5ea987069 Fix parse error.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@557 48356398-32a2-884e-a903-53898d9a118a
2006-11-22 18:19:44 +00:00
Edward Z. Yang
b152448608 [1.3.0] Implement user-unfriendly implementation of Strict doctype. We will try not to ship this one.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@556 48356398-32a2-884e-a903-53898d9a118a
2006-11-22 18:17:39 +00:00
Edward Z. Yang
b0575cb888 Add more TODO items:
- Formatter caveat to strict XHTML
- YouTube video embedding

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@555 48356398-32a2-884e-a903-53898d9a118a
2006-11-22 17:46:38 +00:00
Edward Z. Yang
224ef774f7 Commit two new docs: loose-vs-strict and proprietary-tags, both research/reference.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@554 48356398-32a2-884e-a903-53898d9a118a
2006-11-22 04:49:26 +00:00
Edward Z. Yang
18a83acc5d Re-prioritize (X)HTML strict output TODO.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@553 48356398-32a2-884e-a903-53898d9a118a
2006-11-22 03:00:12 +00:00
Edward Z. Yang
f9090e45c0 [1.3.0] Add items for projected 1.3.0 and 1.2.1 releases.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@552 48356398-32a2-884e-a903-53898d9a118a
2006-11-20 03:58:56 +00:00
Edward Z. Yang
450523a9ca [1.2.0] [merged] Bump TODO items.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@547 48356398-32a2-884e-a903-53898d9a118a
2006-11-20 03:21:52 +00:00
Edward Z. Yang
1955527a11 Release 1.2.0.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@543 48356398-32a2-884e-a903-53898d9a118a
2006-11-20 03:16:32 +00:00
Edward Z. Yang
a5751c7f20 [1.2.0] Update new directives file.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@542 48356398-32a2-884e-a903-53898d9a118a
2006-11-20 03:07:46 +00:00
Edward Z. Yang
0960cf6ace [1.2.0] Converted enduser-id.txt to HTML. Fixed summary in index. Added extra style .subsubtitle
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@539 48356398-32a2-884e-a903-53898d9a118a
2006-11-20 02:47:00 +00:00
Edward Z. Yang
83ed9e0fe1 [1.2.0]
- Converted dev-naming and dev-optimization to HTML
- Fixed up failed validation in a few of the other HTML files

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@538 48356398-32a2-884e-a903-53898d9a118a
2006-11-19 04:56:50 +00:00
Edward Z. Yang
fe9238af3a [1.2.0] Nuke 1.1.3 release.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@537 48356398-32a2-884e-a903-53898d9a118a
2006-11-19 04:42:42 +00:00
Edward Z. Yang
f0fe829af4 [1.2.0] Update documentation paths.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@536 48356398-32a2-884e-a903-53898d9a118a
2006-11-19 04:37:26 +00:00
Edward Z. Yang
a3968a1ec7 [1.2.0] Update documentation infrastructure.
- Add filings and link to index
- Update descriptions
- Add an index

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@535 48356398-32a2-884e-a903-53898d9a118a
2006-11-19 04:31:48 +00:00
Edward Z. Yang
a8298172e1 [1.2.0] Rename so that docs have specific categories.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@534 48356398-32a2-884e-a903-53898d9a118a
2006-11-19 03:35:57 +00:00
Edward Z. Yang
90dd7f13ae [1.2.0] HTML-ization for code-quality and colors. Also added in missing $Id$ to progress, and allowed for subtitling in the style.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@533 48356398-32a2-884e-a903-53898d9a118a
2006-11-19 03:10:14 +00:00
Edward Z. Yang
780c7fd309 [1.2.0] Revamp docs
- Style existing HTML files (taken from AuthTools)
- Add svn:eol-style=native and svn:keywords=Id to all file
- Add metadata to HTML files
- Trim DevNetwork by using <base>

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@532 48356398-32a2-884e-a903-53898d9a118a
2006-11-19 02:36:47 +00:00
Edward Z. Yang
dec6c52695 [1.2.0] Add a i18n documentation text.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@531 48356398-32a2-884e-a903-53898d9a118a
2006-11-18 23:58:41 +00:00
Edward Z. Yang
1ea3c1e968 Ignore incubator/ directory.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@530 48356398-32a2-884e-a903-53898d9a118a
2006-11-18 03:40:39 +00:00
Edward Z. Yang
bdab77b59e [1.2.0] Update Devnetwork topic document.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@529 48356398-32a2-884e-a903-53898d9a118a
2006-11-18 03:33:30 +00:00
Edward Z. Yang
82afd890c4 [1.2.0] Non-accessible resources (ex. mailto) blocked from embedded URIs (img src)
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@528 48356398-32a2-884e-a903-53898d9a118a
2006-11-17 23:09:10 +00:00
Edward Z. Yang
b0df2f292f [1.2.0] Migrate feature requests in the code quality document to TODO.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@527 48356398-32a2-884e-a903-53898d9a118a
2006-11-17 22:13:16 +00:00
Edward Z. Yang
7a4c7b3777 [1.2.0] [BC] ID attributes now disabled by default. New directives:
+ %HTML.EnableAttrID - restores old behavior by allowing IDs
  + %Attr.IDPrefix - %Attr.IDBlacklist alternative that munges all user IDs so that they don't collide with your IDs
  + %Attr.IDPrefixLocal - Same as above, but for when there are multiple instances of user content on the page
  + Profuse documentation on how to use these available in id.txt

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@526 48356398-32a2-884e-a903-53898d9a118a
2006-11-17 01:05:41 +00:00
Edward Z. Yang
2dc8e9c3d5 [1.2.0] Unit test housekeeping:
- HTMLPurifier_Context doesn't throw a variable reference error if you attempt to retrieve a non-existent variable
. Cleaned up test-cases to remove unnecessary swallowErrors()

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@525 48356398-32a2-884e-a903-53898d9a118a
2006-11-16 23:58:33 +00:00
Edward Z. Yang
d48f9b6b21 [1.2.0]
- Update TODO
  . Add another possible plaintext formatter
  . Reference config-ideas.txt for URI options
- Update code-quality.txt, removing issues that have been addressed and updating time for post-beta
- Update config-ideas.txt
  . Added more possible URI directives
  . Removed silly language control directive
- Improved documentation on Class, CSS and Host

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@524 48356398-32a2-884e-a903-53898d9a118a
2006-11-12 19:26:49 +00:00
Edward Z. Yang
2df5896324 [1.2.0] Add more projected URI control values.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@523 48356398-32a2-884e-a903-53898d9a118a
2006-11-12 04:02:27 +00:00
Edward Z. Yang
f38fe431ed [1.2.0]
- Added %URI.DisableExternal, which prevents links to external websites. You can also use %URI.Host to permit absolute linking to subdomains
- Fixed a few bugs involving null configuration values

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@522 48356398-32a2-884e-a903-53898d9a118a
2006-11-12 03:35:41 +00:00
Edward Z. Yang
926b94bdd3 [1.2.0] Allow configuration directives to permit null values. ConfigDoc updated accordingly.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@521 48356398-32a2-884e-a903-53898d9a118a
2006-11-12 02:59:36 +00:00
Edward Z. Yang
ad934540da [1.2.0] Merge two comment strings.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@520 48356398-32a2-884e-a903-53898d9a118a
2006-11-12 02:01:39 +00:00
Edward Z. Yang
afee1ea9bf [1.2.0]
- Updated ConfigDoc TODO
- configdoc.xml now has xml:space attached to default value nodes

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@519 48356398-32a2-884e-a903-53898d9a118a
2006-11-12 00:05:27 +00:00
Edward Z. Yang
a6bbe60e7c [1.2.0] Configuration documentation now has table of contents
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@518 48356398-32a2-884e-a903-53898d9a118a
2006-11-08 14:21:06 +00:00
Edward Z. Yang
d2fd193bc4 [1.2.0] Implement primitive email regexp to be used for mailto. There are many spotty implementation issues, so this code is not actually called anywhere else currently.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@517 48356398-32a2-884e-a903-53898d9a118a
2006-11-08 03:10:43 +00:00
Edward Z. Yang
e1b29d7c25 [1.2.0] XSS attacks smoketest given facelift.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@516 48356398-32a2-884e-a903-53898d9a118a
2006-11-08 01:31:38 +00:00
Edward Z. Yang
9668ac1e38 [1.2.0] Add protection against stdclasses into HTMLDefinition.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@514 48356398-32a2-884e-a903-53898d9a118a
2006-11-08 00:11:10 +00:00
Edward Z. Yang
eb6950d7d0 [1.2.0] Fix improper instantiation of stdclasses for '' and '#PCDATA'
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@513 48356398-32a2-884e-a903-53898d9a118a
2006-11-08 00:07:42 +00:00
Edward Z. Yang
4a724d0230 [1.2.0] Add documentation to PercentEncoder.php
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@510 48356398-32a2-884e-a903-53898d9a118a
2006-11-07 17:42:41 +00:00
Edward Z. Yang
504203c0f3 [1.2.0] Added percent encoding normalization
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@509 48356398-32a2-884e-a903-53898d9a118a
2006-11-07 17:15:28 +00:00
Edward Z. Yang
e998b034d1 [1.2.0] Update TODO, reorganized and added an item
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@504 48356398-32a2-884e-a903-53898d9a118a
2006-11-04 05:05:19 +00:00
Edward Z. Yang
84e3a28001 [1.2.0] Type variable in HTMLDefinition was not being set properly, fixed. Minor bug because no other code actually uses the feature (todo: add unit test).
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@503 48356398-32a2-884e-a903-53898d9a118a
2006-11-04 05:03:53 +00:00
Edward Z. Yang
4ee1bf94e3 [1.2.0] Assorted tinyfixes
- Add TODO request about Phalanger, something to do if I'm really bored
- Update XSS attacks
- Minor formatting/grammar fixes in documentation

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@502 48356398-32a2-884e-a903-53898d9a118a
2006-11-03 02:40:37 +00:00
Edward Z. Yang
24f2771304 Add TODO items:
- RTL/LTR override UTF-8 character treatment
- Content compression by removing whitespace

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@501 48356398-32a2-884e-a903-53898d9a118a
2006-10-31 02:17:52 +00:00
Edward Z. Yang
74ba9b8629 [1.2.0] Add context parameter to URIScheme and URISchemeRegistry classes.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@500 48356398-32a2-884e-a903-53898d9a118a
2006-10-27 01:20:10 +00:00
Edward Z. Yang
b9caa35bf4 [1.2.0]
- Add missing reference operator to AttrTransform.php
- Add note on error collection for EntityParser.php
- Add note that IDAccumulator won't collect errors either.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@498 48356398-32a2-884e-a903-53898d9a118a
2006-10-22 16:09:36 +00:00
Edward Z. Yang
6ff78d2f79 Add $config and $context to TagTransform transform() calls.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@497 48356398-32a2-884e-a903-53898d9a118a
2006-10-22 15:56:38 +00:00
Edward Z. Yang
8256ca4376 [1.2.0] Migrate AttrTransform tests to use the Harness supertype.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@496 48356398-32a2-884e-a903-53898d9a118a
2006-10-22 03:38:32 +00:00
Edward Z. Yang
7d2fe4c5d7 [1.2.0]
- Factor out Config and Context object population through arrays
- Bring dependent assertions together in IDTest.php
- AttrDefHarness.php now resets context and configuration between tests
- Add missing reference operator in AttrDef/ID.php

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@494 48356398-32a2-884e-a903-53898d9a118a
2006-10-21 18:18:36 +00:00
Edward Z. Yang
f3646a3a06 [1.2.0]
- Add context parameter to AttrTransform objects.
- Update documentation on attribute transformations in ValidateAttributes.php


git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@493 48356398-32a2-884e-a903-53898d9a118a
2006-10-21 17:27:51 +00:00
Edward Z. Yang
29716bf8f4 Add version number to HTMLPurifier.php. It needs to be bumped on new releases.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@492 48356398-32a2-884e-a903-53898d9a118a
2006-10-21 17:18:40 +00:00
Edward Z. Yang
fb38b02135 [1.2.0] Documentation updated
- Moved docs from EntityParser to Encoder
- Removed/updated docs in Generator

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@490 48356398-32a2-884e-a903-53898d9a118a
2006-10-09 16:07:35 +00:00
Edward Z. Yang
13790c6db2 Added MODx plugin.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@486 48356398-32a2-884e-a903-53898d9a118a
2006-10-02 16:56:47 +00:00
Edward Z. Yang
2d6bf12fe0 [1.2.0]
- All important classes that use Context were migrated. Todo: Classes that currently use $config but not $context are AttrTransform (done in r493) and URIScheme+Registry (done in r500). There may be more classes, incl TagTransform (done in r497) that should have both $config and $context added.
- Strategy unit tests now migrated to use HTMLPurifier_Harness

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@485 48356398-32a2-884e-a903-53898d9a118a
2006-10-01 21:55:13 +00:00
Edward Z. Yang
8f515b9cda [1.2.0]
- Partially finished migrating to new Context object (done in r485).
- Created HTMLPurifier_Harness to assist with testing, ChildDefTest migrated to that framework.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@484 48356398-32a2-884e-a903-53898d9a118a
2006-10-01 20:47:07 +00:00
Edward Z. Yang
58be73fcf7 [1.2.0] Added exists() method to HTMLPurifier_Context.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@483 48356398-32a2-884e-a903-53898d9a118a
2006-10-01 18:39:48 +00:00
Edward Z. Yang
f432a40f50 [1.2.0] Commit initial implementation of Context object, we will be migrating all systems over to it next commit.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@482 48356398-32a2-884e-a903-53898d9a118a
2006-10-01 18:14:08 +00:00
Edward Z. Yang
d660b9018b [1.2.0]
- Add 1.1.3 section in NEWS
- Replace tabs with four spaces in INSTALL
- Renamed data.txt to entities.ser

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@481 48356398-32a2-884e-a903-53898d9a118a
2006-09-30 20:18:08 +00:00
Edward Z. Yang
4d96433c23 [1.1.2] Fix typo in NEWS file.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@479 48356398-32a2-884e-a903-53898d9a118a
2006-09-30 19:34:59 +00:00
Edward Z. Yang
a78f0f5f80 [1.1.2] Bump version number in Doxyfile
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@474 48356398-32a2-884e-a903-53898d9a118a
2006-09-30 19:03:51 +00:00
Edward Z. Yang
d941d30cfa Released 1.1.2.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@473 48356398-32a2-884e-a903-53898d9a118a
2006-09-30 19:02:32 +00:00
Edward Z. Yang
9af9c505e1 [1.1.2]
- Added notes on HTML versus XML attribute whitespace handling
- Noted that HTMLPurifier_ChildDef_Custom isn't being used
- Noted that config object's definitions are cached versions
- Hooked up HTMLPurifier_ChildDef_Custom's unit tests (they weren't being run)
- Tester named "HTML Purifier" not "HTMLPurifier"

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@472 48356398-32a2-884e-a903-53898d9a118a
2006-09-30 18:55:17 +00:00
Edward Z. Yang
7e6a3fc990 [1.1.2] ftp:// URIs now have their typecodes checked
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@471 48356398-32a2-884e-a903-53898d9a118a
2006-09-30 17:24:12 +00:00
Edward Z. Yang
c7e798080c [1.1.2]
- (meta) Updated NEWS document
- Rearranged NEWS into different segments: features/bugfixes/internals

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@470 48356398-32a2-884e-a903-53898d9a118a
2006-09-28 01:28:18 +00:00
Edward Z. Yang
32c5b5080b [1.1.2]
- Add HTMLPurifier.auto.php stub class that automatically configures include path
- Rewrite INSTALL document
- Add semi-lossy dumb character entity conversion to TODO list

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@469 48356398-32a2-884e-a903-53898d9a118a
2006-09-28 00:31:12 +00:00
Edward Z. Yang
cbdd48811d [1.1.2]
- (meta) Add internals note to NEWS document.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@467 48356398-32a2-884e-a903-53898d9a118a
2006-09-27 02:14:53 +00:00
Edward Z. Yang
37def0104b [1.1.2]
- Documentation updated
- API docs now exclude more files that are not classes
- Fixed lack of attribute parsing in HTMLPurifier_Lexer_PEARSax3
- (internal) Refactored parseData() to general Lexer class

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@466 48356398-32a2-884e-a903-53898d9a118a
2006-09-27 02:09:54 +00:00
Edward Z. Yang
d9bb97cc26 [1.1.2] Update NEWS for r464.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@465 48356398-32a2-884e-a903-53898d9a118a
2006-09-25 00:23:33 +00:00
Edward Z. Yang
8bff97ec08 [1.1.2] Mass svn:eol-style=native. data.txt had line ending info taken away, since it is unbiased
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@464 48356398-32a2-884e-a903-53898d9a118a
2006-09-25 00:05:33 +00:00
Edward Z. Yang
fab2b363d0 Released 1.1.1.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@463 48356398-32a2-884e-a903-53898d9a118a
2006-09-24 23:42:14 +00:00
Edward Z. Yang
8e1cfb362d [1.1.1] Update INSTALL docs.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@457 48356398-32a2-884e-a903-53898d9a118a
2006-09-24 22:03:48 +00:00
Edward Z. Yang
1fa5101511 [1.1.1]
- Clarify usage of %Core.TidyFormat
- Add test-settings.sample.php, to facilitate benchmark and unit test running

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@456 48356398-32a2-884e-a903-53898d9a118a
2006-09-24 21:58:14 +00:00
Edward Z. Yang
24663d65ed [1.1.1] To make up for DOMLex's tendency to drop tags, we've added a configuration option to let Tidy cleanup the HTML afterwards. Good for hand-editors. Also, Tidy is a smart solution for pretty-printed HTML, so we're marking the related TODO wontfix.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@454 48356398-32a2-884e-a903-53898d9a118a
2006-09-24 21:23:54 +00:00
Edward Z. Yang
6adbaf0e5c [1.1.1] Removed double-semicolon.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@453 48356398-32a2-884e-a903-53898d9a118a
2006-09-24 19:58:59 +00:00
Edward Z. Yang
81cd9b1ee8 [1.1.1] Grey outputs that are negative.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@452 48356398-32a2-884e-a903-53898d9a118a
2006-09-24 19:48:29 +00:00
Edward Z. Yang
f5ff8acbb0 [1.1.1]
- Gracefully handle error if test-settings.php is not present
- Let test-settings define number of runs.

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@451 48356398-32a2-884e-a903-53898d9a118a
2006-09-24 19:40:28 +00:00
Edward Z. Yang
ad8310c1f5 [1.1.1] Format millisecond timing, run the parser 3 times for a more fair comparison.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@450 48356398-32a2-884e-a903-53898d9a118a
2006-09-24 18:45:24 +00:00
Edward Z. Yang
4b5198c5bc [1.1.1] Augment with time difference.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@449 48356398-32a2-884e-a903-53898d9a118a
2006-09-24 18:36:32 +00:00
Edward Z. Yang
a251ec590f [1.1.1] Error out if PEAR is not enabled on the system, include the test-settings.php file.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@448 48356398-32a2-884e-a903-53898d9a118a
2006-09-24 18:32:41 +00:00
Edward Z. Yang
2bfdfaa02c [1.1.1] Fix bad include paths from ConfigDef to ConfigSchema changes.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@447 48356398-32a2-884e-a903-53898d9a118a
2006-09-24 18:17:05 +00:00
Edward Z. Yang
4abf83af62 [1.1.1] Update TODO.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@446 48356398-32a2-884e-a903-53898d9a118a
2006-09-24 02:08:55 +00:00
Edward Z. Yang
1ad55e0ed5 [1.1.1] As far as possible, preserve whitespace is table internals.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@445 48356398-32a2-884e-a903-53898d9a118a
2006-09-24 02:08:18 +00:00
Edward Z. Yang
6c04bbdac1 [1.1.1]
- Update documentation
- Fix parse error in configuration documentation

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@444 48356398-32a2-884e-a903-53898d9a118a
2006-09-24 02:06:12 +00:00
Edward Z. Yang
c046da638a [1.1.1] Update milestones in progress.html
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@443 48356398-32a2-884e-a903-53898d9a118a
2006-09-23 18:39:27 +00:00
Edward Z. Yang
801dbcafb7 - Update filter-levels document to cover CSS and attributes
- Add colors proposal, for constraining allowed colors in  document
- Add strictness proposal, for attributes that are permitted by Transitional but not by HTML Purifier

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@442 48356398-32a2-884e-a903-53898d9a118a
2006-09-23 18:37:30 +00:00
Edward Z. Yang
4f8d83506d [1.1.1]
- Shuffle around TODO items, we're going to handle the URI deficiencies first
- Fix bugs in documentation :-P

git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@441 48356398-32a2-884e-a903-53898d9a118a
2006-09-23 00:43:21 +00:00
Edward Z. Yang
00fce29467 Add more documentation to HTMLDefinition in anticipation for refactoring.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@440 48356398-32a2-884e-a903-53898d9a118a
2006-09-22 02:47:41 +00:00
Edward Z. Yang
686824262e [1.1.1] Add to list of wanted formatters Linkify URLs.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@439 48356398-32a2-884e-a903-53898d9a118a
2006-09-18 01:13:49 +00:00
Edward Z. Yang
b93892a3b6 [1.1.1] Update documentation and TODO.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@436 48356398-32a2-884e-a903-53898d9a118a
2006-09-17 21:59:40 +00:00
Edward Z. Yang
7a6de55f76 [1.1.1] Update documentation.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@435 48356398-32a2-884e-a903-53898d9a118a
2006-09-17 21:53:12 +00:00
290 changed files with 15081 additions and 4173 deletions

View File

@@ -2,6 +2,6 @@
CREDITS
Almost everything written by Edward Z. Yang (Ambush Commander). Lots of thanks
to the DevNetwork Community for their help (see docs/devnetwork.html for more
details), Feyd especially (namely IPv6 and optimization). Thanks to RSnake for
letting me package his fantastic XSS cheatsheet for a smoketest.
to the DevNetwork Community for their help (see docs/ref-devnetwork.html for
more details), Feyd especially (namely IPv6 and optimization). Thanks to RSnake
for letting me package his fantastic XSS cheatsheet for a smoketest.

View File

@@ -4,7 +4,7 @@
# Project related configuration options
#---------------------------------------------------------------------------
PROJECT_NAME = HTML Purifier
PROJECT_NUMBER = 1.0.0
PROJECT_NUMBER = 1.6.1
OUTPUT_DIRECTORY = "C:/Documents and Settings/Edward/My Documents/My Webs/htmlpurifier/docs/doxygen"
CREATE_SUBDIRS = NO
OUTPUT_LANGUAGE = English
@@ -89,9 +89,12 @@ EXCLUDE =
EXCLUDE_SYMLINKS = NO
EXCLUDE_PATTERNS = */tests/* \
*/benchmarks/* \
*/docs/phpdoc/* \
*/docs/doxygen/* \
*/test-settings.php
*/docs/* \
*/test-settings.php \
*/configdoc/* \
*/test-settings.php \
*/maintenance/* \
*/smoketests/*
EXAMPLE_PATH =
EXAMPLE_PATTERNS = *
EXAMPLE_RECURSIVE = NO

193
INSTALL
View File

@@ -2,141 +2,186 @@
Install
How to install HTML Purifier
Being a library, there's no fancy GUI that will take you step-by-step through
configuring database credentials and other mumbo-jumbo. HTML Purifier is
designed to run "out of the box." Regardless, there are still a couple of
things you should be mindful of.
HTML Purifier is designed to run out of the box, so actually using the library
is extremely easy. (Although, if you were looking for a step-by-step
installation GUI, you've come to the wrong place!) The impatient can scroll
down to the bottom of this INSTALL document to see the code, but you really
should make sure a few things are properly done.
Todo: Convert to using the array syntax for configuration.
1. Compatibility
HTML Purifier works in both PHP 4 and PHP 5, from PHP 4.3.9 and up. It has no
core dependencies with other libraries. (Whoopee!)
Optional extensions are iconv (usually installed) and tidy (also common).
If you use UTF-8 and don't plan on pretty-printing HTML, you can get away with
not having either of these extensions.
0. Compatibility
2. Including the library
HTML Purifier works in both PHP 4 and PHP 5. I have run the test suite on
these versions:
Simply use:
- 4.3.9, 4.3.11
- 4.4.0, 4.4.4
- 5.0.0, 5.0.4
- 5.1.0, 5.1.6
require_once '/path/to/library/HTMLPurifier.auto.php';
And can confidently say that HTML Purifier should work in all versions
between and afterwards. HTML Purifier definitely does not support PHP 4.2,
and PHP 4.3 branch support may go further back than that, but I haven't tested
any earlier versions.
...and you're good to go. Since HTML Purifier's codebase is fairly
large, I recommend only including HTML Purifier when you need it.
I have been unable to get PHP 5.0.5 working on my computer, so if someone
wants to test that, be my guest. All tests were done on Windows XP Home,
but operating system should not be a major factor in the library.
1. Including the proper files
The library/ directory must be added to your path: HTML Purifier will not be
able to find the necessary includes otherwise. This is as simple as:
set_include_path('/path/to/htmlpurifier/library' . PATH_SEPARATOR .
get_include_path() );
...replacing /path/to/htmlpurifier with the actual location of the folder. Don't
worry, HTML Purifier is namespaced so unless you have another file named
HTMLPurifier.php, the files won't collide with any of your includes.
Then, it's a simple matter of including the base file:
If you don't like your include_path to be fiddled around with, simply set
HTML Purifier's library/ directory to the include path yourself and then:
require_once 'HTMLPurifier.php';
...and you're good to go.
Only the contents in the library/ folder are necessary, so you can remove
everything else when using HTML Purifier in a production environment.
2. Preparing the proper environment
3. Preparing the proper output environment
While no configuration is necessary, you first should take precautions regarding
the other output HTML that the filtered content will be going along with. Here
is a (short) checklist:
HTML Purifier is all about web-standards, so accordingly your webpages should
be standards compliant. HTML Purifier can deal with these doctypes:
* Have I specified XHTML 1.0 Transitional as the doctype?
* Have I specified UTF-8 as the character encoding?
* XHTML 1.0 Transitional (default)
* XHTML 1.0 Strict
* HTML 4.01 Transitional
* HTML 4.01 Strict
...and these character encodings:
* UTF-8 (default)
* Any encoding iconv supports (support is crippled for i18n though)
The defaults are there for a reason: they are best-practice choices that
should not be changed lightly. For those of you in the dark, you can determine
the doctype from this code in your HTML documents:
To find out what these are, browse to your website and view its source code.
You can figure out the doctype from the a declaration that looks like
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
or no doctype. You can figure out the character encoding by looking for
...and the character encoding from this code:
<meta http-equiv="Content-type" content="text/html;charset=ENCODING">
I cannot stress the importance of these two bullets enough. Omitting either
of them could have dire consequences not only for security but for plain
old usability. You can find a more in-depth discussion of why this is needed
in docs/security.txt, in the meantime, try to change your output so this is
the case. If you can't, well, we might be able to accomodate you (read
section 3).
For legacy codebases these declarations may be missing. If that is the case,
STOP, and read up on character encodings and doctypes (in that order). Here
are some links:
* http://www.joelonsoftware.com/articles/Unicode.html
* http://alistapart.com/stories/doctype/
You may currently be vulnerable to XSS and other security threats, and HTML
Purifier won't be able to fix that.
3. Configuring HTML Purifier
4. Configuration
HTML Purifier is designed to run out-of-the-box, but occasionally HTML
Purifier needs to be told what to do.
Purifier needs to be told what to do. If you answered no to any of these
questions, read on, otherwise, you can skip to the next section (or, if you're
into configuring things just for the heck of it, skip to 4.3).
If, for some reason, you are unable to switch to UTF-8 immediately, you can
switch HTML Purifier's encoding. Note that the availability of encodings is
dependent on iconv, and you'll be missing characters if the charset you
choose doesn't have them.
* Am I using UTF-8?
* Am I using XHTML 1.0 Transitional?
If you answered no to any of these questions, instantiate a configuration
object and read on:
$config = HTMLPurifier_Config::createDefault();
4.1. Setting a different character encoding
You really shouldn't use any other encoding except UTF-8, especially if you
plan to support multilingual websites (read section three for more details).
However, switching to UTF-8 is not always immediately feasible, so we can
adapt.
HTML Purifier uses iconv to support other character encodings, as such,
any encoding that iconv supports <http://www.gnu.org/software/libiconv/>
HTML Purifier supports with this code:
$config->set('Core', 'Encoding', /* put your encoding here */);
An example usage for Latin-1 websites:
An example usage for Latin-1 websites (the most common encoding for English
websites):
$config->set('Core', 'Encoding', 'ISO-8859-1');
Note that HTML Purifier's support for non-Unicode encodings is crippled by the
fact that any character not supported by that encoding will be silently
dropped, EVEN if it is ampersand escaped. This is a current limitation of
HTML Purifier that we are NOT actively working to fix. Patches are welcome,
but there are so many other gotchas and problems in I18N for non-Unicode
encodings that this functionality is low priority. See
<http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html> for a more
detailed lowdown on the topic.
4.2. Setting a different doctype
For those of you stuck using HTML 4.01 Transitional, you can disable
XHTML output like this:
$config->set('Core', 'XHTML', false);
However, I strongly recommend that you use XHTML. Currently, we can only
guarantee transitional-complaint output, future versions will also allow strict
output.
I recommend that you use XHTML, although not as much as I recommend UTF-8. If
your HTML 4.01 page validates, good for you!
Currently, we can only guarantee transitional-complaint output, future
versions will also allow strict-compliant output.
3. Using the code
4.3. Other settings
There are more configuration directives which can be read about
here: <http://htmlpurifier.org/live/configdoc/plain.html> They're a bit boring,
but they can help out for those of you who like to exert maximum control over
your code.
5. Using the code
The interface is mind-numbingly simple:
$purifier = new HTMLPurifier();
$clean_html = $purifier->purify($dirty_html);
$clean_html = $purifier->purify( $dirty_html );
Or, if you're using the configuration object:
...or, if you're using the configuration object:
$purifier = new HTMLPurifier($config);
$clean_html = $purifier->purify($dirty_html);
$clean_html = $purifier->purify( $dirty_html );
That's it. For more examples, check out docs/examples/. Also, SLOW gives
advice on what to do if HTML Purifier is slowing down your application.
That's it! For more examples, check out docs/examples/ (they aren't very
different though). Also, SLOW gives advice on what to do if HTML Purifier
is slowing down your application.
4. Quick install
6. Quick install
If your website is in UTF-8 and XHTML Transitional, use this code:
<?php
set_include_path('/path/to/htmlpurifier/library'
. PATH_SEPARATOR . get_include_path() );
require_once 'HTMLPurifier.php';
$purifier = new HTMLPurifier();
require_once '/path/to/htmlpurifier/library/HTMLPurifier.auto.php';
$purifier = new HTMLPurifier();
$clean_html = $purifier->purify($dirty_html);
?>
If your website is in a different encoding or doctype, use this code:
<?php
set_include_path('/path/to/htmlpurifier/library'
. PATH_SEPARATOR . get_include_path() );
require_once 'HTMLPurifier.php';
require_once '/path/to/htmlpurifier/library/HTMLPurifier.auto.php';
$config = HTMLPurifier_Config::createDefault();
$config->set('Core', 'Encoding', 'ISO-8859-1'); //replace with your encoding

71
INSTALL.fr.utf8 Normal file
View File

@@ -0,0 +1,71 @@

Installation
Comment installer HTML Purifier
Attention: Ce document a encode en UTF-8. Si les lettres avec les accents
est essoreuse, prenez un mieux editeur de texte.
À L'Aide: Je ne suis pas un diseur natif de français. Si vous trouvez une
erreur dans ce document, racontez-moi! Merci.
L'installation de HTML Purifier est trés simple, parce qu'il ne doit pas
la configuration. Dans le pied de de document, les utilisateurs
impatient peuvent trouver le code, mais je recommande que vous lisez
ce document pour quelques choses.
1. Compatibilité
HTML Purifier fonctionne dans PHP 4 et PHP 5. PHP 4.3.9 est le dernier
version que je le testais. Il ne dépend de les autre librairies.
Les extensions optionnel est iconv (en général déjà installer) et
tidy (répandu aussi). Si vous utilisez UTF-8 et ne voulez pas
l'indentation, vous pouvez utiliser HTML Purifier sans ces extensions.
2. Inclure la librarie
Utilisez:
require_once '/path/to/library/HTMLPurifier.auto.php';
...quand vous devez utiliser HTML Purifier (ne inclure pas quand vous
ne devez pas, parce que HTML Purifier est trés grand.)
Si vous n'aime pas que HTML Purifier change vos include_path, on peut
change vos include_path, et:
require_once 'HTMLPurifier.php';
Seuleument les contents dans library/ est essentiel; vous peut enlever
les autre fichiers quand vous est dans une atmosphère professionnel.
[En cours de construction]
6. Installation vite
Si votre site web est en UTF-8 et XHTML Transitional, utilisez:
<?php
require_once '/path/to/htmlpurifier/library/HTMLPurifier.auto.php';
$purificateur = new HTMLPurifier();
$html_propre = $purificateur->purify($html_salle);
?>
Sinon, utilisez:
<?php
require_once '/path/to/htmlpurifier/library/HTMLPurifier.auto.php';
$config = HTMLPurifier_Config::createDefault();
$config->set('Core', 'Encoding', 'ISO-8859-1'); //remplacez avec votre encoding
$config->set('Core', 'XHTML', true); //remplacez avec false si HTML 4.01
$purificateur = new HTMLPurifier($config);
$html_propre = $purificateur->purify($html_salle);
?>

257
NEWS
View File

@@ -1,20 +1,243 @@
NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
1.2.0, unknown projected release date
(major feature release)
= KEY ====================
# Breaks back-compat
! Feature
- Bugfix
+ Sub-comment
. Internal change
==========================
1.1.1, unknown projected release date
(bugfix release)
1.7.0, unknown release date
1.6.1, released 2007-05-05
! Support for more deprecated attributes via transformations:
+ hspace and vspace in img
+ size and noshade in hr
+ nowrap in td
+ clear in br
+ align in caption, table, img and hr
+ type in ul, ol and li
! DirectLex now preserves text in which a < bracket is followed by
a non-alphanumeric character. This means that certain emoticons
are now preserved.
! %Core.RemoveInvalidImg is now operational, when set to false invalid
images will hang around with an empty src
! target attribute in a tag supported, use %Attr.AllowedFrameTargets
to enable
! CSS property white-space now allows nowrap (supported in all modern
browsers) but not others (which have spotty browser implementations)
! XHTML 1.1 mode now sort-of works without any fatal errors, and
lang is now moved over to xml:lang.
! Attribute transformation smoketest available at smoketests/attrTransform.php
! Transformation of font's size attribute now handles super-large numbers
- Possibly fatal bug with __autoload() fixed in module manager
- Invert HTMLModuleManager->addModule() processing order to check
prefixes first and then the literal module
- Empty strings get converted to empty arrays instead of arrays with
an empty string in them.
- Merging in attribute lists now works.
. Demo script removed: it has been added to the website's repository
. Basic.php script modified to work out of the box
. Refactor AttrTransform classes to reduce duplication
. AttrTransform_TextAlign axed in favor of a more general
AttrTransform_EnumToCSS, refer to HTMLModule/TransformToStrict.php to
see how the new equivalent is implemented
. Unit tests now use exclusively assertIdentical
1.6.0, released 2007-04-01
! Support for most common deprecated attributes via transformations:
+ bgcolor in td, th, tr and table
+ border in img
+ name in a and img
+ width in td, th and hr
+ height in td, th
! Support for CSS attribute 'height' added
! Support for rel and rev attributes in a tags added, use %Attr.AllowedRel
and %Attr.AllowedRev to activate
- You can define ID blacklists using regular expressions via
%Attr.IDBlacklistRegexp
- Error messages are emitted when you attempt to "allow" elements or
attributes that HTML Purifier does not support
1.5.1, unknown release date
- Fix segfault in unit test. The problem is not very reproduceable and
I don't know what causes it, but a six line patch fixed it.
1.5.0, released 2007-03-23
! Added a rudimentary I18N and L10N system modeled off MediaWiki. It
doesn't actually do anything yet, but keep your eyes peeled.
! docs/enduser-utf8.html explains how to use UTF-8 and HTML Purifier
! Newly structured HTMLDefinition modeled off of XHTML 1.1 modules.
I am loathe to release beta quality APIs, but this is exactly that;
don't use the internal interfaces if you're not willing to do migration
later on.
- Allow 'x' subtag in language codes
- Fixed buggy chameleon-support for ins and del
. Added support for IDREF attributes (i.e. for)
. Renamed HTMLPurifier_AttrDef_Class to HTMLPurifier_AttrDef_Nmtokens
. Removed context variable ParentType, replaced with IsInline, which
is false when you're not inline and an integer of the parent that
caused you to become inline when you are (so possibly zero)
. Removed ElementDef->type in favor of ElementDef->descendants_are_inline
and HTMLDefinition->content_sets
. StrictBlockquote now reports what elements its supposed to allow,
rather than what it does allow
. Removed HTMLDefinition->info_flow_elements in favor of
HTMLDefinition->content_sets['Flow']
. Removed redundant "exclusionary" definitions from DTD roster
. StrictBlockquote now requires a construction parameter as if it
were an Required ChildDef, this is the "real" set of allowed elements
. AttrDef partitioned into HTML, CSS and URI segments
. Modify Youtube filter regexp to be multiline
. Require both PHP5 and DOM extension in order to use DOMLex, fixes
some edge cases where a DOMDocument class exists in a PHP4 environment
due to DOM XML extension.
1.4.1, released 2007-01-21
! docs/enduser-youtube.html updated according to new functionality
- YouTube IDs can have underscores and dashes
1.4.0, released 2007-01-21
! Implemented list-style-image, URIs now allowed in list-style
! Implemented background-image, background-repeat, background-attachment
and background-position CSS properties. Shorthand property background
supports all of these properties.
! Configuration documentation looks nicer
! Added %Core.EscapeNonASCIICharacters to workaround loss of Unicode
characters while %Core.Encoding is set to a non-UTF-8 encoding.
! Support for configuration directive aliases added
! Config object can now be instantiated from ini files
! YouTube preservation code added to the core, with two lines of code
you can add it as a filter to your code. See smoketests/preserveYouTube.php
for sample code.
! Moved SLOW to docs/enduser-slow.html and added code examples
- Replaced version check with functionality check for DOM (thanks Stephen
Khoo)
. Added smoketest 'all.php', which loads all other smoketests via frames
. Implemented AttrDef_CSSURI for url(http://google.com) style declarations
. Added convenient single test selector form on test runner
1.3.2, released 2006-12-25
! HTMLPurifier object now accepts configuration arrays, no need to manually
instantiate a configuration object
! Context object now accessible to outside
! Added enduser-youtube.html, explains how to embed YouTube videos. See
also corresponding smoketest preserveYouTube.php.
! Added purifyArray(), which takes a list of HTML and purifies it all
! Added static member variable $version to HTML Purifier with PHP-compatible
version number string.
- Fixed fatal error thrown by upper-cased language attributes
- printDefinition.php: added labels, added better clarification
. HTMLPurifier_Config::create() added, takes mixed variable and converts into
a HTMLPurifier_Config object.
1.3.1, released 2006-12-06
! Added HTMLPurifier.func.php stub for a convenient function to call the library
- Fixed bug in RemoveInvalidImg code that caused all images to be dropped
(thanks to .mario for reporting this)
. Standardized all attribute handling variables to attr, made it plural
1.3.0, released 2006-11-26
# Invalid images are now removed, rather than replaced with a dud
<img src="" alt="Invalid image" />. Previous behavior can be restored
with new directive %Core.RemoveInvalidImg set to false.
! (X)HTML Strict now supported
+ Transparently handles inline elements in block context (blockquote)
! Added GET method to demo for easier validation, added 50kb max input size
! New directive %HTML.BlockWrapper, for block-ifying inline elements
! New directive %HTML.Parent, allows you to only allow inline content
! New directives %HTML.AllowedElements and %HTML.AllowedAttributes to let
users narrow the set of allowed tags
! <li value="4"> and <ul start="2"> now allowed in loose mode
! New directives %URI.DisableExternalResources and %URI.DisableResources
! New directive %Attr.DisableURI, which eliminates all hyperlinking
! New directive %URI.Munge, munges URI so you can use some sort of redirector
service to avoid PageRank leaks or warn users that they are exiting your site.
! Added spiffy new smoketest printDefinition.php, which lets you twiddle with
the configuration settings and see how the internal rules are affected.
! New directive %URI.HostBlacklist for blocking links to bad hosts.
xssAttacks.php smoketest updated accordingly.
- Added missing type to ChildDef_Chameleon
- Remove Tidy option from demo if there is not Tidy available
. ChildDef_Required guards against empty tags
. Lookup table HTMLDefinition->info_flow_elements added
. Added peace-of-mind variable initialization to Strategy_FixNesting
. Added HTMLPurifier->info_parent_def, parent child processing made special
. Added internal documents briefly summarizing future progression of HTML
. HTMLPurifier_Config->getBatch($namespace) added
. More lenient casting to bool from string in HTMLPurifier_ConfigSchema
. Refactored ChildDef classes into their own files
1.2.0, released 2006-11-19
# ID attributes now disabled by default. New directives:
+ %HTML.EnableAttrID - restores old behavior by allowing IDs
+ %Attr.IDPrefix - %Attr.IDBlacklist alternative that munges all user IDs
so that they don't collide with your IDs
+ %Attr.IDPrefixLocal - Same as above, but for when there are multiple
instances of user content on the page
+ Profuse documentation on how to use these available in docs/enduser-id.txt
! Added MODx plugin <http://modxcms.com/forums/index.php/topic,6604.0.html>
! Added percent encoding normalization
! XSS attacks smoketest given facelift
! Configuration documentation now has table of contents
! Added %URI.DisableExternal, which prevents links to external websites. You
can also use %URI.Host to permit absolute linking to subdomains
! Non-accessible resources (ex. mailto) blocked from embedded URIs (img src)
- Type variable in HTMLDefinition was not being set properly, fixed
- Documentation updated
+ TODO added request Phalanger
+ TODO added request Native compression
+ TODO added request Remove redundant tags
+ TODO added possible plaintext formatter for HTML Purifier documentation
+ Updated ConfigDoc TODO
+ Improved inline comments in AttrDef/Class.php, AttrDef/CSS.php
and AttrDef/Host.php
+ Revamped documentation into HTML, along with misc updates
- HTMLPurifier_Context doesn't throw a variable reference error if you attempt
to retrieve a non-existent variable
. Switched to purify()-wide Context object registry
. Refactored unit tests to minimize duplication
. XSS attack sheet updated
. configdoc.xml now has xml:space attached to default value nodes
. Allow configuration directives to permit null values
. Cleaned up test-cases to remove unnecessary swallowErrors()
1.1.2, released 2006-09-30
! Add HTMLPurifier.auto.php stub file that configures include_path
- Documentation updated
+ INSTALL document rewritten
+ TODO added semi-lossy conversion
+ API Doxygen docs' file exclusions updated
+ Added notes on HTML versus XML attribute whitespace handling
+ Noted that HTMLPurifier_ChildDef_Custom isn't being used
+ Noted that config object's definitions are cached versions
- Fixed lack of attribute parsing in HTMLPurifier_Lexer_PEARSax3
- ftp:// URIs now have their typecodes checked
- Hooked up HTMLPurifier_ChildDef_Custom's unit tests (they weren't being run)
. Line endings standardized throughout project (svn:eol-style standardized)
. Refactored parseData() to general Lexer class
. Tester named "HTML Purifier" not "HTMLPurifier"
1.1.1, released 2006-09-24
! Configuration option to optionally Tidy up output for indentation to make up
for dropped whitespace by DOMLex (pretty-printing for the entire application
should be done by a page-wide Tidy)
- Various documentation updates
- Fixed parse error in configuration documentation script
- Fixed fatal error in benchmark scripts, slightly augmented
- As far as possible, whitespace is preserved in-between table children
- Sample test-settings.php file included
1.1.0, released 2006-09-16
! Directive documentation generation using XSLT
! XHTML can now be turned off, output becomes <br>
- Made URI validator more forgiving: will ignore leading and trailing
quotes, apostrophes and less than or greater than signs.
- Enforce alphanumeric namespace and directive names for configuration.
- Directive documentation generation using XSLT
- Table child definition made more flexible, will fix up poorly ordered elements
- XHTML generation can now be turned off, allowing things like <br>
- Renamed ConfigDef to ConfigSchema
. Renamed ConfigDef to ConfigSchema
1.0.1, released 2006-09-04
- Fixed slight bug in DOMLex attribute parsing
@@ -24,17 +247,17 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
space in them. This manifested in TinyMCE.
1.0.0, released 2006-09-01
! Shorthand CSS properties implemented: font, border, background, list-style
! Basic color keywords translated into hexadecimal values
! Table CSS properties implemented
! Support for charsets other than UTF-8 (defined by iconv)
! Malformed UTF-8 and non-SGML character detection and cleaning implemented
- Fixed broken numeric entity conversion
- Malformed UTF-8 and non-SGML character detection and cleaning implemented
- API documentation completed
- Shorthand CSS properties implemented: font, border, background, list-style
- Basic color keywords translated into hexadecimal values
- Table CSS properties implemented
- (HTML|CSS)Definition de-singleton-ized
- Support for charsets other than UTF-8 (defined by iconv)
. (HTML|CSS)Definition de-singleton-ized
1.0.0beta, released 2006-08-16
- First public release, most functionality implemented. Notable omissions are:
. Shorthand CSS properties
. Table CSS properties
. Deprecated attribute transformations
! First public release, most functionality implemented. Notable omissions are:
+ Shorthand CSS properties
+ Table CSS properties
+ Deprecated attribute transformations

33
README
View File

@@ -1,13 +1,22 @@
README
All about HTMLPurifier
HTMLPurifier is an HTML filtering solution. It uses a unique combination of
robust whitelists and agressive parsing to ensure that not only are XSS
attacks thwarted, but the resulting HTML is standards compliant.
See INSTALL on how to use the library. See docs/ for more developer-oriented
documentation as well as some code examples. Users of TinyMCE or FCKeditor
may be especially interested in WYSIWYG.
HTMLPurifier can be found on the web at: http://hp.jpsband.org/
README
All about HTML Purifier
HTML Purifier is an HTML filtering solution that uses a unique combination
of robust whitelists and agressive parsing to ensure that not only are
XSS attacks thwarted, but the resulting HTML is standards compliant.
HTML Purifier is oriented towards richly formatted documents from
untrusted sources that require CSS and a full tag-set. This library can
be configured to accept a more restrictive set of tags, but it won't be
as efficient as more bare-bones parsers. It will, however, do the job
right, which may be more important.
Places to go:
* See INSTALL for a quick installation guide
* See docs/ for developer-oriented documentation, code examples and
an in-depth installation guide.
* See WYSIWYG for information on editors like TinyMCE and FCKeditor
HTML Purifier can be found on the web at: http://htmlpurifier.org/

34
SLOW
View File

@@ -1,34 +0,0 @@
SLOW
also known as the HELP ME LIBRARY IS TOO SLOW MY PAGE TAKE TOO LONG LOAD page
HTMLPurifier is a very powerful library. But with power comes great
responsibility, or, at least, longer execution times. Remember, this
library isn't lightly grazing over submitted HTML: it's deconstructing
the whole thing, rigorously checking the parts, and then putting it
back together.
So, if it so turns out that HTMLPurifier is kinda too slow for outbound
filtering, you've got a few options:
1. Inbound filtering - perform filtering of HTML when it's submitted by the
user. Since the user is already submitting something, an extra half a
second tacked on to the load time probably isn't going to be that huge of
a problem. Then, displaying the content is a simple a manner of outputting
it directly from your database/filesystem. The trouble with this method is
that your user loses the original text, and when doing edits, will be
handling the filtered text. Of course, maybe that's a good thing. If you
don't mind a little extra complexity, you can try...
2. Caching the filtered output - accept the submitted text and put it
unaltered into the database, but then also generate a filtered version and
stash that in the database. Serve the filtered version to readers, and the
unaltered version to editors. If need be, you can invalidate the cache and
have the cached filtered version be regenerated on the first page view. Pros?
Full data retention. Cons? It's more complicated.
In short, inbound filtering is almost as simple as outbound filtering, but
it has some drawbacks which cannot be fixed unless you save both the original
and the filtered versions.
There is a third option: profile and optimize HTMLPurifier yourself. ;-)

117
TODO
View File

@@ -1,46 +1,95 @@
TODO List
Ongoing
- Lots of profiling, make it faster!
- Plugins for major CMSes (very tricky issue)
= KEY ====================
# Flagship
- Regular
? Maybe I'll Do It
==========================
1.2 release
- Additional support for poorly written HTML
- Implement all non-essential attribute transforms
- Microsoft Word HTML cleaning (i.e. MsoNormal)
1.7 release [Advanced API]
# Complete advanced API, and fully document it
# Implement all edge-case attribute transforms
# Implement all deprecated tags and attributes
- Parse TinyMCE-style whitelist into our %HTML.Allow* whitelists (possibly
do this earlier)
? HTML interface for tweaking configuration to see changes
1.3 release
- Formatters for plaintext
1.8 release [Refactor, refactor!]
# URI validation routines tighter (see docs/dev-code-quality.html) (COMPLEX)
# Advanced URI filtering schemes (see docs/proposal-new-directives.txt)
- Configuration profiles: predefined directives set with one func call
- Implement IDREF support (harder than it seems, since you cannot have
IDREFs to non-existent IDs)
- Allow non-ASCII characters in font names
1.9 release [Error'ed]
# Error logging for filtering/cleanup procedures
- Requires I18N facilities to be created first (COMPLEX)
- XSS-attempt detection
- More fine-grained control over escaping behavior
- Silently drop content inbetween SCRIPT tags (can be generalized to allow
specification of elements that, when detected as foreign, trigger removal
of children, although unbalanced tags could wreck havoc (or at least
delete the rest of the document)).
1.10 release [Do What I Mean, Not What I Say]
# Additional support for poorly written HTML
- Microsoft Word HTML cleaning (i.e. MsoNormal, but research essential!)
- Friendly strict handling of <address> (block -> <br>)
- Remove redundant tags, ex. <u><u>Underlined</u></u>. Implementation notes:
1. Analyzing which tags to remove duplicants
2. Ensure attributes are merged into the parent tag
3. Extend the tag exclusion system to specify whether or not the
contents should be dropped or not (currently, there's code that could do
something like this if it didn't drop the inner text too.)
- Remove <span> tags that don't do anything (no attributes)
- Remove empty inline tags<i></i>
- Append something to duplicate IDs so they're still usable (impl. note: the
dupe detector would also need to detect the suffix as well)
2.0 release [Beyond HTML]
# Legit token based CSS parsing (will require revamping almost every
AttrDef class)
# Formatters for plaintext (COMPLEX)
- Auto-paragraphing (be sure to leverage fact that we know when things
shouldn't be paragraphed, such as lists and tables).
- Make URI validation routines tighter (especially mailto)
- More extensive URI filtering schemes
- Allow for background-image and list-style-image (see above)
- Distinguish between different types of URIs, for instance, a mailto URI
in IMG SRC is nonsensical
2.0 release
- Add various "levels" of cleaning
- Related: Allow strict (X)HTML
3.0 release
- Extended HTML capabilities based on namespacing and tag transforms
- Hooks for adding custom processors to custom namespaced tags and
attributes, offer default implementation
- Lots of documentation and samples
Unknown release (on a scratch-an-itch basis)
- Silently drop content inbetween SCRIPT tags (can be generalized to allow
specification of elements that, when detected as foreign, trigger removal
of children, although unbalanced tags could wreck havoc (or at least delete
the rest of the document)).
- Linkify URLs
- Smileys
- Linkification for HTML Purifier docs: notably configuration and classes
- Allow tags to be "armored", an internal flag that protects them
from validation and passes them out unharmed
- Fixes for Firefox's inability to handle COL alignment props (Bug 915)
- Automatically add non-breaking spaces to empty table cells when
empty-cells:show is applied to have compatibility with Internet Explorer
- Pretty-printing HTML (adds dependency of Generator to HTMLDefinition)
- Non-lossy dumb alternate character encoding transformations, achieved by
numerically encoding all non-ASCII characters
- Convert RTL/LTR override characters to <bdo> tags, or vice versa on demand.
Also, enable disabling of directionality
3.0 release [To XML and Beyond]
- Extended HTML capabilities based on namespacing and tag transforms (COMPLEX)
- Hooks for adding custom processors to custom namespaced tags and
attributes, offer default implementation
- Lots of documentation and samples
- XHTML 1.1 support
Ongoing
- Lots of profiling, make it faster!
- Plugins for major CMSes (COMPLEX)
- WordPress (mostly written, needs beta-testing)
- eFiction
- more! (look for ones that use WYSIWYGs)
Unknown release (on a scratch-an-itch basis)
? Semi-lossy dumb alternate character encoding transfor
? Have 'lang' attribute be checked against official lists, achieved by
encoding all characters that have string entity equivalents
- Explain how to use HTML Purifier in non-PHP languages
Requested
? Native content compression, whitespace stripping (don't rely on Tidy, make
sure we don't remove from <pre> or related tags)
Wontfix
- Non-lossy smart alternate character encoding transformations
- Non-lossy smart alternate character encoding transformations (unless
patch provided)
- Pretty-printing HTML, users can use Tidy on the output on entire page

1
VERSION Normal file
View File

@@ -0,0 +1 @@
1.6.1

7
WHATSNEW Normal file
View File

@@ -0,0 +1,7 @@
The 1.6.1 release, code-named 'Ach! We missed something! Run!', completes
HTML Purifier's roster of attribute transformations. It also implements
a number of minor features (such as better font transformations, smarter
HTML parsing, the CSS property 'white-space' and XHTML 1.1), a few bug
fixes (most notably fixed __autoload compatibility issues) and a ton
of refactoring. 1.6 was for things that absolutely could not wait: this
release, developed in a more leisurely pace, fills in the gaps.

View File

@@ -18,4 +18,5 @@ HTML Purifier is perfect for filtering pure-HTML input from WYSIWYG editors.
Enough said.
There is a proof-of-concept integration of HTML Purifier with the Mantis
bugtracker at http://hp.jpsband.org/mantis/
bugtracker at http://hp.jpsband.org/mantis/ You can see notes on how
this integration was acheived at http://hp.jpsband.org/mantis_notes.txt

BIN
art/1000passes.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.4 KiB

View File

@@ -3,15 +3,25 @@
// emulates inserting a dir called HTMLPurifier into your class dir
set_include_path(get_include_path() . PATH_SEPARATOR . '../library/');
require_once 'HTMLPurifier/ConfigDef.php';
require_once 'HTMLPurifier/Config.php';
require_once 'HTMLPurifier/Lexer/DirectLex.php';
require_once 'HTMLPurifier/Lexer/PEARSax3.php';
@include_once '../test-settings.php';
$LEXERS = array(
'DirectLex' => new HTMLPurifier_Lexer_DirectLex(),
'PEARSax3' => new HTMLPurifier_Lexer_PEARSax3()
);
require_once 'HTMLPurifier/ConfigSchema.php';
require_once 'HTMLPurifier/Config.php';
require_once 'HTMLPurifier/Context.php';
$LEXERS = array();
$RUNS = isset($GLOBALS['HTMLPurifierTest']['Runs'])
? $GLOBALS['HTMLPurifierTest']['Runs'] : 2;
require_once 'HTMLPurifier/Lexer/DirectLex.php';
$LEXERS['DirectLex'] = new HTMLPurifier_Lexer_DirectLex();
if (!empty($GLOBALS['HTMLPurifierTest']['PEAR'])) {
require_once 'HTMLPurifier/Lexer/PEARSax3.php';
$LEXERS['PEARSax3'] = new HTMLPurifier_Lexer_PEARSax3();
} else {
exit('PEAR required to perform benchmark.');
}
if (version_compare(PHP_VERSION, '5', '>=')) {
require_once 'HTMLPurifier/Lexer/DOMLex.php';
@@ -56,9 +66,12 @@ class RowTimer extends Benchmark_Timer
if ($standard == false) $standard = $v['diff'];
$perc = $v['diff'] * 100 / $standard;
$bad_run = ($v['diff'] < 0);
$out .= '<td align="right">' . number_format($perc, 2, '.', '') .
'%</td>';
$out .= '<td align="right"'.
($bad_run ? ' style="color:#AAA;"' : '').
'>' . number_format($perc, 2, '.', '') .
'%</td><td>'.number_format($v['diff'],4,'.','').'</td>';
}
@@ -79,13 +92,16 @@ function print_lexers() {
}
function do_benchmark($name, $document) {
global $LEXERS;
global $LEXERS, $RUNS;
$config = HTMLPurifier_Config::createDefault();
$context = new HTMLPurifier_Context();
$timer = new RowTimer($name);
$timer->start();
foreach($LEXERS as $key => $lexer) {
$tokens = $lexer->tokenizeHTML($document);
for ($i=0; $i<$RUNS; $i++) $tokens = $lexer->tokenizeHTML($document, $config, $context);
$timer->setMarker($key);
}
@@ -103,7 +119,7 @@ function do_benchmark($name, $document) {
<table border="1">
<tr><th>Case</th><?php
foreach ($LEXERS as $key => $value) {
echo '<th>' . htmlspecialchars($key) . '</th>';
echo '<th colspan="2">' . htmlspecialchars($key) . '</th>';
}
?></tr>
<?php

View File

@@ -2,15 +2,18 @@
set_include_path(get_include_path() . PATH_SEPARATOR . '../library/');
require_once 'HTMLPurifier/ConfigDef.php';
require_once 'HTMLPurifier/ConfigSchema.php';
require_once 'HTMLPurifier/Config.php';
require_once 'HTMLPurifier/Lexer/DirectLex.php';
require_once 'HTMLPurifier/Context.php';
$input = file_get_contents('samples/Lexer/4.html');
$lexer = new HTMLPurifier_Lexer_DirectLex();
$config = HTMLPurifier_Config::createDefault();
$context = new HTMLPurifier_Context();
for ($i = 0; $i < 10; $i++) {
$tokens = $lexer->tokenizeHTML($input);
$tokens = $lexer->tokenizeHTML($input, $config, $context);
}
?>

View File

@@ -12,10 +12,8 @@ TODO:
- multipage documentation
- determine how to multilingualize
- factor out code into classes
- generate a table of contents
*/
// ---------------------------------------------------------------------------
// Check and configure environment
@@ -26,8 +24,7 @@ error_reporting(E_ALL);
// ---------------------------------------------------------------------------
// Include HTML Purifier library
set_include_path('../library' . PATH_SEPARATOR . get_include_path());
require_once 'HTMLPurifier.php';
require_once '../library/HTMLPurifier.auto.php';
// ---------------------------------------------------------------------------
@@ -50,7 +47,7 @@ function appendHTMLDiv($document, $node, $html) {
// ---------------------------------------------------------------------------
// Load copies of HTMLPurifier_ConfigDef and HTMLPurifier
$definition = HTMLPurifier_ConfigDef::instance();
$schema = HTMLPurifier_ConfigSchema::instance();
$purifier = new HTMLPurifier();
@@ -61,7 +58,7 @@ $types_document = new DOMDocument('1.0', 'UTF-8');
$types_root = $types_document->createElement('types');
$types_document->appendChild($types_root);
$types_document->formatOutput = true;
foreach ($definition->types as $name => $expanded_name) {
foreach ($schema->types as $name => $expanded_name) {
$types_type = $types_document->createElement('type', $expanded_name);
$types_type->setAttribute('id', $name);
$types_root->appendChild($types_type);
@@ -82,13 +79,10 @@ $dom_root->appendChild($dom_document->createElement('title', 'HTML Purifier'));
/*
TODO for XML format:
- namespace descriptions
- enumerated values
- default values
- create a definition (DTD or other) once interface stabilizes
*/
foreach($definition->info as $namespace_name => $namespace_info) {
foreach($schema->info as $namespace_name => $namespace_info) {
$dom_namespace = $dom_document->createElement('namespace');
$dom_root->appendChild($dom_namespace);
@@ -100,10 +94,12 @@ foreach($definition->info as $namespace_name => $namespace_info) {
$dom_namespace_description = $dom_document->createElement('description');
$dom_namespace->appendChild($dom_namespace_description);
appendHTMLDiv($dom_document, $dom_namespace_description,
$definition->info_namespace[$namespace_name]->description);
$schema->info_namespace[$namespace_name]->description);
foreach ($namespace_info as $name => $info) {
if ($info->class == 'alias') continue;
$dom_directive = $dom_document->createElement('directive');
$dom_namespace->appendChild($dom_directive);
@@ -115,9 +111,12 @@ foreach($definition->info as $namespace_name => $namespace_info) {
$dom_constraints = $dom_document->createElement('constraints');
$dom_directive->appendChild($dom_constraints);
$dom_constraints->appendChild(
$dom_document->createElement('type', $info->type)
);
$dom_type = $dom_document->createElement('type', $info->type);
if ($info->allow_null) {
$dom_type->setAttribute('allow-null', 'yes');
}
$dom_constraints->appendChild($dom_type);
if ($info->allowed !== true) {
$dom_allowed = $dom_document->createElement('allowed');
$dom_constraints->appendChild($dom_allowed);
@@ -128,19 +127,25 @@ foreach($definition->info as $namespace_name => $namespace_info) {
}
}
$raw_default = $definition->defaults[$namespace_name][$name];
$raw_default = $schema->defaults[$namespace_name][$name];
if (is_bool($raw_default)) {
$default = $raw_default ? 'true' : 'false';
} elseif (is_string($raw_default)) {
$default = "\"$raw_default\"";
} elseif (is_null($raw_default)) {
$default = 'null';
} else {
$default = print_r(
$definition->defaults[$namespace_name][$name], true
$schema->defaults[$namespace_name][$name], true
);
}
$dom_constraints->appendChild(
$dom_document->createElement('default', $default)
);
$dom_default = $dom_document->createElement('default', $default);
// remove this once we get a DTD
$dom_default->setAttribute('xml:space', 'preserve');
$dom_constraints->appendChild($dom_default);
$dom_descriptions = $dom_document->createElement('descriptions');
$dom_directive->appendChild($dom_descriptions);
@@ -182,7 +187,7 @@ $xsl_processor->importStylesheet($xsl_dom_stylesheet);
$html_output = $xsl_processor->transformToXML($dom_document);
// some slight fudges to preserve backwards compatibility
$html_output = str_replace('/>', ' />', $html_output); // <br /> not <br>
$html_output = str_replace('/>', ' />', $html_output); // <br /> not <br/>
$html_output = str_replace(' xmlns=""', '', $html_output); // rm unnecessary xmlns
if (class_exists('Tidy')) {

View File

@@ -1,7 +1,24 @@
table {border-collapse:collapse;}
table td, table th {padding:0.2em;}
table.constraints {margin:0 0 1em;}
table.constraints th {text-align:left;padding-left:0.4em;}
table.constraints td {padding-right:0.4em;}
table.constraints td pre {margin:0;}
body {margin:1em 4em;}
table {border-collapse:collapse;}
table td, table th {padding:0.2em;}
table.constraints {margin:0 0 1em;}
table.constraints th {text-align:left;padding-left:0.4em;}
table.constraints td {padding-right:0.4em;}
table.constraints td pre {margin:0;}
#toc {list-style-type:none; font-weight:bold;}
#toc ul {list-style-type:disc; font-weight:normal;}
.description p {margin-top:0;margin-bottom:1em;}
#library, h1 {text-align:center; font-family:Garamond, serif;
font-variant:small-caps;}
#library {font-size:1em;}
h1 {margin-top:0;}
h2 {border-bottom:1px solid #CCC; font-family:sans-serif; font-weight:normal;
font-size:1.3em;}
h3 {font-family:sans-serif; font-size:1.1em; font-weight:bold; }
h4 {font-family:sans-serif; font-size:0.9em; font-weight:bold; }

View File

@@ -1,105 +1,127 @@
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet
version = "1.0"
xmlns = "http://www.w3.org/1999/xhtml"
xmlns:xsl = "http://www.w3.org/1999/XSL/Transform"
>
<xsl:output
method = "xml"
encoding = "UTF-8"
doctype-public = "-//W3C//DTD XHTML 1.0 Transitional//EN"
doctype-system = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"
indent = "no"
media-type = "text/html"
/>
<xsl:variable name="typeLookup" select="document('../types.xml')" />
<xsl:template match="/">
<html lang="en" xml:lang="en">
<head>
<title><xsl:value-of select="/configdoc/title" /> Configuration Documentation</title>
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
<link rel="stylesheet" type="text/css" href="styles/plain.css" />
</head>
<body>
<xsl:apply-templates />
</body>
</html>
</xsl:template>
<xsl:template match="title">
<h1><xsl:value-of select="/configdoc/title" /> Configuration Documentation</h1>
</xsl:template>
<xsl:template match="namespace">
<xsl:apply-templates />
<xsl:if test="count(child::directive)=0">
<p>No configuration directives defined for this namespace.</p>
</xsl:if>
</xsl:template>
<xsl:template match="namespace/name">
<h2 id="{../@id}"><xsl:value-of select="text()" /></h2>
</xsl:template>
<xsl:template match="namespace/description">
<div class="description">
<xsl:copy-of select="div/node()" />
</div>
</xsl:template>
<xsl:template match="directive">
<xsl:apply-templates />
</xsl:template>
<xsl:template match="directive/name">
<h3 id="{../@id}"><xsl:value-of select="text()" /></h3>
</xsl:template>
<xsl:template match="directive/constraints">
<table class="constraints">
<xsl:apply-templates />
<!-- Calculated other values -->
<tr>
<th>Used by:</th>
<td>
<xsl:for-each select="../descriptions/description">
<xsl:if test="position()&gt;1">, </xsl:if>
<xsl:value-of select="@file" />
</xsl:for-each>
</td>
</tr>
</table>
</xsl:template>
<xsl:template match="directive//description">
<div class="description">
<xsl:copy-of select="div/node()" />
</div>
</xsl:template>
<xsl:template match="constraints/type">
<tr>
<th>Type:</th>
<td>
<xsl:variable name="type" select="text()" />
<xsl:attribute name="class">type type-<xsl:value-of select="$type" /></xsl:attribute>
<xsl:value-of select="$typeLookup/types/type[@id=$type]/text()" />
</td>
</tr>
</xsl:template>
<xsl:template match="constraints/allowed">
<tr>
<th>Allowed values:</th>
<td>
<xsl:for-each select="value"><!--
--><xsl:if test="position()&gt;1">, </xsl:if>
&quot;<xsl:value-of select="." />&quot;<!--
--></xsl:for-each>
</td>
</tr>
</xsl:template>
<xsl:template match="constraints/default">
<tr>
<th>Default:</th>
<td><pre><xsl:value-of select="." xml:space="preserve" /></pre></td>
</tr>
</xsl:template>
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet
version = "1.0"
xmlns = "http://www.w3.org/1999/xhtml"
xmlns:xsl = "http://www.w3.org/1999/XSL/Transform"
>
<xsl:output
method = "xml"
encoding = "UTF-8"
doctype-public = "-//W3C//DTD XHTML 1.0 Transitional//EN"
doctype-system = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"
indent = "no"
media-type = "text/html"
/>
<xsl:variable name="typeLookup" select="document('../types.xml')" />
<xsl:template match="/">
<html lang="en" xml:lang="en">
<head>
<title>Configuration Documentation - <xsl:value-of select="/configdoc/title" /></title>
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
<link rel="stylesheet" type="text/css" href="styles/plain.css" />
</head>
<body>
<div id="library"><xsl:value-of select="/configdoc/title" /></div>
<h1>Configuration Documentation</h1>
<h2>Table of Contents</h2>
<ul id="toc">
<xsl:apply-templates mode="toc" />
</ul>
<xsl:apply-templates />
</body>
</html>
</xsl:template>
<xsl:template match="title" mode="toc" />
<xsl:template match="namespace" mode="toc">
<xsl:if test="count(directive)&gt;0">
<li>
<a href="#{@id}"><xsl:value-of select="name" /></a>
<ul>
<xsl:apply-templates select="directive" mode="toc" />
</ul>
</li>
</xsl:if>
</xsl:template>
<xsl:template match="directive" mode="toc">
<li><a href="#{@id}"><xsl:value-of select="name" /></a></li>
</xsl:template>
<xsl:template match="title" />
<xsl:template match="namespace">
<xsl:apply-templates />
<xsl:if test="count(directive)=0">
<p>No configuration directives defined for this namespace.</p>
</xsl:if>
</xsl:template>
<xsl:template match="namespace/name">
<h2 id="{../@id}"><xsl:value-of select="." /></h2>
</xsl:template>
<xsl:template match="namespace/description">
<div class="description">
<xsl:copy-of select="div/node()" />
</div>
</xsl:template>
<xsl:template match="directive">
<xsl:apply-templates />
</xsl:template>
<xsl:template match="directive/name">
<h3 id="{../@id}"><xsl:value-of select="../@id" /></h3>
</xsl:template>
<xsl:template match="directive/constraints">
<table class="constraints">
<xsl:apply-templates />
<!-- Calculated other values -->
<tr>
<th>Used by:</th>
<td>
<xsl:for-each select="../descriptions/description">
<xsl:if test="position()&gt;1">, </xsl:if>
<xsl:value-of select="@file" />
</xsl:for-each>
</td>
</tr>
</table>
</xsl:template>
<xsl:template match="directive//description">
<div class="description">
<xsl:copy-of select="div/node()" />
</div>
</xsl:template>
<xsl:template match="constraints/type">
<tr>
<th>Type:</th>
<td>
<xsl:variable name="type" select="text()" />
<xsl:attribute name="class">type type-<xsl:value-of select="$type" /></xsl:attribute>
<xsl:value-of select="$typeLookup/types/type[@id=$type]/text()" />
<xsl:if test="@allow-null='yes'">
(or null)
</xsl:if>
</td>
</tr>
</xsl:template>
<xsl:template match="constraints/allowed">
<tr>
<th>Allowed values:</th>
<td>
<xsl:for-each select="value"><!--
--><xsl:if test="position()&gt;1">, </xsl:if>
&quot;<xsl:value-of select="." />&quot;<!--
--></xsl:for-each>
</td>
</tr>
</xsl:template>
<xsl:template match="constraints/default">
<tr>
<th>Default:</th>
<td><pre><xsl:value-of select="." xml:space="preserve" /></pre></td>
</tr>
</xsl:template>
</xsl:stylesheet>

View File

@@ -1,39 +0,0 @@
Code Quality Issues
Okay, face it. Programmers can get lazy, cut corners, or make mistakes. They
also can do quick prototypes, and then forget to rewrite them later. Well,
while I can't list mistakes in here, I can list prototype-like segments
of code that should be aggressively refactored after the beta is released.
This does not list optimization issues, that needs to be done after intense
profiling.
Here we go:
AttrDef
Class - doesn't support Unicode characters, uses regular expressions
Lang - code duplication, premature optimization, doesn't consult official
lists
Pixels/Length/MultiLength - implemented according to HTML spec (excludes
code reuse in CSS)
URI - multiple regular expressions, needs host validation routines factored
out for mailto scheme, IPv6 validation is broken (fringe), unintuitive
variable overwriting, missing validation for query, fragment and path,
no percent-encode fixing
CSS - parser doesn't accept advanced CSS (fringe)
Number - constructor interface is inconsistent with Integer
AttrTransform - doesn't accept AttrContext, non-validating
ChildDef - not-allowed nodes translated to text, likely invalid handling
Config - "load configuration" hooks missing, rich set* accessors missing
Strategy
FixNesting - cannot bubble nodes out of structures
MakeWellFormed - insufficient automatic closing definitions (check HTML
spec for optional end tags).
RemoveForeignElements - should be run in parallel with MakeWellFormed
URIScheme - needs to have callable generic checks
ftp - missing typecode check
mailto - doesn't validate emails
news - doesn't validate opaque path
nntp - doesn't constrain path
EOL

View File

@@ -1,46 +0,0 @@
Configuration Ideas
Here are some theoretical configuration ideas that we could implement some
time. Note the naming convention: %Namespace.Directive
%Attr.IDPrefix - prefix all ids with this
%Attr.RewriteFragments - if there's %Attr.IDPrefix we may want to transparently
rewrite the URLs we parse too. However, we can only do it when it's a pure
anchor link, so it's not foolproof
%Attr.ClassBlacklist,
%Attr.ClassWhitelist,
%Attr.ClassListMode - determines what classes are allowed. When
%Attr.ClassListMode is set to Blacklist, only allow those not in
%Attr.ClassBlacklist. When it's Whitelist, only allow those in
%Attr.ClassWhitelist.
%Attr.LangAlphaOnly - designate whether or not to allow numerals in language
code subtags
* RFC 1766, the current standard referenced by XML, does not permit
numbers, but,
* RFC 3066, the superseding best practice standard since January 2001,
permits them.
We allow numbers by default, but you generally never see them
at all, which makes this a little more sane.
%Attr.MaxWidth,
%Attr.MaxHeight - caps for width and height related checks.
(a hack in Pixels for an image crashing attack could be replaced by this)
%URI.Munge - will munge all URIs to a different URI, which should redirect
the user to the applicable page. A urlencoded version of the URI
will replace any instances of %s in the string. One possible
string is 'http://www.google.com/url?q=%s'. Useful for preventing
pagerank from being sent to other sites
%URI.AddRelNofollow - will add rel="nofollow" to all links, preventing the
spread of ill-gotten pagerank
%URI.Host - host of website, for external link checks
%URI.RelativeToAbsolute - transforms all relative URIs to absolute form
%URI.DisableExternal - disable external links

287
docs/dev-advanced-api.html Normal file
View File

@@ -0,0 +1,287 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="description" content="Functional specification for HTML Purifier's advanced API for defining custom filtering behavior." />
<link rel="stylesheet" type="text/css" href="style.css" />
<title>Advanced API - HTML Purifier</title>
</head><body>
<h1>Advanced API</h1>
<div id="filing">Filed under Development</div>
<div id="index">Return to the <a href="index.html">index</a>.</div>
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
<p>HTML Purifier currently natively supports only a subset of HTML's
allowed elements, attributes, and behavior. This is by design,
but as the user is always right, they'll need some method to overload
these behaviors.</p>
<p>Our goals are to let the user:</p>
<dl>
<dt>Select</dt>
<dd><ul>
<li>Doctype</li>
<li>Mode: Lenient / Correctional</li>
<li>Elements / Attributes / Modules</li>
<li>Filterset</li>
</ul></dd>
<dt>Customize</dt>
<dd><ul>
<li>Attributes</li>
<li>Elements</li>
</ul></dd>
<dt>Internals</dt>
<dd><ul>
<li>Modules / Elements / Attributes / Attribute Types</li>
<li>Filtersets</li>
<li>Doctype</li>
</ul></dd>
</dl>
<h2>Select</h2>
<p>For basic use, the user will have to specify some basic parameters. This
is not strictly necessary, as HTML Purifier's default setting will always
output safe code, but is required for standards-compliant output.</p>
<h3>Selecting a Doctype</h3>
<p>The first thing to select is the <strong>doctype</strong>. This
is essential for standards-compliant output.</p>
<p class="technical">This identifier is based
on the name the W3C has given to the document type and <em>not</em>
the DTD identifier.</p>
<p>This parameter is set via the configuration object:</p>
<pre>$config->set('HTML', 'Doctype', 'XHTML 1.0 Transitional');</pre>
<p>Due to historical reasons, the default doctype is XHTML 1.0
Transitional, however, we really shouldn't be guessing what the user's
doctype is. Fortunantely, people who can't be bothered to set this won't
be bothered when their pages stop validating.</p>
<h3>Selecting Mode</h3>
<p>Within doctypes, there are various <strong>modes</strong> of operation.
These indicate variant behaviors that, while not strictly changing the
allowed set of elements and attributes, definitely affect the output.
Currently, we have two modes, which may be used together:</p>
<dl>
<dt>Lenient</dt>
<dd>
<p>Deprecated elements and attributes will be transformed into
standards-compliant alternatives when explicitly disallowed.</p>
<p>For example, in the XHTML 1.0 Strict doctype, a <code>center</code>
element would be turned into a <code>div</code> with the CSS property
<code>text-align:center;</code>, but in XHTML 1.0 Transitional
the element would be preserved.</p>
<p>This mode is on by default.</p>
</dd>
<dt>Correctional[items to correct]</dt>
<dd>
<p>Deprecated elements and attributes will be transformed into
standards-compliant alternatives whenever possible.
It may have various levels of operation.</p>
<p>Referring back to the previous example, the <code>center</code> element would
be transformed in both cases. However, elements without a
reasonable standards-compliant alternative will be preserved
in their form.</p>
<p>A user may want to correct certain deprecated attributes, but
not others. For example, the <code>bgcolor</code> attribute may be
acceptable, but the <code>center</code> element not; also, possibly,
an HTML Purifier transformation may be buggy, so the user wants
to forgo it. Thus, correctional accepts an array defining which
elements and attributes to cleanup, or no parameter at all, which
means everything gets corrected. This also means that each
correction needs to be given a unique ID that can be referenced
in this manner. (We may also allow globbing, like *.name or a.*
for mass-enabling correction, and subtractive mode, where things
specified stop correction.) This array gets passed into the
constructor of the mode's module.</p>
<p>This mode is on by default.</p>
</dd>
</dl>
<p>A possible call to select modes would be:</p>
<pre>$config->set('HTML', 'Mode', array('correctional', 'lenient'));</pre>
<p>If modes have extra parameters, a hash is necessary:</p>
<pre>$config->set('HTML', 'Mode', array(
'correctional' => 'center,a.name',
'lenient' => true // this one's just boolean
));</pre>
<p>Modes may be specified along with the doctype declaration (we may want
to get a better set of separator characters):</p>
<pre>$config->setDoctype('XHTML Transitional 1.0', '+correctional[center,a.name] -lenient');</pre>
<p>
With regards to the various levels of operation conjectured in the
Correctional mode, this is prompted by the fact that a user may want to
correct certain problems but not others, for example, fix the <code>center</code>
element but not the <code>u</code> element, both of which are deprecated.
Having an integer <q>level</q> will not work very well for such fine
grained tweaking, but an array of specific settings might.</p>
<h3>Selecting Elements / Attributes / Modules</h3>
<p></p>
<p>If this cookie cutter approach doesn't appeal to a user, they may
decide to roll their own filterset by selecting modules, elements and
attributes to allow.</p>
<p class="technical">This would make use of the same facilities
as a filterset author would use, except that it would go under an
<q>anonymous</q> filterset that would be auto-selected if any of the
relevant module/elements/attribute selection configuration directives were
non-null.</p>
<p>In practice, this is the most commonly demanded feature. Most users are
perfectly happy defining a filterset that looks like:</p>
<pre>$config->setAllowedHTML('a[href,title];em;p;blockquote');</pre>
<p class="technical">The directive %HTML.Allowed is a convenience function
that may be fully expressed with the legacy interface, and thus is
given its own setter.</p>
<p>We currently support a separated interface, which also must be preserved:</p>
<pre>$config->set('HTML', 'AllowedElements', 'a,em,p,blockquote');
$config->set('HTML', 'AllowedAttributes', 'a.href,a.title');</pre>
<p>A user may also choose to allow modules:</p>
<pre>$config->set('HTML', 'AllowedModules', 'Hypertext,Text,Lists'); // or
$config->setAllowedHTML('Hypertext,Text,Lists');</pre>
<p>But it is not expected that this feature will be widely used.</p>
<p class="fixme">The granularity of these modules is too coarse for
the average user (for example, the core module loads everything from
the essential <code>p</code> element to the not-so-safe <code>h1</code>
element). How do we make this still a viable solution? Possible answers
may be sub-modules or module parameters. This may not even be a problem,
considering that most people won't be selecting modules.</p>
<p class="technical">Modules are distinguished from regular elements by the
case of their first letter. While XML distinguishes between and allows
lower and uppercase letters in element names, most well-known XML
languages use only lower-case
element names for sake of consistency.</p>
<p class="technical">Considering that, internally speaking, as mandated by
the XHTML 1.1 Modularization specification, we have organized our
elements around modules, considerable gymnastics will be needed to
get this sort of functionality working.</p>
<h3>Unified selector</h3>
<p>Because selecting each and every one of these configuration options
is a chore, we may wish to offer a specialized configuration method
for selecting a filterset. Possibility:</p>
<pre>function selectFilter($doctype, $filterset, $mode)</pre>
<p>...which is simply a light wrapper over the individual configuration
calls. A custom config file format or text format could also be adopted.</p>
<h2>Customize</h2>
<p>By reviewing topic posts in the support forum, we determined that
there were two primarily demanded customization features people wanted:
to add an attribute to an existing element, and to add an element.
Thus, we'll want to create convenience functions for these common
use-cases.</p>
<p>Note that the functions described here are only available if
a raw copy of <code>HTMLPurifier_HTMLDefinition</code> was retrieved.
<code>addAttribute</code> may work on a processed copy, but for
consistency's sake we will mandate this for everything.</p>
<h3>Attributes</h3>
<p>An attribute is bound to an element by a name and has a specific
<code>AttrDef</code> that validates it. Thus, the interface should
be:</p>
<pre>function addAttribute($element, $attribute, $attribute_def);</pre>
<p>With a use-case that looks like:</p>
<pre>$def->addAttribute('a', 'rel', new HTMLPurifier_AttrDef_Enum(array('nofollow')));</pre>
<p>The <code>$attribute_def</code> value can be a little flexible,
to make things simpler. We'll let it also be:</p>
<ul>
<li>Class name: We'll instantiate it for you</li>
<li>Function name: We'll create an <code>HTMLPurifier_AttrDef_Anonymous</code>
class with that function registered as a callback.</li>
<li>String attribute type: We'll use <code>HTMLPurifier_AttrTypes</code>
</li>
<li>String starting with <code>enum(</code>: We'll explode it and stuff it in an
<code>HTMLPurifier_AttrDef_Enum</code> for you.</li>
</ul>
<p>Making the previous example written as:</p>
<pre>$def->addAttribute('a', 'rel', 'enum(nofollow)');</pre>
<h3>Elements</h3>
<p>An element requires certain information as specified by
<code>HTMLPurifier_ElementDef</code>. However, not all of it is necessary,
the usual things required are:</p>
<ul>
<li>Attributes</li>
<li>Content model/type</li>
<li>Registration in a content set</li>
</ul>
<p>This suggests an API like this:</p>
<pre>function addElement($element, $type, $content_model, $attributes = array());</pre>
<p>Each parameter explained in depth:</p>
<dl>
<dt><code>$element</code></dt>
<dd>Element name, ex. 'label'</dd>
<dt><code>$type</code></dt>
<dd>Content set to register in, ex. 'Inline' or 'Flow'</dd>
<dt><code>$content_model</code></dt>
<dd>Description of allowed children. This is a merged form of
<code>HTMLPurifier_ElementDef</code>'s member variables
<code>$content_model</code> and <code>$content_model_type</code>,
where the form is <q>Type: Model</q>, ex. 'Optional: Inline'.</dd>
<dt><code>$attributes</code></dt>
<dd>Array of attribute names to attribute definitions, much like
the above-described attribute customization.</dd>
</dl>
<p>A possible usage:</p>
<pre>$def->addElement('font', 'Inline', 'Optional: Inline',
array(0 => array('Common'), 'color' => 'Color'));</pre>
<p>We may want to Common attribute collection inclusion to be added
by default.</p>
<div id="version">$Id$</div>
</body></html>

32
docs/dev-code-quality.txt Normal file
View File

@@ -0,0 +1,32 @@
Code Quality Issues
Okay, face it. Programmers can get lazy, cut corners, or make mistakes. They
also can do quick prototypes, and then forget to rewrite them later. Well,
while I can't list mistakes in here, I can list prototype-like segments
of code that should be aggressively refactored. This does not list
optimization issues, that needs to be done after intense profiling.
docs/examples/demo.php - ad hoc HTML/PHP soup to the extreme
AttrDef - a lot of duplication, more generic classes need to be created;
a lot of strtolower() calls, no legit casing
Class - doesn't support Unicode characters (fringe); uses regular
expressions
Lang - code duplication; premature optimization
Length - easily mistaken for CSSLength
URI - multiple regular expressions; missing validation for parts (?)
CSS - parser doesn't accept advanced CSS (fringe)
Number - constructor interface inconsistent with Integer
ConfigSchema - redefinition is a mess
Strategy
FixNesting - cannot bubble nodes out of structures, duplicated checks
for special-case parent node
MakeWellFormed - insufficient automatic closing definitions (check HTML
spec for optional end tags, also, closing based on type (block/inline)
might be efficient).
RemoveForeignElements - should be run in parallel with MakeWellFormed
URIScheme - needs to have callable generic checks
mailto - doesn't validate emails, doesn't validate querystring
news - doesn't validate opaque path
nntp - doesn't constrain path

82
docs/dev-naming.html Normal file
View File

@@ -0,0 +1,82 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="description" content="Defines class naming conventions in HTML Purifier." />
<link rel="stylesheet" type="text/css" href="./style.css" />
<title>Naming Conventions - HTML Purifier</title>
</head><body>
<h1>Naming Conventions</h1>
<div id="filing">Filed under Development</div>
<div id="index">Return to the <a href="index.html">index</a>.</div>
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
<p>The classes in this library follow a few naming conventions, which may
help you find the correct functionality more quickly. Here they are:</p>
<dl>
<dt>All classes occupy the HTMLPurifier pseudo-namespace.</dt>
<dd>This means that all classes are prefixed with HTMLPurifier_. As such, all
names under HTMLPurifier_ are reserved. I recommend that you use the name
HTMLPurifierX_YourName_ClassName, especially if you want to take advantage
of HTMLPurifier_ConfigDef.</dd>
<dt>All classes correspond to their path if library/ was in the include path</dt>
<dd>HTMLPurifier_AttrDef is located at HTMLPurifier/AttrDef.php; replace
underscores with slashes and append .php and you'll have the location of
the class.</dd>
<dt>Harness and Test are reserved class names for unit tests</dt>
<dd>The suffix <code>Test</code> indicates that the class is a subclass of UnitTestCase
(of the Simpletest library) and is testable. "Harness" indicates a subclass
of UnitTestCase that is not meant to be run but to be extended into
concrete test cases and contains custom test methods (i.e. assert*())</dd>
<dt>Class names do not necessarily represent inheritance hierarchies</dt>
<dd>While we try to reflect inheritance in naming to some extent, it is not
guaranteed (for instance, none of the classes inherit from HTMLPurifier,
the base class). However, all class files have the require_once
declarations to whichever classes they are tightly coupled to.</dd>
<dt>Strategy has a meaning different from the Gang of Four pattern</dt>
<dd>In Design Patterns, the Gang of Four describes a Strategy object as
encapsulating an algorithm so that they can be switched at run-time. While
our strategies are indeed algorithms, they are not meant to be substituted:
all must be present in order for proper functioning.</dd>
<dt>Abbreviations are avoided</dt>
<dd>We try to avoid abbreviations as much as possible, but in some cases,
abbreviated version is more readable than the full version. Here, we
list common abbreviations:
<ul>
<li>Attr to Attributes (note that it is plural, i.e. <code>$attr = array()</code>)</li>
<li>Def to Definition</li>
<li><code>$ret</code> is the value to be returned in a function</li>
</ul>
</dd>
<dt>Ambiguity concerning the definition of Def/Definition</dt>
<dd>While a definition normally defines the structure/acceptable values of
an entity, most of the definitions in this application also attempt
to validate and fix the value. I am unsure of a better name, as
"Validator" would exclude fixing the value, "Fixer" doesn't invoke
the proper image of "fixing" something, and "ValidatorFixer" is too long!
Some other suggestions were "Handler", "Reference", "Check", "Fix",
"Repair" and "Heal".</dd>
<dt>Transform not Transformer</dt>
<dd>Transform is both a noun and a verb, and thus we define a "Transform" as
something that "transforms," leaving "Transformer" (which sounds like an
electrical device/robot toy).</dd>
</dl>
<div id="version">$Id$</div>
</body></html>

View File

@@ -0,0 +1,33 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="description" content="Discusses possible methods of optimizing HTML Purifier." />
<link rel="stylesheet" type="text/css" href="./style.css" />
<title>Optimization - HTML Purifier</title>
</head><body>
<h1>Optimization</h1>
<div id="filing">Filed under Development</div>
<div id="index">Return to the <a href="index.html">index</a>.</div>
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
<p>Here are some possible optimization techniques we can apply to code sections if
they turn out to be slow. Be sure not to prematurely optimize: if you get
that itch, put it here!</p>
<ul>
<li>Make Tokens Flyweights (may prove problematic, probably not worth it)</li>
<li>Rewrite regexps into PHP code</li>
<li>Serialize the Definition object</li>
<li>Batch regexp validation (do as many per function call as possible)</li>
<li>Parallelize strategies</li>
</ul>
<div id="version">$Id$</div>
</body></html>

View File

@@ -1,292 +1,302 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
<title>HTMLPurifier Progress</title>
<style type="text/css">
td {padding-right:1em;border-bottom:1px solid #000;padding-left:0.5em;}
th {text-align:left;padding-top:1.4em;font-size:13pt;
border-bottom:2px solid #000;background:#FFF;}
thead th {text-align:left;padding:0.1em;background-color:#EEE;}
.impl-yes {background:#9D9;}
.impl-partial {background:#FFA;}
.impl-no {background:#CCC;}
.danger {color:#600;}
.css1 {color:#060;}
.required {font-weight:bold;}
.feature {color:#999;}
</style>
</head><body>
<h1>HTMLPurifier Progress</h1>
<h2>Key</h2>
<table cellspacing="0"><tbody>
<tr><td class="impl-yes">Implemented</td></tr>
<tr><td class="impl-partial">Partially implemented</td></tr>
<tr><td class="impl-no">Will not implement</td></tr>
<tr><td class="danger">Dangerous attribute/property</td></tr>
<tr><td class="css1">Present in CSS1</td></tr>
<tr><td class="feature">Feature, requires extra work</td></tr>
</tbody></table>
<h3>CSS</h3>
<table cellspacing="0">
<thead>
<tr><th>Name</th><th>Notes</th></tr>
</thead>
<!--
<tr><td>-</td><td>-</td></tr>
-->
<tbody>
<tr><th colspan="2">Standard</th></tr>
<tr class="css1 impl-yes"><td>background-color</td><td>COMPOSITE(&lt;color&gt;, transparent)</td></tr>
<tr class="css1 impl-yes"><td>background</td><td>SHORTHAND, only for color, see below for info on background-image and friends</td></tr>
<tr class="css1 impl-yes"><td>border</td><td>SHORTHAND, MULTIPLE</td></tr>
<tr class="css1 impl-yes"><td>border-color</td><td>MULTIPLE</td></tr>
<tr class="css1 impl-yes"><td>border-style</td><td>MULTIPLE</td></tr>
<tr class="css1 impl-yes"><td>border-width</td><td>MULTIPLE</td></tr>
<tr class="css1 impl-yes"><td>border-*</td><td>SHORTHAND</td></tr>
<tr class="impl-yes"><td>border-*-color</td><td>COMPOSITE(&lt;color&gt;, transparent)</td></tr>
<tr class="impl-yes"><td>border-*-style</td><td>ENUM(none, hidden, dotted, dashed,
solid, double, groove, ridge, inset, outset)</td></tr>
<tr class="css1 impl-yes"><td>border-*-width</td><td>COMPOSITE(&lt;length&gt;, thin, medium, thick)</td></tr>
<tr class="css1 impl-yes"><td>clear</td><td>ENUM(none, left, right, both)</td></tr>
<tr class="css1 impl-yes"><td>color</td><td>&lt;color&gt;</td></tr>
<tr class="css1 impl-yes"><td>float</td><td>ENUM(left, right, none), May require layout
precautions with clear</td></tr>
<tr class="css1 impl-yes"><td>font</td><td>SHORTHAND</td></tr>
<tr class="css1 impl-yes"><td>font-family</td><td>CSS validator may complain if fallback font
family not specified</td></tr>
<tr class="css1 impl-yes"><td>font-size</td><td>COMPOSITE(&lt;absolute-size&gt;,
&lt;relative-size&gt;, &lt;length&gt;, &lt;percentage&gt;)</td></tr>
<tr class="css1 impl-yes"><td>font-style</td><td>ENUM(normal, italic, oblique)</td></tr>
<tr class="css1 impl-yes"><td>font-variant</td><td>ENUM(normal, small-caps)</td></tr>
<tr class="css1 impl-yes"><td>font-weight</td><td>ENUM(normal, bold, bolder, lighter,
100, 200, 300, 400, 500, 600, 700, 800, 900), maybe special code for
in-between integers</td></tr>
<tr class="css1 impl-yes"><td>letter-spacing</td><td>COMPOSITE(&lt;length&gt;, normal)</td></tr>
<tr class="css1 impl-yes"><td>line-height</td><td>COMPOSITE(&lt;number&gt;,
&lt;length&gt;, &lt;percentage&gt;, normal)</td></tr>
<tr class="css1 impl-yes"><td>list-style-position</td><td>ENUM(inside, outside),
Strange behavior in browsers</td></tr>
<tr class="css1 impl-yes"><td>list-style-type</td><td>ENUM(...),
Well-supported values are: disc, circle, square,
decimal, lower-roman, upper-roman, lower-alpha and upper-alpha. See also
CSS 3. Mostly IE lack of support.</td></tr>
<tr class="css1 impl-yes"><td>list-style</td><td>SHORTHAND, target milestone 1.0</td></tr>
<tr class="css1 impl-yes"><td>margin</td><td>MULTIPLE</td></tr>
<tr class="css1 impl-yes"><td>margin-*</td><td>COMPOSITE(&lt;length&gt;,
&lt;percentage&gt;, auto)</td></tr>
<tr class="css1 impl-yes"><td>padding</td><td>MULTIPLE</td></tr>
<tr class="css1 impl-yes"><td>padding-*</td><td>COMPOSITE(&lt;length&gt;(positive),
&lt;percentage&gt;(positive))</td></tr>
<tr class="css1 impl-yes"><td>text-align</td><td>ENUM(left, right,
center, justify)</td></tr>
<tr class="css1 impl-yes"><td>text-decoration</td><td>No blink (argh my eyes), not
enum, can be combined (composite sorta): underline, overline,
line-through</td></tr>
<tr class="css1 impl-yes"><td>text-indent</td><td>COMPOSITE(&lt;length&gt;,
&lt;percentage&gt;)</td></tr>
<tr class="css1 impl-yes"><td>text-transform</td><td>ENUM(capitalize, uppercase,
lowercase, none)</td></tr>
<tr class="css1 impl-yes"><td>width</td><td>COMPOSITE(&lt;length&gt;,
&lt;percentage&gt;, auto), Interesting</td></tr>
<tr class="css1 impl-yes"><td>word-spacing</td><td>COMPOSITE(&lt;length&gt;, auto),
IE 5 no support</td></tr>
</tbody>
<tbody>
<tr><th colspan="2">Table</th></tr>
<tr class="impl-yes"><td>border-collapse</td><td>ENUM(collapse, seperate)</td></tr>
<tr class="impl-yes"><td>caption-side</td><td>ENUM(top, bottom)</td></tr>
<tr class="feature"><td>empty-cells</td><td>ENUM(show, hide), No IE support makes this useless,
possible fix with &amp;nbsp;? Unknown release milestone.</td></tr>
<tr class="impl-yes"><td>table-layout</td><td>ENUM(auto, fixed)</td></tr>
<tr class="impl-yes css1"><td>vertical-align</td><td>COMPOSITE(ENUM(baseline, sub,
super, top, text-top, middle, bottom, text-bottom), &lt;percentage&gt;,
&lt;length&gt;) Also applies to others with explicit height</td></tr>
</tbody>
<tbody>
<tr><th colspan="2">Absolute positioning, unknown release milestone</th></tr>
<tr class="danger"><td>bottom</td><td rowspan="4">Dangerous, must be non-negative</td></tr>
<tr class="danger"><td>left</td></tr>
<tr class="danger"><td>right</td></tr>
<tr class="danger"><td>top</td></tr>
<tr><td>clip</td><td>-</td></tr>
<tr class="danger"><td>position</td><td>ENUM(static, relative, absolute, fixed), permit
relative not absolute?</td></tr>
<tr class="danger"><td>z-index</td><td>Dangerous</td></tr>
</tbody>
<tbody>
<tr><th colspan="2">Unknown</th></tr>
<tr class="danger css1"><td>background-image</td><td>Dangerous, target milestone 1.3</td></tr>
<tr class="css1"><td>background-attachment</td><td>ENUM(scroll, fixed),
Depends on background-image</td></tr>
<tr class="css1"><td>background-position</td><td>Depends on background-image</td></tr>
<tr class="danger impl-no"><td>cursor</td><td>Dangerous but fluffy</td></tr>
<tr class="danger css1"><td>display</td><td>ENUM(...), Dangerous but interesting;
will not implement list-item, run-in (Opera only) or table (no IE);
inline-block has incomplete IE6 support and requires -moz-inline-box
for Mozilla. Unknown target milestone.</td></tr>
<tr><td class="css1">height</td><td>Interesting, why use it? Unknown target milestone.</td></tr>
<tr class="danger css1"><td>list-style-image</td><td>Dangerous? Target milestone 1.3</td></tr>
<tr class="impl-no"><td>max-height</td><td rowspan="4">No IE 5/6</td></tr>
<tr class="impl-no"><td>min-height</td></tr>
<tr class="impl-no"><td>max-width</td></tr>
<tr class="impl-no"><td>min-width</td></tr>
<tr class="impl-no"><td>orphans</td><td>No IE support</td></tr>
<tr class="impl-no"><td>widows</td><td>No IE support</td></tr>
<tr><td>overflow</td><td>ENUM, IE 5/6 almost (remove visible if set). Unknown target milestone.</td></tr>
<tr><td>page-break-after</td><td>ENUM(auto, always, avoid, left, right),
IE 5.5/6 and Opera. Unknown target milestone.</td></tr>
<tr><td>page-break-before</td><td>ENUM(auto, always, avoid, left, right),
Mostly supported. Unknown target milestone.</td></tr>
<tr><td>page-break-inside</td><td>ENUM(avoid, auto), Opera only. Unknown target milestone.</td></tr>
<tr class="impl-no"><td>quotes</td><td>May be dropped from CSS2, fairly useless for inline context</td></tr>
<tr class="impl-no"><td>visibility</td><td>ENUM(visible, hidden, collapse),
Dangerous</td></tr>
<tr class="css1 feature"><td>white-space</td><td>ENUM(normal, pre, nowrap, pre-wrap,
pre-line), Spotty implementation:
pre (no IE 5/6), nowrap (no IE 5),
pre-wrap (only Opera), pre-line (no support). Fixable? Unknown target milestone.</td></tr>
</tbody>
<tbody class="impl-no">
<tr><th colspan="2">Aural</th></tr>
<tr><td>azimuth</td><td>-</td></tr>
<tr><td>cue</td><td>-</td></tr>
<tr><td>cue-after</td><td>-</td></tr>
<tr><td>cue-before</td><td>-</td></tr>
<tr><td>elevation</td><td>-</td></tr>
<tr><td>pause-after</td><td>-</td></tr>
<tr><td>pause-before</td><td>-</td></tr>
<tr><td>pause</td><td>-</td></tr>
<tr><td>pitch-range</td><td>-</td></tr>
<tr><td>pitch</td><td>-</td></tr>
<tr><td>play-during</td><td>-</td></tr>
<tr><td>richness</td><td>-</td></tr>
<tr><td>speak-header</td><td>Table related</td></tr>
<tr><td>speak-numeral</td><td>-</td></tr>
<tr><td>speak-punctuation</td><td>-</td></tr>
<tr><td>speak</td><td>-</td></tr>
<tr><td>speech-rate</td><td>-</td></tr>
<tr><td>stress</td><td>-</td></tr>
<tr><td>voice-family</td><td>-</td></tr>
<tr><td>volume</td><td>-</td></tr>
</tbody>
<tbody class="impl-no">
<tr><th colspan="2">Will not implement</th></tr>
<tr><td>content</td><td>Not applicable for inline styles</td></tr>
<tr><td>counter-increment</td><td>Needs content, Opera only</td></tr>
<tr><td>counter-reset</td><td>Needs content, Opera only</td></tr>
<tr><td>direction</td><td>No support</td></tr>
<tr><td>outline-color</td><td rowspan="4">IE Mac and Opera on outside,
Mozilla on inside and needs -moz-outline, no IE support.</td></tr>
<tr><td>outline-style</td></tr>
<tr><td>outline-width</td></tr>
<tr><td>outline</td></tr>
<tr><td>unicode-bidi</td><td>No support</td></tr>
</tbody>
</table>
<h2>Interesting Attributes</h2>
<table cellspacing="0">
<thead>
<tr><th>Attribute</th><th>Tags</th><th>Notes</th></tr>
</thead>
<!--
<tr><th></th></tr>
<tbody>
<tr><td>-</td><td>-</td><td>-</td></tr>
</tbody>
-->
<tbody>
<tr><th colspan="3">CSS</th></tr>
<tr class="impl-yes"><td>style</td><td>All</td><td>Not all properties may be implemented, parser is good though.</td></tr>
</tbody>
<tbody>
<tr><th colspan="3">Questionable</th></tr>
<tr class="impl-no"><td>accesskey</td><td>A</td><td>May interfere with main interface</td></tr>
<tr class="impl-no"><td>tabindex</td><td>A</td><td>May interfere with main interface</td></tr>
<tr><td>target</td><td>A</td><td>Config enabled, only useful for frame layouts</td></tr>
</tbody>
<tbody>
<tr><th colspan="3">Miscellaneous</th></tr>
<tr><td>datetime</td><td>DEL, INS</td><td>No visible effect, ISO format</td></tr>
<tr><td>rel</td><td>A</td><td>Largely user-defined: nofollow, tag (see microformats)</td></tr>
<tr><td>rev</td><td>A</td><td>Largely user-defined: vote-*</td></tr>
<tr class="feature"><td>axis</td><td>TD, TH</td><td>W3C only: No browser implementation</td></tr>
<tr class="feature"><td>char</td><td>COL, COLGROUP, TBODY, TD, TFOOT, TH, THEAD, TR</td><td>W3C only: No browser implementation</td></tr>
<tr class="feature"><td>headers</td><td>TD, TH</td><td>W3C only: No browser implementation</td></tr>
<tr class="feature"><td>scope</td><td>TD, TH</td><td>W3C only: No browser implementation</td></tr>
</tbody>
<tbody class="impl-yes">
<tr><th colspan="3">URI</th></tr>
<tr><td rowspan="2">cite</td><td>BLOCKQUOTE, Q</td><td>For attribution</td></tr>
<tr><td>DEL, INS</td><td>Link to explanation why it changed</td></tr>
<tr><td>href</td><td>A</td><td>-</td></tr>
<tr><td>longdesc</td><td>IMG</td><td>-</td></tr>
<tr class="required"><td>src</td><td>IMG</td><td>Required</td></tr>
</tbody>
<tbody>
<tr><th colspan="3">Transform, target milestone 1.2</th></tr>
<tr><td rowspan="5">align</td><td>CAPTION</td><td>Near-equiv style 'caption-side', drop left and right</td></tr>
<tr><td>IMG</td><td rowspan="2">Margin-left and margin-right = auto or parent div</td></tr>
<tr><td>TABLE</td></tr>
<tr><td>HR</td><td>Equivalent style 'text-align' (IE tested)</td></tr>
<tr class="impl-yes"><td>H1, H2, H3, H4, H5, H6, P</td><td>Equivalent style 'text-align'</td></tr>
<tr class="required impl-yes"><td>alt</td><td>IMG</td><td>Required, insert image filename if src is present or default invalid image text</td></tr>
<tr><td rowspan="3">bgcolor</td><td>TABLE</td><td>Equivalent style 'background-color' (IE tested)</td></tr>
<tr><td>TR</td><td>Equivalent style 'background-color' (IE tested)</td></tr>
<tr><td>TD, TH</td><td>Equivalent style 'background-color'</td></tr>
<tr><td>border</td><td>IMG</td><td>Equivalent style 'border-width', only applies when link present</td></tr>
<tr><td>clear</td><td>BR</td><td>Near-equiv style 'clear', transform 'all' into 'both'</td></tr>
<tr class="impl-no"><td>compact</td><td>DL, OL, UL</td><td>Boolean, needs custom CSS class; rarely used anyway</td></tr>
<tr class="required impl-yes"><td>dir</td><td>BDO</td><td>Required, insert ltr (or configuration value) if none</td></tr>
<tr><td>height</td><td>TD, TH</td><td>Near-equiv style 'height', needs px suffix if original was in pixels</td></tr>
<tr><td>hspace</td><td>IMG</td><td>Near-equiv styles 'margin-top' and 'margin-bottom', needs px suffix</td></tr>
<tr class="impl-yes"><td>lang</td><td>*</td><td>Copy value to xml:lang</td></tr>
<tr><td rowspan="2">name</td><td>IMG</td><td>Turn into ID</td></tr>
<tr><td>A</td><td>Turn into ID? (not deprecated, though in which specs?)</td></tr>
<tr><td>noshade</td><td>HR</td><td>Boolean, style 'border-style:solid;'</td></tr>
<tr><td>nowrap</td><td>TD, TH</td><td>Boolean, style 'white-space:nowrap;' (not compat with IE5)</td></tr>
<tr><td>size</td><td>HR</td><td>Near-equiv 'width', needs px suffix if original was pixels</td></tr>
<tr class="required impl-yes"><td>src</td><td>IMG</td><td>Required, insert blank or default img if not set</td></tr>
<tr><td>start</td><td>OL</td><td>Poorly supported 'counter-reset', transform may not be desirable</td></tr>
<tr><td rowspan="3">type</td><td>LI</td><td rowspan="3">Equivalent style 'list-style-type', different allowed values though. (needs testing)</td></tr>
<tr><td>OL</td></tr>
<tr><td>UL</td></tr>
<tr><td>value</td><td>LI</td><td>Poorly supported 'counter-reset', transform may not be desirable, see ol.start. Configurable.</td></tr>
<tr><td>vspace</td><td>IMG</td><td>Near-equiv styles 'margin-left' and 'margin-right', needs px suffix, see hspace</td></tr>
<tr><td rowspan="2">width</td><td>HR</td><td rowspan="2">Near-equiv style 'width', needs px suffix if original was pixels</td></tr>
<tr><td>TD, TH</td></tr>
</tbody>
</table>
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="description" content="Tables detailing HTML element and CSS property implementation coverage in HTML Purifier." />
<link rel="stylesheet" type="text/css" href="./style.css" />
<title>Implementation Progress - HTML Purifier</title>
<style type="text/css">
td {padding-right:1em;border-bottom:1px solid #000;padding-left:0.5em;}
th {text-align:left;padding-top:1.4em;font-size:13pt;
border-bottom:2px solid #000;background:#FFF;}
thead th {text-align:left;padding:0.1em;background-color:#EEE;}
.impl-yes {background:#9D9;}
.impl-partial {background:#FFA;}
.impl-no {background:#CCC;}
.danger {color:#600;}
.css1 {color:#060;}
.required {font-weight:bold;}
.feature {color:#999;}
</style>
</head><body>
<h1>Implementation Progress</h1>
<div id="filing">Filed under Development</div>
<div id="index">Return to the <a href="index.html">index</a>.</div>
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
<h2>Key</h2>
<table cellspacing="0"><tbody>
<tr><td class="impl-yes">Implemented</td></tr>
<tr><td class="impl-partial">Partially implemented</td></tr>
<tr><td class="impl-no">Will not implement</td></tr>
<tr><td class="danger">Dangerous attribute/property</td></tr>
<tr><td class="css1">Present in CSS1</td></tr>
<tr><td class="feature">Feature, requires extra work</td></tr>
</tbody></table>
<h2>CSS</h2>
<table cellspacing="0">
<thead>
<tr><th>Name</th><th>Notes</th></tr>
</thead>
<!--
<tr><td>-</td><td>-</td></tr>
-->
<tbody>
<tr><th colspan="2">Standard</th></tr>
<tr class="css1 impl-yes"><td>background-color</td><td>COMPOSITE(&lt;color&gt;, transparent)</td></tr>
<tr class="css1 impl-yes"><td>background</td><td>SHORTHAND, currently alias for background-color</td></tr>
<tr class="css1 impl-yes"><td>border</td><td>SHORTHAND, MULTIPLE</td></tr>
<tr class="css1 impl-yes"><td>border-color</td><td>MULTIPLE</td></tr>
<tr class="css1 impl-yes"><td>border-style</td><td>MULTIPLE</td></tr>
<tr class="css1 impl-yes"><td>border-width</td><td>MULTIPLE</td></tr>
<tr class="css1 impl-yes"><td>border-*</td><td>SHORTHAND</td></tr>
<tr class="impl-yes"><td>border-*-color</td><td>COMPOSITE(&lt;color&gt;, transparent)</td></tr>
<tr class="impl-yes"><td>border-*-style</td><td>ENUM(none, hidden, dotted, dashed,
solid, double, groove, ridge, inset, outset)</td></tr>
<tr class="css1 impl-yes"><td>border-*-width</td><td>COMPOSITE(&lt;length&gt;, thin, medium, thick)</td></tr>
<tr class="css1 impl-yes"><td>clear</td><td>ENUM(none, left, right, both)</td></tr>
<tr class="css1 impl-yes"><td>color</td><td>&lt;color&gt;</td></tr>
<tr class="css1 impl-yes"><td>float</td><td>ENUM(left, right, none), May require layout
precautions with clear</td></tr>
<tr class="css1 impl-yes"><td>font</td><td>SHORTHAND</td></tr>
<tr class="css1 impl-yes"><td>font-family</td><td>CSS validator may complain if fallback font
family not specified</td></tr>
<tr class="css1 impl-yes"><td>font-size</td><td>COMPOSITE(&lt;absolute-size&gt;,
&lt;relative-size&gt;, &lt;length&gt;, &lt;percentage&gt;)</td></tr>
<tr class="css1 impl-yes"><td>font-style</td><td>ENUM(normal, italic, oblique)</td></tr>
<tr class="css1 impl-yes"><td>font-variant</td><td>ENUM(normal, small-caps)</td></tr>
<tr class="css1 impl-yes"><td>font-weight</td><td>ENUM(normal, bold, bolder, lighter,
100, 200, 300, 400, 500, 600, 700, 800, 900), maybe special code for
in-between integers</td></tr>
<tr class="css1 impl-yes"><td>letter-spacing</td><td>COMPOSITE(&lt;length&gt;, normal)</td></tr>
<tr class="css1 impl-yes"><td>line-height</td><td>COMPOSITE(&lt;number&gt;,
&lt;length&gt;, &lt;percentage&gt;, normal)</td></tr>
<tr class="css1 impl-yes"><td>list-style-position</td><td>ENUM(inside, outside),
Strange behavior in browsers</td></tr>
<tr class="css1 impl-yes"><td>list-style-type</td><td>ENUM(...),
Well-supported values are: disc, circle, square,
decimal, lower-roman, upper-roman, lower-alpha and upper-alpha. See also
CSS 3. Mostly IE lack of support.</td></tr>
<tr class="css1 impl-yes"><td>list-style</td><td>SHORTHAND</td></tr>
<tr class="css1 impl-yes"><td>margin</td><td>MULTIPLE</td></tr>
<tr class="css1 impl-yes"><td>margin-*</td><td>COMPOSITE(&lt;length&gt;,
&lt;percentage&gt;, auto)</td></tr>
<tr class="css1 impl-yes"><td>padding</td><td>MULTIPLE</td></tr>
<tr class="css1 impl-yes"><td>padding-*</td><td>COMPOSITE(&lt;length&gt;(positive),
&lt;percentage&gt;(positive))</td></tr>
<tr class="css1 impl-yes"><td>text-align</td><td>ENUM(left, right,
center, justify)</td></tr>
<tr class="css1 impl-yes"><td>text-decoration</td><td>No blink (argh my eyes), not
enum, can be combined (composite sorta): underline, overline,
line-through</td></tr>
<tr class="css1 impl-yes"><td>text-indent</td><td>COMPOSITE(&lt;length&gt;,
&lt;percentage&gt;)</td></tr>
<tr class="css1 impl-yes"><td>text-transform</td><td>ENUM(capitalize, uppercase,
lowercase, none)</td></tr>
<tr class="css1 impl-yes"><td>width</td><td>COMPOSITE(&lt;length&gt;,
&lt;percentage&gt;, auto), Interesting</td></tr>
<tr class="css1 impl-yes"><td>word-spacing</td><td>COMPOSITE(&lt;length&gt;, auto),
IE 5 no support</td></tr>
</tbody>
<tbody>
<tr><th colspan="2">Table</th></tr>
<tr class="impl-yes"><td>border-collapse</td><td>ENUM(collapse, seperate)</td></tr>
<tr class="impl-yes"><td>caption-side</td><td>ENUM(top, bottom)</td></tr>
<tr class="feature"><td>empty-cells</td><td>ENUM(show, hide), No IE support makes this useless,
possible fix with &amp;nbsp;? Unknown release milestone.</td></tr>
<tr class="impl-yes"><td>table-layout</td><td>ENUM(auto, fixed)</td></tr>
<tr class="impl-yes css1"><td>vertical-align</td><td>COMPOSITE(ENUM(baseline, sub,
super, top, text-top, middle, bottom, text-bottom), &lt;percentage&gt;,
&lt;length&gt;) Also applies to others with explicit height</td></tr>
</tbody>
<tbody>
<tr><th colspan="2">Absolute positioning, unknown release milestone</th></tr>
<tr class="danger impl-no"><td>bottom</td><td rowspan="4">Dangerous, must be non-negative to even be considered,
but it's still possible to arbitrarily position by running over.</td></tr>
<tr class="danger impl-no"><td>left</td></tr>
<tr class="danger impl-no"><td>right</td></tr>
<tr class="danger impl-no"><td>top</td></tr>
<tr class="impl-no"><td>clip</td><td>-</td></tr>
<tr class="danger impl-no"><td>position</td><td>ENUM(static, relative, absolute, fixed)
relative not absolute?</td></tr>
<tr class="danger impl-no"><td>z-index</td><td>Dangerous</td></tr>
</tbody>
<tbody>
<tr><th colspan="2">Unknown</th></tr>
<tr class="danger css1 impl-yes"><td>background-image</td><td>Dangerous</td></tr>
<tr class="css1 impl-yes"><td>background-attachment</td><td>ENUM(scroll, fixed),
Depends on background-image</td></tr>
<tr class="css1 impl-yes"><td>background-position</td><td>Depends on background-image</td></tr>
<tr class="danger impl-no"><td>cursor</td><td>Dangerous but fluffy</td></tr>
<tr class="danger css1"><td>display</td><td>ENUM(...), Dangerous but interesting;
will not implement list-item, run-in (Opera only) or table (no IE);
inline-block has incomplete IE6 support and requires -moz-inline-box
for Mozilla. Unknown target milestone.</td></tr>
<tr class="css1 impl-yes"><td>height</td><td>Interesting, why use it? Unknown target milestone.</td></tr>
<tr class="danger css1 impl-yes"><td>list-style-image</td><td>Dangerous?</td></tr>
<tr class="impl-no"><td>max-height</td><td rowspan="4">No IE 5/6</td></tr>
<tr class="impl-no"><td>min-height</td></tr>
<tr class="impl-no"><td>max-width</td></tr>
<tr class="impl-no"><td>min-width</td></tr>
<tr class="impl-no"><td>orphans</td><td>No IE support</td></tr>
<tr class="impl-no"><td>widows</td><td>No IE support</td></tr>
<tr><td>overflow</td><td>ENUM, IE 5/6 almost (remove visible if set). Unknown target milestone.</td></tr>
<tr><td>page-break-after</td><td>ENUM(auto, always, avoid, left, right),
IE 5.5/6 and Opera. Unknown target milestone.</td></tr>
<tr><td>page-break-before</td><td>ENUM(auto, always, avoid, left, right),
Mostly supported. Unknown target milestone.</td></tr>
<tr><td>page-break-inside</td><td>ENUM(avoid, auto), Opera only. Unknown target milestone.</td></tr>
<tr class="impl-no"><td>quotes</td><td>May be dropped from CSS2, fairly useless for inline context</td></tr>
<tr class="impl-no"><td>visibility</td><td>ENUM(visible, hidden, collapse),
Dangerous</td></tr>
<tr class="css1 feature impl-partial"><td>white-space</td><td>ENUM(normal, pre, nowrap, pre-wrap,
pre-line), Spotty implementation:
pre (no IE 5/6), <em>nowrap</em> (no IE 5, supported),
pre-wrap (only Opera), pre-line (no support). Fixable? Unknown target milestone.</td></tr>
</tbody>
<tbody class="impl-no">
<tr><th colspan="2">Aural</th></tr>
<tr><td>azimuth</td><td>-</td></tr>
<tr><td>cue</td><td>-</td></tr>
<tr><td>cue-after</td><td>-</td></tr>
<tr><td>cue-before</td><td>-</td></tr>
<tr><td>elevation</td><td>-</td></tr>
<tr><td>pause-after</td><td>-</td></tr>
<tr><td>pause-before</td><td>-</td></tr>
<tr><td>pause</td><td>-</td></tr>
<tr><td>pitch-range</td><td>-</td></tr>
<tr><td>pitch</td><td>-</td></tr>
<tr><td>play-during</td><td>-</td></tr>
<tr><td>richness</td><td>-</td></tr>
<tr><td>speak-header</td><td>Table related</td></tr>
<tr><td>speak-numeral</td><td>-</td></tr>
<tr><td>speak-punctuation</td><td>-</td></tr>
<tr><td>speak</td><td>-</td></tr>
<tr><td>speech-rate</td><td>-</td></tr>
<tr><td>stress</td><td>-</td></tr>
<tr><td>voice-family</td><td>-</td></tr>
<tr><td>volume</td><td>-</td></tr>
</tbody>
<tbody class="impl-no">
<tr><th colspan="2">Will not implement</th></tr>
<tr><td>content</td><td>Not applicable for inline styles</td></tr>
<tr><td>counter-increment</td><td>Needs content, Opera only</td></tr>
<tr><td>counter-reset</td><td>Needs content, Opera only</td></tr>
<tr><td>direction</td><td>No support</td></tr>
<tr><td>outline-color</td><td rowspan="4">IE Mac and Opera on outside,
Mozilla on inside and needs -moz-outline, no IE support.</td></tr>
<tr><td>outline-style</td></tr>
<tr><td>outline-width</td></tr>
<tr><td>outline</td></tr>
<tr><td>unicode-bidi</td><td>No support</td></tr>
</tbody>
</table>
<h2>Interesting Attributes</h2>
<table cellspacing="0">
<thead>
<tr><th>Attribute</th><th>Tags</th><th>Notes</th></tr>
</thead>
<!--
<tr><th></th></tr>
<tbody>
<tr><td>-</td><td>-</td><td>-</td></tr>
</tbody>
-->
<tbody>
<tr><th colspan="3">CSS</th></tr>
<tr class="impl-yes"><td>style</td><td>All</td><td>Parser is reasonably functional. Status here doesn't count individual properties.</td></tr>
</tbody>
<tbody>
<tr><th colspan="3">Questionable</th></tr>
<tr class="impl-no"><td>accesskey</td><td>A</td><td>May interfere with main interface</td></tr>
<tr class="impl-no"><td>tabindex</td><td>A</td><td>May interfere with main interface</td></tr>
<tr class="impl-yes"><td>target</td><td>A</td><td>Config enabled, only useful for frame layouts, disallowed in strict</td></tr>
</tbody>
<tbody>
<tr><th colspan="3">Miscellaneous</th></tr>
<tr><td>datetime</td><td>DEL, INS</td><td>No visible effect, ISO format</td></tr>
<tr class="impl-yes"><td>rel</td><td>A</td><td>Largely user-defined: nofollow, tag (see microformats)</td></tr>
<tr class="impl-yes"><td>rev</td><td>A</td><td>Largely user-defined: vote-*</td></tr>
<tr class="feature"><td>axis</td><td>TD, TH</td><td>W3C only: No browser implementation</td></tr>
<tr class="feature"><td>char</td><td>COL, COLGROUP, TBODY, TD, TFOOT, TH, THEAD, TR</td><td>W3C only: No browser implementation</td></tr>
<tr class="feature"><td>headers</td><td>TD, TH</td><td>W3C only: No browser implementation</td></tr>
<tr class="feature"><td>scope</td><td>TD, TH</td><td>W3C only: No browser implementation</td></tr>
</tbody>
<tbody class="impl-yes">
<tr><th colspan="3">URI</th></tr>
<tr><td rowspan="2">cite</td><td>BLOCKQUOTE, Q</td><td>For attribution</td></tr>
<tr><td>DEL, INS</td><td>Link to explanation why it changed</td></tr>
<tr><td>href</td><td>A</td><td>-</td></tr>
<tr><td>longdesc</td><td>IMG</td><td>-</td></tr>
<tr class="required"><td>src</td><td>IMG</td><td>Required</td></tr>
</tbody>
<tbody>
<tr><th colspan="3">Transform</th></tr>
<tr class="impl-yes"><td rowspan="5">align</td><td>CAPTION</td><td>'caption-side' for top/bottom, 'text-align' for left/right</td></tr>
<tr class="impl-yes"><td>IMG</td><td rowspan="3">See specimens/html-align-to-css.html</td></tr>
<tr class="impl-yes"><td>TABLE</td></tr>
<tr class="impl-yes"><td>HR</td></tr>
<tr class="impl-yes"><td>H1, H2, H3, H4, H5, H6, P</td><td>Equivalent style 'text-align'</td></tr>
<tr class="required impl-yes"><td>alt</td><td>IMG</td><td>Required, insert image filename if src is present or default invalid image text</td></tr>
<tr class="impl-yes"><td rowspan="3">bgcolor</td><td>TABLE</td><td>Superset style 'background-color'</td></tr>
<tr class="impl-yes"><td>TR</td><td>Superset style 'background-color'</td></tr>
<tr class="impl-yes"><td>TD, TH</td><td>Superset style 'background-color'</td></tr>
<tr class="impl-yes"><td>border</td><td>IMG</td><td>Equivalent style <code>border:[number]px solid</code></td></tr>
<tr class="impl-yes"><td>clear</td><td>BR</td><td>Near-equiv style 'clear', transform 'all' into 'both'</td></tr>
<tr class="impl-no"><td>compact</td><td>DL, OL, UL</td><td>Boolean, needs custom CSS class; rarely used anyway</td></tr>
<tr class="required impl-yes"><td>dir</td><td>BDO</td><td>Required, insert ltr (or configuration value) if none</td></tr>
<tr class="impl-yes"><td>height</td><td>TD, TH</td><td>Near-equiv style 'height', needs px suffix if original was in pixels</td></tr>
<tr class="impl-yes"><td>hspace</td><td>IMG</td><td>Near-equiv styles 'margin-top' and 'margin-bottom', needs px suffix</td></tr>
<tr class="impl-yes"><td>lang</td><td>*</td><td>Copy value to xml:lang</td></tr>
<tr class="impl-yes"><td rowspan="2">name</td><td>IMG</td><td>Turn into ID</td></tr>
<tr class="impl-yes"><td>A</td><td>Turn into ID</td></tr>
<tr class="impl-yes"><td>noshade</td><td>HR</td><td>Boolean, style 'border-style:solid;'</td></tr>
<tr class="impl-yes"><td>nowrap</td><td>TD, TH</td><td>Boolean, style 'white-space:nowrap;' (not compat with IE5)</td></tr>
<tr class="impl-yes"><td>size</td><td>HR</td><td>Near-equiv 'height', needs px suffix if original was pixels</td></tr>
<tr class="required impl-yes"><td>src</td><td>IMG</td><td>Required, insert blank or default img if not set</td></tr>
<tr class="impl-yes"><td>start</td><td>OL</td><td>Poorly supported 'counter-reset', allowed in loose, dropped in strict</td></tr>
<tr class="impl-yes"><td rowspan="3">type</td><td>LI</td><td rowspan="3">Equivalent style 'list-style-type', different allowed values though. (needs testing)</td></tr>
<tr class="impl-yes"><td>OL</td></tr>
<tr class="impl-yes"><td>UL</td></tr>
<tr class="impl-yes"><td>value</td><td>LI</td><td>Poorly supported 'counter-reset', allowed in loose, dropped in strict</td></tr>
<tr class="impl-yes"><td>vspace</td><td>IMG</td><td>Near-equiv styles 'margin-left' and 'margin-right', needs px suffix, see hspace</td></tr>
<tr class="impl-yes"><td rowspan="2">width</td><td>HR</td><td rowspan="2">Near-equiv style 'width', needs px suffix if original was pixels</td></tr>
<tr class="impl-yes"><td>TD, TH</td></tr>
</tbody>
</table>
<div id="version">$Id$</div>
</body></html>

147
docs/enduser-id.html Normal file
View File

@@ -0,0 +1,147 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="description" content="Explains various methods for allowing IDs in documents safely in HTML Purifier." />
<link rel="stylesheet" type="text/css" href="./style.css" />
<title>IDs - HTML Purifier</title>
</head><body>
<h1 class="subtitled">IDs</h1>
<div class="subtitle">What they are, why you should(n't) wear them, and how to deal with it</div>
<div id="filing">Filed under End-User</div>
<div id="index">Return to the <a href="index.html">index</a>.</div>
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
<p>Prior to HTML Purifier 1.2.0, this library blithely accepted user input that
looked like this:</p>
<pre>&lt;a id=&quot;fragment&quot;&gt;Anchor&lt;/a&gt;</pre>
<p>...presenting an attractive vector for those that would destroy standards
compliance: simply set the ID to one that is already used elsewhere in the
document and voila: validation breaks. There was a half-hearted attempt to
prevent this by allowing users to blacklist IDs, but I suspect that no one
really bothered, and thus, with the release of 1.2.0, IDs are now <em>removed</em>
by default.</p>
<p>IDs, however, are quite useful functionality to have, so if users start
complaining about broken anchors you'll probably want to turn them back on
with %HTML.EnableAttrID. But before you go mucking around with the config
object, it's probably worth to take some precautions to keep your page
validating. Why?</p>
<ol>
<li>Standards-compliant pages are good</li>
<li>Duplicated IDs interfere with anchors. If there are two id="foobar"s in a
document, which spot does a browser presented with the fragment #foobar go
to? Most browsers opt for the first appearing ID, making it impossible
to references the second section. Similarly, duplicated IDs can hijack
client-side scripting that relies on the IDs of elements.</li>
</ol>
<p>You have (currently) four ways of dealing with the problem.</p>
<h2 class="subtitled">Blacklisting IDs</h2>
<div class="subsubtitle">Good for pages with single content source and stable templates</div>
<p>Keeping in terms with the
<acronym title="Keep It Simple, Stupid">KISS</acronym> principle, let us
deal with the most obvious solution: preventing users from using any IDs that
appear elsewhere on the document. The method is simple:</p>
<pre>$config->set('HTML', 'EnableAttrID', true);
$config->set('Attr', 'IDBlacklist' array(
'list', 'of', 'attributes', 'that', 'are', 'forbidden'
));</pre>
<p>That being said, there are some notable drawbacks. First of all, you have to
know precisely which IDs are being used by the HTML surrounding the user code.
This is easier said than done: quite often the page designer and the system
coder work separately, so the designer has to constantly be talking with the
coder whenever he decides to add a new anchor. Miss one and you open yourself
to possible standards-compliance issues.</p>
<p>Furthermore, this position becomes untenable when a single web page must hold
multiple portions of user-submitted content. Since there's obviously no way
to find out before-hand what IDs users will use, the blacklist is helpless.
And even since HTML Purifier validates each segment seperately, perhaps doing
so at different times, it would be extremely difficult to dynamically update
the blacklist inbetween runs.</p>
<p>Finally, simply destroying the ID is extremely un-userfriendly behavior: after
all, they might have simply specified a duplicate ID by accident.</p>
<p>Thus, we get to our second method.</p>
<h2 class="subtitled">Namespacing IDs</h2>
<div class="subsubtitle">Lazy developer's way, but needs user education</div>
<p>This method, too, is quite simple: add a prefix to all user IDs. With this
code:</p>
<pre>$config->set('HTML', 'EnableAttrID', true);
$config->set('Attr', 'IDPrefix', 'user_');</pre>
<p>...this:</p>
<pre>&lt;a id=&quot;foobar&quot;&gt;Anchor!&lt;/a&gt;</pre>
<p>...turns into:</p>
<pre>&lt;a id=&quot;user_foobar&quot;&gt;Anchor!&lt;/a&gt;</pre>
<p>As long as you don't have any IDs that start with user_, collisions are
guaranteed not to happen. The drawback is obvious: if a user submits
id=&quot;foobar&quot;, they probably expect to be able to reference their page with
#foobar. You'll have to tell them, &quot;No, that doesn't work, you have to add
user_ to the beginning.&quot;</p>
<p>And yes, things get hairier. Even with a nice prefix, we still have done
nothing about multiple HTML Purifier outputs on one page. Thus, we have
a second configuration value to piggy-back off of: %Attr.IDPrefixLocal:</p>
<pre>$config->set('Attr', 'IDPrefixLocal', 'comment' . $id . '_');</pre>
<p>This new attributes does nothing but append on to regular IDPrefix, but is
special in that it is volatile: it's value is determined at run-time and
cannot possibly be cordoned into, say, a .ini config file. As for what to
put into the directive, is up to you, but I would recommend the ID number
the text has been assigned in the database. Whatever you pick, however, it
has to be unique and stable for the text you are validating. Note, however,
that we require that %Attr.IDPrefix be set before you use this directive.</p>
<p>And also remember: the user has to know what this prefix is too!</p>
<h2>Abstinence</h2>
<p>You may not want to bother. That's okay too, just don't enable IDs.</p>
<p>Personally, I would take this road whenever user-submitted content would be
possibly be shown together on one page. Why a blog comment would need to use
anchors is beyond me.</p>
<h2>Denial</h2>
<p>To revert back to pre-1.2.0 behavior, simply:</p>
<pre>$config->set('HTML', 'EnableAttrID', true);</pre>
<p>Don't come crying to me when your page mysteriously stops validating, though.</p>
<div id="version">$Id$</div>
</body>
</html>

View File

@@ -36,7 +36,7 @@ forgiving lexer. You may also be interested in the unit tests located in the
tests/ folder, which provide a living document on how exactly the filter deals
with malformed input.
In summary:
In summary (see corresponding classes for more details):
1. Parse document into an array of tag and text tokens (Lexer)
2. Remove all elements not on whitelist and transform certain other elements

22
docs/enduser-security.txt Normal file
View File

@@ -0,0 +1,22 @@
Security
Like anything that claims to afford security, HTML_Purifier can be circumvented
through negligence of people. This class will do its job: no more, no less,
and it's up to you to provide it the proper information and proper context
to be effective. Things to remember:
1. Character Encoding: see enduser-utf8.html for more info.
2. Doctype: document pending feature completion
Not strictly necessary, actually. More in-depth discussion once we figure
out how to get strict loose mode working.
3. IDs: see enduser-id.html for more info
4. Links: document pending feature completion
Rudimentary blacklisting, we should also allow only relative URIs. We
need a doc to explain the stuff.
5. CSS: document pending
Explain which CSS styles we blocked and why.

117
docs/enduser-slow.html Normal file
View File

@@ -0,0 +1,117 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="description" content="Explains how to speed up HTML Purifier through caching or inbound filtering." />
<link rel="stylesheet" type="text/css" href="./style.css" />
<title>Speeding up HTML Purifier - HTML Purifier</title>
</head><body>
<h1 class="subtitled">Speeding up HTML Purifier</h1>
<div class="subtitle">...also known as the HELP ME LIBRARY IS TOO SLOW MY PAGE TAKE TOO LONG page</div>
<div id="filing">Filed under End-User</div>
<div id="index">Return to the <a href="index.html">index</a>.</div>
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
<p>HTML Purifier is a very powerful library. But with power comes great
responsibility, in the form of longer execution times. Remember, this
library isn't lightly grazing over submitted HTML: it's deconstructing
the whole thing, rigorously checking the parts, and then putting it back
together. </p>
<p>So, if it so turns out that HTML Purifier is kinda too slow for outbound
filtering, you've got a few options: </p>
<h2>Inbound filtering</h2>
<p>Perform filtering of HTML when it's submitted by the user. Since the
user is already submitting something, an extra half a second tacked on
to the load time probably isn't going to be that huge of a problem.
Then, displaying the content is a simple a manner of outputting it
directly from your database/filesystem. The trouble with this method is
that your user loses the original text, and when doing edits, will be
handling the filtered text. While this may be a good thing, especially
if you're using a WYSIWYG editor, it can also result in data-loss if a
user makes a typo. </p>
<p>Example (non-functional):</p>
<pre>&lt;?php
/**
* FORM SUBMISSION PAGE
* display_error($message) : displays nice error page with message
* display_success() : displays a nice success page
* display_form() : displays the HTML submission form
* database_insert($html) : inserts data into database as new row
*/
if (!empty($_POST)) {
require_once '/path/to/library/HTMLPurifier.auto.php';
require_once 'HTMLPurifier.func.php';
$dirty_html = isset($_POST['html']) ? $_POST['html'] : false;
if (!$dirty_html) {
display_error('You must write some HTML!');
}
$html = HTMLPurifier($dirty_html);
database_insert($html);
display_success();
// notice that $dirty_html is *not* saved
} else {
display_form();
}
?&gt;</pre>
<h2>Caching the filtered output</h2>
<p>Accept the submitted text and put it unaltered into the database, but
then also generate a filtered version and stash that in the database.
Serve the filtered version to readers, and the unaltered version to
editors. If need be, you can invalidate the cache and have the cached
filtered version be regenerated on the first page view. Pros? Full data
retention. Cons? It's more complicated, and opens other editors up to
XSS if they are using a WYSIWYG editor (to fix that, they'd have to be
able to get their hands on the *really* original text served in
plaintext mode). </p>
<p>Example (non-functional):</p>
<pre>&lt;?php
/**
* VIEW PAGE
* display_error($message) : displays nice error page with message
* cache_get($id) : retrieves HTML from fast cache (db or file)
* cache_insert($id, $html) : inserts good HTML into cache system
* database_get($id) : retrieves raw HTML from database
*/
$id = isset($_GET['id']) ? (int) $_GET['id'] : false;
if (!$id) {
display_error('Must specify ID.');
exit;
}
$html = cache_get($id); // filesystem or database
if ($html === false) {
// cache didn't have the HTML, generate it
$raw_html = database_get($id);
require_once '/path/to/library/HTMLPurifier.auto.php';
require_once 'HTMLPurifier.func.php';
$html = HTMLPurifier($raw_html);
cache_insert($id, $html);
}
echo $html;
?&gt;</pre>
<h2>Summary</h2>
<p>In short, inbound filtering is the simple option and caching is the
robust option (albeit with bigger storage requirements). </p>
<p>There is a third option, independent of the two we've discussed: profile
and optimize HTMLPurifier yourself. Be sure to report back your results
if you decide to do that! Especially if you port HTML Purifier to C++.
<tt>;-)</tt></p>
</body>
</html>

1046
docs/enduser-utf8.html Normal file

File diff suppressed because it is too large Load Diff

152
docs/enduser-youtube.html Normal file
View File

@@ -0,0 +1,152 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="description" content="Explains how to safely allow the embedding of flash from trusted sites in HTML Purifier." />
<link rel="stylesheet" type="text/css" href="./style.css" />
<title>Embedding YouTube Videos - HTML Purifier</title>
</head><body>
<h1 class="subtitled">Embedding YouTube Videos</h1>
<div class="subtitle">...as well as other dangerous active content</div>
<div id="filing">Filed under End-User</div>
<div id="index">Return to the <a href="index.html">index</a>.</div>
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
<p>Clients like their YouTube videos. It gives them a warm fuzzy feeling when
they see a neat little embedded video player on their websites that can play
the latest clips from their documentary &quot;Fido and the Bones of Spring&quot;.
All joking aside, the ability to embed YouTube videos or other active
content in their pages is something that a lot of people like.</p>
<p>This is a <em>bad</em> idea. The moment you embed anything untrusted,
you will definitely be slammed by a manner of nasties that can be
embedded in things from your run of the mill Flash movie to
<a href="http://blog.spywareguide.com/2006/12/myspace_phish_attack_leads_use.html">Quicktime movies</a>.
Even <code>img</code> tags, which HTML Purifier allows by default, can be
dangerous. Be distrustful of anything that tells a browser to load content
from another website automatically.</p>
<p>Luckily for us, however, whitelisting saves the day. Sure, letting users
include any old random flash file could be dangerous, but if it's
from a specific website, it probably is okay. If no amount of pleading will
convince the people upstairs that they should just settle with just linking
to their movies, you may find this technique very useful.</p>
<h2>Looking in</h2>
<p>Below is custom code that allows users to embed
YouTube videos. This is not favoritism: this trick can easily be adapted for
other forms of embeddable content.</p>
<p>Usually, websites like YouTube give us boilerplate code that you can insert
into your documents. YouTube's code goes like this:</p>
<pre>
&lt;object width=&quot;425&quot; height=&quot;350&quot;&gt;
&lt;param name=&quot;movie&quot; value=&quot;http://www.youtube.com/v/AyPzM5WK8ys&quot; /&gt;
&lt;param name=&quot;wmode&quot; value=&quot;transparent&quot; /&gt;
&lt;embed src=&quot;http://www.youtube.com/v/AyPzM5WK8ys&quot;
type=&quot;application/x-shockwave-flash&quot;
wmode=&quot;transparent&quot; width=&quot;425&quot; height=&quot;350&quot; /&gt;
&lt;/object&gt;
</pre>
<p>There are two things to note about this code:</p>
<ol>
<li><code>&lt;embed&gt;</code> is not recognized by W3C, so if you want
standards-compliant code, you'll have to get rid of it.</li>
<li>The code is exactly the same for all instances, except for the
identifier <tt>AyPzM5WK8ys</tt> which tells us which movie file
to retrieve.</li>
</ol>
<p>What point 2 means is that if we have code like <code>&lt;span
class=&quot;embed-youtube&quot;&gt;AyPzM5WK8ys&lt;/span&gt;</code> your
application can reconstruct the full object from this small snippet that
passes through HTML Purifier <em>unharmed</em>.
<a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/Filter/YouTube.php">Show me the code!</a></p>
<p>And the corresponding usage:</p>
<pre>&lt;?php
// assuming $purifier is an instance of HTMLPurifier
require_once 'HTMLPurifier/Filter/YouTube.php';
$purifier-&gt;addFilter(new HTMLPurifier_Filter_YouTube());
?&gt;</pre>
<p>There is a bit going in the two code snippets, so let's explain.</p>
<ol>
<li>This is a Filter object, which intercepts the HTML that is
coming into and out of the purifier. You can add as many
filter objects as you like. <code>preFilter()</code>
processes the code before it gets purified, and <code>postFilter()</code>
processes the code afterwards. So, we'll use <code>preFilter()</code> to
replace the object tag with a <code>span</code>, and <code>postFilter()</code>
to restore it.</li>
<li>The first preg_replace call replaces any YouTube code users may have
embedded into the benign span tag. Span is used because it is inline,
and objects are inline too. We are very careful to be extremely
restrictive on what goes inside the span tag, as if an errant code
gets in there it could get messy.</li>
<li>The HTML is then purified as usual.</li>
<li>Then, another preg_replace replaces the span tag with a fully fledged
object. Note that the embed is removed, and, in its place, a data
attribute was added to the object. This makes the tag standards
compliant! It also breaks Internet Explorer, so we add in a bit of
conditional comments with the old embed code to make it work again.
It's all quite convoluted but works.</li>
</ol>
<h2>Warning</h2>
<p>There are a number of possible problems with the code above, depending
on how you look at it.</p>
<h3>Cannot change width and height</h3>
<p>The width and height of the final YouTube movie cannot be adjusted. This
is because I am lazy. If you really insist on letting users change the size
of the movie, what you need to do is package up the attributes inside the
span tag (along with the movie ID). It gets complicated though: a malicious
user can specify an outrageously large height and width and attempt to crash
the user's operating system/browser. You need to either cap it by limiting
the amount of digits allowed in the regex or using a callback to check the
number.</p>
<h3>Trusts media's host's security</h3>
<p>By allowing this code onto our website, we are trusting that YouTube has
tech-savvy enough people not to allow their users to inject malicious
code into the Flash files. An exploit on YouTube means an exploit on your
site. Even though YouTube is run by the reputable Google, it
<a href="http://ha.ckers.org/blog/20061213/google-xss-vuln/">doesn't</a>
mean they are
<a href="http://ha.ckers.org/blog/20061208/xss-in-googles-orkut/">invulnerable.</a>
You're putting a certain measure of the job on an external provider (just as
you have by entrusting your user input to HTML Purifier), and
it is important that you are cognizant of the risk.</p>
<h3>Poorly written adaptations compromise security</h3>
<p>This should go without saying, but if you're going to adapt this code
for Google Video or the like, make sure you do it <em>right</em>. It's
extremely easy to allow a character too many in <code>postFilter()</code> and
suddenly you're introducing XSS into HTML Purifier's XSS free output. HTML
Purifier may be well written, but it cannot guard against vulnerabilities
introduced after it has finished.</p>
<h2>Help out!</h2>
<p>If you write a filter for your favorite video destination (or anything
like that, for that matter), send it over and it might get included
with the core!</p>
</body>
</html>

View File

@@ -2,14 +2,22 @@
// This file demonstrates basic usage of HTMLPurifier.
exit; // not to be called directly, it will fail fantastically!
// replace this with the path to the HTML Purifier library
require_once '../../library/HTMLPurifier.auto.php';
set_include_path('/path/to/htmlpurifier/library' . PATH_SEPARATOR . get_include_path());
require_once 'HTMLPurifier.php';
$config = HTMLPurifier_Config::createDefault();
$purifier = new HTMLPurifier();
// configuration goes here:
$config->set('Core', 'Encoding', 'ISO-8859-1'); //replace with your encoding
$config->set('Core', 'XHTML', true); // set to false if HTML 4.01
$purifier = new HTMLPurifier($config);
// untrusted input HTML
$html = '<b>Simple and short';
$pure_html = $purifier->purify($html);
echo '<pre>' . htmlspecialchars($pure_html) . '</pre>';
?>

View File

@@ -1,75 +0,0 @@
<?php
header('Content-type:text/html;charset=UTF-8');
?><!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html>
<head>
<title>HTMLPurifier Live Demo</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
</head>
<body>
<h1>HTMLPurifier Live Demo</h1>
<?php
set_include_path('../../library' . PATH_SEPARATOR . get_include_path());
require_once 'HTMLPurifier.php';
if (!empty($_POST['html'])) {
$html = get_magic_quotes_gpc() ? stripslashes($_POST['html']) : $_POST['html'];
$purifier = new HTMLPurifier();
$pure_html = $purifier->purify($html);
?>
<p>Here is your purified HTML:</p>
<div style="border:5px solid #CCC;margin:0 10%;padding:1em;">
<?php
echo $pure_html;
?>
<div style="clear:both;"></div>
</div>
<p>Here is the source code of the purified HTML:</p>
<pre><?php
echo htmlspecialchars($pure_html, ENT_COMPAT, 'UTF-8');
?></pre>
<?php
} else {
?>
<p>Welcome to the live demo. Enter some HTML and see how HTMLPurifier
will filter it.</p>
<?php
}
?>
<form name="filter" action="demo.php<?php
if (isset($_GET['profile']) || isset($_GET['XDEBUG_PROFILE'])) {
echo '?XDEBUG_PROFILE=1';
} ?>" method="post">
<fieldset>
<legend>HTML</legend>
<textarea name="html" cols="60" rows="15"><?php
if (isset($html)) {
echo htmlspecialchars(
HTMLPurifier_Encoder::cleanUTF8($html), ENT_COMPAT, 'UTF-8');
}
?></textarea>
<div>
<input type="submit" value="Submit" name="submit" class="button" />
</div>
</fieldset>
</form>
<p>Return to <a href="http://hp.jpsband.org/">HTMLPurifier's home page</a>.</p>
</body>
</html>

View File

@@ -1,67 +0,0 @@
Filter Levels
When one size *does not* fit all
The more I think about it, the less sense it makes for maintaining one huge
monolithic HTMLDefinition class. There's simply so much variation that
could go into this definition: the set of HTML good for blog entries is
definitely too large for HTML that would be allowed in blog comments. Going
from Transitional to Strict requires changes to the definition.
However, allowing users to specify their own whitelists was an idea I
rejected from the start. Simply put, the typical programmer is too lazy
to actually go through the trouble of investigating which tags, attributes
and properties to allow. HTMLDefinition makes a big part of what HTMLPurifier
is.
The idea, then, is to setup fundamentally different set of definitions, which
can further be customized using simpler configuration options.
Here are some fuzzy levels you could set:
1. Comments - Wordpress recommends a, abbr, acronym, b, blockquote, cite,
code, em, i, strike, strong; however, you could get away with only a, b and
i; also having p and pre tags would be helpful.
2. Pages - As permissive as possible without allowing XSS. No protection
against bad design sense, unfortunantely. Suitable for wiki and page
environments.
3. Lint - Accept everything in the spec, a Tidy wannabe.
I've also decomposed tags into risk levels. An asterisk indicates that no one
really uses that tag, tilde indicates it's deprecated.
1 - blockquote, code, em, i, p, tt / strong, sub, sup
1* - abbr, acronym, bdo, cite, dfn, kbd, q, samp
2 - b, br, del, div, pre, span / ins, s, strike ~ u
3 - h2, h3, h4, h5, h6 ~ center
4 - h1, big ~ font
5 - a
7 - area, map
Lists - dd, dl, dt, li, ol, ul ~ menu, dir
Tables - caption, table, td, th, tr / col, colgroup, tbody, tfoot, thead
Forms - fieldset, form, input, lable, legend, optgroup, option, select, textarea
XSS - noscript, object, script ~ applet
Meta - base, basefont, body, head, html, link, meta, style, title
Frames - frame, frameset, iframe
And tag specific notes:
a - general problems involving linkspam
b - too much bold is bad, typographically speaking bold is discouraged
br - often misused
center - CSS, usually no legit use
del - only useful in editing context
div - little meaning in certain contexts i.e. blog comment
h1 - usually no legit use, as header is already set by application
h* - not needed in blog comments
hr - usually not necessary in blog comments
img - could be extremely undesirable if linking to external pics
pre - could use formatting, only useful in code contexts
q - very little support
s - transform into span with styling or del?
small - technically presentational
span - depends on attribute allowances
sub, sup - specialized
u - little legit use, prefer class with text-decoration

6
docs/fixquotes.htc Normal file
View File

@@ -0,0 +1,6 @@
<public:attach event="oncontentready" onevent="init();" />
<script>
function init() {
element.innerHTML = '&#8220;'+element.innerHTML+'&#8221;';
}
</script>

165
docs/index.html Normal file
View File

@@ -0,0 +1,165 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="description" content="Index to all HTML Purifier documentation." />
<link rel="stylesheet" type="text/css" href="./style.css" />
<title>Documentation - HTML Purifier</title>
</head>
<body>
<h1>Documentation</h1>
<p><strong><a href="http://htmlpurifier.org/">HTML Purifier</a></strong> has documentation for all types of people.
Here is an index of all of them.</p>
<h2>End-user</h2>
<p>End-user documentation that contains articles, tutorials and useful
information for casual developers using HTML Purifier.</p>
<dl>
<dt><a href="enduser-id.html">IDs</a></dt>
<dd>Explains various methods for allowing IDs in documents safely.</dd>
<dt><a href="enduser-youtube.html">Embedding YouTube videos</a></dt>
<dd>Explains how to safely allow the embedding of flash from trusted sites.</dd>
<dt><a href="enduser-slow.html">Speeding up HTML Purifier</a></dt>
<dd>Explains how to speed up HTML Purifier through caching or inbound filtering.</dd>
<dt><a href="enduser-utf8.html">UTF-8: The Secret of Character Encoding</a></dt>
<dd>Describes the rationale for using UTF-8, the ramifications otherwise, and how to make the switch.</dd>
</dl>
<h2>Development</h2>
<p>Developer documentation detailing code issues, roadmaps and project
conventions.</p>
<dl>
<dt><a href="dev-progress.html">Implementation Progress</a></dt>
<dd>Tables detailing HTML element and CSS property implementation coverage.</dd>
<dt><a href="dev-naming.html">Naming Conventions</a></dt>
<dd>Defines class naming conventions.</dd>
<dt><a href="dev-optimization.html">Optimization</a></dt>
<dd>Discusses possible methods of optimizing HTML Purifier.</dd>
<dt><a href="dev-advanced-api.html">Advanced API</a></dt>
<dd>Functional specification for HTML Purifier's advanced API for defining
custom filtering behavior.</dd>
</dl>
<h2>Proposals</h2>
<p>Proposed features, as well as the associated rambling to get a clear
objective in place before attempted implementation.</p>
<dl>
<dt><a href="proposal-colors.html">Colors</a></dt>
<dd>Proposal to allow for color constraints.</dd>
</dl>
<h2>Reference</h2>
<p>Miscellaneous essays, research pieces and other reference type material
that may not directly discuss HTML Purifier.</p>
<dl>
<dt><a href="ref-devnetwork.html">DevNetwork Credits</a></dt>
<dd>Credits and links to DevNetwork forum topics.</dd>
</dl>
<h2>Internal memos</h2>
<p>Plaintext documents that are more for use by active developers of
the code. They may be upgraded to HTML files or stay as TXT scratchpads.</p>
<table class="table">
<thead><tr>
<th width="10%">Type</th>
<th width="20%">Name</th>
<th>Description</th>
</tr></thead>
<tbody>
<tr>
<td>End-user</td>
<td><a href="enduser-overview.txt">Overview</a></td>
<td>High level overview of the general control flow (mostly obsolete).</td>
</tr>
<tr>
<td>End-user</td>
<td><a href="enduser-security.txt">Security</a></td>
<td>Common security issues that may still arise (half-baked).</td>
</tr>
<tr>
<td>Development</td>
<td><a href="enduser-code-quality.txt">Code Quality Issues</a></td>
<td>Enumerates code quality issues and places that need to be refactored.</td>
</tr>
<tr>
<td>Proposal</td>
<td><a href="proposal-filter-levels.txt">Filter levels</a></td>
<td>Outlines details of projected configurable level of filtering.</td>
</tr>
<tr>
<td>Proposal</td>
<td><a href="proposal-language.txt">Language</a></td>
<td>Specification of I18N for error messages derived from MediaWiki (half-baked).</td>
</tr>
<tr>
<td>Proposal</td>
<td><a href="proposal-new-directives.txt">New directives</a></td>
<td>Assorted configuration options that could be implemented.</td>
</tr>
<tr>
<td>Reference</td>
<td><a href="ref-loose-vs-strict.txt">Loose vs.Strict</a></td>
<td>Differences between HTML Strict and Transitional versions.</td>
</tr>
<tr>
<td>Reference</td>
<td><a href="ref-proprietary-tags.txt">Proprietary tags</a></td>
<td>List of vendor-specific tags we may want to transform to W3C compliant markup.</td>
</tr>
<tr>
<td>Reference</td>
<td><a href="ref-strictness.txt">Strictness</a></td>
<td>Short essay on how loose definition isn't really loose.</td>
</tr>
<tr>
<td>Reference</td>
<td><a href="ref-xhtml-1.1.txt">XHTML 1.1</a></td>
<td>What we'd have to do to support XHTML 1.1.</td>
</tr>
<tr>
<td>Reference</td>
<td><a href="ref-whatwg.txt">WHATWG</a></td>
<td>How WHATWG plays into what we need to do.</td>
</tr>
</tbody>
</table>
<div id="version">$Id$</div>
</body>
</html>

View File

@@ -1,56 +0,0 @@
Naming
The classes in this library follow a few naming conventions, which may
help you find the correct functionality more quickly. Here they are:
All classes occupy the HTMLPurifier pseudo-namespace.
This means that all classes are prefixed with HTMLPurifier_. As such, all
names under HTMLPurifier_ are reserved. I recommend that you use the name
HTMLPurifierX_YourName_ClassName, especially if you want to take advantage
of HTMLPurifier_ConfigDef.
All classes correspond to their path if library/ was in the include path
HTMLPurifier_AttrDef is located at HTMLPurifier/AttrDef.php; replace
underscores with slashes and append .php and you'll have the location of
the class.
Harness and Test are reserved class names for unit tests
The suffix "Test" indicates that the class is a subclass of UnitTestCase
(of the Simpletest library) and is testable. "Harness" indicates a subclass
of UnitTestCase that is not meant to be run but to be extended into
concrete test cases and contains custom test methods (i.e. assert*())
Class names do not necessarily represent inheritance hierarchies
While we try to reflect inheritance in naming to some extent, it is not
guaranteed (for instance, none of the classes inherit from HTMLPurifier,
the base class). However, all class files have the require_once
declarations to whichever classes they are tightly coupled to.
Strategy has a meaning different from the Gang of Four pattern
In Design Patterns, the Gang of Four describes a Strategy object as
encapsulating an algorithm so that they can be switched at run-time. While
our strategies are indeed algorithms, they are not meant to be substituted:
all must be present in order for proper functioning.
Abbreviations are avoided
We try to avoid abbreviations as much as possible, but in some cases,
abbreviated version is more readable than the full version. Here, we
list common abbreviations:
Attr(s) -> Attribute(s)
Def -> Definition
Ambiguity concerning the definition of Def/Definition
While a definition normally defines the structure/acceptable values of
an entity, most of the definitions in this application also attempt
to validate and fix the value. I am unsure of a better name, as
"Validator" would exclude fixing the value, "Fixer" doesn't invoke
the proper image of "fixing" something, and "ValidatorFixer" is too long!
Some other suggestions were "Handler", "Reference", "Check", "Fix",
"Repair" and "Heal".
Transform not Transformer
Transform is both a noun and a verb, and thus we define a "Transform" as
something that "transforms," leaving "Transformer" (which sounds like an
electrical device/robot toy).

View File

@@ -1,11 +0,0 @@
Optimization
Here are some possible optimization techniques we can apply to code sections if
they turn out to be slow. Be sure not to prematurely optimize though!
- Make Tokens Flyweights (may prove problematic, probably not worth it)
- Rewrite regexps into PHP code
- Serialize the Definition object
- Batch regexp validation (do as many per function call as possible)
- Parallelize strategies

48
docs/proposal-colors.html Normal file
View File

@@ -0,0 +1,48 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="description" content="Proposal to allow for color constraints in HTML Purifier." />
<link rel="stylesheet" type="text/css" href="./style.css" />
<title>Proposal: Colors - HTML Purifier</title>
</head><body>
<h1 class="subtitled">Colors</h1>
<div class="subtitle">Hammering some sense into those color-blind newbies</div>
<div id="filing">Filed under Proposals</div>
<div id="index">Return to the <a href="index.html">index</a>.</div>
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
<p>Your website probably has a color-scheme.
<span style="color:#090; background:#FFF;">Green on white</span>,
<span style="color:#A0F; background:#FF0;">purple on yellow</span>,
whatever. When you give users the ability to style their content, you may
want them to keep in line with your styling. If you're website is all
about light colors, you don't want a user to come in and vandalize your
page with a deep maroon.</p>
<p>This is an extremely silly feature proposal, but I'm writing it down anyway.</p>
<p>What if the user could constrain the colors specified in inline styles? You
are only allowed to use these shades of dark green for text and these shades
of light yellow for the background. At the very least, you could ensure
that we did not have pale yellow on white text.</p>
<h2>Implementation issues</h2>
<ol>
<li>Requires the color attribute definition to know, currently, what the text
and background colors are. This becomes difficult when classes are thrown
into the mix.</li>
<li>The user still has to define the permissible colors, how does one do
something like that?</li>
</ol>
<div id="version">$Id$</div>
</body>
</html>

View File

@@ -7,25 +7,22 @@ value is used for. This means decentralized configuration declarations that
are nevertheless error checking and a centralized configuration object.
Directives are divided into namespaces, indicating the major portion of
functionality they cover (although there may be overlaps. Please consult
functionality they cover (although there may be overlaps). Please consult
the documentation in ConfigDef for more information on these namespaces.
Since configuration is dependent on context, most of the internal classes
require a configuration object to be passed as a parameter. However, a few
make this optional: they will supply a default configuration object if none
are passed. These classes are: HTMLPurifier::*, Generator::generateFromTokens
and Lexer::tokenizeHTML. However, whenever a valid configuration object
is defined, that object should be used.
Since configuration is dependant on context, internal classes require a
configuration object to be passed as a parameter. (They also require a
Context object).
In relation to HTMLDefinition and CSSDefinition, there is a special class
In relation to HTMLDefinition and CSSDefinition, there could be a special class
of directives that influence the *construction* of the Definition object.
A standard call pattern would look like:
A theoretical call pattern would look like:
1. Client calls Config->getHTMLDefinition()
2. Config calls HTMLDefinition->createNew(this)
3. HTMLDefinition constructs itself with base configuration
4. HTMLDefinition calls Config->get('HTMLDefinition')
5. Config returns array of directives that later construction
4. HTMLDefinition calls Config->get('HTML')
5. Config returns array of directives
6. HTMLDefinition performs operations and changes specified by directives
7. HTMLPurifier returns constructed definition
8. Config caches definition so it doesn't have to be generated again
@@ -36,3 +33,8 @@ custom copy, which OVERRIDES all directives. Only the base, vanilla copy
is the Singleton, the object actually interfaced with is a operated-upon
clone of that object. Also, if an update to the directives would update
the definition, you'd have to force reconstruction.
In practice, the pulling directives from the config object are
solely need-based, and the flex points are littered throughout the
setup() function. Some sort of refactoring is likely in order. See
ref-xhtml-1.1.txt for more info.

View File

@@ -0,0 +1,133 @@
Filter Levels
When one size *does not* fit all
The more I think about it, the less sense it makes for maintaining one huge
monolithic HTMLDefinition class. There's simply so much variation that
could go into this definition: the set of HTML good for blog entries is
definitely too large for HTML that would be allowed in blog comments. Going
from Transitional to Strict requires changes to the definition.
Allowing users to specify their own whitelists is one step (implemented, btw),
but I have doubts on only doing this. Simply put, the typical programmer is too
lazy to actually go through the trouble of investigating which tags, attributes
and properties to allow. HTMLDefinition makes a big part of what HTMLPurifier
is.
The idea, then, is to setup fundamentally different set of definitions, which
can further be customized using simpler configuration options. Alternatively,
they could be implemented as configuration profiles, which simply load
a set of recommended directives to acheive a desired affect (no simpler
config options though).
Here are some fuzzy levels you could set:
1. Comments - Wordpress recommends a, abbr, acronym, b, blockquote, cite,
code, em, i, strike, strong; however, you could get away with only a, em and
p; also having blockquote and pre tags would be helpful.
2. BBCode - Emulate the usual tagset for forums: b, i, img, a, blockquote,
pre, div, span and h[2-6] (the last three are for specially formatted
posts, div and span require associated classes or inline styling enabled
to be useful)
3. Pages - As permissive as possible without allowing XSS. No protection
against bad design sense, unfortunantely. Suitable for wiki and page
environments. (probably what we have now)
4. Lint - Accept everything in the spec, a Tidy wannabe. (This probably won't
get implemented as it would require routines for things like <object>
and friends to be implemented, which is a lot of work for not a lot of
benefit)
One final note: when you start axing tags that are more commonly used, you
run the risk of accidentally destroying user data, especially if the data
is incoming from a WYSIWYG eidtor that hasn't been synced accordingly. This may
make forbidden element to text transformations desirable (for example, images).
== Element Risk Analysis ==
Legend:
[danger level] - regular tags / uncommon tags ~ deprecated tags
[danger level]* - rare tags
1 - blockquote, code, em, i, p, tt / strong, sub, sup
1* - abbr, acronym, bdo, cite, dfn, kbd, q, samp
2 - b, br, del, div, pre, span / ins, s, strike ~ u
3 - h2, h3, h4, h5, h6 ~ center
4 - h1, big ~ font
5 - a
7 - area, map
These are special use tags, they should be enabled on a blanket basis.
Lists - dd, dl, dt, li, ol, ul ~ menu, dir
Tables - caption, table, td, th, tr / col, colgroup, tbody, tfoot, thead
Forms - fieldset, form, input, lable, legend, optgroup, option, select, textarea
XSS - noscript, object, script ~ applet
Meta - base, basefont, body, head, html, link, meta, style, title
Frames - frame, frameset, iframe
And tag specific notes:
a - general problems involving linkspam
b - too much bold is bad, typographically speaking bold is discouraged
br - often misused
center - CSS, usually no legit use
del - only useful in editing context
div - little meaning in certain contexts i.e. blog comment
h1 - usually no legit use, as header is already set by application
h* - not needed in blog comments
hr - usually not necessary in blog comments
img - could be extremely undesirable if linking to external pics (CSRF, goatse)
pre - could use formatting, only useful in code contexts
q - very little support
s - transform into span with styling or del?
small - technically presentational
span - depends on attribute allowances
sub, sup - specialized
u - little legit use, prefer class with text-decoration
Based on the riskiness of the items, we may want to offer %HTML.DisableImages
attribute and put URI filtering higher up on the priority list.
== Attribute Risk Analysis ==
We actually have a suprisingly small assortment of allowed attributes (the
rest are deprecated in strict, and thus we opted not to allow them, even
though our output is XHTML Transitional by default.)
Required URI - img.alt, img.src, a.href
Medium risk - *.class, *.dir
High risk - img.height, img.width, *.id, *.style
Table - colgroup/col.span, td/th.rowspan, td/th.colspan
Uncommon - *.title, *.lang, *.xml:lang
Rare - td/th.abbr, table.summary, {table}.charoff
Rare URI - del.cite, ins.cite, blockquote.cite, q.cite, img.longdesc
Presentational - {table}.align, {table}.valign, table.frame, table.rules,
table.border
Partially presentational - table.cellpadding, table.cellspacing,
table.width, col.width, colgroup.width
== CSS Risk Analysis ==
There are certain CSS elements that are extremely useful inline, but then
as you get to more presentation oriented styling it may not always be
appropriate to inline them.
Useful - clear, float, border-collapse, caption-side
These CSS properties can break layouts if used improperly. We have excluded
any CSS properties that are not currently implemented (such as position).
Dangerous, can go outside container - float
Easy to abuse - font-size, font-family (font), width
Colored - background-color (background), border-color (border), color
Dramatic - border, list-style-position (list-style), margin, padding,
text-align, text-indent, text-transform, vertical-align, line-height
Dramatic elements substantially change the look of text in ways that should
probably have been reserved to other areas.

View File

@@ -0,0 +1,62 @@
We are going to model our I18N/L10N off of MediaWiki's system. Their's is
obviously quite complicated, so we're going to simplify it a bit for our needs.
== Caching ==
MediaWiki has lots of caching mechanisms built in, which make the code somewhat
more difficult to understand. Before doing any loading, MediaWiki will check
the following places to see if we can be lazy:
1. $mLocalisationCache[$code] - just a variable where it may have been stashed
2. serialized/$code.ser - compiled serialized language file
3. Memcached version of file (with expiration checking)
Expiration checking consists of by ensuring all dependencies have filemtime
that match the ones bundled with the cached copy. Similar checking could be
implemented for serialized versions, as it seems that they are not updated
until manually recompiled.
== Behavior ==
Things that are localizable:
- Weekdays (and abbrev)
- Months (and abbrev)
- Bookstores
- Skin names
- Date preferences / Custom date format
- Default date format
- Default user option overrides
-+ Language names
- Timezones
-+ Character encoding conversion via iconv
- UpperLowerCase first (needs casemaps for some)
- UpperLowerCase
- Uppercase words
- Uppercase word breaks
- Case folding
- Strip punctuation for MySQL search
- Get first character
-+ Alternate encoding
-+ Recoding for edit (and then recode input)
-+ RTL
-+ Direction mark character depending on RTL
-? Arrow depending on RTL
- Languages where italics cannot be used
-+ Number formatting (commafy, transform digits, transform separators)
- Truncate (multibyte)
- Grammar conversions for inflected languages
- Plural transformations
- Formatting expiry times
- Segmenting for diffs (Chinese)
- Convert to variants of language
- Language specific user preference options
- Link trails [[foo]]bar
-+ Language code (RFC 3066)
Neat functionality:
- I18N sprintfDate
- Roman numeral formatting
Items marked with a + likely need to be addressed by HTML Purifier

View File

@@ -0,0 +1,44 @@
Configuration Ideas
Here are some theoretical configuration ideas that we could implement some
time. Note the naming convention: %Namespace.Directive
%Attr.RewriteFragments - if there's %Attr.IDPrefix we may want to transparently
rewrite the URLs we parse too. However, we can only do it when it's a pure
anchor link, so it's not foolproof
%Attr.ClassBlacklist,
%Attr.ClassWhitelist,
%Attr.ClassPolicy - determines what classes are allowed. When
%Attr.ClassPolicy is set to Blacklist, only allow those not in
%Attr.ClassBlacklist. When it's Whitelist, only allow those in
%Attr.ClassWhitelist.
%Attr.MaxWidth,
%Attr.MaxHeight - caps for width and height related checks.
(the hack in Pixels for an image crashing attack could be replaced by this)
%URI.AddRelNofollow - will add rel="nofollow" to all links, preventing the
spread of ill-gotten pagerank
%URI.RelativeToAbsolute - transforms all relative URIs to absolute form
%URI.HostBlacklistRegex - regexes that if matching the host are disallowed
%URI.HostWhitelist - domain names that are excluded from the host blacklist
%URI.HostPolicy - determines whether or not its reject all and then whitelist
or allow all in then do specific blacklists with whitelist intervening.
'DenyAll' or 'AllowAll' (default)
%URI.DisableIPHosts - URIs that have IP addresses for hosts are disallowed.
Be sure to also grab unusual encodings (dword, hex and octal), which may
be currently be caught by regular DNS
%URI.DisableIDN - Disallow raw internationalized domain names. Punycode
will still be permitted.
%URI.ConvertUnusualIPHosts - transform dword/hex/octal IP addresses to the
regular form
%URI.ConvertAbsoluteDNS - Remove extra dots after host names that trigger
absolute DNS. While this is actually the preferred method according to
the RFC, most people opt to use a relative domain name relative to . (root).

View File

@@ -1,31 +1,45 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
<title>DevNetwork Forums</title>
</head>
<body>
<p>Many thanks to the DevNetwork community for answering questions,
theorizing about design, and offering encouragement during
the development of this library in these forum threads:</p>
<ul>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=52905">HTMLPurifier PHP Library hompeage</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53056">How much of CSS to implement?</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53083">Parsing URL only according to URI : Security Risk?</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53096">Gimme a name : URI and friends</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53415">How to document configuration directives</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53479">IPv6</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53539">http and ftp versus news and mailto</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53579">HTMLPurifier - Take your best shot</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53664">Need help optimizing a block of code</a>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53861">Non-SGML characters</a>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=54283">Wordpress makes me cry</a>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=54478">Parameter Object vs. Parameter Array vs. Parameter Functions</a>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=54521">Convert encoding where output cannot represent characters</a>
</ul>
</body>
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<meta name="description" content="Credits and links to DevNetwork forum topics on HTML Purifier." />
<link rel="stylesheet" type="text/css" href="./style.css" />
<title>DevNetwork Credits - HTML Purifier</title>
</head>
<body>
<h1>DevNetwork Credits</h1>
<div id="filing">Filed under Reference</div>
<div id="index">Return to the <a href="index.html">index</a>.</div>
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
<p>Many thanks to the DevNetwork community for answering questions,
theorizing about design, and offering encouragement during
the development of this library in these forum threads:</p>
<ul>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=52905">HTMLPurifier PHP Library hompeage</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53056">How much of CSS to implement?</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53083">Parsing URL only according to URI : Security Risk?</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53096">Gimme a name : URI and friends</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53415">How to document configuration directives</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53479">IPv6</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53539">http and ftp versus news and mailto</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53579">HTMLPurifier - Take your best shot</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53664">Need help optimizing a block of code</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=53861">Non-SGML characters</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=54283">Wordpress makes me cry</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=54478">Parameter Object vs. Parameter Array vs. Parameter Functions</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=54521">Convert encoding where output cannot represent characters</a></li>
<li><a href="http://forums.devnetwork.net/viewtopic.php?t=56411">Reporting errors in a document without line numbers</a></li>
</ul>
<p>...as well as any I may have forgotten.</p>
<div id="version">$Id$</div>
</body>
</html>

View File

@@ -0,0 +1,37 @@
Loose versus Strict
Changes from one doctype to another
There are changes. Wow, how insightful. Not everything changed is relevant
to HTML Purifier, though, so let's take a look:
== Major incompatibilities ==
[done] BLOCKQUOTE changes from 'flow' to 'block'
current behavior: inline inner contents should not be nuked, block-ify as necessary
[partially-done] U, S, STRIKE cut
current behavior: removed completely
projected behavior: replace with appropriate inline span + CSS
[done] ADDRESS from potpourri to Inline (removes p tags)
current behavior: block tags silently dropped
ideal behavior: replace tags with something like <br>. (not high priority)
== Things we can loosen up ==
Tags DIR, MENU, CENTER, ISINDEX, FONT, BASEFONT? allowed in loose
current behavior: transform to strict-valid forms
Attributes allowed in loose (see attribute transforms in 'dev-progress.html')
current behavior: projected to transform into strict-valid forms
== Periphery issues ==
A tag's attribute 'target' (for selecting frames) cut
current behavior: not allowed at all
projected behavior: use loose doctype if needed, needs valid values
[done] OL/LI tag's attribute 'start'/'value' (for renumbering lists) cut
current behavior: no substitute, just delete when in strict, allow in loose
Attribute 'name' deprecated in favor of 'id'
current behavior: dropped silently
projected behavior: create proper AttrTransform
[done] PRE tag allows SUB/SUP? (strict dtd comment vs syntax, loose disallows)
current behavior: disallow as usual

View File

@@ -0,0 +1,22 @@
Proprietary Tags
<nobr> and friends
Here are some proprietary tags that W3C does not define but occasionally show
up in the wild. We have only included tags that would make sense in an
HTML Purifier context.
<align>, block element that aligns (extremely rare)
<blackface>, inline that double-bolds text (extremely rare)
<comment>, hidden comment for IE and WebTV
<multicol cols=number gutter=pixels width=pixels>, multiple columns
<nobr>, no linebreaks
<spacer align=* type="vertical|horizontal|block">, whitespace in doc,
use width/height for block and size for vertical/horizontal (attributes)
(extremely rare)
<wbr>, potential word break point: allows linebreaks. Only works in <nobr>
<listing>, monospace pre-variant (extremely rare)
<plaintext>, escapes all tags to the end of document
<ruby> and friends, (more research needed, appears to be XHTML 1.1 markup)
<xmp>, monospace, replace with pre

37
docs/ref-strictness.txt Normal file
View File

@@ -0,0 +1,37 @@
Is HTML Purifier Strict or Transitional?
A little bit of helpful guidance
Despite the fact that HTML Purifier professes to support both transitional and
strict HTML, it rejects a lot of attributes and elements that are actually, indeed,
valid. You can investigate progress.html to find out precisely what we
are doing to these *deprecated* attributes.
However, users have found that Strict HTML imposes some quite unreasonable
restrictions on certain things. The start and value attributes in ol and
li (respectively) perhaps are the most contested. There's is currently no
widely supported browser method short of JavaScript that can replace these
two deprecated elements. It behooves us to allow these deprecated
attributes when the output is transitional.
Fortunantely, that's the only real bugger case. The others have near-perfect
CSS equivalents, and were presentational anyway. However, the other question
pops up: should we always convert these to the CSS forms when 1. the spec
allows them anyway and 2. older browsers support them better? After all, the
whole point about CSS is to seperate styling from content, so inline styling
doesn't solve that problem.
It's an icky question, and we'll have to deal with it as more and more
transforms get implemented. As of right now, however, we currently support
these loose-only constructs in loose mode:
- <ul start="1">, <li value="1"> attributes
- <u>, <strike>, <s> tags
- flow children in <blockquote>
- mixed children in <address>
The changed child definitions as well as the ul.start li.value are the most
compelling reasons why loose should be used. We may want offer disabling <u>,
<strike> and <s> by themselves. We may also want to offer no pre-emptive
deprecated conversions. This all must be unified.

9
docs/ref-whatwg.txt Normal file
View File

@@ -0,0 +1,9 @@
Web Hypertext Application Technology Working Group
WHATWG
I don't think we need to worry about them. Untrusted users shouldn't be
submitting applications, eh? But if some interesting attribute pops up in
their spec, and might be worth supporting, stick it here.
(none so far, as you can see)

187
docs/ref-xhtml-1.1.txt Normal file
View File

@@ -0,0 +1,187 @@
XHTML 1.1 and HTML Purifier
Todo for XHTML 1.1 support <http://www.w3.org/TR/xhtml11/changes.html>
1. Scratch lang entirely in favor of xml:lang
2. Scratch name entirely in favor of id (partially-done)
3. Support Ruby <http://www.w3.org/TR/2001/REC-ruby-20010531/>
HTML Purifier uses the modularization of XHTML
<http://www.w3.org/TR/xhtml-modularization/> to organize the internals
of HTMLDefinition into a more manageable and extensible fashion. Rather
than have one super-object, HTMLDefinition is split into HTMLModules,
each of which are responsible for defining elements, their attributes,
and other properties (for a more indepth coverage, see
/library/HTMLPurifier/HTMLModule.php's docblock comments).
The modules that W3C defines and we support are:
* 5.1. Attribute Collections (technically not a module
* 5.2. Core Modules
o 5.2.2. Text Module
o 5.2.3. Hypertext Module
o 5.2.4. List Module
* 5.4. Text Extension Modules
o 5.4.1. Presentation Module
o 5.4.2. Edit Module
o 5.4.3. Bi-directional Text Module
* 5.6. Table Modules
o 5.6.2. Tables Module
* 5.7. Image Module
* 5.18. Style Attribute Module
Modules that we don't support but coul support are:
* 5.6. Table Modules
o 5.6.1. Basic Tables Module [?]
* 5.8. Client-side Image Map Module [?]
* 5.9. Server-side Image Map Module [?]
* 5.12. Target Module [?]
* 5.21. Name Identification Module [deprecated]
* 5.22. Legacy Module [deprecated]
These modules will not be implemented due to their dangerousness or
inapplicability as an XHTML fragment:
* 5.2. Core Modules
o 5.2.1. Structure Module
* 5.3. Applet Module
* 5.5. Forms Modules
o 5.5.1. Basic Forms Module
o 5.5.2. Forms Module
* 5.10. Object Module
* 5.11. Frames Module
* 5.13. Iframe Module
* 5.14. Intrinsic Events Module
* 5.15. Metainformation Module
* 5.16. Scripting Module
* 5.17. Style Sheet Module
* 5.19. Link Module
* 5.20. Base Module
We will not be using W3C's XML Schemas or DTDs directly due to the lack
of robust tools for handling them (the main problem is that all the
current parsers are usually PHP 5 only and solely-validating, not
correcting).
The abstraction of the HTMLDefinition creation process will also
contribute to a need for a caching system. Cache invalidation would be
difficult, but could be done by comparing the HTML and Attr config
namespaces with a copy that was packaged along with the serialized
HTMLDefinition object.
== General Use-Case ==
The outwards API of HTMLDefinition has been largely preserved, not
only for backwards-compatibility but also by design. Instead,
HTMLDefinition can be retrieved "raw", in which it loads a structure
that closely resembles the modules of XHTML 1.1. This structure is very
dynamic, making it easy to make cascading changes to global content
sets or remove elements in bulk.
However, once HTML Purifier needs the actual definition, it retrieves
a finalized version of HTMLDefinition. The finalized definition involves
processing the modules into a form that it is optimized for multiple
calls. This final version is immutable and, even if editable, would
be extremely hard to change.
So, some code taking advantage of the XHTML modularization may look
like this:
<?php
$config = HTMLPurifier_Config::createDefault();
$def =& $config->getHTMLDefinition(true); // reference to raw
unset($def->modules['Hypertext']); // rm ''a'' link
$purifier = new HTMLPurifier($config);
$purifier->purify($html); // now the definition is finalized
?>
== Inclusions ==
One of the nice features of HTMLDefinition is that piggy-backing off
of global attribute and content sets is extremely easy to do.
=== Attributes ===
HTMLModule->elements[$element]->attr stores attribute information for the
specific attributes of $element. This is quite close to the final
API that HTML Purifier interfaces with, but there's an important
extra feature: attr may also contain a array with a member index zero.
<?php
HTMLModule->elements[$element]->attr[0] = array('AttrSet');
?>
Rather than map the attribute key 0 to an array (which should be
an AttrDef), it defines a number of attribute collections that should
be merged into this elements attribute array.
Furthermore, the value of an attribute key, attribute value pair need
not be a fully fledged AttrDef object. They can also be a string, which
signifies a AttrDef that is looked up from a centralized registry
AttrTypes. This allows more concise attribute definitions that look
more like W3C's declarations, as well as offering a centralized point
for modifying the behavior of one attribute type. And, of course, the
old method of manually instantiating an AttrDef still works.
=== Attribute Collections ===
Attribute collections are stored and processed in the AttrCollections
object, which is responsible for performing the inclusions signified
by the 0 index. These attribute collections, too, are mutable, by
using HTMLModule->attr_collections. You may add new attributes
to a collection or define an entirely new collection for your module's
use. Inclusions can also be cumulative.
Attribute collections allow us to get rid of so called "global attributes"
(which actually aren't so global).
=== Content Models and ChildDef ===
An implementation of the above-mentioned attributes and attribute
collections was applied to the ChildDef system. HTML Purifier uses
a proprietary system called ChildDef for performance and flexibility
reasons, but this does not line up very well with W3C's notion of
regexps for defining the allowed children of an element.
HTMLPurifier->elements[$element]->content_model and
HTMLPurifier->elements[$element]->content_model_type store information
about the final ChildDef that will be stored in
HTMLPurifier->elements[$element]->child (we use a different variable
because the two forms are sufficiently different).
$content_model is an abstract, string representation of the internal
state of ChildDef, while $content_model_type is a string identifier
of which ChildDef subclass to instantiate. $content_model is processed
by substituting all content set identifiers (capitalized element names)
with their contents. It is then parsed and passed into the appropriate
ChildDef class, as defined by the ContentSets->getChildDef() or the
custom fallback HTMLModule->getChildDef() for custom child definitions
not in the core.
You'll need to use these facilities if you plan on referencing a content
set like "Inline" or "Block", and using them is recommended even if you're
not due to their conciseness.
A few notes on $content_model: it's structure can be as complicated
as you want, but the pipe symbol (|) is reserved for defining possible
choices, due to the content sets implementation. For example, a content
model that looks like:
"Inline -> Block -> a"
...when the Inline content set is defined as "span | b" and the Block
content set is defined as "div | blockquote", will expand into:
"span | b -> div | blockquote -> a"
The custom HTMLModule->getChildDef() function will need to be able to
then feed this information to ChildDef in a usable manner.
=== Content Sets ===
Content sets can be altered using HTMLModule->content_sets, an associative
array of content set names to content set contents. If the content set
already exists, your values are appended on to it (great for, say,
registering the font tag as an inline element), otherwise it is
created. They are substituted into content_model.

View File

@@ -1,39 +0,0 @@
Security
Like anything that claims to afford security, HTML_Purifier can be circumvented
through negligence of people. This class will do its job: no more, no less,
and it's up to you to provide it the proper information and proper context
to be effective. Things to remember:
1. UTF-8. Currently, the parser runs under the assumption that it is dealing
with UTF-8. Not ISO-8859-1 or Windows-1252, UTF-8. And definitely not "no
character encoding explicitly stated" or UTF-7. If you're not using UTF-8 as
your character encoding, you should switch. Now. Make sure any input is
properly converted to UTF-8, or the parser will mangle it badly
(though it won't be a security risk if you're outputting it as UTF-8 though).
2. XHTML 1.0 Transitional. This is what the parser is outputting. For the most
part, it's compatible with HTML 4.01, but XHTML enforces some very nice things
that all web developers should use. Regardless, NO DOCTYPE is a NO. Quirks mode
has waaaay too many quirks for a little parser to handle. We did not select
strict in order to prevent ourselves from being too draconic on users, but
this may be configurable in the future.
3. IDs. They need to be unique, but without some knowledge of the
rest of the document, it's difficult to know what's unique. %Attr.IDBlacklist
needs to be set: we may want to consider disallowing IDs by default to
save lazy programmers.
4. [PROJECTED] Links. We're not going to try for spam protection (although
some hooks for such a module might be nice) but we may offer the ability to
only accept relative URLs. Pick the one that's right for you.
5. CSS. While we can prevent the most flagrant cases from affecting your
layout (such as absolutely positioned elements), no amount of code is going
to protect your pages from being attacked by garish colors and plain old
bad taste. A neat feature would be the ability to define acceptable colors
in a document, but that's not likely to be implemented for a while. In the
meantime, be sure to make sure that floated elements (permitted, since they
can be quite useful) can't mess up your layout. Once again, we may want to
disable this by default to protect lazy developers.

8
docs/specimens/LICENSE Normal file
View File

@@ -0,0 +1,8 @@
Licensing of Specimens
Some files in this directory have different licenses:
windows-live-mail-desktop-beta.html - donated by laacz, public domain
img.png - LGPL, from <http://commons.wikimedia.org/wiki/Image:Pastille_chrome.png>
All other files are by me, and are licensed under LGPL.

View File

@@ -0,0 +1,165 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<title>HTML align attribute to CSS - HTML Purifier Specimen</title>
<style type="text/css">
div.container {position:relative;height:110px;}
div.container.legend .test {text-align:center;line-height:100px;}
div.test {width:100px;height:100px;border:1px solid black;
position:absolute;top:10px;}
div.test.html {left:10px;}
div.test.css {left:140px;}
table {background:#F00;}
img {border:1px solid #000;}
hr {width:50px;}
div.segment {width:250px; float:left; margin-top:1em;}
</style>
</head>
<body>
<h1>HTML align attribute to CSS</h1>
<p>Inspect source for methodology.</p>
<div class="container legend">
<div class="test html">
HTML
</div>
<div class="test css">
CSS
</div>
</div>
<div class="segment">
<h2>table.align</h2>
<h3>left</h3>
<div class="container">
<div class="test html">
a<table align="left"><tr><td>O</td></tr></table>a
</div>
<div class="test css">
a<table style="float:left;"><tr><td>O</td></tr></table>a
</div>
</div>
<h3>center</h3>
<div class="container">
<div class="test html">
a<table align="center"><tr><td>O</td></tr></table>a
</div>
<div class="test css">
a<table style="margin-left:auto; margin-right:auto;"><tr><td>O</td></tr></table>a
</div>
</div>
<h3>right</h3>
<div class="container">
<div class="test html">
a<table align="right"><tr><td>O</td></tr></table>a
</div>
<div class="test css">
a<table style="float:right;"><tr><td>O</td></tr></table>a
</div>
</div>
</div>
<!-- ################################################################## -->
<div class="segment">
<h2>img.align</h2>
<h3>left</h3>
<div class="container">
<div class="test html">
a<img src="img.png" align="left">a
</div>
<div class="test css">
a<img src="img.png" style="float:left;">a
</div>
</div>
<h3>right</h3>
<div class="container">
<div class="test html">
a<img src="img.png" align="right">a
</div>
<div class="test css">
a<img src="img.png" style="float:right;">a
</div>
</div>
<h3>bottom</h3>
<div class="container">
<div class="test html">
a<img src="img.png" align="bottom">a
</div>
<div class="test css">
a<img src="img.png" style="vertical-align:baseline;">a
</div>
</div>
<h3>middle</h3>
<div class="container">
<div class="test html">
a<img src="img.png" align="middle">a
</div>
<div class="test css">
a<img src="img.png" style="vertical-align:middle;">a
</div>
</div>
<h3>top</h3>
<div class="container">
<div class="test html">
a<img src="img.png" align="top">a
</div>
<div class="test css">
a<img src="img.png" style="vertical-align:top;">a
</div>
</div>
</div>
<!-- ################################################################## -->
<div class="segment">
<h2>hr.align</h2>
<h3>left</h3>
<div class="container">
<div class="test html">
<hr align="left" />
</div>
<div class="test css">
<hr style="margin-right:auto; margin-left:0; text-align:left;" />
</div>
</div>
<h3>center</h3>
<div class="container">
<div class="test html">
<hr align="center" />
</div>
<div class="test css">
<hr style="margin-right:auto; margin-left:auto; text-align:center;" />
</div>
</div>
<h3>right</h3>
<div class="container">
<div class="test html">
<hr align="right" />
</div>
<div class="test css">
<hr style="margin-right:0; margin-left:auto; text-align:right;" />
</div>
</div>
</div>
</body>
</html>

BIN
docs/specimens/img.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.1 KiB

View File

@@ -0,0 +1,74 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML ChildAreas="4" xmlns:canvas><HEAD>
<META http-equiv=Content-Type content=text/html;charset=windows-1257>
<STYLE></STYLE>
<META content="MSHTML 6.00.6000.16414" name=GENERATOR></HEAD>
<BODY id=MailContainerBody
style="PADDING-RIGHT: 10px; PADDING-LEFT: 10px; FONT-SIZE: 10pt; COLOR: #000000; PADDING-TOP: 15px; FONT-FAMILY: Arial"
bgColor=#ff6600 leftMargin=0 background="" topMargin=0
name="Compose message area" acc_role="text" CanvasTabStop="false">
<DIV
style="BORDER-TOP: #dddddd 1px solid; FONT-SIZE: 10pt; WIDTH: 100%; MARGIN-RIGHT: 10px; PADDING-TOP: 5px; BORDER-BOTTOM: #dddddd 1px solid; FONT-FAMILY: Verdana; HEIGHT: 25px; BACKGROUND-COLOR: #ffffff"><NOBR><SPAN
title="View a slideshow of the pictures in this e-mail message."
style="PADDING-RIGHT: 20px"><A style="COLOR: #0088e4"
href="http://g.msn.com/5meen_us/171?path=/photomail/{6fc0065f-ffdd-4ca6-9a4c-cc5a93dc122f}&amp;image=47D7B182CFEFB10!127&amp;imagehi=47D7B182CFEFB10!125&amp;CID=323550092004883216">Play
slideshow </A></SPAN><SPAN style="COLOR: #909090"><SPAN>|</SPAN><SPAN
style="PADDING-LEFT: 20px"> Download the highest quality version of a picture by
clicking the + above it </SPAN></SPAN></NOBR></DIV>
<DIV
style="PADDING-RIGHT: 5px; PADDING-LEFT: 7px; PADDING-BOTTOM: 2px; WIDTH: 100%; PADDING-TOP: 2px">
<OL>
<LI><IMG title="Angry smile emoticon"
style="FLOAT: none; MARGIN: 0px; POSITION: static" tabIndex=-1
alt="Angry smile emoticon" src="cid:49F0C856199E4D688D2D740680733D74@wc"
MSNNonUserImageOrEmoticon="true">Un ka <FONT style="BACKGROUND-COLOR: #800000"
color=#cc99ff><STRONG>Tev</STRONG></FONT> iet, un ko tu dari?
<LI>Aha!</LI></OL>
<UL>
<LI>Buletets
<LI>
<DIV align=justify><A title=http://laacz.lv/blog/
href="http://laacz.lv/blog/">http://laacz.lv/blog/</A> un <A
title=http://google.com/ href="http://google.com/">gugle</A></DIV>
<LI>Sarakstucitis</LI></UL></DIV><SPAN><SPAN xmlns:canvas="canvas-namespace-id"
layoutEmptyTextWellFont="Tahoma"><SPAN
style="MARGIN-BOTTOM: 15px; OVERFLOW: visible; HEIGHT: 16px"></SPAN><SPAN
style="MARGIN-BOTTOM: 25px; VERTICAL-ALIGN: top; OVERFLOW: visible; MARGIN-RIGHT: 25px; HEIGHT: 234px">
<TABLE style="DISPLAY: inline">
<TBODY>
<TR>
<TD>
<DIV
style="FONT-WEIGHT: bold; FONT-SIZE: 12pt; FONT-FAMILY: arial; TEXT-ALIGN: center"><A
id=HiresARef
title="Click here to view or download a high resolution version of this picture"
style="COLOR: #0088e4; TEXT-DECORATION: none"
href="http://byfiles.storage.msn.com/x1pMvt0I80jTgT6DuaCpEMbprX3nk3jNv_vjigxV_EYVSMyM_PKgEvDEUtuNhQC-F-23mTTcKyqx6eGaeK2e_wMJ0ikwpDdFntk4SY7pfJUv2g2Ck6R2S2vAA?download">+</A></DIV>
<DIV
title="Click here to view the full image using the online photo viewer."
style="DISPLAY: inline; OVERFLOW: hidden; WIDTH: 140px; HEIGHT: 140px"><A
href="http://g.msn.com/5meen_us/171?path=/photomail/{6fc0065f-ffdd-4ca6-9a4c-cc5a93dc122f}&amp;image=47D7B182CFEFB10!127&amp;imagehi=47D7B182CFEFB10!125&amp;CID=323550092004883216"
border="0"><IMG
style="MARGIN-TOP: 15px; DISPLAY: inline-block; MARGIN-LEFT: 0px"
height=109 src="cid:006A71303B80404E9FB6184E55D6A446@wc" width=140
border=0></A></DIV></TD></TR>
<TR>
<TD>
<DIV
style="FONT-SIZE: 10pt; WIDTH: 140px; FONT-FAMILY: verdana; TEXT-ALIGN: center"><EM><STRONG>This
<U>is </U></STRONG><U>tit</U>le</EM> fo<STRONG>r <FONT
face="Arial Black">t<FONT color=#800000 size=7>h<U>i</U></FONT>s
</FONT>picture</STRONG></DIV></TD></TR></TBODY></TABLE></SPAN></SPAN></SPAN>
<DIV
style="PADDING-RIGHT: 5px; PADDING-LEFT: 7px; PADDING-BOTTOM: 2px; WIDTH: 100%; PADDING-TOP: 2px; HEIGHT: 50px">
<DIV>&nbsp;</DIV></DIV>
<DIV
style="BORDER-TOP: #dddddd 1px solid; FONT-SIZE: 10pt; MARGIN-BOTTOM: 10px; WIDTH: 100%; COLOR: #909090; MARGIN-RIGHT: 10px; PADDING-TOP: 9px; FONT-FAMILY: Verdana; HEIGHT: 42px; BACKGROUND-COLOR: #ffffff"><NOBR><SPAN
title="Join Windows Live to share photos using Windows Live Photo E-mail.">Online
pictures are available for 30 days. <A style="COLOR: #0088e4"
href="http://g.msn.com/5meen_us/175">Get Windows Live Mail desktop to create
your own photo e-mails. </A></SPAN></NOBR></DIV></BODY></HTML>

68
docs/style.css Normal file
View File

@@ -0,0 +1,68 @@
html {font-size:1em; font-family:serif; }
body {margin-left:4em; margin-right:4em; }
dt {font-weight:bold; }
pre {margin-left:2em; }
pre, code, tt {font-family:monospace; font-size:1em; }
h1 {text-align:center; font-family:Garamond, serif;
font-variant:small-caps;}
h2 {border-bottom:1px solid #CCC; font-family:sans-serif; font-weight:normal;
font-size:1.3em;}
h3 {font-family:sans-serif; font-size:1.1em; font-weight:bold; }
h4 {font-family:sans-serif; font-size:0.9em; font-weight:bold; }
/* For witty quips */
.subtitled {margin-bottom:0em;}
.subtitle , .subsubtitle {font-size:.8em; margin-bottom:1em;
font-style:italic; margin-top:-.2em;text-align:center;}
.subsubtitle {text-align:left;margin-left:2em;}
/* Used for special "See also" links. */
.reference {font-style:italic;margin-left:2em;}
/* Marks off asides, discussions on why something is the way it is */
.aside {margin-left:2em; font-family:sans-serif; font-size:0.9em; }
blockquote .label {font-weight:bold; font-size:1em; margin:0 0 .1em;
border-bottom:1px solid #CCC;}
/* A regular table */
.table {border-collapse:collapse; border-bottom:2px solid #888; margin-left:2em; }
.table thead th {margin:0; background:#888; color:#FFF; }
.table thead th:first-child {-moz-border-radius-topleft:1em;}
.table tbody td {border-bottom:1px solid #CCC; padding-right:0.6em;padding-left:0.6em;}
/* Category of the file */
#filing {font-weight:bold; font-size:smaller; }
/* Contains, without exception, Return to index. */
#index {font-size:smaller; }
#home {font-size:smaller;}
/* Contains, without exception, $Id$, for SVN version info. */
#version {text-align:right; font-style:italic; margin:2em 0;}
#toc ol ol {list-style-type:lower-roman;}
#toc ol {list-style-type:decimal;}
#toc {list-style-type:upper-alpha;}
q {
behavior: url(fixquotes.htc); /* IE fix */
quotes: '\201C' '\201D' '\2018' '\2019';
}
q:before {
content: open-quote;
}
q:after {
content: close-quote;
}
/* Marks off implementation details interesting only to the person writing
the class described in the spec. */
.technical {margin-left:2em; }
.technical:before {content:"Technical note: "; font-weight:bold; color:#061; }
/* Marks off sections that are lacking. */
.fixme {margin-left:2em; }
.fixme:before {content:"Fix me: "; font-weight:bold; color:#C00; }

View File

@@ -0,0 +1,10 @@
<?php
/**
* This is a stub include that automatically configures the include path.
*/
set_include_path(dirname(__FILE__) . PATH_SEPARATOR . get_include_path() );
require_once 'HTMLPurifier.php';
?>

View File

@@ -0,0 +1,21 @@
<?php
/**
* Function wrapper for HTML Purifier for quick use.
* @note This function only includes the library when it is called. While
* this is efficient for instances when you only use HTML Purifier
* on a few of your pages, it murders bytecode caching. You still
* need to add HTML Purifier to your path.
* @note ''HTMLPurifier()'' is NOT the same as ''new HTMLPurifier()''
*/
function HTMLPurifier($html, $config = null) {
static $purifier = false;
if (!$purifier) {
require_once 'HTMLPurifier.php';
$purifier = new HTMLPurifier();
}
return $purifier->purify($html, $config);
}
?>

View File

@@ -3,7 +3,7 @@
/*!
* @mainpage
*
* HTMLPurifier is an HTML filter that will take an arbitrary snippet of
* HTML Purifier is an HTML filter that will take an arbitrary snippet of
* HTML and rigorously test, validate and filter it into a version that
* is safe for output onto webpages. It achieves this by:
*
@@ -22,7 +22,7 @@
*/
/*
HTMLPurifier - Standards Compliant HTML Filtering
HTML Purifier 1.6.1 - Standards Compliant HTML Filtering
Copyright (C) 2006 Edward Z. Yang
This library is free software; you can redistribute it and/or
@@ -44,6 +44,7 @@
// they get included
require_once 'HTMLPurifier/ConfigSchema.php';
require_once 'HTMLPurifier/Config.php';
require_once 'HTMLPurifier/Context.php';
require_once 'HTMLPurifier/Lexer.php';
require_once 'HTMLPurifier/Generator.php';
@@ -63,51 +64,107 @@ require_once 'HTMLPurifier/Encoder.php';
class HTMLPurifier
{
var $version = '1.6.1';
var $config;
var $filters;
var $lexer, $strategy, $generator;
/**
* Final HTMLPurifier_Context of last run purification. Might be an array.
* @public
*/
var $context;
/**
* Initializes the purifier.
* @param $config Optional HTMLPurifier_Config object for all instances of
* the purifier, if omitted, a default configuration is
* supplied (which can be overridden on a per-use basis).
* The parameter can also be any type that
* HTMLPurifier_Config::create() supports.
*/
function HTMLPurifier($config = null) {
$this->config = $config ? $config : HTMLPurifier_Config::createDefault();
$this->config = HTMLPurifier_Config::create($config);
$this->lexer = HTMLPurifier_Lexer::create();
$this->strategy = new HTMLPurifier_Strategy_Core();
$this->generator = new HTMLPurifier_Generator();
$this->encoder = new HTMLPurifier_Encoder();
}
/**
* Adds a filter to process the output. First come first serve
* @param $filter HTMLPurifier_Filter object
*/
function addFilter($filter) {
$this->filters[] = $filter;
}
/**
* Filters an HTML snippet/document to be XSS-free and standards-compliant.
*
* @param $html String of HTML to purify
* @param $config HTMLPurifier_Config object for this operation, if omitted,
* defaults to the config object specified during this
* object's construction.
* object's construction. The parameter can also be any type
* that HTMLPurifier_Config::create() supports.
* @return Purified HTML
*/
function purify($html, $config = null) {
$config = $config ? $config : $this->config;
$html = $this->encoder->convertToUTF8($html, $config);
$config = $config ? HTMLPurifier_Config::create($config) : $this->config;
$context = new HTMLPurifier_Context();
$html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);
for ($i = 0, $size = count($this->filters); $i < $size; $i++) {
$html = $this->filters[$i]->preFilter($html, $config, $context);
}
// purified HTML
$html =
$this->generator->generateFromTokens(
// list of tokens
$this->strategy->execute(
$this->lexer->tokenizeHTML($html, $config),
$config
// list of un-purified tokens
$this->lexer->tokenizeHTML(
// un-purified HTML
$html, $config, $context
),
$config, $context
),
$config
$config, $context
);
$html = $this->encoder->convertFromUTF8($html, $config);
for ($i = $size - 1; $i >= 0; $i--) {
$html = $this->filters[$i]->postFilter($html, $config, $context);
}
$html = HTMLPurifier_Encoder::convertFromUTF8($html, $config, $context);
$this->context =& $context;
return $html;
}
/**
* Filters an array of HTML snippets
* @param $config Optional HTMLPurifier_Config object for this operation.
* See HTMLPurifier::purify() for more details.
* @return Array of purified HTML
*/
function purifyArray($array_of_html, $config = null) {
$context_array = array();
foreach ($array_of_html as $key => $html) {
$array_of_html[$key] = $this->purify($html, $config);
$context_array[$key] = $this->context;
}
$this->context = $context_array;
return $array_of_html;
}
}
?>

View File

@@ -0,0 +1,100 @@
<?php
require_once 'HTMLPurifier/AttrTypes.php';
require_once 'HTMLPurifier/AttrDef/Lang.php';
/**
* Defines common attribute collections that modules reference
*/
class HTMLPurifier_AttrCollections
{
/**
* Associative array of attribute collections, indexed by name
* @note Technically, the composition of these is more complicated,
* but we bypass it using our own excludes property
*/
var $info = array();
/**
* Performs all expansions on internal data for use by other inclusions
* It also collects all attribute collection extensions from
* modules
* @param $attr_types HTMLPurifier_AttrTypes instance
* @param $modules Hash array of HTMLPurifier_HTMLModule members
*/
function HTMLPurifier_AttrCollections($attr_types, $modules) {
$info =& $this->info;
// load extensions from the modules
foreach ($modules as $module) {
foreach ($module->attr_collections as $coll_i => $coll) {
foreach ($coll as $attr_i => $attr) {
if ($attr_i === 0 && isset($info[$coll_i][$attr_i])) {
// merge in includes
$info[$coll_i][$attr_i] = array_merge(
$info[$coll_i][$attr_i], $attr);
continue;
}
$info[$coll_i][$attr_i] = $attr;
}
}
}
// perform internal expansions and inclusions
foreach ($info as $name => $attr) {
// merge attribute collections that include others
$this->performInclusions($info[$name]);
// replace string identifiers with actual attribute objects
$this->expandIdentifiers($info[$name], $attr_types);
}
}
/**
* Takes a reference to an attribute associative array and performs
* all inclusions specified by the zero index.
* @param &$attr Reference to attribute array
*/
function performInclusions(&$attr) {
if (!isset($attr[0])) return;
$merge = $attr[0];
// loop through all the inclusions
for ($i = 0; isset($merge[$i]); $i++) {
// foreach attribute of the inclusion, copy it over
foreach ($this->info[$merge[$i]] as $key => $value) {
if (isset($attr[$key])) continue; // also catches more inclusions
$attr[$key] = $value;
}
if (isset($info[$merge[$i]][0])) {
// recursion
$merge = array_merge($merge, isset($info[$merge[$i]][0]));
}
}
unset($attr[0]);
}
/**
* Expands all string identifiers in an attribute array by replacing
* them with the appropriate values inside HTMLPurifier_AttrTypes
* @param &$attr Reference to attribute array
* @param $attr_types HTMLPurifier_AttrTypes instance
*/
function expandIdentifiers(&$attr, $attr_types) {
foreach ($attr as $def_i => $def) {
if ($def_i === 0) continue;
if (!is_string($def)) continue;
if ($def === false) {
unset($attr[$def_i]);
continue;
}
if (isset($attr_types->info[$def])) {
$attr[$def_i] = $attr_types->info[$def];
} else {
trigger_error('Attempted to reference undefined attribute type', E_USER_ERROR);
unset($attr[$def_i]);
}
}
}
}
?>

View File

@@ -1,26 +0,0 @@
<?php
/**
* Internal data-structure used in attribute validation to accumulate state.
*
* All it is is a data-structure that holds objects that accumulate state, like
* HTMLPurifier_IDAccumulator.
*
* @param Many functions that accept this object have it as a mandatory
* parameter, even when there is no use for it. Though this is
* for the same reasons as why HTMLPurifier_Config is a mandatory
* parameter, it is also because you cannot assign a default value
* to a parameter passed by reference (passing by reference is essential
* for context to work in PHP 4).
*/
class HTMLPurifier_AttrContext
{
/**
* Contains an HTMLPurifier_IDAccumulator, which keeps track of used IDs.
* @public
*/
var $id_accumulator;
}
?>

View File

@@ -1,7 +1,5 @@
<?php
require_once 'HTMLPurifier/AttrContext.php';
/**
* Base class for all validating attribute definitions.
*
@@ -22,10 +20,7 @@ class HTMLPurifier_AttrDef
var $minimized = false;
/**
* Abstract function defined for functions that validate and clean strings.
*
* This function forms the basis for all the subclasses: they must
* define this method.
* Validates and cleans passed string according to a definition.
*
* @public
* @param $string String to be validated and cleaned.
@@ -48,7 +43,16 @@ class HTMLPurifier_AttrDef
*
* @note This method is not entirely standards compliant, as trim() removes
* more types of whitespace than specified in the spec. In practice,
* this is rarely a problem.
* this is rarely a problem, as those extra characters usually have
* already been removed by HTMLPurifier_Encoder.
*
* @warning This processing is inconsistent with XML's whitespace handling
* as specified by section 3.3.3 and referenced XHTML 1.0 section
* 4.7. Compliant processing requires all line breaks normalized
* to "\n", so the fix is not as simple as fixing it in this
* function. Trim and whitespace collapsing are supposed to only
* occur in NMTOKENs. However, note that we are NOT necessarily
* parsing XML, thus, this behavior may still be correct.
*
* @public
*/

View File

@@ -8,6 +8,11 @@ require_once 'HTMLPurifier/CSSDefinition.php';
* @note We don't implement the whole CSS specification, so it might be
* difficult to reuse this component in the context of validating
* actual stylesheet declarations.
* @note If we were really serious about validating the CSS, we would
* tokenize the styles and then parse the tokens. Obviously, we
* are not doing that. Doing that could seriously harm performance,
* but would make these components a lot more viable for a CSS
* filtering solution.
*/
class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
{
@@ -20,6 +25,9 @@ class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
// we're going to break the spec and explode by semicolons.
// This is because semicolon rarely appears in escaped form
// Doing this is generally flaky but fast
// IT MIGHT APPEAR IN URIs, see HTMLPurifier_AttrDef_CSSURI
// for details
$declarations = explode(';', $css);
$propvalues = array();
@@ -43,6 +51,7 @@ class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
$propvalues[$property] = $result;
}
// procedure does not write the new CSS simultaneously, so it's
// slightly inefficient, but it's the only way of getting rid of
// duplicates. Perhaps config to optimize it, but not now.

View File

@@ -0,0 +1,87 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/CSSDefinition.php';
/**
* Validates shorthand CSS property background.
* @warning Does not support url tokens that have internal spaces.
*/
class HTMLPurifier_AttrDef_CSS_Background extends HTMLPurifier_AttrDef
{
/**
* Local copy of component validators.
* @note See HTMLPurifier_AttrDef_Font::$info for a similar impl.
*/
var $info;
function HTMLPurifier_AttrDef_CSS_Background($config) {
$def = $config->getCSSDefinition();
$this->info['background-color'] = $def->info['background-color'];
$this->info['background-image'] = $def->info['background-image'];
$this->info['background-repeat'] = $def->info['background-repeat'];
$this->info['background-attachment'] = $def->info['background-attachment'];
$this->info['background-position'] = $def->info['background-position'];
}
function validate($string, $config, &$context) {
// regular pre-processing
$string = $this->parseCDATA($string);
if ($string === '') return false;
// assumes URI doesn't have spaces in it
$bits = explode(' ', strtolower($string)); // bits to process
$caught = array();
$caught['color'] = false;
$caught['image'] = false;
$caught['repeat'] = false;
$caught['attachment'] = false;
$caught['position'] = false;
$i = 0; // number of catches
$none = false;
foreach ($bits as $bit) {
if ($bit === '') continue;
foreach ($caught as $key => $status) {
if ($key != 'position') {
if ($status !== false) continue;
$r = $this->info['background-' . $key]->validate($bit, $config, $context);
} else {
$r = $bit;
}
if ($r === false) continue;
if ($key == 'position') {
if ($caught[$key] === false) $caught[$key] = '';
$caught[$key] .= $r . ' ';
} else {
$caught[$key] = $r;
}
$i++;
break;
}
}
if (!$i) return false;
if ($caught['position'] !== false) {
$caught['position'] = $this->info['background-position']->
validate($caught['position'], $config, $context);
}
$ret = array();
foreach ($caught as $value) {
if ($value === false) continue;
$ret[] = $value;
}
if (empty($ret)) return false;
return implode(' ', $ret);
}
}
?>

View File

@@ -0,0 +1,130 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/AttrDef/CSS/Length.php';
require_once 'HTMLPurifier/AttrDef/CSS/Percentage.php';
/* W3C says:
[ // adjective and number must be in correct order, even if
// you could switch them without introducing ambiguity.
// some browsers support that syntax
[
<percentage> | <length> | left | center | right
]
[
<percentage> | <length> | top | center | bottom
]?
] |
[ // this signifies that the vertical and horizontal adjectives
// can be arbitrarily ordered, however, there can only be two,
// one of each, or none at all
[
left | center | right
] ||
[
top | center | bottom
]
]
top, left = 0%
center, (none) = 50%
bottom, right = 100%
*/
/* QuirksMode says:
keyword + length/percentage must be ordered correctly, as per W3C
Internet Explorer and Opera, however, support arbitrary ordering. We
should fix it up.
Minor issue though, not strictly necessary.
*/
// control freaks may appreciate the ability to convert these to
// percentages or something, but it's not necessary
/**
* Validates the value of background-position.
*/
class HTMLPurifier_AttrDef_CSS_BackgroundPosition extends HTMLPurifier_AttrDef
{
var $length;
var $percentage;
function HTMLPurifier_AttrDef_CSS_BackgroundPosition() {
$this->length = new HTMLPurifier_AttrDef_CSS_Length();
$this->percentage = new HTMLPurifier_AttrDef_CSS_Percentage();
}
function validate($string, $config, &$context) {
$string = $this->parseCDATA($string);
$bits = explode(' ', $string);
$keywords = array();
$keywords['h'] = false; // left, right
$keywords['v'] = false; // top, bottom
$keywords['c'] = false; // center
$measures = array();
$i = 0;
$lookup = array(
'top' => 'v',
'bottom' => 'v',
'left' => 'h',
'right' => 'h',
'center' => 'c'
);
foreach ($bits as $bit) {
if ($bit === '') continue;
// test for keyword
$lbit = ctype_lower($bit) ? $bit : strtolower($bit);
if (isset($lookup[$lbit])) {
$status = $lookup[$lbit];
$keywords[$status] = $lbit;
$i++;
}
// test for length
$r = $this->length->validate($bit, $config, $context);
if ($r !== false) {
$measures[] = $r;
$i++;
}
// test for percentage
$r = $this->percentage->validate($bit, $config, $context);
if ($r !== false) {
$measures[] = $r;
$i++;
}
}
if (!$i) return false; // no valid values were caught
$ret = array();
// first keyword
if ($keywords['h']) $ret[] = $keywords['h'];
elseif (count($measures)) $ret[] = array_shift($measures);
elseif ($keywords['c']) {
$ret[] = $keywords['c'];
$keywords['c'] = false; // prevent re-use: center = center center
}
if ($keywords['v']) $ret[] = $keywords['v'];
elseif (count($measures)) $ret[] = array_shift($measures);
elseif ($keywords['c']) $ret[] = $keywords['c'];
if (empty($ret)) return false;
return implode(' ', $ret);
}
}
?>

View File

@@ -5,7 +5,7 @@ require_once 'HTMLPurifier/AttrDef.php';
/**
* Validates the border property as defined by CSS.
*/
class HTMLPurifier_AttrDef_Border extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_Border extends HTMLPurifier_AttrDef
{
/**
@@ -13,7 +13,7 @@ class HTMLPurifier_AttrDef_Border extends HTMLPurifier_AttrDef
*/
var $info = array();
function HTMLPurifier_AttrDef_Border($config) {
function HTMLPurifier_AttrDef_CSS_Border($config) {
$def = $config->getCSSDefinition();
$this->info['border-width'] = $def->info['border-width'];
$this->info['border-style'] = $def->info['border-style'];

View File

@@ -5,7 +5,7 @@ require_once 'HTMLPurifier/AttrDef.php';
/**
* Validates Color as defined by CSS.
*/
class HTMLPurifier_AttrDef_Color extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef
{
/**

View File

@@ -9,7 +9,7 @@
* especially useful for CSS values, which often are a choice between
* an enumerated set of predefined values or a flexible data type.
*/
class HTMLPurifier_AttrDef_Composite extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_Composite extends HTMLPurifier_AttrDef
{
/**
@@ -21,7 +21,7 @@ class HTMLPurifier_AttrDef_Composite extends HTMLPurifier_AttrDef
/**
* @param $defs List of HTMLPurifier_AttrDef objects
*/
function HTMLPurifier_AttrDef_Composite($defs) {
function HTMLPurifier_AttrDef_CSS_Composite($defs) {
$this->defs = $defs;
}

View File

@@ -5,7 +5,7 @@ require_once 'HTMLPurifier/AttrDef.php';
/**
* Validates shorthand CSS property font.
*/
class HTMLPurifier_AttrDef_Font extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_Font extends HTMLPurifier_AttrDef
{
/**
@@ -30,7 +30,7 @@ class HTMLPurifier_AttrDef_Font extends HTMLPurifier_AttrDef
'status-bar' => true
);
function HTMLPurifier_AttrDef_Font($config) {
function HTMLPurifier_AttrDef_CSS_Font($config) {
$def = $config->getCSSDefinition();
$this->info['font-style'] = $def->info['font-style'];
$this->info['font-variant'] = $def->info['font-variant'];

View File

@@ -7,7 +7,7 @@ require_once 'HTMLPurifier/AttrDef.php';
/**
* Validates a font family list according to CSS spec
*/
class HTMLPurifier_AttrDef_FontFamily extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
{
/**

View File

@@ -1,13 +1,12 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/AttrDef/Number.php';
require_once 'HTMLPurifier/AttrDef/CSS/Number.php';
/**
* Represents a Length as defined by CSS.
* @warning Be sure not to confuse this with HTMLPurifier_AttrDef_Length!
*/
class HTMLPurifier_AttrDef_CSSLength extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_Length extends HTMLPurifier_AttrDef
{
/**
@@ -26,8 +25,8 @@ class HTMLPurifier_AttrDef_CSSLength extends HTMLPurifier_AttrDef
* @param $non_negative Bool indication whether or not negative values are
* allowed.
*/
function HTMLPurifier_AttrDef_CSSLength($non_negative = false) {
$this->number_def = new HTMLPurifier_AttrDef_Number($non_negative);
function HTMLPurifier_AttrDef_CSS_Length($non_negative = false) {
$this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative);
}
function validate($length, $config, &$context) {
@@ -40,6 +39,7 @@ class HTMLPurifier_AttrDef_CSSLength extends HTMLPurifier_AttrDef
// we assume all units are two characters
$unit = substr($length, $strlen - 2);
if (!ctype_lower($unit)) $unit = strtolower($unit);
$number = substr($length, 0, $strlen - 2);
if (!isset($this->units[$unit])) return false;

View File

@@ -0,0 +1,80 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
/**
* Validates shorthand CSS property list-style.
* @warning Does not support url tokens that have internal spaces.
*/
class HTMLPurifier_AttrDef_CSS_ListStyle extends HTMLPurifier_AttrDef
{
/**
* Local copy of component validators.
* @note See HTMLPurifier_AttrDef_CSS_Font::$info for a similar impl.
*/
var $info;
function HTMLPurifier_AttrDef_CSS_ListStyle($config) {
$def = $config->getCSSDefinition();
$this->info['list-style-type'] = $def->info['list-style-type'];
$this->info['list-style-position'] = $def->info['list-style-position'];
$this->info['list-style-image'] = $def->info['list-style-image'];
}
function validate($string, $config, &$context) {
// regular pre-processing
$string = $this->parseCDATA($string);
if ($string === '') return false;
// assumes URI doesn't have spaces in it
$bits = explode(' ', strtolower($string)); // bits to process
$caught = array();
$caught['type'] = false;
$caught['position'] = false;
$caught['image'] = false;
$i = 0; // number of catches
$none = false;
foreach ($bits as $bit) {
if ($i >= 3) return; // optimization bit
if ($bit === '') continue;
foreach ($caught as $key => $status) {
if ($status !== false) continue;
$r = $this->info['list-style-' . $key]->validate($bit, $config, $context);
if ($r === false) continue;
if ($r === 'none') {
if ($none) continue;
else $none = true;
if ($key == 'image') continue;
}
$caught[$key] = $r;
$i++;
break;
}
}
if (!$i) return false;
$ret = array();
// construct type
if ($caught['type']) $ret[] = $caught['type'];
// construct image
if ($caught['image']) $ret[] = $caught['image'];
// construct position
if ($caught['position']) $ret[] = $caught['position'];
if (empty($ret)) return false;
return implode(' ', $ret);
}
}
?>

View File

@@ -13,7 +13,7 @@ require_once 'HTMLPurifier/AttrDef.php';
* can only be used alone: it will never manifest as part of a multi
* shorthand declaration. Thus, this class does not allow inherit.
*/
class HTMLPurifier_AttrDef_Multiple extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_Multiple extends HTMLPurifier_AttrDef
{
/**
@@ -30,7 +30,7 @@ class HTMLPurifier_AttrDef_Multiple extends HTMLPurifier_AttrDef
* @param $single HTMLPurifier_AttrDef to multiply
* @param $max Max number of values allowed (usually four)
*/
function HTMLPurifier_AttrDef_Multiple($single, $max = 4) {
function HTMLPurifier_AttrDef_CSS_Multiple($single, $max = 4) {
$this->single = $single;
$this->max = $max;
}

View File

@@ -3,7 +3,7 @@
/**
* Validates a number as defined by the CSS spec.
*/
class HTMLPurifier_AttrDef_Number extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_Number extends HTMLPurifier_AttrDef
{
/**
@@ -14,7 +14,7 @@ class HTMLPurifier_AttrDef_Number extends HTMLPurifier_AttrDef
/**
* @param $non_negative Bool indicating whether negatives are forbidden
*/
function HTMLPurifier_AttrDef_Number($non_negative = false) {
function HTMLPurifier_AttrDef_CSS_Number($non_negative = false) {
$this->non_negative = $non_negative;
}

View File

@@ -1,25 +1,24 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/AttrDef/Number.php';
require_once 'HTMLPurifier/AttrDef/CSS/Number.php';
/**
* Validates a Percentage as defined by the HTML spec.
* @note This also allows integer pixel values.
* Validates a Percentage as defined by the CSS spec.
*/
class HTMLPurifier_AttrDef_Percentage extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_Percentage extends HTMLPurifier_AttrDef
{
/**
* Instance of HTMLPurifier_AttrDef_Number to defer pixel validation
* Instance of HTMLPurifier_AttrDef_CSS_Number to defer number validation
*/
var $number_def;
/**
* @param Bool indicating whether to forbid negative values
*/
function HTMLPurifier_AttrDef_Percentage($non_negative = false) {
$this->number_def = new HTMLPurifier_AttrDef_Number($non_negative);
function HTMLPurifier_AttrDef_CSS_Percentage($non_negative = false) {
$this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative);
}
function validate($string, $config, &$context) {

View File

@@ -7,7 +7,7 @@ require_once 'HTMLPurifier/AttrDef.php';
* @note This class could be generalized into a version that acts sort of
* like Enum except you can compound the allowed values.
*/
class HTMLPurifier_AttrDef_TextDecoration extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_CSS_TextDecoration extends HTMLPurifier_AttrDef
{
/**

View File

@@ -0,0 +1,58 @@
<?php
require_once 'HTMLPurifier/AttrDef/URI.php';
/**
* Validates a URI in CSS syntax, which uses url('http://example.com')
* @note While theoretically speaking a URI in a CSS document could
* be non-embedded, as of CSS2 there is no such usage so we're
* generalizing it. This may need to be changed in the future.
* @warning Since HTMLPurifier_AttrDef_CSS blindly uses semicolons as
* the separator, you cannot put a literal semicolon in
* in the URI. Try percent encoding it, in that case.
*/
class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI
{
function HTMLPurifier_AttrDef_CSS_URI() {
$this->HTMLPurifier_AttrDef_URI(true); // always embedded
}
function validate($uri_string, $config, &$context) {
// parse the URI out of the string and then pass it onto
// the parent object
$uri_string = $this->parseCDATA($uri_string);
if (strpos($uri_string, 'url(') !== 0) return false;
$uri_string = substr($uri_string, 4);
$new_length = strlen($uri_string) - 1;
if ($uri_string[$new_length] != ')') return false;
$uri = trim(substr($uri_string, 0, $new_length));
if (isset($uri[0]) && ($uri[0] == "'" || $uri[0] == '"')) {
$quote = $uri[0];
$new_length = strlen($uri) - 1;
if ($uri[$new_length] !== $quote) return false;
$uri = substr($uri, 1, $new_length - 1);
}
$keys = array( '(', ')', ',', ' ', '"', "'");
$values = array('\\(', '\\)', '\\,', '\\ ', '\\"', "\\'");
$uri = str_replace($values, $keys, $uri);
$result = parent::validate($uri, $config, $context);
if ($result === false) return false;
// escape necessary characters according to CSS spec
// except for the comma, none of these should appear in the
// URI at all
$result = str_replace($keys, $values, $result);
return "url($result)";
}
}
?>

View File

@@ -5,6 +5,9 @@ require_once 'HTMLPurifier/AttrDef.php';
// Enum = Enumerated
/**
* Validates a keyword against a list of valid values.
* @warning The case-insensitive compare of this function uses PHP's
* built-in strtolower and ctype_lower functions, which may
* cause problems with international comparisons
*/
class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef
{
@@ -25,8 +28,8 @@ class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef
* @param $case_sensitive Bool indicating whether or not case sensitive
*/
function HTMLPurifier_AttrDef_Enum(
$valid_values = array(), $case_sensitive = false) {
$valid_values = array(), $case_sensitive = false
) {
$this->valid_values = array_flip($valid_values);
$this->case_sensitive = $case_sensitive;
}
@@ -34,6 +37,7 @@ class HTMLPurifier_AttrDef_Enum extends HTMLPurifier_AttrDef
function validate($string, $config, &$context) {
$string = trim($string);
if (!$this->case_sensitive) {
// we may want to do full case-insensitive libraries
$string = ctype_lower($string) ? $string : strtolower($string);
}
$result = isset($this->valid_values[$string]);

View File

@@ -0,0 +1,34 @@
<?php
HTMLPurifier_ConfigSchema::define(
'Attr', 'AllowedFrameTargets', array(), 'lookup',
'Lookup table of all allowed link frame targets. Some commonly used '.
'link targets include _blank, _self, _parent and _top. Values should '.
'be lowercase, as validation will be done in a case-sensitive manner '.
'despite W3C\'s recommendation. XHTML 1.0 Strict does not permit '.
'the target attribute so this directive will have no effect in that '.
'doctype. XHTML 1.1 does not enable the Target module by default, you '.
'will have to manually enable it (see the module documentation for more details.)'
);
require_once 'HTMLPurifier/AttrDef/Enum.php';
/**
* Special-case enum attribute definition that lazy loads allowed frame targets
*/
class HTMLPurifier_AttrDef_HTML_FrameTarget extends HTMLPurifier_AttrDef_Enum
{
var $valid_values = false; // uninitialized value
var $case_sensitive = false;
function HTMLPurifier_AttrDef_HTML_FrameTarget() {}
function validate($string, $config, &$context) {
if ($this->valid_values === false) $this->valid_values = $config->get('Attr', 'AllowedFrameTargets');
return parent::validate($string, $config, $context);
}
}
?>

View File

@@ -0,0 +1,121 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/IDAccumulator.php';
HTMLPurifier_ConfigSchema::define(
'Attr', 'EnableID', false, 'bool',
'Allows the ID attribute in HTML. This is disabled by default '.
'due to the fact that without proper configuration user input can '.
'easily break the validation of a webpage by specifying an ID that is '.
'already on the surrounding HTML. If you don\'t mind throwing caution to '.
'the wind, enable this directive, but I strongly recommend you also '.
'consider blacklisting IDs you use (%Attr.IDBlacklist) or prefixing all '.
'user supplied IDs (%Attr.IDPrefix). This directive has been available '.
'since 1.2.0, and when set to true reverts to the behavior of pre-1.2.0 '.
'versions.'
);
HTMLPurifier_ConfigSchema::defineAlias(
'HTML', 'EnableAttrID', 'Attr', 'EnableID'
);
HTMLPurifier_ConfigSchema::define(
'Attr', 'IDPrefix', '', 'string',
'String to prefix to IDs. If you have no idea what IDs your pages '.
'may use, you may opt to simply add a prefix to all user-submitted ID '.
'attributes so that they are still usable, but will not conflict with '.
'core page IDs. Example: setting the directive to \'user_\' will result in '.
'a user submitted \'foo\' to become \'user_foo\' Be sure to set '.
'%HTML.EnableAttrID to true before using '.
'this. This directive was available since 1.2.0.'
);
HTMLPurifier_ConfigSchema::define(
'Attr', 'IDPrefixLocal', '', 'string',
'Temporary prefix for IDs used in conjunction with %Attr.IDPrefix. If '.
'you need to allow multiple sets of '.
'user content on web page, you may need to have a seperate prefix that '.
'changes with each iteration. This way, seperately submitted user content '.
'displayed on the same page doesn\'t clobber each other. Ideal values '.
'are unique identifiers for the content it represents (i.e. the id of '.
'the row in the database). Be sure to add a seperator (like an underscore) '.
'at the end. Warning: this directive will not work unless %Attr.IDPrefix '.
'is set to a non-empty value! This directive was available since 1.2.0.'
);
HTMLPurifier_ConfigSchema::define(
'Attr', 'IDBlacklistRegexp', null, 'string/null',
'PCRE regular expression to be matched against all IDs. If the expression '.
'is matches, the ID is rejected. Use this with care: may cause '.
'significant degradation. ID matching is done after all other '.
'validation. This directive was available since 1.6.0.'
);
/**
* Validates the HTML attribute ID.
* @warning Even though this is the id processor, it
* will ignore the directive Attr:IDBlacklist, since it will only
* go according to the ID accumulator. Since the accumulator is
* automatically generated, it will have already absorbed the
* blacklist. If you're hacking around, make sure you use load()!
*/
class HTMLPurifier_AttrDef_HTML_ID extends HTMLPurifier_AttrDef
{
// ref functionality disabled, since we also have to verify
// whether or not the ID it refers to exists
function validate($id, $config, &$context) {
if (!$config->get('Attr', 'EnableID')) return false;
$id = trim($id); // trim it first
if ($id === '') return false;
$prefix = $config->get('Attr', 'IDPrefix');
if ($prefix !== '') {
$prefix .= $config->get('Attr', 'IDPrefixLocal');
// prevent re-appending the prefix
if (strpos($id, $prefix) !== 0) $id = $prefix . $id;
} elseif ($config->get('Attr', 'IDPrefixLocal') !== '') {
trigger_error('%Attr.IDPrefixLocal cannot be used unless '.
'%Attr.IDPrefix is set', E_USER_WARNING);
}
//if (!$this->ref) {
$id_accumulator =& $context->get('IDAccumulator');
if (isset($id_accumulator->ids[$id])) return false;
//}
// we purposely avoid using regex, hopefully this is faster
if (ctype_alpha($id)) {
$result = true;
} else {
if (!ctype_alpha(@$id[0])) return false;
$trim = trim( // primitive style of regexps, I suppose
$id,
'A..Za..z0..9:-._'
);
$result = ($trim === '');
}
$regexp = $config->get('Attr', 'IDBlacklistRegexp');
if ($regexp && preg_match($regexp, $id)) {
return false;
}
if (/*!$this->ref && */$result) $id_accumulator->add($id);
// if no change was made to the ID, return the result
// else, return the new id if stripping whitespace made it
// valid, or return false.
return $result ? $id : false;
}
}
?>

View File

@@ -1,18 +1,16 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/AttrDef/Pixels.php';
require_once 'HTMLPurifier/AttrDef/HTML/Pixels.php';
/**
* Validates the HTML type length (not to be confused with CSS's length).
*
* This accepts integer pixels or percentages as lengths for certain
* HTML attributes. Don't use this for CSS: that's
* HTMLPurifier_AttrDef_CSSLength which requires prefixes and allows a lot
* more different types.
* HTML attributes.
*/
class HTMLPurifier_AttrDef_Length extends HTMLPurifier_AttrDef_Pixels
class HTMLPurifier_AttrDef_HTML_Length extends HTMLPurifier_AttrDef_HTML_Pixels
{
function validate($string, $config, &$context) {

View File

@@ -0,0 +1,75 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
HTMLPurifier_ConfigSchema::define(
'Attr', 'AllowedRel', array(), 'lookup',
'List of allowed forward document relationships in the rel attribute. '.
'Common values may be nofollow or print. By default, this is empty, '.
'meaning that no document relationships are allowed. This directive '.
'was available since 1.6.0.'
);
HTMLPurifier_ConfigSchema::define(
'Attr', 'AllowedRev', array(), 'lookup',
'List of allowed reverse document relationships in the rev attribute. '.
'This attribute is a bit of an edge-case; if you don\'t know what it '.
'is for, stay away. This directive was available since 1.6.0.'
);
/**
* Validates a rel/rev link attribute against a directive of allowed values
* @note We cannot use Enum because link types allow multiple
* values.
* @note Assumes link types are ASCII text
*/
class HTMLPurifier_AttrDef_HTML_LinkTypes extends HTMLPurifier_AttrDef
{
/** Lookup array of attribute names to configuration name */
var $configLookup = array(
'rel' => 'AllowedRel',
'rev' => 'AllowedRev'
);
/** Name config attribute to pull. */
var $name;
function HTMLPurifier_AttrDef_HTML_LinkTypes($name) {
if (!isset($this->configLookup[$name])) {
trigger_error('Unrecognized attribute name for link '.
'relationship.', E_USER_ERROR);
return;
}
$this->name = $this->configLookup[$name];
}
function validate($string, $config, &$context) {
$allowed = $config->get('Attr', $this->name);
if (empty($allowed)) return false;
$string = $this->parseCDATA($string);
$parts = explode(' ', $string);
// lookup to prevent duplicates
$ret_lookup = array();
foreach ($parts as $part) {
$part = strtolower(trim($part));
if (!isset($allowed[$part])) continue;
$ret_lookup[$part] = true;
}
if (empty($ret_lookup)) return false;
$ret_array = array();
foreach ($ret_lookup as $part => $bool) $ret_array[] = $part;
$string = implode(' ', $ret_array);
return $string;
}
}
?>

View File

@@ -1,7 +1,7 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/AttrDef/Length.php';
require_once 'HTMLPurifier/AttrDef/HTML/Length.php';
/**
* Validates a MultiLength as defined by the HTML spec.
@@ -9,7 +9,7 @@ require_once 'HTMLPurifier/AttrDef/Length.php';
* A multilength is either a integer (pixel count), a percentage, or
* a relative number.
*/
class HTMLPurifier_AttrDef_MultiLength extends HTMLPurifier_AttrDef_Length
class HTMLPurifier_AttrDef_HTML_MultiLength extends HTMLPurifier_AttrDef_HTML_Length
{
function validate($string, $config, &$context) {
@@ -27,12 +27,14 @@ class HTMLPurifier_AttrDef_MultiLength extends HTMLPurifier_AttrDef_Length
$int = substr($string, 0, $length - 1);
if ($int == '') return '*';
if (!is_numeric($int)) return false;
$int = (int) $int;
if ($int < 0) return '0*';
if ($int < 0) return false;
if ($int == 0) return '0';
if ($int == 1) return '*';
return ((string) $int) . '*';
}

View File

@@ -4,9 +4,13 @@ require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/Config.php';
/**
* Validates the contents of the global HTML attribute class.
* Validates contents based on NMTOKENS attribute type.
* @note The only current use for this is the class attribute in HTML
* @note Could have some functionality factored out into Nmtoken class
* @warning We cannot assume this class will be used only for 'class'
* attributes. Not sure how to hook in magic behavior, then.
*/
class HTMLPurifier_AttrDef_Class extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_HTML_Nmtokens extends HTMLPurifier_AttrDef
{
function validate($string, $config, &$context) {
@@ -24,16 +28,17 @@ class HTMLPurifier_AttrDef_Class extends HTMLPurifier_AttrDef
// and plus it would complicate optimization efforts (you never
// see that anyway).
$matches = array();
$pattern = '/(?:(?<=\s)|\A)'.
$pattern = '/(?:(?<=\s)|\A)'. // look behind for space or string start
'((?:--|-?[A-Za-z_])[A-Za-z_\-0-9]*)'.
'(?:(?=\s)|\z)/';
'(?:(?=\s)|\z)/'; // look ahead for space or string end
preg_match_all($pattern, $string, $matches);
if (empty($matches[1])) return false;
// reconstruct string
$new_string = '';
foreach ($matches[1] as $class_names) {
$new_string .= $class_names . ' ';
foreach ($matches[1] as $token) {
$new_string .= $token . ' ';
}
$new_string = rtrim($new_string);

View File

@@ -5,7 +5,7 @@ require_once 'HTMLPurifier/AttrDef.php';
/**
* Validates an integer representation of pixels according to the HTML spec.
*/
class HTMLPurifier_AttrDef_Pixels extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_HTML_Pixels extends HTMLPurifier_AttrDef
{
function validate($string, $config, &$context) {

View File

@@ -1,49 +0,0 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/IDAccumulator.php';
/**
* Validates the HTML attribute ID.
* @warning Even though this is the id processor, it
* will ignore the directive Attr:IDBlacklist, since it will only
* go according to the ID accumulator. Since the accumulator is
* automatically generated, it will have already absorbed the
* blacklist. If you're hacking around, make sure you use load()!
*/
class HTMLPurifier_AttrDef_ID extends HTMLPurifier_AttrDef
{
function validate($id, $config, &$context) {
$id = trim($id); // trim it first
if ($id === '') return false;
if (isset($context->id_accumulator->ids[$id])) return false;
// we purposely avoid using regex, hopefully this is faster
if (ctype_alpha($id)) {
$result = true;
} else {
if (!ctype_alpha(@$id[0])) return false;
$trim = trim( // primitive style of regexps, I suppose
$id,
'A..Za..z0..9:-._'
);
$result = ($trim === '');
}
if ($result) $context->id_accumulator->add($id);
// if no change was made to the ID, return the result
// else, return the new id if stripping whitespace made it
// valid, or return false.
return $result ? $id : false;
}
}
?>

View File

@@ -46,10 +46,10 @@ class HTMLPurifier_AttrDef_Lang extends HTMLPurifier_AttrDef
// process second subtag : $subtags[1]
$length = strlen($subtags[1]);
if ($length == 0 || $length == 1 || $length > 8 || !ctype_alnum($subtags[1])) {
if ($length == 0 || ($length == 1 && $subtags[1] != 'x') || $length > 8 || !ctype_alnum($subtags[1])) {
return $new_string;
}
if (!ctype_lower($subtags[1])) $subtags[1] = strotolower($subtags[1]);
if (!ctype_lower($subtags[1])) $subtags[1] = strtolower($subtags[1]);
$new_string .= '-' . $subtags[1];
if ($num_subtags == 2) return $new_string;
@@ -61,7 +61,7 @@ class HTMLPurifier_AttrDef_Lang extends HTMLPurifier_AttrDef
return $new_string;
}
if (!ctype_lower($subtags[$i])) {
$subtags[$i] = strotolower($subtags[$i]);
$subtags[$i] = strtolower($subtags[$i]);
}
$new_string .= '-' . $subtags[$i];
}

View File

@@ -1,78 +0,0 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
/**
* Validates shorthand CSS property list-style.
* @note This currently does not support list-style-image, as that functionality
* is not implemented yet elsewhere.
*/
class HTMLPurifier_AttrDef_ListStyle extends HTMLPurifier_AttrDef
{
/**
* Local copy of component validators.
* @note See HTMLPurifier_AttrDef_Font::$info for a similar impl.
*/
var $info;
function HTMLPurifier_AttrDef_ListStyle($config) {
$def = $config->getCSSDefinition();
$this->info['list-style-type'] = $def->info['list-style-type'];
$this->info['list-style-position'] = $def->info['list-style-position'];
}
function validate($string, $config, &$context) {
// regular pre-processing
$string = $this->parseCDATA($string);
if ($string === '') return false;
$bits = explode(' ', strtolower($string)); // bits to process
$caught_type = false;
$caught_position = false;
$caught_none = false; // as in keyword none, which is in all of them
$ret = '';
foreach ($bits as $bit) {
if ($caught_none && ($caught_type || $caught_position)) break;
if ($caught_type && $caught_position) break;
if ($bit === '') continue;
if ($bit === 'none') {
if ($caught_none) continue;
$caught_none = true;
$ret .= 'none ';
continue;
}
// if we add anymore, roll it into a loop
$r = $this->info['list-style-type']->validate($bit, $config, $context);
if ($r !== false) {
if ($caught_type) continue;
$caught_type = true;
$ret .= $r . ' ';
continue;
}
$r = $this->info['list-style-position']->validate($bit, $config, $context);
if ($r !== false) {
if ($caught_position) continue;
$caught_position = true;
$ret .= $r . ' ';
continue;
}
}
$ret = rtrim($ret);
return $ret ? $ret : false;
}
}
?>

View File

@@ -3,7 +3,8 @@
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/URIScheme.php';
require_once 'HTMLPurifier/URISchemeRegistry.php';
require_once 'HTMLPurifier/AttrDef/Host.php';
require_once 'HTMLPurifier/AttrDef/URI/Host.php';
require_once 'HTMLPurifier/PercentEncoder.php';
HTMLPurifier_ConfigSchema::define(
'URI', 'DefaultScheme', 'http', 'string',
@@ -11,6 +12,79 @@ HTMLPurifier_ConfigSchema::define(
'select the proper object validator when no scheme information is present.'
);
HTMLPurifier_ConfigSchema::define(
'URI', 'Host', null, 'string/null',
'Defines the domain name of the server, so we can determine whether or '.
'an absolute URI is from your website or not. Not strictly necessary, '.
'as users should be using relative URIs to reference resources on your '.
'website. It will, however, let you use absolute URIs to link to '.
'subdomains of the domain you post here: i.e. example.com will allow '.
'sub.example.com. However, higher up domains will still be excluded: '.
'if you set %URI.Host to sub.example.com, example.com will be blocked. '.
'This directive has been available since 1.2.0.'
);
HTMLPurifier_ConfigSchema::define(
'URI', 'DisableExternal', false, 'bool',
'Disables links to external websites. This is a highly effective '.
'anti-spam and anti-pagerank-leech measure, but comes at a hefty price: no'.
'links or images outside of your domain will be allowed. Non-linkified '.
'URIs will still be preserved. If you want to be able to link to '.
'subdomains or use absolute URIs, specify %URI.Host for your website. '.
'This directive has been available since 1.2.0.'
);
HTMLPurifier_ConfigSchema::define(
'URI', 'DisableExternalResources', false, 'bool',
'Disables the embedding of external resources, preventing users from '.
'embedding things like images from other hosts. This prevents '.
'access tracking (good for email viewers), bandwidth leeching, '.
'cross-site request forging, goatse.cx posting, and '.
'other nasties, but also results in '.
'a loss of end-user functionality (they can\'t directly post a pic '.
'they posted from Flickr anymore). Use it if you don\'t have a '.
'robust user-content moderation team. This directive has been '.
'available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'URI', 'DisableResources', false, 'bool',
'Disables embedding resources, essentially meaning no pictures. You can '.
'still link to them though. See %URI.DisableExternalResources for why '.
'this might be a good idea. This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'URI', 'Munge', null, 'string/null',
'Munges all browsable (usually http, https and ftp) URI\'s into some URL '.
'redirection service. Pass this directive a URI, with %s inserted where '.
'the url-encoded original URI should be inserted (sample: '.
'<code>http://www.google.com/url?q=%s</code>). '.
'This prevents PageRank leaks, while being as transparent as possible '.
'to users (you may also want to add some client side JavaScript to '.
'override the text in the statusbar). Warning: many security experts '.
'believe that this form of protection does not deter spam-bots. '.
'You can also use this directive to redirect users to a splash page '.
'telling them they are leaving your website. '.
'This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'URI', 'HostBlacklist', array(), 'list',
'List of strings that are forbidden in the host of any URI. Use it to '.
'kill domain names of spam, etc. Note that it will catch anything in '.
'the domain, so <tt>moo.com</tt> will catch <tt>moo.com.example.com</tt>. '.
'This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::define(
'URI', 'Disable', false, 'bool',
'Disables all URIs in all forms. Not sure why you\'d want to do that '.
'(after all, the Internet\'s founded on the notion of a hyperlink). '.
'This directive has been available since 1.3.0.'
);
HTMLPurifier_ConfigSchema::defineAlias('Attr', 'DisableURI', 'URI', 'Disable');
/**
* Validates a URI as defined by RFC 3986.
* @note Scheme-specific mechanics deferred to HTMLPurifier_URIScheme
@@ -19,9 +93,16 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
{
var $host;
var $PercentEncoder;
var $embeds_resource;
function HTMLPurifier_AttrDef_URI() {
$this->host = new HTMLPurifier_AttrDef_Host();
/**
* @param $embeds_resource_resource Does the URI here result in an extra HTTP request?
*/
function HTMLPurifier_AttrDef_URI($embeds_resource = false) {
$this->host = new HTMLPurifier_AttrDef_URI_Host();
$this->PercentEncoder = new HTMLPurifier_PercentEncoder();
$this->embeds_resource = (bool) $embeds_resource;
}
function validate($uri, $config, &$context) {
@@ -29,9 +110,14 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
// We'll write stack-based parsers later, for now, use regexps to
// get things working as fast as possible (irony)
if ($config->get('URI', 'Disable')) return false;
// parse as CDATA
$uri = $this->parseCDATA($uri);
// fix up percent-encoding
$uri = $this->PercentEncoder->normalize($uri);
// while it would be nice to use parse_url(), that's specifically
// for HTTP and thus won't work for our generic URI parsing
@@ -63,18 +149,38 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
// no need to validate the scheme's fmt since we do that when we
// retrieve the specific scheme object from the registry
$scheme = ctype_lower($scheme) ? $scheme : strtolower($scheme);
$scheme_obj =& $registry->getScheme($scheme, $config);
$scheme_obj = $registry->getScheme($scheme, $config, $context);
if (!$scheme_obj) return false; // invalid scheme, clean it out
} else {
$scheme_obj =& $registry->getScheme(
$config->get('URI', 'DefaultScheme'), $config
$scheme_obj = $registry->getScheme(
$config->get('URI', 'DefaultScheme'), $config, $context
);
}
// the URI we're processing embeds_resource a resource in the page, but the URI
// it references cannot be located
if ($this->embeds_resource && !$scheme_obj->browsable) {
return false;
}
if ($authority !== null) {
// remove URI if it's absolute and we disabled externals or
// if it's absolute and embedded and we disabled external resources
unset($our_host);
if (
$config->get('URI', 'DisableExternal') ||
(
$config->get('URI', 'DisableExternalResources') &&
$this->embeds_resource
)
) {
$our_host = $config->get('URI', 'Host');
if ($our_host === null) return false;
}
$HEXDIG = '[A-Fa-f0-9]';
$unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with []
$sub_delims = '!$&\'()'; // needs []
@@ -97,6 +203,19 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
$host = $this->host->validate($host, $config, $context);
if ($host === false) $host = null;
if ($this->checkBlacklist($host, $config, $context)) return false;
// more lenient absolute checking
if (isset($our_host)) {
$host_parts = array_reverse(explode('.', $host));
// could be cached
$our_host_parts = array_reverse(explode('.', $our_host));
foreach ($our_host_parts as $i => $discard) {
if (!isset($host_parts[$i])) return false;
if ($host_parts[$i] != $our_host_parts[$i]) return false;
}
}
// userinfo and host are validated within the regexp
} else {
@@ -120,7 +239,7 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
// note that $fragment is omitted
list($userinfo, $host, $port, $path, $query) =
$scheme_obj->validateComponents(
$userinfo, $host, $port, $path, $query, $config
$userinfo, $host, $port, $path, $query, $config, $context
);
@@ -141,10 +260,37 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
if ($query !== null) $result .= "?$query";
if ($fragment !== null) $result .= "#$fragment";
// munge if necessary
$munge = $config->get('URI', 'Munge');
if (!empty($scheme_obj->browsable) && $munge !== null) {
if ($authority !== null) {
$result = str_replace('%s', rawurlencode($result), $munge);
}
}
return $result;
}
/**
* Checks a host against an array blacklist
* @param $host Host to check
* @param $config HTMLPurifier_Config instance
* @param $context HTMLPurifier_Context instance
* @return bool Is spam?
*/
function checkBlacklist($host, &$config, &$context) {
$blacklist = $config->get('URI', 'HostBlacklist');
if (!empty($blacklist)) {
foreach($blacklist as $blacklisted_host_fragment) {
if (strpos($host, $blacklisted_host_fragment) !== false) {
return true;
}
}
}
return false;
}
}
?>

View File

@@ -0,0 +1,17 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
class HTMLPurifier_AttrDef_URI_Email extends HTMLPurifier_AttrDef
{
/**
* Unpacks a mailbox into its display-name and address
*/
function unpack($string) {
// needs to be implemented
}
}
?>

View File

@@ -0,0 +1,23 @@
<?php
require_once 'HTMLPurifier/AttrDef/URI/Email.php';
/**
* Primitive email validation class based on the regexp found at
* http://www.regular-expressions.info/email.html
*/
class HTMLPurifier_AttrDef_URI_Email_SimpleCheck extends HTMLPurifier_AttrDef_URI_Email
{
function validate($string, $config, &$context) {
// no support for named mailboxes i.e. "Bob <bob@example.com>"
// that needs more percent encoding to be done
if ($string == '') return false;
$string = trim($string);
$result = preg_match('/^[A-Z0-9._%-]+@[A-Z0-9.-]+\.[A-Z]{2,4}$/i', $string);
return $result ? $string : false;
}
}
?>

View File

@@ -1,23 +1,28 @@
<?php
require_once 'HTMLPurifier/AttrDef.php';
require_once 'HTMLPurifier/AttrDef/IPv4.php';
require_once 'HTMLPurifier/AttrDef/IPv6.php';
require_once 'HTMLPurifier/AttrDef/URI/IPv4.php';
require_once 'HTMLPurifier/AttrDef/URI/IPv6.php';
/**
* Validates a host according to the IPv4, IPv6 and DNS specifications.
* Validates a host according to the IPv4, IPv6 and DNS (future) specifications.
*/
class HTMLPurifier_AttrDef_Host extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
{
/**
* Instances of HTMLPurifier_AttrDef_IPv4 and HTMLPurifier_AttrDef_IPv6
* Instance of HTMLPurifier_AttrDef_URI_IPv4 sub-validator
*/
var $ipv4, $ipv6;
var $ipv4;
function HTMLPurifier_AttrDef_Host() {
$this->ipv4 = new HTMLPurifier_AttrDef_IPv4();
$this->ipv6 = new HTMLPurifier_AttrDef_IPv6();
/**
* Instance of HTMLPurifier_AttrDef_URI_IPv6 sub-validator
*/
var $ipv6;
function HTMLPurifier_AttrDef_URI_Host() {
$this->ipv4 = new HTMLPurifier_AttrDef_URI_IPv4();
$this->ipv6 = new HTMLPurifier_AttrDef_URI_IPv6();
}
function validate($string, $config, &$context) {
@@ -30,6 +35,8 @@ class HTMLPurifier_AttrDef_Host extends HTMLPurifier_AttrDef
if ($valid === false) return false;
return '['. $valid . ']';
}
// need to do checks on unusual encodings too
$ipv4 = $this->ipv4->validate($string, $config, $context);
if ($ipv4 !== false) return $ipv4;

View File

@@ -6,7 +6,7 @@ require_once 'HTMLPurifier/AttrDef.php';
* Validates an IPv4 address
* @author Feyd @ forums.devnetwork.net (public domain)
*/
class HTMLPurifier_AttrDef_IPv4 extends HTMLPurifier_AttrDef
class HTMLPurifier_AttrDef_URI_IPv4 extends HTMLPurifier_AttrDef
{
/**
@@ -15,7 +15,7 @@ class HTMLPurifier_AttrDef_IPv4 extends HTMLPurifier_AttrDef
*/
var $ip4;
function HTMLPurifier_AttrDef_IPv4() {
function HTMLPurifier_AttrDef_URI_IPv4() {
$oct = '(?:25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])'; // 0-255
$this->ip4 = "(?:{$oct}\\.{$oct}\\.{$oct}\\.{$oct})";
}

View File

@@ -1,6 +1,6 @@
<?php
require_once 'HTMLPurifier/AttrDef/IPv4.php';
require_once 'HTMLPurifier/AttrDef/URI/IPv4.php';
/**
* Validates an IPv6 address.
@@ -8,7 +8,7 @@ require_once 'HTMLPurifier/AttrDef/IPv4.php';
* @note This function requires brackets to have been removed from address
* in URI.
*/
class HTMLPurifier_AttrDef_IPv6 extends HTMLPurifier_AttrDef_IPv4
class HTMLPurifier_AttrDef_URI_IPv6 extends HTMLPurifier_AttrDef_URI_IPv4
{
function validate($aIP, $config, &$context) {

View File

@@ -21,13 +21,38 @@ class HTMLPurifier_AttrTransform
* Abstract: makes changes to the attributes dependent on multiple values.
*
* @param $attr Assoc array of attributes, usually from
* HTMLPurifier_Token_Tag::$attributes
* HTMLPurifier_Token_Tag::$attr
* @param $config Mandatory HTMLPurifier_Config object.
* @param $context Mandatory HTMLPurifier_Context object
* @returns Processed attribute array.
*/
function transform($attr, $config) {
function transform($attr, $config, &$context) {
trigger_error('Cannot call abstract function', E_USER_ERROR);
}
/**
* Prepends CSS properties to the style attribute, creating the
* attribute if it doesn't exist.
* @param $attr Attribute array to process (passed by reference)
* @param $css CSS to prepend
*/
function prependCSS(&$attr, $css) {
$attr['style'] = isset($attr['style']) ? $attr['style'] : '';
$attr['style'] = $css . $attr['style'];
}
/**
* Retrieves and removes an attribute
* @param $attr Attribute array to process (passed by reference)
* @param $key Key of attribute to confiscate
*/
function confiscateAttr(&$attr, $key) {
if (!isset($attr[$key])) return null;
$value = $attr[$key];
unset($attr[$key]);
return $value;
}
}
?>

View File

@@ -20,7 +20,7 @@ HTMLPurifier_ConfigSchema::defineAllowedValues(
class HTMLPurifier_AttrTransform_BdoDir extends HTMLPurifier_AttrTransform
{
function transform($attr, $config) {
function transform($attr, $config, &$context) {
if (isset($attr['dir'])) return $attr;
$attr['dir'] = $config->get('Attr', 'DefaultTextDir');
return $attr;

View File

@@ -0,0 +1,26 @@
<?php
require_once 'HTMLPurifier/AttrTransform.php';
/**
* Pre-transform that changes deprecated bgcolor attribute to CSS.
*/
class HTMLPurifier_AttrTransform_BgColor
extends HTMLPurifier_AttrTransform {
function transform($attr, $config, &$context) {
if (!isset($attr['bgcolor'])) return $attr;
$bgcolor = $this->confiscateAttr($attr, 'bgcolor');
// some validation should happen here
$this->prependCSS($attr, "background-color:$bgcolor;");
return $attr;
}
}
?>

View File

@@ -0,0 +1,39 @@
<?php
require_once 'HTMLPurifier/AttrTransform.php';
/**
* Pre-transform that changes converts a boolean attribute to fixed CSS
*/
class HTMLPurifier_AttrTransform_BoolToCSS
extends HTMLPurifier_AttrTransform {
/**
* Name of boolean attribute that is trigger
*/
var $attr;
/**
* CSS declarations to add to style, needs trailing semicolon
*/
var $css;
/**
* @param $attr string attribute name to convert from
* @param $css string CSS declarations to add to style (needs semicolon)
*/
function HTMLPurifier_AttrTransform_BoolToCSS($attr, $css) {
$this->attr = $attr;
$this->css = $css;
}
function transform($attr, $config, &$context) {
if (!isset($attr[$this->attr])) return $attr;
unset($attr[$this->attr]);
$this->prependCSS($attr, $this->css);
return $attr;
}
}
?>

View File

@@ -0,0 +1,20 @@
<?php
require_once 'HTMLPurifier/AttrTransform.php';
/**
* Pre-transform that changes deprecated border attribute to CSS.
*/
class HTMLPurifier_AttrTransform_Border extends HTMLPurifier_AttrTransform {
function transform($attr, $config, &$context) {
if (!isset($attr['border'])) return $attr;
$border_width = $this->confiscateAttr($attr, 'border');
// some validation should happen here
$this->prependCSS($attr, "border:{$border_width}px solid;");
return $attr;
}
}
?>

View File

@@ -0,0 +1,60 @@
<?php
require_once 'HTMLPurifier/AttrTransform.php';
/**
* Generic pre-transform that converts an attribute with a fixed number of
* values (enumerated) to CSS.
*/
class HTMLPurifier_AttrTransform_EnumToCSS extends HTMLPurifier_AttrTransform {
/**
* Name of attribute to transform from
*/
var $attr;
/**
* Lookup array of attribute values to CSS
*/
var $enumToCSS = array();
/**
* Case sensitivity of the matching
* @warning Currently can only be guaranteed to work with ASCII
* values.
*/
var $caseSensitive = false;
/**
* @param $attr String attribute name to transform from
* @param $enumToCSS Lookup array of attribute values to CSS
* @param $case_sensitive Boolean case sensitivity indicator, default false
*/
function HTMLPurifier_AttrTransform_EnumToCSS($attr, $enum_to_css, $case_sensitive = false) {
$this->attr = $attr;
$this->enumToCSS = $enum_to_css;
$this->caseSensitive = (bool) $case_sensitive;
}
function transform($attr, $config, &$context) {
if (!isset($attr[$this->attr])) return $attr;
$value = trim($attr[$this->attr]);
unset($attr[$this->attr]);
if (!$this->caseSensitive) $value = strtolower($value);
if (!isset($this->enumToCSS[$value])) {
return $attr;
}
$this->prependCSS($attr, $this->enumToCSS[$value]);
return $attr;
}
}
?>

View File

@@ -25,7 +25,7 @@ HTMLPurifier_ConfigSchema::define(
class HTMLPurifier_AttrTransform_ImgRequired extends HTMLPurifier_AttrTransform
{
function transform($attr, $config) {
function transform($attr, $config, &$context) {
$src = true;
if (!isset($attr['src'])) {

View File

@@ -0,0 +1,47 @@
<?php
require_once 'HTMLPurifier/AttrTransform.php';
/**
* Pre-transform that changes deprecated hspace and vspace attributes to CSS
*/
class HTMLPurifier_AttrTransform_ImgSpace
extends HTMLPurifier_AttrTransform {
var $attr;
var $css = array(
'hspace' => array('left', 'right'),
'vspace' => array('top', 'bottom')
);
function HTMLPurifier_AttrTransform_ImgSpace($attr) {
$this->attr = $attr;
if (!isset($this->css[$attr])) {
trigger_error(htmlspecialchars($attr) . ' is not valid space attribute');
}
}
function transform($attr, $config, &$context) {
if (!isset($attr[$this->attr])) return $attr;
$width = $this->confiscateAttr($attr, $this->attr);
// some validation could happen here
if (!isset($this->css[$this->attr])) return $attr;
$style = '';
foreach ($this->css[$this->attr] as $suffix) {
$property = "margin-$suffix";
$style .= "$property:{$width}px;";
}
$this->prependCSS($attr, $style);
return $attr;
}
}
?>

Some files were not shown because too many files have changed in this diff Show More