mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-08-02 20:27:40 +02:00
Compare commits
654 Commits
Author | SHA1 | Date | |
---|---|---|---|
|
7015aaff46 | ||
|
1009bd41a6 | ||
|
511dfe2d4a | ||
|
463aa3a0fa | ||
|
7189ec2790 | ||
|
e901d832ab | ||
|
643ed1bddc | ||
|
41830cd902 | ||
|
261aa1aeaa | ||
|
486b401cf7 | ||
|
f2794e59c5 | ||
|
d702077d2e | ||
|
36bd06d53e | ||
|
13eb016e06 | ||
|
32025a12e1 | ||
|
7dae94c44b | ||
|
54cc691ba7 | ||
|
3af2ff8f98 | ||
|
36fb284d2f | ||
|
8d1f1e8e73 | ||
|
322288e6c0 | ||
|
3c4346cb1e | ||
|
14d934c7ca | ||
|
bb16d8eae5 | ||
|
10530d7f81 | ||
|
c7e172f660 | ||
|
917d2ea5ef | ||
|
895141e0b5 | ||
|
8ab30e24b7 | ||
|
9db891c3aa | ||
|
eb9f9bc7f6 | ||
|
fcebb7731d | ||
|
8d0d0d1a03 | ||
|
80f59206d7 | ||
|
af3f5190dc | ||
|
5620241165 | ||
|
c06727190e | ||
|
1a95852007 | ||
|
c3fab7200e | ||
|
6d7a17e9b6 | ||
|
64b5581bf2 | ||
|
d8da5ff406 | ||
|
fda310f1e7 | ||
|
fc7dbdbd33 | ||
|
02ac821503 | ||
|
16fa73afa0 | ||
|
32a6afa27c | ||
|
587d642826 | ||
|
0bef016271 | ||
|
ef6a1c9274 | ||
|
86b1da9b6f | ||
|
00ea2062d4 | ||
|
cb5d5d0648 | ||
|
77ce3e8b4a | ||
|
e0c0d8eab6 | ||
|
ce46fb618c | ||
|
9f37764614 | ||
|
aaf6ba421c | ||
|
4b862f64e6 | ||
|
be2cfb7918 | ||
|
2f29c27a59 | ||
|
144bd6f07a | ||
|
a95f600e76 | ||
|
84aa2ca390 | ||
|
1f8619cda5 | ||
|
04b1ec33cb | ||
|
6d9643a92e | ||
|
438d973073 | ||
|
f295465ad4 | ||
|
eaabccdd9b | ||
|
893cdd0301 | ||
|
1ba77fedd4 | ||
|
fae720115a | ||
|
c0f2e69c9f | ||
|
ca6b20ff2b | ||
|
c4aa3ee40c | ||
|
e9c7873057 | ||
|
d45f42e6a8 | ||
|
f46aef698e | ||
|
8fdf8c2e44 | ||
|
4fe475c57f | ||
|
d3710518ce | ||
|
e1876c18ad | ||
|
e616f07739 | ||
|
8708c0617a | ||
|
39be09ee14 | ||
|
949f605857 | ||
|
50aa0ea714 | ||
|
59605d592b | ||
|
5dbd455afb | ||
|
64d2da72f8 | ||
|
5489537b4b | ||
|
a4181a80d7 | ||
|
a1ea149105 | ||
|
3b5effcb72 | ||
|
d1ace6af17 | ||
|
a391dfe1de | ||
|
119c70fc05 | ||
|
b997076dfa | ||
|
04a6b1d3f2 | ||
|
27ba8f2192 | ||
|
9f1e678b48 | ||
|
c216968087 | ||
|
d467af6c4b | ||
|
0ee090bc7b | ||
|
6b21a841c4 | ||
|
08bdeb2ac2 | ||
|
9676e8580e | ||
|
dac98cdb06 | ||
|
870a4029ec | ||
|
e78df4dc9f | ||
|
1d25be875d | ||
|
0e51e42197 | ||
|
51cbb72649 | ||
|
9f2f6c3166 | ||
|
48311b3b02 | ||
|
880a5b9dae | ||
|
043099549c | ||
|
0c051df108 | ||
|
7e59923029 | ||
|
6d517fab09 | ||
|
77302f845f | ||
|
82c9a737f4 | ||
|
aedfbd1e93 | ||
|
848795d4a0 | ||
|
b8f00ace1a | ||
|
34ba0e408f | ||
|
56cfcba5d1 | ||
|
ec59062a9d | ||
|
93babf0a88 | ||
|
42d2858c9d | ||
|
c0dd6944a3 | ||
|
e83573a3ad | ||
|
b65942a2c5 | ||
|
e4ab6d584e | ||
|
e21e9b23ad | ||
|
6cdcc8b8e1 | ||
|
ff60e09780 | ||
|
bd64a8346d | ||
|
7480e7b956 | ||
|
b9eb44bf03 | ||
|
c0b5bc3eea | ||
|
14437cbf47 | ||
|
4c798bd17e | ||
|
1b434f0ecc | ||
|
fab5a9b3e1 | ||
|
d8cb360f3b | ||
|
18320d59a4 | ||
|
0d9c05d13c | ||
|
d81bcbd208 | ||
|
fa6b6fe85f | ||
|
8bda0c4dfb | ||
|
d682a59a68 | ||
|
240b565513 | ||
|
c521e3a534 | ||
|
d765628d24 | ||
|
2cc535ad84 | ||
|
30eb982961 | ||
|
a2d044f58d | ||
|
002fe649f7 | ||
|
d3c04de9dc | ||
|
e4ce3362a5 | ||
|
cb793cd9b9 | ||
|
b5f1c76ee8 | ||
|
f2863557f5 | ||
|
6c9c8f2380 | ||
|
fbc595ebed | ||
|
6651941e3f | ||
|
f5a77c523b | ||
|
ed2aa44bd2 | ||
|
21cf2c94d4 | ||
|
a4abc45505 | ||
|
969a027a5b | ||
|
e3fdda1f3c | ||
|
40d3d5b961 | ||
|
a3b6a15595 | ||
|
4c24a51054 | ||
|
f4c4354ae4 | ||
|
51da183f80 | ||
|
212958a9c7 | ||
|
9f851b2139 | ||
|
8c439aa62c | ||
|
5c0a1d467a | ||
|
929d932234 | ||
|
3441421e8b | ||
|
e28d39e46b | ||
|
bf6de96bd0 | ||
|
65d0e1fdfe | ||
|
d228d66785 | ||
|
e9c22df148 | ||
|
de6e024464 | ||
|
66229121bf | ||
|
0eadf98ee2 | ||
|
6eb193a316 | ||
|
8165adb6c8 | ||
|
37b24b6732 | ||
|
35f8b3c801 | ||
|
c7e115c81c | ||
|
4c502b25f2 | ||
|
3d56c1253b | ||
|
d00cb1e64d | ||
|
b6c9dcefd7 | ||
|
14ef0b75e5 | ||
|
3ba42106ba | ||
|
dea117032f | ||
|
b8a46821f3 | ||
|
2135597553 | ||
|
d238b1a9ed | ||
|
c5e1e1711d | ||
|
f2e42d1d3e | ||
|
a74a590f1c | ||
|
3baf1774b2 | ||
|
41e3c091e1 | ||
|
5a6021599a | ||
|
40d7a296b5 | ||
|
fd81fbac82 | ||
|
2598b26778 | ||
|
4fe6661c64 | ||
|
522c8ed7c2 | ||
|
81a4cf6a14 | ||
|
25551c4b78 | ||
|
c9bf2e8489 | ||
|
07ca96a132 | ||
|
43a5ef3cc6 | ||
|
ff72b2d012 | ||
|
5eee08c548 | ||
|
dd8ef4d3f5 | ||
|
c43c0660f5 | ||
|
c85fd83d2b | ||
|
c23b9da2cd | ||
|
aca282104f | ||
|
a8f7cddd49 | ||
|
8a17b1fbc3 | ||
|
57f897661e | ||
|
4e32902c63 | ||
|
02658df8b2 | ||
|
be7c1e7a8f | ||
|
562f53b54c | ||
|
38a59ef5b8 | ||
|
8779b46fc4 | ||
|
a7fab00cdd | ||
|
beefb11879 | ||
|
ae1c8f47cc | ||
|
0f961c6af4 | ||
|
a840c24796 | ||
|
66c0407bef | ||
|
5b3431d889 | ||
|
831f552ec5 | ||
|
54b37674f1 | ||
|
62f3fd894d | ||
|
b5546ff6f0 | ||
|
7ddd9d0afe | ||
|
620ab75906 | ||
|
3ef9bdf8a2 | ||
|
43f01925cd | ||
|
85a23bacb6 | ||
|
4066416160 | ||
|
fad6aa45fa | ||
|
a7e6d85f6d | ||
|
c330860606 | ||
|
0ea53e5a3d | ||
|
68167176dc | ||
|
bb08f679f0 | ||
|
8cd1806ec8 | ||
|
1274cfed49 | ||
|
1ab47ba949 | ||
|
da95ee096a | ||
|
6d7250c309 | ||
|
df55df1083 | ||
|
1a8d864a42 | ||
|
552102f7f2 | ||
|
f5371bbad4 | ||
|
c8b020879d | ||
|
094b20f58f | ||
|
f2df669eec | ||
|
ca43df9fdd | ||
|
5f76796e14 | ||
|
1f9a6ba30e | ||
|
ccca8cc34f | ||
|
28c29656af | ||
|
88f4f57a47 | ||
|
43a98de909 | ||
|
b9d886d53b | ||
|
5b3c8c5534 | ||
|
dd40d41bc3 | ||
|
37a80f1295 | ||
|
fb367dc871 | ||
|
29c3c21b34 | ||
|
e45cc503a2 | ||
|
85cdea0120 | ||
|
c7676afb0d | ||
|
d75c695994 | ||
|
6f6fcbc354 | ||
|
c31d6ec80e | ||
|
cb92a57e4e | ||
|
423afedbf4 | ||
|
7827a95273 | ||
|
9881a34712 | ||
|
a19f30fdcf | ||
|
8f58c7f49e | ||
|
71301b36eb | ||
|
4f0d012dfa | ||
|
24a4dfdf83 | ||
|
f922285383 | ||
|
3af6457801 | ||
|
d51d3c127b | ||
|
4f92c0377f | ||
|
c3efafb07d | ||
|
79c18eb781 | ||
|
7b64bc37e2 | ||
|
b3aa5fa0dc | ||
|
350d8301dd | ||
|
a40e16dd2e | ||
|
ee388e86c0 | ||
|
79df79b2fd | ||
|
f5b72c623c | ||
|
7bccc24977 | ||
|
25fe416ab2 | ||
|
a9012f4387 | ||
|
82f8561123 | ||
|
0b743fb2db | ||
|
08e32597df | ||
|
2b82fbacad | ||
|
710820cbe9 | ||
|
22ef52a7f6 | ||
|
4919187fc6 | ||
|
797b899305 | ||
|
8c9dbe142d | ||
|
2a002857ce | ||
|
9d98b45dea | ||
|
b0f3116b9e | ||
|
b03a44abff | ||
|
cf257cabde | ||
|
ab950a1909 | ||
|
a12ea4bb3b | ||
|
f80de908bd | ||
|
349c4de75b | ||
|
89622c964e | ||
|
732fe5cad7 | ||
|
e7e81c0a5b | ||
|
626b2a13c8 | ||
|
35487c02ae | ||
|
4bc1761b12 | ||
|
63f5414f2e | ||
|
88d014706b | ||
|
f6de73d7e7 | ||
|
733868a76d | ||
|
fab6a212c8 | ||
|
ea1362ce5c | ||
|
cff498ef67 | ||
|
1765a7537a | ||
|
d7157d0ccd | ||
|
ed44b5c5ba | ||
|
5e5c0f3aa4 | ||
|
b2ed0aff01 | ||
|
148681d1b0 | ||
|
2e7e411491 | ||
|
02051e465c | ||
|
a96b5bf612 | ||
|
9dd7c8c7dd | ||
|
0c59db1da3 | ||
|
584a1abd15 | ||
|
a6ede3804e | ||
|
4476745003 | ||
|
45748500ec | ||
|
e99520ab96 | ||
|
1e2abb7f8f | ||
|
362c802191 | ||
|
3a1d505b3d | ||
|
a005da8a4c | ||
|
9a66394abb | ||
|
62c0575468 | ||
|
6a95d91a1a | ||
|
275932ec05 | ||
|
ae90bb919d | ||
|
3c734b4c72 | ||
|
3d02a2a7d4 | ||
|
0bfa42f9b7 | ||
|
7a8edc88f9 | ||
|
98b4e70a93 | ||
|
6f5592ae60 | ||
|
9f996b125a | ||
|
96b571d236 | ||
|
0e9904a9ba | ||
|
e66a98c396 | ||
|
728088f2ba | ||
|
8ae2604440 | ||
|
7b087c7bbe | ||
|
58064592ff | ||
|
b19fc32a5a | ||
|
b15cbbb42a | ||
|
5f0663cad7 | ||
|
75e52a12a6 | ||
|
269268b843 | ||
|
62c6d93b6d | ||
|
31704c92f6 | ||
|
291fa4cb29 | ||
|
389fcc9a5d | ||
|
e5191b3ada | ||
|
5d0a992579 | ||
|
ae83bebc98 | ||
|
9191877740 | ||
|
3066ca357a | ||
|
53fd096641 | ||
|
2166246b7e | ||
|
49bb6ec35d | ||
|
401612dc3a | ||
|
dc0fb7d2b4 | ||
|
eee45fed37 | ||
|
03657ad51a | ||
|
dda4038446 | ||
|
996ccdbdda | ||
|
008348db21 | ||
|
b10a380ff4 | ||
|
bf0d659c47 | ||
|
e55551ecdd | ||
|
e9f3fef47b | ||
|
840f9f7434 | ||
|
10c970760d | ||
|
69996acc9e | ||
|
8bbb73e47d | ||
|
cf7a50163c | ||
|
da2ea348fd | ||
|
ab3ebcba6d | ||
|
d399abba50 | ||
|
0b0a505c30 | ||
|
6aa3dfc116 | ||
|
c3094275ef | ||
|
220c150e0a | ||
|
32d30a9181 | ||
|
0e5491b20c | ||
|
7699efd593 | ||
|
4bf15de536 | ||
|
70bcccf54c | ||
|
bf6ce67fc1 | ||
|
bd44105ca9 | ||
|
d1f43636e5 | ||
|
9c7483166c | ||
|
e840564228 | ||
|
7d4b532d6b | ||
|
58f00105c8 | ||
|
8d15d1ce13 | ||
|
9c60eeed04 | ||
|
2e089477a5 | ||
|
b442d09ea6 | ||
|
12f73605a3 | ||
|
e2a951420f | ||
|
002395de09 | ||
|
d1187ed331 | ||
|
426fbd1f97 | ||
|
9c5f01a0cf | ||
|
f985d3cd96 | ||
|
0cb1d85822 | ||
|
073ddb0cb2 | ||
|
889ccb1a92 | ||
|
aec84dc3f6 | ||
|
dea62ffdab | ||
|
8913239b7f | ||
|
e06929c218 | ||
|
aaf4839c34 | ||
|
c113f43440 | ||
|
bd8ecdd268 | ||
|
ef51f8681a | ||
|
ee61ffc0d9 | ||
|
f758f7c534 | ||
|
95499e34da | ||
|
de23201cbb | ||
|
21ab12a6a8 | ||
|
69666e977f | ||
|
fa05319e30 | ||
|
ea46d79b0a | ||
|
a62f8971e4 | ||
|
7a3e06d4d0 | ||
|
e180b7689e | ||
|
7579932948 | ||
|
818d0d7a23 | ||
|
797d3e0393 | ||
|
ff7eec7424 | ||
|
0ea04db559 | ||
|
831db14c79 | ||
|
a470fc5621 | ||
|
2945f6a930 | ||
|
71326abec1 | ||
|
23ef535043 | ||
|
fda2043ace | ||
|
3f06d8316c | ||
|
e4b621eec2 | ||
|
9728be4a52 | ||
|
f1ec05afd0 | ||
|
7481d349d3 | ||
|
086dc9177b | ||
|
4d38c02932 | ||
|
83a50465dc | ||
|
dd62a303eb | ||
|
e4e981b6f1 | ||
|
a846f4e70b | ||
|
a5136b65e4 | ||
|
2d035483dd | ||
|
831a09d455 | ||
|
2cbb3be602 | ||
|
f7eccc0038 | ||
|
65252d6fbd | ||
|
6b9c5ec603 | ||
|
e7b15068c2 | ||
|
53c19552d2 | ||
|
048242004e | ||
|
05e1aca2fa | ||
|
23feb457f2 | ||
|
8f6380d63a | ||
|
3b1c40b2fc | ||
|
da92cb9ff4 | ||
|
bda9167423 | ||
|
cb9c96a2b0 | ||
|
e0cf214c44 | ||
|
ed73fdd5b8 | ||
|
eaea42f827 | ||
|
7f39e1e2c3 | ||
|
b81fb0af90 | ||
|
47fe34ad81 | ||
|
ac50d333a5 | ||
|
ce013e2962 | ||
|
67fab710bf | ||
|
b3a599e8c2 | ||
|
f4e4c1556d | ||
|
c5e33416d3 | ||
|
6c08ca4c16 | ||
|
b1822bb04f | ||
|
893e962890 | ||
|
bd6071cb3b | ||
|
92ea74cba2 | ||
|
a01459c87a | ||
|
fd35c43643 | ||
|
0426985c81 | ||
|
bbea02f55c | ||
|
4e77a1adbd | ||
|
bd58a7ba77 | ||
|
a3ed9196b9 | ||
|
2646f5ea57 | ||
|
424c7ad2e3 | ||
|
234b3085d7 | ||
|
3d978c961d | ||
|
72254cd77a | ||
|
d8a6361244 | ||
|
968dfa2feb | ||
|
114d6841ab | ||
|
1c68d769b5 | ||
|
ac0ca3f15c | ||
|
2d5498b8aa | ||
|
71ccae1a3a | ||
|
cb186dddc4 | ||
|
2ceccc0969 | ||
|
93aa98ad01 | ||
|
c0b38bab85 | ||
|
d6c4473a12 | ||
|
fc06f221d5 | ||
|
ac3ab2a556 | ||
|
2c330cac73 | ||
|
a0d6543b84 | ||
|
e223490a78 | ||
|
2666f067cc | ||
|
826a57a04a | ||
|
e08b5aaa70 | ||
|
b15e8c344e | ||
|
2c9e041b4c | ||
|
e2c3394d70 | ||
|
1532fe703a | ||
|
058f1eba7d | ||
|
1102dc6e27 | ||
|
85374d330f | ||
|
a16d6c4342 | ||
|
9b5e2978ad | ||
|
06468a4157 | ||
|
0167f8aa84 | ||
|
f1a90e684b | ||
|
14d98413fd | ||
|
97a4ec7598 | ||
|
71ed725c5c | ||
|
d4bf41288a | ||
|
365bd78c20 | ||
|
52fa958fb2 | ||
|
17d32bac7f | ||
|
e2babe5308 | ||
|
5f1a6b883f | ||
|
c5e3796202 | ||
|
72f1984229 | ||
|
918081b372 | ||
|
6c56dd070f | ||
|
299f93f8f0 | ||
|
4169846c57 | ||
|
aff4957531 | ||
|
e4bdf472a6 | ||
|
9a99750474 | ||
|
7eb751b5f5 | ||
|
0d0173eb6e | ||
|
556ed4ea90 | ||
|
cf445a6107 | ||
|
243ad45e59 | ||
|
31d0c621f5 | ||
|
0870974a25 | ||
|
5c4a0a6785 | ||
|
e55babdc53 | ||
|
6e1b540d99 | ||
|
edf20018f0 | ||
|
c09432e171 | ||
|
9c031b5c1e | ||
|
a827cbc3ba | ||
|
c05eebee15 | ||
|
93a69d020a | ||
|
f3fa9c01ba | ||
|
bae5b0c022 | ||
|
67befbc8a8 | ||
|
cac22f01cf | ||
|
94d2dbaa74 | ||
|
6add828bc8 | ||
|
800b67ed65 | ||
|
71e4ddd222 | ||
|
54a68a1713 | ||
|
bd544ad038 | ||
|
d5491da77f | ||
|
591fc0ae28 | ||
|
dac7ac1eae | ||
|
64ee756b7a | ||
|
e2103ce0f2 | ||
|
219902ebff | ||
|
21116373a7 | ||
|
5ed88809f3 | ||
|
bb8b38b1e0 | ||
|
236159242f | ||
|
9d8f839bf2 | ||
|
882148f9ad | ||
|
a863f62489 | ||
|
6478c7c2df | ||
|
129a4ea506 | ||
|
a122243a89 | ||
|
315c55eeb1 | ||
|
cfe50ff8ae | ||
|
d0018a2696 | ||
|
77d9e05a07 | ||
|
80243f377c | ||
|
43b157cf4d | ||
|
f6b50d4bfd | ||
|
806901cfd2 | ||
|
f90eef7f1f | ||
|
06867e14b6 | ||
|
bda2615b30 | ||
|
e1a5d10e75 | ||
|
98fd6b7d82 | ||
|
be264a4b20 | ||
|
01c85b71d2 | ||
|
2d22c0aa55 | ||
|
6e061f5184 | ||
|
44b988f1f6 | ||
|
0ead9558b4 | ||
|
159a1cced1 |
332
INSTALL
332
INSTALL
@@ -2,61 +2,57 @@
|
||||
Install
|
||||
How to install HTML Purifier
|
||||
|
||||
HTML Purifier is designed to run out of the box, so actually using the library
|
||||
is extremely easy. (Although, if you were looking for a step-by-step
|
||||
installation GUI, you've come to the wrong place!) The impatient can scroll
|
||||
down to the bottom of this INSTALL document to see the code, but you really
|
||||
should make sure a few things are properly done.
|
||||
HTML Purifier is designed to run out of the box, so actually using the
|
||||
library is extremely easy. (Although... if you were looking for a
|
||||
step-by-step installation GUI, you've downloaded the wrong software!)
|
||||
|
||||
Todo: Convert to using the array syntax for configuration.
|
||||
While the impatient can get going immediately with some of the sample
|
||||
code at the bottom of this library, it's well worth reading this entire
|
||||
document--most of the other documentation assumes that you are familiar
|
||||
with these contents.
|
||||
|
||||
|
||||
---------------------------------------------------------------------------
|
||||
1. Compatibility
|
||||
|
||||
HTML Purifier works in both PHP 4 and PHP 5, from PHP 4.3.9 and up. It has no
|
||||
core dependencies with other libraries. (Whoopee!)
|
||||
HTML Purifier is PHP 5 only, and is actively tested from PHP 5.0.5 and
|
||||
up. It has no core dependencies with other libraries. PHP
|
||||
4 support was deprecated on December 31, 2007 with HTML Purifier 3.0.0.
|
||||
Essential security fixes will be issued for the 2.1.x branch until
|
||||
August 8, 2008.
|
||||
|
||||
Optional extensions are iconv (usually installed) and tidy (also common).
|
||||
If you use UTF-8 and don't plan on pretty-printing HTML, you can get away with
|
||||
not having either of these extensions.
|
||||
These optional extensions can enhance the capabilities of HTML Purifier:
|
||||
|
||||
* iconv : Converts text to and from non-UTF-8 encodings
|
||||
* bcmath : Used for unit conversion and imagecrash protection
|
||||
* tidy : Used for pretty-printing HTML
|
||||
|
||||
|
||||
---------------------------------------------------------------------------
|
||||
2. Reconnaissance
|
||||
|
||||
2. Including the library
|
||||
A big plus of HTML Purifier is its inerrant support of standards, so
|
||||
your web-pages should be standards-compliant. (They should also use
|
||||
semantic markup, but that's another issue altogether, one HTML Purifier
|
||||
cannot fix without reading your mind.)
|
||||
|
||||
Simply use:
|
||||
|
||||
require_once '/path/to/library/HTMLPurifier.auto.php';
|
||||
|
||||
...and you're good to go. Since HTML Purifier's codebase is fairly
|
||||
large, I recommend only including HTML Purifier when you need it.
|
||||
|
||||
If you don't like your include_path to be fiddled around with, simply set
|
||||
HTML Purifier's library/ directory to the include path yourself and then:
|
||||
|
||||
require_once 'HTMLPurifier.php';
|
||||
|
||||
Only the contents in the library/ folder are necessary, so you can remove
|
||||
everything else when using HTML Purifier in a production environment.
|
||||
|
||||
|
||||
|
||||
3. Preparing the proper output environment
|
||||
|
||||
HTML Purifier is all about web-standards, so accordingly your webpages should
|
||||
be standards compliant. HTML Purifier can deal with these doctypes:
|
||||
HTML Purifier can process these doctypes:
|
||||
|
||||
* XHTML 1.0 Transitional (default)
|
||||
* XHTML 1.0 Strict
|
||||
* HTML 4.01 Transitional
|
||||
* HTML 4.01 Strict
|
||||
* XHTML 1.1
|
||||
|
||||
...and these character encodings:
|
||||
|
||||
* UTF-8 (default)
|
||||
* Any encoding iconv supports (support is crippled for i18n though)
|
||||
* Any encoding iconv supports (with crippled internationalization support)
|
||||
|
||||
The defaults are there for a reason: they are best-practice choices that
|
||||
should not be changed lightly. For those of you in the dark, you can determine
|
||||
the doctype from this code in your HTML documents:
|
||||
These defaults reflect what my choices would be if I were authoring an
|
||||
HTML document, however, what you choose depends on the nature of your
|
||||
codebase. If you don't know what doctype you are using, you can determine
|
||||
the doctype from this identifier at the top of your source code:
|
||||
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
@@ -65,35 +61,167 @@ the doctype from this code in your HTML documents:
|
||||
|
||||
<meta http-equiv="Content-type" content="text/html;charset=ENCODING">
|
||||
|
||||
For legacy codebases these declarations may be missing. If that is the case,
|
||||
STOP, and read up on character encodings and doctypes (in that order). Here
|
||||
are some links:
|
||||
|
||||
* http://www.joelonsoftware.com/articles/Unicode.html
|
||||
* http://alistapart.com/stories/doctype/
|
||||
|
||||
You may currently be vulnerable to XSS and other security threats, and HTML
|
||||
Purifier won't be able to fix that.
|
||||
If the character encoding declaration is missing, STOP NOW, and
|
||||
read 'docs/enduser-utf8.html' (web accessible at
|
||||
http://htmlpurifier.org/docs/enduser-utf8.html). In fact, even if it is
|
||||
present, read this document anyway, as many websites specify their
|
||||
document's character encoding incorrectly.
|
||||
|
||||
|
||||
---------------------------------------------------------------------------
|
||||
3. Including the library
|
||||
|
||||
The procedure is quite simple:
|
||||
|
||||
require_once '/path/to/library/HTMLPurifier.auto.php';
|
||||
|
||||
This will setup an autoloader, so the library's files are only included
|
||||
when you use them.
|
||||
|
||||
Only the contents in the library/ folder are necessary, so you can remove
|
||||
everything else when using HTML Purifier in a production environment.
|
||||
|
||||
If you installed HTML Purifier via PEAR, all you need to do is:
|
||||
|
||||
require_once 'HTMLPurifier.auto.php';
|
||||
|
||||
Please note that the usual PEAR practice of including just the classes you
|
||||
want will not work with HTML Purifier's autoloading scheme.
|
||||
|
||||
Advanced users, read on; other users can skip to section 4.
|
||||
|
||||
Autoload compatibility
|
||||
----------------------
|
||||
|
||||
HTML Purifier attempts to be as smart as possible when registering an
|
||||
autoloader, but there are some cases where you will need to change
|
||||
your own code to accomodate HTML Purifier. These are those cases:
|
||||
|
||||
PHP VERSION IS LESS THAN 5.1.2, AND YOU'VE DEFINED __autoload
|
||||
Because spl_autoload_register() doesn't exist in early versions
|
||||
of PHP 5, HTML Purifier has no way of adding itself to the autoload
|
||||
stack. Modify your __autoload function to test
|
||||
HTMLPurifier_Bootstrap::autoload($class)
|
||||
|
||||
For example, suppose your autoload function looks like this:
|
||||
|
||||
function __autoload($class) {
|
||||
require str_replace('_', '/', $class) . '.php';
|
||||
return true;
|
||||
}
|
||||
|
||||
A modified version with HTML Purifier would look like this:
|
||||
|
||||
function __autoload($class) {
|
||||
if (HTMLPurifier_Bootstrap::autoload($class)) return true;
|
||||
require str_replace('_', '/', $class) . '.php';
|
||||
return true;
|
||||
}
|
||||
|
||||
Note that there *is* some custom behavior in our autoloader; the
|
||||
original autoloader in our example would work for 99% of the time,
|
||||
but would fail when including language files.
|
||||
|
||||
AN __autoload FUNCTION IS DECLARED AFTER OUR AUTOLOADER IS REGISTERED
|
||||
spl_autoload_register() has the curious behavior of disabling
|
||||
the existing __autoload() handler. Users need to explicitly
|
||||
spl_autoload_register('__autoload'). Because we use SPL when it
|
||||
is available, __autoload() will ALWAYS be disabled. If __autoload()
|
||||
is declared before HTML Purifier is loaded, this is not a problem:
|
||||
HTML Purifier will register the function for you. But if it is
|
||||
declared afterwards, it will mysteriously not work. This
|
||||
snippet of code (after your autoloader is defined) will fix it:
|
||||
|
||||
spl_autoload_register('__autoload')
|
||||
|
||||
Users should also be on guard if they use a version of PHP previous
|
||||
to 5.1.2 without an autoloader--HTML Purifier will define __autoload()
|
||||
for you, which can collide with an autoloader that was added by *you*
|
||||
later.
|
||||
|
||||
|
||||
For better performance
|
||||
----------------------
|
||||
|
||||
Opcode caches, which greatly speed up PHP initialization for scripts
|
||||
with large amounts of code (HTML Purifier included), don't like
|
||||
autoloaders. We offer an include file that includes all of HTML Purifier's
|
||||
files in one go in an opcode cache friendly manner:
|
||||
|
||||
// If /path/to/library isn't already in your include path, uncomment
|
||||
// the below line:
|
||||
// require '/path/to/library/HTMLPurifier.path.php';
|
||||
|
||||
require 'HTMLPurifier.includes.php';
|
||||
|
||||
Optional components still need to be included--you'll know if you try to
|
||||
use a feature and you get a class doesn't exists error! The autoloader
|
||||
can be used in conjunction with this approach to catch classes that are
|
||||
missing. Simply add this afterwards:
|
||||
|
||||
require 'HTMLPurifier.autoload.php';
|
||||
|
||||
Standalone version
|
||||
------------------
|
||||
|
||||
HTML Purifier has a standalone distribution; you can also generate
|
||||
a standalone file from the full version by running the script
|
||||
maintenance/generate-standalone.php . The standalone version has the
|
||||
benefit of having most of its code in one file, so parsing is much
|
||||
faster and the library is easier to manage.
|
||||
|
||||
If HTMLPurifier.standalone.php exists in the library directory, you
|
||||
can use it like this:
|
||||
|
||||
require '/path/to/HTMLPurifier.standalone.php';
|
||||
|
||||
This is equivalent to including HTMLPurifier.includes.php, except that
|
||||
the contents of standalone/ will be added to your path. To override this
|
||||
behavior, specify a new HTMLPURIFIER_PREFIX where standalone files can
|
||||
be found (usually, this will be one directory up, the "true" library
|
||||
directory in full distributions). Don't forget to set your path too!
|
||||
|
||||
The autoloader can be added to the end to ensure the classes are
|
||||
loaded when necessary; otherwise you can manually include them.
|
||||
To use the autoloader, use this:
|
||||
|
||||
require 'HTMLPurifier.autoload.php';
|
||||
|
||||
For advanced users
|
||||
------------------
|
||||
|
||||
HTMLPurifier.auto.php performs a number of operations that can be done
|
||||
individually. These are:
|
||||
|
||||
HTMLPurifier.path.php
|
||||
Puts /path/to/library in the include path. For high performance,
|
||||
this should be done in php.ini.
|
||||
|
||||
HTMLPurifier.autoload.php
|
||||
Registers our autoload handler HTMLPurifier_Bootstrap::autoload($class).
|
||||
|
||||
You can do these operations by yourself--in fact, you must modify your own
|
||||
autoload handler if you are using a version of PHP earlier than PHP 5.1.2
|
||||
(See "Autoload compatibility" above).
|
||||
|
||||
|
||||
---------------------------------------------------------------------------
|
||||
4. Configuration
|
||||
|
||||
HTML Purifier is designed to run out-of-the-box, but occasionally HTML
|
||||
Purifier needs to be told what to do. If you answered no to any of these
|
||||
questions, read on, otherwise, you can skip to the next section (or, if you're
|
||||
Purifier needs to be told what to do. If you answer no to any of these
|
||||
questions, read on; otherwise, you can skip to the next section (or, if you're
|
||||
into configuring things just for the heck of it, skip to 4.3).
|
||||
|
||||
* Am I using UTF-8?
|
||||
* Am I using XHTML 1.0 Transitional?
|
||||
|
||||
If you answered yes to any of these questions, instantiate a configuration
|
||||
If you answered no to any of these questions, instantiate a configuration
|
||||
object and read on:
|
||||
|
||||
$config = HTMLPurifier_Config::createDefault();
|
||||
|
||||
|
||||
|
||||
4.1. Setting a different character encoding
|
||||
|
||||
You really shouldn't use any other encoding except UTF-8, especially if you
|
||||
@@ -114,40 +242,93 @@ websites):
|
||||
|
||||
Note that HTML Purifier's support for non-Unicode encodings is crippled by the
|
||||
fact that any character not supported by that encoding will be silently
|
||||
dropped, EVEN if it is ampersand escaped. This is a current limitation of
|
||||
HTML Purifier that we are NOT actively working to fix. Patches are welcome,
|
||||
but there are so many other gotchas and problems in I18N for non-Unicode
|
||||
encodings that this functionality is low priority. See
|
||||
<http://ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html> for a more
|
||||
detailed lowdown on the topic.
|
||||
|
||||
dropped, EVEN if it is ampersand escaped. If you want to work around
|
||||
this, you are welcome to read docs/enduser-utf8.html for a fix,
|
||||
but please be cognizant of the issues the "solution" creates (for this
|
||||
reason, I do not include the solution in this document).
|
||||
|
||||
|
||||
4.2. Setting a different doctype
|
||||
|
||||
For those of you stuck using HTML 4.01 Transitional, you can disable
|
||||
For those of you using HTML 4.01 Transitional, you can disable
|
||||
XHTML output like this:
|
||||
|
||||
$config->set('Core', 'XHTML', false);
|
||||
$config->set('HTML', 'Doctype', 'HTML 4.01 Transitional');
|
||||
|
||||
I recommend that you use XHTML, although not as much as I recommend UTF-8. If
|
||||
your HTML 4.01 page validates, good for you!
|
||||
|
||||
Currently, we can only guarantee transitional-complaint output, future
|
||||
versions will also allow strict-compliant output.
|
||||
Other supported doctypes include:
|
||||
|
||||
* HTML 4.01 Strict
|
||||
* HTML 4.01 Transitional
|
||||
* XHTML 1.0 Strict
|
||||
* XHTML 1.0 Transitional
|
||||
* XHTML 1.1
|
||||
|
||||
|
||||
4.3. Other settings
|
||||
|
||||
There are more configuration directives which can be read about
|
||||
here: <http://hp.jpsband.org/live/configdoc/plain.html> They're a bit boring,
|
||||
here: <http://htmlpurifier.org/live/configdoc/plain.html> They're a bit boring,
|
||||
but they can help out for those of you who like to exert maximum control over
|
||||
your code.
|
||||
your code. Some of the more interesting ones are configurable at the
|
||||
demo <http://htmlpurifier.org/demo.php> and are well worth looking into
|
||||
for your own system.
|
||||
|
||||
For example, you can fine tune allowed elements and attributes, convert
|
||||
relative URLs to absolute ones, and even autoparagraph input text! These
|
||||
are, respectively, %HTML.Allowed, %URI.MakeAbsolute and %URI.Base, and
|
||||
%AutoFormat.AutoParagraph. The %Namespace.Directive naming convention
|
||||
translates to:
|
||||
|
||||
$config->set('Namespace', 'Directive', $value);
|
||||
|
||||
E.g.
|
||||
|
||||
$config->set('HTML', 'Allowed', 'p,b,a[href],i');
|
||||
$config->set('URI', 'Base', 'http://www.example.com');
|
||||
$config->set('URI', 'MakeAbsolute', true);
|
||||
$config->set('AutoFormat', 'AutoParagraph', true);
|
||||
|
||||
|
||||
---------------------------------------------------------------------------
|
||||
5. Caching
|
||||
|
||||
5. Using the code
|
||||
HTML Purifier generates some cache files (generally one or two) to speed up
|
||||
its execution. For maximum performance, make sure that
|
||||
library/HTMLPurifier/DefinitionCache/Serializer is writeable by the webserver.
|
||||
|
||||
If you are in the library/ folder of HTML Purifier, you can set the
|
||||
appropriate permissions using:
|
||||
|
||||
chmod -R 0755 HTMLPurifier/DefinitionCache/Serializer
|
||||
|
||||
If the above command doesn't work, you may need to assign write permissions
|
||||
to all. This may be necessary if your webserver runs as nobody, but is
|
||||
not recommended since it means any other user can write files in the
|
||||
directory. Use:
|
||||
|
||||
chmod -R 0777 HTMLPurifier/DefinitionCache/Serializer
|
||||
|
||||
You can also chmod files via your FTP client; this option
|
||||
is usually accessible by right clicking the corresponding directory and
|
||||
then selecting "chmod" or "file permissions".
|
||||
|
||||
Starting with 2.0.1, HTML Purifier will generate friendly error messages
|
||||
that will tell you exactly what you have to chmod the directory to, if in doubt,
|
||||
follow its advice.
|
||||
|
||||
If you are unable or unwilling to give write permissions to the cache
|
||||
directory, you can either disable the cache (and suffer a performance
|
||||
hit):
|
||||
|
||||
$config->set('Core', 'DefinitionCache', null);
|
||||
|
||||
Or move the cache directory somewhere else (no trailing slash):
|
||||
|
||||
$config->set('Cache', 'SerializerPath', '/home/user/absolute/path');
|
||||
|
||||
|
||||
---------------------------------------------------------------------------
|
||||
6. Using the code
|
||||
|
||||
The interface is mind-numbingly simple:
|
||||
|
||||
@@ -160,13 +341,15 @@ The interface is mind-numbingly simple:
|
||||
$clean_html = $purifier->purify( $dirty_html );
|
||||
|
||||
That's it! For more examples, check out docs/examples/ (they aren't very
|
||||
different though). Also, SLOW gives advice on what to do if HTML Purifier
|
||||
is slowing down your application.
|
||||
different though). Also, docs/enduser-slow.html gives advice on what to
|
||||
do if HTML Purifier is slowing down your application.
|
||||
|
||||
|
||||
---------------------------------------------------------------------------
|
||||
7. Quick install
|
||||
|
||||
6. Quick install
|
||||
|
||||
First, make sure library/HTMLPurifier/DefinitionCache/Serializer is
|
||||
writable by the webserver (see Section 5: Caching above for details).
|
||||
If your website is in UTF-8 and XHTML Transitional, use this code:
|
||||
|
||||
<?php
|
||||
@@ -182,9 +365,10 @@ If your website is in a different encoding or doctype, use this code:
|
||||
require_once '/path/to/htmlpurifier/library/HTMLPurifier.auto.php';
|
||||
|
||||
$config = HTMLPurifier_Config::createDefault();
|
||||
$config->set('Core', 'Encoding', 'ISO-8859-1'); //replace with your encoding
|
||||
$config->set('Core', 'XHTML', true); //replace with false if HTML 4.01
|
||||
$config->set('Core', 'Encoding', 'ISO-8859-1'); // replace with your encoding
|
||||
$config->set('HTML', 'Doctype', 'HTML 4.01 Transitional'); // replace with your doctype
|
||||
$purifier = new HTMLPurifier($config);
|
||||
|
||||
$clean_html = $purifier->purify($dirty_html);
|
||||
?>
|
||||
?>
|
||||
|
||||
|
67
INSTALL.fr.utf8
Normal file
67
INSTALL.fr.utf8
Normal file
@@ -0,0 +1,67 @@
|
||||
|
||||
Installation
|
||||
Comment installer HTML Purifier
|
||||
|
||||
Attention: Ce document a encode en UTF-8. Si les lettres avec les accents
|
||||
est essoreuse, prenez un mieux editeur de texte.
|
||||
|
||||
À L'Aide: Je ne suis pas un diseur natif de français. Si vous trouvez une
|
||||
erreur dans ce document, racontez-moi! Merci.
|
||||
|
||||
|
||||
L'installation de HTML Purifier est trés simple, parce qu'il ne doit pas
|
||||
la configuration. Dans le pied de de document, les utilisateurs
|
||||
impatient peuvent trouver le code, mais je recommande que vous lisez
|
||||
ce document pour quelques choses.
|
||||
|
||||
|
||||
1. Compatibilité
|
||||
|
||||
HTML Purifier fonctionne dans PHP 5. PHP 5.0.5 est le dernier
|
||||
version que je le testais. Il ne dépend de les autre librairies.
|
||||
|
||||
Les extensions optionnel est iconv (en général déjà installer) et
|
||||
tidy (répandu aussi). Si vous utilisez UTF-8 et ne voulez pas
|
||||
l'indentation, vous pouvez utiliser HTML Purifier sans ces extensions.
|
||||
|
||||
|
||||
2. Inclure la librarie
|
||||
|
||||
Utilisez:
|
||||
|
||||
require_once '/path/to/library/HTMLPurifier.auto.php';
|
||||
|
||||
...quand vous devez utiliser HTML Purifier (ne inclure pas quand vous
|
||||
ne devez pas, parce que HTML Purifier est trés grand.)
|
||||
|
||||
HTML Purifier utilise 'autoload'. Si vous avez définu la fonction
|
||||
__autoload, vous doivez ajoute cet programme:
|
||||
|
||||
spl_autoload_register('__autoload')
|
||||
|
||||
Plus d'information est dans le document 'INSTALL'.
|
||||
|
||||
|
||||
3. Installation vite
|
||||
|
||||
Si votre site web est en UTF-8 et XHTML Transitional, utilisez:
|
||||
|
||||
<?php
|
||||
require_once '/path/to/htmlpurifier/library/HTMLPurifier.auto.php';
|
||||
|
||||
$purificateur = new HTMLPurifier();
|
||||
$html_propre = $purificateur->purify($html_salle);
|
||||
?>
|
||||
|
||||
Sinon, utilisez:
|
||||
|
||||
<?php
|
||||
require_once '/path/to/htmlpurifier/library/HTMLPurifier.auto.php';
|
||||
|
||||
$config = HTMLPurifier_Config::createDefault();
|
||||
$config->set('Core', 'Encoding', 'ISO-8859-1'); //remplacez avec votre encoding
|
||||
$config->set('Core', 'XHTML', true); //remplacez avec false si HTML 4.01
|
||||
$purificateur = new HTMLPurifier($config);
|
||||
|
||||
$html_propre = $purificateur->purify($html_salle);
|
||||
?>
|
568
NEWS
568
NEWS
@@ -9,6 +9,574 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
|
||||
. Internal change
|
||||
==========================
|
||||
|
||||
3.1.1, released 2008-06-19
|
||||
# %URI.Munge now, by default, does not munge resources (for example, <img src="">)
|
||||
In order to enable this again, please set %URI.MungeResources to true.
|
||||
! More robust imagecrash protection with height/width CSS with %CSS.MaxImgLength,
|
||||
and height/width HTML with %HTML.MaxImgLength.
|
||||
! %URI.MungeSecretKey for secure URI munging. Thanks Chris
|
||||
for sponsoring this feature. Check out the corresponding documentation
|
||||
for details. (Att Nightly testers: The API for this feature changed before
|
||||
the general release. Namely, rename your directives %URI.SecureMungeSecretKey =>
|
||||
%URI.MungeSecretKey and and %URI.SecureMunge => %URI.Munge)
|
||||
! Implemented post URI filtering. Set member variable $post to true to set
|
||||
a URIFilter as such.
|
||||
! Allow modules to define injectors via $info_injector. Injectors are
|
||||
automatically disabled if injector's needed elements are not found.
|
||||
! Support for "safe" objects added, use %HTML.SafeObject and %HTML.SafeEmbed.
|
||||
Thanks Chris for sponsoring. If you've been using ad hoc code from the
|
||||
forums, PLEASE use this instead.
|
||||
! Added substitutions for %e, %n, %a and %p in %URI.Munge (in order,
|
||||
embedded, tag name, attribute name, CSS property name). See %URI.Munge
|
||||
for more details. Requested by Jochem Blok.
|
||||
- Disable percent height/width attributes for img.
|
||||
- AttrValidator operations are now atomic; updates to attributes are not
|
||||
manifest in token until end of operations. This prevents naughty internal
|
||||
code from directly modifying CurrentToken when they're not supposed to.
|
||||
This semantics change was requested by frank farmer.
|
||||
- Percent encoding checks enabled for URI query and fragment
|
||||
- Fix stray backslashes in font-family; CSS Unicode character escapes are
|
||||
now properly resolved (although *only* in font-family). Thanks Takeshi Terada
|
||||
for reporting.
|
||||
- Improve parseCDATA algorithm to take into account newline normalization
|
||||
- Account for browser confusion between Yen character and backslash in
|
||||
Shift_JIS encoding. This fix generalizes to any other encoding which is not
|
||||
a strict superset of printable ASCII. Thanks Takeshi Terada for reporting.
|
||||
- Fix missing configuration parameter in Generator calls. Thanks vs for the
|
||||
partial patch.
|
||||
- Improved adherence to Unicode by checking for non-character codepoints.
|
||||
Thanks Geoffrey Sneddon for reporting. This may result in degraded
|
||||
performance for extremely large inputs.
|
||||
- Allow CSS property-value pair ''text-decoration: none''. Thanks Jochem Blok
|
||||
for reporting.
|
||||
. Added HTMLPurifier_UnitConverter and HTMLPurifier_Length for convenient
|
||||
handling of CSS-style lengths. HTMLPurifier_AttrDef_CSS_Length now uses
|
||||
this class.
|
||||
. API of HTMLPurifier_AttrDef_CSS_Length changed from __construct($disable_negative)
|
||||
to __construct($min, $max). __construct(true) is equivalent to
|
||||
__construct('0').
|
||||
. Added HTMLPurifier_AttrDef_Switch class
|
||||
. Rename HTMLPurifier_HTMLModule_Tidy->construct() to setup() and bubble method
|
||||
up inheritance hierarchy to HTMLPurifier_HTMLModule. All HTMLModules
|
||||
get this called with the configuration object. All modules now
|
||||
use this rather than __construct(), although legacy code using constructors
|
||||
will still work--the new format, however, lets modules access the
|
||||
configuration object for HTML namespace dependant tweaks.
|
||||
. AttrDef_HTML_Pixels now takes a single construction parameter, pixels.
|
||||
. ConfigSchema data-structure heavily optimized; on average it uses a third
|
||||
the memory it did previously. The interface has changed accordingly,
|
||||
consult changes to HTMLPurifier_Config for details.
|
||||
. Variable parsing types now are magic integers instead of strings
|
||||
. Added benchmark for ConfigSchema
|
||||
. HTMLPurifier_Generator requires $config and $context parameters. If you
|
||||
don't know what they should be, use HTMLPurifier_Config::createDefault()
|
||||
and new HTMLPurifier_Context().
|
||||
. Printers now properly distinguish between output configuration, and
|
||||
target configuration. This is not applicable to scripts using
|
||||
the Printers for HTML Purifier related tasks.
|
||||
. HTML/CSS Printers must be primed with prepareGenerator($gen_config), otherwise
|
||||
fatal errors will ensue.
|
||||
. URIFilter->prepare can return false in order to abort loading of the filter
|
||||
. Factory for AttrDef_URI implemented, URI#embedded to indicate URI that embeds
|
||||
an external resource.
|
||||
. %URI.Munge functionality factored out into a post-filter class.
|
||||
. Added CurrentCSSProperty context variable during CSS validation
|
||||
|
||||
3.1.0, released 2008-05-18
|
||||
# Unnecessary references to objects (vestiges of PHP4) removed from method
|
||||
signatures. The following methods do not need references when assigning from
|
||||
them and will result in E_STRICT errors if you try:
|
||||
+ HTMLPurifier_Config->get*Definition() [* = HTML, CSS]
|
||||
+ HTMLPurifier_ConfigSchema::instance()
|
||||
+ HTMLPurifier_DefinitionCacheFactory::instance()
|
||||
+ HTMLPurifier_DefinitionCacheFactory->create()
|
||||
+ HTMLPurifier_DoctypeRegistry->register()
|
||||
+ HTMLPurifier_DoctypeRegistry->get()
|
||||
+ HTMLPurifier_HTMLModule->addElement()
|
||||
+ HTMLPurifier_HTMLModule->addBlankElement()
|
||||
+ HTMLPurifier_LanguageFactory::instance()
|
||||
# Printer_ConfigForm's get*() functions were static-ified
|
||||
# %HTML.ForbiddenAttributes requires attribute declarations to be in the
|
||||
form of tag@attr, NOT tag.attr (which will throw an error and won't do
|
||||
anything). This is for forwards compatibility with XML; you'd do best
|
||||
to migrate an %HTML.AllowedAttributes directives to this syntax too.
|
||||
! Allow index to be false for config from form creation
|
||||
! Added HTMLPurifier::VERSION constant
|
||||
! Commas, not dashes, used for serializer IDs. This change is forwards-compatible
|
||||
and allows for version numbers like "3.1.0-dev".
|
||||
! %HTML.Allowed deals gracefully with whitespace anywhere, anytime!
|
||||
! HTML Purifier's URI handling is a lot more robust, with much stricter
|
||||
validation checks and better percent encoding handling. Thanks Gareth Heyes
|
||||
for indicating security vulnerabilities from lax percent encoding.
|
||||
! Bootstrap autoloader deals more robustly with classes that don't exist,
|
||||
preventing class_exists($class, true) from barfing.
|
||||
- InterchangeBuilder now alphabetizes its lists
|
||||
- Validation error in configdoc output fixed
|
||||
- Iconv and other encoding errors muted even with custom error handlers that
|
||||
do not honor error_reporting
|
||||
- Add protection against imagecrash attack with CSS height/width
|
||||
- HTMLPurifier::instance() created for consistency, is equivalent to getInstance()
|
||||
- Fixed and revamped broken ConfigForm smoketest
|
||||
- Bug with bool/null fields in Printer_ConfigForm fixed
|
||||
- Bug with global forbidden attributes fixed
|
||||
- Improved error messages for allowed and forbidden HTML elements and attributes
|
||||
- Missing (or null) in configdoc documentation restored
|
||||
- If DOM throws and exception during parsing with PH5P (occurs in newer versions
|
||||
of DOM), HTML Purifier punts to DirectLex
|
||||
- Fatal error with unserialization of ScriptRequired
|
||||
- Created directories are now chmod'ed properly
|
||||
- Fixed bug with fallback languages in LanguageFactory
|
||||
- Standalone testing setup properly with autoload
|
||||
. Out-of-date documentation revised
|
||||
. UTF-8 encoding check optimization as suggested by Diego
|
||||
. HTMLPurifier_Error removed in favor of exceptions
|
||||
. More copy() function removed; should use clone instead
|
||||
. More extensive unit tests for HTMLDefinition
|
||||
. assertPurification moved to central harness
|
||||
. HTMLPurifier_Generator accepts $config and $context parameters during
|
||||
instantiation, not runtime
|
||||
. Double-quotes outside of attribute values are now unescaped
|
||||
|
||||
3.1.0rc1, released 2008-04-22
|
||||
# Autoload support added. Internal require_once's removed in favor of an
|
||||
explicit require list or autoloading. To use HTML Purifier,
|
||||
you must now either use HTMLPurifier.auto.php
|
||||
or HTMLPurifier.includes.php; setting the include path and including
|
||||
HTMLPurifier.php is insufficient--in such cases include HTMLPurifier.autoload.php
|
||||
as well to register our autoload handler (or modify your autoload function
|
||||
to check HTMLPurifier_Bootstrap::getPath($class)). You can also use
|
||||
HTMLPurifier.safe-includes.php for a less performance friendly but more
|
||||
user-friendly library load.
|
||||
# HTMLPurifier_ConfigSchema static functions are officially deprecated. Schema
|
||||
information is stored in the ConfigSchema directory, and the
|
||||
maintenance/generate-schema-cache.php generates the schema.ser file, which
|
||||
is now instantiated. Support for userland schema changes coming soon!
|
||||
# HTMLPurifier_Config will now throw E_USER_NOTICE when you use a directive
|
||||
alias; to get rid of these errors just modify your configuration to use
|
||||
the new directive name.
|
||||
# HTMLPurifier->addFilter is deprecated; built-in filters can now be
|
||||
enabled using %Filter.$filter_name or by setting your own filters using
|
||||
%Filter.Custom
|
||||
# Directive-level safety properties superceded in favor of module-level
|
||||
safety. Internal method HTMLModule->addElement() has changed, although
|
||||
the externally visible HTMLDefinition->addElement has *not* changed.
|
||||
! Extra utility classes for testing and non-library operations can
|
||||
be found in extras/. Specifically, these are FSTools and ConfigDoc.
|
||||
You may find a use for these in your own project, but right now they
|
||||
are highly experimental and volatile.
|
||||
! Integration with PHPT allows for automated smoketests
|
||||
! Limited support for proprietary HTML elements, namely <marquee>, sponsored
|
||||
by Chris. You can enable them with %HTML.Proprietary if your client
|
||||
demands them.
|
||||
! Support for !important CSS cascade modifier. By default, this will be stripped
|
||||
from CSS, but you can enable it using %CSS.AllowImportant
|
||||
! Support for display and visibility CSS properties added, set %CSS.AllowTricky
|
||||
to true to use them.
|
||||
! HTML Purifier now has its own Exception hierarchy under HTMLPurifier_Exception.
|
||||
Developer error (not enduser error) can cause these to be triggered.
|
||||
! Experimental kses() wrapper introduced with HTMLPurifier.kses.php
|
||||
! Finally %CSS.AllowedProperties for tweaking allowed CSS properties without
|
||||
mucking around with HTMLPurifier_CSSDefinition
|
||||
! ConfigDoc output has been enhanced with version and deprecation info.
|
||||
! %HTML.ForbiddenAttributes and %HTML.ForbiddenElements implemented.
|
||||
- Autoclose now operates iteratively, i.e. <span><span><div> now has
|
||||
both span tags closed.
|
||||
- Various HTMLPurifier_Config convenience functions now accept another parameter
|
||||
$schema which defines what HTMLPurifier_ConfigSchema to use besides the
|
||||
global default.
|
||||
- Fix bug with trusted script handling in libxml versions later than 2.6.28.
|
||||
- Fix bug in ExtractStyleBlocks with comments in style tags
|
||||
- Fix bug in comment parsing for DirectLex
|
||||
- Flush output now displayed when in command line mode for unit tester
|
||||
- Fix bug with rgb(0, 1, 2) color syntax with spaces inside shorthand syntax
|
||||
- HTMLPurifier_HTMLDefinition->addAttribute can now be called multiple times
|
||||
on the same element without emitting errors.
|
||||
- Fixed fatal error in PH5P lexer with invalid tag names
|
||||
. Plugins now get their own changelogs according to project conventions.
|
||||
. Convert tokens to use instanceof, reducing memory footprint and
|
||||
improving comparison speed.
|
||||
. Dry runs now supported in SimpleTest; testing facilities improved
|
||||
. Bootstrap class added for handling autoloading functionality
|
||||
. Implemented recursive glob at FSTools->globr
|
||||
. ConfigSchema now has instance methods for all corresponding define*
|
||||
static methods.
|
||||
. A couple of new historical maintenance scripts were added.
|
||||
. HTMLPurifier/HTMLModule/Tidy/XHTMLAndHTML4.php split into two files
|
||||
. tests/index.php can now be run from any directory.
|
||||
. HTMLPurifier_Token subclasses split into seperate files
|
||||
. HTMLPURIFIER_PREFIX now is defined in Bootstrap.php, NOT HTMLPurifier.php
|
||||
. HTMLPURIFIER_PREFIX can now be defined outside of HTML Purifier
|
||||
. New --php=php flag added, allows PHP executable to be specified (command
|
||||
line only!)
|
||||
. htmlpurifier_add_test() preferred method to translate test files in to
|
||||
classes, because it handles PHPT files too.
|
||||
. Debugger class is deprecated and will be removed soon.
|
||||
. Command line argument parsing for testing scripts revamped, now --opt value
|
||||
format is supported.
|
||||
. Smoketests now cleanup after magic quotes
|
||||
. Generator now can output comments (however, comments are still stripped
|
||||
from HTML Purifier output)
|
||||
. HTMLPurifier_ConfigSchema->validate() deprecated in favor of
|
||||
HTMLPurifier_VarParser->parse()
|
||||
. Integers auto-cast into float type by VarParser.
|
||||
. HTMLPURIFIER_STRICT removed; no validation is performed on runtime, only
|
||||
during cache generation
|
||||
. Reordered script calls in maintenance/flush.php
|
||||
. Command line scripts now honor exit codes
|
||||
. When --flush fails in unit testers, abort tests and print message
|
||||
. Improved documentation in docs/dev-flush.html about the maintenance scripts
|
||||
. copy() methods removed in favor of clone keyword
|
||||
|
||||
3.0.0, released 2008-01-06
|
||||
# HTML Purifier is PHP 5 only! The 2.1.x branch will be maintained
|
||||
until PHP 4 is completely deprecated, but no new features will be added
|
||||
to it.
|
||||
+ Visibility declarations added
|
||||
+ Constructor methods renamed to __construct()
|
||||
+ PHP4 reference cruft removed (in progress)
|
||||
! CSS properties are now case-insensitive
|
||||
! DefinitionCacheFactory now can register new implementations
|
||||
! New HTMLPurifier_Filter_ExtractStyleBlocks for extracting <style> from
|
||||
documents and cleaning their contents up. Requires the CSSTidy library
|
||||
<http://csstidy.sourceforge.net/>. You can access the blocks with the
|
||||
'StyleBlocks' Context variable ($purifier->context->get('StyleBlocks')).
|
||||
The output CSS can also be "scoped" for a specific element, use:
|
||||
%Filter.ExtractStyleBlocksScope
|
||||
! Experimental support for some proprietary CSS attributes allowed:
|
||||
opacity (and all of the browser-specific equivalents) and scrollbar colors.
|
||||
Enable by setting %CSS.Proprietary to true.
|
||||
- Colors missing # but in hex form will be corrected
|
||||
- CSS Number algorithm improved
|
||||
- Unit testing and multi-testing now on steroids: command lines,
|
||||
XML output, and other goodies now added.
|
||||
. Unit tests for Injector improved
|
||||
. New classes:
|
||||
+ HTMLPurifier_AttrDef_CSS_AlphaValue
|
||||
+ HTMLPurifier_AttrDef_CSS_Filter
|
||||
. Multitest now has a file docblock
|
||||
|
||||
2.1.3, released 2007-11-05
|
||||
! tests/multitest.php allows you to test multiple versions by running
|
||||
tests/index.php through multiple interpreters using `phpv` shell
|
||||
script (you must provide this script!)
|
||||
- Fixed poor include ordering for Email URI AttrDefs, causes fatal errors
|
||||
on some systems.
|
||||
- Injector algorithm further refined: off-by-one error regarding skip
|
||||
counts for dormant injectors fixed
|
||||
- Corrective blockquote definition now enabled for HTML 4.01 Strict
|
||||
- Fatal error when <img> tag (or any other element with required attributes)
|
||||
has 'id' attribute fixed, thanks NykO18 for reporting
|
||||
- Fix warning emitted when a non-supported URI scheme is passed to the
|
||||
MakeAbsolute URIFilter, thanks NykO18 (again)
|
||||
- Further refine AutoParagraph injector. Behavior inside of elements
|
||||
allowing paragraph tags clarified: only inline content delimeted by
|
||||
double newlines (not block elements) are paragraphed.
|
||||
- Buggy treatment of end tags of elements that have required attributes
|
||||
fixed (does not manifest on default tag-set)
|
||||
- Spurious internal content reorganization error suppressed
|
||||
- HTMLDefinition->addElement now returns a reference to the created
|
||||
element object, as implied by the documentation
|
||||
- Phorum mod's HTML Purifier help message expanded (unreleased elsewhere)
|
||||
- Fix a theoretical class of infinite loops from DirectLex reported
|
||||
by Nate Abele
|
||||
- Work around unnecessary DOMElement type-cast in PH5P that caused errors
|
||||
in PHP 5.1
|
||||
- Work around PHP 4 SimpleTest lack-of-error complaining for one-time-only
|
||||
HTMLDefinition errors, this may indicate problems with error-collecting
|
||||
facilities in PHP 5
|
||||
- Make ErrorCollectorEMock work in both PHP 4 and PHP 5
|
||||
- Make PH5P work with PHP 5.0 by removing unnecessary array parameter typedef
|
||||
. %Core.AcceptFullDocuments renamed to %Core.ConvertDocumentToFragment
|
||||
to better communicate its purpose
|
||||
. Error unit tests can now specify the expectation of no errors. Future
|
||||
iterations of the harness will be extremely strict about what errors
|
||||
are allowed
|
||||
. Extend Injector hooks to allow for more powerful injector routines
|
||||
. HTMLDefinition->addBlankElement created, as according to the HTMLModule
|
||||
method
|
||||
. Doxygen configuration file updated, with minor improvements
|
||||
. Test runner now checks for similarly named files in conf/ directory too.
|
||||
. Minor cosmetic change to flush-definition-cache.php: trailing newline is
|
||||
outputted
|
||||
. Maintenance script for generating PH5P patch added, original PH5P source
|
||||
file also added under version control
|
||||
. Full unit test runner script title made more descriptive with PHP version
|
||||
. Updated INSTALL file to state that 4.3.7 is the earliest version we
|
||||
are actively testing
|
||||
|
||||
2.1.2, released 2007-09-03
|
||||
! Implemented Object module for trusted users
|
||||
! Implemented experimental HTML5 parsing mode using PH5P. To use, add
|
||||
this to your code:
|
||||
require_once 'HTMLPurifier/Lexer/PH5P.php';
|
||||
$config->set('Core', 'LexerImpl', 'PH5P');
|
||||
Note that this Lexer introduces some classes not in the HTMLPurifier
|
||||
namespace. Also, this is PHP5 only.
|
||||
! CSS property border-spacing implemented
|
||||
- Fix non-visible parsing error in DirectLex with empty tags that have
|
||||
slashes inside attribute values.
|
||||
- Fix typo in CSS definition: border-collapse:seperate; was incorrectly
|
||||
accepted as valid CSS. Usually non-visible, because this styling is the
|
||||
default for tables in most browsers. Thanks Brett Zamir for pointing
|
||||
this out.
|
||||
- Fix validation errors in configuration form
|
||||
- Hammer out a bunch of edge-case bugs in the standalone distribution
|
||||
- Inclusion reflection removed from URISchemeRegistry; you must manually
|
||||
include any new schema files you wish to use
|
||||
- Numerous typo fixes in documentation thanks to Brett Zamir
|
||||
. Unit test refactoring for one logical test per test function
|
||||
. Config and context parameters in ComplexHarness deprecated: instead, edit
|
||||
the $config and $context member variables
|
||||
. HTML wrapper in DOMLex now takes DTD identifiers into account; doesn't
|
||||
really make a difference, but is good for completeness sake
|
||||
. merge-library.php script refactored for greater code reusability and
|
||||
PHP4 compatibility
|
||||
|
||||
2.1.1, released 2007-08-04
|
||||
- Fix show-stopper bug in %URI.MakeAbsolute functionality
|
||||
- Fix PHP4 syntax error in standalone version
|
||||
. Add prefix directory to include path for standalone, this prevents
|
||||
other installations from clobbering the standalone's URI schemes
|
||||
. Single test methods can be invoked by prefixing with __only
|
||||
|
||||
2.1.0, released 2007-08-02
|
||||
# flush-htmldefinition-cache.php superseded in favor of a generic
|
||||
flush-definition-cache.php script, you can clear a specific cache
|
||||
by passing its name as a parameter to the script
|
||||
! Phorum mod implemented for HTML Purifier
|
||||
! With %Core.AggressivelyFixLt, <3 and similar emoticons no longer
|
||||
trigger HTML removal in PHP5 (DOMLex). This directive is not necessary
|
||||
for PHP4 (DirectLex).
|
||||
! Standalone file now available, which greatly reduces the amount of
|
||||
includes (although there are still a few files that reside in the
|
||||
standalone folder)
|
||||
! Relative URIs can now be transformed into their absolute equivalents
|
||||
using %URI.Base and %URI.MakeAbsolute
|
||||
! Ruby implemented for XHTML 1.1
|
||||
! You can now define custom URI filtering behavior, see enduser-uri-filter.html
|
||||
for more details
|
||||
! UTF-8 font names now supported in CSS
|
||||
- AutoFormatters emit friendly error messages if tags or attributes they
|
||||
need are not allowed
|
||||
- ConfigForm's compactification of directive names is now configurable
|
||||
- AutoParagraph autoformatter algorithm refined after field-testing
|
||||
- XHTML 1.1 now applies XHTML 1.0 Strict cleanup routines, namely
|
||||
blockquote wrapping
|
||||
- Contents of <style> tags removed by default when tags are removed
|
||||
. HTMLPurifier_Config->getSerial() implemented, this is extremely useful
|
||||
for output cache invalidation
|
||||
. ConfigForm printer now can retrieve CSS and JS files as strings, in
|
||||
case HTML Purifier's directory is not publically accessible
|
||||
. Introduce new text/itext configuration directive values: these represent
|
||||
longer strings that would be more appropriately edited with a textarea
|
||||
. Allow newlines to act as separators for lists, hashes, lookups and
|
||||
%HTML.Allowed
|
||||
. ConfigForm generates textareas instead of text inputs for lists, hashes,
|
||||
lookups, text and itext fields
|
||||
. Hidden element content removal genericized: %Core.HiddenElements can
|
||||
be used to customize this behavior, by default <script> and <style> are
|
||||
hidden
|
||||
. Added HTMLPURIFIER_PREFIX constant, should be used instead of dirname(__FILE__)
|
||||
. Custom ChildDef added to default include list
|
||||
. URIScheme reflection improved: will not attempt to include file if class
|
||||
already exists. May clobber autoload, so I need to keep an eye on it
|
||||
. ConfigSchema heavily optimized, will only collect information and validate
|
||||
definitions when HTMLPURIFIER_SCHEMA_STRICT is true.
|
||||
. AttrDef_URI unit tests and implementation refactored
|
||||
. benchmarks/ directory now protected from public view with .htaccess file;
|
||||
run the tests via command line
|
||||
. URI scheme is munged off if there is no authority and the scheme is the
|
||||
default one
|
||||
. All unit tests inherit from HTMLPurifier_Harness, not UnitTestCase
|
||||
. Interface for URIScheme changed
|
||||
. Generic URI object to hold components of URI added, most systems involved
|
||||
in URI validation have been migrated to use it
|
||||
. Custom filtering for URIs factored out to URIDefinition interface for
|
||||
maximum extensibility
|
||||
|
||||
2.0.1, released 2007-06-27
|
||||
! Tag auto-closing now based on a ChildDef heuristic rather than a
|
||||
manually set auto_close array; some behavior may change
|
||||
! Experimental AutoFormat functionality added: auto-paragraph and
|
||||
linkify your HTML input by setting %AutoFormat.AutoParagraph and
|
||||
%AutoFormat.Linkify to true
|
||||
! Newlines normalized internally, and then converted back to the
|
||||
value of PHP_EOL. If this is not desired, set your newline format
|
||||
using %Output.Newline.
|
||||
! Beta error collection, messages are implemented for the most generic
|
||||
cases involving Lexing or Strategies
|
||||
- Clean up special case code for <script> tags
|
||||
- Reorder includes for DefinitionCache decorators, fixes a possible
|
||||
missing class error
|
||||
- Fixed bug where manually modified definitions were not saved via cache
|
||||
(mostly harmless, except for the fact that it would be a little slower)
|
||||
- Configuration objects with different serials do not clobber each
|
||||
others when revision numbers are unequal
|
||||
- Improve Serializer DefinitionCache directory permissions checks
|
||||
- DefinitionCache no longer throws errors when it encounters old
|
||||
serial files that do not conform to the current style
|
||||
- Stray xmlns attributes removed from configuration documentation
|
||||
- configForm.php smoketest no longer has XSS vulnerability due to
|
||||
unescaped print_r output
|
||||
- Printer adheres to configuration's directives on output format
|
||||
- Fix improperly named form field in ConfigForm printer
|
||||
. Rewire some test-cases to swallow errors rather than expect them
|
||||
. HTMLDefinition printer updated with some of the new attributes
|
||||
. DefinitionCache keys reordered to reflect precedence: version number,
|
||||
hash, then revision number
|
||||
. %Core.DefinitionCache renamed to %Cache.DefinitionImpl
|
||||
. Interlinking in configuration documentation added using
|
||||
Injector_PurifierLinkify
|
||||
. Directives now keep track of aliases to themselves
|
||||
. Error collector now requires a severity to be passed, use PHP's internal
|
||||
error constants for this
|
||||
. HTMLPurifier_Config::getAllowedDirectivesForForm implemented, allows
|
||||
much easier selective embedding of configuration values
|
||||
. Doctype objects now accept public and system DTD identifiers
|
||||
. %HTML.Doctype is now constrained by specific values, to specify a custom
|
||||
doctype use new %HTML.CustomDoctype
|
||||
. ConfigForm truncates long directives to keep the form small, and does
|
||||
not re-output namespaces
|
||||
|
||||
2.0.0, released 2007-06-20
|
||||
# Completely refactored HTMLModuleManager, decentralizing safety
|
||||
information
|
||||
# Transform modules changed to Tidy modules, which offer more flexibility
|
||||
and better modularization
|
||||
# Configuration object now finalizes itself when a read operation is
|
||||
performed on it, ensuring that its internal state stays consistent.
|
||||
To revert this behavior, you can set the $autoFinalize member variable
|
||||
off, but it's not recommended.
|
||||
# New compact syntax for AttrDef objects that can be used to instantiate
|
||||
new objects via make()
|
||||
# Definitions (esp. HTMLDefinition) are now cached for a significant
|
||||
performance boost. You can disable caching by setting %Core.DefinitionCache
|
||||
to null. You CANNOT edit raw definitions without setting the corresponding
|
||||
DefinitionID directive (%HTML.DefinitionID for HTMLDefinition).
|
||||
# Contents between <script> tags are now completely removed if <script>
|
||||
is not allowed
|
||||
# Prototype-declarations for Lexer removed in favor of configuration
|
||||
determination of Lexer implementations.
|
||||
! HTML Purifier now works in PHP 4.3.2.
|
||||
! Configuration form-editing API makes tweaking HTMLPurifier_Config a
|
||||
breeze!
|
||||
! Configuration directives that accept hashes now allow new string
|
||||
format: key1:value1,key2:value2
|
||||
! ConfigDoc now factored into OOP design
|
||||
! All deprecated elements now natively supported
|
||||
! Implement TinyMCE styled whitelist specification format in
|
||||
%HTML.Allowed
|
||||
! Config object gives more friendly error messages when things go wrong
|
||||
! Advanced API implemented: easy functions for creating elements (addElement)
|
||||
and attributes (addAttribute) on HTMLDefinition
|
||||
! Add native support for required attributes
|
||||
- Deprecated and removed EnableRedundantUTF8Cleaning. It didn't even work!
|
||||
- DOMLex will not emit errors when a custom error handler that does not
|
||||
honor error_reporting is used
|
||||
- StrictBlockquote child definition refrains from wrapping whitespace
|
||||
in tags now.
|
||||
- Bug resulting from tag transforms to non-allowed elements fixed
|
||||
- ChildDef_Custom's regex generation has been improved, removing several
|
||||
false positives
|
||||
. Unit test for ElementDef created, ElementDef behavior modified to
|
||||
be more flexible
|
||||
. Added convenience functions for HTMLModule constructors
|
||||
. AttrTypes now has accessor functions that should be used instead
|
||||
of directly manipulating info
|
||||
. TagTransform_Center deprecated in favor of generic TagTransform_Simple
|
||||
. Add extra protection in AttrDef_URI against phantom Schemes
|
||||
. Doctype object added to HTMLDefinition which describes certain aspects
|
||||
of the operational document type
|
||||
. Lexer is now pre-emptively included, with a conditional include for the
|
||||
PHP5 only version.
|
||||
. HTMLDefinition and CSSDefinition have a common parent class: Definition.
|
||||
. DirectLex can now track line-numbers
|
||||
. Preliminary error collector is in place, although no code actually reports
|
||||
errors yet
|
||||
. Factor out most of ValidateAttributes to new AttrValidator class
|
||||
|
||||
1.6.1, released 2007-05-05
|
||||
! Support for more deprecated attributes via transformations:
|
||||
+ hspace and vspace in img
|
||||
+ size and noshade in hr
|
||||
+ nowrap in td
|
||||
+ clear in br
|
||||
+ align in caption, table, img and hr
|
||||
+ type in ul, ol and li
|
||||
! DirectLex now preserves text in which a < bracket is followed by
|
||||
a non-alphanumeric character. This means that certain emoticons
|
||||
are now preserved.
|
||||
! %Core.RemoveInvalidImg is now operational, when set to false invalid
|
||||
images will hang around with an empty src
|
||||
! target attribute in a tag supported, use %Attr.AllowedFrameTargets
|
||||
to enable
|
||||
! CSS property white-space now allows nowrap (supported in all modern
|
||||
browsers) but not others (which have spotty browser implementations)
|
||||
! XHTML 1.1 mode now sort-of works without any fatal errors, and
|
||||
lang is now moved over to xml:lang.
|
||||
! Attribute transformation smoketest available at smoketests/attrTransform.php
|
||||
! Transformation of font's size attribute now handles super-large numbers
|
||||
- Possibly fatal bug with __autoload() fixed in module manager
|
||||
- Invert HTMLModuleManager->addModule() processing order to check
|
||||
prefixes first and then the literal module
|
||||
- Empty strings get converted to empty arrays instead of arrays with
|
||||
an empty string in them.
|
||||
- Merging in attribute lists now works.
|
||||
. Demo script removed: it has been added to the website's repository
|
||||
. Basic.php script modified to work out of the box
|
||||
. Refactor AttrTransform classes to reduce duplication
|
||||
. AttrTransform_TextAlign axed in favor of a more general
|
||||
AttrTransform_EnumToCSS, refer to HTMLModule/TransformToStrict.php to
|
||||
see how the new equivalent is implemented
|
||||
. Unit tests now use exclusively assertIdentical
|
||||
|
||||
1.6.0, released 2007-04-01
|
||||
! Support for most common deprecated attributes via transformations:
|
||||
+ bgcolor in td, th, tr and table
|
||||
+ border in img
|
||||
+ name in a and img
|
||||
+ width in td, th and hr
|
||||
+ height in td, th
|
||||
! Support for CSS attribute 'height' added
|
||||
! Support for rel and rev attributes in a tags added, use %Attr.AllowedRel
|
||||
and %Attr.AllowedRev to activate
|
||||
- You can define ID blacklists using regular expressions via
|
||||
%Attr.IDBlacklistRegexp
|
||||
- Error messages are emitted when you attempt to "allow" elements or
|
||||
attributes that HTML Purifier does not support
|
||||
- Fix segfault in unit test. The problem is not very reproduceable and
|
||||
I don't know what causes it, but a six line patch fixed it.
|
||||
|
||||
1.5.0, released 2007-03-23
|
||||
! Added a rudimentary I18N and L10N system modeled off MediaWiki. It
|
||||
doesn't actually do anything yet, but keep your eyes peeled.
|
||||
! docs/enduser-utf8.html explains how to use UTF-8 and HTML Purifier
|
||||
! Newly structured HTMLDefinition modeled off of XHTML 1.1 modules.
|
||||
I am loathe to release beta quality APIs, but this is exactly that;
|
||||
don't use the internal interfaces if you're not willing to do migration
|
||||
later on.
|
||||
- Allow 'x' subtag in language codes
|
||||
- Fixed buggy chameleon-support for ins and del
|
||||
. Added support for IDREF attributes (i.e. for)
|
||||
. Renamed HTMLPurifier_AttrDef_Class to HTMLPurifier_AttrDef_Nmtokens
|
||||
. Removed context variable ParentType, replaced with IsInline, which
|
||||
is false when you're not inline and an integer of the parent that
|
||||
caused you to become inline when you are (so possibly zero)
|
||||
. Removed ElementDef->type in favor of ElementDef->descendants_are_inline
|
||||
and HTMLDefinition->content_sets
|
||||
. StrictBlockquote now reports what elements its supposed to allow,
|
||||
rather than what it does allow
|
||||
. Removed HTMLDefinition->info_flow_elements in favor of
|
||||
HTMLDefinition->content_sets['Flow']
|
||||
. Removed redundant "exclusionary" definitions from DTD roster
|
||||
. StrictBlockquote now requires a construction parameter as if it
|
||||
were an Required ChildDef, this is the "real" set of allowed elements
|
||||
. AttrDef partitioned into HTML, CSS and URI segments
|
||||
. Modify Youtube filter regexp to be multiline
|
||||
. Require both PHP5 and DOM extension in order to use DOMLex, fixes
|
||||
some edge cases where a DOMDocument class exists in a PHP4 environment
|
||||
due to DOM XML extension.
|
||||
|
||||
1.4.1, released 2007-01-21
|
||||
! docs/enduser-youtube.html updated according to new functionality
|
||||
- YouTube IDs can have underscores and dashes
|
||||
|
2
README
2
README
@@ -19,4 +19,4 @@ Places to go:
|
||||
an in-depth installation guide.
|
||||
* See WYSIWYG for information on editors like TinyMCE and FCKeditor
|
||||
|
||||
HTML Purifier can be found on the web at: http://hp.jpsband.org/
|
||||
HTML Purifier can be found on the web at: http://htmlpurifier.org/
|
||||
|
146
TODO
146
TODO
@@ -4,36 +4,39 @@ TODO List
|
||||
= KEY ====================
|
||||
# Flagship
|
||||
- Regular
|
||||
? At-risk
|
||||
? Maybe I'll Do It
|
||||
==========================
|
||||
|
||||
1.5 release
|
||||
# Implement all non-essential attribute transforms, configurable
|
||||
# URI validation routines tighter (see docs/dev-code-quality.html) (COMPLEX)
|
||||
# Advanced URI filtering schemes (see docs/proposal-new-directives.txt)
|
||||
If no interest is expressed for a feature that may require a considerable
|
||||
amount of effort to implement, it may get endlessly delayed. Do not be
|
||||
afraid to cast your vote for the next feature to be implemented!
|
||||
|
||||
- Investigate how early internal structures can be accessed; this would
|
||||
prevent structures from being parsed and serialized multiple times.
|
||||
- Built-in support for target="_blank" on all external links
|
||||
- Gitify the repository
|
||||
|
||||
FUTURE VERSIONS
|
||||
---------------
|
||||
|
||||
3.2 release [It's All About Trust] (floating)
|
||||
# Implement untrusted, dangerous elements/attributes
|
||||
- Forms are especially wanted
|
||||
# Implement IDREF support (harder than it seems, since you cannot have
|
||||
IDREFs to non-existent IDs)
|
||||
# Frameset XHTML 1.0 and HTML 4.01 doctypes
|
||||
- Implement <area>
|
||||
- Figure out how to simultaneously set %CSS.Trusted and %HTML.Trusted (?)
|
||||
|
||||
3.3 release [Error'ed]
|
||||
# Error logging for filtering/cleanup procedures
|
||||
- Requires I18N facilities to be created first (COMPLEX)
|
||||
? Configuration profiles: sets of directives that get set with one func call
|
||||
- XSS-attempt detection
|
||||
- XSS-attempt detection--certain errors are flagged XSS-like
|
||||
|
||||
1.6 release
|
||||
# Add pre-packaged "levels" of cleaning (custom behavior already done)
|
||||
- More fine-grained control over escaping behavior
|
||||
- Silently drop content inbetween SCRIPT tags (can be generalized to allow
|
||||
specification of elements that, when detected as foreign, trigger removal
|
||||
of children, although unbalanced tags could wreck havoc (or at least
|
||||
delete the rest of the document)).
|
||||
- Allow specifying global attributes on a tag-by-tag basis in
|
||||
%HTML.AllowAttributes
|
||||
? More user-friendly warnings when %HTML.Allow* attempts to specify a
|
||||
tag or attribute that is not supported
|
||||
- Parse TinyMCE whitelist into our %HTML.Allow* whitelists
|
||||
|
||||
1.7 release
|
||||
3.4 release [Do What I Mean, Not What I Say]
|
||||
# Additional support for poorly written HTML
|
||||
- Microsoft Word HTML cleaning (i.e. MsoNormal, but research essential!)
|
||||
- Friendly strict handling of <address> (block -> <br>)
|
||||
- Remove redundant tags, ex. <u><u>Underlined</u></u>. Implementation notes:
|
||||
? Remove redundant tags, ex. <u><u>Underlined</u></u>. Implementation notes:
|
||||
1. Analyzing which tags to remove duplicants
|
||||
2. Ensure attributes are merged into the parent tag
|
||||
3. Extend the tag exclusion system to specify whether or not the
|
||||
@@ -43,49 +46,86 @@ TODO List
|
||||
- Remove empty inline tags<i></i>
|
||||
- Append something to duplicate IDs so they're still usable (impl. note: the
|
||||
dupe detector would also need to detect the suffix as well)
|
||||
- Externalize inline CSS to promote clean HTML, proposed by Sander Tekelenburg
|
||||
|
||||
2.0 release
|
||||
4.0 release [Beyond HTML]
|
||||
# Legit token based CSS parsing (will require revamping almost every
|
||||
AttrDef class)
|
||||
# Formatters for plaintext (COMPLEX)
|
||||
- Auto-paragraphing (be sure to leverage fact that we know when things
|
||||
shouldn't be paragraphed, such as lists and tables).
|
||||
- Linkify URLs
|
||||
- Smileys
|
||||
- Linkification for HTML Purifier docs: notably configuration and classes
|
||||
AttrDef class). Probably will use CSSTidy class?
|
||||
# More control over allowed CSS properties using a modularization
|
||||
# HTML 5 support
|
||||
# IRI support
|
||||
- Standardize token armor for all areas of processing
|
||||
- Convert RTL/LTR override characters to <bdo> tags, or vice versa on demand.
|
||||
Also, enable disabling of directionality
|
||||
|
||||
3.0 release
|
||||
5.0 release [To XML and Beyond]
|
||||
- AllowedAttributes and ForbiddenAttributes step on the toes of XML by
|
||||
using periods; this needs to be changed.
|
||||
- Extended HTML capabilities based on namespacing and tag transforms (COMPLEX)
|
||||
- Hooks for adding custom processors to custom namespaced tags and
|
||||
attributes, offer default implementation
|
||||
- Lots of documentation and samples
|
||||
- Allow tags to be "armored", an internal flag that protects them
|
||||
from validation and passes them out unharmed
|
||||
- XHTML 1.1 support
|
||||
|
||||
Ongoing
|
||||
- More refactoring to take advantage of PHP5's facilities
|
||||
- Refactor unit tests into lots of test methods
|
||||
- Plugins for major CMSes (COMPLEX)
|
||||
- phpBB
|
||||
- Drupal needs loving!
|
||||
- Phorum need loving!
|
||||
- more! (look for ones that use WYSIWYGs)
|
||||
- Also, maybe a FAQ for extension writers with HTML Purifier
|
||||
|
||||
AutoFormat
|
||||
- Smileys
|
||||
- Syntax highlighting (with GeSHi) with <pre> and possibly <?php
|
||||
- Look at http://drupal.org/project/Modules/category/63 for ideas
|
||||
|
||||
Optimizations
|
||||
- Reduce size of internal data-structures (esp. HTMLDefinition)
|
||||
- Research memory usage of objects versus arrays
|
||||
- Combine multiple strategies into a single, single-pass strategy
|
||||
- Get PH5P working with the latest versions of DOM, which have much more
|
||||
stringent error checking procedures. Maybe convert straight to tokens.
|
||||
- Get rid of set_include_path(). Save this for another major release.
|
||||
|
||||
Neat feature related
|
||||
! Factor demo.php into a set of Printer classes, and then create a stub
|
||||
file for users here (inside the actual HTML Purifier library)
|
||||
! Support exporting configuration, so users can easily tweak settings
|
||||
in the demo, and then copy-paste into their own setup
|
||||
- Advanced URI filtering schemes (see docs/proposal-new-directives.txt)
|
||||
- Allow scoped="scoped" attribute in <style> tags; may be troublesome
|
||||
because regular CSS has no way of uniquely identifying nodes, so we'd
|
||||
have to generate IDs
|
||||
- Explain how to use HTML Purifier in non-PHP languages / create
|
||||
a simple command line stub (or complicated?)
|
||||
- Fixes for Firefox's inability to handle COL alignment props (Bug 915)
|
||||
- Automatically add non-breaking spaces to empty table cells when
|
||||
empty-cells:show is applied to have compatibility with Internet Explorer
|
||||
- Convert RTL/LTR override characters to <bdo> tags, or vice versa on demand.
|
||||
Also, enable disabling of directionality
|
||||
- Table of Contents generation (XHTML Compiler might be reusable). May also
|
||||
be out-of-band information.
|
||||
- Full set of color keywords. Also, a way to add onto them without
|
||||
finalizing the configuration object.
|
||||
- Write a var_export and memcached DefinitionCache - Denis
|
||||
- Allow restriction of allowed class values
|
||||
|
||||
Ongoing
|
||||
- Lots of profiling, make it faster!
|
||||
- Plugins for major CMSes (COMPLEX)
|
||||
- WordPress
|
||||
- eFiction
|
||||
- more! (look for ones that use WYSIWYGs)
|
||||
Maintenance related (slightly boring)
|
||||
# CHMOD install script for PEAR installs
|
||||
! Factor out command line parser into its own class, and unit test it
|
||||
! Nested configuration namespaces
|
||||
- Distinguish between default settings and explicitly set settings, so
|
||||
configurations can be merged
|
||||
- Time PHPT tests
|
||||
|
||||
Unknown release (on a scratch-an-itch basis)
|
||||
- Upgrade SimpleTest testing code to newest versions
|
||||
- Have 'lang' attribute be checked against official lists
|
||||
? Semi-lossy dumb alternate character encoding transformations, achieved by
|
||||
encoding all characters that have string entity equivalents
|
||||
|
||||
Requested
|
||||
? Native content compression, whitespace stripping (don't rely on Tidy, make
|
||||
sure we don't remove from <pre> or related tags)
|
||||
ChildDef related (very boring)
|
||||
- Abstract ChildDef_BlockQuote to work with all elements that only
|
||||
allow blocks in them, required or optional
|
||||
- Implement lenient <ruby> child validation
|
||||
|
||||
Wontfix
|
||||
- Non-lossy smart alternate character encoding transformations (unless
|
||||
patch provided)
|
||||
- Pretty-printing HTML, users can use Tidy on the output on entire page
|
||||
- Pretty-printing HTML: users can use Tidy on the output on entire page
|
||||
- Native content compression, whitespace stripping: use gzip if this is
|
||||
really important
|
||||
|
9
WHATSNEW
Normal file
9
WHATSNEW
Normal file
@@ -0,0 +1,9 @@
|
||||
HTML Purifier 3.1.1 is a security and bugfix release. This release addresses
|
||||
two security vulnerabilities, both related to CSS, and one of which only
|
||||
applies to users using Shift_JIS as their output encoding. There is also
|
||||
a security improvement regarding the imagecrash attack. There is a backwards
|
||||
incompatible change with %URI.Munge, in which resources are no longer munged
|
||||
by default; please enable using %URI.MungeResources. Besides this, there
|
||||
are numerous improvements to URI munging, esp. with the addition of
|
||||
%URI.MungeSecretKey, as well as an experimental implementation of
|
||||
%HTML.SafeObject and %HTML.SafeEmbed. There are also some memory optimizations.
|
4
WYSIWYG
4
WYSIWYG
@@ -16,7 +16,3 @@ trouble. Therein lies the solution:
|
||||
HTML Purifier is perfect for filtering pure-HTML input from WYSIWYG editors.
|
||||
|
||||
Enough said.
|
||||
|
||||
There is a proof-of-concept integration of HTML Purifier with the Mantis
|
||||
bugtracker at http://hp.jpsband.org/mantis/ You can see notes on how
|
||||
this integration was acheived at http://hp.jpsband.org/mantis_notes.txt
|
||||
|
BIN
art/100cases.png
Normal file
BIN
art/100cases.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 2.7 KiB |
1
benchmarks/.htaccess
Normal file
1
benchmarks/.htaccess
Normal file
@@ -0,0 +1 @@
|
||||
Deny from all
|
14
benchmarks/ConfigSchema.php
Normal file
14
benchmarks/ConfigSchema.php
Normal file
@@ -0,0 +1,14 @@
|
||||
<?php
|
||||
|
||||
chdir(dirname(__FILE__));
|
||||
|
||||
//require_once '../library/HTMLPurifier.path.php';
|
||||
shell_exec('php ../maintenance/generate-schema-cache.php');
|
||||
require_once '../library/HTMLPurifier.path.php';
|
||||
require_once 'HTMLPurifier.includes.php';
|
||||
|
||||
$begin = xdebug_memory_usage();
|
||||
|
||||
$schema = HTMLPurifier_ConfigSchema::makeFromSerial();
|
||||
|
||||
echo xdebug_memory_usage() - $begin;
|
@@ -1,12 +1,11 @@
|
||||
<?php
|
||||
|
||||
// emulates inserting a dir called HTMLPurifier into your class dir
|
||||
set_include_path(get_include_path() . PATH_SEPARATOR . '../library/');
|
||||
|
||||
require_once '../library/HTMLPurifier.auto.php';
|
||||
@include_once '../test-settings.php';
|
||||
|
||||
require_once 'HTMLPurifier/ConfigSchema.php';
|
||||
require_once 'HTMLPurifier/Config.php';
|
||||
// PEAR
|
||||
require_once 'Benchmark/Timer.php'; // to do the timing
|
||||
require_once 'Text/Password.php'; // for generating random input
|
||||
|
||||
$LEXERS = array();
|
||||
$RUNS = isset($GLOBALS['HTMLPurifierTest']['Runs'])
|
||||
@@ -15,22 +14,11 @@ $RUNS = isset($GLOBALS['HTMLPurifierTest']['Runs'])
|
||||
require_once 'HTMLPurifier/Lexer/DirectLex.php';
|
||||
$LEXERS['DirectLex'] = new HTMLPurifier_Lexer_DirectLex();
|
||||
|
||||
if (!empty($GLOBALS['HTMLPurifierTest']['PEAR'])) {
|
||||
require_once 'HTMLPurifier/Lexer/PEARSax3.php';
|
||||
$LEXERS['PEARSax3'] = new HTMLPurifier_Lexer_PEARSax3();
|
||||
} else {
|
||||
exit('PEAR required to perform benchmark.');
|
||||
}
|
||||
|
||||
if (version_compare(PHP_VERSION, '5', '>=')) {
|
||||
require_once 'HTMLPurifier/Lexer/DOMLex.php';
|
||||
$LEXERS['DOMLex'] = new HTMLPurifier_Lexer_DOMLex();
|
||||
}
|
||||
|
||||
// PEAR
|
||||
require_once 'Benchmark/Timer.php'; // to do the timing
|
||||
require_once 'Text/Password.php'; // for generating random input
|
||||
|
||||
// custom class to aid unit testing
|
||||
class RowTimer extends Benchmark_Timer
|
||||
{
|
||||
@@ -93,11 +81,14 @@ function print_lexers() {
|
||||
function do_benchmark($name, $document) {
|
||||
global $LEXERS, $RUNS;
|
||||
|
||||
$config = HTMLPurifier_Config::createDefault();
|
||||
$context = new HTMLPurifier_Context();
|
||||
|
||||
$timer = new RowTimer($name);
|
||||
$timer->start();
|
||||
|
||||
foreach($LEXERS as $key => $lexer) {
|
||||
for ($i=0; $i<$RUNS; $i++) $tokens = $lexer->tokenizeHTML($document);
|
||||
for ($i=0; $i<$RUNS; $i++) $tokens = $lexer->tokenizeHTML($document, $config, $context);
|
||||
$timer->setMarker($key);
|
||||
}
|
||||
|
||||
@@ -161,4 +152,4 @@ echo '<div>Random input was: ' .
|
||||
?>
|
||||
|
||||
|
||||
</body></html>
|
||||
</body></html>
|
||||
|
@@ -1,16 +0,0 @@
|
||||
<?php
|
||||
|
||||
set_include_path(get_include_path() . PATH_SEPARATOR . '../library/');
|
||||
|
||||
require_once 'HTMLPurifier/ConfigSchema.php';
|
||||
require_once 'HTMLPurifier/Config.php';
|
||||
require_once 'HTMLPurifier/Lexer/DirectLex.php';
|
||||
|
||||
$input = file_get_contents('samples/Lexer/4.html');
|
||||
$lexer = new HTMLPurifier_Lexer_DirectLex();
|
||||
|
||||
for ($i = 0; $i < 10; $i++) {
|
||||
$tokens = $lexer->tokenizeHTML($input);
|
||||
}
|
||||
|
||||
?>
|
19
benchmarks/Trace.php
Normal file
19
benchmarks/Trace.php
Normal file
@@ -0,0 +1,19 @@
|
||||
<?php
|
||||
|
||||
ini_set('xdebug.trace_format', 1);
|
||||
ini_set('xdebug.show_mem_delta', true);
|
||||
|
||||
if (file_exists('Trace.xt')) {
|
||||
echo "Previous trace Trace.xt must be removed before this script can be run.";
|
||||
exit;
|
||||
}
|
||||
|
||||
xdebug_start_trace(dirname(__FILE__) . '/Trace');
|
||||
require_once '../library/HTMLPurifier.auto.php';
|
||||
|
||||
$purifier = new HTMLPurifier();
|
||||
|
||||
$data = $purifier->purify(file_get_contents('samples/Lexer/4.html'));
|
||||
xdebug_stop_trace();
|
||||
|
||||
echo "Trace finished.";
|
@@ -2,219 +2,60 @@
|
||||
|
||||
/**
|
||||
* Generates XML and HTML documents describing configuration.
|
||||
* @note PHP 5.2+ only!
|
||||
*/
|
||||
|
||||
/*
|
||||
TODO:
|
||||
- make XML format richer (see below)
|
||||
- make XML format richer
|
||||
- extend XSLT transformation (see the corresponding XSLT file)
|
||||
- allow generation of packaged docs that can be easily moved
|
||||
- multipage documentation
|
||||
- determine how to multilingualize
|
||||
- factor out code into classes
|
||||
- add blurbs to ToC
|
||||
*/
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Check and configure environment
|
||||
if (version_compare(PHP_VERSION, '5.2', '<')) exit('PHP 5.2+ required.');
|
||||
error_reporting(E_ALL | E_STRICT);
|
||||
|
||||
if (version_compare('5', PHP_VERSION, '>')) exit('Requires PHP 5 or higher.');
|
||||
error_reporting(E_ALL);
|
||||
chdir(dirname(__FILE__));
|
||||
|
||||
// load dual-libraries
|
||||
require_once '../extras/HTMLPurifierExtras.auto.php';
|
||||
require_once '../library/HTMLPurifier.auto.php';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Include HTML Purifier library
|
||||
// setup HTML Purifier singleton
|
||||
HTMLPurifier::getInstance(array(
|
||||
'AutoFormat.PurifierLinkify' => true
|
||||
));
|
||||
|
||||
set_include_path('../library' . PATH_SEPARATOR . get_include_path());
|
||||
require_once 'HTMLPurifier.php';
|
||||
$interchange = HTMLPurifier_ConfigSchema_InterchangeBuilder::buildFromDirectory();
|
||||
$interchange->validate();
|
||||
|
||||
$style = 'plain'; // use $_GET in the future, careful to validate!
|
||||
$configdoc_xml = 'configdoc.xml';
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Setup convenience functions
|
||||
$xml_builder = new HTMLPurifier_ConfigSchema_Builder_Xml();
|
||||
$xml_builder->openURI($configdoc_xml);
|
||||
$xml_builder->build($interchange);
|
||||
unset($xml_builder); // free handle
|
||||
|
||||
function appendHTMLDiv($document, $node, $html) {
|
||||
global $purifier;
|
||||
$html = $purifier->purify($html);
|
||||
$dom_html = $document->createDocumentFragment();
|
||||
$dom_html->appendXML($html);
|
||||
|
||||
$dom_div = $document->createElement('div');
|
||||
$dom_div->setAttribute('xmlns', 'http://www.w3.org/1999/xhtml');
|
||||
$dom_div->appendChild($dom_html);
|
||||
|
||||
$node->appendChild($dom_div);
|
||||
$xslt = new ConfigDoc_HTMLXSLTProcessor();
|
||||
$xslt->importStylesheet(dirname(__FILE__) . "/styles/$style.xsl");
|
||||
$output = $xslt->transformToHTML($configdoc_xml);
|
||||
|
||||
if (!$output) {
|
||||
echo "Error in generating files\n";
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Load copies of HTMLPurifier_ConfigDef and HTMLPurifier
|
||||
|
||||
$schema = HTMLPurifier_ConfigSchema::instance();
|
||||
$purifier = new HTMLPurifier();
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Generate types.xml, a document describing the constraint "type"
|
||||
|
||||
$types_document = new DOMDocument('1.0', 'UTF-8');
|
||||
$types_root = $types_document->createElement('types');
|
||||
$types_document->appendChild($types_root);
|
||||
$types_document->formatOutput = true;
|
||||
foreach ($schema->types as $name => $expanded_name) {
|
||||
$types_type = $types_document->createElement('type', $expanded_name);
|
||||
$types_type->setAttribute('id', $name);
|
||||
$types_root->appendChild($types_type);
|
||||
}
|
||||
$types_document->save('types.xml');
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Generate configdoc.xml, a document documenting configuration directives
|
||||
|
||||
$dom_document = new DOMDocument('1.0', 'UTF-8');
|
||||
$dom_root = $dom_document->createElement('configdoc');
|
||||
$dom_document->appendChild($dom_root);
|
||||
$dom_document->formatOutput = true;
|
||||
|
||||
// add the name of the application
|
||||
$dom_root->appendChild($dom_document->createElement('title', 'HTML Purifier'));
|
||||
|
||||
/*
|
||||
TODO for XML format:
|
||||
- create a definition (DTD or other) once interface stabilizes
|
||||
*/
|
||||
|
||||
foreach($schema->info as $namespace_name => $namespace_info) {
|
||||
|
||||
$dom_namespace = $dom_document->createElement('namespace');
|
||||
$dom_root->appendChild($dom_namespace);
|
||||
|
||||
$dom_namespace->setAttribute('id', $namespace_name);
|
||||
$dom_namespace->appendChild(
|
||||
$dom_document->createElement('name', $namespace_name)
|
||||
);
|
||||
$dom_namespace_description = $dom_document->createElement('description');
|
||||
$dom_namespace->appendChild($dom_namespace_description);
|
||||
appendHTMLDiv($dom_document, $dom_namespace_description,
|
||||
$schema->info_namespace[$namespace_name]->description);
|
||||
|
||||
foreach ($namespace_info as $name => $info) {
|
||||
|
||||
if ($info->class == 'alias') continue;
|
||||
|
||||
$dom_directive = $dom_document->createElement('directive');
|
||||
$dom_namespace->appendChild($dom_directive);
|
||||
|
||||
$dom_directive->setAttribute('id', $namespace_name . '.' . $name);
|
||||
$dom_directive->appendChild(
|
||||
$dom_document->createElement('name', $name)
|
||||
);
|
||||
|
||||
$dom_constraints = $dom_document->createElement('constraints');
|
||||
$dom_directive->appendChild($dom_constraints);
|
||||
|
||||
$dom_type = $dom_document->createElement('type', $info->type);
|
||||
if ($info->allow_null) {
|
||||
$dom_type->setAttribute('allow-null', 'yes');
|
||||
}
|
||||
$dom_constraints->appendChild($dom_type);
|
||||
|
||||
if ($info->allowed !== true) {
|
||||
$dom_allowed = $dom_document->createElement('allowed');
|
||||
$dom_constraints->appendChild($dom_allowed);
|
||||
foreach ($info->allowed as $allowed => $bool) {
|
||||
$dom_allowed->appendChild(
|
||||
$dom_document->createElement('value', $allowed)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
$raw_default = $schema->defaults[$namespace_name][$name];
|
||||
if (is_bool($raw_default)) {
|
||||
$default = $raw_default ? 'true' : 'false';
|
||||
} elseif (is_string($raw_default)) {
|
||||
$default = "\"$raw_default\"";
|
||||
} elseif (is_null($raw_default)) {
|
||||
$default = 'null';
|
||||
} else {
|
||||
$default = print_r(
|
||||
$schema->defaults[$namespace_name][$name], true
|
||||
);
|
||||
}
|
||||
|
||||
$dom_default = $dom_document->createElement('default', $default);
|
||||
|
||||
// remove this once we get a DTD
|
||||
$dom_default->setAttribute('xml:space', 'preserve');
|
||||
|
||||
$dom_constraints->appendChild($dom_default);
|
||||
|
||||
$dom_descriptions = $dom_document->createElement('descriptions');
|
||||
$dom_directive->appendChild($dom_descriptions);
|
||||
|
||||
foreach ($info->descriptions as $file => $file_descriptions) {
|
||||
foreach ($file_descriptions as $line => $description) {
|
||||
$dom_description = $dom_document->createElement('description');
|
||||
$dom_description->setAttribute('file', $file);
|
||||
$dom_description->setAttribute('line', $line);
|
||||
appendHTMLDiv($dom_document, $dom_description, $description);
|
||||
$dom_descriptions->appendChild($dom_description);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// print_r($dom_document->saveXML());
|
||||
|
||||
// save a copy of the raw XML
|
||||
$dom_document->save('configdoc.xml');
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Generate final output using XSLT
|
||||
|
||||
// load the stylesheet
|
||||
$xsl_stylesheet_name = 'plain';
|
||||
$xsl_stylesheet = "styles/$xsl_stylesheet_name.xsl";
|
||||
$xsl_dom_stylesheet = new DOMDocument();
|
||||
$xsl_dom_stylesheet->load($xsl_stylesheet);
|
||||
|
||||
// setup the XSLT processor
|
||||
$xsl_processor = new XSLTProcessor();
|
||||
|
||||
// perform the transformation
|
||||
$xsl_processor->importStylesheet($xsl_dom_stylesheet);
|
||||
$html_output = $xsl_processor->transformToXML($dom_document);
|
||||
|
||||
// some slight fudges to preserve backwards compatibility
|
||||
$html_output = str_replace('/>', ' />', $html_output); // <br /> not <br>
|
||||
$html_output = str_replace(' xmlns=""', '', $html_output); // rm unnecessary xmlns
|
||||
|
||||
if (class_exists('Tidy')) {
|
||||
// cleanup output
|
||||
$config = array(
|
||||
'indent' => true,
|
||||
'output-xhtml' => true,
|
||||
'wrap' => 80
|
||||
);
|
||||
$tidy = new Tidy;
|
||||
$tidy->parseString($html_output, $config, 'utf8');
|
||||
$tidy->cleanRepair();
|
||||
$html_output = (string) $tidy;
|
||||
}
|
||||
|
||||
// write it to a file (todo: parse into seperate pages)
|
||||
file_put_contents("$xsl_stylesheet_name.html", $html_output);
|
||||
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Output for instant feedback
|
||||
// write out
|
||||
file_put_contents("$style.html", $output);
|
||||
|
||||
if (php_sapi_name() != 'cli') {
|
||||
echo $html_output;
|
||||
// output (instant feedback if it's a browser)
|
||||
echo $output;
|
||||
} else {
|
||||
echo 'Files generated successfully.';
|
||||
}
|
||||
|
||||
?>
|
@@ -1,16 +1,30 @@
|
||||
|
||||
body {margin:1em 4em;}
|
||||
body {margin:0;padding:0;}
|
||||
#content {
|
||||
margin:1em auto;
|
||||
max-width: 47em;
|
||||
width: expression(document.body.clientWidth >
|
||||
85 * parseInt(document.body.currentStyle.fontSize) ?
|
||||
"54em": "auto");
|
||||
}
|
||||
|
||||
table {border-collapse:collapse;}
|
||||
table td, table th {padding:0.2em;}
|
||||
|
||||
table.constraints {margin:0 0 1em;}
|
||||
table.constraints th {text-align:left;padding-left:0.4em;}
|
||||
table.constraints td {padding-right:0.4em;}
|
||||
table.constraints th {
|
||||
text-align:right;padding-left:0.4em;padding-right:0.4em;background:#EEE;
|
||||
width:8em;vertical-align:top;}
|
||||
table.constraints td {padding-right:0.4em; padding-left: 1em;}
|
||||
table.constraints td ul {padding:0; margin:0; list-style:none;}
|
||||
table.constraints td pre {margin:0;}
|
||||
|
||||
#toc {list-style-type:none; font-weight:bold;}
|
||||
#toc ul {list-style-type:disc; font-weight:normal;}
|
||||
#tocContainer {position:relative;}
|
||||
#toc {list-style-type:none; font-weight:bold; font-size:1em; margin-bottom:1em;}
|
||||
#toc li {position:relative; line-height: 1.2em;}
|
||||
#toc .col-2 {margin-left:50%;}
|
||||
#toc .col-l {float:left;}
|
||||
#toc ul {list-style-type:disc; font-weight:normal; padding-bottom:1.2em;}
|
||||
|
||||
.description p {margin-top:0;margin-bottom:1em;}
|
||||
|
||||
@@ -19,6 +33,10 @@ table.constraints td pre {margin:0;}
|
||||
#library {font-size:1em;}
|
||||
h1 {margin-top:0;}
|
||||
h2 {border-bottom:1px solid #CCC; font-family:sans-serif; font-weight:normal;
|
||||
font-size:1.3em;}
|
||||
font-size:1.3em; clear:both;}
|
||||
h3 {font-family:sans-serif; font-size:1.1em; font-weight:bold; }
|
||||
h4 {font-family:sans-serif; font-size:0.9em; font-weight:bold; }
|
||||
|
||||
.deprecated {color: #CCC;}
|
||||
.deprecated table.constraints th {background:#FFF;}
|
||||
.deprecated-notice {color: #000; text-align:center; margin-bottom: 1em;}
|
||||
|
@@ -12,94 +12,187 @@
|
||||
indent = "no"
|
||||
media-type = "text/html"
|
||||
/>
|
||||
<xsl:param name="css" select="'styles/plain.css'"/>
|
||||
<xsl:param name="title" select="'Configuration Documentation'"/>
|
||||
|
||||
<xsl:variable name="typeLookup" select="document('../types.xml')" />
|
||||
<xsl:variable name="typeLookup" select="document('../types.xml')/types" />
|
||||
<xsl:variable name="usageLookup" select="document('../usage.xml')/usage" />
|
||||
|
||||
<!-- Twiddle this variable to get the columns as even as possible -->
|
||||
<xsl:variable name="maxNumberAdjust" select="2" />
|
||||
|
||||
<xsl:template match="/">
|
||||
<html lang="en" xml:lang="en">
|
||||
<head>
|
||||
<title>Configuration Documentation - <xsl:value-of select="/configdoc/title" /></title>
|
||||
<title><xsl:value-of select="$title" /> - <xsl:value-of select="/configdoc/title" /></title>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=UTF-8" />
|
||||
<link rel="stylesheet" type="text/css" href="styles/plain.css" />
|
||||
<link rel="stylesheet" type="text/css" href="{$css}" />
|
||||
</head>
|
||||
<body>
|
||||
<div id="library"><xsl:value-of select="/configdoc/title" /></div>
|
||||
<h1>Configuration Documentation</h1>
|
||||
<h2>Table of Contents</h2>
|
||||
<ul id="toc">
|
||||
<xsl:apply-templates mode="toc" />
|
||||
</ul>
|
||||
<xsl:apply-templates />
|
||||
<div id="content">
|
||||
<div id="library"><xsl:value-of select="/configdoc/title" /></div>
|
||||
<h1><xsl:value-of select="$title" /></h1>
|
||||
<div id="tocContainer">
|
||||
<h2>Table of Contents</h2>
|
||||
<ul id="toc">
|
||||
<xsl:apply-templates mode="toc">
|
||||
<xsl:with-param name="overflowNumber" select="round(count(/configdoc/namespace) div 2) + $maxNumberAdjust" />
|
||||
</xsl:apply-templates>
|
||||
</ul>
|
||||
</div>
|
||||
<xsl:apply-templates />
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="title" mode="toc" />
|
||||
<xsl:template match="namespace" mode="toc">
|
||||
<xsl:param name="overflowNumber" />
|
||||
<xsl:variable name="number"><xsl:number level="single" /></xsl:variable>
|
||||
<xsl:variable name="directiveNumber"><xsl:number level="any" count="directive" /></xsl:variable>
|
||||
<xsl:if test="count(directive)>0">
|
||||
<li>
|
||||
<!-- BEGIN multicolumn code -->
|
||||
<xsl:if test="$number >= $overflowNumber">
|
||||
<xsl:attribute name="class">col-2</xsl:attribute>
|
||||
</xsl:if>
|
||||
<xsl:if test="$number = $overflowNumber">
|
||||
<xsl:attribute name="style">margin-top:-<xsl:value-of select="($number * 2 + $directiveNumber - 3) * 1.2" />em</xsl:attribute>
|
||||
</xsl:if>
|
||||
<!-- END multicolumn code -->
|
||||
<a href="#{@id}"><xsl:value-of select="name" /></a>
|
||||
<ul>
|
||||
<xsl:apply-templates select="directive" mode="toc" />
|
||||
<xsl:apply-templates select="directive" mode="toc">
|
||||
<xsl:with-param name="overflowNumber" select="$overflowNumber" />
|
||||
</xsl:apply-templates>
|
||||
</ul>
|
||||
<xsl:if test="$number + 1 = $overflowNumber">
|
||||
<div class="col-l" />
|
||||
</xsl:if>
|
||||
</li>
|
||||
</xsl:if>
|
||||
</xsl:template>
|
||||
<xsl:template match="directive" mode="toc">
|
||||
<li><a href="#{@id}"><xsl:value-of select="name" /></a></li>
|
||||
<xsl:variable name="number">
|
||||
<xsl:number level="any" count="directive|namespace" />
|
||||
</xsl:variable>
|
||||
<xsl:if test="not(deprecated)">
|
||||
<li>
|
||||
<a href="#{@id}"><xsl:value-of select="name" /></a>
|
||||
</li>
|
||||
</xsl:if>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="title" />
|
||||
|
||||
<xsl:template match="namespace">
|
||||
<xsl:apply-templates />
|
||||
<xsl:if test="count(directive)=0">
|
||||
<p>No configuration directives defined for this namespace.</p>
|
||||
</xsl:if>
|
||||
<div class="namespace">
|
||||
<xsl:apply-templates />
|
||||
<xsl:if test="count(directive)=0">
|
||||
<p>No configuration directives defined for this namespace.</p>
|
||||
</xsl:if>
|
||||
</div>
|
||||
</xsl:template>
|
||||
<xsl:template match="namespace/name">
|
||||
<h2 id="{../@id}"><xsl:value-of select="." /></h2>
|
||||
</xsl:template>
|
||||
<xsl:template match="namespace/description">
|
||||
<div class="description">
|
||||
<xsl:copy-of select="div/node()" />
|
||||
<xsl:copy-of xmlns:xhtml="http://www.w3.org/1999/xhtml" select="xhtml:div/node()" />
|
||||
</div>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="directive">
|
||||
<xsl:apply-templates />
|
||||
</xsl:template>
|
||||
<xsl:template match="directive/name">
|
||||
<h3 id="{../@id}"><xsl:value-of select="../@id" /></h3>
|
||||
</xsl:template>
|
||||
<xsl:template match="directive/constraints">
|
||||
<table class="constraints">
|
||||
<xsl:apply-templates />
|
||||
<!-- Calculated other values -->
|
||||
<tr>
|
||||
<th>Used by:</th>
|
||||
<td>
|
||||
<xsl:for-each select="../descriptions/description">
|
||||
<xsl:if test="position()>1">, </xsl:if>
|
||||
<xsl:value-of select="@file" />
|
||||
</xsl:for-each>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
</xsl:template>
|
||||
<xsl:template match="directive//description">
|
||||
<div class="description">
|
||||
<xsl:copy-of select="div/node()" />
|
||||
<div>
|
||||
<xsl:attribute name="class"><!--
|
||||
-->directive<!--
|
||||
--><xsl:if test="deprecated"> deprecated</xsl:if><!--
|
||||
--></xsl:attribute>
|
||||
<xsl:apply-templates>
|
||||
<xsl:with-param name="id" select="@id" />
|
||||
</xsl:apply-templates>
|
||||
</div>
|
||||
</xsl:template>
|
||||
<xsl:template match="directive/name">
|
||||
<xsl:param name="id" />
|
||||
<xsl:apply-templates select="../aliases/alias" mode="anchor" />
|
||||
<h3 id="{$id}"><xsl:value-of select="$id" /></h3>
|
||||
</xsl:template>
|
||||
<xsl:template match="alias" mode="anchor">
|
||||
<a id="{.}"></a>
|
||||
</xsl:template>
|
||||
|
||||
<!-- Do not pass through -->
|
||||
<xsl:template match="alias"></xsl:template>
|
||||
|
||||
<xsl:template match="directive/constraints">
|
||||
<xsl:param name="id" />
|
||||
<table class="constraints">
|
||||
<xsl:apply-templates />
|
||||
<xsl:if test="../aliases/alias">
|
||||
<xsl:apply-templates select="../aliases" mode="constraints" />
|
||||
</xsl:if>
|
||||
<xsl:apply-templates select="$usageLookup/directive[@id=$id]" />
|
||||
</table>
|
||||
</xsl:template>
|
||||
<xsl:template match="directive/aliases" mode="constraints">
|
||||
<tr>
|
||||
<th>Aliases</th>
|
||||
<td>
|
||||
<xsl:for-each select="alias">
|
||||
<xsl:if test="position()>1">, </xsl:if>
|
||||
<xsl:value-of select="." />
|
||||
</xsl:for-each>
|
||||
</td>
|
||||
</tr>
|
||||
</xsl:template>
|
||||
<xsl:template match="directive/description">
|
||||
<div class="description">
|
||||
<xsl:copy-of xmlns:xhtml="http://www.w3.org/1999/xhtml" select="xhtml:div/node()" />
|
||||
</div>
|
||||
</xsl:template>
|
||||
<xsl:template match="directive/deprecated">
|
||||
<div class="deprecated-notice">
|
||||
<strong>Warning:</strong>
|
||||
This directive was deprecated in version <xsl:value-of select="version" />.
|
||||
<a href="#{use}">%<xsl:value-of select="use" /></a> should be used instead.
|
||||
</div>
|
||||
</xsl:template>
|
||||
<xsl:template match="usage/directive">
|
||||
<tr>
|
||||
<th>Used in</th>
|
||||
<td>
|
||||
<ul>
|
||||
<xsl:apply-templates />
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</xsl:template>
|
||||
<xsl:template match="usage/directive/file">
|
||||
<li>
|
||||
<em><xsl:value-of select="@name" /></em> on line<xsl:if test="count(line)>1">s</xsl:if>
|
||||
<xsl:text> </xsl:text>
|
||||
<xsl:for-each select="line">
|
||||
<xsl:if test="position()>1">, </xsl:if>
|
||||
<xsl:value-of select="." />
|
||||
</xsl:for-each>
|
||||
</li>
|
||||
</xsl:template>
|
||||
|
||||
<xsl:template match="constraints/version">
|
||||
<tr>
|
||||
<th>Version added</th>
|
||||
<td><xsl:value-of select="." /></td>
|
||||
</tr>
|
||||
</xsl:template>
|
||||
<xsl:template match="constraints/type">
|
||||
<tr>
|
||||
<th>Type:</th>
|
||||
<th>Type</th>
|
||||
<td>
|
||||
<xsl:variable name="type" select="text()" />
|
||||
<xsl:attribute name="class">type type-<xsl:value-of select="$type" /></xsl:attribute>
|
||||
<xsl:value-of select="$typeLookup/types/type[@id=$type]/text()" />
|
||||
<xsl:value-of select="$typeLookup/type[@id=$type]/text()" />
|
||||
<xsl:if test="@allow-null='yes'">
|
||||
(or null)
|
||||
</xsl:if>
|
||||
@@ -108,7 +201,7 @@
|
||||
</xsl:template>
|
||||
<xsl:template match="constraints/allowed">
|
||||
<tr>
|
||||
<th>Allowed values:</th>
|
||||
<th>Allowed values</th>
|
||||
<td>
|
||||
<xsl:for-each select="value"><!--
|
||||
--><xsl:if test="position()>1">, </xsl:if>
|
||||
@@ -119,9 +212,22 @@
|
||||
</xsl:template>
|
||||
<xsl:template match="constraints/default">
|
||||
<tr>
|
||||
<th>Default:</th>
|
||||
<th>Default</th>
|
||||
<td><pre><xsl:value-of select="." xml:space="preserve" /></pre></td>
|
||||
</tr>
|
||||
</xsl:template>
|
||||
<xsl:template match="constraints/external">
|
||||
<tr>
|
||||
<th>External deps</th>
|
||||
<td>
|
||||
<ul>
|
||||
<xsl:apply-templates />
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</xsl:template>
|
||||
<xsl:template match="constraints/external/project">
|
||||
<li><xsl:value-of select="." /></li>
|
||||
</xsl:template>
|
||||
|
||||
</xsl:stylesheet>
|
||||
</xsl:stylesheet>
|
||||
|
14
configdoc/types.xml
Normal file
14
configdoc/types.xml
Normal file
@@ -0,0 +1,14 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<types>
|
||||
<type id="string">String</type>
|
||||
<type id="istring">Case-insensitive string</type>
|
||||
<type id="text">Text</type>
|
||||
<type id="itext">Case-insensitive text</type>
|
||||
<type id="int">Integer</type>
|
||||
<type id="float">Float</type>
|
||||
<type id="bool">Boolean</type>
|
||||
<type id="lookup">Lookup array</type>
|
||||
<type id="list">Array list</type>
|
||||
<type id="hash">Associative array</type>
|
||||
<type id="mixed">Mixed</type>
|
||||
</types>
|
395
configdoc/usage.xml
Normal file
395
configdoc/usage.xml
Normal file
@@ -0,0 +1,395 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<usage>
|
||||
<directive id="Core.CollectErrors">
|
||||
<file name="HTMLPurifier.php">
|
||||
<line>131</line>
|
||||
</file>
|
||||
<file name="HTMLPurifier/Lexer.php">
|
||||
<line>85</line>
|
||||
</file>
|
||||
<file name="HTMLPurifier/Lexer/DirectLex.php">
|
||||
<line>50</line>
|
||||
<line>62</line>
|
||||
<line>327</line>
|
||||
</file>
|
||||
<file name="HTMLPurifier/Strategy/RemoveForeignElements.php">
|
||||
<line>44</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="CSS.MaxImgLength">
|
||||
<file name="HTMLPurifier/CSSDefinition.php">
|
||||
<line>157</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="CSS.Proprietary">
|
||||
<file name="HTMLPurifier/CSSDefinition.php">
|
||||
<line>214</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="CSS.AllowTricky">
|
||||
<file name="HTMLPurifier/CSSDefinition.php">
|
||||
<line>218</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="CSS.AllowImportant">
|
||||
<file name="HTMLPurifier/CSSDefinition.php">
|
||||
<line>222</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="CSS.AllowedProperties">
|
||||
<file name="HTMLPurifier/CSSDefinition.php">
|
||||
<line>274</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Cache.DefinitionImpl">
|
||||
<file name="HTMLPurifier/DefinitionCacheFactory.php">
|
||||
<line>49</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.Doctype">
|
||||
<file name="HTMLPurifier/DoctypeRegistry.php">
|
||||
<line>83</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.CustomDoctype">
|
||||
<file name="HTMLPurifier/DoctypeRegistry.php">
|
||||
<line>85</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.XHTML">
|
||||
<file name="HTMLPurifier/DoctypeRegistry.php">
|
||||
<line>88</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.Strict">
|
||||
<file name="HTMLPurifier/DoctypeRegistry.php">
|
||||
<line>93</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Core.Encoding">
|
||||
<file name="HTMLPurifier/Encoder.php">
|
||||
<line>267</line>
|
||||
<line>294</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Test.ForceNoIconv">
|
||||
<file name="HTMLPurifier/Encoder.php">
|
||||
<line>272</line>
|
||||
<line>302</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Core.EscapeNonASCIICharacters">
|
||||
<file name="HTMLPurifier/Encoder.php">
|
||||
<line>298</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Core.MaintainLineNumbers">
|
||||
<file name="HTMLPurifier/ErrorCollector.php">
|
||||
<line>81</line>
|
||||
</file>
|
||||
<file name="HTMLPurifier/Lexer.php">
|
||||
<line>82</line>
|
||||
</file>
|
||||
<file name="HTMLPurifier/Lexer/DirectLex.php">
|
||||
<line>45</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Output.CommentScriptContents">
|
||||
<file name="HTMLPurifier/Generator.php">
|
||||
<line>40</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Output.TidyFormat">
|
||||
<file name="HTMLPurifier/Generator.php">
|
||||
<line>69</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Output.Newline">
|
||||
<file name="HTMLPurifier/Generator.php">
|
||||
<line>83</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.BlockWrapper">
|
||||
<file name="HTMLPurifier/HTMLDefinition.php">
|
||||
<line>222</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.Parent">
|
||||
<file name="HTMLPurifier/HTMLDefinition.php">
|
||||
<line>230</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.AllowedElements">
|
||||
<file name="HTMLPurifier/HTMLDefinition.php">
|
||||
<line>247</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.AllowedAttributes">
|
||||
<file name="HTMLPurifier/HTMLDefinition.php">
|
||||
<line>248</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.Allowed">
|
||||
<file name="HTMLPurifier/HTMLDefinition.php">
|
||||
<line>251</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.ForbiddenElements">
|
||||
<file name="HTMLPurifier/HTMLDefinition.php">
|
||||
<line>337</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.ForbiddenAttributes">
|
||||
<file name="HTMLPurifier/HTMLDefinition.php">
|
||||
<line>338</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.Trusted">
|
||||
<file name="HTMLPurifier/HTMLModuleManager.php">
|
||||
<line>198</line>
|
||||
</file>
|
||||
<file name="HTMLPurifier/Lexer.php">
|
||||
<line>238</line>
|
||||
</file>
|
||||
<file name="HTMLPurifier/HTMLModule/Image.php">
|
||||
<line>27</line>
|
||||
</file>
|
||||
<file name="HTMLPurifier/Lexer/DirectLex.php">
|
||||
<line>34</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.AllowedModules">
|
||||
<file name="HTMLPurifier/HTMLModuleManager.php">
|
||||
<line>205</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.CoreModules">
|
||||
<file name="HTMLPurifier/HTMLModuleManager.php">
|
||||
<line>206</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.Proprietary">
|
||||
<file name="HTMLPurifier/HTMLModuleManager.php">
|
||||
<line>220</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.SafeObject">
|
||||
<file name="HTMLPurifier/HTMLModuleManager.php">
|
||||
<line>225</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.SafeEmbed">
|
||||
<file name="HTMLPurifier/HTMLModuleManager.php">
|
||||
<line>228</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Attr.IDBlacklist">
|
||||
<file name="HTMLPurifier/IDAccumulator.php">
|
||||
<line>26</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Core.Language">
|
||||
<file name="HTMLPurifier/LanguageFactory.php">
|
||||
<line>88</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Core.LexerImpl">
|
||||
<file name="HTMLPurifier/Lexer.php">
|
||||
<line>70</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Core.ConvertDocumentToFragment">
|
||||
<file name="HTMLPurifier/Lexer.php">
|
||||
<line>230</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="URI.Host">
|
||||
<file name="HTMLPurifier/URIDefinition.php">
|
||||
<line>64</line>
|
||||
</file>
|
||||
<file name="HTMLPurifier/URIFilter/DisableExternal.php">
|
||||
<line>8</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="URI.Base">
|
||||
<file name="HTMLPurifier/URIDefinition.php">
|
||||
<line>65</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="URI.DefaultScheme">
|
||||
<file name="HTMLPurifier/URIDefinition.php">
|
||||
<line>72</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="URI.AllowedSchemes">
|
||||
<file name="HTMLPurifier/URISchemeRegistry.php">
|
||||
<line>42</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="URI.OverrideAllowedSchemes">
|
||||
<file name="HTMLPurifier/URISchemeRegistry.php">
|
||||
<line>43</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="URI.Disable">
|
||||
<file name="HTMLPurifier/AttrDef/URI.php">
|
||||
<line>28</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Core.ColorKeywords">
|
||||
<file name="HTMLPurifier/AttrDef/CSS/Color.php">
|
||||
<line>12</line>
|
||||
</file>
|
||||
<file name="HTMLPurifier/AttrDef/HTML/Color.php">
|
||||
<line>12</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Attr.AllowedFrameTargets">
|
||||
<file name="HTMLPurifier/AttrDef/HTML/FrameTarget.php">
|
||||
<line>15</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Attr.EnableID">
|
||||
<file name="HTMLPurifier/AttrDef/HTML/ID.php">
|
||||
<line>20</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Attr.IDPrefix">
|
||||
<file name="HTMLPurifier/AttrDef/HTML/ID.php">
|
||||
<line>26</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Attr.IDPrefixLocal">
|
||||
<file name="HTMLPurifier/AttrDef/HTML/ID.php">
|
||||
<line>28</line>
|
||||
<line>31</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Attr.IDBlacklistRegexp">
|
||||
<file name="HTMLPurifier/AttrDef/HTML/ID.php">
|
||||
<line>54</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Attr.DefaultTextDir">
|
||||
<file name="HTMLPurifier/AttrTransform/BdoDir.php">
|
||||
<line>13</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Core.RemoveInvalidImg">
|
||||
<file name="HTMLPurifier/AttrTransform/ImgRequired.php">
|
||||
<line>18</line>
|
||||
</file>
|
||||
<file name="HTMLPurifier/Strategy/RemoveForeignElements.php">
|
||||
<line>20</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Attr.DefaultInvalidImage">
|
||||
<file name="HTMLPurifier/AttrTransform/ImgRequired.php">
|
||||
<line>19</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Attr.DefaultInvalidImageAlt">
|
||||
<file name="HTMLPurifier/AttrTransform/ImgRequired.php">
|
||||
<line>27</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Core.EscapeInvalidChildren">
|
||||
<file name="HTMLPurifier/ChildDef/Required.php">
|
||||
<line>55</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Cache.SerializerPath">
|
||||
<file name="HTMLPurifier/DefinitionCache/Serializer.php">
|
||||
<line>91</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="FilterParam.ExtractStyleBlocksTidyImpl">
|
||||
<file name="HTMLPurifier/Filter/ExtractStyleBlocks.php">
|
||||
<line>41</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="FilterParam.ExtractStyleBlocksScope">
|
||||
<file name="HTMLPurifier/Filter/ExtractStyleBlocks.php">
|
||||
<line>65</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="FilterParam.ExtractStyleBlocksEscaping">
|
||||
<file name="HTMLPurifier/Filter/ExtractStyleBlocks.php">
|
||||
<line>123</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.MaxImgLength">
|
||||
<file name="HTMLPurifier/HTMLModule/Image.php">
|
||||
<line>14</line>
|
||||
</file>
|
||||
<file name="HTMLPurifier/HTMLModule/SafeEmbed.php">
|
||||
<line>13</line>
|
||||
</file>
|
||||
<file name="HTMLPurifier/HTMLModule/SafeObject.php">
|
||||
<line>19</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.TidyLevel">
|
||||
<file name="HTMLPurifier/HTMLModule/Tidy.php">
|
||||
<line>45</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.TidyAdd">
|
||||
<file name="HTMLPurifier/HTMLModule/Tidy.php">
|
||||
<line>49</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="HTML.TidyRemove">
|
||||
<file name="HTMLPurifier/HTMLModule/Tidy.php">
|
||||
<line>50</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="AutoFormatParam.PurifierLinkifyDocURL">
|
||||
<file name="HTMLPurifier/Injector/PurifierLinkify.php">
|
||||
<line>15</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Core.AggressivelyFixLt">
|
||||
<file name="HTMLPurifier/Lexer/DOMLex.php">
|
||||
<line>44</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Core.DirectLexLineNumberSyncInterval">
|
||||
<file name="HTMLPurifier/Lexer/DirectLex.php">
|
||||
<line>59</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Core.EscapeInvalidTags">
|
||||
<file name="HTMLPurifier/Strategy/MakeWellFormed.php">
|
||||
<line>22</line>
|
||||
</file>
|
||||
<file name="HTMLPurifier/Strategy/RemoveForeignElements.php">
|
||||
<line>19</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Core.RemoveScriptContents">
|
||||
<file name="HTMLPurifier/Strategy/RemoveForeignElements.php">
|
||||
<line>22</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="Core.HiddenElements">
|
||||
<file name="HTMLPurifier/Strategy/RemoveForeignElements.php">
|
||||
<line>23</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="URI.HostBlacklist">
|
||||
<file name="HTMLPurifier/URIFilter/HostBlacklist.php">
|
||||
<line>8</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="URI.MungeResources">
|
||||
<file name="HTMLPurifier/URIFilter/Munge.php">
|
||||
<line>14</line>
|
||||
</file>
|
||||
</directive>
|
||||
<directive id="URI.MungeSecretKey">
|
||||
<file name="HTMLPurifier/URIFilter/Munge.php">
|
||||
<line>15</line>
|
||||
</file>
|
||||
</directive>
|
||||
</usage>
|
218
docs/dev-advanced-api.html
Normal file
218
docs/dev-advanced-api.html
Normal file
@@ -0,0 +1,218 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||
<meta name="description" content="Specification for HTML Purifier's advanced API for defining custom filtering behavior." />
|
||||
<link rel="stylesheet" type="text/css" href="style.css" />
|
||||
|
||||
<title>Advanced API - HTML Purifier</title>
|
||||
|
||||
</head><body>
|
||||
|
||||
<h1>Advanced API</h1>
|
||||
|
||||
<div id="filing">Filed under Development</div>
|
||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
|
||||
<p>
|
||||
<strong>Warning:</strong> This document may be out-of-date. When in doubt,
|
||||
consult the source code documentation.
|
||||
</p>
|
||||
|
||||
<p>HTML Purifier currently natively supports only a subset of HTML's
|
||||
allowed elements, attributes, and behavior; specifically, this subset
|
||||
is the set of elements that are safe for untrusted users to use.
|
||||
However, HTML Purifier is often utilized to ensure standards-compliance
|
||||
from input that is trusted (making it a sort of Tidy substitute),
|
||||
and often users need to define new elements or attributes. The
|
||||
advanced API is oriented specifically for these use-cases.</p>
|
||||
|
||||
<p>Our goals are to let the user:</p>
|
||||
|
||||
<dl>
|
||||
<dt>Select</dt>
|
||||
<dd><ul>
|
||||
<li>Doctype</li>
|
||||
<!-- <li>Filterset</li> -->
|
||||
<li>Elements / Attributes / Modules</li>
|
||||
<li>Tidy</li>
|
||||
</ul></dd>
|
||||
<dt>Customize</dt>
|
||||
<dd><ul>
|
||||
<li>Attributes</li>
|
||||
<li>Elements</li>
|
||||
<!--<li>Doctypes</li>-->
|
||||
</ul></dd>
|
||||
</dl>
|
||||
|
||||
<h2>Select</h2>
|
||||
|
||||
<p>For basic use, the user will have to specify some basic parameters. This
|
||||
is not strictly necessary, as HTML Purifier's default setting will always
|
||||
output safe code, but is required for standards-compliant output.</p>
|
||||
|
||||
<h3>Selecting a Doctype</h3>
|
||||
|
||||
<p>The first thing to select is the <strong>doctype</strong>. This
|
||||
is essential for standards-compliant output.</p>
|
||||
|
||||
<p class="technical">This identifier is based
|
||||
on the name the W3C has given to the document type and <em>not</em>
|
||||
the DTD identifier.</p>
|
||||
|
||||
<p>This parameter is set via the configuration object:</p>
|
||||
|
||||
<pre>$config->set('HTML', 'Doctype', 'XHTML 1.0 Transitional');</pre>
|
||||
|
||||
<p>Due to historical reasons, the default doctype is XHTML 1.0
|
||||
Transitional, however, we really shouldn't be guessing what the user's
|
||||
doctype is. Fortunantely, people who can't be bothered to set this won't
|
||||
be bothered when their pages stop validating.</p>
|
||||
|
||||
<h3>Selecting Elements / Attributes / Modules</h3>
|
||||
|
||||
<p>HTML Purifier will, by default, allow as many elements and attributes
|
||||
as possible. However, a user may decide to roll their own filterset by
|
||||
selecting modules, elements and attributes to allow for their own
|
||||
specific use-case. This can be done using %HTML.Allowed:</p>
|
||||
|
||||
<pre>$config->set('HTML', 'Allowed', 'a[href|title],em,p,blockquote');</pre>
|
||||
|
||||
<p class="technical">The directive %HTML.Allowed is a convenience feature
|
||||
that may be fully expressed with the legacy interface.</p>
|
||||
|
||||
<p>We currently support another interface from older versions:</p>
|
||||
|
||||
<pre>$config->set('HTML', 'AllowedElements', 'a,em,p,blockquote');
|
||||
$config->set('HTML', 'AllowedAttributes', 'a.href,a.title');</pre>
|
||||
|
||||
<p>A user may also choose to allow modules using a specialized
|
||||
directive:</p>
|
||||
|
||||
<pre>$config->set('HTML', 'AllowedModules', 'Hypertext,Text,Lists');</pre>
|
||||
|
||||
<p>But it is not expected that this feature will be widely used.</p>
|
||||
|
||||
<p class="technical">Module selection will work slightly differently
|
||||
from the other AllowedElements and AllowedAttributes directives by
|
||||
directly modifying the doctype you are operating in, in the spirit of
|
||||
XHTML 1.1's modularization. We stop users from shooting themselves in the
|
||||
foot by mandating the modules in %HTML.CoreModules be used.</p>
|
||||
|
||||
<p class="technical">Modules are distinguished from regular elements by the
|
||||
case of their first letter. While XML distinguishes between and allows
|
||||
lower and uppercase letters in element names, XHTML uses only lower-case
|
||||
element names for sake of consistency.</p>
|
||||
|
||||
<h3>Selecting Tidy</h3>
|
||||
|
||||
<p>The name of this segment of functionality is inspired off of Dave
|
||||
Ragget's program HTML Tidy, which purported to help clean up HTML. In
|
||||
HTML Purifier, Tidy functionality involves turning unsupported and
|
||||
deprecated elements into standards-compliant ones, maintaining
|
||||
backwards compatibility, and enforcing best practices.</p>
|
||||
|
||||
<p>This is a complicated feature, and is explained more in depth at
|
||||
<a href="enduser-tidy.html">the Tidy documentation page</a>.</p>
|
||||
|
||||
<!--
|
||||
<h3>Unified selector</h3>
|
||||
|
||||
<p>Because selecting each and every one of these configuration options
|
||||
is a chore, we may wish to offer a specialized configuration method
|
||||
for selecting a filterset. Possibility:</p>
|
||||
|
||||
<pre>function selectFilter($doctype, $filterset, $tidy)</pre>
|
||||
|
||||
<p>...which is simply a light wrapper over the individual configuration
|
||||
calls. A custom config file format or text format could also be adopted.</p>
|
||||
-->
|
||||
|
||||
<h2>Customize</h2>
|
||||
|
||||
<p>By reviewing topic posts in the support forum, we determined that
|
||||
there were two primarily demanded customization features people wanted:
|
||||
to add an attribute to an existing element, and to add an element.
|
||||
Thus, we'll want to create convenience functions for these common
|
||||
use-cases.</p>
|
||||
|
||||
<p>Note that the functions described here are only available if
|
||||
a raw copy of <code>HTMLPurifier_HTMLDefinition</code> was retrieved.
|
||||
Furthermore, caching may prevent your changes from immediately
|
||||
being seen: consult <a href="enduser-customize.html">enduser-customize.html</a> on how
|
||||
to work around this.</p>
|
||||
|
||||
<h3>Attributes</h3>
|
||||
|
||||
<p>An attribute is bound to an element by a name and has a specific
|
||||
<code>AttrDef</code> that validates it. The interface is therefore:</p>
|
||||
|
||||
<pre>function addAttribute($element, $attribute, $attribute_def);</pre>
|
||||
|
||||
<p>Example of the functionality in action:</p>
|
||||
|
||||
<pre>$def->addAttribute('a', 'rel', 'Enum#nofollow');</pre>
|
||||
|
||||
<p>The <code>$attribute_def</code> value is flexible,
|
||||
to make things simpler. It can be a literal object or:</p>
|
||||
|
||||
<ul>
|
||||
<!--<li>Class name: We'll instantiate it for you</li>
|
||||
<li>Function name: We'll create an <code>HTMLPurifier_AttrDef_Anonymous</code>
|
||||
class with that function registered as a callback.</li>-->
|
||||
<li>String attribute type: We'll use <code>HTMLPurifier_AttrTypes</code>
|
||||
to resolve it for you. Any data that follows a hash mark (#) will
|
||||
be used to customize the attribute type: in the example above,
|
||||
we specify which values for Enum to allow.</li>
|
||||
</ul>
|
||||
|
||||
<h3>Elements</h3>
|
||||
|
||||
<p>An element requires certain information as specified by
|
||||
<code>HTMLPurifier_ElementDef</code>. However, not all of it is necessary,
|
||||
the usual things required are:</p>
|
||||
|
||||
<ul>
|
||||
<li>Attributes</li>
|
||||
<li>Content model/type</li>
|
||||
<li>Registration in a content set</li>
|
||||
</ul>
|
||||
|
||||
<p>This suggests an API like this:</p>
|
||||
|
||||
<pre>function addElement($element, $type, $contents,
|
||||
$attr_collections = array(); $attributes = array());</pre>
|
||||
|
||||
<p>Each parameter explained in depth:</p>
|
||||
|
||||
<dl>
|
||||
<dt><code>$element</code></dt>
|
||||
<dd>Element name, ex. 'label'</dd>
|
||||
<dt><code>$type</code></dt>
|
||||
<dd>Content set to register in, ex. 'Inline' or 'Flow'</dd>
|
||||
<dt><code>$contents</code></dt>
|
||||
<dd>Description of allowed children. This is a merged form of
|
||||
<code>HTMLPurifier_ElementDef</code>'s member variables
|
||||
<code>$content_model</code> and <code>$content_model_type</code>,
|
||||
where the form is <q>Type: Model</q>, ex. 'Optional: Inline'.
|
||||
There are also a number of predefined templates one may use.</dd>
|
||||
<dt><code>$attr_collections</code></dt>
|
||||
<dd>Array (or string if only one) of attribute collection(s) to
|
||||
merge into the attributes array.</dd>
|
||||
<dt><code>$attributes</code></dt>
|
||||
<dd>Array of attribute names to attribute definitions, much like
|
||||
the above-described attribute customization.</dd>
|
||||
</dl>
|
||||
|
||||
<p>A possible usage:</p>
|
||||
|
||||
<pre>$def->addElement('font', 'Inline', 'Optional: Inline', 'Common',
|
||||
array('color' => 'Color'));</pre>
|
||||
|
||||
<p>See <code>HTMLPurifier/HTMLModule.php</code> for details.</p>
|
||||
|
||||
<div id="version">$Id$</div>
|
||||
|
||||
</body></html>
|
@@ -1,52 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||
<meta name="description" content="Discusses code quality issues and places that need to be refactored in HTML Purifier." />
|
||||
<link rel="stylesheet" type="text/css" href="./style.css" />
|
||||
|
||||
<title>Code Quality Issues - HTML Purifier</title>
|
||||
|
||||
</head><body>
|
||||
|
||||
<h1>Code Quality Issues</h1>
|
||||
|
||||
<div id="filing">Filed under Development</div>
|
||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||
<div id="home"><a href="http://hp.jpsband.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
|
||||
<p>Okay, face it. Programmers can get lazy, cut corners, or make mistakes. They
|
||||
also can do quick prototypes, and then forget to rewrite them later. Well,
|
||||
while I can't list mistakes in here, I can list prototype-like segments
|
||||
of code that should be aggressively refactored. This does not list
|
||||
optimization issues, that needs to be done after intense profiling.</p>
|
||||
|
||||
<pre>
|
||||
docs/examples/demo.php - ad hoc HTML/PHP soup to the extreme
|
||||
|
||||
AttrDef
|
||||
Class - doesn't support Unicode characters (fringe); uses regular
|
||||
expressions
|
||||
Lang - code duplication; premature optimization
|
||||
Length - easily mistaken for CSSLength
|
||||
URI - multiple regular expressions; missing validation for parts (?)
|
||||
CSS - parser doesn't accept advanced CSS (fringe)
|
||||
Number - constructor interface inconsistent with Integer
|
||||
ConfigSchema - redefinition is a mess
|
||||
Strategy
|
||||
FixNesting - cannot bubble nodes out of structures, duplicated checks
|
||||
for special-case parent node
|
||||
MakeWellFormed - insufficient automatic closing definitions (check HTML
|
||||
spec for optional end tags, also, closing based on type (block/inline)
|
||||
might be efficient).
|
||||
RemoveForeignElements - should be run in parallel with MakeWellFormed
|
||||
URIScheme - needs to have callable generic checks
|
||||
mailto - doesn't validate emails, doesn't validate querystring
|
||||
news - doesn't validate opaque path
|
||||
nntp - doesn't constrain path
|
||||
</pre>
|
||||
|
||||
<div id="version">$Id$</div>
|
||||
|
||||
</body></html>
|
27
docs/dev-code-quality.txt
Normal file
27
docs/dev-code-quality.txt
Normal file
@@ -0,0 +1,27 @@
|
||||
|
||||
Code Quality Issues
|
||||
|
||||
Okay, face it. Programmers can get lazy, cut corners, or make mistakes. They
|
||||
also can do quick prototypes, and then forget to rewrite them later. Well,
|
||||
while I can't list mistakes in here, I can list prototype-like segments
|
||||
of code that should be aggressively refactored. This does not list
|
||||
optimization issues, that needs to be done after intense profiling.
|
||||
|
||||
docs/examples/demo.php - ad hoc HTML/PHP soup to the extreme
|
||||
|
||||
AttrDef - a lot of duplication, more generic classes need to be created;
|
||||
a lot of strtolower() calls, no legit casing
|
||||
Class - doesn't support Unicode characters (fringe); uses regular expressions
|
||||
Lang - code duplication; premature optimization
|
||||
Length - easily mistaken for CSSLength
|
||||
URI - multiple regular expressions; missing validation for parts (?)
|
||||
CSS - parser doesn't accept advanced CSS (fringe)
|
||||
Number - constructor interface inconsistent with Integer
|
||||
Strategy
|
||||
FixNesting - cannot bubble nodes out of structures, duplicated checks
|
||||
for special-case parent node
|
||||
RemoveForeignElements - should be run in parallel with MakeWellFormed
|
||||
URIScheme - needs to have callable generic checks
|
||||
mailto - doesn't validate emails, doesn't validate querystring
|
||||
news - doesn't validate opaque path
|
||||
nntp - doesn't constrain path
|
376
docs/dev-config-schema.html
Normal file
376
docs/dev-config-schema.html
Normal file
@@ -0,0 +1,376 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||
<meta name="description" content="Describes config schema framework in HTML Purifier." />
|
||||
<link rel="stylesheet" type="text/css" href="./style.css" />
|
||||
<title>Config Schema - HTML Purifier</title>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<h1>Config Schema</h1>
|
||||
|
||||
<div id="filing">Filed under Development</div>
|
||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
|
||||
<p>
|
||||
HTML Purifier has a fairly complex system for configuration. Users
|
||||
interact with a <code>HTMLPurifier_Config</code> object to
|
||||
set configuration directives. The values they set are validated according
|
||||
to a configuration schema, <code>HTMLPurifier_ConfigSchema</code>.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
The schema is mostly transparent to end-users, but if you're doing development
|
||||
work for HTML Purifier and need to define a new configuration directive,
|
||||
you'll need to interact with it. We'll also talk about how to define
|
||||
userspace configuration directives at the very end.
|
||||
</p>
|
||||
|
||||
<h2>Write a directive file</h2>
|
||||
|
||||
<p>
|
||||
Directive files define configuration directives to be used by
|
||||
HTML Purifier. They are placed in <code>library/HTMLPurifier/ConfigSchema/schema/</code>
|
||||
in the form <code><em>Namespace</em>.<em>Directive</em>.txt</code> (I
|
||||
couldn't think of a more descriptive file extension.)
|
||||
Directive files are actually what we call <code>StringHash</code>es,
|
||||
i.e. associative arrays represented in a string form reminiscent of
|
||||
<a href="http://qa.php.net/write-test.php">PHPT</a> tests. Here's a
|
||||
sample directive file, <code>Test.Sample.txt</code>:
|
||||
</p>
|
||||
|
||||
<pre>Test.Sample
|
||||
TYPE: string/null
|
||||
DEFAULT: NULL
|
||||
ALLOWED: 'foo', 'bar'
|
||||
VALUE-ALIASES: 'baz' => 'bar'
|
||||
VERSION: 3.1.0
|
||||
--DESCRIPTION--
|
||||
This is a sample configuration directive for the purposes of the
|
||||
<code>dev-config-schema.html<code> documentation.
|
||||
--ALIASES--
|
||||
Test.Example</pre>
|
||||
|
||||
<p>
|
||||
Each of these segments has a specific meaning:
|
||||
</p>
|
||||
|
||||
<table class="table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Key</th>
|
||||
<th>Example</th>
|
||||
<th>Description</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>ID</td>
|
||||
<td>Test.Sample</td>
|
||||
<td>The name of the directive, in the form Namespace.Directive
|
||||
(implicitly the first line)</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>TYPE</td>
|
||||
<td>string/null</td>
|
||||
<td>The type of variable this directive accepts. See below for
|
||||
details. You can also add <code>/null</code> to the end of
|
||||
any basic type to allow null values too.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>DEFAULT</td>
|
||||
<td>NULL</td>
|
||||
<td>A parseable PHP expression of the default value.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>DESCRIPTION</td>
|
||||
<td>This is a...</td>
|
||||
<td>An HTML description of what this directive does.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>VERSION</td>
|
||||
<td>3.1.0</td>
|
||||
<td><em>Recommended</em>. The version of HTML Purifier this directive was added.
|
||||
Directives that have been around since 1.0.0 don't have this,
|
||||
but any new ones should.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>ALIASES</td>
|
||||
<td>Test.Example</td>
|
||||
<td><em>Optional</em>. A comma separated list of aliases for this directive.
|
||||
This is most useful for backwards compatibility and should
|
||||
not be used otherwise.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>ALLOWED</td>
|
||||
<td>'foo', 'bar'</td>
|
||||
<td><em>Optional</em>. Set of allowed value for a directive,
|
||||
a comma separated list of parseable PHP expressions. This
|
||||
is only allowed string, istring, text and itext TYPEs.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>VALUE-ALIASES</td>
|
||||
<td>'baz' => 'bar'</td>
|
||||
<td><em>Optional</em>. Mapping of one value to another, and
|
||||
should be a comma separated list of keypair duples. This
|
||||
is only allowed string, istring, text and itext TYPEs.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>DEPRECATED-VERSION</td>
|
||||
<td>3.1.0</td>
|
||||
<td><em>Not shown</em>. Indicates that the directive was
|
||||
deprecated this version.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>DEPRECATED-USE</td>
|
||||
<td>Test.NewDirective</td>
|
||||
<td><em>Not shown</em>. Indicates what new directive should be
|
||||
used instead. Note that the directives will functionally be
|
||||
different, although they should offer the same functionality.
|
||||
If they are identical, use an alias instead.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>EXTERNAL</td>
|
||||
<td>CSSTidy</td>
|
||||
<td><em>Not shown</em>. Indicates if there is an external library
|
||||
the user will need to download and install to use this configuration
|
||||
directive. As of right now, this is merely a Google-able name; future
|
||||
versions may also provide links and instructions.</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<p>
|
||||
Some notes on format and style:
|
||||
</p>
|
||||
|
||||
<ul>
|
||||
<li>
|
||||
Each of these keys can be expressed in the short format
|
||||
(<code>KEY: Value</code>) or the long format
|
||||
(<code>--KEY--</code> with value beneath). You must use the
|
||||
long format if multiple lines are needed, or if a long format
|
||||
has been used already (that's why <code>ALIASES</code> in our
|
||||
example is in the long format); otherwise, it's user preference.
|
||||
</li>
|
||||
<li>
|
||||
The HTML descriptions should be wrapped at about 80 columns; do
|
||||
not rely on editor word-wrapping.
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
<p>
|
||||
Also, as promised, here is the set of possible types:
|
||||
</p>
|
||||
|
||||
<table class="table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Type</th>
|
||||
<th>Example</th>
|
||||
<th>Description</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>string</td>
|
||||
<td>'Foo'</td>
|
||||
<td><a href="http://docs.php.net/manual/en/language.types.string.php">String</a> without newlines</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>istring</td>
|
||||
<td>'foo'</td>
|
||||
<td>Case insensitive ASCII string without newlines</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>text</td>
|
||||
<td>"A<em>\n</em>b"</td>
|
||||
<td>String with newlines</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>itext</td>
|
||||
<td>"a<em>\n</em>b"</td>
|
||||
<td>Case insensitive ASCII string without newlines</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>int</td>
|
||||
<td>23</td>
|
||||
<td>Integer</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>float</td>
|
||||
<td>3.0</td>
|
||||
<td>Floating point number</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>bool</td>
|
||||
<td>true</td>
|
||||
<td>Boolean</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>lookup</td>
|
||||
<td>array('key' => true)</td>
|
||||
<td>Lookup array, used with <code>isset($var[$key])</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>list</td>
|
||||
<td>array('f', 'b')</td>
|
||||
<td>List array, with ordered numerical indexes</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>hash</td>
|
||||
<td>array('key' => 'val')</td>
|
||||
<td>Associative array of keys to values</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>mixed</td>
|
||||
<td>new stdclass</td>
|
||||
<td>Any PHP variable is fine</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<p>
|
||||
The examples represent what will be returned out of the configuration
|
||||
object; users have a little bit of leeway when setting configuration
|
||||
values (for example, a lookup value can be specified as a list;
|
||||
HTML Purifier will flip it as necessary.) These types are defined
|
||||
in <a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/VarParser.php">
|
||||
library/HTMLPurifier/VarParser.php</a>.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
For more information on what values are allowed, and how they are parsed,
|
||||
consult <a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/ConfigSchema/InterchangeBuilder.php">
|
||||
library/HTMLPurifier/ConfigSchema/InterchangeBuilder.php</a>, as well
|
||||
as <a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/ConfigSchema/Interchange/Directive.php">
|
||||
library/HTMLPurifier/ConfigSchema/Interchange/Directive.php</a> for
|
||||
the semantics of the parsed values.
|
||||
</p>
|
||||
|
||||
<h2>Refreshing the cache</h2>
|
||||
|
||||
<p>
|
||||
You may have noticed that your directive file isn't doing anything
|
||||
yet. That's because it hasn't been added to the runtime
|
||||
<code>HTMLPurifier_ConfigSchema</code> instance. Run
|
||||
<code>maintenance/generate-schema-cache.php</code> to fix this.
|
||||
If there were no errors, you're good to go! Don't forget to add
|
||||
some unit tests for your functionality!
|
||||
</p>
|
||||
|
||||
<p>
|
||||
If you ever make changes to your configuration directives, you
|
||||
will need to run this script again.
|
||||
</p>
|
||||
|
||||
<h2>Errors</h2>
|
||||
|
||||
<p>
|
||||
All directive files go through a rigorous validation process
|
||||
through <a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/ConfigSchema/">
|
||||
library/HTMLPurifier/ConfigSchema/Validator.php</a>, as well
|
||||
as some basic checks during building. While
|
||||
listing every error out here is out-of-scope for this document, we
|
||||
can give some general tips for interpreting error messages.
|
||||
There are two types of errors: builder errors and validation errors.
|
||||
</p>
|
||||
|
||||
<h3>Builder errors</h3>
|
||||
|
||||
<blockquote>
|
||||
<p>
|
||||
<strong>Exception:</strong> Expected type string, got
|
||||
integer in DEFAULT in directive hash 'Ns.Dir'
|
||||
</p>
|
||||
</blockquote>
|
||||
|
||||
<p>
|
||||
You can identify a builder error by the keyword "directive hash."
|
||||
These are the easiest to deal with, because they directly correspond
|
||||
with your directive file. Find the offending directive file (which
|
||||
is the directive hash plus the .txt extension), find the
|
||||
offending index ("in DEFAULT" means the DEFAULT key) and fix the error.
|
||||
This particular error would occur if your default value is not the same
|
||||
type as TYPE.
|
||||
</p>
|
||||
|
||||
<h3>Validation errors</h3>
|
||||
|
||||
<blockquote>
|
||||
<p>
|
||||
<strong>Exception:</strong> Alias 3 in valueAliases in directive
|
||||
'Ns.Dir' must be a string
|
||||
</p>
|
||||
</blockquote>
|
||||
|
||||
<p>
|
||||
These are a little trickier, because we're not actually validating
|
||||
your directive file, or even the direct string hash representation.
|
||||
We're validating an Interchange object, and the error messages do
|
||||
not mention any string hash keys.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
Nevertheless, it's not difficult to figure out what went wrong.
|
||||
Read the "context" statements in reverse:
|
||||
</p>
|
||||
|
||||
<dl>
|
||||
<dt>in directive 'Ns.Dir'</dt>
|
||||
<dd>This means we need to look at the directive file <code>Ns.Dir.txt</code></dd>
|
||||
<dt>in valueAliases</dt>
|
||||
<dd>There's no key actually called this, but there's one that's close:
|
||||
VALUE-ALIASES. Indeed, that's where to look.</dd>
|
||||
<dt>Alias 3</dt>
|
||||
<dd>The value alias that is equal to 3 is the culprit.</dd>
|
||||
</dl>
|
||||
|
||||
<p>
|
||||
In this particular case, you're not allowed to alias integers values to
|
||||
strings values.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
The most difficult part is translating the Interchange member variable (valueAliases)
|
||||
into a directive file key (VALUE-ALIASES), but there's a one-to-one
|
||||
correspondence currently. If the two formats diverge, any discrepancies
|
||||
will be described in <a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/ConfigSchema/InterchangeBuilder.php">
|
||||
library/HTMLPurifier/ConfigSchema/InterchangeBuilder.php</a>.
|
||||
</p>
|
||||
|
||||
<h2>Internals</h2>
|
||||
|
||||
<p>
|
||||
Much of the configuration schema framework's codebase deals with
|
||||
shuffling data from one format to another, and doing validation on this
|
||||
data.
|
||||
The keystone of all of this is the <code>HTMLPurifier_ConfigSchema_Interchange</code>
|
||||
class, which represents the purest, parsed representation of the schema.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
Hand-writing this data is unwieldy, however, so we write directive files.
|
||||
These directive files are parsed by <code>HTMLPurifier_StringHashParser</code>
|
||||
into <code>HTMLPurifier_StringHash</code>es, which then
|
||||
are run through <code>HTMLPurifier_ConfigSchema_InterchangeBuilder</code>
|
||||
to construct the interchange object.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
From the interchange object, the data can be siphoned into other forms
|
||||
using <code>HTMLPurifier_ConfigSchema_Builder</code> subclasses.
|
||||
For example, <code>HTMLPurifier_ConfigSchema_Builder_ConfigSchema</code>
|
||||
generates a runtime <code>HTMLPurifier_ConfigSchema</code> object,
|
||||
which <code>HTMLPurifier_Config</code> uses to validate its incoming
|
||||
data. There is also an XML serializer, which is used to build documentation.
|
||||
</p>
|
||||
|
||||
<div id="version">$Id$</div>
|
||||
|
||||
</body>
|
||||
</html>
|
67
docs/dev-flush.html
Normal file
67
docs/dev-flush.html
Normal file
@@ -0,0 +1,67 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||
<meta name="description" content="Discusses when to flush HTML Purifier's various caches." />
|
||||
<link rel="stylesheet" type="text/css" href="./style.css" />
|
||||
<title>Flushing the Purifier - HTML Purifier</title>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<h1>Flushing the Purifier</h1>
|
||||
|
||||
<div id="filing">Filed under Development</div>
|
||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
|
||||
<p>
|
||||
If you've been poking around the various folders in HTML Purifier,
|
||||
you may have noticed the <code>maintenance</code> directory. Almost
|
||||
all of these scripts are devoted to flushing out the various caches
|
||||
HTML Purifier uses. Normal users don't have to worry about this:
|
||||
regular library usage is transparent. However, when doing development
|
||||
work on HTML Purifier, you may find you have to flush one of the
|
||||
caches.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
As a general rule of thumb, run <code>flush.php</code> whenever you make
|
||||
any <em>major</em> changes, or when tests start mysteriously failing.
|
||||
In more detail, run this script if:
|
||||
</p>
|
||||
|
||||
<ul>
|
||||
<li>
|
||||
You added new source files to HTML Purifier's main library.
|
||||
(see <code>generate-includes.php</code>)
|
||||
</li>
|
||||
<li>
|
||||
You modified the configuration schema (see
|
||||
<code>generate-schema-cache.php</code>). This usually means
|
||||
adding or modifying files in <code>HTMLPurifier/ConfigSchema/schema/</code>,
|
||||
although in rare cases modifying <code>HTMLPurifier/ConfigSchema.php</code>
|
||||
will also require this.
|
||||
</li>
|
||||
<li>
|
||||
You modified a Definition, or its subsystems. The most usual candidate
|
||||
is <code>HTMLPurifier/HTMLDefinition.php</code>, which also encompasses
|
||||
the files in <code>HTMLPurifier/HTMLModule/</code> as well as if you've
|
||||
<a href="enduser-customize.html">customizing definitions</a> without
|
||||
the cache disabled. (see <code>flush-generation-cache.php</code>)
|
||||
</li>
|
||||
<li>
|
||||
You modified source files, and have been using the standalone
|
||||
version from the full installation. (see <code>generate-standalone.php</code>)
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
<p>
|
||||
You can check out the corresponding scripts for more information on what they
|
||||
do.
|
||||
</p>
|
||||
|
||||
<div id="version">$Id$</div>
|
||||
|
||||
</body></html>
|
279
docs/dev-includes.txt
Normal file
279
docs/dev-includes.txt
Normal file
@@ -0,0 +1,279 @@
|
||||
|
||||
INCLUDES, AUTOLOAD, BYTECODE CACHES and OPTIMIZATION
|
||||
|
||||
The Problem
|
||||
-----------
|
||||
|
||||
HTML Purifier contains a number of extra components that are not used all
|
||||
of the time, only if the user explicitly specifies that we should use
|
||||
them.
|
||||
|
||||
Some of these optional components are optionally included (Filter,
|
||||
Language, Lexer, Printer), while others are included all the time
|
||||
(Injector, URIFilter, HTMLModule, URIScheme). We will stipulate that these
|
||||
are all developer specified: it is conceivable that certain Tokens are not
|
||||
used, but this is user-dependent and should not be trusted.
|
||||
|
||||
We should come up with a consistent way to handle these things and ensure
|
||||
that we get the maximum performance when there is bytecode caches and
|
||||
when there are not. Unfortunately, these two goals seem contrary to each
|
||||
other.
|
||||
|
||||
A peripheral issue is the performance of ConfigSchema, which has been
|
||||
shown take a large, constant amount of initialization time, and is
|
||||
intricately linked to the issue of includes due to its pervasive use
|
||||
in our plugin architecture.
|
||||
|
||||
Pros and Cons
|
||||
-------------
|
||||
|
||||
We will assume that user-based extensions will be included by them.
|
||||
|
||||
Conditional includes:
|
||||
Pros:
|
||||
- User management is simplified; only a single directive needs to be set
|
||||
- Only necessary code is included
|
||||
Cons:
|
||||
- Doesn't play nicely with opcode caches
|
||||
- Adds complexity to standalone version
|
||||
- Optional configuration directives are not exposed without a little
|
||||
extra coaxing (not implemented yet)
|
||||
|
||||
Include it all:
|
||||
Pros:
|
||||
- User management is still simple
|
||||
- Plays nicely with opcode caches and standalone version
|
||||
- All configuration directives are present
|
||||
Cons:
|
||||
- Lots of (how much?) extra code is included
|
||||
- Classes that inherit from external libraries will cause compile
|
||||
errors
|
||||
|
||||
Build an include stub (Let's do this!):
|
||||
Pros:
|
||||
- Only necessary code is included
|
||||
- Plays nicely with opcode caches and standalone version
|
||||
- require (without once) can be used, see above
|
||||
- Could further extend as a compilation to one file
|
||||
Cons:
|
||||
- Not implemented yet
|
||||
- Requires user intervention and use of a command line script
|
||||
- Standalone script must be chained to this
|
||||
- More complex and compiled-language-like
|
||||
- Requires a whole new class of system-wide configuration directives,
|
||||
as configuration objects can be reused
|
||||
- Determining what needs to be included can be complex (see above)
|
||||
- No way of autodetecting dynamically instantiated classes
|
||||
- Might be slow
|
||||
|
||||
Include stubs
|
||||
-------------
|
||||
|
||||
This solution may be "just right" for users who are heavily oriented
|
||||
towards performance. However, there are a number of picky implementation
|
||||
details to work out beforehand.
|
||||
|
||||
The number one concern is how to make the HTML Purifier files "work
|
||||
out of the box", while still being able to easily get them into a form
|
||||
that works with this setup. As the codebase stands right now, it would
|
||||
be necessary to strip out all of the require_once calls. The only way
|
||||
we could get rid of the require_once calls is to use __autoload or
|
||||
use the stub for all cases (which might not be a bad idea).
|
||||
|
||||
Aside
|
||||
-----
|
||||
An important thing to remember, however, is that these require_once's
|
||||
are valuable data about what classes a file needs. Unfortunately, there's
|
||||
no distinction between whether or not the file is needed all the time,
|
||||
or whether or not it is one of our "optional" files. Thus, it is
|
||||
effectively useless.
|
||||
|
||||
Deprecated
|
||||
----------
|
||||
One of the things I'd like to do is have the code search for any classes
|
||||
that are explicitly mentioned in the code. If a class isn't mentioned, I
|
||||
get to assume that it is "optional," i.e. included via introspection.
|
||||
The choice is either to use PHP's tokenizer or use regexps; regexps would
|
||||
be faster but a tokenizer would be more correct. If this ends up being
|
||||
unfeasible, adding dependency comments isn't a bad idea. (This could
|
||||
even be done automatically by search/replacing require_once, although
|
||||
we'd have to manually inspect the results for the optional requires.)
|
||||
|
||||
NOTE: This ends up not being necessary, as we're going to make the user
|
||||
figure out all the extra classes they need, and only include the core
|
||||
which is predetermined.
|
||||
|
||||
Using the autoload framework with include stubs works nicely with
|
||||
introspective classes: instead of having to have require_once inside
|
||||
the function, we can let autoload do the work; we simply need to
|
||||
new $class or accept the object straight from the caller. Handling filters
|
||||
becomes a simple matter of ticking off configuration directives, and
|
||||
if ConfigSchema spits out errors, adding the necessary includes. We could
|
||||
also use the autoload framework as a fallback, in case the user forgets
|
||||
to make the include, but doesn't really care about performance.
|
||||
|
||||
Insight
|
||||
-------
|
||||
All of this talk is merely a natural extension of what our current
|
||||
standalone functionality does. However, instead of having our code
|
||||
perform the includes, or attempting to inline everything that possibly
|
||||
could be used, we boot the issue to the user, making them include
|
||||
everything or setup the fallback autoload handler.
|
||||
|
||||
Configuration Schema
|
||||
--------------------
|
||||
|
||||
A common deficiency for all of the conditional include setups (including
|
||||
the dynamically built include PHP stub) is that if one of this
|
||||
conditionally included files includes a configuration directive, it
|
||||
is not accessible to configdoc. A stopgap solution for this problem is
|
||||
to have it piggy-back off of the data in the merge-library.php script
|
||||
to figure out what extra files it needs to include, but if the file also
|
||||
inherits classes that don't exist, we're in big trouble.
|
||||
|
||||
I think it's high time we centralized the configuration documentation.
|
||||
However, the type checking has been a great boon for the library, and
|
||||
I'd like to keep that. The compromise is to use some other source, and
|
||||
then parse it into the ConfigSchema internal format (sans all of those
|
||||
nasty documentation strings which we really don't need at runtime) and
|
||||
serialize that for future use.
|
||||
|
||||
The next question is that of format. XML is very verbose, and the prospect
|
||||
of setting defaults in it gives me willies. However, this may be necessary.
|
||||
Splitting up the file into manageable chunks may alleviate this trouble,
|
||||
and we may be even want to create our own format optimized for specifying
|
||||
configuration. It might look like (based off the PHPT format, which is
|
||||
nicely compact yet unambiguous and human-readable):
|
||||
|
||||
Core.HiddenElements
|
||||
TYPE: lookup
|
||||
DEFAULT: array('script', 'style') // auto-converted during processing
|
||||
--ALIASES--
|
||||
Core.InvisibleElements, Core.StupidElements
|
||||
--DESCRIPTION--
|
||||
<p>
|
||||
Blah blah
|
||||
</p>
|
||||
|
||||
The first line is the directive name, the lines after that prior to the
|
||||
first --HEADER-- block are single-line values, and then after that
|
||||
the multiline values are there. No value is restricted to a particular
|
||||
format: DEFAULT could very well be multiline if that would be easier.
|
||||
This would make it insanely easy, also, to add arbitrary extra parameters,
|
||||
like:
|
||||
|
||||
VERSION: 3.0.0
|
||||
ALLOWED: 'none', 'light', 'medium', 'heavy' // this is wrapped in array()
|
||||
EXTERNAL: CSSTidy // this would be documented somewhere else with a URL
|
||||
|
||||
The final loss would be that you wouldn't know what file the directive
|
||||
was used in; with some clever regexps it should be possible to
|
||||
figure out where $config->get($ns, $d); occurs. Reflective calls to
|
||||
the configuration object is mitigated by the fact that getBatch is
|
||||
used, so we can simply talk about that in the namespace definition page.
|
||||
This might be slow, but it would only happen when we are creating
|
||||
the documentation for consumption, and is sugar.
|
||||
|
||||
We can put this in a schema/ directory, outside of HTML Purifier. The serialized
|
||||
data gets treated like entities.ser.
|
||||
|
||||
The final thing that needs to be handled is user defined configurations.
|
||||
They can be added at runtime using ConfigSchema::registerDirectory()
|
||||
which globs the directory and grabs all of the directives to be incorporated
|
||||
in. Then, the result is saved. We may want to take advantage of the
|
||||
DefinitionCache framework, although it is not altogether certain what
|
||||
configuration directives would be used to generate our key (meta-directives!)
|
||||
|
||||
Further thoughts
|
||||
----------------
|
||||
Our master configuration schema will only need to be updated once
|
||||
every new version, so it's easily versionable. User specified
|
||||
schema files are far more volatile, but it's far too expensive
|
||||
to check the filemtimes of all the files, so a DefinitionRev style
|
||||
mechanism works better. However, we can uniquely identify the
|
||||
schema based on the directories they loaded, so there's no need
|
||||
for a DefinitionId until we give them full programmatic control.
|
||||
|
||||
These variables should be directly incorporated into ConfigSchema,
|
||||
and ConfigSchema should handle serialization. Some refactoring will be
|
||||
necessary for the DefinitionCache classes, as they are built with
|
||||
Config in mind. If the user changes something, the cache file gets
|
||||
rebuilt. If the version changes, the cache file gets rebuilt. Since
|
||||
our unit tests flush the caches before we start, and the operation is
|
||||
pretty fast, this will not negatively impact unit testing.
|
||||
|
||||
One last thing: certain configuration directives require that files
|
||||
get added. They may even be specified dynamically. It is not a good idea
|
||||
for the HTMLPurifier_Config object to be used directly for such matters.
|
||||
Instead, the userland code should explicitly perform the includes. We may
|
||||
put in something like:
|
||||
|
||||
REQUIRES: HTMLPurifier_Filter_ExtractStyleBlocks
|
||||
|
||||
To indicate that if that class doesn't exist, and the user is attempting
|
||||
to use the directive, we should fatally error out. The stub includes the core files,
|
||||
and the user includes everything else. Any reflective things like new
|
||||
$class would be required to tie in with the configuration.
|
||||
|
||||
It would work very well with rarely used configuration options, but it
|
||||
wouldn't be so good for "core" parts that can be disabled. In such cases
|
||||
the core include file would need to be modified, and the only way
|
||||
to properly do this is use the configuration object. Once again, our
|
||||
ability to create cache keys saves the day again: we can create arbitrary
|
||||
stub files for arbitrary configurations and include those. They could
|
||||
even be the single file affairs. The only thing we'd need to include,
|
||||
then, would be HTMLPurifier_Config! Then, the configuration object would
|
||||
load the library.
|
||||
|
||||
An aside...
|
||||
-----------
|
||||
One questions, however, the wisdom of letting PHP files write other PHP
|
||||
files. It seems like a recipe for disaster, or at least lots of headaches
|
||||
in highly secured setups, where PHP does not have the ability to write
|
||||
to its root. In such cases, we could use sticky bits or tell the user
|
||||
to manually generate the file.
|
||||
|
||||
The other troublesome bit is actually doing the calculations necessary.
|
||||
For certain cases, it's simple (such as URIScheme), but for AttrDef
|
||||
and HTMLModule the dependency trees are very complex in relation to
|
||||
%HTML.Allowed and friends. I think that this idea should be shelved
|
||||
and looked at a later, less insane date.
|
||||
|
||||
An interesting dilemma presents itself when a configuration form is offered
|
||||
to the user. Normally, the configuration object is not accessible without
|
||||
editing PHP code; this facility changes thing. The sensible thing to do
|
||||
is stipulate that all classes required by the directives you allow must
|
||||
be included.
|
||||
|
||||
Unit testing
|
||||
------------
|
||||
|
||||
Setting up the parsing and translation into our existing format would not
|
||||
be difficult to do. It might represent a good time for us to rethink our
|
||||
tests for these facilities; as creative as they are, they are often hacky
|
||||
and require public visibility for things that ought to be protected.
|
||||
This is especially applicable for our DefinitionCache tests.
|
||||
|
||||
Migration
|
||||
---------
|
||||
|
||||
Because we are not *adding* anything essentially new, it should be trivial
|
||||
to write a script to take our existing data and dump it into the new format.
|
||||
Well, not trivial, but fairly easy to accomplish. Primary implementation
|
||||
difficulties would probably involve formatting the file nicely.
|
||||
|
||||
Backwards-compatibility
|
||||
-----------------------
|
||||
|
||||
I expect that the ConfigSchema methods should stick around for a little bit,
|
||||
but display E_USER_NOTICE warnings that they are deprecated. This will
|
||||
require documentation!
|
||||
|
||||
New stuff
|
||||
---------
|
||||
|
||||
VERSION: Version number directive was introduced
|
||||
DEPRECATED-VERSION: If the directive was deprecated, when was it deprecated?
|
||||
DEPRECATED-USE: If the directive was deprecated, what should the user use now?
|
||||
REQUIRES: What classes does this configuration directive require, but are
|
||||
not part of the HTML Purifier core?
|
@@ -14,7 +14,7 @@
|
||||
|
||||
<div id="filing">Filed under Development</div>
|
||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||
<div id="home"><a href="http://hp.jpsband.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
|
||||
<p>The classes in this library follow a few naming conventions, which may
|
||||
help you find the correct functionality more quickly. Here they are:</p>
|
||||
@@ -79,4 +79,4 @@ help you find the correct functionality more quickly. Here they are:</p>
|
||||
|
||||
<div id="version">$Id$</div>
|
||||
|
||||
</body></html>
|
||||
</body></html>
|
||||
|
@@ -14,7 +14,7 @@
|
||||
|
||||
<div id="filing">Filed under Development</div>
|
||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||
<div id="home"><a href="http://hp.jpsband.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
|
||||
<p>Here are some possible optimization techniques we can apply to code sections if
|
||||
they turn out to be slow. Be sure not to prematurely optimize: if you get
|
||||
@@ -23,11 +23,10 @@ that itch, put it here!</p>
|
||||
<ul>
|
||||
<li>Make Tokens Flyweights (may prove problematic, probably not worth it)</li>
|
||||
<li>Rewrite regexps into PHP code</li>
|
||||
<li>Serialize the Definition object</li>
|
||||
<li>Batch regexp validation (do as many per function call as possible)</li>
|
||||
<li>Parallelize strategies</li>
|
||||
</ul>
|
||||
|
||||
<div id="version">$Id$</div>
|
||||
|
||||
</body></html>
|
||||
</body></html>
|
||||
|
@@ -32,14 +32,19 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
|
||||
|
||||
<div id="filing">Filed under Development</div>
|
||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||
<div id="home"><a href="http://hp.jpsband.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
|
||||
<p>
|
||||
<strong>Warning:</strong> This table is kept for historical purposes and
|
||||
is not being actively updated.
|
||||
</p>
|
||||
|
||||
<h2>Key</h2>
|
||||
|
||||
<table cellspacing="0"><tbody>
|
||||
<tr><td class="impl-yes">Implemented</td></tr>
|
||||
<tr><td class="impl-partial">Partially implemented</td></tr>
|
||||
<tr><td class="impl-no">Will not implement</td></tr>
|
||||
<tr><td class="impl-no">Not priority to implement</td></tr>
|
||||
<tr><td class="danger">Dangerous attribute/property</td></tr>
|
||||
<tr><td class="css1">Present in CSS1</td></tr>
|
||||
<tr><td class="feature">Feature, requires extra work</td></tr>
|
||||
@@ -118,6 +123,7 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
|
||||
<tbody>
|
||||
<tr><th colspan="2">Table</th></tr>
|
||||
<tr class="impl-yes"><td>border-collapse</td><td>ENUM(collapse, seperate)</td></tr>
|
||||
<tr class="impl-yes"><td>border-space</td><td>MULTIPLE</td></tr>
|
||||
<tr class="impl-yes"><td>caption-side</td><td>ENUM(top, bottom)</td></tr>
|
||||
<tr class="feature"><td>empty-cells</td><td>ENUM(show, hide), No IE support makes this useless,
|
||||
possible fix with &nbsp;? Unknown release milestone.</td></tr>
|
||||
@@ -142,16 +148,16 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
|
||||
|
||||
<tbody>
|
||||
<tr><th colspan="2">Unknown</th></tr>
|
||||
<tr class="danger css1 impl-yes"><td>background-image</td><td>Dangerous, target milestone 1.3</td></tr>
|
||||
<tr class="danger css1 impl-yes"><td>background-image</td><td>Dangerous</td></tr>
|
||||
<tr class="css1 impl-yes"><td>background-attachment</td><td>ENUM(scroll, fixed),
|
||||
Depends on background-image</td></tr>
|
||||
<tr class="css1 impl-yes"><td>background-position</td><td>Depends on background-image</td></tr>
|
||||
<tr class="danger impl-no"><td>cursor</td><td>Dangerous but fluffy</td></tr>
|
||||
<tr class="danger css1"><td>display</td><td>ENUM(...), Dangerous but interesting;
|
||||
<tr class="danger impl-yes"><td>display</td><td>ENUM(...), Dangerous but interesting;
|
||||
will not implement list-item, run-in (Opera only) or table (no IE);
|
||||
inline-block has incomplete IE6 support and requires -moz-inline-box
|
||||
for Mozilla. Unknown target milestone.</td></tr>
|
||||
<tr class="css1"><td>height</td><td>Interesting, why use it? Unknown target milestone.</td></tr>
|
||||
<tr class="css1 impl-yes"><td>height</td><td>Interesting, why use it? Unknown target milestone.</td></tr>
|
||||
<tr class="danger css1 impl-yes"><td>list-style-image</td><td>Dangerous?</td></tr>
|
||||
<tr class="impl-no"><td>max-height</td><td rowspan="4">No IE 5/6</td></tr>
|
||||
<tr class="impl-no"><td>min-height</td></tr>
|
||||
@@ -166,11 +172,11 @@ thead th {text-align:left;padding:0.1em;background-color:#EEE;}
|
||||
Mostly supported. Unknown target milestone.</td></tr>
|
||||
<tr><td>page-break-inside</td><td>ENUM(avoid, auto), Opera only. Unknown target milestone.</td></tr>
|
||||
<tr class="impl-no"><td>quotes</td><td>May be dropped from CSS2, fairly useless for inline context</td></tr>
|
||||
<tr class="impl-no"><td>visibility</td><td>ENUM(visible, hidden, collapse),
|
||||
<tr class="danger impl-yes"><td>visibility</td><td>ENUM(visible, hidden, collapse),
|
||||
Dangerous</td></tr>
|
||||
<tr class="css1 feature"><td>white-space</td><td>ENUM(normal, pre, nowrap, pre-wrap,
|
||||
<tr class="css1 feature impl-partial"><td>white-space</td><td>ENUM(normal, pre, nowrap, pre-wrap,
|
||||
pre-line), Spotty implementation:
|
||||
pre (no IE 5/6), nowrap (no IE 5),
|
||||
pre (no IE 5/6), <em>nowrap</em> (no IE 5, supported),
|
||||
pre-wrap (only Opera), pre-line (no support). Fixable? Unknown target milestone.</td></tr>
|
||||
</tbody>
|
||||
|
||||
@@ -238,14 +244,14 @@ Mozilla on inside and needs -moz-outline, no IE support.</td></tr>
|
||||
<tr><th colspan="3">Questionable</th></tr>
|
||||
<tr class="impl-no"><td>accesskey</td><td>A</td><td>May interfere with main interface</td></tr>
|
||||
<tr class="impl-no"><td>tabindex</td><td>A</td><td>May interfere with main interface</td></tr>
|
||||
<tr><td>target</td><td>A</td><td>Config enabled, only useful for frame layouts, disallowed in strict</td></tr>
|
||||
<tr class="impl-yes"><td>target</td><td>A</td><td>Config enabled, only useful for frame layouts, disallowed in strict</td></tr>
|
||||
</tbody>
|
||||
|
||||
<tbody>
|
||||
<tr><th colspan="3">Miscellaneous</th></tr>
|
||||
<tr><td>datetime</td><td>DEL, INS</td><td>No visible effect, ISO format</td></tr>
|
||||
<tr><td>rel</td><td>A</td><td>Largely user-defined: nofollow, tag (see microformats)</td></tr>
|
||||
<tr><td>rev</td><td>A</td><td>Largely user-defined: vote-*</td></tr>
|
||||
<tr class="impl-yes"><td>rel</td><td>A</td><td>Largely user-defined: nofollow, tag (see microformats)</td></tr>
|
||||
<tr class="impl-yes"><td>rev</td><td>A</td><td>Largely user-defined: vote-*</td></tr>
|
||||
<tr class="feature"><td>axis</td><td>TD, TH</td><td>W3C only: No browser implementation</td></tr>
|
||||
<tr class="feature"><td>char</td><td>COL, COLGROUP, TBODY, TD, TFOOT, TH, THEAD, TR</td><td>W3C only: No browser implementation</td></tr>
|
||||
<tr class="feature"><td>headers</td><td>TD, TH</td><td>W3C only: No browser implementation</td></tr>
|
||||
@@ -262,41 +268,41 @@ Mozilla on inside and needs -moz-outline, no IE support.</td></tr>
|
||||
</tbody>
|
||||
|
||||
<tbody>
|
||||
<tr><th colspan="3">Transform, target milestone 1.4</th></tr>
|
||||
<tr><td rowspan="5">align</td><td>CAPTION</td><td>Near-equiv style 'caption-side', drop left and right</td></tr>
|
||||
<tr><td>IMG</td><td rowspan="2">Margin-left and margin-right = auto or parent div</td></tr>
|
||||
<tr><td>TABLE</td></tr>
|
||||
<tr><td>HR</td><td>Near-equivalent style 'text-align' (Works for IE and Opera, but not Firefox). Also try <code>margin-right:auto; margin-left:0;</code> for left or <code>margin-right:0; margin-left:auto;</code> for right (optionally replacing 0 with the original margin for that side)</td></tr>
|
||||
<tr><th colspan="3">Transform</th></tr>
|
||||
<tr class="impl-yes"><td rowspan="5">align</td><td>CAPTION</td><td>'caption-side' for top/bottom, 'text-align' for left/right</td></tr>
|
||||
<tr class="impl-yes"><td>IMG</td><td rowspan="3">See specimens/html-align-to-css.html</td></tr>
|
||||
<tr class="impl-yes"><td>TABLE</td></tr>
|
||||
<tr class="impl-yes"><td>HR</td></tr>
|
||||
<tr class="impl-yes"><td>H1, H2, H3, H4, H5, H6, P</td><td>Equivalent style 'text-align'</td></tr>
|
||||
<tr class="required impl-yes"><td>alt</td><td>IMG</td><td>Required, insert image filename if src is present or default invalid image text</td></tr>
|
||||
<tr><td rowspan="3">bgcolor</td><td>TABLE</td><td>Equivalent style 'background-color'</td></tr>
|
||||
<tr><td>TR</td><td>Equivalent style 'background-color'</td></tr>
|
||||
<tr><td>TD, TH</td><td>Equivalent style 'background-color'</td></tr>
|
||||
<tr><td>border</td><td>IMG</td><td>Near equivalent style 'border-width', as it only applies when link present</td></tr>
|
||||
<tr><td>clear</td><td>BR</td><td>Near-equiv style 'clear', transform 'all' into 'both'</td></tr>
|
||||
<tr class="impl-yes"><td rowspan="3">bgcolor</td><td>TABLE</td><td>Superset style 'background-color'</td></tr>
|
||||
<tr class="impl-yes"><td>TR</td><td>Superset style 'background-color'</td></tr>
|
||||
<tr class="impl-yes"><td>TD, TH</td><td>Superset style 'background-color'</td></tr>
|
||||
<tr class="impl-yes"><td>border</td><td>IMG</td><td>Equivalent style <code>border:[number]px solid</code></td></tr>
|
||||
<tr class="impl-yes"><td>clear</td><td>BR</td><td>Near-equiv style 'clear', transform 'all' into 'both'</td></tr>
|
||||
<tr class="impl-no"><td>compact</td><td>DL, OL, UL</td><td>Boolean, needs custom CSS class; rarely used anyway</td></tr>
|
||||
<tr class="required impl-yes"><td>dir</td><td>BDO</td><td>Required, insert ltr (or configuration value) if none</td></tr>
|
||||
<tr><td>height</td><td>TD, TH</td><td>Near-equiv style 'height', needs px suffix if original was in pixels</td></tr>
|
||||
<tr><td>hspace</td><td>IMG</td><td>Near-equiv styles 'margin-top' and 'margin-bottom', needs px suffix</td></tr>
|
||||
<tr class="impl-yes"><td>height</td><td>TD, TH</td><td>Near-equiv style 'height', needs px suffix if original was in pixels</td></tr>
|
||||
<tr class="impl-yes"><td>hspace</td><td>IMG</td><td>Near-equiv styles 'margin-top' and 'margin-bottom', needs px suffix</td></tr>
|
||||
<tr class="impl-yes"><td>lang</td><td>*</td><td>Copy value to xml:lang</td></tr>
|
||||
<tr><td rowspan="2">name</td><td>IMG</td><td>Turn into ID</td></tr>
|
||||
<tr><td>A</td><td>Turn into ID? (not deprecated, though in which specs?)</td></tr>
|
||||
<tr><td>noshade</td><td>HR</td><td>Boolean, style 'border-style:solid;'</td></tr>
|
||||
<tr><td>nowrap</td><td>TD, TH</td><td>Boolean, style 'white-space:nowrap;' (not compat with IE5)</td></tr>
|
||||
<tr><td>size</td><td>HR</td><td>Near-equiv 'width', needs px suffix if original was pixels</td></tr>
|
||||
<tr class="impl-yes"><td rowspan="2">name</td><td>IMG</td><td>Turn into ID</td></tr>
|
||||
<tr class="impl-yes"><td>A</td><td>Turn into ID</td></tr>
|
||||
<tr class="impl-yes"><td>noshade</td><td>HR</td><td>Boolean, style 'border-style:solid;'</td></tr>
|
||||
<tr class="impl-yes"><td>nowrap</td><td>TD, TH</td><td>Boolean, style 'white-space:nowrap;' (not compat with IE5)</td></tr>
|
||||
<tr class="impl-yes"><td>size</td><td>HR</td><td>Near-equiv 'height', needs px suffix if original was pixels</td></tr>
|
||||
<tr class="required impl-yes"><td>src</td><td>IMG</td><td>Required, insert blank or default img if not set</td></tr>
|
||||
<tr class="impl-yes"><td>start</td><td>OL</td><td>Poorly supported 'counter-reset', allowed in loose, dropped in strict</td></tr>
|
||||
<tr><td rowspan="3">type</td><td>LI</td><td rowspan="3">Equivalent style 'list-style-type', different allowed values though. (needs testing)</td></tr>
|
||||
<tr><td>OL</td></tr>
|
||||
<tr><td>UL</td></tr>
|
||||
<tr class="impl-yes"><td rowspan="3">type</td><td>LI</td><td rowspan="3">Equivalent style 'list-style-type', different allowed values though. (needs testing)</td></tr>
|
||||
<tr class="impl-yes"><td>OL</td></tr>
|
||||
<tr class="impl-yes"><td>UL</td></tr>
|
||||
<tr class="impl-yes"><td>value</td><td>LI</td><td>Poorly supported 'counter-reset', allowed in loose, dropped in strict</td></tr>
|
||||
<tr><td>vspace</td><td>IMG</td><td>Near-equiv styles 'margin-left' and 'margin-right', needs px suffix, see hspace</td></tr>
|
||||
<tr><td rowspan="2">width</td><td>HR</td><td rowspan="2">Near-equiv style 'width', needs px suffix if original was pixels</td></tr>
|
||||
<tr><td>TD, TH</td></tr>
|
||||
<tr class="impl-yes"><td>vspace</td><td>IMG</td><td>Near-equiv styles 'margin-left' and 'margin-right', needs px suffix, see hspace</td></tr>
|
||||
<tr class="impl-yes"><td rowspan="2">width</td><td>HR</td><td rowspan="2">Near-equiv style 'width', needs px suffix if original was pixels</td></tr>
|
||||
<tr class="impl-yes"><td>TD, TH</td></tr>
|
||||
</tbody>
|
||||
|
||||
</table>
|
||||
|
||||
<div id="version">$Id$</div>
|
||||
|
||||
</body></html>
|
||||
</body></html>
|
||||
|
798
docs/enduser-customize.html
Normal file
798
docs/enduser-customize.html
Normal file
@@ -0,0 +1,798 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||
<meta name="description" content="Tutorial for customizing HTML Purifier's tag and attribute sets." />
|
||||
<link rel="stylesheet" type="text/css" href="style.css" />
|
||||
|
||||
<title>Customize - HTML Purifier</title>
|
||||
|
||||
</head><body>
|
||||
|
||||
<h1 class="subtitled">Customize!</h1>
|
||||
<div class="subtitle">HTML Purifier is a Swiss-Army Knife</div>
|
||||
|
||||
<div id="filing">Filed under End-User</div>
|
||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
|
||||
<p>
|
||||
You may have heard of the <a href="dev-advanced-api.html">Advanced API</a>.
|
||||
If you're interested in reading dry prose and boring functional
|
||||
specifications, feel free to click that link to get a no-nonsense overview
|
||||
on the Advanced API. For the rest of us, there's this tutorial. By the time
|
||||
you're finished reading this, you should have a pretty good idea on
|
||||
how to implement custom tags and attributes that HTML Purifier may not have.
|
||||
</p>
|
||||
|
||||
<h2>Is it necessary?</h2>
|
||||
|
||||
<p>
|
||||
Before we even write any code, it is paramount to consider whether or
|
||||
not the code we're writing is necessary or not. HTML Purifier, by default,
|
||||
contains a large set of elements and attributes: large enough so that
|
||||
<em>any</em> element or attribute in XHTML 1.0 or 1.1 (and its HTML variants)
|
||||
that can be safely used by the general public is implemented.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
So what needs to be implemented? (Feel free to skip this section if
|
||||
you know what you want).
|
||||
</p>
|
||||
|
||||
<h3>XHTML 1.0</h3>
|
||||
|
||||
<p>
|
||||
All of the modules listed below are based off of the
|
||||
<a href="http://www.w3.org/TR/2001/REC-xhtml-modularization-20010410/abstract_modules.html#sec_5.2.">modularization of
|
||||
XHTML</a>, which, while technically for XHTML 1.1, is quite a useful
|
||||
resource.
|
||||
</p>
|
||||
|
||||
<ul>
|
||||
<li>Structure</li>
|
||||
<li>Frames</li>
|
||||
<li>Applets (deprecated)</li>
|
||||
<li>Forms</li>
|
||||
<li>Image maps</li>
|
||||
<li>Objects</li>
|
||||
<li>Frames</li>
|
||||
<li>Events</li>
|
||||
<li>Meta-information</li>
|
||||
<li>Style sheets</li>
|
||||
<li>Link (not hypertext)</li>
|
||||
<li>Base</li>
|
||||
<li>Name</li>
|
||||
</ul>
|
||||
|
||||
<p>
|
||||
If you don't recognize it, you probably don't need it. But the curious
|
||||
can look all of these modules up in the above-mentioned document. Note
|
||||
that inline scripting comes packaged with HTML Purifier (more on this
|
||||
later).
|
||||
</p>
|
||||
|
||||
<h3>XHTML 1.1</h3>
|
||||
|
||||
<p>
|
||||
As of HTMLPurifier 2.1.0, we have implemented the
|
||||
<a href="http://www.w3.org/TR/2001/REC-ruby-20010531/">Ruby module</a>,
|
||||
which defines a set of tags
|
||||
for publishing short annotations for text, used mostly in Japanese
|
||||
and Chinese school texts, but applicable for positioning any text (not
|
||||
limited to translations) above or below other corresponding text.
|
||||
</p>
|
||||
|
||||
<h3>XHTML 2.0</h3>
|
||||
|
||||
<p>
|
||||
<a href="http://www.w3.org/TR/xhtml2/">XHTML 2.0</a> is still a
|
||||
working draft, so any elements introduced in the
|
||||
specification have not been implemented and will not be implemented
|
||||
until we get a recommendation or proposal. Because XHTML 2.0 is
|
||||
an entirely new markup language, implementing rules for it will be
|
||||
no easy task.
|
||||
</p>
|
||||
|
||||
<h3>HTML 5</h3>
|
||||
|
||||
<p>
|
||||
<a href="http://www.whatwg.org/specs/web-apps/current-work/">HTML 5</a>
|
||||
is a fork of HTML 4.01 by WHATWG, who believed that XHTML 2.0 was headed
|
||||
in the wrong direction. It too is a working draft, and may change
|
||||
drastically before publication, but it should be noted that the
|
||||
<code>canvas</code> tag has been implemented by many browser vendors.
|
||||
</p>
|
||||
|
||||
<h3>Proprietary</h3>
|
||||
|
||||
<p>
|
||||
There are a number of proprietary tags still in the wild. Many of them
|
||||
have been documented in <a href="ref-proprietary-tags.txt">ref-proprietary-tags.txt</a>,
|
||||
but there is currently no implementation for any of them.
|
||||
</p>
|
||||
|
||||
<h3>Extensions</h3>
|
||||
|
||||
<p>
|
||||
There are also a number of other XML languages out there that can
|
||||
be embedded in HTML documents: two of the most popular are MathML and
|
||||
SVG, and I frequently get requests to implement these. But they are
|
||||
expansive, comprehensive specifications, and it would take far too long
|
||||
to implement them <em>correctly</em> (most systems I've seen go as far
|
||||
as whitelisting tags and no further; come on, what about nesting!)
|
||||
</p>
|
||||
|
||||
<p>
|
||||
Word of warning: HTML Purifier is currently <em>not</em> namespace
|
||||
aware.
|
||||
</p>
|
||||
|
||||
<h2>Giving back</h2>
|
||||
|
||||
<p>
|
||||
As you may imagine from the details above (don't be abashed if you didn't
|
||||
read it all: a glance over would have done), there's quite a bit that
|
||||
HTML Purifier doesn't implement. Recent architectural changes have
|
||||
allowed HTML Purifier to implement elements and attributes that are not
|
||||
safe! Don't worry, they won't be activated unless you set %HTML.Trusted
|
||||
to true, but they certainly help out users who need to put, say, forms
|
||||
on their page and don't want to go through the trouble of reading this
|
||||
and implementing it themself.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
So any of the above that you implement for your own application could
|
||||
help out some other poor sap on the other side of the globe. Help us
|
||||
out, and send back code so that it can be hammered into a module and
|
||||
released with the core. Any code would be greatly appreciated!
|
||||
</p>
|
||||
|
||||
<h2>And now...</h2>
|
||||
|
||||
<p>
|
||||
Enough philosophical talk, time for some code:
|
||||
</p>
|
||||
|
||||
<pre>$config = HTMLPurifier_Config::createDefault();
|
||||
$config->set('HTML', 'DefinitionID', 'enduser-customize.html tutorial');
|
||||
$config->set('HTML', 'DefinitionRev', 1);
|
||||
$def = $config->getHTMLDefinition(true);</pre>
|
||||
|
||||
<p>
|
||||
Assuming that HTML Purifier has already been properly loaded (hint:
|
||||
include <code>HTMLPurifier.auto.php</code>), this code will set up
|
||||
the environment that you need to start customizing the HTML definition.
|
||||
What's going on?
|
||||
</p>
|
||||
|
||||
<ul>
|
||||
<li>
|
||||
The first three lines are regular configuration code:
|
||||
<ul>
|
||||
<li>
|
||||
%HTML.DefinitionID is set to a unique identifier for your
|
||||
custom HTML definition. This prevents it from clobbering
|
||||
other custom definitions on the same installation.
|
||||
</li>
|
||||
<li>
|
||||
%HTML.DefinitionRev is a revision integer of your HTML
|
||||
definition. Because HTML definitions are cached, you'll need
|
||||
to increment this whenever you make a change in order to flush
|
||||
the cache.
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
<li>
|
||||
The fourth line retrieves a raw <code>HTMLPurifier_HTMLDefinition</code>
|
||||
object that we will be tweaking. If the parameter was removed, we
|
||||
would be retrieving a fully formed definition object, which is somewhat
|
||||
useless for customization purposes.
|
||||
</li>
|
||||
</ul>
|
||||
|
||||
<h3>Broken backwards-compatibility</h3>
|
||||
|
||||
<p>
|
||||
Those of you who have already been twiddling around with the raw
|
||||
HTML definition object, you'll be noticing that you're getting an error
|
||||
when you attempt to retrieve the raw definition object without specifying
|
||||
a DefinitionID. It is vital to caching (see below) that you make a unique
|
||||
name for your customized definition, so make up something right now and
|
||||
things will operate again.
|
||||
</p>
|
||||
|
||||
<h2>Turn off caching</h2>
|
||||
|
||||
<p>
|
||||
To make development easier, we're going to temporarily turn off
|
||||
definition caching:
|
||||
</p>
|
||||
|
||||
<pre>$config = HTMLPurifier_Config::createDefault();
|
||||
$config->set('HTML', 'DefinitionID', 'enduser-customize.html tutorial');
|
||||
$config->set('HTML', 'DefinitionRev', 1);
|
||||
<strong>$config->set('Core', 'DefinitionCache', null); // remove this later!</strong>
|
||||
$def = $config->getHTMLDefinition(true);</pre>
|
||||
|
||||
<p>
|
||||
A few things should be mentioned about the caching mechanism before
|
||||
we move on. For performance reasons, HTML Purifier caches generated
|
||||
<code>HTMLPurifier_Definition</code> objects in serialized files
|
||||
stored (by default) in <code>library/HTMLPurifier/DefinitionCache/Serializer</code>.
|
||||
A lot of processing is done in order to create these objects, so it
|
||||
makes little sense to repeat the same processing over and over again
|
||||
whenever HTML Purifier is called.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
In order to identify a cache entry, HTML Purifier uses three variables:
|
||||
the library's version number, the value of %HTML.DefinitionRev and
|
||||
a serial of relevant configuration. Whenever any of these changes,
|
||||
a new HTML definition is generated. Notice that there is no way
|
||||
for the definition object to track changes to customizations: here, it
|
||||
is up to you to supply appropriate information to DefinitionID and
|
||||
DefinitionRev.
|
||||
</p>
|
||||
|
||||
<h2 id="addAttribute">Add an attribute</h2>
|
||||
|
||||
<p>
|
||||
For this example, we're going to implement the <code>target</code> attribute found
|
||||
on <code>a</code> elements. To implement an attribute, we have to
|
||||
ask a few questions:
|
||||
</p>
|
||||
|
||||
<ol>
|
||||
<li>What element is it found on?</li>
|
||||
<li>What is its name?</li>
|
||||
<li>Is it required or optional?</li>
|
||||
<li>What are valid values for it?</li>
|
||||
</ol>
|
||||
|
||||
<p>
|
||||
The first three are easy: the element is <code>a</code>, the attribute
|
||||
is <code>target</code>, and it is not a required attribute. (If it
|
||||
was required, we'd need to append an asterisk to the attribute name,
|
||||
you'll see an example of this in the addElement() example).
|
||||
</p>
|
||||
|
||||
<p>
|
||||
The last question is a little trickier.
|
||||
Lets allow the special values: _blank, _self, _target and _top.
|
||||
The form of this is called an <strong>enumeration</strong>, a list of
|
||||
valid values, although only one can be used at a time. To translate
|
||||
this into code form, we write:
|
||||
</p>
|
||||
|
||||
<pre>$config = HTMLPurifier_Config::createDefault();
|
||||
$config->set('HTML', 'DefinitionID', 'enduser-customize.html tutorial');
|
||||
$config->set('HTML', 'DefinitionRev', 1);
|
||||
$config->set('Core', 'DefinitionCache', null); // remove this later!
|
||||
$def = $config->getHTMLDefinition(true);
|
||||
<strong>$def->addAttribute('a', 'target', 'Enum#_blank,_self,_target,_top');</strong></pre>
|
||||
|
||||
<p>
|
||||
The <code>Enum#_blank,_self,_target,_top</code> does all the magic.
|
||||
The string is split into two parts, separated by a hash mark (#):
|
||||
</p>
|
||||
|
||||
<ol>
|
||||
<li>The first part is the name of what we call an <code>AttrDef</code></li>
|
||||
<li>The second part is the parameter of the above-mentioned <code>AttrDef</code></li>
|
||||
</ol>
|
||||
|
||||
<p>
|
||||
If that sounds vague and generic, it's because it is! HTML Purifier defines
|
||||
an assortment of different attribute types one can use, and each of these
|
||||
has their own specialized parameter format. Here are some of the more useful
|
||||
ones:
|
||||
</p>
|
||||
|
||||
<table class="table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Type</th>
|
||||
<th>Format</th>
|
||||
<th>Description</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>Enum</th>
|
||||
<td><em>[s:]</em>value1,value2,...</td>
|
||||
<td>
|
||||
Attribute with a number of valid values, one of which may be used. When
|
||||
s: is present, the enumeration is case sensitive.
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Bool</th>
|
||||
<td>attribute_name</td>
|
||||
<td>
|
||||
Boolean attribute, with only one valid value: the name
|
||||
of the attribute.
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>CDATA</th>
|
||||
<td></td>
|
||||
<td>
|
||||
Attribute of arbitrary text. Can also be referred to as <strong>Text</strong>
|
||||
(the specification makes a semantic distinction between the two).
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>ID</th>
|
||||
<td></td>
|
||||
<td>
|
||||
Attribute that specifies a unique ID
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Pixels</th>
|
||||
<td></td>
|
||||
<td>
|
||||
Attribute that specifies an integer pixel length
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Length</th>
|
||||
<td></td>
|
||||
<td>
|
||||
Attribute that specifies a pixel or percentage length
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>NMTOKENS</th>
|
||||
<td></td>
|
||||
<td>
|
||||
Attribute that specifies a number of name tokens, example: the
|
||||
<code>class</code> attribute
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>URI</th>
|
||||
<td></td>
|
||||
<td>
|
||||
Attribute that specifies a URI, example: the <code>href</code>
|
||||
attribute
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Number</th>
|
||||
<td></td>
|
||||
<td>
|
||||
Attribute that specifies an positive integer number
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<p>
|
||||
For a complete list, consult
|
||||
<a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/AttrTypes.php"><code>library/HTMLPurifier/AttrTypes.php</code></a>;
|
||||
more information on attributes that accept parameters can be found on their
|
||||
respective includes in
|
||||
<a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/AttrDef/"><code>library/HTMLPurifier/AttrDef</code></a>.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
Sometimes, the restrictive list in AttrTypes just doesn't cut it. Don't
|
||||
sweat: you can also use a fully instantiated object as the value. The
|
||||
equivalent, verbose form of the above example is:
|
||||
</p>
|
||||
|
||||
<pre>$config = HTMLPurifier_Config::createDefault();
|
||||
$config->set('HTML', 'DefinitionID', 'enduser-customize.html tutorial');
|
||||
$config->set('HTML', 'DefinitionRev', 1);
|
||||
$config->set('Core', 'DefinitionCache', null); // remove this later!
|
||||
$def = $config->getHTMLDefinition(true);
|
||||
<strong>$def->addAttribute('a', 'target', new HTMLPurifier_AttrDef_Enum(
|
||||
array('_blank','_self','_target','_top')
|
||||
));</strong></pre>
|
||||
|
||||
<p>
|
||||
Trust me, you'll learn to love the shorthand.
|
||||
</p>
|
||||
|
||||
<h2>Add an element</h2>
|
||||
|
||||
<p>
|
||||
Adding attributes is really small-fry stuff, though, and it was possible
|
||||
to add them (albeit a bit more wordy) prior to 2.0. The real gem of
|
||||
the Advanced API is adding elements. There are five questions to
|
||||
ask when adding a new element:
|
||||
</p>
|
||||
|
||||
<ol>
|
||||
<li>What is the element's name?</li>
|
||||
<li>What content set does this element belong to?</li>
|
||||
<li>What are the allowed children of this element?</li>
|
||||
<li>What attributes does the element allow that are general?</li>
|
||||
<li>What attributes does the element allow that are specific to this element?</li>
|
||||
</ol>
|
||||
|
||||
<p>
|
||||
It's a mouthful, and you'll be slightly lost if your not familiar with
|
||||
the HTML specification, so let's explain them step by step.
|
||||
</p>
|
||||
|
||||
<h3>Content set</h3>
|
||||
|
||||
<p>
|
||||
The HTML specification defines two major content sets: Inline
|
||||
and Block. Each of these
|
||||
content sets contain a list of elements: Inline contains things like
|
||||
<code>span</code> and <code>b</code> while Block contains things like
|
||||
<code>div</code> and <code>blockquote</code>.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
These content sets amount to a macro mechanism for HTML definition. Most
|
||||
elements in HTML are organized into one of these two sets, and most
|
||||
elements in HTML allow elements from one of these sets. If we had
|
||||
to write each element verbatim into each other element's allowed
|
||||
children, we would have ridiculously large lists; instead we use
|
||||
content sets to compactify the declaration.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
Practically speaking, there are several useful values you can use here:
|
||||
</p>
|
||||
|
||||
<table class="table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Content set</th>
|
||||
<th>Description</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>Inline</th>
|
||||
<td>Character level elements, text</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Block</th>
|
||||
<td>Block-like elements, like paragraphs and lists</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th><em>false</em></th>
|
||||
<td>
|
||||
Any element that doesn't fit into the mold, for example <code>li</code>
|
||||
or <code>tr</code>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<p>
|
||||
By specifying a valid value here, all other elements that use that
|
||||
content set will also allow your element, without you having to do
|
||||
anything. If you specify <em>false</em>, you'll have to register
|
||||
your element manually.
|
||||
</p>
|
||||
|
||||
<h3>Allowed children</h3>
|
||||
|
||||
<p>
|
||||
Allowed children defines the elements that this element can contain.
|
||||
The allowed values may range from none to a complex regexp depending on
|
||||
your element.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
If you've ever taken a look at the HTML DTD's before, you may have
|
||||
noticed declarations like this:
|
||||
</p>
|
||||
|
||||
<pre><!ELEMENT LI - O (%flow;)* -- list item --></pre>
|
||||
|
||||
<p>
|
||||
The <code>(%flow;)*</code> indicates the allowed children of the
|
||||
<code>li</code> tag: <code>li</code> allows any number of flow
|
||||
elements as its children. (The <code>- O</code> allows the closing tag to be
|
||||
omitted, though in XML this is not allowed.) In HTML Purifier,
|
||||
we'd write it like <code>Flow</code> (here's where the content sets
|
||||
we were discussing earlier come into play). There are three shorthand
|
||||
content models you can specify:
|
||||
</p>
|
||||
|
||||
<table class="table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Content model</th>
|
||||
<th>Description</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>Empty</th>
|
||||
<td>No children allowed, like <code>br</code> or <code>hr</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Inline</th>
|
||||
<td>Any number of inline elements and text, like <code>span</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Flow</th>
|
||||
<td>Any number of inline elements, block elements and text, like <code>div</code></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<p>
|
||||
This covers 90% of all the cases out there, but what about elements that
|
||||
break the mold like <code>ul</code>? This guy requires at least one
|
||||
child, and the only valid children for it are <code>li</code>. The
|
||||
content model is: <code>Required: li</code>. There are two parts: the
|
||||
first type determines what <code>ChildDef</code> will be used to validate
|
||||
content models. The most common values are:
|
||||
</p>
|
||||
|
||||
<table class="table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Type</th>
|
||||
<th>Description</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>Required</th>
|
||||
<td>Children must be one or more of the valid elements</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Optional</th>
|
||||
<td>Children can be any number of the valid elements</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Custom</th>
|
||||
<td>Children must follow the DTD-style regex</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<p>
|
||||
You can also implement your own <code>ChildDef</code>: this was done
|
||||
for a few special cases in HTML Purifier such as <code>Chameleon</code>
|
||||
(for <code>ins</code> and <code>del</code>), <code>StrictBlockquote</code>
|
||||
and <code>Table</code>.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
The second part specifies either valid elements or a regular expression.
|
||||
Valid elements are separated with horizontal bars (|), i.e.
|
||||
"<code>a | b | c</code>". Use #PCDATA to represent plain text.
|
||||
Regular expressions are based off of DTD's style:
|
||||
</p>
|
||||
|
||||
<ul>
|
||||
<li>Parentheses () are used for grouping</li>
|
||||
<li>Commas (,) separate elements that should come one after another</li>
|
||||
<li>Horizontal bars (|) indicate one or the other elements should be used</li>
|
||||
<li>Plus signs (+) are used for a one or more match</li>
|
||||
<li>Asterisks (*) are used for a zero or more match</li>
|
||||
<li>Question marks (?) are used for a zero or one match</li>
|
||||
</ul>
|
||||
|
||||
<p>
|
||||
For example, "<code>a, b?, (c | d), e+, f*</code>" means "In this order,
|
||||
one <code>a</code> element, at most one <code>b</code> element,
|
||||
one <code>c</code> or <code>d</code> element (but not both), one or more
|
||||
<code>e</code> elements, and any number of <code>f</code> elements."
|
||||
Regex veterans should be able to jump right in, and those not so savvy
|
||||
can always copy-paste W3C's content model definitions into HTML Purifier
|
||||
and hope for the best.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
A word of warning: while the regex format is extremely flexible on
|
||||
the developer's side, it is
|
||||
quite unforgiving on the user's side. If the user input does not <em>exactly</em>
|
||||
match the specification, the entire contents of the element will
|
||||
be nuked. This is why there is are specific content model types like
|
||||
Optional and Required: while they could be implemented as <code>Custom:
|
||||
(valid | elements)*</code>, the custom classes contain special recovery
|
||||
measures that make sure as much of the user's original content gets
|
||||
through. HTML Purifier's core, as a rule, does not use Custom.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
One final note: you can also use Content Sets inside your valid elements
|
||||
lists or regular expressions. In fact, the three shorthand content models
|
||||
mentioned above are just that: abbreviations:
|
||||
</p>
|
||||
|
||||
<table class="table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Content model</th>
|
||||
<th>Implementation</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>Inline</th>
|
||||
<td>Optional: Inline | #PCDATA</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Flow</th>
|
||||
<td>Optional: Flow | #PCDATA</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<p>
|
||||
When the definition is compiled, Inline will be replaced with a
|
||||
horizontal-bar separated list of inline elements. Also, notice that
|
||||
it does not contain text: you have to specify that yourself.
|
||||
</p>
|
||||
|
||||
<h3>Common attributes</h3>
|
||||
|
||||
<p>
|
||||
Congratulations: you have just gotten over the proverbial hump (Allowed
|
||||
children). Common attributes is much simpler, and boils down to
|
||||
one question: does your element have the <code>id</code>, <code>style</code>,
|
||||
<code>class</code>, <code>title</code> and <code>lang</code> attributes?
|
||||
If so, you'll want to specify the <code>Common</code> attribute collection,
|
||||
which contains these five attributes that are found on almost every
|
||||
HTML element in the specification.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
There are a few more collections, but they're really edge cases:
|
||||
</p>
|
||||
|
||||
<table class="table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Collection</th>
|
||||
<th>Attributes</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>I18N</th>
|
||||
<td><code>lang</code>, possibly <code>xml:lang</code></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Core</th>
|
||||
<td><code>style</code>, <code>class</code>, <code>id</code> and <code>title</code></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<p>
|
||||
Common is a combination of the above-mentioned collections.
|
||||
</p>
|
||||
|
||||
<p class="aside">
|
||||
Readers familiar with the modularization may have noticed that the Core
|
||||
attribute collection differs from that specified by the <a
|
||||
href="http://www.w3.org/TR/xhtml-modularization/abstract_modules.html#s_commonatts">abstract
|
||||
modules of the XHTML Modularization 1.1</a>. We believe this section
|
||||
to be in error, as <code>br</code> permits the use of the <code>style</code>
|
||||
attribute even though it uses the <code>Core</code> collection, and
|
||||
the DTD and XML Schemas supplied by W3C support our interpretation.
|
||||
</p>
|
||||
|
||||
<h3>Attributes</h3>
|
||||
|
||||
<p>
|
||||
If you didn't read the <a href="#addAttribute">earlier section on
|
||||
adding attributes</a>, read it now. The last parameter is simply
|
||||
an array of attribute names to attribute implementations, in the exact
|
||||
same format as <code>addAttribute()</code>.
|
||||
</p>
|
||||
|
||||
<h3>Putting it all together</h3>
|
||||
|
||||
<p>
|
||||
We're going to implement <code>form</code>. Before we embark, lets
|
||||
grab a reference implementation from over at the
|
||||
<a href="http://www.w3.org/TR/html4/sgml/loosedtd.html">transitional DTD</a>:
|
||||
</p>
|
||||
|
||||
<pre><!ELEMENT FORM - - (%flow;)* -(FORM) -- interactive form -->
|
||||
<!ATTLIST FORM
|
||||
%attrs; -- %coreattrs, %i18n, %events --
|
||||
action %URI; #REQUIRED -- server-side form handler --
|
||||
method (GET|POST) GET -- HTTP method used to submit the form--
|
||||
enctype %ContentType; "application/x-www-form-urlencoded"
|
||||
accept %ContentTypes; #IMPLIED -- list of MIME types for file upload --
|
||||
name CDATA #IMPLIED -- name of form for scripting --
|
||||
onsubmit %Script; #IMPLIED -- the form was submitted --
|
||||
onreset %Script; #IMPLIED -- the form was reset --
|
||||
target %FrameTarget; #IMPLIED -- render in this frame --
|
||||
accept-charset %Charsets; #IMPLIED -- list of supported charsets --
|
||||
></pre>
|
||||
|
||||
<p>
|
||||
Juicy! With just this, we can answer four of our five questions:
|
||||
</p>
|
||||
|
||||
<ol>
|
||||
<li>What is the element's name? <strong>form</strong></li>
|
||||
<li>What content set does this element belong to? <strong>Block</strong>
|
||||
(this needs a little sleuthing, I find the easiest way is to search
|
||||
the DTD for <code>FORM</code> and determine which set it is in.)</li>
|
||||
<li>What are the allowed children of this element? <strong>One
|
||||
or more flow elements, but no nested <code>form</code>s</strong></li>
|
||||
<li>What attributes does the element allow that are general? <strong>Common</strong></li>
|
||||
<li>What attributes does the element allow that are specific to this element? <strong>A whole bunch, see ATTLIST;
|
||||
we're going to the vital ones: <code>action</code>, <code>method</code> and <code>name</code></strong></li>
|
||||
</ol>
|
||||
|
||||
<p>
|
||||
Time for some code:
|
||||
</p>
|
||||
|
||||
<pre>$config = HTMLPurifier_Config::createDefault();
|
||||
$config->set('HTML', 'DefinitionID', 'enduser-customize.html tutorial');
|
||||
$config->set('HTML', 'DefinitionRev', 1);
|
||||
$config->set('Core', 'DefinitionCache', null); // remove this later!
|
||||
$def = $config->getHTMLDefinition(true);
|
||||
$def->addAttribute('a', 'target', new HTMLPurifier_AttrDef_Enum(
|
||||
array('_blank','_self','_target','_top')
|
||||
));
|
||||
<strong>$form = $def->addElement(
|
||||
'form', // name
|
||||
'Block', // content set
|
||||
'Flow', // allowed children
|
||||
'Common', // attribute collection
|
||||
array( // attributes
|
||||
'action*' => 'URI',
|
||||
'method' => 'Enum#get|post',
|
||||
'name' => 'ID'
|
||||
)
|
||||
);
|
||||
$form->excludes = array('form' => true);</strong></pre>
|
||||
|
||||
<p>
|
||||
Each of the parameters corresponds to one of the questions we asked.
|
||||
Notice that we added an asterisk to the end of the <code>action</code>
|
||||
attribute to indicate that it is required. If someone specifies a
|
||||
<code>form</code> without that attribute, the tag will be axed.
|
||||
Also, the extra line at the end is a special extra declaration that
|
||||
prevents forms from being nested within each other.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
And that's all there is to it! Implementing the rest of the form
|
||||
module is left as an exercise to the user; to see more examples
|
||||
check the <a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/HTMLModule/"><code>library/HTMLPurifier/HTMLModule/</code></a> directory
|
||||
in your local HTML Purifier installation.
|
||||
</p>
|
||||
|
||||
<h2>And beyond...</h2>
|
||||
|
||||
<p>
|
||||
Perceptive users may have realized that, to a certain extent, we
|
||||
have simply re-implemented the facilities of XML Schema or the
|
||||
Document Type Definition. What you are seeing here, however, is
|
||||
not just an XML Schema or Document Type Definition: it is a fully
|
||||
expressive method of specifying the definition of HTML that is
|
||||
a portable superset of the capabilities of the two above-mentioned schema
|
||||
languages. What makes HTMLDefinition so powerful is the fact that
|
||||
if we don't have an implementation for a content model or an attribute
|
||||
definition, you can supply it yourself by writing a PHP class.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
There are many facets of HTMLDefinition beyond the Advanced API I have
|
||||
walked you through today. To find out more about these, you can
|
||||
check out these source files:
|
||||
</p>
|
||||
|
||||
<ul>
|
||||
<li><a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/HTMLModule.php"><code>library/HTMLPurifier/HTMLModule.php</code></a></li>
|
||||
<li><a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/ElementDef.php"><code>library/HTMLPurifier/ElementDef.php</code></a></li>
|
||||
</ul>
|
||||
|
||||
<div id="version">$Id: enduser-tidy.html 1158 2007-06-18 19:26:29Z Edward $</div>
|
||||
|
||||
</body></html>
|
@@ -15,7 +15,7 @@
|
||||
|
||||
<div id="filing">Filed under End-User</div>
|
||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||
<div id="home"><a href="http://hp.jpsband.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
|
||||
<p>Prior to HTML Purifier 1.2.0, this library blithely accepted user input that
|
||||
looked like this:</p>
|
||||
@@ -58,7 +58,7 @@ appear elsewhere on the document. The method is simple:</p>
|
||||
|
||||
<pre>$config->set('HTML', 'EnableAttrID', true);
|
||||
$config->set('Attr', 'IDBlacklist' array(
|
||||
'list', 'of', 'attributes', 'that', 'are', 'forbidden'
|
||||
'list', 'of', 'attribute', 'values', 'that', 'are', 'forbidden'
|
||||
));</pre>
|
||||
|
||||
<p>That being said, there are some notable drawbacks. First of all, you have to
|
||||
@@ -71,9 +71,9 @@ to possible standards-compliance issues.</p>
|
||||
<p>Furthermore, this position becomes untenable when a single web page must hold
|
||||
multiple portions of user-submitted content. Since there's obviously no way
|
||||
to find out before-hand what IDs users will use, the blacklist is helpless.
|
||||
And even since HTML Purifier validates each segment seperately, perhaps doing
|
||||
And since HTML Purifier validates each segment separately, perhaps doing
|
||||
so at different times, it would be extremely difficult to dynamically update
|
||||
the blacklist inbetween runs.</p>
|
||||
the blacklist in between runs.</p>
|
||||
|
||||
<p>Finally, simply destroying the ID is extremely un-userfriendly behavior: after
|
||||
all, they might have simply specified a duplicate ID by accident.</p>
|
||||
@@ -144,4 +144,4 @@ anchors is beyond me.</p>
|
||||
<div id="version">$Id$</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
</html>
|
||||
|
@@ -36,7 +36,7 @@ forgiving lexer. You may also be interested in the unit tests located in the
|
||||
tests/ folder, which provide a living document on how exactly the filter deals
|
||||
with malformed input.
|
||||
|
||||
In summary:
|
||||
In summary (see corresponding classes for more details):
|
||||
|
||||
1. Parse document into an array of tag and text tokens (Lexer)
|
||||
2. Remove all elements not on whitelist and transform certain other elements
|
||||
|
@@ -6,45 +6,11 @@ through negligence of people. This class will do its job: no more, no less,
|
||||
and it's up to you to provide it the proper information and proper context
|
||||
to be effective. Things to remember:
|
||||
|
||||
1. Character Encoding: UTF-8.
|
||||
This segment will soon be obsoleted by enduser-utf8.html
|
||||
Currently, the parser runs under the assumption that it is dealing
|
||||
with UTF-8. Not ISO-8859-1 or Windows-1252, UTF-8. And definitely not "no
|
||||
character encoding explicitly stated" or UTF-7. If you're not using UTF-8 as
|
||||
your character encoding, make sure you configure HTML Purifier or switch
|
||||
to UTF-8. Now. Also, make sure any input is properly converted to UTF-8, or
|
||||
the parser will mangle it badly (though it won't be a security risk if you're
|
||||
outputting it as UTF-8 though). Character encoding is, in general, a knotty
|
||||
issue, but do yourself a favor and learn about it:
|
||||
<http://www.joelonsoftware.com/articles/Unicode.html>
|
||||
1. Character Encoding: see enduser-utf8.html for more info.
|
||||
|
||||
2. Doctype: XHTML 1.0 Transitional
|
||||
This is what the parser is outputting. For the most
|
||||
part, it's compatible with HTML 4.01, but XHTML enforces some very nice things
|
||||
that all web developers should use. Regardless, NO DOCTYPE is a NO. Quirks mode
|
||||
has waaaay too many quirks for a little parser to handle. We did not select
|
||||
strict in order to prevent ourselves from being too draconic on users, but
|
||||
this may be configurable in the future. Do you want standards compliance?
|
||||
The doctype is a good place to start.
|
||||
2. IDs: see enduser-id.html for more info
|
||||
|
||||
3. IDs
|
||||
This segment is obsoleted by enduser-id.html
|
||||
They need to be unique, but without some knowledge of the
|
||||
rest of the document, it's difficult to know what's unique. %Attr.IDBlacklist
|
||||
needs to be set: we may want to consider disallowing IDs by default to
|
||||
save lazy programmers.
|
||||
3. URIs: see enduser-uri-filter.html
|
||||
|
||||
4. [PROJECTED] Links
|
||||
We're not going to try for spam protection (although
|
||||
some hooks for such a module might be nice) but we may offer the ability to
|
||||
only accept relative URLs. Pick the one that's right for you.
|
||||
|
||||
5. CSS
|
||||
While we can prevent the most flagrant cases from affecting your
|
||||
layout (such as absolutely positioned elements), no amount of code is going
|
||||
to protect your pages from being attacked by garish colors and plain old
|
||||
bad taste. A neat feature would be the ability to define acceptable colors
|
||||
in a document, but that's not likely to be implemented for a while. In the
|
||||
meantime, be sure to make sure that floated elements (permitted, since they
|
||||
can be quite useful) can't mess up your layout. Once again, we may want to
|
||||
disable this by default to protect lazy developers.
|
||||
4. CSS: document pending
|
||||
Explain which CSS styles we blocked and why.
|
||||
|
@@ -15,7 +15,7 @@
|
||||
|
||||
<div id="filing">Filed under End-User</div>
|
||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||
<div id="home"><a href="http://hp.jpsband.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
|
||||
<p>HTML Purifier is a very powerful library. But with power comes great
|
||||
responsibility, in the form of longer execution times. Remember, this
|
||||
@@ -114,4 +114,4 @@ if you decide to do that! Especially if you port HTML Purifier to C++.
|
||||
<tt>;-)</tt></p>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
</html>
|
||||
|
230
docs/enduser-tidy.html
Normal file
230
docs/enduser-tidy.html
Normal file
@@ -0,0 +1,230 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||
<meta name="description" content="Tutorial for tweaking HTML Purifier's Tidy-like behavior." />
|
||||
<link rel="stylesheet" type="text/css" href="style.css" />
|
||||
|
||||
<title>Tidy - HTML Purifier</title>
|
||||
|
||||
</head><body>
|
||||
|
||||
<h1>Tidy</h1>
|
||||
|
||||
<div id="filing">Filed under Development</div>
|
||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
|
||||
<p>You've probably heard of HTML Tidy, Dave Raggett's little piece
|
||||
of software that cleans up poorly written HTML. Let me say it straight
|
||||
out:</p>
|
||||
|
||||
<p class="emphasis">This ain't HTML Tidy!</p>
|
||||
|
||||
<p>Rather, Tidy stands for a cool set of Tidy-inspired features in HTML Purifier
|
||||
that allows users to submit deprecated elements and attributes and get
|
||||
valid strict markup back. For example:</p>
|
||||
|
||||
<pre><center>Centered</center></pre>
|
||||
|
||||
<p>...becomes:</p>
|
||||
|
||||
<pre><div style="text-align:center;">Centered</div></pre>
|
||||
|
||||
<p>...when this particular fix is run on the HTML. This tutorial will give
|
||||
you the lowdown of what exactly HTML Purifier will do when Tidy
|
||||
is on, and how to fine-tune this behavior. Once again, <strong>you do
|
||||
not need Tidy installed on your PHP to use these features!</strong></p>
|
||||
|
||||
<h2>What does it do?</h2>
|
||||
|
||||
<p>Tidy will do several things to your HTML:</p>
|
||||
|
||||
<ul>
|
||||
<li>Convert deprecated elements and attributes to standards-compliant
|
||||
alternatives</li>
|
||||
<li>Enforce XHTML compatibility guidelines and other best practices</li>
|
||||
<li>Preserve data that would normally be removed as per W3C</li>
|
||||
</ul>
|
||||
|
||||
<h2>What are levels?</h2>
|
||||
|
||||
<p>Levels describe how aggressive the Tidy module should be when
|
||||
cleaning up HTML. There are four levels to pick: none, light, medium
|
||||
and heavy. Each of these levels has a well-defined set of behavior
|
||||
associated with it, although it may change depending on your doctype.</p>
|
||||
|
||||
<dl>
|
||||
<dt>light</dt>
|
||||
<dd>This is the <strong>lenient</strong> level. If a tag or attribute
|
||||
is about to be removed because it isn't supported by the
|
||||
doctype, Tidy will step in and change into an alternative that
|
||||
is supported.</dd>
|
||||
<dt>medium</dt>
|
||||
<dd>This is the <strong>correctional</strong> level. At this level,
|
||||
all the functions of light are performed, as well as some extra,
|
||||
non-essential best practices enforcement. Changes made on this
|
||||
level are very benign and are unlikely to cause problems.</dd>
|
||||
<dt>heavy</dt>
|
||||
<dd>This is the <strong>aggressive</strong> level. If a tag or
|
||||
attribute is deprecated, it will be converted into a non-deprecated
|
||||
version, no ifs ands or buts.</dd>
|
||||
</dl>
|
||||
|
||||
<p>By default, Tidy operates on the <strong>medium</strong> level. You can
|
||||
change the level of cleaning by setting the %HTML.TidyLevel configuration
|
||||
directive:</p>
|
||||
|
||||
<pre>$config->set('HTML', 'TidyLevel', 'heavy'); // burn baby burn!</pre>
|
||||
|
||||
<h2>Is the light level really light?</h2>
|
||||
|
||||
<p>It depends on what doctype you're using. If your documents are HTML
|
||||
4.01 <em>Transitional</em>, HTML Purifier will be lazy
|
||||
and won't clean up your <code>center</code>
|
||||
or <code>font</code> tags. But if you're using HTML 4.01 <em>Strict</em>,
|
||||
HTML Purifier has no choice: it has to convert them, or they will
|
||||
be nuked out of existence. So while light on Transitional will result
|
||||
in little to no changes, light on Strict will still result in quite
|
||||
a lot of fixes.</p>
|
||||
|
||||
<p>This is different behavior from 1.6 or before, where deprecated
|
||||
tags in transitional documents would
|
||||
always be cleaned up regardless. This is also better behavior.</p>
|
||||
|
||||
<h2>My pages look different!</h2>
|
||||
|
||||
<p>HTML Purifier is tasked with converting deprecated tags and
|
||||
attributes to standards-compliant alternatives, which usually
|
||||
need copious amounts of CSS. It's also not foolproof: sometimes
|
||||
things do get lost in the translation. This is why when HTML Purifier
|
||||
can get away with not doing cleaning, it won't; this is why
|
||||
the default value is <strong>medium</strong> and not heavy.</p>
|
||||
|
||||
<p>Fortunately, only a few attributes have problems with the switch
|
||||
over. They are described below:</p>
|
||||
|
||||
<table class="table">
|
||||
<thead><tr>
|
||||
<th>Element@Attr</th>
|
||||
<th>Changes</th>
|
||||
</tr></thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>caption@align</td>
|
||||
<td>Firefox supports stuffing the caption on the
|
||||
left and right side of the table, a feature that
|
||||
Internet Explorer, understandably, does not have.
|
||||
When align equals right or left, the text will simply
|
||||
be aligned on the left or right side.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>img@align</td>
|
||||
<td>The implementation for align bottom is good, but not
|
||||
perfect. There are a few pixel differences.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>br@clear</td>
|
||||
<td>Clear both gets a little wonky in Internet Explorer. Haven't
|
||||
really been able to figure out why.</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>hr@noshade</td>
|
||||
<td>All browsers implement this slightly differently: we've
|
||||
chosen to make noshade horizontal rules gray.</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<p>There are a few more minor, although irritating, bugs.
|
||||
Some older browsers support deprecated attributes,
|
||||
but not CSS. Transformed elements and attributes will look unstyled
|
||||
to said browsers. Also, CSS precedence is slightly different for
|
||||
inline styles versus presentational markup. In increasing precedence:</p>
|
||||
|
||||
<ol>
|
||||
<li>Presentational attributes</li>
|
||||
<li>External style sheets</li>
|
||||
<li>Inline styling</li>
|
||||
</ol>
|
||||
|
||||
<p>This means that styling that may have been masked by external CSS
|
||||
declarations will start showing up (a good thing, perhaps). Finally,
|
||||
if you've turned off the style attribute, almost all of
|
||||
these transformations will not work. Sorry mates.</p>
|
||||
|
||||
<p>You can review the rendering before and after of these transformations
|
||||
by consulting the <a
|
||||
href="http://htmlpurifier.org/live/smoketests/attrTransform.php">attrTransform.php
|
||||
smoketest</a>.</p>
|
||||
|
||||
<h2>I like the general idea, but the specifics bug me!</h2>
|
||||
|
||||
<p>So you want HTML Purifier to clean up your HTML, but you're not
|
||||
so happy about the br@clear implementation. That's perfectly fine!
|
||||
HTML Purifier will make accomodations:</p>
|
||||
|
||||
<pre>$config->set('HTML', 'Doctype', 'XHTML 1.0 Transitional');
|
||||
$config->set('HTML', 'TidyLevel', 'heavy'); // all changes, minus...
|
||||
<strong>$config->set('HTML', 'TidyRemove', 'br@clear');</strong></pre>
|
||||
|
||||
<p>That third line does the magic, removing the br@clear fix
|
||||
from the module, ensuring that <code><br clear="both" /></code>
|
||||
will pass through unharmed. The reverse is possible too:</p>
|
||||
|
||||
<pre>$config->set('HTML', 'Doctype', 'XHTML 1.0 Transitional');
|
||||
$config->set('HTML', 'TidyLevel', 'none'); // no changes, plus...
|
||||
<strong>$config->set('HTML', 'TidyAdd', 'p@align');</strong></pre>
|
||||
|
||||
<p>In this case, all transformations are shut off, except for the p@align
|
||||
one, which you found handy.</p>
|
||||
|
||||
<p>To find out what the names of fixes you want to turn on or off are,
|
||||
you'll have to consult the source code, specifically the files in
|
||||
<code>HTMLPurifier/HTMLModule/Tidy/</code>. There is, however, a
|
||||
general syntax:</p>
|
||||
|
||||
<table class="table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Name</th>
|
||||
<th>Example</th>
|
||||
<th>Interpretation</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td>element</td>
|
||||
<td>font</td>
|
||||
<td>Tag transform for <em>element</em></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>element@attr</td>
|
||||
<td>br@clear</td>
|
||||
<td>Attribute transform for <em>attr</em> on <em>element</em></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>@attr</td>
|
||||
<td>@lang</td>
|
||||
<td>Global attribute transform for <em>attr</em></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>e#content_model_type</td>
|
||||
<td>blockquote#content_model_type</td>
|
||||
<td>Change of child processing implementation for <em>e</em></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
|
||||
<h2>So... what's the lowdown?</h2>
|
||||
|
||||
<p>The lowdown is, quite frankly, HTML Purifier's default settings are
|
||||
probably good enough. The next step is to bump the level up to heavy,
|
||||
and if that still doesn't satisfy your appetite, do some fine-tuning.
|
||||
Other than that, don't worry about it: this all works silently and
|
||||
effectively in the background.</p>
|
||||
|
||||
<div id="version">$Id$</div>
|
||||
|
||||
</body></html>
|
216
docs/enduser-uri-filter.html
Normal file
216
docs/enduser-uri-filter.html
Normal file
@@ -0,0 +1,216 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"><head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||
<meta name="description" content="Tutorial for creating custom URI filters." />
|
||||
<link rel="stylesheet" type="text/css" href="style.css" />
|
||||
|
||||
<title>URI Filters - HTML Purifier</title>
|
||||
|
||||
</head><body>
|
||||
|
||||
<h1>URI Filters</h1>
|
||||
|
||||
<div id="filing">Filed under End-User</div>
|
||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
|
||||
<p>
|
||||
This is a quick and dirty document to get you on your way to writing
|
||||
custom URI filters for your own URL filtering needs. Why would you
|
||||
want to write a URI filter? If you need URIs your users put into
|
||||
HTML to magically change into a different URI, this is
|
||||
exactly what you need!
|
||||
</p>
|
||||
|
||||
<h2>Creating the class</h2>
|
||||
|
||||
<p>
|
||||
Any URI filter you make will be a subclass of <code>HTMLPurifier_URIFilter</code>.
|
||||
The scaffolding is thus:
|
||||
</p>
|
||||
|
||||
<pre>class HTMLPurifier_URIFilter_<strong>NameOfFilter</strong> extends HTMLPurifier_URIFilter
|
||||
{
|
||||
public $name = '<strong>NameOfFilter</strong>';
|
||||
public function prepare($config) {}
|
||||
public function filter(&$uri, $config, $context) {}
|
||||
}</pre>
|
||||
|
||||
<p>
|
||||
Fill in the variable <code>$name</code> with the name of your filter, and
|
||||
take a look at the two methods. <code>prepare()</code> is an initialization
|
||||
method that is called only once, before any filtering has been done of the
|
||||
HTML. Use it to perform any costly setup work that only needs to be done
|
||||
once. <code>filter()</code> is the guts and innards of our filter:
|
||||
it takes the URI and does whatever needs to be done to it.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
If you've worked with HTML Purifier, you'll recognize the <code>$config</code>
|
||||
and <code>$context</code> parameters. On the other hand, <code>$uri</code>
|
||||
is something unique to this section of the application: it's a
|
||||
<code>HTMLPurifier_URI</code> object. The interface is thus:
|
||||
</p>
|
||||
|
||||
<pre>class HTMLPurifier_URI
|
||||
{
|
||||
public $scheme, $userinfo, $host, $port, $path, $query, $fragment;
|
||||
public function HTMLPurifier_URI($scheme, $userinfo, $host, $port, $path, $query, $fragment);
|
||||
public function toString();
|
||||
public function copy();
|
||||
public function getSchemeObj($config, $context);
|
||||
public function validate($config, $context);
|
||||
}</pre>
|
||||
|
||||
<p>
|
||||
The first three methods are fairly self-explanatory: you have a constructor,
|
||||
a serializer, and a cloner. Generally, you won't be using them when
|
||||
you are manipulating the URI objects themselves.
|
||||
<code>getSchemeObj()</code> is a special purpose method that returns
|
||||
a <code>HTMLPurifier_URIScheme</code> object corresponding to the specific
|
||||
URI at hand. <code>validate()</code> performs general-purpose validation
|
||||
on the internal components of a URI. Once again, you don't need to
|
||||
worry about these: they've already been handled for you.
|
||||
</p>
|
||||
|
||||
<h2>URI format</h2>
|
||||
|
||||
<p>
|
||||
As a URIFilter, we're interested in the member variables of the URI object.
|
||||
</p>
|
||||
|
||||
<table class="quick"><tbody>
|
||||
<tr><th>Scheme</th> <td>The protocol for identifying (and possibly locating) a resource (http, ftp, https)</td></tr>
|
||||
<tr><th>Userinfo</th> <td>User information such as a username (bob)</td></tr>
|
||||
<tr><th>Host</th> <td>Domain name or IP address of the server (example.com, 127.0.0.1)</td></tr>
|
||||
<tr><th>Port</th> <td>Network port number for the server (80, 12345)</td></tr>
|
||||
<tr><th>Path</th> <td>Data that identifies the resource, possibly hierarchical (/path/to, ed@example.com)</td></tr>
|
||||
<tr><th>Query</th> <td>String of information to be interpreted by the resource (?q=search-term)</td></tr>
|
||||
<tr><th>Fragment</th> <td>Additional information for the resource after retrieval (#bookmark)</td></tr>
|
||||
</tbody></table>
|
||||
|
||||
<p>
|
||||
Because the URI is presented to us in this form, and not
|
||||
<code>http://bob@example.com:8080/foo.php?q=string#hash</code>, it saves us
|
||||
a lot of trouble in having to parse the URI every time we want to filter
|
||||
it. For the record, the above URI has the following components:
|
||||
</p>
|
||||
|
||||
<table class="quick"><tbody>
|
||||
<tr><th>Scheme</th> <td>http</td></tr>
|
||||
<tr><th>Userinfo</th> <td>bob</td></tr>
|
||||
<tr><th>Host</th> <td>example.com</td></tr>
|
||||
<tr><th>Port</th> <td>8080</td></tr>
|
||||
<tr><th>Path</th> <td>/foo.php</td></tr>
|
||||
<tr><th>Query</th> <td>q=string</td></tr>
|
||||
<tr><th>Fragment</th> <td>hash</td></tr>
|
||||
</tbody></table>
|
||||
|
||||
<p>
|
||||
Note that there is no question mark or octothorpe in the query or
|
||||
fragment: these get removed during parsing.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
With this information, you can get straight to implementing your
|
||||
<code>filter()</code> method. But one more thing...
|
||||
</p>
|
||||
|
||||
<h2>Return value: Boolean, not URI</h2>
|
||||
|
||||
<p>
|
||||
You may have noticed that the URI is being passed in by reference.
|
||||
This means that whatever changes you make to it, those changes will
|
||||
be reflected in the URI object the callee had. <strong>Do not
|
||||
return the URI object: it is unnecessary and will cause bugs.</strong>
|
||||
Instead, return a boolean value, true if the filtering was successful,
|
||||
or false if the URI is beyond repair and needs to be axed.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
Let's suppose I wanted to write a filter that converted links with a
|
||||
custom <code>image</code> scheme to its corresponding real path on
|
||||
our website:
|
||||
</p>
|
||||
|
||||
<pre>class HTMLPurifier_URIFilter_TransformImageScheme extends HTMLPurifier_URIFilter
|
||||
{
|
||||
public $name = 'TransformImageScheme';
|
||||
public function filter(&$uri, $config, $context) {
|
||||
if ($uri->scheme !== 'image') return true;
|
||||
$img_name = $uri->path;
|
||||
// Overwrite the previous URI object
|
||||
$uri = new HTMLPurifier_URI('http', null, null, null, '/img/' . $img_name . '.png', null, null);
|
||||
return true;
|
||||
}
|
||||
}</pre>
|
||||
|
||||
<p>
|
||||
Notice I did not <code>return $uri;</code>. This filter would turn
|
||||
<code>image:Foo</code> into <code>/img/Foo.png</code>.
|
||||
</p>
|
||||
|
||||
<h2>Activating your filter</h2>
|
||||
|
||||
<p>
|
||||
Having a filter is all well and good, but you need to tell HTML Purifier
|
||||
to use it. Fortunately, this part's simple:
|
||||
</p>
|
||||
|
||||
<pre>$uri = $config->getDefinition('URI');
|
||||
$uri->addFilter(new HTMLPurifier_URIFilter_<strong>NameOfFilter</strong>());</pre>
|
||||
|
||||
<p>
|
||||
If you want to be really fancy, you can define a configuration directive
|
||||
for your filter and have HTML Purifier automatically manage whether or
|
||||
not your filter gets loaded or not (this is how internal filters manage
|
||||
things):
|
||||
</p>
|
||||
|
||||
<pre>HTMLPurifier_ConfigSchema::define(
|
||||
'URI', '<strong>NameOfFilter</strong>', false, 'bool',
|
||||
'<strong>What your filter does.</strong>'
|
||||
);
|
||||
$uri = $config->getDefinition('URI', true);
|
||||
$uri->registerFilter(new HTMLPurifier_URIFilter_<strong>NameOfFilter</strong>());
|
||||
</pre>
|
||||
|
||||
<p>
|
||||
Now, your filter will only be called when %URI.<strong>NameOfFilter</strong>
|
||||
is set to true.
|
||||
</p>
|
||||
|
||||
<h2>Post-filter</h2>
|
||||
|
||||
<p>
|
||||
Remember our TransformImageScheme filter? That filter acted before we had
|
||||
performed scheme validation; otherwise, the URI would have been filtered
|
||||
out when it was discovered that there was no image scheme. Well, a post-filter
|
||||
is run after scheme specific validation, so it's ideal for bulk
|
||||
post-processing of URIs, including munging. To specify a URI as a post-filter,
|
||||
set the <code>$post</code> member variable to TRUE.
|
||||
</p>
|
||||
|
||||
<pre>class HTMLPurifier_URIFilter_MyPostFilter extends HTMLPurifier_URIFilter
|
||||
{
|
||||
public $name = 'MyPostFilter';
|
||||
public $post = true;
|
||||
// ... extra code here
|
||||
}
|
||||
</pre>
|
||||
|
||||
<h2>Examples</h2>
|
||||
|
||||
<p>
|
||||
Check the
|
||||
<a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/URIFilter/">URIFilter</a>
|
||||
directory for more implementation examples, and see <a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/docs/proposal-new-directives.txt">the
|
||||
new directives proposal document</a> for ideas on what could be implemented
|
||||
as a filter.
|
||||
</p>
|
||||
|
||||
<div id="version">$Id$</div>
|
||||
|
||||
</body></html>
|
@@ -5,12 +5,11 @@
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||
<meta name="description" content="Describes the rationale for using UTF-8, the ramifications otherwise, and how to make the switch." />
|
||||
<link rel="stylesheet" type="text/css" href="./style.css" />
|
||||
<script defer="defer" type="text/javascript" src="./toc-gen.js"></script>
|
||||
<style type="text/css">
|
||||
.minor td {font-style:italic;}
|
||||
</style>
|
||||
|
||||
<title>UTF-8 - HTML Purifier</title>
|
||||
<title>UTF-8: The Secret of Character Encoding - HTML Purifier</title>
|
||||
|
||||
<!-- Note to users: this document, though professing to be UTF-8, attempts
|
||||
to use only ASCII characters, because most webservers are configured
|
||||
@@ -19,21 +18,27 @@ own advice for sake of portability. -->
|
||||
|
||||
</head><body>
|
||||
|
||||
<h1>UTF-8</h1>
|
||||
<h1>UTF-8: The Secret of Character Encoding</h1>
|
||||
|
||||
<div id="filing">Filed under End-User</div>
|
||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||
<div id="home"><a href="http://hp.jpsband.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
|
||||
<p>Character encoding and character sets, in truth, are not that
|
||||
difficult to understand. But if you don't understand them, you are going
|
||||
to be caught by surprise by some of HTML Purifier's behavior, namely
|
||||
the fact that it operates UTF-8 or the limitations of the character
|
||||
encoding transformations it does. This document will walk you through
|
||||
<p>Character encoding and character sets are not that
|
||||
difficult to understand, but so many people blithely stumble
|
||||
through the worlds of programming without knowing what to actually
|
||||
do about it, or say "Ah, it's a job for those <em>internationalization</em>
|
||||
experts." No, it is not! This document will walk you through
|
||||
determining the encoding of your system and how you should handle
|
||||
this information. It will stay away from excessive discussion on
|
||||
the internals of character encoding, but offer the information in
|
||||
asides that can easily be skipped.</p>
|
||||
the internals of character encoding.</p>
|
||||
|
||||
<p>This document is not designed to be read in its entirety: it will
|
||||
slowly introduce concepts that build on each other: you need not get to
|
||||
the bottom to have learned something new. However, I strongly
|
||||
recommend you read all the way to <strong>Why UTF-8?</strong>, because at least
|
||||
at that point you'd have made a conscious decision not to migrate,
|
||||
which can be a rewarding (but difficult) task.</p>
|
||||
|
||||
<blockquote class="aside">
|
||||
<div class="label">Asides</div>
|
||||
@@ -43,10 +48,54 @@ asides that can easily be skipped.</p>
|
||||
with a greater understanding of the underlying issues.</p>
|
||||
</blockquote>
|
||||
|
||||
<h2>Table of Contents</h2>
|
||||
|
||||
<ol id="toc">
|
||||
<li><a href="#findcharset">Finding the real encoding</a></li>
|
||||
<li><a href="#findmetacharset">Finding the embedded encoding</a></li>
|
||||
<li><a href="#fixcharset">Fixing the encoding</a><ol>
|
||||
<li><a href="#fixcharset-none">No embedded encoding</a></li>
|
||||
<li><a href="#fixcharset-diff">Embedded encoding disagrees</a></li>
|
||||
<li><a href="#fixcharset-server">Changing the server encoding</a><ol>
|
||||
<li><a href="#fixcharset-server-php">PHP header() function</a></li>
|
||||
<li><a href="#fixcharset-server-phpini">PHP ini directive</a></li>
|
||||
<li><a href="#fixcharset-server-nophp">Non-PHP</a></li>
|
||||
<li><a href="#fixcharset-server-htaccess">.htaccess</a></li>
|
||||
<li><a href="#fixcharset-server-ext">File extensions</a></li>
|
||||
</ol></li>
|
||||
<li><a href="#fixcharset-xml">XML</a></li>
|
||||
<li><a href="#fixcharset-internals">Inside the process</a></li>
|
||||
</ol></li>
|
||||
<li><a href="#whyutf8">Why UTF-8?</a><ol>
|
||||
<li><a href="#whyutf8-i18n">Internationalization</a></li>
|
||||
<li><a href="#whyutf8-user">User-friendly</a></li>
|
||||
<li><a href="#whyutf8-forms">Forms</a><ol>
|
||||
<li><a href="#whyutf8-forms-urlencoded">application/x-www-form-urlencoded</a></li>
|
||||
<li><a href="#whyutf8-forms-multipart">multipart/form-data</a></li>
|
||||
</ol></li>
|
||||
<li><a href="#whyutf8-support">Well supported</a></li>
|
||||
<li><a href="#whyutf8-htmlpurifier">HTML Purifiers</a></li>
|
||||
</ol></li>
|
||||
<li><a href="#migrate">Migrate to UTF-8</a><ol>
|
||||
<li><a href="#migrate-db">Configuring your database</a><ol>
|
||||
<li><a href="#migrate-db-legit">Legit method</a></li>
|
||||
<li><a href="#migrate-db-binary">Binary</a></li>
|
||||
</ol></li>
|
||||
<li><a href="#migrate-editor">Text editor</a></li>
|
||||
<li><a href="#migrate-bom">Byte Order Mark (headers already sent!)</a></li>
|
||||
<li><a href="#migrate-fonts">Fonts</a><ol>
|
||||
<li><a href="#migrate-fonts-obscure">Obscure scripts</a></li>
|
||||
<li><a href="#migrate-fonts-occasional">Occasional use</a></li>
|
||||
</ol></li>
|
||||
<li><a href="#migrate-variablewidth">Dealing with variable width in functions</a></li>
|
||||
</ol></li>
|
||||
<li><a href="#externallinks">Further Reading</a></li>
|
||||
</ol>
|
||||
|
||||
<h2 id="findcharset">Finding the real encoding</h2>
|
||||
|
||||
<p>In the beginning, there was ASCII, and things were simple. But they
|
||||
weren't good, for no one could write in Cryllic or Thai. So there
|
||||
weren't good, for no one could write in Cyrillic or Thai. So there
|
||||
exploded a proliferation of character encodings to remedy the problem
|
||||
by extending the characters ASCII could express. This ridiculously
|
||||
simplified version of the history of character encodings shows us that
|
||||
@@ -88,7 +137,7 @@ browser:</p>
|
||||
<dd>View > Encoding: bulleted item is unofficial name</dd>
|
||||
</dl>
|
||||
|
||||
<p>Internet Explorer won't give you the mime (i.e. useful/real) name of the
|
||||
<p>Internet Explorer won't give you the MIME (i.e. useful/real) name of the
|
||||
character encoding, so you'll have to look it up using their description.
|
||||
Some common ones:</p>
|
||||
|
||||
@@ -166,6 +215,12 @@ if your <code>META</code> tag claims that either:</p>
|
||||
|
||||
<h2 id="fixcharset">Fixing the encoding</h2>
|
||||
|
||||
<p class="aside">The advice given here is for pages being served as
|
||||
vanilla <code>text/html</code>. Different practices must be used
|
||||
for <code>application/xml</code> or <code>application/xml+xhtml</code>, see
|
||||
<a href="http://www.w3.org/TR/2002/NOTE-xhtml-media-types-20020430/">W3C's
|
||||
document on XHTML media types</a> for more information.</p>
|
||||
|
||||
<p>If your <code>META</code> encoding and your real encoding match,
|
||||
savvy! You can skip this section. If they don't...</p>
|
||||
|
||||
@@ -181,7 +236,7 @@ of your real encoding.</p>
|
||||
why the character encoding should be explicitly stated. When the
|
||||
browser isn't told what the character encoding of a text is, it
|
||||
has to guess: and sometimes the guess is wrong. Hackers can manipulate
|
||||
this guess in order to slip XSS pass filters and then fool the
|
||||
this guess in order to slip XSS past filters and then fool the
|
||||
browser into executing it as active code. A great example of this
|
||||
is the <a href="http://shiflett.org/archive/177">Google UTF-7
|
||||
exploit</a>.</p>
|
||||
@@ -252,7 +307,8 @@ languages</a>. The appropriate code is:</p>
|
||||
|
||||
<p>...replacing UTF-8 with whatever your embedded encoding is.
|
||||
This code must come before any output, so be careful about
|
||||
stray whitespace in your application.</p>
|
||||
stray whitespace in your application (i.e., any whitespace before
|
||||
output excluding whitespace within <?php ?> tags).</p>
|
||||
|
||||
<h4 id="fixcharset-server-phpini">PHP ini directive</h4>
|
||||
|
||||
@@ -263,8 +319,8 @@ header call: <code><a href="http://php.net/ini.core#ini.default-charset">default
|
||||
|
||||
<p>...will also do the trick. If PHP is running as an Apache module (and
|
||||
not as FastCGI, consult
|
||||
<a href="http://php.net/phpinfo">phpinfo</a>() for details), you can even use htaccess do apply this property
|
||||
globally:</p>
|
||||
<a href="http://php.net/phpinfo">phpinfo</a>() for details), you can even use htaccess to apply this property
|
||||
across many PHP files:</p>
|
||||
|
||||
<pre><a href="http://php.net/configuration.changes#configuration.changes.apache">php_value</a> default_charset "UTF-8"</pre>
|
||||
|
||||
@@ -275,7 +331,7 @@ your own php.ini file, ask your support for details. Use:</p>
|
||||
|
||||
<h4 id="fixcharset-server-nophp">Non-PHP</h4>
|
||||
|
||||
<p>You may, for whatever reason, may need to set the character encoding
|
||||
<p>You may, for whatever reason, need to set the character encoding
|
||||
on non-PHP files, usually plain ol' HTML files. Doing this
|
||||
is more of a hit-or-miss process: depending on the software being
|
||||
used as a webserver and the configuration of that software, certain
|
||||
@@ -310,10 +366,11 @@ to send anything at all:</p>
|
||||
|
||||
<pre><a href="http://httpd.apache.org/docs/1.3/mod/core.html#adddefaultcharset">AddDefaultCharset</a> Off</pre>
|
||||
|
||||
<p>...making your <code>META</code> tags the sole source of
|
||||
character encoding information. In these cases, it is
|
||||
<em>especially</em> important to make sure you have valid <code>META</code>
|
||||
tags on your pages and all the text before them is ASCII.</p>
|
||||
<p>...making your internal charset declaration (usually the <code>META</code> tags)
|
||||
the sole source of character encoding
|
||||
information. In these cases, it is <em>especially</em> important to make
|
||||
sure you have valid <code>META</code> tags on your pages and all the
|
||||
text before them is ASCII.</p>
|
||||
|
||||
<blockquote class="aside"><p>These directives can also be
|
||||
placed in httpd.conf file for Apache, but
|
||||
@@ -378,30 +435,32 @@ IIS to change character encodings, I'd be grateful.</p>
|
||||
|
||||
<p><code>META</code> tags are the most common source of embedded
|
||||
encodings, but they can also come from somewhere else: XML
|
||||
processing instructions. They look like:</p>
|
||||
Declarations. They look like:</p>
|
||||
|
||||
<pre><?xml version="1.0" encoding="UTF-8"?></pre>
|
||||
|
||||
<p>...and are most often found in XML documents (including XHTML).</p>
|
||||
|
||||
<p>For XHTML, this processing instruction theoretically
|
||||
<p>For XHTML, this XML Declaration theoretically
|
||||
overrides the <code>META</code> tag. In reality, this happens only when the
|
||||
XHTML is actually served as legit XML and not HTML, which is almost
|
||||
always never due to Internet Explorer's lack of support for
|
||||
XHTML is actually served as legit XML and not HTML, which is almost always
|
||||
never due to Internet Explorer's lack of support for
|
||||
<code>application/xhtml+xml</code> (even though doing so is often
|
||||
argued to be <a href="http://www.hixie.ch/advocacy/xhtml">good practice</a>).</p>
|
||||
argued to be <a href="http://www.hixie.ch/advocacy/xhtml">good
|
||||
practice</a> and is required by the XHTML 1.1 specification).</p>
|
||||
|
||||
<p>For XML, however, this processing instruction is extremely important.
|
||||
<p>For XML, however, this XML Declaration is extremely important.
|
||||
Since most webservers are not configured to send charsets for .xml files,
|
||||
this is the only thing a parser has to go on. Furthermore, the default
|
||||
for XML files is UTF-8, which often butts heads with more common
|
||||
ISO-8859-1 encoding (you see this in garbled RSS feeds).</p>
|
||||
|
||||
<p>In short, if you use XHTML and have gone through the
|
||||
trouble of adding the XML header, be sure to make sure it jives
|
||||
with your <code>META</code> tags and HTTP headers.</p>
|
||||
trouble of adding the XML Declaration, make sure it jives
|
||||
with your <code>META</code> tags (which should only be present
|
||||
if served in text/html) and HTTP headers.</p>
|
||||
|
||||
<h3>Inside the process</h3>
|
||||
<h3 id="fixcharset-internals">Inside the process</h3>
|
||||
|
||||
<p>This section is not required reading,
|
||||
but may answer some of your questions on what's going on in all
|
||||
@@ -456,7 +515,7 @@ usage in one language sometimes requires the occasional special character
|
||||
that, without surprise, is not available in your character set. Sometimes
|
||||
developers get around this by adding support for multiple encodings: when
|
||||
using Chinese, use Big5, when using Japanese, use Shift-JIS, when
|
||||
using Greek, etc. Other times, they use character entities with great
|
||||
using Greek, etc. Other times, they use character references with great
|
||||
zeal.</p>
|
||||
|
||||
<p>UTF-8, however, obviates the need for any of these complicated
|
||||
@@ -470,14 +529,14 @@ you don't have to use those user-unfriendly entities.</p>
|
||||
|
||||
<p>Websites encoded in Latin-1 (ISO-8859-1) which ocassionally need
|
||||
a special character outside of their scope often will use a character
|
||||
entity to achieve the desired effect. For instance, θ can be
|
||||
entity reference to achieve the desired effect. For instance, θ can be
|
||||
written <code>&theta;</code>, regardless of the character encoding's
|
||||
support of Greek letters.</p>
|
||||
|
||||
<p>This works nicely for limited use of special characters, but
|
||||
say you wanted this sentence of Chinese text: 激光,
|
||||
這兩個字是甚麼意思.
|
||||
The entity-ized version would look like this:</p>
|
||||
The ampersand encoded version would look like this:</p>
|
||||
|
||||
<pre>&#28608;&#20809;, &#36889;&#20841;&#20491;&#23383;&#26159;&#29978;&#40636;&#24847;&#24605;</pre>
|
||||
|
||||
@@ -495,7 +554,7 @@ an application that originally used ISO-8859-1 but switched to UTF-8
|
||||
when it became far to cumbersome to support foreign languages. Bots
|
||||
will now actually go through articles and convert character entities
|
||||
to their corresponding real characters for the sake of user-friendliness
|
||||
and searcheability. See
|
||||
and searchability. See
|
||||
<a href="http://meta.wikimedia.org/wiki/Help:Special_characters">Meta's
|
||||
page on special characters</a> for more details.
|
||||
</p></blockquote>
|
||||
@@ -517,10 +576,11 @@ which may be used by POST, and is required when you want to upload
|
||||
files.</p>
|
||||
|
||||
<p>The following is a summarization of notes from
|
||||
<a href="http://ppewww.physics.gla.ac.uk/~flavell/charset/form-i18n.html">
|
||||
<a href="http://web.archive.org/web/20060427015200/ppewww.ph.gla.ac.uk/~flavell/charset/form-i18n.html">
|
||||
<code>FORM</code> submission and i18n</a>. That document contains lots
|
||||
of useful information, but is written in a rambly manner, so
|
||||
here I try to get right to the point.</p>
|
||||
here I try to get right to the point. (Note: the original has
|
||||
disappeared off the web, so I am linking to the Web Archive copy.)</p>
|
||||
|
||||
<h4 id="whyutf8-forms-urlencoded"><code>application/x-www-form-urlencoded</code></h4>
|
||||
|
||||
@@ -542,7 +602,7 @@ browser you're using, they might:</p>
|
||||
<ul>
|
||||
<li>Replace the unsupported characters with useless question marks,</li>
|
||||
<li>Attempt to fix the characters (example: smart quotes to regular quotes),</li>
|
||||
<li>Replace the character with a character entity, or</li>
|
||||
<li>Replace the character with a character entity reference, or</li>
|
||||
<li>Send it anyway as a different character encoding mixed in
|
||||
with the original encoding (usually Windows-1252 rather than
|
||||
iso-8859-1 or UTF-8 interspersed in 8-bit)</li>
|
||||
@@ -558,7 +618,7 @@ since UTF-8 supports every character.</p>
|
||||
|
||||
<h4 id="whyutf8-forms-multipart"><code>multipart/form-data</code></h4>
|
||||
|
||||
<p>Multipart form submission takes a way a lot of the ambiguity
|
||||
<p>Multipart form submission takes away a lot of the ambiguity
|
||||
that percent-encoding had: the server now can explicitly ask for
|
||||
certain encodings, and the client can explicitly tell the server
|
||||
during the form submission what encoding the fields are in.</p>
|
||||
@@ -571,9 +631,9 @@ Each method has deficiencies, especially the former.</p>
|
||||
<p>If you tell the browser to send the form in the same encoding as
|
||||
the page, you still have the trouble of what to do with characters
|
||||
that are outside of the character encoding's range. The behavior, once
|
||||
again, varies: Firefox 2.0 entity-izes them while Internet Explorer
|
||||
7.0 mangles them beyond intelligibility. For serious I18N purposes,
|
||||
this is not an option.</p>
|
||||
again, varies: Firefox 2.0 converts them to character entity references
|
||||
while Internet Explorer 7.0 mangles them beyond intelligibility. For
|
||||
serious internationalization purposes, this is not an option.</p>
|
||||
|
||||
<p>The other possibility is to set Accept-Encoding to UTF-8, which
|
||||
begs the question: Why aren't you using UTF-8 for everything then?
|
||||
@@ -604,22 +664,378 @@ hounding you about broken pages.</p>
|
||||
|
||||
<h3 id="whyutf8-htmlpurifier">HTML Purifier</h3>
|
||||
|
||||
<p>And finally, we get to HTML Purifier.</p>
|
||||
<p>And finally, we get to HTML Purifier. HTML Purifier is built to
|
||||
deal with UTF-8: any indications otherwise are the result of an
|
||||
encoder that converts text from your preferred encoding to UTF-8, and
|
||||
back again. HTML Purifier never touches anything else, and leaves
|
||||
it up to the module iconv to do the dirty work.</p>
|
||||
|
||||
<p>This approach, however, is not perfect. iconv is blithely unaware
|
||||
of HTML character entities. HTML Purifier, in order to
|
||||
protect against sophisticated escaping schemes, normalizes all character
|
||||
and numeric entitie references before processing the text. This leads to
|
||||
one important ramification:</p>
|
||||
|
||||
<p><strong>Any character that is not supported by the target character
|
||||
set, regardless of whether or not it is in the form of a character
|
||||
entity reference or a raw character, will be silently ignored.</strong></p>
|
||||
|
||||
<p>Example of this principle at work: say you have <code>&theta;</code>
|
||||
in your HTML, but the output is in Latin-1 (which, understandably,
|
||||
does not understand Greek), the following process will occur (assuming you've
|
||||
set the encoding correctly using %Core.Encoding):</p>
|
||||
|
||||
<ul>
|
||||
<li>The <code>Encoder</code> will transform the text from ISO 8859-1 to UTF-8
|
||||
(note that theta is preserved here since it doesn't actually use
|
||||
any non-ASCII characters): <code>&theta;</code></li>
|
||||
<li>The <code>EntityParser</code> will transform all named and numeric
|
||||
character entities to their corresponding raw UTF-8 equivalents:
|
||||
<code>θ</code></li>
|
||||
<li>HTML Purifier processes the code: <code>θ</code></li>
|
||||
<li>The <code>Encoder</code> now transforms the text back from UTF-8
|
||||
to ISO 8859-1. Since Greek is not supported by ISO 8859-1, it
|
||||
will be either ignored or replaced with a question mark:
|
||||
<code>?</code></li>
|
||||
</ul>
|
||||
|
||||
<p>This behaviour is quite unsatisfactory. It is a deal-breaker for
|
||||
international applications, and it can be mildly annoying for the provincial
|
||||
soul who occasionally needs a special character. Since 1.4.0, HTML
|
||||
Purifier has provided a slightly more palatable workaround using
|
||||
%Core.EscapeNonASCIICharacters. The process now looks like:</p>
|
||||
|
||||
<ul>
|
||||
<li>The <code>Encoder</code> transforms encoding to UTF-8: <code>&theta;</code></li>
|
||||
<li>The <code>EntityParser</code> transforms entities: <code>θ</code></li>
|
||||
<li>HTML Purifier processes the code: <code>θ</code></li>
|
||||
<li>The <code>Encoder</code> replaces all non-ASCII characters
|
||||
with numeric entity reference: <code>&#952;</code></li>
|
||||
<li>For good measure, <code>Encoder</code> transforms encoding back to
|
||||
original (which is strictly unnecessary for 99% of encodings
|
||||
out there): <code>&#952;</code> (remember, it's all ASCII!)</li>
|
||||
</ul>
|
||||
|
||||
<p>...which means that this is only good for an occasional foray into
|
||||
the land of Unicode characters, and is totally unacceptable for Chinese
|
||||
or Japanese texts. The even bigger kicker is that, supposing the
|
||||
input encoding was actually ISO-8859-7, which <em>does</em> support
|
||||
theta, the character would get converted into a character entity reference
|
||||
anyway! (The Encoder does not discriminate).</p>
|
||||
|
||||
<p>The current functionality is about where HTML Purifier will be for
|
||||
the rest of eternity. HTML Purifier could attempt to preserve the original
|
||||
form of the character references so that they could be substituted back in, only the
|
||||
DOM extension kills them off irreversibly. HTML Purifier could also attempt
|
||||
to be smart and only convert non-ASCII characters that weren't supported
|
||||
by the target encoding, but that would require reimplementing iconv
|
||||
with HTML awareness, something I will not do.</p>
|
||||
|
||||
<p>So there: either it's UTF-8 or crippled international support. Your pick! (and I'm
|
||||
not being sarcastic here: some people could care less about other languages).</p>
|
||||
|
||||
<h2 id="migrate">Migrate to UTF-8</h2>
|
||||
|
||||
<h3 id="migrate-editor">Text editor</h3>
|
||||
<p>So, you've decided to bite the bullet, and want to migrate to UTF-8.
|
||||
Note that this is not for the faint-hearted, and you should expect
|
||||
the process to take longer than you think it will take.</p>
|
||||
|
||||
<p>The general idea is that you convert all existing text to UTF-8,
|
||||
and then you set all the headers and META tags we discussed earlier
|
||||
to UTF-8. There are many ways going about doing this: you could
|
||||
write a conversion script that runs through the database and re-encodes
|
||||
everything as UTF-8 or you could do the conversion on the fly when someone
|
||||
reads the page. The details depend on your system, but I will cover
|
||||
some of the more subtle points of migration that may trip you up.</p>
|
||||
|
||||
<h3 id="migrate-db">Configuring your database</h3>
|
||||
|
||||
<h3 id="migrate-convert">Convert old text</h3>
|
||||
<p>Most modern databases, the most prominent open-source ones being MySQL
|
||||
4.1+ and PostgreSQL, support character encodings. If you're switching
|
||||
to UTF-8, logically speaking, you'd want to make sure your database
|
||||
knows about the change too. There are some caveats though:</p>
|
||||
|
||||
<h4 id="migrate-db-legit">Legit method</h4>
|
||||
|
||||
<p>Standardization in terms of SQL syntax for specifying character
|
||||
encodings is notoriously spotty. Refer to your respective database's
|
||||
documentation on how to do this properly.</p>
|
||||
|
||||
<p>For <a href="http://dev.mysql.com/doc/refman/5.0/en/charset-conversion.html">MySQL</a>, <code>ALTER</code> will magically perform the
|
||||
character encoding conversion for you. However, you have
|
||||
to make sure that the text inside the column is what is says it is:
|
||||
if you had put Shift-JIS in an ISO 8859-1 column, MySQL will irreversibly mangle
|
||||
the text when you try to convert it to UTF-8. You'll have to convert
|
||||
it to a binary field, convert it to a Shift-JIS field (the real encoding),
|
||||
and then finally to UTF-8. Many a website had pages irreversibly mangled
|
||||
because they didn't realize that they'd been deluding themselves about
|
||||
the character encoding all along, don't become the next victim.</p>
|
||||
|
||||
<p>For <a href="http://www.postgresql.org/docs/8.2/static/multibyte.html">PostgreSQL</a>, there appears to be no direct way to change the
|
||||
encoding of a database (as of 8.2). You will have to dump the data, and then reimport
|
||||
it into a new table. Make sure that your client encoding is set properly:
|
||||
this is how PostgreSQL knows to perform an encoding conversion.</p>
|
||||
|
||||
<p>Many times, you will be also asked about the "collation" of
|
||||
the new column. Collation is how a DBMS sorts text, like ordering
|
||||
B, C and A into A, B and C (the problem gets surprisingly complicated
|
||||
when you get to languages like Thai and Japanese). If in doubt,
|
||||
going with the default setting is usually a safe bet.</p>
|
||||
|
||||
<p>Once the conversion is all said and done, you still have to remember
|
||||
to set the client encoding (your encoding) properly on each database
|
||||
connection using <code>SET NAMES</code> (which is standard SQL and is
|
||||
usually supported).</p>
|
||||
|
||||
<h4 id="migrate-db-binary">Binary</h4>
|
||||
|
||||
<p>Due to the abovementioned compatibility issues, a more interoperable
|
||||
way of storing UTF-8 text is to stuff it in a binary datatype.
|
||||
<code>CHAR</code> becomes <code>BINARY</code>, <code>VARCHAR</code> becomes
|
||||
<code>VARBINARY</code> and <code>TEXT</code> becomes <code>BLOB</code>.
|
||||
Doing so can save you some huge headaches:</p>
|
||||
|
||||
<ul>
|
||||
<li>The syntax for binary data types is very portable,</li>
|
||||
<li>MySQL 4.0 has <em>no</em> support for character encodings, so
|
||||
if you want to support it you <em>have</em> to use binary,</li>
|
||||
<li>MySQL, as of 5.1, has no support for four byte UTF-8 characters,
|
||||
which represent characters beyond the basic multilingual
|
||||
plane, and</li>
|
||||
<li>You will never have to worry about your DBMS being too smart
|
||||
and attempting to convert your text when you don't want it to.</li>
|
||||
</ul>
|
||||
|
||||
<p>MediaWiki, a very prominent international application, uses binary fields
|
||||
for storing their data because of point three.</p>
|
||||
|
||||
<p>There are drawbacks, of course:</p>
|
||||
|
||||
<ul>
|
||||
<li>Database tools like PHPMyAdmin won't be able to offer you inline
|
||||
text editing, since it is declared as binary,</li>
|
||||
<li>It's not semantically correct: it's really text not binary
|
||||
(lying to the database),</li>
|
||||
<li>Unless you use the not-very-portable wizardry mentioned above,
|
||||
you have to change the encoding yourself (usually, you'd do
|
||||
it on the fly), and</li>
|
||||
<li>You will not have collation.</li>
|
||||
</ul>
|
||||
|
||||
<p>Choose based on your circumstances.</p>
|
||||
|
||||
<h3 id="migrate-editor">Text editor</h3>
|
||||
|
||||
<p>For more flat-file oriented systems, you will often be tasked with
|
||||
converting reams of existing text and HTML files into UTF-8, as well as
|
||||
making sure that all new files uploaded are properly encoded. Once again,
|
||||
I can only point vaguely in the right direction for converting your
|
||||
existing files: make sure you backup, make sure you use
|
||||
<a href="http://php.net/ref.iconv">iconv</a>(), and
|
||||
make sure you know what the original character encoding of the files
|
||||
is (or are, depending on the tidiness of your system).</p>
|
||||
|
||||
<p>However, I can proffer more specific advice on the subject of
|
||||
text editors. Many text editors have notoriously spotty Unicode support.
|
||||
To find out how your editor is doing, you can check out <a
|
||||
href="http://www.alanwood.net/unicode/utilities_editors.html">this list</a>
|
||||
or <a href="http://en.wikipedia.org/wiki/Comparison_of_text_editors#Encoding_support">Wikipedia's list.</a>
|
||||
I personally use Notepad++, which works like a charm when it comes to UTF-8.
|
||||
Usually, you will have to <strong>explicitly</strong> tell the editor through some dialogue
|
||||
(usually Save as or Format) what encoding you want it to use. An editor
|
||||
will often offer "Unicode" as a method of saving, which is
|
||||
ambiguous. Make sure you know whether or not they really mean UTF-8
|
||||
or UTF-16 (which is another flavor of Unicode).</p>
|
||||
|
||||
<p>The two things to look out for are whether or not the editor
|
||||
supports <strong>font mixing</strong> (multiple
|
||||
fonts in one document) and whether or not it adds a <strong>BOM</strong>.
|
||||
Font mixing is important because fonts rarely have support for every
|
||||
language known to mankind: in order to be flexible, an editor must
|
||||
be able to take a little from here and a little from there, otherwise
|
||||
all your Chinese characters will come as nice boxes. We'll discuss
|
||||
BOM below.</p>
|
||||
|
||||
<h3 id="migrate-bom">Byte Order Mark (headers already sent!)</h3>
|
||||
|
||||
<p>The BOM, or <a href="http://en.wikipedia.org/wiki/Byte_Order_Mark">Byte
|
||||
Order Mark</a>, is a magical, invisible character placed at
|
||||
the beginning of UTF-8 files to tell people what the encoding is and
|
||||
what the endianness of the text is. It is also unnecessary.</p>
|
||||
|
||||
<p>Because it's invisible, it often
|
||||
catches people by surprise when it starts doing things it shouldn't
|
||||
be doing. For example, this PHP file:</p>
|
||||
|
||||
<pre><strong>BOM</strong><?php
|
||||
header('Location: index.php');
|
||||
?></pre>
|
||||
|
||||
<p>...will fail with the all too familiar <strong>Headers already sent</strong>
|
||||
PHP error. And because the BOM is invisible, this culprit will go unnoticed.
|
||||
My suggestion is to only use ASCII in PHP pages, but if you must, make
|
||||
sure the page is saved WITHOUT the BOM.</p>
|
||||
|
||||
<blockquote class="aside">
|
||||
<p>The headers the error is referring to are <strong>HTTP headers</strong>,
|
||||
which are sent to the browser before any HTML to tell it various
|
||||
information. The moment any regular text (and yes, a BOM counts as
|
||||
ordinary text) is output, the headers must be sent, and you are
|
||||
not allowed to send anymore. Thus, the error.</p>
|
||||
</blockquote>
|
||||
|
||||
<p>If you are reading in text files to insert into the middle of another
|
||||
page, it is strongly advised (but not strictly necessary) that you replace out the UTF-8 byte
|
||||
sequence for BOM <code>"\xEF\xBB\xBF"</code> before inserting it in,
|
||||
via:</p>
|
||||
|
||||
<pre>$text = str_replace("\xEF\xBB\xBF", '', $text);</pre>
|
||||
|
||||
<h3 id="migrate-fonts">Fonts</h3>
|
||||
|
||||
<p>Generally speaking, people who are having trouble with fonts fall
|
||||
into two categories:</p>
|
||||
|
||||
<ul>
|
||||
<li>Those who want to
|
||||
use an extremely obscure language for which there is very little
|
||||
support even among native speakers of the language, and</li>
|
||||
<li>Those where the primary language of the text is
|
||||
well-supported but there are occasional characters
|
||||
that aren't supported.</li>
|
||||
</ul>
|
||||
|
||||
<p>Yes, there's always a chance where an English user happens across
|
||||
a Sinhalese website and doesn't have the right font. But an English user
|
||||
who happens not to have the right fonts probably has no business reading Sinhalese
|
||||
anyway. So we'll deal with the other two edge cases.</p>
|
||||
|
||||
<h4 id="migrate-fonts-obscure">Obscure scripts</h4>
|
||||
|
||||
<p>If you run a Bengali website, you may get comments from users who
|
||||
would like to read your website but get heaps of question marks or
|
||||
other meaningless characters. Fixing this problem requires the
|
||||
installation of a font or language pack which is often highly
|
||||
dependent on what the language is. <a href="http://bn.wikipedia.org/wiki/%E0%A6%89%E0%A6%87%E0%A6%95%E0%A6%BF%E0%A6%AA%E0%A7%87%E0%A6%A1%E0%A6%BF%E0%A6%AF%E0%A6%BC%E0%A6%BE:Bangla_script_display_help">Here is an example</a>
|
||||
of such a help file for the Bengali language, I am sure there are
|
||||
others out there too. You just have to point users to the appropriate
|
||||
help file.</p>
|
||||
|
||||
<h4 id="migrate-fonts-occasional">Occasional use</h4>
|
||||
|
||||
<p>A prime example of when you'll see some very obscure Unicode
|
||||
characters embedded in what otherwise would be very bland ASCII are
|
||||
letters of the
|
||||
<a href="http://en.wikipedia.org/wiki/International_Phonetic_Alphabet">International
|
||||
Phonetic Alphabet (IPA)</a>, use to designate pronounciations in a very standard
|
||||
manner (you probably see them all the time in your dictionary). Your
|
||||
average font probably won't have support for all of the IPA characters
|
||||
like ʘ (bilabial click) or ʒ (voiced postalveolar fricative).
|
||||
So what's a poor browser to do? Font mix! Smart browsers like Mozilla Firefox
|
||||
and Internet Explorer 7 will borrow glyphs from other fonts in order
|
||||
to make sure that all the characters display properly.</p>
|
||||
|
||||
<p>But what happens when the browser isn't smart and happens to be the
|
||||
most widely used browser in the entire world? Microsoft IE 6
|
||||
is not smart enough to borrow from other fonts when a character isn't
|
||||
present, so more often than not you'll be slapped with a nice big �.
|
||||
To get things to work, MSIE 6 needs a little nudge. You could configure it
|
||||
to use a different font to render the text, but you can acheive the same
|
||||
effect by selectively changing the font for blocks of special characters
|
||||
to known good Unicode fonts.</p>
|
||||
|
||||
<p>Fortunantely, the folks over at Wikipedia have already done all the
|
||||
heavy lifting for you. Get the CSS from the horses mouth here:
|
||||
<a href="http://en.wikipedia.org/wiki/MediaWiki:Common.css">Common.css</a>,
|
||||
and search for ".IPA" There are also a smattering of
|
||||
other classes you can use for other purposes, check out
|
||||
<a href="http://meta.wikimedia.org/wiki/Help:Special_characters#Displaying_Special_Characters">this page</a>
|
||||
for more details. For you lazy ones, this should work:</p>
|
||||
|
||||
<pre>.Unicode {
|
||||
font-family: Code2000, "TITUS Cyberbit Basic", "Doulos SIL",
|
||||
"Chrysanthi Unicode", "Bitstream Cyberbit",
|
||||
"Bitstream CyberBase", Thryomanes, Gentium, GentiumAlt,
|
||||
"Lucida Grande", "Arial Unicode MS", "Microsoft Sans Serif",
|
||||
"Lucida Sans Unicode";
|
||||
font-family /**/:inherit; /* resets fonts for everyone but IE6 */
|
||||
}</pre>
|
||||
|
||||
<p>The standard usage goes along the lines of <code><span class="Unicode">Crazy
|
||||
Unicode stuff here</span></code>. Characters in the
|
||||
<a href="http://en.wikipedia.org/wiki/Windows_Glyph_List_4">Windows Glyph List</a>
|
||||
usually don't need to be fixed, but for anything else you probably
|
||||
want to play it safe. Unless, of course, you don't care about IE6
|
||||
users.</p>
|
||||
|
||||
<h3 id="migrate-variablewidth">Dealing with variable width in functions</h3>
|
||||
|
||||
<p>When people claim that PHP6 will solve all our Unicode problems, they're
|
||||
misinformed. It will not fix any of the abovementioned troubles. It will,
|
||||
however, fix the problem we are about to discuss: processing UTF-8 text
|
||||
in PHP.</p>
|
||||
|
||||
<p>PHP (as of PHP5) is blithely unaware of the existence of UTF-8 (with a few
|
||||
notable exceptions). Sometimes, this will cause problems, other times,
|
||||
this won't. So far, we've avoided discussing the architecture of
|
||||
UTF-8, so, we must first ask, what is UTF-8? Yes, it supports Unicode,
|
||||
and yes, it is variable width. Other traits:</p>
|
||||
|
||||
<ul>
|
||||
<li>Every character's byte sequence is unique and will never be found
|
||||
inside the byte sequence of another character,</li>
|
||||
<li>UTF-8 may use up to four bytes to encode a character,</li>
|
||||
<li>UTF-8 text must be checked for well-formedness,</li>
|
||||
<li>Pure ASCII is also valid UTF-8, and</li>
|
||||
<li>Binary sorting will sort UTF-8 in the same order as Unicode.</li>
|
||||
</ul>
|
||||
|
||||
<p>Each of these traits affect different domains of text processing
|
||||
in different ways. It is beyond the scope of this document to explain
|
||||
what precisely these implications are. PHPWact provides
|
||||
a very good <a href="http://www.phpwact.org/php/i18n/utf-8">reference document</a>
|
||||
on what to expect from each function, although coverage is spotty in
|
||||
some areas. Their more general notes on
|
||||
<a href="http://www.phpwact.org/php/i18n/charsets">character sets</a>
|
||||
are also worth looking at for information on UTF-8. Some rules of thumb
|
||||
when dealing with Unicode text:</p>
|
||||
|
||||
<ul>
|
||||
<li>Do not EVER use functions that:<ul>
|
||||
<li>...convert case (strtolower, strtoupper, ucfirst, ucwords)</li>
|
||||
<li>...claim to be case-insensitive (str_ireplace, stristr, strcasecmp)</li>
|
||||
</ul></li>
|
||||
<li>Think twice before using functions that:<ul>
|
||||
<li>...count characters (strlen will return bytes, not characters;
|
||||
str_split and word_wrap may corrupt)</li>
|
||||
<li>...convert characters to entity references (UTF-8 doesn't need entities)</li>
|
||||
<li>...do very complex string processing (*printf)</li>
|
||||
</ul></li>
|
||||
</ul>
|
||||
|
||||
<p>Note: this list applies to UTF-8 encoded text only: if you have
|
||||
a string that you are 100% sure is ASCII, be my guest and use
|
||||
<code>strtolower</code> (HTML Purifier uses this function.)</p>
|
||||
|
||||
<p>Regardless, always think in bytes, not characters. If you use strpos()
|
||||
to find the position of a character, it will be in bytes, but this
|
||||
usually won't matter since substr() also operates with byte indices!</p>
|
||||
|
||||
<p>You'll also need to make sure your UTF-8 is well-formed and will
|
||||
probably need replacements for some of these functions. I recommend
|
||||
using Harry Fuecks' <a href="http://phputf8.sourceforge.net/">PHP
|
||||
UTF-8</a> library, rather than use mb_string directly. HTML Purifier
|
||||
also defines a few useful UTF-8 compatible functions: check out
|
||||
<code>Encoder.php</code> in the <code>/library/HTMLPurifier/</code>
|
||||
directory.</p>
|
||||
|
||||
<h2 id="externallinks">Further Reading</h2>
|
||||
|
||||
<p>Well, that's it. Hopefully this document has served as a very
|
||||
practical springboard into knowledge of how UTF-8 works. You may have
|
||||
decided that you don't want to migrate yet: that's fine, just know
|
||||
what will happen to your output and what bug reports you may recieve.</p>
|
||||
|
||||
<p>Many other developers have already discussed the subject of Unicode,
|
||||
UTF-8 and internationalization, and I would like to defer to them for
|
||||
a more in-depth look into character sets and encodings.</p>
|
||||
@@ -637,4 +1053,4 @@ a more in-depth look into character sets and encodings.</p>
|
||||
</ul>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
</html>
|
||||
|
@@ -15,7 +15,7 @@
|
||||
|
||||
<div id="filing">Filed under End-User</div>
|
||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||
<div id="home"><a href="http://hp.jpsband.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
|
||||
<p>Clients like their YouTube videos. It gives them a warm fuzzy feeling when
|
||||
they see a neat little embedded video player on their websites that can play
|
||||
@@ -70,14 +70,12 @@ into your documents. YouTube's code goes like this:</p>
|
||||
class="embed-youtube">AyPzM5WK8ys</span></code> your
|
||||
application can reconstruct the full object from this small snippet that
|
||||
passes through HTML Purifier <em>unharmed</em>.
|
||||
<a href="http://hp.jpsband.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/Filter/YouTube.php">Show me the code!</a></p>
|
||||
<a href="http://htmlpurifier.org/svnroot/htmlpurifier/trunk/library/HTMLPurifier/Filter/YouTube.php">Show me the code!</a></p>
|
||||
|
||||
<p>And the corresponding usage:</p>
|
||||
|
||||
<pre><?php
|
||||
// assuming $purifier is an instance of HTMLPurifier
|
||||
require_once 'HTMLPurifier/Filter/YouTube.php';
|
||||
$purifier->addFilter(new HTMLPurifier_Filter_YouTube());
|
||||
$config->set('Filter', 'YouTube', true);
|
||||
?></pre>
|
||||
|
||||
<p>There is a bit going in the two code snippets, so let's explain.</p>
|
||||
@@ -149,4 +147,4 @@ like that, for that matter), send it over and it might get included
|
||||
with the core!</p>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
</html>
|
||||
|
@@ -1,14 +1,22 @@
|
||||
<?php exit;
|
||||
<?php
|
||||
|
||||
// This file demonstrates basic usage of HTMLPurifier.
|
||||
|
||||
require_once '/path/to/htmlpurifier/library/HTMLPurifier.auto.php';
|
||||
// replace this with the path to the HTML Purifier library
|
||||
require_once '../../library/HTMLPurifier.auto.php';
|
||||
|
||||
$purifier = new HTMLPurifier();
|
||||
$config = HTMLPurifier_Config::createDefault();
|
||||
|
||||
// configuration goes here:
|
||||
$config->set('Core', 'Encoding', 'UTF-8'); // replace with your encoding
|
||||
$config->set('HTML', 'Doctype', 'XHTML 1.0 Transitional'); // replace with your doctype
|
||||
|
||||
$purifier = new HTMLPurifier($config);
|
||||
|
||||
// untrusted input HTML
|
||||
$html = '<b>Simple and short';
|
||||
|
||||
$pure_html = $purifier->purify($html);
|
||||
|
||||
echo $pure_html;
|
||||
echo '<pre>' . htmlspecialchars($pure_html) . '</pre>';
|
||||
|
||||
?>
|
@@ -1,136 +0,0 @@
|
||||
<?php
|
||||
|
||||
// using _REQUEST because we accept GET and POST requests
|
||||
|
||||
$content = empty($_REQUEST['xml']) ? 'text/html' : 'application/xhtml+xml';
|
||||
header("Content-type:$content;charset=UTF-8");
|
||||
|
||||
// prevent PHP versions with shorttags from barfing
|
||||
echo '<?xml version="1.0" encoding="UTF-8" ?>
|
||||
';
|
||||
|
||||
function getFormMethod() {
|
||||
return (isset($_REQUEST['post'])) ? 'post' : 'get';
|
||||
}
|
||||
|
||||
if (empty($_REQUEST['strict'])) {
|
||||
?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
<?php
|
||||
} else {
|
||||
?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
|
||||
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||
<?php
|
||||
}
|
||||
?>
|
||||
<html xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en">
|
||||
<head>
|
||||
<title>HTML Purifier Live Demo</title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||
</head>
|
||||
<body>
|
||||
<h1>HTML Purifier Live Demo</h1>
|
||||
<?php
|
||||
|
||||
require_once '../../library/HTMLPurifier.auto.php';
|
||||
|
||||
if (!empty($_REQUEST['html'])) { // start result
|
||||
|
||||
if (strlen($_REQUEST['html']) > 50000) {
|
||||
?>
|
||||
<p>Request exceeds maximum allowed text size of 50kb.</p>
|
||||
<?php
|
||||
} else { // start main processing
|
||||
|
||||
$html = get_magic_quotes_gpc() ? stripslashes($_REQUEST['html']) : $_REQUEST['html'];
|
||||
|
||||
$config = HTMLPurifier_Config::createDefault();
|
||||
$config->set('Core', 'TidyFormat', !empty($_REQUEST['tidy']));
|
||||
$config->set('HTML', 'Strict', !empty($_REQUEST['strict']));
|
||||
$purifier = new HTMLPurifier($config);
|
||||
$pure_html = $purifier->purify($html);
|
||||
|
||||
?>
|
||||
<p>Here is your purified HTML:</p>
|
||||
<div style="border:5px solid #CCC;margin:0 10%;padding:1em;">
|
||||
<?php if(getFormMethod() == 'get') { ?>
|
||||
<div style="float:right;">
|
||||
<a href="http://validator.w3.org/check?uri=referer"><img
|
||||
src="http://www.w3.org/Icons/valid-xhtml10"
|
||||
alt="Valid XHTML 1.0 Transitional" height="31" width="88" style="border:0;" /></a>
|
||||
</div>
|
||||
<?php } ?>
|
||||
<?php
|
||||
|
||||
echo $pure_html;
|
||||
|
||||
?>
|
||||
<div style="clear:both;"></div>
|
||||
</div>
|
||||
<p>Here is the source code of the purified HTML:</p>
|
||||
<pre><?php
|
||||
|
||||
echo htmlspecialchars($pure_html, ENT_COMPAT, 'UTF-8');
|
||||
|
||||
?></pre>
|
||||
<?php
|
||||
if (getFormMethod() == 'post') { // start POST validation notice
|
||||
?>
|
||||
<p>If you would like to validate the code with
|
||||
<a href="http://validator.w3.org/#validate-by-input">W3C's
|
||||
validator</a>, copy and paste the <em>entire</em> demo page's source.</p>
|
||||
<?php
|
||||
} // end POST validation notice
|
||||
|
||||
} // end main processing
|
||||
|
||||
// end result
|
||||
} else {
|
||||
|
||||
?>
|
||||
<p>Welcome to the live demo. Enter some HTML and see how HTML Purifier
|
||||
will filter it.</p>
|
||||
<?php
|
||||
|
||||
}
|
||||
|
||||
?>
|
||||
<form id="filter" action="demo.php<?php
|
||||
echo '?' . getFormMethod();
|
||||
if (isset($_REQUEST['profile']) || isset($_REQUEST['XDEBUG_PROFILE'])) {
|
||||
echo '&XDEBUG_PROFILE=1';
|
||||
} ?>" method="<?php echo getFormMethod(); ?>">
|
||||
<fieldset>
|
||||
<legend>HTML Purifier Input (<?php echo getFormMethod(); ?>)</legend>
|
||||
<textarea name="html" cols="60" rows="15"><?php
|
||||
|
||||
if (isset($html)) {
|
||||
echo htmlspecialchars(
|
||||
HTMLPurifier_Encoder::cleanUTF8($html), ENT_COMPAT, 'UTF-8');
|
||||
}
|
||||
?></textarea>
|
||||
<?php if (getFormMethod() == 'get') { ?>
|
||||
<p><strong>Warning:</strong> GET request method can only hold
|
||||
8129 characters (probably less depending on your browser).
|
||||
If you need to test anything
|
||||
larger than that, try the <a href="demo.php?post">POST form</a>.</p>
|
||||
<?php } ?>
|
||||
<?php if (extension_loaded('tidy')) { ?>
|
||||
<div>Nicely format output with Tidy? <input type="checkbox" value="1"
|
||||
name="tidy"<?php if (!empty($_REQUEST['tidy'])) echo ' checked="checked"'; ?> /></div>
|
||||
<?php } ?>
|
||||
<div>XHTML 1.0 Strict output? <input type="checkbox" value="1"
|
||||
name="strict"<?php if (!empty($_REQUEST['strict'])) echo ' checked="checked"'; ?> /></div>
|
||||
<div>Serve as application/xhtml+xml? (not for IE) <input type="checkbox" value="1"
|
||||
name="xml"<?php if (!empty($_REQUEST['xml'])) echo ' checked="checked"'; ?> /></div>
|
||||
<div>
|
||||
<input type="submit" value="Submit" name="submit" class="button" />
|
||||
</div>
|
||||
</fieldset>
|
||||
</form>
|
||||
<p>Return to <a href="http://hp.jpsband.org/">HTML Purifier's home page</a>.
|
||||
Try the form in <a href="demo.php?get">GET</a> and <a href="demo.php?post">POST</a> request
|
||||
flavors (GET is easy to validate with W3C, but POST allows larger inputs).</p>
|
||||
</body>
|
||||
</html>
|
6
docs/fixquotes.htc
Normal file
6
docs/fixquotes.htc
Normal file
@@ -0,0 +1,6 @@
|
||||
<public:attach event="oncontentready" onevent="init();" />
|
||||
<script>
|
||||
function init() {
|
||||
element.innerHTML = '“'+element.innerHTML+'”';
|
||||
}
|
||||
</script>
|
@@ -13,7 +13,7 @@
|
||||
|
||||
<h1>Documentation</h1>
|
||||
|
||||
<p><strong><a href="http://hp.jpsband.org/">HTML Purifier</a></strong> has documentation for all types of people.
|
||||
<p><strong><a href="http://htmlpurifier.org/">HTML Purifier</a></strong> has documentation for all types of people.
|
||||
Here is an index of all of them.</p>
|
||||
|
||||
<h2>End-user</h2>
|
||||
@@ -31,9 +31,18 @@ information for casual developers using HTML Purifier.</p>
|
||||
<dt><a href="enduser-slow.html">Speeding up HTML Purifier</a></dt>
|
||||
<dd>Explains how to speed up HTML Purifier through caching or inbound filtering.</dd>
|
||||
|
||||
<dt><a href="enduser-utf8.html">UTF-8</a></dt>
|
||||
<dt><a href="enduser-utf8.html">UTF-8: The Secret of Character Encoding</a></dt>
|
||||
<dd>Describes the rationale for using UTF-8, the ramifications otherwise, and how to make the switch.</dd>
|
||||
|
||||
<dt><a href="enduser-tidy.html">Tidy</a></dt>
|
||||
<dd>Tutorial for tweaking HTML Purifier's Tidy-like behavior.</dd>
|
||||
|
||||
<dt><a href="enduser-customize.html">Customize</a></dt>
|
||||
<dd>Tutorial for customizing HTML Purifier's tag and attribute sets.</dd>
|
||||
|
||||
<dt><a href="enduser-uri-filter.html">URI Filters</a></dt>
|
||||
<dd>Tutorial for creating custom URI filters.</dd>
|
||||
|
||||
</dl>
|
||||
|
||||
<h2>Development</h2>
|
||||
@@ -42,9 +51,6 @@ conventions.</p>
|
||||
|
||||
<dl>
|
||||
|
||||
<dt><a href="dev-code-quality.html">Code Quality Issues</a></dt>
|
||||
<dd>Discusses code quality issues and places that need to be refactored.</dd>
|
||||
|
||||
<dt><a href="dev-progress.html">Implementation Progress</a></dt>
|
||||
<dd>Tables detailing HTML element and CSS property implementation coverage.</dd>
|
||||
|
||||
@@ -54,6 +60,16 @@ conventions.</p>
|
||||
<dt><a href="dev-optimization.html">Optimization</a></dt>
|
||||
<dd>Discusses possible methods of optimizing HTML Purifier.</dd>
|
||||
|
||||
<dt><a href="dev-flush.html">Flushing the Purifier</a></dt>
|
||||
<dd>Discusses when to flush HTML Purifier's various caches.</dd>
|
||||
|
||||
<dt><a href="dev-advanced-api.html">Advanced API</a></dt>
|
||||
<dd>Specification for HTML Purifier's advanced API for defining
|
||||
custom filtering behavior.</dd>
|
||||
|
||||
<dt><a href="dev-config-schema.html">Config Schema</a></dt>
|
||||
<dd>Describes config schema framework in HTML Purifier.</dd>
|
||||
|
||||
</dl>
|
||||
|
||||
<h2>Proposals</h2>
|
||||
@@ -101,6 +117,12 @@ the code. They may be upgraded to HTML files or stay as TXT scratchpads.</p>
|
||||
<td>Common security issues that may still arise (half-baked).</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td>Development</td>
|
||||
<td><a href="dev-code-quality.txt">Code Quality Issues</a></td>
|
||||
<td>Enumerates code quality issues and places that need to be refactored.</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td>Proposal</td>
|
||||
<td><a href="proposal-filter-levels.txt">Filter levels</a></td>
|
||||
@@ -119,10 +141,16 @@ the code. They may be upgraded to HTML files or stay as TXT scratchpads.</p>
|
||||
<td>Assorted configuration options that could be implemented.</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td>Proposal</td>
|
||||
<td><a href="proposal-css-extraction.txt">CSS extraction</a></td>
|
||||
<td>Taking the inline CSS out of documents and into <code>style</code>.</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td>Reference</td>
|
||||
<td><a href="ref-loose-vs-strict.txt">Loose vs.Strict</a></td>
|
||||
<td>Differences between HTML Strict and Transitional versions.</td>
|
||||
<td><a href="ref-content-models.txt">Handling Content Model Changes</a></td>
|
||||
<td>Discusses how to tidy up content model changes using custom ChildDef classes.</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
@@ -133,14 +161,8 @@ the code. They may be upgraded to HTML files or stay as TXT scratchpads.</p>
|
||||
|
||||
<tr>
|
||||
<td>Reference</td>
|
||||
<td><a href="ref-strictness.txt">Strictness</a></td>
|
||||
<td>Short essay on how loose definition isn't really loose.</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
<td>Reference</td>
|
||||
<td><a href="ref-xhtml-1.1.txt">XHTML 1.1</a></td>
|
||||
<td>What we'd have to do to support XHTML 1.1.</td>
|
||||
<td><a href="ref-html-modularization.txt">Modularization of HTMLDefinition</a></td>
|
||||
<td>Provides a high-level overview of the concepts behind HTMLModules.</td>
|
||||
</tr>
|
||||
|
||||
<tr>
|
||||
@@ -155,4 +177,4 @@ the code. They may be upgraded to HTML files or stay as TXT scratchpads.</p>
|
||||
|
||||
<div id="version">$Id$</div>
|
||||
</body>
|
||||
</html>
|
||||
</html>
|
||||
|
@@ -15,7 +15,7 @@
|
||||
|
||||
<div id="filing">Filed under Proposals</div>
|
||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||
<div id="home"><a href="http://hp.jpsband.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
|
||||
<p>Your website probably has a color-scheme.
|
||||
<span style="color:#090; background:#FFF;">Green on white</span>,
|
||||
|
@@ -7,33 +7,15 @@ value is used for. This means decentralized configuration declarations that
|
||||
are nevertheless error checking and a centralized configuration object.
|
||||
|
||||
Directives are divided into namespaces, indicating the major portion of
|
||||
functionality they cover (although there may be overlaps. Please consult
|
||||
functionality they cover (although there may be overlaps). Please consult
|
||||
the documentation in ConfigDef for more information on these namespaces.
|
||||
|
||||
Since configuration is dependant on context, internal classes require a
|
||||
configuration object to be passed as a parameter. (They also require a
|
||||
Context object).
|
||||
Context object). A majority of classes do not need the config object,
|
||||
but for those who do, it is a lifesaver.
|
||||
|
||||
In relation to HTMLDefinition and CSSDefinition, there could be a special class
|
||||
of directives that influence the *construction* of the Definition object.
|
||||
A theoretical call pattern would look like:
|
||||
|
||||
1. Client calls Config->getHTMLDefinition()
|
||||
2. Config calls HTMLDefinition->createNew(this)
|
||||
3. HTMLDefinition constructs itself with base configuration
|
||||
4. HTMLDefinition calls Config->get('HTML')
|
||||
5. Config returns array of directives
|
||||
6. HTMLDefinition performs operations and changes specified by directives
|
||||
7. HTMLPurifier returns constructed definition
|
||||
8. Config caches definition so it doesn't have to be generated again
|
||||
9. Config returns definition
|
||||
|
||||
You could also override Config's copy of the definition with your own
|
||||
custom copy, which OVERRIDES all directives. Only the base, vanilla copy
|
||||
is the Singleton, the object actually interfaced with is a operated-upon
|
||||
clone of that object. Also, if an update to the directives would update
|
||||
the definition, you'd have to force reconstruction.
|
||||
|
||||
In practice, the pulling directives from the config object are
|
||||
solely need-based, and the flex points are littered throughout the
|
||||
setup() function. Some sort of refactoring is likely in order.
|
||||
Definition objects are complex datatypes influenced by their respective
|
||||
directive namespaces (HTMLDefinition with HTML and CSSDefinition with CSS).
|
||||
If any of these directives is updated, HTML Purifier forces the definition
|
||||
to be regenerated.
|
||||
|
32
docs/proposal-css-extraction.txt
Normal file
32
docs/proposal-css-extraction.txt
Normal file
@@ -0,0 +1,32 @@
|
||||
|
||||
Extracting inline CSS from HTML Purifier
|
||||
voodoofied: Assigning semantics to elements
|
||||
|
||||
Sander Tekelenburg brought to my attention the poor programming style of
|
||||
inline CSS in HTML documents. In an ideal world, we wouldn't be using inline
|
||||
CSS at all: everything would be assigned using semantic class attributes
|
||||
from an external stylesheet.
|
||||
|
||||
With ExtractStyleBlocks and CSSTidy, this is now possible (when allowed, users
|
||||
can specify a style element which gets extracted from the user-submitted HTML, which
|
||||
the application can place in the head of the HTML document). But there still
|
||||
is the issue of inline CSS that refuses to go away.
|
||||
|
||||
The basic idea behind this feature is assign every element a unique identifier,
|
||||
and then move all of the CSS data to a style-sheet. This HTML:
|
||||
|
||||
<div style="text-align:center">Big <span style="color:red;">things</span>!</div>
|
||||
|
||||
into
|
||||
|
||||
<div id="hp-12345">Big <span id="hp-12346">things</span>!</div>
|
||||
|
||||
and a stylesheet that is:
|
||||
|
||||
#hp-12345 {text-align:center;}
|
||||
#hp-12346 {color:red;}
|
||||
|
||||
Beyond that, HTML Purifier can magically merge common CSS values together,
|
||||
and a whole manner of other heuristic things. HTML Purifier should also
|
||||
make it easy for an admin to re-style the HTML semantically. Speed is not
|
||||
an issue. Also, better WYSIWYG editors are needed.
|
@@ -2,23 +2,16 @@
|
||||
Filter Levels
|
||||
When one size *does not* fit all
|
||||
|
||||
The more I think about it, the less sense it makes for maintaining one huge
|
||||
monolithic HTMLDefinition class. There's simply so much variation that
|
||||
could go into this definition: the set of HTML good for blog entries is
|
||||
definitely too large for HTML that would be allowed in blog comments. Going
|
||||
from Transitional to Strict requires changes to the definition.
|
||||
It makes little sense to constrain users to one set of HTML elements and
|
||||
attributes and tell them that they are not allowed to mold this in
|
||||
any fashion. Many users demand to be able to custom-select which elements
|
||||
and attributes they want. This is fine: because HTML Purifier keeps close
|
||||
track of what elements are safe to use, there is no way for them to
|
||||
accidently allow an XSS-able tag.
|
||||
|
||||
Allowing users to specify their own whitelists is one step (implemented, btw),
|
||||
but I have doubts on only doing this. Simply put, the typical programmer is too
|
||||
lazy to actually go through the trouble of investigating which tags, attributes
|
||||
and properties to allow. HTMLDefinition makes a big part of what HTMLPurifier
|
||||
is.
|
||||
|
||||
The idea, then, is to setup fundamentally different set of definitions, which
|
||||
can further be customized using simpler configuration options. Alternatively,
|
||||
they could be implemented as configuration profiles, which simply load
|
||||
a set of recommended directives to acheive a desired affect (no simpler
|
||||
config options though).
|
||||
However, combing through the HTML spec to make your own whitelist can
|
||||
be a daunting task. HTML Purifier ought to offer pre-canned filter levels
|
||||
that amateur users can select based on what they think is their use-case.
|
||||
|
||||
Here are some fuzzy levels you could set:
|
||||
|
||||
@@ -39,13 +32,17 @@ Here are some fuzzy levels you could set:
|
||||
|
||||
One final note: when you start axing tags that are more commonly used, you
|
||||
run the risk of accidentally destroying user data, especially if the data
|
||||
is incoming from a WYSIWYG eidtor that hasn't been synced accordingly. This may
|
||||
is incoming from a WYSIWYG editor that hasn't been synced accordingly. This may
|
||||
make forbidden element to text transformations desirable (for example, images).
|
||||
|
||||
|
||||
|
||||
== Element Risk Analysis ==
|
||||
|
||||
Although none of the currently supported elements presents a security
|
||||
threat per-say, some can cause problems for page layouts or be
|
||||
extremely complicated.
|
||||
|
||||
Legend:
|
||||
[danger level] - regular tags / uncommon tags ~ deprecated tags
|
||||
[danger level]* - rare tags
|
||||
@@ -114,6 +111,10 @@ Partially presentational - table.cellpadding, table.cellspacing,
|
||||
|
||||
== CSS Risk Analysis ==
|
||||
|
||||
Currently, there is no support for fine-grained "allowed CSS" specification,
|
||||
mainly because I'm lazy, partially because no one has asked for it. However,
|
||||
this will be added eventually.
|
||||
|
||||
There are certain CSS elements that are extremely useful inline, but then
|
||||
as you get to more presentation oriented styling it may not always be
|
||||
appropriate to inline them.
|
||||
@@ -126,6 +127,7 @@ any CSS properties that are not currently implemented (such as position).
|
||||
Dangerous, can go outside container - float
|
||||
Easy to abuse - font-size, font-family (font), width
|
||||
Colored - background-color (background), border-color (border), color
|
||||
(see proposal-colors.html)
|
||||
Dramatic - border, list-style-position (list-style), margin, padding,
|
||||
text-align, text-indent, text-transform, vertical-align, line-height
|
||||
|
||||
|
@@ -1,42 +1,6 @@
|
||||
We are going to model our I18N/L10N off of MediaWiki's system. Their's is
|
||||
obviously quite complicated, so we're going to simplify it a bit for our needs.
|
||||
|
||||
== Structure ==
|
||||
|
||||
First, you have a Language object. This object contains all the localisable
|
||||
message strings, as well as other important language-specific settings and
|
||||
custom behavior (uppercasing, lowercasing, printing dates, formatting
|
||||
numbers, etc.)
|
||||
|
||||
The object is constructed from two sources: subclassed versions of itself
|
||||
(classes) and Message files (messages).
|
||||
|
||||
== General use ==
|
||||
|
||||
You load a language object by calling the Language::factory() function.
|
||||
This function the class file for the object (taking in account fallback
|
||||
languages by using the fallback langauge's object but overloading the
|
||||
language key) and returns that object. Nothing else happens.
|
||||
|
||||
When a message/etc is requested, a lazy load initializor is called. Now the
|
||||
real work starts. We're first going to take the scenario that the language
|
||||
is not cached. The system loads the Messages file by:
|
||||
|
||||
require( $filename );
|
||||
$cache = compact( self::$mLocalisationKeys );
|
||||
|
||||
...where self::$mLocalisationKeys is the name of variables that could be used
|
||||
in the localization file. This lets you use things like:
|
||||
|
||||
$fallback = false;
|
||||
$rtl = false;
|
||||
|
||||
...and easily siphon them into arrays.
|
||||
|
||||
Then, we load the $fallback language (if not set, English) to fill in the gaps in
|
||||
the messages. There is specialized behavior for certain keys, as they can be
|
||||
mergeable maps, lists or alias lists (not sure what the last one is).
|
||||
|
||||
== Caching ==
|
||||
|
||||
MediaWiki has lots of caching mechanisms built in, which make the code somewhat
|
||||
|
@@ -2,7 +2,8 @@
|
||||
Configuration Ideas
|
||||
|
||||
Here are some theoretical configuration ideas that we could implement some
|
||||
time. Note the naming convention: %Namespace.Directive
|
||||
time. Note the naming convention: %Namespace.Directive. If you want one
|
||||
implemented, give us a ring, and we'll move it up the priority chain.
|
||||
|
||||
%Attr.RewriteFragments - if there's %Attr.IDPrefix we may want to transparently
|
||||
rewrite the URLs we parse too. However, we can only do it when it's a pure
|
||||
@@ -22,8 +23,6 @@ time. Note the naming convention: %Namespace.Directive
|
||||
%URI.AddRelNofollow - will add rel="nofollow" to all links, preventing the
|
||||
spread of ill-gotten pagerank
|
||||
|
||||
%URI.RelativeToAbsolute - transforms all relative URIs to absolute form
|
||||
|
||||
%URI.HostBlacklistRegex - regexes that if matching the host are disallowed
|
||||
%URI.HostWhitelist - domain names that are excluded from the host blacklist
|
||||
%URI.HostPolicy - determines whether or not its reject all and then whitelist
|
||||
|
48
docs/ref-content-models.txt
Normal file
48
docs/ref-content-models.txt
Normal file
@@ -0,0 +1,48 @@
|
||||
|
||||
Handling Content Model Changes
|
||||
|
||||
|
||||
1. Context
|
||||
|
||||
The distinction between Transitional and Strict document types is somewhat
|
||||
of an anomaly in the lineage of XHTML document types (following 1.0, no
|
||||
doctypes do not have flavors: instead, modularization is used to let
|
||||
document authors vary their elements). This transition is usually quite
|
||||
straight-forward, as W3C usually deprecates attributes or elements, which
|
||||
are quite easily handled using tag and attribute transforms.
|
||||
|
||||
However, for two elements, <blockquote>, <body> and <address>, W3C elected
|
||||
to also change the content model. <blockquote> and <body> originally
|
||||
accepted both inline and block elements, but in the strict doctype they
|
||||
only allow block elements. With <address>, the situation is inverted:
|
||||
<p> tags were now forbidden from appearing within this tag.
|
||||
|
||||
|
||||
2. Current situation
|
||||
|
||||
Currently, HTML Purifier treats <blockquote> specially during Tidy mode
|
||||
using a custom ChildDef class StrictBlockquote. StrictBlockquote
|
||||
operates similarly to Required, except that when it encounters an inline
|
||||
element, it will wrap it in a block tag (as specified by
|
||||
%HTML.BlockWrapper, the default is <p>). The naming suggests it can
|
||||
only be used for <blockquote>s, although it may be possible to
|
||||
genericize it to work on other cases of this nature (this would be of
|
||||
little practical application, as no other element in XHTML 1.1 or earlier
|
||||
has a block-only content model).
|
||||
|
||||
Tidy currently contains no custom, lenient implementation for <address>.
|
||||
If one were to be written, it would likely operate on the principle that,
|
||||
when a <p> tag were to be encountered, it would be replaced with a
|
||||
leading and trailing <br /> tag (the contents of <p>, being inline, are
|
||||
not an issue). There is no prior work with this sort of operation.
|
||||
|
||||
|
||||
3. Outside applicability
|
||||
|
||||
There are a number of other elements that contain restrictive content
|
||||
models, such as <ul> or <span> (the latter is restrictive in that it
|
||||
does not allow block elements). In the former case, an errant node
|
||||
is eliminated completely, in the latter case, the text of the node
|
||||
would is preserved (as the parent node does allow PCDATA). Custom
|
||||
content model implementations probably are not the best way of handling
|
||||
these cases, instead, node bubbling should be implemented instead.
|
28
docs/ref-css-length.txt
Normal file
28
docs/ref-css-length.txt
Normal file
@@ -0,0 +1,28 @@
|
||||
|
||||
CSS Length Reference
|
||||
To bound, or not to bound, that is the question
|
||||
|
||||
It's quite a reasonable request, really, and it's already been implemented
|
||||
for HTML. That is, length bounding. It makes little sense to let users
|
||||
define text blocks that have a font-size of 63,360 inches (that's a mile,
|
||||
by the way) or a width of forty-fold the parent container.
|
||||
|
||||
But it's a little more complicated then that. There are multiple units
|
||||
one can use, and we have to a little unit conversion to get things working.
|
||||
Here's what we have:
|
||||
|
||||
Absolute:
|
||||
1 in ~= 2.54 cm
|
||||
1 cm = 10 mm
|
||||
1 pt = 1/72 in
|
||||
1 pc = 12 pt
|
||||
|
||||
Relative:
|
||||
1 em ~= 10.0667 px
|
||||
1 ex ~= 0.5 em, though Mozilla Firefox says 1 ex = 6px
|
||||
1 px ~= 1 pt
|
||||
|
||||
Watch out: font-sizes can also be nested to get successively larger
|
||||
(although I do not relish having to keep track of context font-sizes,
|
||||
this may be necessary, especially for some of the more advanced features
|
||||
for preventing things like white on white).
|
@@ -15,7 +15,7 @@
|
||||
|
||||
<div id="filing">Filed under Reference</div>
|
||||
<div id="index">Return to the <a href="index.html">index</a>.</div>
|
||||
<div id="home"><a href="http://hp.jpsband.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
<div id="home"><a href="http://htmlpurifier.org/">HTML Purifier</a> End-User Documentation</div>
|
||||
|
||||
<p>Many thanks to the DevNetwork community for answering questions,
|
||||
theorizing about design, and offering encouragement during
|
||||
@@ -42,4 +42,4 @@ the development of this library in these forum threads:</p>
|
||||
|
||||
<div id="version">$Id$</div>
|
||||
</body>
|
||||
</html>
|
||||
</html>
|
||||
|
164
docs/ref-html-modularization.txt
Normal file
164
docs/ref-html-modularization.txt
Normal file
@@ -0,0 +1,164 @@
|
||||
|
||||
The Modularization of HTMLDefinition in HTML Purifier
|
||||
|
||||
WARNING: This document was drafted before the implementation of this
|
||||
system, and some implementation details may have evolved over time.
|
||||
|
||||
HTML Purifier uses the modularization of XHTML
|
||||
<http://www.w3.org/TR/xhtml-modularization/> to organize the internals
|
||||
of HTMLDefinition into a more manageable and extensible fashion. Rather
|
||||
than have one super-object, HTMLDefinition is split into HTMLModules,
|
||||
each of which are responsible for defining elements, their attributes,
|
||||
and other properties (for a more indepth coverage, see
|
||||
/library/HTMLPurifier/HTMLModule.php's docblock comments). These modules
|
||||
are managed by HTMLModuleManager.
|
||||
|
||||
Modules that we don't support but could support are:
|
||||
|
||||
* 5.6. Table Modules
|
||||
o 5.6.1. Basic Tables Module [?]
|
||||
* 5.8. Client-side Image Map Module [?]
|
||||
* 5.9. Server-side Image Map Module [?]
|
||||
* 5.12. Target Module [?]
|
||||
* 5.21. Name Identification Module [deprecated]
|
||||
|
||||
These modules would be implemented as "unsafe":
|
||||
|
||||
* 5.2. Core Modules
|
||||
o 5.2.1. Structure Module
|
||||
* 5.3. Applet Module
|
||||
* 5.5. Forms Modules
|
||||
o 5.5.1. Basic Forms Module
|
||||
o 5.5.2. Forms Module
|
||||
* 5.10. Object Module
|
||||
* 5.11. Frames Module
|
||||
* 5.13. Iframe Module
|
||||
* 5.14. Intrinsic Events Module
|
||||
* 5.15. Metainformation Module
|
||||
* 5.16. Scripting Module
|
||||
* 5.17. Style Sheet Module
|
||||
* 5.19. Link Module
|
||||
* 5.20. Base Module
|
||||
|
||||
We will not be using W3C's XML Schemas or DTDs directly due to the lack
|
||||
of robust tools for handling them (the main problem is that all the
|
||||
current parsers are usually PHP 5 only and solely-validating, not
|
||||
correcting).
|
||||
|
||||
This system may be generalized and ported over for CSS.
|
||||
|
||||
== General Use-Case ==
|
||||
|
||||
The outwards API of HTMLDefinition has been largely preserved, not
|
||||
only for backwards-compatibility but also by design. Instead,
|
||||
HTMLDefinition can be retrieved "raw", in which it loads a structure
|
||||
that closely resembles the modules of XHTML 1.1. This structure is very
|
||||
dynamic, making it easy to make cascading changes to global content
|
||||
sets or remove elements in bulk.
|
||||
|
||||
However, once HTML Purifier needs the actual definition, it retrieves
|
||||
a finalized version of HTMLDefinition. The finalized definition involves
|
||||
processing the modules into a form that it is optimized for multiple
|
||||
calls. This final version is immutable and, even if editable, would
|
||||
be extremely hard to change.
|
||||
|
||||
So, some code taking advantage of the XHTML modularization may look
|
||||
like this:
|
||||
|
||||
<?php
|
||||
$config = HTMLPurifier_Config::createDefault();
|
||||
$def =& $config->getHTMLDefinition(true); // reference to raw
|
||||
$def->addElement('marquee', 'Block', 'Flow', 'Common');
|
||||
$purifier = new HTMLPurifier($config);
|
||||
$purifier->purify($html); // now the definition is finalized
|
||||
?>
|
||||
|
||||
== Inclusions ==
|
||||
|
||||
One of the nice features of HTMLDefinition is that piggy-backing off
|
||||
of global attribute and content sets is extremely easy to do.
|
||||
|
||||
=== Attributes ===
|
||||
|
||||
HTMLModule->elements[$element]->attr stores attribute information for the
|
||||
specific attributes of $element. This is quite close to the final
|
||||
API that HTML Purifier interfaces with, but there's an important
|
||||
extra feature: attr may also contain a array with a member index zero.
|
||||
|
||||
<?php
|
||||
HTMLModule->elements[$element]->attr[0] = array('AttrSet');
|
||||
?>
|
||||
|
||||
Rather than map the attribute key 0 to an array (which should be
|
||||
an AttrDef), it defines a number of attribute collections that should
|
||||
be merged into this elements attribute array.
|
||||
|
||||
Furthermore, the value of an attribute key, attribute value pair need
|
||||
not be a fully fledged AttrDef object. They can also be a string, which
|
||||
signifies a AttrDef that is looked up from a centralized registry
|
||||
AttrTypes. This allows more concise attribute definitions that look
|
||||
more like W3C's declarations, as well as offering a centralized point
|
||||
for modifying the behavior of one attribute type. And, of course, the
|
||||
old method of manually instantiating an AttrDef still works.
|
||||
|
||||
=== Attribute Collections ===
|
||||
|
||||
Attribute collections are stored and processed in the AttrCollections
|
||||
object, which is responsible for performing the inclusions signified
|
||||
by the 0 index. These attribute collections, too, are mutable, by
|
||||
using HTMLModule->attr_collections. You may add new attributes
|
||||
to a collection or define an entirely new collection for your module's
|
||||
use. Inclusions can also be cumulative.
|
||||
|
||||
Attribute collections allow us to get rid of so called "global attributes"
|
||||
(which actually aren't so global).
|
||||
|
||||
=== Content Models and ChildDef ===
|
||||
|
||||
An implementation of the above-mentioned attributes and attribute
|
||||
collections was applied to the ChildDef system. HTML Purifier uses
|
||||
a proprietary system called ChildDef for performance and flexibility
|
||||
reasons, but this does not line up very well with W3C's notion of
|
||||
regexps for defining the allowed children of an element.
|
||||
|
||||
HTMLPurifier->elements[$element]->content_model and
|
||||
HTMLPurifier->elements[$element]->content_model_type store information
|
||||
about the final ChildDef that will be stored in
|
||||
HTMLPurifier->elements[$element]->child (we use a different variable
|
||||
because the two forms are sufficiently different).
|
||||
|
||||
$content_model is an abstract, string representation of the internal
|
||||
state of ChildDef, while $content_model_type is a string identifier
|
||||
of which ChildDef subclass to instantiate. $content_model is processed
|
||||
by substituting all content set identifiers (capitalized element names)
|
||||
with their contents. It is then parsed and passed into the appropriate
|
||||
ChildDef class, as defined by the ContentSets->getChildDef() or the
|
||||
custom fallback HTMLModule->getChildDef() for custom child definitions
|
||||
not in the core.
|
||||
|
||||
You'll need to use these facilities if you plan on referencing a content
|
||||
set like "Inline" or "Block", and using them is recommended even if you're
|
||||
not due to their conciseness.
|
||||
|
||||
A few notes on $content_model: it's structure can be as complicated
|
||||
as you want, but the pipe symbol (|) is reserved for defining possible
|
||||
choices, due to the content sets implementation. For example, a content
|
||||
model that looks like:
|
||||
|
||||
"Inline -> Block -> a"
|
||||
|
||||
...when the Inline content set is defined as "span | b" and the Block
|
||||
content set is defined as "div | blockquote", will expand into:
|
||||
|
||||
"span | b -> div | blockquote -> a"
|
||||
|
||||
The custom HTMLModule->getChildDef() function will need to be able to
|
||||
then feed this information to ChildDef in a usable manner.
|
||||
|
||||
=== Content Sets ===
|
||||
|
||||
Content sets can be altered using HTMLModule->content_sets, an associative
|
||||
array of content set names to content set contents. If the content set
|
||||
already exists, your values are appended on to it (great for, say,
|
||||
registering the font tag as an inline element), otherwise it is
|
||||
created. They are substituted into content_model.
|
@@ -1,37 +0,0 @@
|
||||
|
||||
Loose versus Strict
|
||||
Changes from one doctype to another
|
||||
|
||||
There are changes. Wow, how insightful. Not everything changed is relevant
|
||||
to HTML Purifier, though, so let's take a look:
|
||||
|
||||
== Major incompatibilities ==
|
||||
|
||||
[done] BLOCKQUOTE changes from 'flow' to 'block'
|
||||
current behavior: inline inner contents should not be nuked, block-ify as necessary
|
||||
[partially-done] U, S, STRIKE cut
|
||||
current behavior: removed completely
|
||||
projected behavior: replace with appropriate inline span + CSS
|
||||
[done] ADDRESS from potpourri to Inline (removes p tags)
|
||||
current behavior: block tags silently dropped
|
||||
ideal behavior: replace tags with something like <br>. (not high priority)
|
||||
|
||||
== Things we can loosen up ==
|
||||
|
||||
Tags DIR, MENU, CENTER, ISINDEX, FONT, BASEFONT? allowed in loose
|
||||
current behavior: transform to strict-valid forms
|
||||
Attributes allowed in loose (see attribute transforms in 'dev-progress.html')
|
||||
current behavior: projected to transform into strict-valid forms
|
||||
|
||||
== Periphery issues ==
|
||||
|
||||
A tag's attribute 'target' (for selecting frames) cut
|
||||
current behavior: not allowed at all
|
||||
projected behavior: use loose doctype if needed, needs valid values
|
||||
[done] OL/LI tag's attribute 'start'/'value' (for renumbering lists) cut
|
||||
current behavior: no substitute, just delete when in strict, allow in loose
|
||||
Attribute 'name' deprecated in favor of 'id'
|
||||
current behavior: dropped silently
|
||||
projected behavior: create proper AttrTransform (currently not allowed at all)
|
||||
[done] PRE tag allows SUB/SUP? (strict dtd comment vs syntax, loose disallows)
|
||||
current behavior: disallow as usual
|
@@ -18,5 +18,7 @@ HTML Purifier context.
|
||||
|
||||
<listing>, monospace pre-variant (extremely rare)
|
||||
<plaintext>, escapes all tags to the end of document
|
||||
<ruby> and friends, (more research needed, appears to be XHTML 1.1 markup)
|
||||
<xmp>, monospace, replace with pre
|
||||
|
||||
These should be put into their own Tidy module, not loaded by default(?). These
|
||||
all qualify as "lenient" transforms.
|
||||
|
@@ -1,37 +0,0 @@
|
||||
|
||||
Is HTML Purifier Strict or Transitional?
|
||||
A little bit of helpful guidance
|
||||
|
||||
Despite the fact that HTML Purifier professes to support both transitional and
|
||||
strict HTML, it rejects a lot of attributes and elements that are actually, indeed,
|
||||
valid. You can investigate progress.html to find out precisely what we
|
||||
are doing to these *deprecated* attributes.
|
||||
|
||||
However, users have found that Strict HTML imposes some quite unreasonable
|
||||
restrictions on certain things. The start and value attributes in ol and
|
||||
li (respectively) perhaps are the most contested. There's is currently no
|
||||
widely supported browser method short of JavaScript that can replace these
|
||||
two deprecated elements. It behooves us to allow these deprecated
|
||||
attributes when the output is transitional.
|
||||
|
||||
Fortunantely, that's the only real bugger case. The others have near-perfect
|
||||
CSS equivalents, and were presentational anyway. However, the other question
|
||||
pops up: should we always convert these to the CSS forms when 1. the spec
|
||||
allows them anyway and 2. older browsers support them better? After all, the
|
||||
whole point about CSS is to seperate styling from content, so inline styling
|
||||
doesn't solve that problem.
|
||||
|
||||
It's an icky question, and we'll have to deal with it as more and more
|
||||
transforms get implemented. As of right now, however, we currently support
|
||||
these loose-only constructs in loose mode:
|
||||
|
||||
- <ul start="1">, <li value="1"> attributes
|
||||
- <u>, <strike>, <s> tags
|
||||
- flow children in <blockquote>
|
||||
- mixed children in <address>
|
||||
|
||||
The changed child definitions as well as the ul.start li.value are the most
|
||||
compelling reasons why loose should be used. We may want offer disabling <u>,
|
||||
<strike> and <s> by themselves. We may also want to offer no pre-emptive
|
||||
deprecated conversions. This all must be unified.
|
||||
|
@@ -2,8 +2,23 @@
|
||||
Web Hypertext Application Technology Working Group
|
||||
WHATWG
|
||||
|
||||
I don't think we need to worry about them. Untrusted users shouldn't be
|
||||
submitting applications, eh? But if some interesting attribute pops up in
|
||||
their spec, and might be worth supporting, stick it here.
|
||||
== HTML 5 ==
|
||||
|
||||
(none so far, as you can see)
|
||||
URL: http://www.whatwg.org/specs/web-apps/current-work/
|
||||
|
||||
HTML 5 defines a kaboodle of new elements and attributes, as well as
|
||||
some well-defined, "quirks mode" HTML parsing. Although WHATWG professes
|
||||
to be targeted towards web applications, many of their semantic additions
|
||||
would be quite useful in regular documents. Eventually, HTML
|
||||
Purifier will need to audit their lists and figure out what changes need
|
||||
to be made. This process is complicated by the fact that the WHATWG
|
||||
doesn't buy into W3C's modularization of XHTML 1.1: we may need
|
||||
to remodularize HTML 5 (probably done by section name). No sense in
|
||||
committing ourselves till the spec stabilizes, though.
|
||||
|
||||
More immediately speaking though, however, is the well-defined parsing
|
||||
behavior that HTML 5 adds. While I have little interest in writing
|
||||
another DirectLex parser, other parsers like ph5p
|
||||
<http://jero.net/lab/ph5p/> can be adapted to DOMLex to support much more
|
||||
flexible HTML parsing (a cool feature I've seen is how they resolve
|
||||
<b>bold<i>both</b>italic</i>).
|
||||
|
@@ -1,21 +0,0 @@
|
||||
|
||||
Getting XHTML 1.1 Working
|
||||
|
||||
It's quite simple, according to <http://www.w3.org/TR/xhtml11/changes.html>
|
||||
|
||||
1. Scratch lang entirely in favor of xml:lang
|
||||
2. Scratch name entirely in favor of id (partially-done)
|
||||
3. Support Ruby <http://www.w3.org/TR/2001/REC-ruby-20010531/>
|
||||
|
||||
...but that's only an informative section. More things to do:
|
||||
|
||||
1. Scratch style attribute (it's deprecated)
|
||||
2. Be module-aware (this might entail intelligent grouping in the definition
|
||||
and allowing users to specifically remove certain modules (see 5))
|
||||
3. Cross-reference minimal content models with existing DTDs and determine
|
||||
changes (todo)
|
||||
4. Watch out for the Legacy Module
|
||||
<http://www.w3.org/TR/2001/REC-xhtml-modularization-20010410/abstract_modules.html#s_legacymodule>
|
||||
5. Let users specify their own custom modules
|
||||
6. Study Modularization document
|
||||
<http://www.w3.org/TR/2001/REC-xhtml-modularization-20010410/>
|
8
docs/specimens/LICENSE
Normal file
8
docs/specimens/LICENSE
Normal file
@@ -0,0 +1,8 @@
|
||||
Licensing of Specimens
|
||||
|
||||
Some files in this directory have different licenses:
|
||||
|
||||
windows-live-mail-desktop-beta.html - donated by laacz, public domain
|
||||
img.png - LGPL, from <http://commons.wikimedia.org/wiki/Image:Pastille_chrome.png>
|
||||
|
||||
All other files are by me, and are licensed under LGPL.
|
165
docs/specimens/html-align-to-css.html
Normal file
165
docs/specimens/html-align-to-css.html
Normal file
@@ -0,0 +1,165 @@
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
|
||||
"http://www.w3.org/TR/html4/loose.dtd">
|
||||
<html>
|
||||
<head>
|
||||
<title>HTML align attribute to CSS - HTML Purifier Specimen</title>
|
||||
<style type="text/css">
|
||||
div.container {position:relative;height:110px;}
|
||||
div.container.legend .test {text-align:center;line-height:100px;}
|
||||
div.test {width:100px;height:100px;border:1px solid black;
|
||||
position:absolute;top:10px;}
|
||||
div.test.html {left:10px;}
|
||||
div.test.css {left:140px;}
|
||||
table {background:#F00;}
|
||||
img {border:1px solid #000;}
|
||||
hr {width:50px;}
|
||||
div.segment {width:250px; float:left; margin-top:1em;}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<h1>HTML align attribute to CSS</h1>
|
||||
|
||||
<p>Inspect source for methodology.</p>
|
||||
|
||||
<div class="container legend">
|
||||
<div class="test html">
|
||||
HTML
|
||||
</div>
|
||||
<div class="test css">
|
||||
CSS
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="segment">
|
||||
|
||||
<h2>table.align</h2>
|
||||
|
||||
<h3>left</h3>
|
||||
<div class="container">
|
||||
<div class="test html">
|
||||
a<table align="left"><tr><td>O</td></tr></table>a
|
||||
</div>
|
||||
<div class="test css">
|
||||
a<table style="float:left;"><tr><td>O</td></tr></table>a
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h3>center</h3>
|
||||
<div class="container">
|
||||
<div class="test html">
|
||||
a<table align="center"><tr><td>O</td></tr></table>a
|
||||
</div>
|
||||
<div class="test css">
|
||||
a<table style="margin-left:auto; margin-right:auto;"><tr><td>O</td></tr></table>a
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h3>right</h3>
|
||||
<div class="container">
|
||||
<div class="test html">
|
||||
a<table align="right"><tr><td>O</td></tr></table>a
|
||||
</div>
|
||||
<div class="test css">
|
||||
a<table style="float:right;"><tr><td>O</td></tr></table>a
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<!-- ################################################################## -->
|
||||
|
||||
<div class="segment">
|
||||
<h2>img.align</h2>
|
||||
<h3>left</h3>
|
||||
<div class="container">
|
||||
<div class="test html">
|
||||
a<img src="img.png" align="left">a
|
||||
</div>
|
||||
<div class="test css">
|
||||
a<img src="img.png" style="float:left;">a
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h3>right</h3>
|
||||
<div class="container">
|
||||
<div class="test html">
|
||||
a<img src="img.png" align="right">a
|
||||
</div>
|
||||
<div class="test css">
|
||||
a<img src="img.png" style="float:right;">a
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h3>bottom</h3>
|
||||
<div class="container">
|
||||
<div class="test html">
|
||||
a<img src="img.png" align="bottom">a
|
||||
</div>
|
||||
<div class="test css">
|
||||
a<img src="img.png" style="vertical-align:baseline;">a
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h3>middle</h3>
|
||||
<div class="container">
|
||||
<div class="test html">
|
||||
a<img src="img.png" align="middle">a
|
||||
</div>
|
||||
<div class="test css">
|
||||
a<img src="img.png" style="vertical-align:middle;">a
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h3>top</h3>
|
||||
<div class="container">
|
||||
<div class="test html">
|
||||
a<img src="img.png" align="top">a
|
||||
</div>
|
||||
<div class="test css">
|
||||
a<img src="img.png" style="vertical-align:top;">a
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
<!-- ################################################################## -->
|
||||
|
||||
<div class="segment">
|
||||
|
||||
<h2>hr.align</h2>
|
||||
|
||||
<h3>left</h3>
|
||||
<div class="container">
|
||||
<div class="test html">
|
||||
<hr align="left" />
|
||||
</div>
|
||||
<div class="test css">
|
||||
<hr style="margin-right:auto; margin-left:0; text-align:left;" />
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h3>center</h3>
|
||||
<div class="container">
|
||||
<div class="test html">
|
||||
<hr align="center" />
|
||||
</div>
|
||||
<div class="test css">
|
||||
<hr style="margin-right:auto; margin-left:auto; text-align:center;" />
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h3>right</h3>
|
||||
<div class="container">
|
||||
<div class="test html">
|
||||
<hr align="right" />
|
||||
</div>
|
||||
<div class="test css">
|
||||
<hr style="margin-right:0; margin-left:auto; text-align:right;" />
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
BIN
docs/specimens/img.png
Normal file
BIN
docs/specimens/img.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 2.1 KiB |
129
docs/specimens/jochem-blok-word.html
Normal file
129
docs/specimens/jochem-blok-word.html
Normal file
@@ -0,0 +1,129 @@
|
||||
<html xmlns:v="urn:schemas-microsoft-com:vml" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:w="urn:schemas-microsoft-com:office:word" xmlns:m="http://schemas.microsoft.com/office/2004/12/omml" xmlns="http://www.w3.org/TR/REC-html40">
|
||||
|
||||
<head>
|
||||
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=us-ascii">
|
||||
<meta name=Generator content="Microsoft Word 12 (filtered medium)">
|
||||
<!--[if !mso]>
|
||||
<style>
|
||||
v\:* {behavior:url(#default#VML);}
|
||||
o\:* {behavior:url(#default#VML);}
|
||||
w\:* {behavior:url(#default#VML);}
|
||||
..shape {behavior:url(#default#VML);}
|
||||
</style>
|
||||
<![endif]-->
|
||||
<style>
|
||||
<!--
|
||||
/* Font Definitions */
|
||||
@font-face
|
||||
{font-family:"Cambria Math";
|
||||
panose-1:2 4 5 3 5 4 6 3 2 4;}
|
||||
@font-face
|
||||
{font-family:Calibri;
|
||||
panose-1:2 15 5 2 2 2 4 3 2 4;}
|
||||
@font-face
|
||||
{font-family:Tahoma;
|
||||
panose-1:2 11 6 4 3 5 4 4 2 4;}
|
||||
@font-face
|
||||
{font-family:Verdana;
|
||||
panose-1:2 11 6 4 3 5 4 4 2 4;}
|
||||
/* Style Definitions */
|
||||
p.MsoNormal, li.MsoNormal, div.MsoNormal
|
||||
{margin:0cm;
|
||||
margin-bottom:.0001pt;
|
||||
font-size:10.0pt;
|
||||
font-family:"Verdana","sans-serif";}
|
||||
a:link, span.MsoHyperlink
|
||||
{mso-style-priority:99;
|
||||
color:blue;
|
||||
text-decoration:underline;}
|
||||
a:visited, span.MsoHyperlinkFollowed
|
||||
{mso-style-priority:99;
|
||||
color:purple;
|
||||
text-decoration:underline;}
|
||||
p.MsoAcetate, li.MsoAcetate, div.MsoAcetate
|
||||
{mso-style-priority:99;
|
||||
mso-style-link:"Balloon Text Char";
|
||||
margin:0cm;
|
||||
margin-bottom:.0001pt;
|
||||
font-size:8.0pt;
|
||||
font-family:"Tahoma","sans-serif";}
|
||||
span.EmailStyle17
|
||||
{mso-style-type:personal-compose;
|
||||
font-family:"Verdana","sans-serif";
|
||||
color:windowtext;}
|
||||
span.BalloonTextChar
|
||||
{mso-style-name:"Balloon Text Char";
|
||||
mso-style-priority:99;
|
||||
mso-style-link:"Balloon Text";
|
||||
font-family:"Tahoma","sans-serif";}
|
||||
..MsoChpDefault
|
||||
{mso-style-type:export-only;}
|
||||
@page Section1
|
||||
{size:612.0pt 792.0pt;
|
||||
margin:70.85pt 70.85pt 70.85pt 70.85pt;}
|
||||
div.Section1
|
||||
{page:Section1;}
|
||||
-->
|
||||
</style>
|
||||
<!--[if gte mso 9]><xml>
|
||||
<o:shapedefaults v:ext="edit" spidmax="2050" />
|
||||
</xml><![endif]--><!--[if gte mso 9]><xml>
|
||||
<o:shapelayout v:ext="edit">
|
||||
<o:idmap v:ext="edit" data="1" />
|
||||
</o:shapelayout></xml><![endif]-->
|
||||
</head>
|
||||
|
||||
<body lang=NL link=blue vlink=purple>
|
||||
|
||||
<div class=Section1>
|
||||
|
||||
<p class=MsoNormal><img width=1277 height=994 id="Picture_x0020_1"
|
||||
src="cid:image001.png@01C8CBDF.5D1BAEE0"><o:p></o:p></p>
|
||||
|
||||
<p class=MsoNormal><o:p> </o:p></p>
|
||||
|
||||
<p class=MsoNormal><b>Name<o:p></o:p></b></p>
|
||||
|
||||
<p class=MsoNormal>E-mail : <a href="mailto:mail@example.com"><span
|
||||
style='color:windowtext'>mail@example.com</span></a><o:p></o:p></p>
|
||||
|
||||
<p class=MsoNormal><o:p> </o:p></p>
|
||||
|
||||
<p class=MsoNormal><b>Company<o:p></o:p></b></p>
|
||||
|
||||
<p class=MsoNormal>Address 1<o:p></o:p></p>
|
||||
|
||||
<p class=MsoNormal>Address 2<o:p></o:p></p>
|
||||
|
||||
<p class=MsoNormal><o:p> </o:p></p>
|
||||
|
||||
<p class=MsoNormal>Telefoon : +xx xx xxx xxx xx <span style='color:black'><o:p></o:p></span></p>
|
||||
|
||||
<p class=MsoNormal><span lang=EN-US style='color:black'>Fax : +xx xx xxx xx xx<o:p></o:p></span></p>
|
||||
|
||||
<p class=MsoNormal><span lang=EN-US style='color:black'>Internet : </span><span
|
||||
style='color:black'><a href="http://www.example.com/"><span lang=EN-US
|
||||
style='color:black'>http://www.example.com</span></a></span><span
|
||||
lang=EN-US style='color:black'><o:p></o:p></span></p>
|
||||
|
||||
<p class=MsoNormal><span lang=EN-US style='color:black'>Kamer van koophandel
|
||||
xxxxxxxxx<o:p></o:p></span></p>
|
||||
|
||||
<p class=MsoNormal><span lang=EN-US style='color:black'><o:p> </o:p></span></p>
|
||||
|
||||
<p class=MsoNormal><span lang=EN-US style='font-size:7.5pt;color:black'>Op deze
|
||||
e-mail is een disclaimer van toepassing, ga naar </span><span lang=EN-US
|
||||
style='font-size:7.5pt'><a
|
||||
href="http://www.example.com/disclaimer"><span
|
||||
style='color:black'>www.example.com/disclaimer</span></a><br>
|
||||
<span style='color:black'>A disclaimer is applicable to this email, please
|
||||
refer to </span><a href="http://www.example.com/disclaimer"><span
|
||||
style='color:black'>www.example.com/disclaimer</span></a><o:p></o:p></span></p>
|
||||
|
||||
<p class=MsoNormal><span lang=EN-US><o:p> </o:p></span></p>
|
||||
|
||||
</div>
|
||||
|
||||
</body>
|
||||
|
||||
</html>
|
74
docs/specimens/windows-live-mail-desktop-beta.html
Normal file
74
docs/specimens/windows-live-mail-desktop-beta.html
Normal file
@@ -0,0 +1,74 @@
|
||||
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
|
||||
<HTML ChildAreas="4" xmlns:canvas><HEAD>
|
||||
<META http-equiv=Content-Type content=text/html;charset=windows-1257>
|
||||
<STYLE></STYLE>
|
||||
|
||||
<META content="MSHTML 6.00.6000.16414" name=GENERATOR></HEAD>
|
||||
<BODY id=MailContainerBody
|
||||
style="PADDING-RIGHT: 10px; PADDING-LEFT: 10px; FONT-SIZE: 10pt; COLOR: #000000; PADDING-TOP: 15px; FONT-FAMILY: Arial"
|
||||
bgColor=#ff6600 leftMargin=0 background="" topMargin=0
|
||||
name="Compose message area" acc_role="text" CanvasTabStop="false">
|
||||
<DIV
|
||||
style="BORDER-TOP: #dddddd 1px solid; FONT-SIZE: 10pt; WIDTH: 100%; MARGIN-RIGHT: 10px; PADDING-TOP: 5px; BORDER-BOTTOM: #dddddd 1px solid; FONT-FAMILY: Verdana; HEIGHT: 25px; BACKGROUND-COLOR: #ffffff"><NOBR><SPAN
|
||||
title="View a slideshow of the pictures in this e-mail message."
|
||||
style="PADDING-RIGHT: 20px"><A style="COLOR: #0088e4"
|
||||
href="http://g.msn.com/5meen_us/171?path=/photomail/{6fc0065f-ffdd-4ca6-9a4c-cc5a93dc122f}&image=47D7B182CFEFB10!127&imagehi=47D7B182CFEFB10!125&CID=323550092004883216">Play
|
||||
slideshow </A></SPAN><SPAN style="COLOR: #909090"><SPAN>|</SPAN><SPAN
|
||||
style="PADDING-LEFT: 20px"> Download the highest quality version of a picture by
|
||||
clicking the + above it </SPAN></SPAN></NOBR></DIV>
|
||||
<DIV
|
||||
style="PADDING-RIGHT: 5px; PADDING-LEFT: 7px; PADDING-BOTTOM: 2px; WIDTH: 100%; PADDING-TOP: 2px">
|
||||
<OL>
|
||||
<LI><IMG title="Angry smile emoticon"
|
||||
style="FLOAT: none; MARGIN: 0px; POSITION: static" tabIndex=-1
|
||||
alt="Angry smile emoticon" src="cid:49F0C856199E4D688D2D740680733D74@wc"
|
||||
MSNNonUserImageOrEmoticon="true">Un ka <FONT style="BACKGROUND-COLOR: #800000"
|
||||
color=#cc99ff><STRONG>Tev</STRONG></FONT> iet, un ko tu dari?
|
||||
<LI>Aha!</LI></OL>
|
||||
|
||||
<UL>
|
||||
<LI>Buletets
|
||||
<LI>
|
||||
<DIV align=justify><A title=http://laacz.lv/blog/
|
||||
href="http://laacz.lv/blog/">http://laacz.lv/blog/</A> un <A
|
||||
title=http://google.com/ href="http://google.com/">gugle</A></DIV>
|
||||
<LI>Sarakstucitis</LI></UL></DIV><SPAN><SPAN xmlns:canvas="canvas-namespace-id"
|
||||
layoutEmptyTextWellFont="Tahoma"><SPAN
|
||||
style="MARGIN-BOTTOM: 15px; OVERFLOW: visible; HEIGHT: 16px"></SPAN><SPAN
|
||||
style="MARGIN-BOTTOM: 25px; VERTICAL-ALIGN: top; OVERFLOW: visible; MARGIN-RIGHT: 25px; HEIGHT: 234px">
|
||||
<TABLE style="DISPLAY: inline">
|
||||
<TBODY>
|
||||
<TR>
|
||||
|
||||
<TD>
|
||||
<DIV
|
||||
style="FONT-WEIGHT: bold; FONT-SIZE: 12pt; FONT-FAMILY: arial; TEXT-ALIGN: center"><A
|
||||
id=HiresARef
|
||||
title="Click here to view or download a high resolution version of this picture"
|
||||
style="COLOR: #0088e4; TEXT-DECORATION: none"
|
||||
href="http://byfiles.storage.msn.com/x1pMvt0I80jTgT6DuaCpEMbprX3nk3jNv_vjigxV_EYVSMyM_PKgEvDEUtuNhQC-F-23mTTcKyqx6eGaeK2e_wMJ0ikwpDdFntk4SY7pfJUv2g2Ck6R2S2vAA?download">+</A></DIV>
|
||||
<DIV
|
||||
title="Click here to view the full image using the online photo viewer."
|
||||
style="DISPLAY: inline; OVERFLOW: hidden; WIDTH: 140px; HEIGHT: 140px"><A
|
||||
href="http://g.msn.com/5meen_us/171?path=/photomail/{6fc0065f-ffdd-4ca6-9a4c-cc5a93dc122f}&image=47D7B182CFEFB10!127&imagehi=47D7B182CFEFB10!125&CID=323550092004883216"
|
||||
border="0"><IMG
|
||||
style="MARGIN-TOP: 15px; DISPLAY: inline-block; MARGIN-LEFT: 0px"
|
||||
height=109 src="cid:006A71303B80404E9FB6184E55D6A446@wc" width=140
|
||||
border=0></A></DIV></TD></TR>
|
||||
<TR>
|
||||
<TD>
|
||||
<DIV
|
||||
style="FONT-SIZE: 10pt; WIDTH: 140px; FONT-FAMILY: verdana; TEXT-ALIGN: center"><EM><STRONG>This
|
||||
<U>is </U></STRONG><U>tit</U>le</EM> fo<STRONG>r <FONT
|
||||
face="Arial Black">t<FONT color=#800000 size=7>h<U>i</U></FONT>s
|
||||
</FONT>picture</STRONG></DIV></TD></TR></TBODY></TABLE></SPAN></SPAN></SPAN>
|
||||
|
||||
<DIV
|
||||
style="PADDING-RIGHT: 5px; PADDING-LEFT: 7px; PADDING-BOTTOM: 2px; WIDTH: 100%; PADDING-TOP: 2px; HEIGHT: 50px">
|
||||
<DIV> </DIV></DIV>
|
||||
<DIV
|
||||
style="BORDER-TOP: #dddddd 1px solid; FONT-SIZE: 10pt; MARGIN-BOTTOM: 10px; WIDTH: 100%; COLOR: #909090; MARGIN-RIGHT: 10px; PADDING-TOP: 9px; FONT-FAMILY: Verdana; HEIGHT: 42px; BACKGROUND-COLOR: #ffffff"><NOBR><SPAN
|
||||
title="Join Windows Live to share photos using Windows Live Photo E-mail.">Online
|
||||
pictures are available for 30 days. <A style="COLOR: #0088e4"
|
||||
href="http://g.msn.com/5meen_us/175">Get Windows Live Mail desktop to create
|
||||
your own photo e-mails. </A></SPAN></NOBR></DIV></BODY></HTML>
|
@@ -25,6 +25,7 @@ h4 {font-family:sans-serif; font-size:0.9em; font-weight:bold; }
|
||||
.aside {margin-left:2em; font-family:sans-serif; font-size:0.9em; }
|
||||
blockquote .label {font-weight:bold; font-size:1em; margin:0 0 .1em;
|
||||
border-bottom:1px solid #CCC;}
|
||||
.emphasis {font-weight:bold; text-align:center; font-size:1.3em;}
|
||||
|
||||
/* A regular table */
|
||||
.table {border-collapse:collapse; border-bottom:2px solid #888; margin-left:2em; }
|
||||
@@ -32,6 +33,9 @@ blockquote .label {font-weight:bold; font-size:1em; margin:0 0 .1em;
|
||||
.table thead th:first-child {-moz-border-radius-topleft:1em;}
|
||||
.table tbody td {border-bottom:1px solid #CCC; padding-right:0.6em;padding-left:0.6em;}
|
||||
|
||||
/* A quick table*/
|
||||
table.quick tbody th {text-align:right; padding-right:1em;}
|
||||
|
||||
/* Category of the file */
|
||||
#filing {font-weight:bold; font-size:smaller; }
|
||||
|
||||
@@ -42,3 +46,29 @@ blockquote .label {font-weight:bold; font-size:1em; margin:0 0 .1em;
|
||||
|
||||
/* Contains, without exception, $Id$, for SVN version info. */
|
||||
#version {text-align:right; font-style:italic; margin:2em 0;}
|
||||
|
||||
#toc ol ol {list-style-type:lower-roman;}
|
||||
#toc ol {list-style-type:decimal;}
|
||||
#toc {list-style-type:upper-alpha;}
|
||||
|
||||
q {
|
||||
behavior: url(fixquotes.htc); /* IE fix */
|
||||
quotes: '\201C' '\201D' '\2018' '\2019';
|
||||
}
|
||||
q:before {
|
||||
content: open-quote;
|
||||
}
|
||||
q:after {
|
||||
content: close-quote;
|
||||
}
|
||||
|
||||
/* Marks off implementation details interesting only to the person writing
|
||||
the class described in the spec. */
|
||||
.technical {margin-left:2em; }
|
||||
.technical:before {content:"Technical note: "; font-weight:bold; color:#061; }
|
||||
|
||||
/* Marks off sections that are lacking. */
|
||||
.fixme {margin-left:2em; }
|
||||
.fixme:before {content:"Fix me: "; font-weight:bold; color:#C00; }
|
||||
|
||||
#applicability {margin: 1em 5%; font-style:italic;}
|
||||
|
85
extras/ConfigDoc/HTMLXSLTProcessor.php
Normal file
85
extras/ConfigDoc/HTMLXSLTProcessor.php
Normal file
@@ -0,0 +1,85 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Decorator/extender XSLT processor specifically for HTML documents.
|
||||
*/
|
||||
class ConfigDoc_HTMLXSLTProcessor
|
||||
{
|
||||
|
||||
/**
|
||||
* Instance of XSLTProcessor
|
||||
*/
|
||||
protected $xsltProcessor;
|
||||
|
||||
public function __construct($proc = false) {
|
||||
if ($proc === false) $proc = new XSLTProcessor();
|
||||
$this->xsltProcessor = $proc;
|
||||
}
|
||||
|
||||
/**
|
||||
* @note Allows a string $xsl filename to be passed
|
||||
*/
|
||||
public function importStylesheet($xsl) {
|
||||
if (is_string($xsl)) {
|
||||
$xsl_file = $xsl;
|
||||
$xsl = new DOMDocument();
|
||||
$xsl->load($xsl_file);
|
||||
}
|
||||
return $this->xsltProcessor->importStylesheet($xsl);
|
||||
}
|
||||
|
||||
/**
|
||||
* Transforms an XML file into compatible XHTML based on the stylesheet
|
||||
* @param $xml XML DOM tree, or string filename
|
||||
* @return string HTML output
|
||||
* @todo Rename to transformToXHTML, as transformToHTML is misleading
|
||||
*/
|
||||
public function transformToHTML($xml) {
|
||||
if (is_string($xml)) {
|
||||
$dom = new DOMDocument();
|
||||
$dom->load($xml);
|
||||
} else {
|
||||
$dom = $xml;
|
||||
}
|
||||
$out = $this->xsltProcessor->transformToXML($dom);
|
||||
|
||||
// fudges for HTML backwards compatibility
|
||||
// assumes that document is XHTML
|
||||
$out = str_replace('/>', ' />', $out); // <br /> not <br/>
|
||||
$out = str_replace(' xmlns=""', '', $out); // rm unnecessary xmlns
|
||||
|
||||
if (class_exists('Tidy')) {
|
||||
// cleanup output
|
||||
$config = array(
|
||||
'indent' => true,
|
||||
'output-xhtml' => true,
|
||||
'wrap' => 80
|
||||
);
|
||||
$tidy = new Tidy;
|
||||
$tidy->parseString($out, $config, 'utf8');
|
||||
$tidy->cleanRepair();
|
||||
$out = (string) $tidy;
|
||||
}
|
||||
|
||||
return $out;
|
||||
}
|
||||
|
||||
/**
|
||||
* Bulk sets parameters for the XSL stylesheet
|
||||
* @param array $options Associative array of options to set
|
||||
*/
|
||||
public function setParameters($options) {
|
||||
foreach ($options as $name => $value) {
|
||||
$this->xsltProcessor->setParameter('', $name, $value);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Forward any other calls to the XSLT processor
|
||||
*/
|
||||
public function __call($name, $arguments) {
|
||||
call_user_func_array(array($this->xsltProcessor, $name), $arguments);
|
||||
}
|
||||
|
||||
}
|
||||
|
155
extras/FSTools.php
Normal file
155
extras/FSTools.php
Normal file
@@ -0,0 +1,155 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Filesystem tools not provided by default; can recursively create, copy
|
||||
* and delete folders. Some template methods are provided for extensibility.
|
||||
*
|
||||
* @note This class must be instantiated to be used, although it does
|
||||
* not maintain state.
|
||||
*/
|
||||
class FSTools
|
||||
{
|
||||
|
||||
private static $singleton;
|
||||
|
||||
/**
|
||||
* Returns a global instance of FSTools
|
||||
*/
|
||||
static public function singleton() {
|
||||
if (empty(FSTools::$singleton)) FSTools::$singleton = new FSTools();
|
||||
return FSTools::$singleton;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets our global singleton to something else; useful for overloading
|
||||
* functions.
|
||||
*/
|
||||
static public function setSingleton($singleton) {
|
||||
FSTools::$singleton = $singleton;
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursively creates a directory
|
||||
* @param string $folder Name of folder to create
|
||||
* @note Adapted from the PHP manual comment 76612
|
||||
*/
|
||||
public function mkdirr($folder) {
|
||||
$folders = preg_split("#[\\\\/]#", $folder);
|
||||
$base = '';
|
||||
for($i = 0, $c = count($folders); $i < $c; $i++) {
|
||||
if(empty($folders[$i])) {
|
||||
if (!$i) {
|
||||
// special case for root level
|
||||
$base .= DIRECTORY_SEPARATOR;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
$base .= $folders[$i];
|
||||
if(!is_dir($base)){
|
||||
$this->mkdir($base);
|
||||
}
|
||||
$base .= DIRECTORY_SEPARATOR;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy a file, or recursively copy a folder and its contents; modified
|
||||
* so that copied files, if PHP, have includes removed
|
||||
* @note Adapted from http://aidanlister.com/repos/v/function.copyr.php
|
||||
*/
|
||||
public function copyr($source, $dest) {
|
||||
// Simple copy for a file
|
||||
if (is_file($source)) {
|
||||
return $this->copy($source, $dest);
|
||||
}
|
||||
// Make destination directory
|
||||
if (!is_dir($dest)) {
|
||||
$this->mkdir($dest);
|
||||
}
|
||||
// Loop through the folder
|
||||
$dir = $this->dir($source);
|
||||
while ( false !== ($entry = $dir->read()) ) {
|
||||
// Skip pointers
|
||||
if ($entry == '.' || $entry == '..') {
|
||||
continue;
|
||||
}
|
||||
if (!$this->copyable($entry)) {
|
||||
continue;
|
||||
}
|
||||
// Deep copy directories
|
||||
if ($dest !== "$source/$entry") {
|
||||
$this->copyr("$source/$entry", "$dest/$entry");
|
||||
}
|
||||
}
|
||||
// Clean up
|
||||
$dir->close();
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Overloadable function that tests a filename for copyability. By
|
||||
* default, everything should be copied; you can restrict things to
|
||||
* ignore hidden files, unreadable files, etc. This function
|
||||
* applies to copyr().
|
||||
*/
|
||||
public function copyable($file) {
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete a file, or a folder and its contents
|
||||
* @note Adapted from http://aidanlister.com/repos/v/function.rmdirr.php
|
||||
*/
|
||||
public function rmdirr($dirname)
|
||||
{
|
||||
// Sanity check
|
||||
if (!$this->file_exists($dirname)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Simple delete for a file
|
||||
if ($this->is_file($dirname) || $this->is_link($dirname)) {
|
||||
return $this->unlink($dirname);
|
||||
}
|
||||
|
||||
// Loop through the folder
|
||||
$dir = $this->dir($dirname);
|
||||
while (false !== $entry = $dir->read()) {
|
||||
// Skip pointers
|
||||
if ($entry == '.' || $entry == '..') {
|
||||
continue;
|
||||
}
|
||||
// Recurse
|
||||
$this->rmdirr($dirname . DIRECTORY_SEPARATOR . $entry);
|
||||
}
|
||||
|
||||
// Clean up
|
||||
$dir->close();
|
||||
return $this->rmdir($dirname);
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursively globs a directory.
|
||||
*/
|
||||
public function globr($dir, $pattern, $flags = NULL) {
|
||||
$files = $this->glob("$dir/$pattern", $flags);
|
||||
if ($files === false) $files = array();
|
||||
$sub_dirs = $this->glob("$dir/*", GLOB_ONLYDIR);
|
||||
if ($sub_dirs === false) $sub_dirs = array();
|
||||
foreach ($sub_dirs as $sub_dir) {
|
||||
$sub_files = $this->globr($sub_dir, $pattern, $flags);
|
||||
$files = array_merge($files, $sub_files);
|
||||
}
|
||||
return $files;
|
||||
}
|
||||
|
||||
/**
|
||||
* Allows for PHP functions to be called and be stubbed.
|
||||
* @warning This function will not work for functions that need
|
||||
* to pass references; manually define a stub function for those.
|
||||
*/
|
||||
public function __call($name, $args) {
|
||||
return call_user_func_array($name, $args);
|
||||
}
|
||||
|
||||
}
|
124
extras/FSTools/File.php
Normal file
124
extras/FSTools/File.php
Normal file
@@ -0,0 +1,124 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Represents a file in the filesystem
|
||||
*
|
||||
* @warning Be sure to distinguish between get() and write() versus
|
||||
* read() and put(), the former operates on the entire file, while
|
||||
* the latter operates on a handle.
|
||||
*/
|
||||
class FSTools_File
|
||||
{
|
||||
|
||||
/** Filename of file this object represents */
|
||||
protected $name;
|
||||
|
||||
/** Handle for the file */
|
||||
protected $handle = false;
|
||||
|
||||
/** Instance of FSTools for interfacing with filesystem */
|
||||
protected $fs;
|
||||
|
||||
/**
|
||||
* Filename of file you wish to instantiate.
|
||||
* @note This file need not exist
|
||||
*/
|
||||
public function __construct($name, $fs = false) {
|
||||
$this->name = $name;
|
||||
$this->fs = $fs ? $fs : FSTools::singleton();
|
||||
}
|
||||
|
||||
/** Returns the filename of the file. */
|
||||
public function getName() {return $this->name;}
|
||||
|
||||
/** Returns directory of the file without trailing slash */
|
||||
public function getDirectory() {return $this->fs->dirname($this->name);}
|
||||
|
||||
/**
|
||||
* Retrieves the contents of a file
|
||||
* @todo Throw an exception if file doesn't exist
|
||||
*/
|
||||
public function get() {
|
||||
return $this->fs->file_get_contents($this->name);
|
||||
}
|
||||
|
||||
/** Writes contents to a file, creates new file if necessary */
|
||||
public function write($contents) {
|
||||
return $this->fs->file_put_contents($this->name, $contents);
|
||||
}
|
||||
|
||||
/** Deletes the file */
|
||||
public function delete() {
|
||||
return $this->fs->unlink($this->name);
|
||||
}
|
||||
|
||||
/** Returns true if file exists and is a file. */
|
||||
public function exists() {
|
||||
return $this->fs->is_file($this->name);
|
||||
}
|
||||
|
||||
/** Returns last file modification time */
|
||||
public function getMTime() {
|
||||
return $this->fs->filemtime($this->name);
|
||||
}
|
||||
|
||||
/**
|
||||
* Chmod a file
|
||||
* @note We ignore errors because of some weird owner trickery due
|
||||
* to SVN duality
|
||||
*/
|
||||
public function chmod($octal_code) {
|
||||
return @$this->fs->chmod($this->name, $octal_code);
|
||||
}
|
||||
|
||||
/** Opens file's handle */
|
||||
public function open($mode) {
|
||||
if ($this->handle) $this->close();
|
||||
$this->handle = $this->fs->fopen($this->name, $mode);
|
||||
return true;
|
||||
}
|
||||
|
||||
/** Closes file's handle */
|
||||
public function close() {
|
||||
if (!$this->handle) return false;
|
||||
$status = $this->fs->fclose($this->handle);
|
||||
$this->handle = false;
|
||||
return $status;
|
||||
}
|
||||
|
||||
/** Retrieves a line from an open file, with optional max length $length */
|
||||
public function getLine($length = null) {
|
||||
if (!$this->handle) $this->open('r');
|
||||
if ($length === null) return $this->fs->fgets($this->handle);
|
||||
else return $this->fs->fgets($this->handle, $length);
|
||||
}
|
||||
|
||||
/** Retrieves a character from an open file */
|
||||
public function getChar() {
|
||||
if (!$this->handle) $this->open('r');
|
||||
return $this->fs->fgetc($this->handle);
|
||||
}
|
||||
|
||||
/** Retrieves an $length bytes of data from an open data */
|
||||
public function read($length) {
|
||||
if (!$this->handle) $this->open('r');
|
||||
return $this->fs->fread($this->handle, $length);
|
||||
}
|
||||
|
||||
/** Writes to an open file */
|
||||
public function put($string) {
|
||||
if (!$this->handle) $this->open('a');
|
||||
return $this->fs->fwrite($this->handle, $string);
|
||||
}
|
||||
|
||||
/** Returns TRUE if the end of the file has been reached */
|
||||
public function eof() {
|
||||
if (!$this->handle) return true;
|
||||
return $this->fs->feof($this->handle);
|
||||
}
|
||||
|
||||
public function __destruct() {
|
||||
if ($this->handle) $this->close();
|
||||
}
|
||||
|
||||
}
|
9
extras/HTMLPurifierExtras.auto.php
Normal file
9
extras/HTMLPurifierExtras.auto.php
Normal file
@@ -0,0 +1,9 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* This is a stub include that automatically configures the include path.
|
||||
*/
|
||||
|
||||
set_include_path(dirname(__FILE__) . PATH_SEPARATOR . get_include_path() );
|
||||
require_once 'HTMLPurifierExtras.php';
|
||||
require_once 'HTMLPurifierExtras.autoload.php';
|
23
extras/HTMLPurifierExtras.autoload.php
Normal file
23
extras/HTMLPurifierExtras.autoload.php
Normal file
@@ -0,0 +1,23 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* @file
|
||||
* Convenience file that registers autoload handler for HTML Purifier.
|
||||
*
|
||||
* @warning
|
||||
* This autoloader does not contain the compatibility code seen in
|
||||
* HTMLPurifier_Bootstrap; the user is expected to make any necessary
|
||||
* changes to use this library.
|
||||
*/
|
||||
|
||||
if (function_exists('spl_autoload_register')) {
|
||||
spl_autoload_register(array('HTMLPurifierExtras', 'autoload'));
|
||||
if (function_exists('__autoload')) {
|
||||
// Be polite and ensure that userland autoload gets retained
|
||||
spl_autoload_register('__autoload');
|
||||
}
|
||||
} elseif (!function_exists('__autoload')) {
|
||||
function __autoload($class) {
|
||||
return HTMLPurifierExtras::autoload($class);
|
||||
}
|
||||
}
|
27
extras/HTMLPurifierExtras.php
Normal file
27
extras/HTMLPurifierExtras.php
Normal file
@@ -0,0 +1,27 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Meta-class for HTML Purifier's extra class hierarchies, similar to
|
||||
* HTMLPurifier_Bootstrap.
|
||||
*/
|
||||
class HTMLPurifierExtras
|
||||
{
|
||||
|
||||
public static function autoload($class) {
|
||||
$path = HTMLPurifierExtras::getPath($class);
|
||||
if (!$path) return false;
|
||||
require $path;
|
||||
return true;
|
||||
}
|
||||
|
||||
public static function getPath($class) {
|
||||
if (
|
||||
strncmp('FSTools', $class, 7) !== 0 &&
|
||||
strncmp('ConfigDoc', $class, 9) !== 0
|
||||
) return false;
|
||||
// Custom implementations can go here
|
||||
// Standard implementation:
|
||||
return str_replace('_', '/', $class) . '.php';
|
||||
}
|
||||
|
||||
}
|
30
extras/README
Normal file
30
extras/README
Normal file
@@ -0,0 +1,30 @@
|
||||
|
||||
HTML Purifier Extras
|
||||
The Method Behind The Madness!
|
||||
|
||||
The extras/ folder in HTML Purifier contains--you guessed it--extra things
|
||||
for HTML Purifier. Specifically, these are two extra libraries called
|
||||
FSTools and ConfigSchema. They're extra for a reason: you don't need them
|
||||
if you're using HTML Purifier for normal usage: filtering HTML. However,
|
||||
if you're a developer, and would like to test HTML Purifier, or need to
|
||||
use one of HTML Purifier's maintenance scripts, chances are they'll need
|
||||
these libraries. Who knows: maybe you'll find them useful too!
|
||||
|
||||
Here are the libraries:
|
||||
|
||||
|
||||
FSTools
|
||||
-------
|
||||
|
||||
Short for File System Tools, this is a poor-man's object-oriented wrapper for
|
||||
the filesystem. It currently consists of two classes:
|
||||
|
||||
- FSTools: This is a singleton that contains a manner of useful functions
|
||||
such as recursive glob, directory removal, etc, as well as the ability
|
||||
to call arbitrary native PHP functions through it like $FS->fopen(...).
|
||||
This makes it a lot simpler to mock these filesystem calls for unit testing.
|
||||
|
||||
- FSTools_File: This object represents a single file, and has almost any
|
||||
method imaginable one would need.
|
||||
|
||||
Check the files themselves for more information.
|
@@ -5,6 +5,5 @@
|
||||
*/
|
||||
|
||||
set_include_path(dirname(__FILE__) . PATH_SEPARATOR . get_include_path() );
|
||||
require_once 'HTMLPurifier.php';
|
||||
|
||||
?>
|
||||
require_once 'HTMLPurifier/Bootstrap.php';
|
||||
require_once 'HTMLPurifier.autoload.php';
|
||||
|
19
library/HTMLPurifier.autoload.php
Normal file
19
library/HTMLPurifier.autoload.php
Normal file
@@ -0,0 +1,19 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* @file
|
||||
* Convenience file that registers autoload handler for HTML Purifier.
|
||||
*/
|
||||
|
||||
if (function_exists('spl_autoload_register') && function_exists('spl_autoload_unregister')) {
|
||||
// We need unregister for our pre-registering functionality
|
||||
HTMLPurifier_Bootstrap::registerAutoload();
|
||||
if (function_exists('__autoload')) {
|
||||
// Be polite and ensure that userland autoload gets retained
|
||||
spl_autoload_register('__autoload');
|
||||
}
|
||||
} elseif (!function_exists('__autoload')) {
|
||||
function __autoload($class) {
|
||||
return HTMLPurifier_Bootstrap::autoload($class);
|
||||
}
|
||||
}
|
@@ -1,21 +1,22 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Function wrapper for HTML Purifier for quick use.
|
||||
* @note This function only includes the library when it is called. While
|
||||
* this is efficient for instances when you only use HTML Purifier
|
||||
* on a few of your pages, it murders bytecode caching. You still
|
||||
* need to add HTML Purifier to your path.
|
||||
* @file
|
||||
* Defines a function wrapper for HTML Purifier for quick use.
|
||||
* @note ''HTMLPurifier()'' is NOT the same as ''new HTMLPurifier()''
|
||||
*/
|
||||
|
||||
/**
|
||||
* Purify HTML.
|
||||
* @param $html String HTML to purify
|
||||
* @param $config Configuration to use, can be any value accepted by
|
||||
* HTMLPurifier_Config::create()
|
||||
*/
|
||||
function HTMLPurifier($html, $config = null) {
|
||||
static $purifier = false;
|
||||
if (!$purifier) {
|
||||
require_once 'HTMLPurifier.php';
|
||||
$purifier = new HTMLPurifier();
|
||||
}
|
||||
return $purifier->purify($html, $config);
|
||||
}
|
||||
|
||||
?>
|
195
library/HTMLPurifier.includes.php
Normal file
195
library/HTMLPurifier.includes.php
Normal file
@@ -0,0 +1,195 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* @file
|
||||
* This file was auto-generated by generate-includes.php and includes all of
|
||||
* the core files required by HTML Purifier. Use this if performance is a
|
||||
* primary concern and you are using an opcode cache. PLEASE DO NOT EDIT THIS
|
||||
* FILE, changes will be overwritten the next time the script is run.
|
||||
*
|
||||
* @version 3.1.1
|
||||
*
|
||||
* @warning
|
||||
* You must *not* include any other HTML Purifier files before this file,
|
||||
* because 'require' not 'require_once' is used.
|
||||
*
|
||||
* @warning
|
||||
* This file requires that the include path contains the HTML Purifier
|
||||
* library directory; this is not auto-set.
|
||||
*/
|
||||
|
||||
require 'HTMLPurifier.php';
|
||||
require 'HTMLPurifier/AttrCollections.php';
|
||||
require 'HTMLPurifier/AttrDef.php';
|
||||
require 'HTMLPurifier/AttrTransform.php';
|
||||
require 'HTMLPurifier/AttrTypes.php';
|
||||
require 'HTMLPurifier/AttrValidator.php';
|
||||
require 'HTMLPurifier/Bootstrap.php';
|
||||
require 'HTMLPurifier/Definition.php';
|
||||
require 'HTMLPurifier/CSSDefinition.php';
|
||||
require 'HTMLPurifier/ChildDef.php';
|
||||
require 'HTMLPurifier/Config.php';
|
||||
require 'HTMLPurifier/ConfigSchema.php';
|
||||
require 'HTMLPurifier/ContentSets.php';
|
||||
require 'HTMLPurifier/Context.php';
|
||||
require 'HTMLPurifier/DefinitionCache.php';
|
||||
require 'HTMLPurifier/DefinitionCacheFactory.php';
|
||||
require 'HTMLPurifier/Doctype.php';
|
||||
require 'HTMLPurifier/DoctypeRegistry.php';
|
||||
require 'HTMLPurifier/ElementDef.php';
|
||||
require 'HTMLPurifier/Encoder.php';
|
||||
require 'HTMLPurifier/EntityLookup.php';
|
||||
require 'HTMLPurifier/EntityParser.php';
|
||||
require 'HTMLPurifier/ErrorCollector.php';
|
||||
require 'HTMLPurifier/Exception.php';
|
||||
require 'HTMLPurifier/Filter.php';
|
||||
require 'HTMLPurifier/Generator.php';
|
||||
require 'HTMLPurifier/HTMLDefinition.php';
|
||||
require 'HTMLPurifier/HTMLModule.php';
|
||||
require 'HTMLPurifier/HTMLModuleManager.php';
|
||||
require 'HTMLPurifier/IDAccumulator.php';
|
||||
require 'HTMLPurifier/Injector.php';
|
||||
require 'HTMLPurifier/Language.php';
|
||||
require 'HTMLPurifier/LanguageFactory.php';
|
||||
require 'HTMLPurifier/Length.php';
|
||||
require 'HTMLPurifier/Lexer.php';
|
||||
require 'HTMLPurifier/PercentEncoder.php';
|
||||
require 'HTMLPurifier/Strategy.php';
|
||||
require 'HTMLPurifier/StringHash.php';
|
||||
require 'HTMLPurifier/StringHashParser.php';
|
||||
require 'HTMLPurifier/TagTransform.php';
|
||||
require 'HTMLPurifier/Token.php';
|
||||
require 'HTMLPurifier/TokenFactory.php';
|
||||
require 'HTMLPurifier/URI.php';
|
||||
require 'HTMLPurifier/URIDefinition.php';
|
||||
require 'HTMLPurifier/URIFilter.php';
|
||||
require 'HTMLPurifier/URIParser.php';
|
||||
require 'HTMLPurifier/URIScheme.php';
|
||||
require 'HTMLPurifier/URISchemeRegistry.php';
|
||||
require 'HTMLPurifier/UnitConverter.php';
|
||||
require 'HTMLPurifier/VarParser.php';
|
||||
require 'HTMLPurifier/VarParserException.php';
|
||||
require 'HTMLPurifier/AttrDef/CSS.php';
|
||||
require 'HTMLPurifier/AttrDef/Enum.php';
|
||||
require 'HTMLPurifier/AttrDef/Integer.php';
|
||||
require 'HTMLPurifier/AttrDef/Lang.php';
|
||||
require 'HTMLPurifier/AttrDef/Switch.php';
|
||||
require 'HTMLPurifier/AttrDef/Text.php';
|
||||
require 'HTMLPurifier/AttrDef/URI.php';
|
||||
require 'HTMLPurifier/AttrDef/CSS/Number.php';
|
||||
require 'HTMLPurifier/AttrDef/CSS/AlphaValue.php';
|
||||
require 'HTMLPurifier/AttrDef/CSS/Background.php';
|
||||
require 'HTMLPurifier/AttrDef/CSS/BackgroundPosition.php';
|
||||
require 'HTMLPurifier/AttrDef/CSS/Border.php';
|
||||
require 'HTMLPurifier/AttrDef/CSS/Color.php';
|
||||
require 'HTMLPurifier/AttrDef/CSS/Composite.php';
|
||||
require 'HTMLPurifier/AttrDef/CSS/DenyElementDecorator.php';
|
||||
require 'HTMLPurifier/AttrDef/CSS/Filter.php';
|
||||
require 'HTMLPurifier/AttrDef/CSS/Font.php';
|
||||
require 'HTMLPurifier/AttrDef/CSS/FontFamily.php';
|
||||
require 'HTMLPurifier/AttrDef/CSS/ImportantDecorator.php';
|
||||
require 'HTMLPurifier/AttrDef/CSS/Length.php';
|
||||
require 'HTMLPurifier/AttrDef/CSS/ListStyle.php';
|
||||
require 'HTMLPurifier/AttrDef/CSS/Multiple.php';
|
||||
require 'HTMLPurifier/AttrDef/CSS/Percentage.php';
|
||||
require 'HTMLPurifier/AttrDef/CSS/TextDecoration.php';
|
||||
require 'HTMLPurifier/AttrDef/CSS/URI.php';
|
||||
require 'HTMLPurifier/AttrDef/HTML/Bool.php';
|
||||
require 'HTMLPurifier/AttrDef/HTML/Color.php';
|
||||
require 'HTMLPurifier/AttrDef/HTML/FrameTarget.php';
|
||||
require 'HTMLPurifier/AttrDef/HTML/ID.php';
|
||||
require 'HTMLPurifier/AttrDef/HTML/Pixels.php';
|
||||
require 'HTMLPurifier/AttrDef/HTML/Length.php';
|
||||
require 'HTMLPurifier/AttrDef/HTML/LinkTypes.php';
|
||||
require 'HTMLPurifier/AttrDef/HTML/MultiLength.php';
|
||||
require 'HTMLPurifier/AttrDef/HTML/Nmtokens.php';
|
||||
require 'HTMLPurifier/AttrDef/URI/Email.php';
|
||||
require 'HTMLPurifier/AttrDef/URI/Host.php';
|
||||
require 'HTMLPurifier/AttrDef/URI/IPv4.php';
|
||||
require 'HTMLPurifier/AttrDef/URI/IPv6.php';
|
||||
require 'HTMLPurifier/AttrDef/URI/Email/SimpleCheck.php';
|
||||
require 'HTMLPurifier/AttrTransform/BdoDir.php';
|
||||
require 'HTMLPurifier/AttrTransform/BgColor.php';
|
||||
require 'HTMLPurifier/AttrTransform/BoolToCSS.php';
|
||||
require 'HTMLPurifier/AttrTransform/Border.php';
|
||||
require 'HTMLPurifier/AttrTransform/EnumToCSS.php';
|
||||
require 'HTMLPurifier/AttrTransform/ImgRequired.php';
|
||||
require 'HTMLPurifier/AttrTransform/ImgSpace.php';
|
||||
require 'HTMLPurifier/AttrTransform/Lang.php';
|
||||
require 'HTMLPurifier/AttrTransform/Length.php';
|
||||
require 'HTMLPurifier/AttrTransform/Name.php';
|
||||
require 'HTMLPurifier/AttrTransform/SafeEmbed.php';
|
||||
require 'HTMLPurifier/AttrTransform/SafeObject.php';
|
||||
require 'HTMLPurifier/AttrTransform/SafeParam.php';
|
||||
require 'HTMLPurifier/AttrTransform/ScriptRequired.php';
|
||||
require 'HTMLPurifier/ChildDef/Chameleon.php';
|
||||
require 'HTMLPurifier/ChildDef/Custom.php';
|
||||
require 'HTMLPurifier/ChildDef/Empty.php';
|
||||
require 'HTMLPurifier/ChildDef/Required.php';
|
||||
require 'HTMLPurifier/ChildDef/Optional.php';
|
||||
require 'HTMLPurifier/ChildDef/StrictBlockquote.php';
|
||||
require 'HTMLPurifier/ChildDef/Table.php';
|
||||
require 'HTMLPurifier/DefinitionCache/Decorator.php';
|
||||
require 'HTMLPurifier/DefinitionCache/Null.php';
|
||||
require 'HTMLPurifier/DefinitionCache/Serializer.php';
|
||||
require 'HTMLPurifier/DefinitionCache/Decorator/Cleanup.php';
|
||||
require 'HTMLPurifier/DefinitionCache/Decorator/Memory.php';
|
||||
require 'HTMLPurifier/HTMLModule/Bdo.php';
|
||||
require 'HTMLPurifier/HTMLModule/CommonAttributes.php';
|
||||
require 'HTMLPurifier/HTMLModule/Edit.php';
|
||||
require 'HTMLPurifier/HTMLModule/Hypertext.php';
|
||||
require 'HTMLPurifier/HTMLModule/Image.php';
|
||||
require 'HTMLPurifier/HTMLModule/Legacy.php';
|
||||
require 'HTMLPurifier/HTMLModule/List.php';
|
||||
require 'HTMLPurifier/HTMLModule/NonXMLCommonAttributes.php';
|
||||
require 'HTMLPurifier/HTMLModule/Object.php';
|
||||
require 'HTMLPurifier/HTMLModule/Presentation.php';
|
||||
require 'HTMLPurifier/HTMLModule/Proprietary.php';
|
||||
require 'HTMLPurifier/HTMLModule/Ruby.php';
|
||||
require 'HTMLPurifier/HTMLModule/SafeEmbed.php';
|
||||
require 'HTMLPurifier/HTMLModule/SafeObject.php';
|
||||
require 'HTMLPurifier/HTMLModule/Scripting.php';
|
||||
require 'HTMLPurifier/HTMLModule/StyleAttribute.php';
|
||||
require 'HTMLPurifier/HTMLModule/Tables.php';
|
||||
require 'HTMLPurifier/HTMLModule/Target.php';
|
||||
require 'HTMLPurifier/HTMLModule/Text.php';
|
||||
require 'HTMLPurifier/HTMLModule/Tidy.php';
|
||||
require 'HTMLPurifier/HTMLModule/XMLCommonAttributes.php';
|
||||
require 'HTMLPurifier/HTMLModule/Tidy/Proprietary.php';
|
||||
require 'HTMLPurifier/HTMLModule/Tidy/XHTMLAndHTML4.php';
|
||||
require 'HTMLPurifier/HTMLModule/Tidy/Strict.php';
|
||||
require 'HTMLPurifier/HTMLModule/Tidy/Transitional.php';
|
||||
require 'HTMLPurifier/HTMLModule/Tidy/XHTML.php';
|
||||
require 'HTMLPurifier/Injector/AutoParagraph.php';
|
||||
require 'HTMLPurifier/Injector/Linkify.php';
|
||||
require 'HTMLPurifier/Injector/PurifierLinkify.php';
|
||||
require 'HTMLPurifier/Injector/SafeObject.php';
|
||||
require 'HTMLPurifier/Lexer/DOMLex.php';
|
||||
require 'HTMLPurifier/Lexer/DirectLex.php';
|
||||
require 'HTMLPurifier/Strategy/Composite.php';
|
||||
require 'HTMLPurifier/Strategy/Core.php';
|
||||
require 'HTMLPurifier/Strategy/FixNesting.php';
|
||||
require 'HTMLPurifier/Strategy/MakeWellFormed.php';
|
||||
require 'HTMLPurifier/Strategy/RemoveForeignElements.php';
|
||||
require 'HTMLPurifier/Strategy/ValidateAttributes.php';
|
||||
require 'HTMLPurifier/TagTransform/Font.php';
|
||||
require 'HTMLPurifier/TagTransform/Simple.php';
|
||||
require 'HTMLPurifier/Token/Comment.php';
|
||||
require 'HTMLPurifier/Token/Tag.php';
|
||||
require 'HTMLPurifier/Token/Empty.php';
|
||||
require 'HTMLPurifier/Token/End.php';
|
||||
require 'HTMLPurifier/Token/Start.php';
|
||||
require 'HTMLPurifier/Token/Text.php';
|
||||
require 'HTMLPurifier/URIFilter/DisableExternal.php';
|
||||
require 'HTMLPurifier/URIFilter/DisableExternalResources.php';
|
||||
require 'HTMLPurifier/URIFilter/HostBlacklist.php';
|
||||
require 'HTMLPurifier/URIFilter/MakeAbsolute.php';
|
||||
require 'HTMLPurifier/URIFilter/Munge.php';
|
||||
require 'HTMLPurifier/URIScheme/ftp.php';
|
||||
require 'HTMLPurifier/URIScheme/http.php';
|
||||
require 'HTMLPurifier/URIScheme/https.php';
|
||||
require 'HTMLPurifier/URIScheme/mailto.php';
|
||||
require 'HTMLPurifier/URIScheme/news.php';
|
||||
require 'HTMLPurifier/URIScheme/nntp.php';
|
||||
require 'HTMLPurifier/VarParser/Flexible.php';
|
||||
require 'HTMLPurifier/VarParser/Native.php';
|
28
library/HTMLPurifier.kses.php
Normal file
28
library/HTMLPurifier.kses.php
Normal file
@@ -0,0 +1,28 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* @file
|
||||
* Emulation layer for code that used kses(), substituting in HTML Purifier.
|
||||
*/
|
||||
|
||||
require_once dirname(__FILE__) . '/HTMLPurifier.auto.php';
|
||||
|
||||
function kses($string, $allowed_html, $allowed_protocols = null) {
|
||||
$config = HTMLPurifier_Config::createDefault();
|
||||
$allowed_elements = array();
|
||||
$allowed_attributes = array();
|
||||
foreach ($allowed_html as $element => $attributes) {
|
||||
$allowed_elements[$element] = true;
|
||||
foreach ($attributes as $attribute => $x) {
|
||||
$allowed_attributes["$element.$attribute"] = true;
|
||||
}
|
||||
}
|
||||
$config->set('HTML', 'AllowedElements', $allowed_elements);
|
||||
$config->set('HTML', 'AllowedAttributes', $allowed_attributes);
|
||||
$allowed_schemes = array();
|
||||
if ($allowed_protocols !== null) {
|
||||
$config->set('URI', 'AllowedSchemes', $allowed_protocols);
|
||||
}
|
||||
$purifier = new HTMLPurifier($config);
|
||||
return $purifier->purify($string);
|
||||
}
|
9
library/HTMLPurifier.path.php
Normal file
9
library/HTMLPurifier.path.php
Normal file
@@ -0,0 +1,9 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* @file
|
||||
* Convenience stub file that adds HTML Purifier's library file to the path
|
||||
* without any other side-effects.
|
||||
*/
|
||||
|
||||
set_include_path(dirname(__FILE__) . PATH_SEPARATOR . get_include_path() );
|
@@ -1,7 +1,6 @@
|
||||
<?php
|
||||
|
||||
/*!
|
||||
* @mainpage
|
||||
/*! @mainpage
|
||||
*
|
||||
* HTML Purifier is an HTML filter that will take an arbitrary snippet of
|
||||
* HTML and rigorously test, validate and filter it into a version that
|
||||
@@ -16,14 +15,12 @@
|
||||
* -# Generating HTML from the purified tokens.
|
||||
*
|
||||
* However, most users will only need to interface with the HTMLPurifier
|
||||
* class, so this massive amount of infrastructure is usually concealed.
|
||||
* If you plan on working with the internals, be sure to include
|
||||
* HTMLPurifier_ConfigSchema and HTMLPurifier_Config.
|
||||
* and HTMLPurifier_Config.
|
||||
*/
|
||||
|
||||
/*
|
||||
HTML Purifier 1.4.1 - Standards Compliant HTML Filtering
|
||||
Copyright (C) 2006 Edward Z. Yang
|
||||
HTML Purifier 3.1.1 - Standards Compliant HTML Filtering
|
||||
Copyright (C) 2006-2008 Edward Z. Yang
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
@@ -40,42 +37,45 @@
|
||||
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
*/
|
||||
|
||||
// almost every class has an undocumented dependency to these, so make sure
|
||||
// they get included
|
||||
require_once 'HTMLPurifier/ConfigSchema.php';
|
||||
require_once 'HTMLPurifier/Config.php';
|
||||
require_once 'HTMLPurifier/Context.php';
|
||||
|
||||
require_once 'HTMLPurifier/Lexer.php';
|
||||
require_once 'HTMLPurifier/Generator.php';
|
||||
require_once 'HTMLPurifier/Strategy/Core.php';
|
||||
require_once 'HTMLPurifier/Encoder.php';
|
||||
|
||||
/**
|
||||
* Main library execution class.
|
||||
* Facade that coordinates HTML Purifier's subsystems in order to purify HTML.
|
||||
*
|
||||
* Facade that performs calls to the HTMLPurifier_Lexer,
|
||||
* HTMLPurifier_Strategy and HTMLPurifier_Generator subsystems in order to
|
||||
* purify HTML.
|
||||
* @note There are several points in which configuration can be specified
|
||||
* for HTML Purifier. The precedence of these (from lowest to
|
||||
* highest) is as follows:
|
||||
* -# Instance: new HTMLPurifier($config)
|
||||
* -# Invocation: purify($html, $config)
|
||||
* These configurations are entirely independent of each other and
|
||||
* are *not* merged (this behavior may change in the future).
|
||||
*
|
||||
* @todo We need an easier way to inject strategies, it'll probably end
|
||||
* up getting done through config though.
|
||||
* @todo We need an easier way to inject strategies using the configuration
|
||||
* object.
|
||||
*/
|
||||
class HTMLPurifier
|
||||
{
|
||||
|
||||
var $version = '1.4.1';
|
||||
/** Version of HTML Purifier */
|
||||
public $version = '3.1.1';
|
||||
|
||||
var $config;
|
||||
var $filters;
|
||||
/** Constant with version of HTML Purifier */
|
||||
const VERSION = '3.1.1';
|
||||
|
||||
var $lexer, $strategy, $generator;
|
||||
/** Global configuration object */
|
||||
public $config;
|
||||
|
||||
/** Array of extra HTMLPurifier_Filter objects to run on HTML, for backwards compatibility */
|
||||
private $filters = array();
|
||||
|
||||
/** Single instance of HTML Purifier */
|
||||
private static $instance;
|
||||
|
||||
protected $strategy, $generator;
|
||||
|
||||
/**
|
||||
* Final HTMLPurifier_Context of last run purification. Might be an array.
|
||||
* @public
|
||||
* Resultant HTMLPurifier_Context of last run purification. Is an array
|
||||
* of contexts if the last called method was purifyArray().
|
||||
*/
|
||||
var $context;
|
||||
public $context;
|
||||
|
||||
/**
|
||||
* Initializes the purifier.
|
||||
@@ -85,13 +85,11 @@ class HTMLPurifier
|
||||
* The parameter can also be any type that
|
||||
* HTMLPurifier_Config::create() supports.
|
||||
*/
|
||||
function HTMLPurifier($config = null) {
|
||||
public function __construct($config = null) {
|
||||
|
||||
$this->config = HTMLPurifier_Config::create($config);
|
||||
|
||||
$this->lexer = HTMLPurifier_Lexer::create();
|
||||
$this->strategy = new HTMLPurifier_Strategy_Core();
|
||||
$this->generator = new HTMLPurifier_Generator();
|
||||
|
||||
}
|
||||
|
||||
@@ -99,7 +97,8 @@ class HTMLPurifier
|
||||
* Adds a filter to process the output. First come first serve
|
||||
* @param $filter HTMLPurifier_Filter object
|
||||
*/
|
||||
function addFilter($filter) {
|
||||
public function addFilter($filter) {
|
||||
trigger_error('HTMLPurifier->addFilter() is deprecated, use configuration directives in the Filter namespace or Filter.Custom', E_USER_WARNING);
|
||||
$this->filters[] = $filter;
|
||||
}
|
||||
|
||||
@@ -113,15 +112,58 @@ class HTMLPurifier
|
||||
* that HTMLPurifier_Config::create() supports.
|
||||
* @return Purified HTML
|
||||
*/
|
||||
function purify($html, $config = null) {
|
||||
public function purify($html, $config = null) {
|
||||
|
||||
// :TODO: make the config merge in, instead of replace
|
||||
$config = $config ? HTMLPurifier_Config::create($config) : $this->config;
|
||||
|
||||
// implementation is partially environment dependant, partially
|
||||
// configuration dependant
|
||||
$lexer = HTMLPurifier_Lexer::create($config);
|
||||
|
||||
$context = new HTMLPurifier_Context();
|
||||
|
||||
// setup HTML generator
|
||||
$this->generator = new HTMLPurifier_Generator($config, $context);
|
||||
$context->register('Generator', $this->generator);
|
||||
|
||||
// set up global context variables
|
||||
if ($config->get('Core', 'CollectErrors')) {
|
||||
// may get moved out if other facilities use it
|
||||
$language_factory = HTMLPurifier_LanguageFactory::instance();
|
||||
$language = $language_factory->create($config, $context);
|
||||
$context->register('Locale', $language);
|
||||
|
||||
$error_collector = new HTMLPurifier_ErrorCollector($context);
|
||||
$context->register('ErrorCollector', $error_collector);
|
||||
}
|
||||
|
||||
// setup id_accumulator context, necessary due to the fact that
|
||||
// AttrValidator can be called from many places
|
||||
$id_accumulator = HTMLPurifier_IDAccumulator::build($config, $context);
|
||||
$context->register('IDAccumulator', $id_accumulator);
|
||||
|
||||
$html = HTMLPurifier_Encoder::convertToUTF8($html, $config, $context);
|
||||
|
||||
for ($i = 0, $size = count($this->filters); $i < $size; $i++) {
|
||||
$html = $this->filters[$i]->preFilter($html, $config, $context);
|
||||
// setup filters
|
||||
$filter_flags = $config->getBatch('Filter');
|
||||
$custom_filters = $filter_flags['Custom'];
|
||||
unset($filter_flags['Custom']);
|
||||
$filters = array();
|
||||
foreach ($filter_flags as $filter => $flag) {
|
||||
if (!$flag) continue;
|
||||
$class = "HTMLPurifier_Filter_$filter";
|
||||
$filters[] = new $class;
|
||||
}
|
||||
foreach ($custom_filters as $filter) {
|
||||
// maybe "HTMLPurifier_Filter_$filter", but be consistent with AutoFormat
|
||||
$filters[] = $filter;
|
||||
}
|
||||
$filters = array_merge($filters, $this->filters);
|
||||
// maybe prepare(), but later
|
||||
|
||||
for ($i = 0, $filter_size = count($filters); $i < $filter_size; $i++) {
|
||||
$html = $filters[$i]->preFilter($html, $config, $context);
|
||||
}
|
||||
|
||||
// purified HTML
|
||||
@@ -130,17 +172,16 @@ class HTMLPurifier
|
||||
// list of tokens
|
||||
$this->strategy->execute(
|
||||
// list of un-purified tokens
|
||||
$this->lexer->tokenizeHTML(
|
||||
$lexer->tokenizeHTML(
|
||||
// un-purified HTML
|
||||
$html, $config, $context
|
||||
),
|
||||
$config, $context
|
||||
),
|
||||
$config, $context
|
||||
)
|
||||
);
|
||||
|
||||
for ($i = $size - 1; $i >= 0; $i--) {
|
||||
$html = $this->filters[$i]->postFilter($html, $config, $context);
|
||||
for ($i = $filter_size - 1; $i >= 0; $i--) {
|
||||
$html = $filters[$i]->postFilter($html, $config, $context);
|
||||
}
|
||||
|
||||
$html = HTMLPurifier_Encoder::convertFromUTF8($html, $config, $context);
|
||||
@@ -154,7 +195,7 @@ class HTMLPurifier
|
||||
* See HTMLPurifier::purify() for more details.
|
||||
* @return Array of purified HTML
|
||||
*/
|
||||
function purifyArray($array_of_html, $config = null) {
|
||||
public function purifyArray($array_of_html, $config = null) {
|
||||
$context_array = array();
|
||||
foreach ($array_of_html as $key => $html) {
|
||||
$array_of_html[$key] = $this->purify($html, $config);
|
||||
@@ -164,7 +205,30 @@ class HTMLPurifier
|
||||
return $array_of_html;
|
||||
}
|
||||
|
||||
/**
|
||||
* Singleton for enforcing just one HTML Purifier in your system
|
||||
* @param $prototype Optional prototype HTMLPurifier instance to
|
||||
* overload singleton with, or HTMLPurifier_Config
|
||||
* instance to configure the generated version with.
|
||||
*/
|
||||
public static function instance($prototype = null) {
|
||||
if (!self::$instance || $prototype) {
|
||||
if ($prototype instanceof HTMLPurifier) {
|
||||
self::$instance = $prototype;
|
||||
} elseif ($prototype) {
|
||||
self::$instance = new HTMLPurifier($prototype);
|
||||
} else {
|
||||
self::$instance = new HTMLPurifier();
|
||||
}
|
||||
}
|
||||
return self::$instance;
|
||||
}
|
||||
|
||||
/**
|
||||
* @note Backwards compatibility, see instance()
|
||||
*/
|
||||
public static function getInstance($prototype = null) {
|
||||
return HTMLPurifier::instance($prototype);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
?>
|
189
library/HTMLPurifier.safe-includes.php
Normal file
189
library/HTMLPurifier.safe-includes.php
Normal file
@@ -0,0 +1,189 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* @file
|
||||
* This file was auto-generated by generate-includes.php and includes all of
|
||||
* the core files required by HTML Purifier. This is a convenience stub that
|
||||
* includes all files using dirname(__FILE__) and require_once. PLEASE DO NOT
|
||||
* EDIT THIS FILE, changes will be overwritten the next time the script is run.
|
||||
*
|
||||
* Changes to include_path are not necessary.
|
||||
*/
|
||||
|
||||
$__dir = dirname(__FILE__);
|
||||
|
||||
require_once $__dir . '/HTMLPurifier.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrCollections.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrTransform.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrTypes.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrValidator.php';
|
||||
require_once $__dir . '/HTMLPurifier/Bootstrap.php';
|
||||
require_once $__dir . '/HTMLPurifier/Definition.php';
|
||||
require_once $__dir . '/HTMLPurifier/CSSDefinition.php';
|
||||
require_once $__dir . '/HTMLPurifier/ChildDef.php';
|
||||
require_once $__dir . '/HTMLPurifier/Config.php';
|
||||
require_once $__dir . '/HTMLPurifier/ConfigSchema.php';
|
||||
require_once $__dir . '/HTMLPurifier/ContentSets.php';
|
||||
require_once $__dir . '/HTMLPurifier/Context.php';
|
||||
require_once $__dir . '/HTMLPurifier/DefinitionCache.php';
|
||||
require_once $__dir . '/HTMLPurifier/DefinitionCacheFactory.php';
|
||||
require_once $__dir . '/HTMLPurifier/Doctype.php';
|
||||
require_once $__dir . '/HTMLPurifier/DoctypeRegistry.php';
|
||||
require_once $__dir . '/HTMLPurifier/ElementDef.php';
|
||||
require_once $__dir . '/HTMLPurifier/Encoder.php';
|
||||
require_once $__dir . '/HTMLPurifier/EntityLookup.php';
|
||||
require_once $__dir . '/HTMLPurifier/EntityParser.php';
|
||||
require_once $__dir . '/HTMLPurifier/ErrorCollector.php';
|
||||
require_once $__dir . '/HTMLPurifier/Exception.php';
|
||||
require_once $__dir . '/HTMLPurifier/Filter.php';
|
||||
require_once $__dir . '/HTMLPurifier/Generator.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLDefinition.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModuleManager.php';
|
||||
require_once $__dir . '/HTMLPurifier/IDAccumulator.php';
|
||||
require_once $__dir . '/HTMLPurifier/Injector.php';
|
||||
require_once $__dir . '/HTMLPurifier/Language.php';
|
||||
require_once $__dir . '/HTMLPurifier/LanguageFactory.php';
|
||||
require_once $__dir . '/HTMLPurifier/Length.php';
|
||||
require_once $__dir . '/HTMLPurifier/Lexer.php';
|
||||
require_once $__dir . '/HTMLPurifier/PercentEncoder.php';
|
||||
require_once $__dir . '/HTMLPurifier/Strategy.php';
|
||||
require_once $__dir . '/HTMLPurifier/StringHash.php';
|
||||
require_once $__dir . '/HTMLPurifier/StringHashParser.php';
|
||||
require_once $__dir . '/HTMLPurifier/TagTransform.php';
|
||||
require_once $__dir . '/HTMLPurifier/Token.php';
|
||||
require_once $__dir . '/HTMLPurifier/TokenFactory.php';
|
||||
require_once $__dir . '/HTMLPurifier/URI.php';
|
||||
require_once $__dir . '/HTMLPurifier/URIDefinition.php';
|
||||
require_once $__dir . '/HTMLPurifier/URIFilter.php';
|
||||
require_once $__dir . '/HTMLPurifier/URIParser.php';
|
||||
require_once $__dir . '/HTMLPurifier/URIScheme.php';
|
||||
require_once $__dir . '/HTMLPurifier/URISchemeRegistry.php';
|
||||
require_once $__dir . '/HTMLPurifier/UnitConverter.php';
|
||||
require_once $__dir . '/HTMLPurifier/VarParser.php';
|
||||
require_once $__dir . '/HTMLPurifier/VarParserException.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/CSS.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/Enum.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/Integer.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/Lang.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/Switch.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/Text.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/URI.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Number.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/AlphaValue.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Background.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/BackgroundPosition.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Border.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Color.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Composite.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/DenyElementDecorator.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Filter.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Font.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/FontFamily.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/ImportantDecorator.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Length.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/ListStyle.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Multiple.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/Percentage.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/TextDecoration.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/CSS/URI.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/HTML/Bool.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/HTML/Color.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/HTML/FrameTarget.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/HTML/ID.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/HTML/Pixels.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/HTML/Length.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/HTML/LinkTypes.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/HTML/MultiLength.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/HTML/Nmtokens.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/URI/Email.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/URI/Host.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/URI/IPv4.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/URI/IPv6.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrDef/URI/Email/SimpleCheck.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrTransform/BdoDir.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrTransform/BgColor.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrTransform/BoolToCSS.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrTransform/Border.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrTransform/EnumToCSS.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrTransform/ImgRequired.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrTransform/ImgSpace.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrTransform/Lang.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrTransform/Length.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrTransform/Name.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrTransform/SafeEmbed.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrTransform/SafeObject.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrTransform/SafeParam.php';
|
||||
require_once $__dir . '/HTMLPurifier/AttrTransform/ScriptRequired.php';
|
||||
require_once $__dir . '/HTMLPurifier/ChildDef/Chameleon.php';
|
||||
require_once $__dir . '/HTMLPurifier/ChildDef/Custom.php';
|
||||
require_once $__dir . '/HTMLPurifier/ChildDef/Empty.php';
|
||||
require_once $__dir . '/HTMLPurifier/ChildDef/Required.php';
|
||||
require_once $__dir . '/HTMLPurifier/ChildDef/Optional.php';
|
||||
require_once $__dir . '/HTMLPurifier/ChildDef/StrictBlockquote.php';
|
||||
require_once $__dir . '/HTMLPurifier/ChildDef/Table.php';
|
||||
require_once $__dir . '/HTMLPurifier/DefinitionCache/Decorator.php';
|
||||
require_once $__dir . '/HTMLPurifier/DefinitionCache/Null.php';
|
||||
require_once $__dir . '/HTMLPurifier/DefinitionCache/Serializer.php';
|
||||
require_once $__dir . '/HTMLPurifier/DefinitionCache/Decorator/Cleanup.php';
|
||||
require_once $__dir . '/HTMLPurifier/DefinitionCache/Decorator/Memory.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/Bdo.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/CommonAttributes.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/Edit.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/Hypertext.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/Image.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/Legacy.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/List.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/NonXMLCommonAttributes.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/Object.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/Presentation.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/Proprietary.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/Ruby.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/SafeEmbed.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/SafeObject.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/Scripting.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/StyleAttribute.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/Tables.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/Target.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/Text.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/Tidy.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/XMLCommonAttributes.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/Tidy/Proprietary.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/Tidy/XHTMLAndHTML4.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/Tidy/Strict.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/Tidy/Transitional.php';
|
||||
require_once $__dir . '/HTMLPurifier/HTMLModule/Tidy/XHTML.php';
|
||||
require_once $__dir . '/HTMLPurifier/Injector/AutoParagraph.php';
|
||||
require_once $__dir . '/HTMLPurifier/Injector/Linkify.php';
|
||||
require_once $__dir . '/HTMLPurifier/Injector/PurifierLinkify.php';
|
||||
require_once $__dir . '/HTMLPurifier/Injector/SafeObject.php';
|
||||
require_once $__dir . '/HTMLPurifier/Lexer/DOMLex.php';
|
||||
require_once $__dir . '/HTMLPurifier/Lexer/DirectLex.php';
|
||||
require_once $__dir . '/HTMLPurifier/Strategy/Composite.php';
|
||||
require_once $__dir . '/HTMLPurifier/Strategy/Core.php';
|
||||
require_once $__dir . '/HTMLPurifier/Strategy/FixNesting.php';
|
||||
require_once $__dir . '/HTMLPurifier/Strategy/MakeWellFormed.php';
|
||||
require_once $__dir . '/HTMLPurifier/Strategy/RemoveForeignElements.php';
|
||||
require_once $__dir . '/HTMLPurifier/Strategy/ValidateAttributes.php';
|
||||
require_once $__dir . '/HTMLPurifier/TagTransform/Font.php';
|
||||
require_once $__dir . '/HTMLPurifier/TagTransform/Simple.php';
|
||||
require_once $__dir . '/HTMLPurifier/Token/Comment.php';
|
||||
require_once $__dir . '/HTMLPurifier/Token/Tag.php';
|
||||
require_once $__dir . '/HTMLPurifier/Token/Empty.php';
|
||||
require_once $__dir . '/HTMLPurifier/Token/End.php';
|
||||
require_once $__dir . '/HTMLPurifier/Token/Start.php';
|
||||
require_once $__dir . '/HTMLPurifier/Token/Text.php';
|
||||
require_once $__dir . '/HTMLPurifier/URIFilter/DisableExternal.php';
|
||||
require_once $__dir . '/HTMLPurifier/URIFilter/DisableExternalResources.php';
|
||||
require_once $__dir . '/HTMLPurifier/URIFilter/HostBlacklist.php';
|
||||
require_once $__dir . '/HTMLPurifier/URIFilter/MakeAbsolute.php';
|
||||
require_once $__dir . '/HTMLPurifier/URIFilter/Munge.php';
|
||||
require_once $__dir . '/HTMLPurifier/URIScheme/ftp.php';
|
||||
require_once $__dir . '/HTMLPurifier/URIScheme/http.php';
|
||||
require_once $__dir . '/HTMLPurifier/URIScheme/https.php';
|
||||
require_once $__dir . '/HTMLPurifier/URIScheme/mailto.php';
|
||||
require_once $__dir . '/HTMLPurifier/URIScheme/news.php';
|
||||
require_once $__dir . '/HTMLPurifier/URIScheme/nntp.php';
|
||||
require_once $__dir . '/HTMLPurifier/VarParser/Flexible.php';
|
||||
require_once $__dir . '/HTMLPurifier/VarParser/Native.php';
|
127
library/HTMLPurifier/AttrCollections.php
Normal file
127
library/HTMLPurifier/AttrCollections.php
Normal file
@@ -0,0 +1,127 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Defines common attribute collections that modules reference
|
||||
*/
|
||||
|
||||
class HTMLPurifier_AttrCollections
|
||||
{
|
||||
|
||||
/**
|
||||
* Associative array of attribute collections, indexed by name
|
||||
*/
|
||||
public $info = array();
|
||||
|
||||
/**
|
||||
* Performs all expansions on internal data for use by other inclusions
|
||||
* It also collects all attribute collection extensions from
|
||||
* modules
|
||||
* @param $attr_types HTMLPurifier_AttrTypes instance
|
||||
* @param $modules Hash array of HTMLPurifier_HTMLModule members
|
||||
*/
|
||||
public function __construct($attr_types, $modules) {
|
||||
// load extensions from the modules
|
||||
foreach ($modules as $module) {
|
||||
foreach ($module->attr_collections as $coll_i => $coll) {
|
||||
if (!isset($this->info[$coll_i])) {
|
||||
$this->info[$coll_i] = array();
|
||||
}
|
||||
foreach ($coll as $attr_i => $attr) {
|
||||
if ($attr_i === 0 && isset($this->info[$coll_i][$attr_i])) {
|
||||
// merge in includes
|
||||
$this->info[$coll_i][$attr_i] = array_merge(
|
||||
$this->info[$coll_i][$attr_i], $attr);
|
||||
continue;
|
||||
}
|
||||
$this->info[$coll_i][$attr_i] = $attr;
|
||||
}
|
||||
}
|
||||
}
|
||||
// perform internal expansions and inclusions
|
||||
foreach ($this->info as $name => $attr) {
|
||||
// merge attribute collections that include others
|
||||
$this->performInclusions($this->info[$name]);
|
||||
// replace string identifiers with actual attribute objects
|
||||
$this->expandIdentifiers($this->info[$name], $attr_types);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes a reference to an attribute associative array and performs
|
||||
* all inclusions specified by the zero index.
|
||||
* @param &$attr Reference to attribute array
|
||||
*/
|
||||
public function performInclusions(&$attr) {
|
||||
if (!isset($attr[0])) return;
|
||||
$merge = $attr[0];
|
||||
$seen = array(); // recursion guard
|
||||
// loop through all the inclusions
|
||||
for ($i = 0; isset($merge[$i]); $i++) {
|
||||
if (isset($seen[$merge[$i]])) continue;
|
||||
$seen[$merge[$i]] = true;
|
||||
// foreach attribute of the inclusion, copy it over
|
||||
if (!isset($this->info[$merge[$i]])) continue;
|
||||
foreach ($this->info[$merge[$i]] as $key => $value) {
|
||||
if (isset($attr[$key])) continue; // also catches more inclusions
|
||||
$attr[$key] = $value;
|
||||
}
|
||||
if (isset($this->info[$merge[$i]][0])) {
|
||||
// recursion
|
||||
$merge = array_merge($merge, $this->info[$merge[$i]][0]);
|
||||
}
|
||||
}
|
||||
unset($attr[0]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Expands all string identifiers in an attribute array by replacing
|
||||
* them with the appropriate values inside HTMLPurifier_AttrTypes
|
||||
* @param &$attr Reference to attribute array
|
||||
* @param $attr_types HTMLPurifier_AttrTypes instance
|
||||
*/
|
||||
public function expandIdentifiers(&$attr, $attr_types) {
|
||||
|
||||
// because foreach will process new elements we add, make sure we
|
||||
// skip duplicates
|
||||
$processed = array();
|
||||
|
||||
foreach ($attr as $def_i => $def) {
|
||||
// skip inclusions
|
||||
if ($def_i === 0) continue;
|
||||
|
||||
if (isset($processed[$def_i])) continue;
|
||||
|
||||
// determine whether or not attribute is required
|
||||
if ($required = (strpos($def_i, '*') !== false)) {
|
||||
// rename the definition
|
||||
unset($attr[$def_i]);
|
||||
$def_i = trim($def_i, '*');
|
||||
$attr[$def_i] = $def;
|
||||
}
|
||||
|
||||
$processed[$def_i] = true;
|
||||
|
||||
// if we've already got a literal object, move on
|
||||
if (is_object($def)) {
|
||||
// preserve previous required
|
||||
$attr[$def_i]->required = ($required || $attr[$def_i]->required);
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($def === false) {
|
||||
unset($attr[$def_i]);
|
||||
continue;
|
||||
}
|
||||
|
||||
if ($t = $attr_types->get($def)) {
|
||||
$attr[$def_i] = $t;
|
||||
$attr[$def_i]->required = $required;
|
||||
} else {
|
||||
unset($attr[$def_i]);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -10,26 +10,29 @@
|
||||
* subclasses are also responsible for cleaning the code if possible.
|
||||
*/
|
||||
|
||||
class HTMLPurifier_AttrDef
|
||||
abstract class HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Tells us whether or not an HTML attribute is minimized. Only the
|
||||
* boolean attribute vapourware would use this.
|
||||
* Tells us whether or not an HTML attribute is minimized. Has no
|
||||
* meaning in other contexts.
|
||||
*/
|
||||
var $minimized = false;
|
||||
public $minimized = false;
|
||||
|
||||
/**
|
||||
* Tells us whether or not an HTML attribute is required. Has no
|
||||
* meaning in other contexts
|
||||
*/
|
||||
public $required = false;
|
||||
|
||||
/**
|
||||
* Validates and cleans passed string according to a definition.
|
||||
*
|
||||
* @public
|
||||
* @param $string String to be validated and cleaned.
|
||||
* @param $config Mandatory HTMLPurifier_Config object.
|
||||
* @param $context Mandatory HTMLPurifier_AttrContext object.
|
||||
*/
|
||||
function validate($string, $config, &$context) {
|
||||
trigger_error('Cannot call abstract function', E_USER_ERROR);
|
||||
}
|
||||
abstract public function validate($string, $config, $context);
|
||||
|
||||
/**
|
||||
* Convenience method that parses a string as if it were CDATA.
|
||||
@@ -48,20 +51,36 @@ class HTMLPurifier_AttrDef
|
||||
*
|
||||
* @warning This processing is inconsistent with XML's whitespace handling
|
||||
* as specified by section 3.3.3 and referenced XHTML 1.0 section
|
||||
* 4.7. Compliant processing requires all line breaks normalized
|
||||
* to "\n", so the fix is not as simple as fixing it in this
|
||||
* function. Trim and whitespace collapsing are supposed to only
|
||||
* occur in NMTOKENs. However, note that we are NOT necessarily
|
||||
* parsing XML, thus, this behavior may still be correct.
|
||||
*
|
||||
* @public
|
||||
* 4.7. However, note that we are NOT necessarily
|
||||
* parsing XML, thus, this behavior may still be correct. We
|
||||
* assume that newlines have been normalized.
|
||||
*/
|
||||
function parseCDATA($string) {
|
||||
public function parseCDATA($string) {
|
||||
$string = trim($string);
|
||||
$string = str_replace("\n", '', $string);
|
||||
$string = str_replace(array("\r", "\t"), ' ', $string);
|
||||
$string = str_replace(array("\n", "\t", "\r"), ' ', $string);
|
||||
return $string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Factory method for creating this class from a string.
|
||||
* @param $string String construction info
|
||||
* @return Created AttrDef object corresponding to $string
|
||||
*/
|
||||
public function make($string) {
|
||||
// default implementation, return a flyweight of this object.
|
||||
// If $string has an effect on the returned object (i.e. you
|
||||
// need to overload this method), it is best
|
||||
// to clone or instantiate new copies. (Instantiation is safer.)
|
||||
return $this;
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes spaces from rgb(0, 0, 0) so that shorthand CSS properties work
|
||||
* properly. THIS IS A HACK!
|
||||
*/
|
||||
protected function mungeRgb($string) {
|
||||
return preg_replace('/rgb\((\d+)\s*,\s*(\d+)\s*,\s*(\d+)\)/', 'rgb(\1,\2,\3)', $string);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -1,8 +1,5 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
require_once 'HTMLPurifier/CSSDefinition.php';
|
||||
|
||||
/**
|
||||
* Validates the HTML attribute style, otherwise known as CSS.
|
||||
* @note We don't implement the whole CSS specification, so it might be
|
||||
@@ -17,7 +14,7 @@ require_once 'HTMLPurifier/CSSDefinition.php';
|
||||
class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
function validate($css, $config, &$context) {
|
||||
public function validate($css, $config, $context) {
|
||||
|
||||
$css = $this->parseCDATA($css);
|
||||
|
||||
@@ -32,13 +29,32 @@ class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
|
||||
$declarations = explode(';', $css);
|
||||
$propvalues = array();
|
||||
|
||||
/**
|
||||
* Name of the current CSS property being validated.
|
||||
*/
|
||||
$property = false;
|
||||
$context->register('CurrentCSSProperty', $property);
|
||||
|
||||
foreach ($declarations as $declaration) {
|
||||
if (!$declaration) continue;
|
||||
if (!strpos($declaration, ':')) continue;
|
||||
list($property, $value) = explode(':', $declaration, 2);
|
||||
$property = trim($property);
|
||||
$value = trim($value);
|
||||
if (!isset($definition->info[$property])) continue;
|
||||
$ok = false;
|
||||
do {
|
||||
if (isset($definition->info[$property])) {
|
||||
$ok = true;
|
||||
break;
|
||||
}
|
||||
if (ctype_lower($property)) break;
|
||||
$property = strtolower($property);
|
||||
if (isset($definition->info[$property])) {
|
||||
$ok = true;
|
||||
break;
|
||||
}
|
||||
} while(0);
|
||||
if (!$ok) continue;
|
||||
// inefficient call, since the validator will do this again
|
||||
if (strtolower(trim($value)) !== 'inherit') {
|
||||
// inherit works for everything (but only on the base property)
|
||||
@@ -51,6 +67,8 @@ class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
|
||||
$propvalues[$property] = $result;
|
||||
}
|
||||
|
||||
$context->destroy('CurrentCSSProperty');
|
||||
|
||||
// procedure does not write the new CSS simultaneously, so it's
|
||||
// slightly inefficient, but it's the only way of getting rid of
|
||||
// duplicates. Perhaps config to optimize it, but not now.
|
||||
@@ -66,4 +84,3 @@ class HTMLPurifier_AttrDef_CSS extends HTMLPurifier_AttrDef
|
||||
|
||||
}
|
||||
|
||||
?>
|
19
library/HTMLPurifier/AttrDef/CSS/AlphaValue.php
Normal file
19
library/HTMLPurifier/AttrDef/CSS/AlphaValue.php
Normal file
@@ -0,0 +1,19 @@
|
||||
<?php
|
||||
|
||||
class HTMLPurifier_AttrDef_CSS_AlphaValue extends HTMLPurifier_AttrDef_CSS_Number
|
||||
{
|
||||
|
||||
public function __construct() {
|
||||
parent::__construct(false); // opacity is non-negative, but we will clamp it
|
||||
}
|
||||
|
||||
public function validate($number, $config, $context) {
|
||||
$result = parent::validate($number, $config, $context);
|
||||
if ($result === false) return $result;
|
||||
$float = (float) $result;
|
||||
if ($float < 0.0) $result = '0';
|
||||
if ($float > 1.0) $result = '1';
|
||||
return $result;
|
||||
}
|
||||
|
||||
}
|
@@ -1,22 +1,19 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
require_once 'HTMLPurifier/CSSDefinition.php';
|
||||
|
||||
/**
|
||||
* Validates shorthand CSS property background.
|
||||
* @warning Does not support url tokens that have internal spaces.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Background extends HTMLPurifier_AttrDef
|
||||
class HTMLPurifier_AttrDef_CSS_Background extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Local copy of component validators.
|
||||
* @note See HTMLPurifier_AttrDef_Font::$info for a similar impl.
|
||||
*/
|
||||
var $info;
|
||||
protected $info;
|
||||
|
||||
function HTMLPurifier_AttrDef_Background($config) {
|
||||
public function __construct($config) {
|
||||
$def = $config->getCSSDefinition();
|
||||
$this->info['background-color'] = $def->info['background-color'];
|
||||
$this->info['background-image'] = $def->info['background-image'];
|
||||
@@ -25,12 +22,15 @@ class HTMLPurifier_AttrDef_Background extends HTMLPurifier_AttrDef
|
||||
$this->info['background-position'] = $def->info['background-position'];
|
||||
}
|
||||
|
||||
function validate($string, $config, &$context) {
|
||||
public function validate($string, $config, $context) {
|
||||
|
||||
// regular pre-processing
|
||||
$string = $this->parseCDATA($string);
|
||||
if ($string === '') return false;
|
||||
|
||||
// munge rgb() decl if necessary
|
||||
$string = $this->mungeRgb($string);
|
||||
|
||||
// assumes URI doesn't have spaces in it
|
||||
$bits = explode(' ', strtolower($string)); // bits to process
|
||||
|
||||
@@ -84,4 +84,3 @@ class HTMLPurifier_AttrDef_Background extends HTMLPurifier_AttrDef
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -1,9 +1,5 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
require_once 'HTMLPurifier/AttrDef/CSSLength.php';
|
||||
require_once 'HTMLPurifier/AttrDef/Percentage.php';
|
||||
|
||||
/* W3C says:
|
||||
[ // adjective and number must be in correct order, even if
|
||||
// you could switch them without introducing ambiguity.
|
||||
@@ -45,18 +41,18 @@ require_once 'HTMLPurifier/AttrDef/Percentage.php';
|
||||
/**
|
||||
* Validates the value of background-position.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_BackgroundPosition extends HTMLPurifier_AttrDef
|
||||
class HTMLPurifier_AttrDef_CSS_BackgroundPosition extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
var $length;
|
||||
var $percentage;
|
||||
protected $length;
|
||||
protected $percentage;
|
||||
|
||||
function HTMLPurifier_AttrDef_BackgroundPosition() {
|
||||
$this->length = new HTMLPurifier_AttrDef_CSSLength();
|
||||
$this->percentage = new HTMLPurifier_AttrDef_Percentage();
|
||||
public function __construct() {
|
||||
$this->length = new HTMLPurifier_AttrDef_CSS_Length();
|
||||
$this->percentage = new HTMLPurifier_AttrDef_CSS_Percentage();
|
||||
}
|
||||
|
||||
function validate($string, $config, &$context) {
|
||||
public function validate($string, $config, $context) {
|
||||
$string = $this->parseCDATA($string);
|
||||
$bits = explode(' ', $string);
|
||||
|
||||
@@ -127,4 +123,3 @@ class HTMLPurifier_AttrDef_BackgroundPosition extends HTMLPurifier_AttrDef
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -1,28 +1,26 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
/**
|
||||
* Validates the border property as defined by CSS.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Border extends HTMLPurifier_AttrDef
|
||||
class HTMLPurifier_AttrDef_CSS_Border extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Local copy of properties this property is shorthand for.
|
||||
*/
|
||||
var $info = array();
|
||||
protected $info = array();
|
||||
|
||||
function HTMLPurifier_AttrDef_Border($config) {
|
||||
public function __construct($config) {
|
||||
$def = $config->getCSSDefinition();
|
||||
$this->info['border-width'] = $def->info['border-width'];
|
||||
$this->info['border-style'] = $def->info['border-style'];
|
||||
$this->info['border-top-color'] = $def->info['border-top-color'];
|
||||
}
|
||||
|
||||
function validate($string, $config, &$context) {
|
||||
public function validate($string, $config, $context) {
|
||||
$string = $this->parseCDATA($string);
|
||||
// we specifically will not support rgb() syntax with spaces
|
||||
$string = $this->mungeRgb($string);
|
||||
$bits = explode(' ', $string);
|
||||
$done = array(); // segments we've finished
|
||||
$ret = ''; // return value
|
||||
@@ -42,4 +40,3 @@ class HTMLPurifier_AttrDef_Border extends HTMLPurifier_AttrDef
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -1,54 +1,24 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
/**
|
||||
* Validates Color as defined by CSS.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Color extends HTMLPurifier_AttrDef
|
||||
class HTMLPurifier_AttrDef_CSS_Color extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Color keyword lookup table.
|
||||
* @todo Extend it to include all usually allowed colors.
|
||||
*/
|
||||
var $colors = array(
|
||||
'maroon' => '#800000',
|
||||
'red' => '#F00',
|
||||
'orange' => '#FFA500',
|
||||
'yellow' => '#FF0',
|
||||
'olive' => '#808000',
|
||||
'purple' => '#800080',
|
||||
'fuchsia' => '#F0F',
|
||||
'white' => '#FFF',
|
||||
'lime' => '#0F0',
|
||||
'green' => '#008000',
|
||||
'navy' => '#000080',
|
||||
'blue' => '#00F',
|
||||
'aqua' => '#0FF',
|
||||
'teal' => '#008080',
|
||||
'black' => '#000',
|
||||
'silver' => '#C0C0C0',
|
||||
'gray' => '#808080'
|
||||
);
|
||||
|
||||
function validate($color, $config, &$context) {
|
||||
public function validate($color, $config, $context) {
|
||||
|
||||
static $colors = null;
|
||||
if ($colors === null) $colors = $config->get('Core', 'ColorKeywords');
|
||||
|
||||
$color = trim($color);
|
||||
if (!$color) return false;
|
||||
if ($color === '') return false;
|
||||
|
||||
$lower = strtolower($color);
|
||||
if (isset($this->colors[$lower])) return $this->colors[$lower];
|
||||
if (isset($colors[$lower])) return $colors[$lower];
|
||||
|
||||
if ($color[0] === '#') {
|
||||
// hexadecimal handling
|
||||
$hex = substr($color, 1);
|
||||
$length = strlen($hex);
|
||||
if ($length !== 3 && $length !== 6) return false;
|
||||
if (!ctype_xdigit($hex)) return false;
|
||||
} else {
|
||||
if (strpos($color, 'rgb(') !== false) {
|
||||
// rgb literal handling
|
||||
if (strpos($color, 'rgb(')) return false;
|
||||
$length = strlen($color);
|
||||
if (strpos($color, ')') !== $length - 1) return false;
|
||||
$triad = substr($color, 4, $length - 4 - 1);
|
||||
@@ -86,6 +56,17 @@ class HTMLPurifier_AttrDef_Color extends HTMLPurifier_AttrDef
|
||||
}
|
||||
$new_triad = implode(',', $new_parts);
|
||||
$color = "rgb($new_triad)";
|
||||
} else {
|
||||
// hexadecimal handling
|
||||
if ($color[0] === '#') {
|
||||
$hex = substr($color, 1);
|
||||
} else {
|
||||
$hex = $color;
|
||||
$color = '#' . $color;
|
||||
}
|
||||
$length = strlen($hex);
|
||||
if ($length !== 3 && $length !== 6) return false;
|
||||
if (!ctype_xdigit($hex)) return false;
|
||||
}
|
||||
|
||||
return $color;
|
||||
@@ -94,4 +75,3 @@ class HTMLPurifier_AttrDef_Color extends HTMLPurifier_AttrDef
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -9,23 +9,23 @@
|
||||
* especially useful for CSS values, which often are a choice between
|
||||
* an enumerated set of predefined values or a flexible data type.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Composite extends HTMLPurifier_AttrDef
|
||||
class HTMLPurifier_AttrDef_CSS_Composite extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* List of HTMLPurifier_AttrDef objects that may process strings
|
||||
* @protected
|
||||
* @todo Make protected
|
||||
*/
|
||||
var $defs;
|
||||
public $defs;
|
||||
|
||||
/**
|
||||
* @param $defs List of HTMLPurifier_AttrDef objects
|
||||
*/
|
||||
function HTMLPurifier_AttrDef_Composite($defs) {
|
||||
public function __construct($defs) {
|
||||
$this->defs = $defs;
|
||||
}
|
||||
|
||||
function validate($string, $config, &$context) {
|
||||
public function validate($string, $config, $context) {
|
||||
foreach ($this->defs as $i => $def) {
|
||||
$result = $this->defs[$i]->validate($string, $config, $context);
|
||||
if ($result !== false) return $result;
|
||||
@@ -35,4 +35,3 @@ class HTMLPurifier_AttrDef_Composite extends HTMLPurifier_AttrDef
|
||||
|
||||
}
|
||||
|
||||
?>
|
26
library/HTMLPurifier/AttrDef/CSS/DenyElementDecorator.php
Normal file
26
library/HTMLPurifier/AttrDef/CSS/DenyElementDecorator.php
Normal file
@@ -0,0 +1,26 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Decorator which enables CSS properties to be disabled for specific elements.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_CSS_DenyElementDecorator extends HTMLPurifier_AttrDef
|
||||
{
|
||||
protected $def, $element;
|
||||
|
||||
/**
|
||||
* @param $def Definition to wrap
|
||||
* @param $element Element to deny
|
||||
*/
|
||||
public function __construct($def, $element) {
|
||||
$this->def = $def;
|
||||
$this->element = $element;
|
||||
}
|
||||
/**
|
||||
* Checks if CurrentToken is set and equal to $this->element
|
||||
*/
|
||||
public function validate($string, $config, $context) {
|
||||
$token = $context->get('CurrentToken', true);
|
||||
if ($token && $token->name == $this->element) return false;
|
||||
return $this->def->validate($string, $config, $context);
|
||||
}
|
||||
}
|
52
library/HTMLPurifier/AttrDef/CSS/Filter.php
Normal file
52
library/HTMLPurifier/AttrDef/CSS/Filter.php
Normal file
@@ -0,0 +1,52 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Microsoft's proprietary filter: CSS property
|
||||
* @note Currently supports the alpha filter. In the future, this will
|
||||
* probably need an extensible framework
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_CSS_Filter extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
protected $intValidator;
|
||||
|
||||
public function __construct() {
|
||||
$this->intValidator = new HTMLPurifier_AttrDef_Integer();
|
||||
}
|
||||
|
||||
public function validate($value, $config, $context) {
|
||||
$value = $this->parseCDATA($value);
|
||||
if ($value === 'none') return $value;
|
||||
// if we looped this we could support multiple filters
|
||||
$function_length = strcspn($value, '(');
|
||||
$function = trim(substr($value, 0, $function_length));
|
||||
if ($function !== 'alpha' &&
|
||||
$function !== 'Alpha' &&
|
||||
$function !== 'progid:DXImageTransform.Microsoft.Alpha'
|
||||
) return false;
|
||||
$cursor = $function_length + 1;
|
||||
$parameters_length = strcspn($value, ')', $cursor);
|
||||
$parameters = substr($value, $cursor, $parameters_length);
|
||||
$params = explode(',', $parameters);
|
||||
$ret_params = array();
|
||||
$lookup = array();
|
||||
foreach ($params as $param) {
|
||||
list($key, $value) = explode('=', $param);
|
||||
$key = trim($key);
|
||||
$value = trim($value);
|
||||
if (isset($lookup[$key])) continue;
|
||||
if ($key !== 'opacity') continue;
|
||||
$value = $this->intValidator->validate($value, $config, $context);
|
||||
if ($value === false) continue;
|
||||
$int = (int) $value;
|
||||
if ($int > 100) $value = '100';
|
||||
if ($int < 0) $value = '0';
|
||||
$ret_params[] = "$key=$value";
|
||||
$lookup[$key] = true;
|
||||
}
|
||||
$ret_parameters = implode(',', $ret_params);
|
||||
$ret_function = "$function($ret_parameters)";
|
||||
return $ret_function;
|
||||
}
|
||||
|
||||
}
|
@@ -1,11 +1,9 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
/**
|
||||
* Validates shorthand CSS property font.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Font extends HTMLPurifier_AttrDef
|
||||
class HTMLPurifier_AttrDef_CSS_Font extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
@@ -16,21 +14,9 @@ class HTMLPurifier_AttrDef_Font extends HTMLPurifier_AttrDef
|
||||
* CSSDefinition, this wouldn't be necessary. We'd instantiate
|
||||
* our own copies.
|
||||
*/
|
||||
var $info = array();
|
||||
protected $info = array();
|
||||
|
||||
/**
|
||||
* System font keywords.
|
||||
*/
|
||||
var $system_fonts = array(
|
||||
'caption' => true,
|
||||
'icon' => true,
|
||||
'menu' => true,
|
||||
'message-box' => true,
|
||||
'small-caption' => true,
|
||||
'status-bar' => true
|
||||
);
|
||||
|
||||
function HTMLPurifier_AttrDef_Font($config) {
|
||||
public function __construct($config) {
|
||||
$def = $config->getCSSDefinition();
|
||||
$this->info['font-style'] = $def->info['font-style'];
|
||||
$this->info['font-variant'] = $def->info['font-variant'];
|
||||
@@ -40,7 +26,16 @@ class HTMLPurifier_AttrDef_Font extends HTMLPurifier_AttrDef
|
||||
$this->info['font-family'] = $def->info['font-family'];
|
||||
}
|
||||
|
||||
function validate($string, $config, &$context) {
|
||||
public function validate($string, $config, $context) {
|
||||
|
||||
static $system_fonts = array(
|
||||
'caption' => true,
|
||||
'icon' => true,
|
||||
'menu' => true,
|
||||
'message-box' => true,
|
||||
'small-caption' => true,
|
||||
'status-bar' => true
|
||||
);
|
||||
|
||||
// regular pre-processing
|
||||
$string = $this->parseCDATA($string);
|
||||
@@ -48,7 +43,7 @@ class HTMLPurifier_AttrDef_Font extends HTMLPurifier_AttrDef
|
||||
|
||||
// check if it's one of the keywords
|
||||
$lowercase_string = strtolower($string);
|
||||
if (isset($this->system_fonts[$lowercase_string])) {
|
||||
if (isset($system_fonts[$lowercase_string])) {
|
||||
return $lowercase_string;
|
||||
}
|
||||
|
||||
@@ -151,4 +146,3 @@ class HTMLPurifier_AttrDef_Font extends HTMLPurifier_AttrDef
|
||||
|
||||
}
|
||||
|
||||
?>
|
89
library/HTMLPurifier/AttrDef/CSS/FontFamily.php
Normal file
89
library/HTMLPurifier/AttrDef/CSS/FontFamily.php
Normal file
@@ -0,0 +1,89 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Validates a font family list according to CSS spec
|
||||
* @todo whitelisting allowed fonts would be nice
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
public function validate($string, $config, $context) {
|
||||
static $generic_names = array(
|
||||
'serif' => true,
|
||||
'sans-serif' => true,
|
||||
'monospace' => true,
|
||||
'fantasy' => true,
|
||||
'cursive' => true
|
||||
);
|
||||
|
||||
// assume that no font names contain commas in them
|
||||
$fonts = explode(',', $string);
|
||||
$final = '';
|
||||
foreach($fonts as $font) {
|
||||
$font = trim($font);
|
||||
if ($font === '') continue;
|
||||
// match a generic name
|
||||
if (isset($generic_names[$font])) {
|
||||
$final .= $font . ', ';
|
||||
continue;
|
||||
}
|
||||
// match a quoted name
|
||||
if ($font[0] === '"' || $font[0] === "'") {
|
||||
$length = strlen($font);
|
||||
if ($length <= 2) continue;
|
||||
$quote = $font[0];
|
||||
if ($font[$length - 1] !== $quote) continue;
|
||||
$font = substr($font, 1, $length - 2);
|
||||
|
||||
$new_font = '';
|
||||
for ($i = 0, $c = strlen($font); $i < $c; $i++) {
|
||||
if ($font[$i] === '\\') {
|
||||
$i++;
|
||||
if ($i >= $c) {
|
||||
$new_font .= '\\';
|
||||
break;
|
||||
}
|
||||
if (ctype_xdigit($font[$i])) {
|
||||
$code = $font[$i];
|
||||
for ($a = 1, $i++; $i < $c && $a < 6; $i++, $a++) {
|
||||
if (!ctype_xdigit($font[$i])) break;
|
||||
$code .= $font[$i];
|
||||
}
|
||||
// We have to be extremely careful when adding
|
||||
// new characters, to make sure we're not breaking
|
||||
// the encoding.
|
||||
$char = HTMLPurifier_Encoder::unichr(hexdec($code));
|
||||
if (HTMLPurifier_Encoder::cleanUTF8($char) === '') continue;
|
||||
$new_font .= $char;
|
||||
if ($i < $c && trim($font[$i]) !== '') $i--;
|
||||
continue;
|
||||
}
|
||||
if ($font[$i] === "\n") continue;
|
||||
}
|
||||
$new_font .= $font[$i];
|
||||
}
|
||||
|
||||
$font = $new_font;
|
||||
}
|
||||
// $font is a pure representation of the font name
|
||||
|
||||
if (ctype_alnum($font) && $font !== '') {
|
||||
// very simple font, allow it in unharmed
|
||||
$final .= $font . ', ';
|
||||
continue;
|
||||
}
|
||||
|
||||
// complicated font, requires quoting
|
||||
|
||||
// armor single quotes and new lines
|
||||
$font = str_replace("\\", "\\\\", $font);
|
||||
$font = str_replace("'", "\\'", $font);
|
||||
$final .= "'$font', ";
|
||||
}
|
||||
$final = rtrim($final, ', ');
|
||||
if ($final === '') return false;
|
||||
return $final;
|
||||
}
|
||||
|
||||
}
|
||||
|
38
library/HTMLPurifier/AttrDef/CSS/ImportantDecorator.php
Normal file
38
library/HTMLPurifier/AttrDef/CSS/ImportantDecorator.php
Normal file
@@ -0,0 +1,38 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Decorator which enables !important to be used in CSS values.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_CSS_ImportantDecorator extends HTMLPurifier_AttrDef
|
||||
{
|
||||
protected $def, $allow;
|
||||
|
||||
/**
|
||||
* @param $def Definition to wrap
|
||||
* @param $allow Whether or not to allow !important
|
||||
*/
|
||||
public function __construct($def, $allow = false) {
|
||||
$this->def = $def;
|
||||
$this->allow = $allow;
|
||||
}
|
||||
/**
|
||||
* Intercepts and removes !important if necessary
|
||||
*/
|
||||
public function validate($string, $config, $context) {
|
||||
// test for ! and important tokens
|
||||
$string = trim($string);
|
||||
$is_important = false;
|
||||
// :TODO: optimization: test directly for !important and ! important
|
||||
if (strlen($string) >= 9 && substr($string, -9) === 'important') {
|
||||
$temp = rtrim(substr($string, 0, -9));
|
||||
// use a temp, because we might want to restore important
|
||||
if (strlen($temp) >= 1 && substr($temp, -1) === '!') {
|
||||
$string = rtrim(substr($temp, 0, -1));
|
||||
$is_important = true;
|
||||
}
|
||||
}
|
||||
$string = $this->def->validate($string, $config, $context);
|
||||
if ($this->allow && $is_important) $string .= ' !important';
|
||||
return $string;
|
||||
}
|
||||
}
|
46
library/HTMLPurifier/AttrDef/CSS/Length.php
Normal file
46
library/HTMLPurifier/AttrDef/CSS/Length.php
Normal file
@@ -0,0 +1,46 @@
|
||||
<?php
|
||||
|
||||
/**
|
||||
* Represents a Length as defined by CSS.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_CSS_Length extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
protected $min, $max;
|
||||
|
||||
/**
|
||||
* @param HTMLPurifier_Length $max Minimum length, or null for no bound. String is also acceptable.
|
||||
* @param HTMLPurifier_Length $max Maximum length, or null for no bound. String is also acceptable.
|
||||
*/
|
||||
public function __construct($min = null, $max = null) {
|
||||
$this->min = $min !== null ? HTMLPurifier_Length::make($min) : null;
|
||||
$this->max = $max !== null ? HTMLPurifier_Length::make($max) : null;
|
||||
}
|
||||
|
||||
public function validate($string, $config, $context) {
|
||||
$string = $this->parseCDATA($string);
|
||||
|
||||
// Optimizations
|
||||
if ($string === '') return false;
|
||||
if ($string === '0') return '0';
|
||||
if (strlen($string) === 1) return false;
|
||||
|
||||
$length = HTMLPurifier_Length::make($string);
|
||||
if (!$length->isValid()) return false;
|
||||
|
||||
if ($this->min) {
|
||||
$c = $length->compareTo($this->min);
|
||||
if ($c === false) return false;
|
||||
if ($c < 0) return false;
|
||||
}
|
||||
if ($this->max) {
|
||||
$c = $length->compareTo($this->max);
|
||||
if ($c === false) return false;
|
||||
if ($c > 0) return false;
|
||||
}
|
||||
|
||||
return $length->toString();
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -1,28 +1,26 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
/**
|
||||
* Validates shorthand CSS property list-style.
|
||||
* @warning Does not support url tokens that have internal spaces.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_ListStyle extends HTMLPurifier_AttrDef
|
||||
class HTMLPurifier_AttrDef_CSS_ListStyle extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Local copy of component validators.
|
||||
* @note See HTMLPurifier_AttrDef_Font::$info for a similar impl.
|
||||
* @note See HTMLPurifier_AttrDef_CSS_Font::$info for a similar impl.
|
||||
*/
|
||||
var $info;
|
||||
protected $info;
|
||||
|
||||
function HTMLPurifier_AttrDef_ListStyle($config) {
|
||||
public function __construct($config) {
|
||||
$def = $config->getCSSDefinition();
|
||||
$this->info['list-style-type'] = $def->info['list-style-type'];
|
||||
$this->info['list-style-position'] = $def->info['list-style-position'];
|
||||
$this->info['list-style-image'] = $def->info['list-style-image'];
|
||||
}
|
||||
|
||||
function validate($string, $config, &$context) {
|
||||
public function validate($string, $config, $context) {
|
||||
|
||||
// regular pre-processing
|
||||
$string = $this->parseCDATA($string);
|
||||
@@ -77,4 +75,3 @@ class HTMLPurifier_AttrDef_ListStyle extends HTMLPurifier_AttrDef
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -1,7 +1,5 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
/**
|
||||
* Framework class for strings that involve multiple values.
|
||||
*
|
||||
@@ -13,29 +11,31 @@ require_once 'HTMLPurifier/AttrDef.php';
|
||||
* can only be used alone: it will never manifest as part of a multi
|
||||
* shorthand declaration. Thus, this class does not allow inherit.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Multiple extends HTMLPurifier_AttrDef
|
||||
class HTMLPurifier_AttrDef_CSS_Multiple extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Instance of component definition to defer validation to.
|
||||
* @todo Make protected
|
||||
*/
|
||||
var $single;
|
||||
public $single;
|
||||
|
||||
/**
|
||||
* Max number of values allowed.
|
||||
* @todo Make protected
|
||||
*/
|
||||
var $max;
|
||||
public $max;
|
||||
|
||||
/**
|
||||
* @param $single HTMLPurifier_AttrDef to multiply
|
||||
* @param $max Max number of values allowed (usually four)
|
||||
*/
|
||||
function HTMLPurifier_AttrDef_Multiple($single, $max = 4) {
|
||||
public function __construct($single, $max = 4) {
|
||||
$this->single = $single;
|
||||
$this->max = $max;
|
||||
}
|
||||
|
||||
function validate($string, $config, &$context) {
|
||||
public function validate($string, $config, $context) {
|
||||
$string = $this->parseCDATA($string);
|
||||
if ($string === '') return false;
|
||||
$parts = explode(' ', $string); // parseCDATA replaced \r, \t and \n
|
||||
@@ -55,4 +55,3 @@ class HTMLPurifier_AttrDef_Multiple extends HTMLPurifier_AttrDef
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -3,26 +3,31 @@
|
||||
/**
|
||||
* Validates a number as defined by the CSS spec.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Number extends HTMLPurifier_AttrDef
|
||||
class HTMLPurifier_AttrDef_CSS_Number extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Bool indicating whether or not only positive values allowed.
|
||||
*/
|
||||
var $non_negative = false;
|
||||
protected $non_negative = false;
|
||||
|
||||
/**
|
||||
* @param $non_negative Bool indicating whether negatives are forbidden
|
||||
*/
|
||||
function HTMLPurifier_AttrDef_Number($non_negative = false) {
|
||||
public function __construct($non_negative = false) {
|
||||
$this->non_negative = $non_negative;
|
||||
}
|
||||
|
||||
function validate($number, $config, &$context) {
|
||||
/**
|
||||
* @warning Some contexts do not pass $config, $context. These
|
||||
* variables should not be used without checking HTMLPurifier_Length
|
||||
*/
|
||||
public function validate($number, $config, $context) {
|
||||
|
||||
$number = $this->parseCDATA($number);
|
||||
|
||||
if ($number === '') return false;
|
||||
if ($number === '0') return '0';
|
||||
|
||||
$sign = '';
|
||||
switch ($number[0]) {
|
||||
@@ -37,13 +42,16 @@ class HTMLPurifier_AttrDef_Number extends HTMLPurifier_AttrDef
|
||||
$number = ltrim($number, '0');
|
||||
return $number ? $sign . $number : '0';
|
||||
}
|
||||
if (!strpos($number, '.')) return false;
|
||||
|
||||
// Period is the only non-numeric character allowed
|
||||
if (strpos($number, '.') === false) return false;
|
||||
|
||||
list($left, $right) = explode('.', $number, 2);
|
||||
|
||||
if (!ctype_digit($left)) return false;
|
||||
$left = ltrim($left, '0');
|
||||
if ($left === '' && $right === '') return false;
|
||||
if ($left !== '' && !ctype_digit($left)) return false;
|
||||
|
||||
$left = ltrim($left, '0');
|
||||
$right = rtrim($right, '0');
|
||||
|
||||
if ($right === '') {
|
||||
@@ -58,4 +66,3 @@ class HTMLPurifier_AttrDef_Number extends HTMLPurifier_AttrDef
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -1,27 +1,24 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
require_once 'HTMLPurifier/AttrDef/Number.php';
|
||||
|
||||
/**
|
||||
* Validates a Percentage as defined by the CSS spec.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_Percentage extends HTMLPurifier_AttrDef
|
||||
class HTMLPurifier_AttrDef_CSS_Percentage extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Instance of HTMLPurifier_AttrDef_Number to defer number validation
|
||||
* Instance of HTMLPurifier_AttrDef_CSS_Number to defer number validation
|
||||
*/
|
||||
var $number_def;
|
||||
protected $number_def;
|
||||
|
||||
/**
|
||||
* @param Bool indicating whether to forbid negative values
|
||||
*/
|
||||
function HTMLPurifier_AttrDef_Percentage($non_negative = false) {
|
||||
$this->number_def = new HTMLPurifier_AttrDef_Number($non_negative);
|
||||
public function __construct($non_negative = false) {
|
||||
$this->number_def = new HTMLPurifier_AttrDef_CSS_Number($non_negative);
|
||||
}
|
||||
|
||||
function validate($string, $config, &$context) {
|
||||
public function validate($string, $config, $context) {
|
||||
|
||||
$string = $this->parseCDATA($string);
|
||||
|
||||
@@ -40,4 +37,3 @@ class HTMLPurifier_AttrDef_Percentage extends HTMLPurifier_AttrDef
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -1,32 +1,29 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
|
||||
/**
|
||||
* Validates the value for the CSS property text-decoration
|
||||
* @note This class could be generalized into a version that acts sort of
|
||||
* like Enum except you can compound the allowed values.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_TextDecoration extends HTMLPurifier_AttrDef
|
||||
class HTMLPurifier_AttrDef_CSS_TextDecoration extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Lookup table of allowed values.
|
||||
* @protected
|
||||
*/
|
||||
var $allowed_values = array(
|
||||
'line-through' => true,
|
||||
'overline' => true,
|
||||
'underline' => true
|
||||
);
|
||||
|
||||
function validate($string, $config, &$context) {
|
||||
public function validate($string, $config, $context) {
|
||||
|
||||
static $allowed_values = array(
|
||||
'line-through' => true,
|
||||
'overline' => true,
|
||||
'underline' => true,
|
||||
);
|
||||
|
||||
$string = strtolower($this->parseCDATA($string));
|
||||
|
||||
if ($string === 'none') return $string;
|
||||
|
||||
$parts = explode(' ', $string);
|
||||
$final = '';
|
||||
foreach ($parts as $part) {
|
||||
if (isset($this->allowed_values[$part])) {
|
||||
if (isset($allowed_values[$part])) {
|
||||
$final .= $part . ' ';
|
||||
}
|
||||
}
|
||||
@@ -38,4 +35,3 @@ class HTMLPurifier_AttrDef_TextDecoration extends HTMLPurifier_AttrDef
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -1,24 +1,22 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef/URI.php';
|
||||
|
||||
/**
|
||||
* Validates a URI in CSS syntax, which uses url('http://example.com')
|
||||
* @note While theoretically speaking we a URI in a CSS document could
|
||||
* @note While theoretically speaking a URI in a CSS document could
|
||||
* be non-embedded, as of CSS2 there is no such usage so we're
|
||||
* generalizing it. This may need to be changed in the future.
|
||||
* @warning Since HTMLPurifier_AttrDef_CSS blindly uses semicolons as
|
||||
* the separator, you cannot put a literal semicolon in
|
||||
* in the URI. Try percent encoding it, in that case.
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_CSSURI extends HTMLPurifier_AttrDef_URI
|
||||
class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI
|
||||
{
|
||||
|
||||
function HTMLPurifier_AttrDef_CSSURI() {
|
||||
$this->HTMLPurifier_AttrDef_URI(true); // always embedded
|
||||
public function __construct() {
|
||||
parent::__construct(true); // always embedded
|
||||
}
|
||||
|
||||
function validate($uri_string, $config, &$context) {
|
||||
public function validate($uri_string, $config, $context) {
|
||||
// parse the URI out of the string and then pass it onto
|
||||
// the parent object
|
||||
|
||||
@@ -29,7 +27,7 @@ class HTMLPurifier_AttrDef_CSSURI extends HTMLPurifier_AttrDef_URI
|
||||
if ($uri_string[$new_length] != ')') return false;
|
||||
$uri = trim(substr($uri_string, 0, $new_length));
|
||||
|
||||
if (isset($uri[0]) && ($uri[0] == "'" || $uri[0] == '"')) {
|
||||
if (!empty($uri) && ($uri[0] == "'" || $uri[0] == '"')) {
|
||||
$quote = $uri[0];
|
||||
$new_length = strlen($uri) - 1;
|
||||
if ($uri[$new_length] !== $quote) return false;
|
||||
@@ -55,4 +53,3 @@ class HTMLPurifier_AttrDef_CSSURI extends HTMLPurifier_AttrDef_URI
|
||||
|
||||
}
|
||||
|
||||
?>
|
@@ -1,57 +0,0 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/AttrDef.php';
|
||||
require_once 'HTMLPurifier/AttrDef/Number.php';
|
||||
|
||||
/**
|
||||
* Represents a Length as defined by CSS.
|
||||
* @warning Be sure not to confuse this with HTMLPurifier_AttrDef_Length!
|
||||
*/
|
||||
class HTMLPurifier_AttrDef_CSSLength extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
/**
|
||||
* Valid unit lookup table.
|
||||
* @warning The code assumes all units are two characters long. Be careful
|
||||
* if we have to change this behavior!
|
||||
*/
|
||||
var $units = array('em' => true, 'ex' => true, 'px' => true, 'in' => true,
|
||||
'cm' => true, 'mm' => true, 'pt' => true, 'pc' => true);
|
||||
/**
|
||||
* Instance of HTMLPurifier_AttrDef_Number to defer number validation to
|
||||
*/
|
||||
var $number_def;
|
||||
|
||||
/**
|
||||
* @param $non_negative Bool indication whether or not negative values are
|
||||
* allowed.
|
||||
*/
|
||||
function HTMLPurifier_AttrDef_CSSLength($non_negative = false) {
|
||||
$this->number_def = new HTMLPurifier_AttrDef_Number($non_negative);
|
||||
}
|
||||
|
||||
function validate($length, $config, &$context) {
|
||||
|
||||
$length = $this->parseCDATA($length);
|
||||
if ($length === '') return false;
|
||||
if ($length === '0') return '0';
|
||||
$strlen = strlen($length);
|
||||
if ($strlen === 1) return false; // impossible!
|
||||
|
||||
// we assume all units are two characters
|
||||
$unit = substr($length, $strlen - 2);
|
||||
if (!ctype_lower($unit)) $unit = strtolower($unit);
|
||||
$number = substr($length, 0, $strlen - 2);
|
||||
|
||||
if (!isset($this->units[$unit])) return false;
|
||||
|
||||
$number = $this->number_def->validate($number, $config, $context);
|
||||
if ($number === false) return false;
|
||||
|
||||
return $number . $unit;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
?>
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user