diff --git a/Doxyfile b/Doxyfile index 9c31af60..3fed194f 100644 --- a/Doxyfile +++ b/Doxyfile @@ -4,7 +4,7 @@ # Project related configuration options #--------------------------------------------------------------------------- PROJECT_NAME = HTML Purifier -PROJECT_NUMBER = 1.3.0 +PROJECT_NUMBER = 1.3.2 OUTPUT_DIRECTORY = "C:/Documents and Settings/Edward/My Documents/My Webs/htmlpurifier/docs/doxygen" CREATE_SUBDIRS = NO OUTPUT_LANGUAGE = English diff --git a/NEWS b/NEWS index a4e27fb0..e7ecfe4d 100644 --- a/NEWS +++ b/NEWS @@ -9,6 +9,20 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier . Internal change ========================== +1.3.2, released 2006-12-25 +! HTMLPurifier object now accepts configuration arrays, no need to manually + instantiate a configuration object +! Context object now accessible to outside +! Added enduser-youtube.html, explains how to embed YouTube videos. See + also corresponding smoketest preserveYouTube.php. +! Added purifyArray(), which takes a list of HTML and purifies it all +! Added static member variable $version to HTML Purifier with PHP-compatible + version number string. +- Fixed fatal error thrown by upper-cased language attributes +- printDefinition.php: added labels, added better clarification +. HTMLPurifier_Config::create() added, takes mixed variable and converts into + a HTMLPurifier_Config object. + 1.3.1, released 2006-12-06 ! Added HTMLPurifier.func.php stub for a convenient function to call the library - Fixed bug in RemoveInvalidImg code that caused all images to be dropped diff --git a/TODO b/TODO index b29e320f..7bfd2389 100644 --- a/TODO +++ b/TODO @@ -10,6 +10,7 @@ TODO List 1.4 release # More extensive URI filtering schemes (see docs/proposal-new-directives.txt) # Allow for background-image and list-style-image (intrinsically tied to above) + # Add hooks for custom behavior (for instance, YouTube preservation) - Aggressive caching ? Rich set* methods and config file loaders for HTMLPurifier_Config ? 
Configuration profiles: sets of directives that get set with one func call @@ -66,7 +67,6 @@ Unknown release (on a scratch-an-itch basis) - Append something to duplicate IDs so they're still usable (impl. note: the dupe detector would also need to detect the suffix as well) - Have 'lang' attribute be checked against official lists - - Docs on how to embed YouTube videos (and friends) without patches Encoding workarounds - Non-lossy dumb alternate character encoding transformations, achieved by @@ -84,7 +84,18 @@ Requested 3. Extend the tag exclusion system to specify whether or not the contents should be dropped or not (currently, there's code that could do something like this if it didn't drop the inner text too.) - - Accept array input, by iterating and purifying all of the items + - More user-friendly warnings when %HTML.Allow* attempts to specify a + tag or attribute that is not supported + - Allow specifying global attributes on a tag-by-tag basis in + %HTML.AllowAttributes + - Parse TinyMCE whitelist into our %HTML.Allow* whitelists + - XSS-attempt detection + - More user-friendly warnings when %HTML.Allow* attempts to specify a + tag or attribute that is not supported + - Allow specifying global attributes on a tag-by-tag basis in + %HTML.AllowAttributes + - Parse TinyMCE whitelist into our %HTML.Allow whitelists + - XSS-attempt detection Wontfix - Non-lossy smart alternate character encoding transformations (unless diff --git a/docs/enduser-youtube.html b/docs/enduser-youtube.html new file mode 100644 index 00000000..3e363267 --- /dev/null +++ b/docs/enduser-youtube.html @@ -0,0 +1,179 @@ + + + + + + + +Embedding YouTube Videos - HTML Purifier + + + +

Embedding YouTube Videos

+
...as well as other dangerous active content
+ +
Filed under End-User
+
Return to the index.
+ +

Clients like their YouTube videos. It gives them a warm fuzzy feeling when +they see a neat little embedded video player on their websites that can play +the latest clips from their documentary "Fido and the Bones of Spring". +All joking aside, the ability to embed YouTube videos or other active +content in their pages is something that a lot of people like.

+ +

This is a bad idea. The moment you embed anything untrusted, +you will definitely be slammed by all manner of nasties that can be +embedded in things from your run-of-the-mill Flash movie to +QuickTime movies. +Even img tags, which HTML Purifier allows by default, can be +dangerous. Be distrustful of anything that tells a browser to load content +from another website automatically.

+ +

Luckily for us, however, whitelisting saves the day. Sure, letting users +include any old random Flash file could be dangerous, but if it's +from a specific website, it probably is okay. If no amount of pleading will +convince the people upstairs that they should settle for just linking +to their movies, you may find this technique very useful.

+ +

Sample

+ +

Below is custom code that allows users to embed +YouTube videos. This is not favoritism: this trick can easily be adapted for +other forms of embeddable content.

+ +

Usually, websites like YouTube give us boilerplate code that you can insert +into your documents. YouTube's code goes like this:

+ +
+<object width="425" height="350">
+  <param name="movie" value="http://www.youtube.com/v/AyPzM5WK8ys" />
+  <param name="wmode" value="transparent" />
+  <embed src="http://www.youtube.com/v/AyPzM5WK8ys"
+         type="application/x-shockwave-flash"
+         wmode="transparent" width="425" height="350" />
+</object>
+
+ +

There are two things to note about this code:

+ +
    +
  1. <embed> is not recognized by W3C, so if you want + standards-compliant code, you'll have to get rid of it.
  2. +
  3. The code is exactly the same for all instances, except for the + identifier AyPzM5WK8ys which tells us which movie file + to retrieve.
  4. +
+ +

What point 2 means is that if we have code like <span +class="youtube-embed">AyPzM5WK8ys</span> your +application can reconstruct the full object from this small snippet that +passes through HTML Purifier unharmed.

+ +
+<?php
+
+/**
+ * HTMLPurifier subclass that lets YouTube embeds survive purification.
+ *
+ * Works in three steps: YouTube <object> markup is collapsed into a harmless
+ * placeholder span holding only the video ID, the document is purified as
+ * usual, and the placeholder is then expanded back into a fixed, trusted
+ * <object> template. Only the video ID ever comes from user input.
+ */
+class HTMLPurifierX_PreserveYouTube extends HTMLPurifier
+{
+    /**
+     * Purifies HTML while preserving embedded YouTube videos.
+     * @param $html String of HTML to purify
+     * @param $config Optional configuration for this operation, passed
+     *                through unchanged to HTMLPurifier::purify()
+     * @return Purified HTML with YouTube placeholders expanded to objects
+     */
+    function purify($html, $config = null) {
+        // The "s" modifier lets "." match newlines: YouTube's boilerplate
+        // spans several lines and would never match without it. The dots
+        // in the host name are escaped so they only match a literal dot.
+        // Video IDs may contain "-" and "_" besides letters and digits;
+        // nothing else is allowed through, so the ID cannot carry markup.
+        $pre_regex = '#<object[^>]+>.+?'.
+            'http://www\.youtube\.com/v/([A-Za-z0-9\-_]+).+?</object>#s';
+        $pre_replace = '<span class="youtube-embed">\1</span>';
+        $html = preg_replace($pre_regex, $pre_replace, $html);
+        $html = parent::purify($html, $config);
+        // Must accept exactly the same ID characters as $pre_regex, or
+        // placeholders created above would be left unexpanded.
+        $post_regex = '#<span class="youtube-embed">([A-Za-z0-9\-_]+)</span>#';
+        // Each <embed> fragment ends with a space so the attributes do not
+        // run together once the pieces are concatenated.
+        $post_replace = '<object width="425" height="350" '.
+            'data="http://www.youtube.com/v/\1">'.
+            '<param name="movie" value="http://www.youtube.com/v/\1"></param>'.
+            '<param name="wmode" value="transparent"></param>'.
+            '<!--[if IE]>'.
+            '<embed src="http://www.youtube.com/v/\1" '.
+            'type="application/x-shockwave-flash" '.
+            'wmode="transparent" width="425" height="350" />'.
+            '<![endif]-->'.
+            '</object>';
+        $html = preg_replace($post_regex, $post_replace, $html);
+        return $html;
+    }
+}
+
+// Usage: instantiate the subclass in place of HTMLPurifier and call purify().
+// $html_with_youtube is not defined in this sample; presumably it is the
+// untrusted HTML supplied by the caller.
+$purifier = new HTMLPurifierX_PreserveYouTube();
+$html_still_with_youtube = $purifier->purify($html_with_youtube);
+
+?>
+
+ +

There is a bit going on here, so let's explain.

+ +
    +
  1. The class uses the prefix HTMLPurifierX because it's + userspace code. Don't use HTMLPurifier in front of your + class, since it might clobber another class in the library.
  2. +
  3. In order to keep the interface compatible, we've extended HTMLPurifier + into a new class that preserves the YouTube videos. This means that + all you have to do is replace all instances of + new HTMLPurifier with new + HTMLPurifierX_PreserveYouTube. There are other ways to go about + doing this: if you were calling a function that wrapped HTML Purifier, + you could paste the PHP right there. If you wanted to be really + fancy, you could make a decorator for HTMLPurifier.
  4. +
  5. The first preg_replace call replaces any YouTube code users may have + embedded with the benign span tag. Span is used because it is inline, + and objects are inline too. We are very careful to be extremely + restrictive on what goes inside the span tag, because if errant code + gets in there, it could get messy.
  6. +
  7. The HTML is then purified as usual.
  8. +
  9. Then, another preg_replace replaces the span tag with a fully fledged + object. Note that the embed is removed, and, in its place, a data + attribute was added to the object. This makes the tag standards + compliant! It also breaks Internet Explorer, so we add in a bit of + conditional comments with the old embed code to make it work again. + It's all quite convoluted but works.
  10. +
+ +

Warning

+ +

There are a number of possible problems with the code above, depending +on how you look at it.

+ +

Cannot change width and height

+ +

The width and height of the final YouTube movie cannot be adjusted. This +is because I am lazy. If you really insist on letting users change the size +of the movie, what you need to do is package up the attributes inside the +span tag (along with the movie ID). It gets complicated though: a malicious +user can specify an outrageously large height and width and attempt to crash +the user's operating system/browser. You need to either cap it by limiting +the number of digits allowed in the regex or use a callback to check the +number.

+ +

Trusts media's host's security

+ +

By allowing this code onto our website, we are trusting that YouTube has +tech-savvy enough people not to allow their users to inject malicious +code into the Flash files. An exploit on YouTube means an exploit on your +site. Even though YouTube is run by the reputable Google, it +doesn't +mean they are +invulnerable. +You're putting a certain measure of the job on an external provider (just as +you have by entrusting your user input to HTML Purifier), and +it is important that you are cognizant of the risk.

+ +

Poorly written adaptations compromise security

+ +

This should go without saying, but if you're going to adapt this code +for Google Video or the like, make sure you do it right. It's +extremely easy to allow a character too many in the final section and +suddenly you're introducing XSS into HTML Purifier's XSS-free output. HTML +Purifier may be well written, but it cannot guard against vulnerabilities +introduced after it has finished.

+ +

Future plans

+ +

It would probably be a good idea if this code were added to the core +library. Look out for the inclusion of this in the core as a decorator +or the like.

+ + + \ No newline at end of file diff --git a/docs/index.html b/docs/index.html index 26d37e2a..12d839db 100644 --- a/docs/index.html +++ b/docs/index.html @@ -23,7 +23,10 @@ information for casual developers using HTML Purifier.

IDs
-
Explains various methods for allowing IDs in documents safely in HTML Purifier.
+
Explains various methods for allowing IDs in documents safely.
+ +
Embedding YouTube videos
+
Explains how to safely allow the embedding of flash from trusted sites.
diff --git a/docs/ref-xhtml-1.1.txt b/docs/ref-xhtml-1.1.txt index db8127e5..affe4f2a 100644 --- a/docs/ref-xhtml-1.1.txt +++ b/docs/ref-xhtml-1.1.txt @@ -10,7 +10,8 @@ It's quite simple, according to ...but that's only an informative section. More things to do: 1. Scratch style attribute (it's deprecated) -2. Be module-aware +2. Be module-aware (this might entail intelligent grouping in the definition + and allowing users to specifically remove certain modules (see 5)) 3. Cross-reference minimal content models with existing DTDs and determine changes (todo) 4. Watch out for the Legacy Module diff --git a/library/HTMLPurifier.php b/library/HTMLPurifier.php index 8577e560..88ced00f 100644 --- a/library/HTMLPurifier.php +++ b/library/HTMLPurifier.php @@ -22,7 +22,7 @@ */ /* - HTML Purifier 1.3.1 - Standards Compliant HTML Filtering + HTML Purifier 1.3.2 - Standards Compliant HTML Filtering Copyright (C) 2006 Edward Z. Yang This library is free software; you can redistribute it and/or @@ -64,19 +64,29 @@ require_once 'HTMLPurifier/Encoder.php'; class HTMLPurifier { + var $version = '1.3.2'; + var $config; var $lexer, $strategy, $generator; + /** + * Final HTMLPurifier_Context of last run purification. Might be an array. + * @public + */ + var $context; + /** * Initializes the purifier. * @param $config Optional HTMLPurifier_Config object for all instances of * the purifier, if omitted, a default configuration is * supplied (which can be overridden on a per-use basis). + * The parameter can also be any type that + * HTMLPurifier_Config::create() supports. */ function HTMLPurifier($config = null) { - $this->config = $config ? 
$config : HTMLPurifier_Config::createDefault(); + $this->config = HTMLPurifier_Config::create($config); $this->lexer = HTMLPurifier_Lexer::create(); $this->strategy = new HTMLPurifier_Strategy_Core(); @@ -91,25 +101,54 @@ class HTMLPurifier * @param $html String of HTML to purify * @param $config HTMLPurifier_Config object for this operation, if omitted, * defaults to the config object specified during this - * object's construction. + * object's construction. The parameter can also be any type + * that HTMLPurifier_Config::create() supports. * @return Purified HTML */ function purify($html, $config = null) { - $config = $config ? $config : $this->config; + + $config = $config ? HTMLPurifier_Config::create($config) : $this->config; + $context =& new HTMLPurifier_Context(); $html = $this->encoder->convertToUTF8($html, $config, $context); + + // purified HTML $html = $this->generator->generateFromTokens( + // list of tokens $this->strategy->execute( - $this->lexer->tokenizeHTML($html, $config, $context), + // list of un-purified tokens + $this->lexer->tokenizeHTML( + // un-purified HTML + $html, $config, $context + ), $config, $context ), $config, $context ); + $html = $this->encoder->convertFromUTF8($html, $config, $context); + $this->context =& $context; return $html; } + /** + * Filters an array of HTML snippets + * @param $config Optional HTMLPurifier_Config object for this operation. + * See HTMLPurifier::purify() for more details. 
+ * @return Array of purified HTML + */ + function purifyArray($array_of_html, $config = null) { + $context_array = array(); + foreach ($array_of_html as $key => $html) { + $array_of_html[$key] = $this->purify($html, $config); + $context_array[$key] = $this->context; + } + $this->context = $context_array; + return $array_of_html; + } + + } ?> \ No newline at end of file diff --git a/library/HTMLPurifier/AttrDef/Lang.php b/library/HTMLPurifier/AttrDef/Lang.php index 58809c2b..67183747 100644 --- a/library/HTMLPurifier/AttrDef/Lang.php +++ b/library/HTMLPurifier/AttrDef/Lang.php @@ -49,7 +49,7 @@ class HTMLPurifier_AttrDef_Lang extends HTMLPurifier_AttrDef if ($length == 0 || $length == 1 || $length > 8 || !ctype_alnum($subtags[1])) { return $new_string; } - if (!ctype_lower($subtags[1])) $subtags[1] = strotolower($subtags[1]); + if (!ctype_lower($subtags[1])) $subtags[1] = strtolower($subtags[1]); $new_string .= '-' . $subtags[1]; if ($num_subtags == 2) return $new_string; @@ -61,7 +61,7 @@ class HTMLPurifier_AttrDef_Lang extends HTMLPurifier_AttrDef return $new_string; } if (!ctype_lower($subtags[$i])) { - $subtags[$i] = strotolower($subtags[$i]); + $subtags[$i] = strtolower($subtags[$i]); } $new_string .= '-' . $subtags[$i]; } diff --git a/library/HTMLPurifier/Config.php b/library/HTMLPurifier/Config.php index c6a5eba1..39f62855 100644 --- a/library/HTMLPurifier/Config.php +++ b/library/HTMLPurifier/Config.php @@ -44,6 +44,20 @@ class HTMLPurifier_Config $this->def = $definition; // keep a copy around for checking } + /** + * Convenience constructor that creates a config object based on a mixed var + * @param mixed $config Variable that defines the state of the config + * object. Can be: a HTMLPurifier_Config() object or + * an array of directives based on loadArray(). 
+ * @return Configured HTMLPurifier_Config object + */ + function create($config) { + if (is_a($config, 'HTMLPurifier_Config')) return $config; + $ret = HTMLPurifier_Config::createDefault(); + if (is_array($config)) $ret->loadArray($config); + return $ret; + } + /** * Convenience constructor that creates a default configuration object. * @return Default HTMLPurifier_Config object. diff --git a/smoketests/preserveYouTube.php b/smoketests/preserveYouTube.php new file mode 100644 index 00000000..ef347b47 --- /dev/null +++ b/smoketests/preserveYouTube.php @@ -0,0 +1,65 @@ +'; +?> + + + HTML Purifier Preserve YouTube Smoketest + + + +

HTML Purifier Preserve YouTube Smoketest

+]+>.+?'. + 'http://www.youtube.com/v/([A-Za-z0-9]+).+?#'; + $pre_replace = '\1'; + $html = preg_replace($pre_regex, $pre_replace, $html); + $html = parent::purify($html, $config); + $post_regex = '#([A-Za-z0-9]+)#'; + $post_replace = ''. + ''. + ''. + ''. + ''; + $html = preg_replace($post_regex, $post_replace, $html); + return $html; + } +} + +$string = ''; + +$regular_purifier = new HTMLPurifier(); +$youtube_purifier = new HTMLPurifierX_PreserveYouTube(); + +?> +

Unpurified

+

Click here to see the unpurified version (breaks validation).

+
+ +

Without YouTube exception

+
purify($string); +?>
+ +

With YouTube exception

+
purify($string); +?>
+ + + \ No newline at end of file diff --git a/smoketests/printDefinition.php b/smoketests/printDefinition.php index a937ec71..91c15d4c 100644 --- a/smoketests/printDefinition.php +++ b/smoketests/printDefinition.php @@ -36,6 +36,7 @@ echo '';