MDL-25826 import HTMLPurifier 4.3.0

This commit is contained in:
Petr Skoda
2011-04-09 10:05:27 +02:00
parent 6b14adf210
commit 88efb586b4
42 changed files with 852 additions and 211 deletions

View File

@@ -2,11 +2,43 @@
/**
* Validates a font family list according to CSS spec
* @todo whitelisting allowed fonts would be nice
*/
class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
{
protected $mask = null;
public function __construct() {
$this->mask = '- ';
for ($c = 'a'; $c <= 'z'; $c++) $this->mask .= $c;
for ($c = 'A'; $c <= 'Z'; $c++) $this->mask .= $c;
for ($c = '0'; $c <= '9'; $c++) $this->mask .= $c; // cast-y, but should be fine
// special bytes used by UTF-8
for ($i = 0x80; $i <= 0xFF; $i++) {
// We don't bother excluding invalid bytes in this range,
// because the our restriction of well-formed UTF-8 will
// prevent these from ever occurring.
$this->mask .= chr($i);
}
/*
PHP's internal strcspn implementation is
O(length of string * length of mask), making it inefficient
for large masks. However, it's still faster than
preg_match 8)
for (p = s1;;) {
spanp = s2;
do {
if (*spanp == c || p == s1_end) {
return p - s1;
}
} while (spanp++ < (s2_end - 1));
c = *++p;
}
*/
// possible optimization: invert the mask.
}
public function validate($string, $config, $context) {
static $generic_names = array(
'serif' => true,
@@ -15,6 +47,7 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
'fantasy' => true,
'cursive' => true
);
$allowed_fonts = $config->get('CSS.AllowedFonts');
// assume that no font names contain commas in them
$fonts = explode(',', $string);
@@ -24,7 +57,9 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
if ($font === '') continue;
// match a generic name
if (isset($generic_names[$font])) {
$final .= $font . ', ';
if ($allowed_fonts === null || isset($allowed_fonts[$font])) {
$final .= $font . ', ';
}
continue;
}
// match a quoted name
@@ -40,6 +75,10 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
// $font is a pure representation of the font name
if ($allowed_fonts !== null && !isset($allowed_fonts[$font])) {
continue;
}
if (ctype_alnum($font) && $font !== '') {
// very simple font, allow it in unharmed
$final .= $font . ', ';
@@ -50,17 +89,103 @@ class HTMLPurifier_AttrDef_CSS_FontFamily extends HTMLPurifier_AttrDef
// shouldn't show up regardless
$font = str_replace(array("\n", "\t", "\r", "\x0C"), ' ', $font);
// These ugly transforms don't pose a security
// risk (as \\ and \" might). We could try to be clever and
// use single-quote wrapping when there is a double quote
// present, but I have choosen not to implement that.
// (warning: this code relies on the selection of quotation
// mark below)
$font = str_replace('\\', '\\5C ', $font);
$font = str_replace('"', '\\22 ', $font);
// Here, there are various classes of characters which need
// to be treated differently:
// - Alphanumeric characters are essentially safe. We
// handled these above.
// - Spaces require quoting, though most parsers will do
// the right thing if there aren't any characters that
// can be misinterpreted
// - Dashes rarely occur, but they fairly unproblematic
// for parsing/rendering purposes.
// The above characters cover the majority of Western font
// names.
// - Arbitrary Unicode characters not in ASCII. Because
// most parsers give little thought to Unicode, treatment
// of these codepoints is basically uniform, even for
// punctuation-like codepoints. These characters can
// show up in non-Western pages and are supported by most
// major browsers, for example: " 明朝" is a
// legitimate font-name
// <http://ja.wikipedia.org/wiki/MS_明朝>. See
// the CSS3 spec for more examples:
// <http://www.w3.org/TR/2011/WD-css3-fonts-20110324/localizedfamilynames.png>
// You can see live samples of these on the Internet:
// <http://www.google.co.jp/search?q=font-family++明朝|ゴシック>
// However, most of these fonts have ASCII equivalents:
// for example, 'MS Mincho', and it's considered
// professional to use ASCII font names instead of
// Unicode font names. Thanks Takeshi Terada for
// providing this information.
// The following characters, to my knowledge, have not been
// used to name font names.
// - Single quote. While theoretically you might find a
// font name that has a single quote in its name (serving
// as an apostrophe, e.g. Dave's Scribble), I haven't
// been able to find any actual examples of this.
// Internet Explorer's cssText translation (which I
// believe is invoked by innerHTML) normalizes any
// quoting to single quotes, and fails to escape single
// quotes. (Note that this is not IE's behavior for all
// CSS properties, just some sort of special casing for
// font-family). So a single quote *cannot* be used
// safely in the font-family context if there will be an
// innerHTML/cssText translation. Note that Firefox 3.x
// does this too.
// - Double quote. In IE, these get normalized to
// single-quotes, no matter what the encoding. (Fun
// fact, in IE8, the 'content' CSS property gained
// support, where they special cased to preserve encoded
// double quotes, but still translate unadorned double
// quotes into single quotes.) So, because their
// fixpoint behavior is identical to single quotes, they
// cannot be allowed either. Firefox 3.x displays
// single-quote style behavior.
// - Backslashes are reduced by one (so \\ -> \) every
// iteration, so they cannot be used safely. This shows
// up in IE7, IE8 and FF3
// - Semicolons, commas and backticks are handled properly.
// - The rest of the ASCII punctuation is handled properly.
// We haven't checked what browsers do to unadorned
// versions, but this is not important as long as the
// browser doesn't /remove/ surrounding quotes (as IE does
// for HTML).
//
// With these results in hand, we conclude that there are
// various levels of safety:
// - Paranoid: alphanumeric, spaces and dashes(?)
// - International: Paranoid + non-ASCII Unicode
// - Edgy: Everything except quotes, backslashes
// - NoJS: Standards compliance, e.g. sod IE. Note that
// with some judicious character escaping (since certain
// types of escaping doesn't work) this is theoretically
// OK as long as innerHTML/cssText is not called.
// We believe that international is a reasonable default
// (that we will implement now), and once we do more
// extensive research, we may feel comfortable with dropping
// it down to edgy.
// complicated font, requires quoting
$final .= "\"$font\", "; // note that this will later get turned into &quot;
// Edgy: alphanumeric, spaces, dashes and Unicode. Use of
// str(c)spn assumes that the string was already well formed
// Unicode (which of course it is).
if (strspn($font, $this->mask) !== strlen($font)) {
continue;
}
// Historical:
// In the absence of innerHTML/cssText, these ugly
// transforms don't pose a security risk (as \\ and \"
// might--these escapes are not supported by most browsers).
// We could try to be clever and use single-quote wrapping
// when there is a double quote present, but I have choosen
// not to implement that. (NOTE: you can reduce the amount
// of escapes by one depending on what quoting style you use)
// $font = str_replace('\\', '\\5C ', $font);
// $font = str_replace('"', '\\22 ', $font);
// $font = str_replace("'", '\\27 ', $font);
// font possibly with spaces, requires quoting
$final .= "'$font', ";
}
$final = rtrim($final, ', ');
if ($final === '') return false;

View File

@@ -43,6 +43,15 @@ class HTMLPurifier_AttrDef_CSS_URI extends HTMLPurifier_AttrDef_URI
// extra sanity check; should have been done by URI
$result = str_replace(array('"', "\\", "\n", "\x0c", "\r"), "", $result);
// suspicious characters are ()'; we're going to percent encode
// them for safety.
$result = str_replace(array('(', ')', "'"), array('%28', '%29', '%27'), $result);
// there's an extra bug where ampersands lose their escaping on
// an innerHTML cycle, so a very unlucky query parameter could
// then change the meaning of the URL. Unfortunately, there's
// not much we can do about that...
return "url(\"$result\")";
}

View File

@@ -23,6 +23,12 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
public function validate($string, $config, $context) {
$length = strlen($string);
// empty hostname is OK; it's usually semantically equivalent:
// the default host as defined by a URI scheme is used:
//
// If the URI scheme defines a default for host, then that
// default applies when the host subcomponent is undefined
// or when the registered name is empty (zero length).
if ($string === '') return '';
if ($length > 1 && $string[0] === '[' && $string[$length-1] === ']') {
//IPv6

View File

@@ -0,0 +1,41 @@
<?php
// must be called POST validation
/**
* Adds rel="nofollow" to all outbound links. This transform is
* only attached if Attr.Nofollow is TRUE.
*/
class HTMLPurifier_AttrTransform_Nofollow extends HTMLPurifier_AttrTransform
{
private $parser;
public function __construct() {
$this->parser = new HTMLPurifier_URIParser();
}
public function transform($attr, $config, $context) {
if (!isset($attr['href'])) {
return $attr;
}
// XXX Kind of inefficient
$url = $this->parser->parse($attr['href']);
$scheme = $url->getSchemeObj($config, $context);
if (!is_null($url->host) && $scheme !== false && $scheme->browsable) {
if (isset($attr['rel'])) {
$attr['rel'] .= ' nofollow';
} else {
$attr['rel'] = 'nofollow';
}
}
return $attr;
}
}
// vim: et sw=4 sts=4

View File

@@ -19,6 +19,7 @@ class HTMLPurifier_AttrTransform_SafeParam extends HTMLPurifier_AttrTransform
public function __construct() {
$this->uri = new HTMLPurifier_AttrDef_URI(true); // embedded
$this->wmode = new HTMLPurifier_AttrDef_Enum(array('window', 'opaque', 'transparent'));
}
public function transform($attr, $config, $context) {
@@ -41,7 +42,7 @@ class HTMLPurifier_AttrTransform_SafeParam extends HTMLPurifier_AttrTransform
}
break;
case 'wmode':
$attr['value'] = 'window';
$attr['value'] = $this->wmode->validate($attr['value'], $config, $context);
break;
case 'movie':
case 'src':

View File

@@ -37,7 +37,12 @@ class HTMLPurifier_Bootstrap
public static function autoload($class) {
$file = HTMLPurifier_Bootstrap::getPath($class);
if (!$file) return false;
require HTMLPURIFIER_PREFIX . '/' . $file;
// Technically speaking, it should be ok and more efficient to
// just do 'require', but Antonio Parraga reports that with
// Zend extensions such as Zend debugger and APC, this invariant
// may be broken. Since we have efficient alternatives, pay
// the cost here and avoid the bug.
require_once HTMLPURIFIER_PREFIX . '/' . $file;
return true;
}
@@ -65,10 +70,11 @@ class HTMLPurifier_Bootstrap
if ( ($funcs = spl_autoload_functions()) === false ) {
spl_autoload_register($autoload);
} elseif (function_exists('spl_autoload_unregister')) {
$buggy = version_compare(PHP_VERSION, '5.2.11', '<');
$compat = version_compare(PHP_VERSION, '5.1.2', '<=') &&
version_compare(PHP_VERSION, '5.1.0', '>=');
foreach ($funcs as $func) {
if (is_array($func)) {
if ($buggy && is_array($func)) {
// :TRICKY: There are some compatibility issues and some
// places where we need to error out
$reflector = new ReflectionMethod($func[0], $func[1]);

View File

@@ -219,6 +219,10 @@ class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
$this->doSetupTricky($config);
}
if ($config->get('CSS.Trusted')) {
$this->doSetupTrusted($config);
}
$allow_important = $config->get('CSS.AllowImportant');
// wrap all attr-defs with decorator that handles !important
foreach ($this->info as $k => $v) {
@@ -260,6 +264,23 @@ class HTMLPurifier_CSSDefinition extends HTMLPurifier_Definition
$this->info['overflow'] = new HTMLPurifier_AttrDef_Enum(array('visible', 'hidden', 'auto', 'scroll'));
}
protected function doSetupTrusted($config) {
$this->info['position'] = new HTMLPurifier_AttrDef_Enum(array(
'static', 'relative', 'absolute', 'fixed'
));
$this->info['top'] =
$this->info['left'] =
$this->info['right'] =
$this->info['bottom'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
new HTMLPurifier_AttrDef_CSS_Length(),
new HTMLPurifier_AttrDef_CSS_Percentage(),
new HTMLPurifier_AttrDef_Enum(array('auto')),
));
$this->info['z-index'] = new HTMLPurifier_AttrDef_CSS_Composite(array(
new HTMLPurifier_AttrDef_Integer(),
new HTMLPurifier_AttrDef_Enum(array('auto')),
));
}
/**
* Performs extra config-based processing. Based off of

View File

@@ -20,7 +20,7 @@ class HTMLPurifier_Config
/**
* HTML Purifier's version
*/
public $version = '4.2.0';
public $version = '4.3.0';
/**
* Bool indicator whether or not to automatically finalize
@@ -76,7 +76,8 @@ class HTMLPurifier_Config
/**
* Set to false if you do not want line and file numbers in errors
* (useful when unit testing)
* (useful when unit testing). This will also compress some errors
* and exceptions.
*/
public $chatty = true;
@@ -318,26 +319,64 @@ class HTMLPurifier_Config
* Retrieves object reference to the HTML definition.
* @param $raw Return a copy that has not been setup yet. Must be
* called before it's been setup, otherwise won't work.
* @param $optimized If true, this method may return null, to
* indicate that a cached version of the modified
* definition object is available and no further edits
* are necessary. Consider using
* maybeGetRawHTMLDefinition, which is more explicitly
* named, instead.
*/
public function getHTMLDefinition($raw = false) {
return $this->getDefinition('HTML', $raw);
public function getHTMLDefinition($raw = false, $optimized = false) {
return $this->getDefinition('HTML', $raw, $optimized);
}
/**
* Retrieves object reference to the CSS definition
* @param $raw Return a copy that has not been setup yet. Must be
* called before it's been setup, otherwise won't work.
* @param $optimized If true, this method may return null, to
* indicate that a cached version of the modified
* definition object is available and no further edits
* are necessary. Consider using
* maybeGetRawCSSDefinition, which is more explicitly
* named, instead.
*/
public function getCSSDefinition($raw = false) {
return $this->getDefinition('CSS', $raw);
public function getCSSDefinition($raw = false, $optimized = false) {
return $this->getDefinition('CSS', $raw, $optimized);
}
/**
* Retrieves object reference to the URI definition
* @param $raw Return a copy that has not been setup yet. Must be
* called before it's been setup, otherwise won't work.
* @param $optimized If true, this method may return null, to
* indicate that a cached version of the modified
* definition object is available and no further edits
* are necessary. Consider using
* maybeGetRawURIDefinition, which is more explicitly
* named, instead.
*/
public function getURIDefinition($raw = false, $optimized = false) {
return $this->getDefinition('URI', $raw, $optimized);
}
/**
* Retrieves a definition
* @param $type Type of definition: HTML, CSS, etc
* @param $raw Whether or not definition should be returned raw
* @param $optimized Only has an effect when $raw is true. Whether
* or not to return null if the result is already present in
* the cache. This is off by default for backwards
* compatibility reasons, but you need to do things this
* way in order to ensure that caching is done properly.
* Check out enduser-customize.html for more details.
* We probably won't ever change this default, as much as the
* maybe semantics is the "right thing to do."
*/
public function getDefinition($type, $raw = false) {
public function getDefinition($type, $raw = false, $optimized = false) {
if ($optimized && !$raw) {
throw new HTMLPurifier_Exception("Cannot set optimized = true when raw = false");
}
if (!$this->finalized) $this->autoFinalize();
// temporarily suspend locks, so we can handle recursive definition calls
$lock = $this->lock;
@@ -346,52 +385,137 @@ class HTMLPurifier_Config
$cache = $factory->create($type, $this);
$this->lock = $lock;
if (!$raw) {
// see if we can quickly supply a definition
// full definition
// ---------------
// check if definition is in memory
if (!empty($this->definitions[$type])) {
if (!$this->definitions[$type]->setup) {
$this->definitions[$type]->setup($this);
$cache->set($this->definitions[$type], $this);
$def = $this->definitions[$type];
// check if the definition is setup
if ($def->setup) {
return $def;
} else {
$def->setup($this);
if ($def->optimized) $cache->add($def, $this);
return $def;
}
return $this->definitions[$type];
}
// memory check missed, try cache
$this->definitions[$type] = $cache->get($this);
if ($this->definitions[$type]) {
// definition in cache, return it
return $this->definitions[$type];
// check if definition is in cache
$def = $cache->get($this);
if ($def) {
// definition in cache, save to memory and return it
$this->definitions[$type] = $def;
return $def;
}
} elseif (
!empty($this->definitions[$type]) &&
!$this->definitions[$type]->setup
) {
// raw requested, raw in memory, quick return
return $this->definitions[$type];
// initialize it
$def = $this->initDefinition($type);
// set it up
$this->lock = $type;
$def->setup($this);
$this->lock = null;
// save in cache
$cache->add($def, $this);
// return it
return $def;
} else {
// raw definition
// --------------
// check preconditions
$def = null;
if ($optimized) {
if (is_null($this->get($type . '.DefinitionID'))) {
// fatally error out if definition ID not set
throw new HTMLPurifier_Exception("Cannot retrieve raw version without specifying %$type.DefinitionID");
}
}
if (!empty($this->definitions[$type])) {
$def = $this->definitions[$type];
if ($def->setup && !$optimized) {
$extra = $this->chatty ? " (try moving this code block earlier in your initialization)" : "";
throw new HTMLPurifier_Exception("Cannot retrieve raw definition after it has already been setup" . $extra);
}
if ($def->optimized === null) {
$extra = $this->chatty ? " (try flushing your cache)" : "";
throw new HTMLPurifier_Exception("Optimization status of definition is unknown" . $extra);
}
if ($def->optimized !== $optimized) {
$msg = $optimized ? "optimized" : "unoptimized";
$extra = $this->chatty ? " (this backtrace is for the first inconsistent call, which was for a $msg raw definition)" : "";
throw new HTMLPurifier_Exception("Inconsistent use of optimized and unoptimized raw definition retrievals" . $extra);
}
}
// check if definition was in memory
if ($def) {
if ($def->setup) {
// invariant: $optimized === true (checked above)
return null;
} else {
return $def;
}
}
// if optimized, check if definition was in cache
// (because we do the memory check first, this formulation
// is prone to cache slamming, but I think
// guaranteeing that either /all/ of the raw
// setup code or /none/ of it is run is more important.)
if ($optimized) {
// This code path only gets run once; once we put
// something in $definitions (which is guaranteed by the
// trailing code), we always short-circuit above.
$def = $cache->get($this);
if ($def) {
// save the full definition for later, but don't
// return it yet
$this->definitions[$type] = $def;
return null;
}
}
// check invariants for creation
if (!$optimized) {
if (!is_null($this->get($type . '.DefinitionID'))) {
if ($this->chatty) {
$this->triggerError("Due to a documentation error in previous version of HTML Purifier, your definitions are not being cached. If this is OK, you can remove the %$type.DefinitionRev and %$type.DefinitionID declaration. Otherwise, modify your code to use maybeGetRawDefinition, and test if the returned value is null before making any edits (if it is null, that means that a cached version is available, and no raw operations are necessary). See <a href='http://htmlpurifier.org/docs/enduser-customize.html#optimized'>Customize</a> for more details", E_USER_WARNING);
} else {
$this->triggerError("Useless DefinitionID declaration", E_USER_WARNING);
}
}
}
// initialize it
$def = $this->initDefinition($type);
$def->optimized = $optimized;
return $def;
}
throw new HTMLPurifier_Exception("The impossible happened!");
}
private function initDefinition($type) {
// quick checks failed, let's create the object
if ($type == 'HTML') {
$this->definitions[$type] = new HTMLPurifier_HTMLDefinition();
$def = new HTMLPurifier_HTMLDefinition();
} elseif ($type == 'CSS') {
$this->definitions[$type] = new HTMLPurifier_CSSDefinition();
$def = new HTMLPurifier_CSSDefinition();
} elseif ($type == 'URI') {
$this->definitions[$type] = new HTMLPurifier_URIDefinition();
$def = new HTMLPurifier_URIDefinition();
} else {
throw new HTMLPurifier_Exception("Definition of $type type not supported");
}
// quick abort if raw
if ($raw) {
if (is_null($this->get($type . '.DefinitionID'))) {
// fatally error out if definition ID not set
throw new HTMLPurifier_Exception("Cannot retrieve raw version without specifying %$type.DefinitionID");
}
return $this->definitions[$type];
}
// set it up
$this->lock = $type;
$this->definitions[$type]->setup($this);
$this->lock = null;
// save in cache
$cache->set($this->definitions[$type], $this);
return $this->definitions[$type];
$this->definitions[$type] = $def;
return $def;
}
public function maybeGetRawDefinition($name) {
return $this->getDefinition($name, true, true);
}
public function maybeGetRawHTMLDefinition() {
return $this->getDefinition('HTML', true, true);
}
public function maybeGetRawCSSDefinition() {
return $this->getDefinition('CSS', true, true);
}
public function maybeGetRawURIDefinition() {
return $this->getDefinition('URI', true, true);
}
/**
@@ -549,17 +673,22 @@ class HTMLPurifier_Config
/**
* Produces a nicely formatted error message by supplying the
* stack frame information from two levels up and OUTSIDE of
* HTMLPurifier_Config.
* stack frame information OUTSIDE of HTMLPurifier_Config.
*/
protected function triggerError($msg, $no) {
// determine previous stack frame
$backtrace = debug_backtrace();
if ($this->chatty && isset($backtrace[1])) {
$frame = $backtrace[1];
$extra = " on line {$frame['line']} in file {$frame['file']}";
} else {
$extra = '';
$extra = '';
if ($this->chatty) {
$trace = debug_backtrace();
// zip(tail(trace), trace) -- but PHP is not Haskell har har
for ($i = 0, $c = count($trace); $i < $c - 1; $i++) {
if ($trace[$i + 1]['class'] === 'HTMLPurifier_Config') {
continue;
}
$frame = $trace[$i];
$extra = " invoked on line {$frame['line']} in file {$frame['file']}";
break;
}
}
trigger_error($msg . $extra, $no);
}

View File

@@ -60,7 +60,13 @@ class HTMLPurifier_ConfigSchema {
* Unserializes the default ConfigSchema.
*/
public static function makeFromSerial() {
return unserialize(file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/ConfigSchema/schema.ser'));
$contents = file_get_contents(HTMLPURIFIER_PREFIX . '/HTMLPurifier/ConfigSchema/schema.ser');
$r = unserialize($contents);
if (!$r) {
$hash = sha1($contents);
trigger_error("Unserialization of configuration schema failed, sha1 of file was $hash", E_USER_ERROR);
}
return $r;
}
/**

View File

@@ -0,0 +1,12 @@
CSS.AllowedFonts
TYPE: lookup/null
VERSION: 4.3.0
DEFAULT: NULL
--DESCRIPTION--
<p>
Allows you to manually specify a set of allowed fonts. If
<code>NULL</code>, all fonts are allowed. This directive
affects generic names (serif, sans-serif, monospace, cursive,
fantasy) as well as specific font families.
</p>
--# vim: et sw=4 sts=4

View File

@@ -0,0 +1,9 @@
CSS.Trusted
TYPE: bool
VERSION: 4.2.1
DEFAULT: false
--DESCRIPTION--
Indicates whether or not the user's CSS input is trusted or not. If the
input is trusted, a more expansive set of allowed properties. See
also %HTML.Trusted.
--# vim: et sw=4 sts=4

View File

@@ -0,0 +1,11 @@
Cache.SerializerPermissions
TYPE: int
VERSION: 4.3.0
DEFAULT: 0755
--DESCRIPTION--
<p>
Directory permissions of the files and directories created inside
the DefinitionCache/Serializer or other custom serializer path.
</p>
--# vim: et sw=4 sts=4

View File

@@ -0,0 +1,7 @@
HTML.Nofollow
TYPE: bool
VERSION: 4.3.0
DEFAULT: FALSE
--DESCRIPTION--
If enabled, nofollow rel attributes are added to all outgoing links.
--# vim: et sw=4 sts=4

View File

@@ -5,4 +5,5 @@ DEFAULT: false
--DESCRIPTION--
Indicates whether or not the user input is trusted or not. If the input is
trusted, a more expansive set of allowed tags and attributes will be used.
See also %CSS.Trusted.
--# vim: et sw=4 sts=4

View File

@@ -0,0 +1,15 @@
Output.FixInnerHTML
TYPE: bool
VERSION: 4.3.0
DEFAULT: true
--DESCRIPTION--
<p>
If true, HTML Purifier will protect against Internet Explorer's
mishandling of the <code>innerHTML</code> attribute by appending
a space to any attribute that does not contain angled brackets, spaces
or quotes, but contains a backtick. This slightly changes the
semantics of any given attribute, so if this is unacceptable and
you do not use <code>innerHTML</code> on any of your pages, you can
turn this directive off.
</p>
--# vim: et sw=4 sts=4

View File

@@ -12,6 +12,17 @@ abstract class HTMLPurifier_Definition
*/
public $setup = false;
/**
* If true, write out the final definition object to the cache after
* setup. This will be true only if all invocations to get a raw
* definition object are also optimized. This does not cause file
* system thrashing because on subsequent calls the cached object
* is used and any writes to the raw definition object are short
* circuited. See enduser-customize.html for the high-level
* picture.
*/
public $optimized = null;
/**
* What type of definition is it?
*/

View File

@@ -9,14 +9,14 @@ class HTMLPurifier_DefinitionCache_Serializer extends
$file = $this->generateFilePath($config);
if (file_exists($file)) return false;
if (!$this->_prepareDir($config)) return false;
return $this->_write($file, serialize($def));
return $this->_write($file, serialize($def), $config);
}
public function set($def, $config) {
if (!$this->checkDefType($def)) return;
$file = $this->generateFilePath($config);
if (!$this->_prepareDir($config)) return false;
return $this->_write($file, serialize($def));
return $this->_write($file, serialize($def), $config);
}
public function replace($def, $config) {
@@ -24,7 +24,7 @@ class HTMLPurifier_DefinitionCache_Serializer extends
$file = $this->generateFilePath($config);
if (!file_exists($file)) return false;
if (!$this->_prepareDir($config)) return false;
return $this->_write($file, serialize($def));
return $this->_write($file, serialize($def), $config);
}
public function get($config) {
@@ -97,22 +97,33 @@ class HTMLPurifier_DefinitionCache_Serializer extends
* Convenience wrapper function for file_put_contents
* @param $file File name to write to
* @param $data Data to write into file
* @param $config Config object
* @return Number of bytes written if success, or false if failure.
*/
private function _write($file, $data) {
return file_put_contents($file, $data);
private function _write($file, $data, $config) {
$result = file_put_contents($file, $data);
if ($result !== false) {
// set permissions of the new file (no execute)
$chmod = $config->get('Cache.SerializerPermissions');
if (!$chmod) {
$chmod = 0644; // invalid config or simpletest
}
$chmod = $chmod & 0666;
chmod($file, $chmod);
}
return $result;
}
/**
* Prepares the directory that this type stores the serials in
* @param $config Config object
* @return True if successful
*/
private function _prepareDir($config) {
$directory = $this->generateDirectoryPath($config);
//$chmod = $config->get('Cache.SerializerPermissions'); // it would be nice to get this accepted in upstream
global $CFG; $chmod = $CFG->directorypermissions;
$chmod = $config->get('Cache.SerializerPermissions');
if (!$chmod) {
$chmod = 0755;
$chmod = 0755; // invalid config or simpletest
}
if (!is_dir($directory)) {
$base = $this->generateBaseDirectoryPath($config);
@@ -136,6 +147,9 @@ class HTMLPurifier_DefinitionCache_Serializer extends
/**
* Tests permissions on a directory and throws out friendly
* error messages and attempts to chmod it itself if possible
* @param $dir Directory path
* @param $chmod Permissions
* @return True if directory writable
*/
private function _testPermissions($dir, $chmod) {
// early abort, if it is writable, everything is hunky-dory
@@ -152,10 +166,9 @@ class HTMLPurifier_DefinitionCache_Serializer extends
if (fileowner($dir) === posix_getuid()) {
// we can chmod it ourselves
$chmod = $chmod | 0700;
chmod($dir, $chmod);
return true;
if (chmod($dir, $chmod)) return true;
} elseif (filegroup($dir) === posix_getgid()) {
$chmod = $chmod | 0007;
$chmod = $chmod | 0070;
} else {
// PHP's probably running as nobody, so we'll
// need to give global permissions

View File

File diff suppressed because one or more lines are too long

View File

@@ -36,6 +36,11 @@ class HTMLPurifier_Generator
*/
private $_flashCompat;
/**
* Cache of %Output.FixInnerHTML
*/
private $_innerHTMLFix;
/**
* Stack for keeping track of object information when outputting IE
* compatibility code.
@@ -54,6 +59,7 @@ class HTMLPurifier_Generator
public function __construct($config, $context) {
$this->config = $config;
$this->_scriptFix = $config->get('Output.CommentScriptContents');
$this->_innerHTMLFix = $config->get('Output.FixInnerHTML');
$this->_sortAttr = $config->get('Output.SortAttr');
$this->_flashCompat = $config->get('Output.FlashCompat');
$this->_def = $config->getHTMLDefinition();
@@ -132,19 +138,7 @@ class HTMLPurifier_Generator
$_extra = '';
if ($this->_flashCompat) {
if ($token->name == "object" && !empty($this->_flashStack)) {
$flash = array_pop($this->_flashStack);
$compat_token = new HTMLPurifier_Token_Empty("embed");
foreach ($flash->attr as $name => $val) {
if ($name == "classid") continue;
if ($name == "type") continue;
if ($name == "data") $name = "src";
$compat_token->attr[$name] = $val;
}
foreach ($flash->param as $name => $val) {
if ($name == "movie") $name = "src";
$compat_token->attr[$name] = $val;
}
$_extra = "<!--[if IE]>".$this->generateFromToken($compat_token)."<![endif]-->";
// doesn't do anything for now
}
}
return $_extra . '</' . $token->name . '>';
@@ -202,6 +196,37 @@ class HTMLPurifier_Generator
continue;
}
}
// Workaround for Internet Explorer innerHTML bug.
// Essentially, Internet Explorer, when calculating
// innerHTML, omits quotes if there are no instances of
// angled brackets, quotes or spaces. However, when parsing
// HTML (for example, when you assign to innerHTML), it
// treats backticks as quotes. Thus,
// <img alt="``" />
// becomes
// <img alt=`` />
// becomes
// <img alt='' />
// Fortunately, all we need to do is trigger an appropriate
// quoting style, which we do by adding an extra space.
// This also is consistent with the W3C spec, which states
// that user agents may ignore leading or trailing
// whitespace (in fact, most don't, at least for attributes
// like alt, but an extra space at the end is barely
// noticeable). Still, we have a configuration knob for
// this, since this transformation is not necesary if you
// don't process user input with innerHTML or you don't plan
// on supporting Internet Explorer.
if ($this->_innerHTMLFix) {
if (strpos($value, '`') !== false) {
// check if correct quoting style would not already be
// triggered
if (strcspn($value, '"\' <>') === strlen($value)) {
// protect!
$value .= ' ';
}
}
}
$html .= $key.'="'.$this->escape($value).'" ';
}
return rtrim($html);

View File

@@ -0,0 +1,19 @@
<?php
/**
* Module adds the nofollow attribute transformation to a tags. It
* is enabled by HTML.Nofollow
*/
class HTMLPurifier_HTMLModule_Nofollow extends HTMLPurifier_HTMLModule
{
public $name = 'Nofollow';
public function setup($config) {
$a = $this->addBlankElement('a');
$a->attr_transform_post[] = new HTMLPurifier_AttrTransform_Nofollow();
}
}
// vim: et sw=4 sts=4

View File

@@ -21,7 +21,7 @@ class HTMLPurifier_HTMLModule_SafeEmbed extends HTMLPurifier_HTMLModule
'allowscriptaccess' => 'Enum#never',
'allownetworking' => 'Enum#internal',
'flashvars' => 'Text',
'wmode' => 'Enum#window',
'wmode' => 'Enum#window,transparent,opaque',
'name' => 'ID',
)
);

View File

@@ -29,7 +29,6 @@ class HTMLPurifier_HTMLModule_SafeObject extends HTMLPurifier_HTMLModule
'width' => 'Pixels#' . $max,
'height' => 'Pixels#' . $max,
'data' => 'URI#embedded',
'classid' => 'Enum#clsid:d27cdb6e-ae6d-11cf-96b8-444553540000',
'codebase' => new HTMLPurifier_AttrDef_Enum(array(
'http://download.macromedia.com/pub/shockwave/cabs/flash/swflash.cab#version=6,0,40,0')),
)

View File

@@ -216,19 +216,19 @@ class HTMLPurifier_HTMLModuleManager
}
}
// add proprietary module (this gets special treatment because
// it is completely removed from doctypes, etc.)
// custom modules
if ($config->get('HTML.Proprietary')) {
$modules[] = 'Proprietary';
}
// add SafeObject/Safeembed modules
if ($config->get('HTML.SafeObject')) {
$modules[] = 'SafeObject';
}
if ($config->get('HTML.SafeEmbed')) {
$modules[] = 'SafeEmbed';
}
if ($config->get('HTML.Nofollow')) {
$modules[] = 'Nofollow';
}
// merge in custom modules
$modules = array_merge($modules, $this->userModules);

View File

@@ -235,7 +235,7 @@ class HTMLPurifier_Lexer
*/
protected static function removeIEConditional($string) {
return preg_replace(
'#<!--\[if [^>]+\]>.*<!\[endif\]-->#si', // probably should generalize for all strings
'#<!--\[if [^>]+\]>.*?<!\[endif\]-->#si', // probably should generalize for all strings
'',
$string
);
@@ -273,11 +273,11 @@ class HTMLPurifier_Lexer
$html = $this->escapeCommentedCDATA($html);
}
$html = $this->removeIEConditional($html);
// escape CDATA
$html = $this->escapeCDATA($html);
$html = $this->removeIEConditional($html);
// extract body from document if applicable
if ($config->get('Core.ConvertDocumentToFragment')) {
$e = false;

View File

@@ -72,23 +72,57 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
}
/**
* Recursive function that tokenizes a node, putting it into an accumulator.
*
* Iterative function that tokenizes a node, putting it into an accumulator.
* To iterate is human, to recurse divine - L. Peter Deutsch
* @param $node DOMNode to be tokenized.
* @param $tokens Array-list of already tokenized tokens.
* @param $collect Says whether or start and close are collected, set to
* false at first recursion because it's the implicit DIV
* tag you're dealing with.
* @returns Tokens of node appended to previously passed tokens.
*/
protected function tokenizeDOM($node, &$tokens, $collect = false) {
protected function tokenizeDOM($node, &$tokens) {
$level = 0;
$nodes = array($level => array($node));
$closingNodes = array();
do {
while (!empty($nodes[$level])) {
$node = array_shift($nodes[$level]); // FIFO
$collect = $level > 0 ? true : false;
$needEndingTag = $this->createStartNode($node, $tokens, $collect);
if ($needEndingTag) {
$closingNodes[$level][] = $node;
}
if ($node->childNodes && $node->childNodes->length) {
$level++;
$nodes[$level] = array();
foreach ($node->childNodes as $childNode) {
array_push($nodes[$level], $childNode);
}
}
}
$level--;
if ($level && isset($closingNodes[$level])) {
while($node = array_pop($closingNodes[$level])) {
$this->createEndNode($node, $tokens);
}
}
} while ($level > 0);
}
/**
* @param $node DOMNode to be tokenized.
* @param $tokens Array-list of already tokenized tokens.
* @param $collect Says whether or start and close are collected, set to
* false at first recursion because it's the implicit DIV
* tag you're dealing with.
* @returns bool if the token needs an endtoken
*/
protected function createStartNode($node, &$tokens, $collect) {
// intercept non element nodes. WE MUST catch all of them,
// but we're not getting the character reference nodes because
// those should have been preprocessed
if ($node->nodeType === XML_TEXT_NODE) {
$tokens[] = $this->factory->createText($node->data);
return;
return false;
} elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
// undo libxml's special treatment of <script> and <style> tags
$last = end($tokens);
@@ -106,48 +140,44 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
}
}
$tokens[] = $this->factory->createText($this->parseData($data));
return;
return false;
} elseif ($node->nodeType === XML_COMMENT_NODE) {
// this is code is only invoked for comments in script/style in versions
// of libxml pre-2.6.28 (regular comments, of course, are still
// handled regularly)
$tokens[] = $this->factory->createComment($node->data);
return;
return false;
} elseif (
// not-well tested: there may be other nodes we have to grab
$node->nodeType !== XML_ELEMENT_NODE
) {
return;
return false;
}
$attr = $node->hasAttributes() ?
$this->transformAttrToAssoc($node->attributes) :
array();
$attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();
// We still have to make sure that the element actually IS empty
if (!$node->childNodes->length) {
if ($collect) {
$tokens[] = $this->factory->createEmpty($node->tagName, $attr);
}
return false;
} else {
if ($collect) { // don't wrap on first iteration
if ($collect) {
$tokens[] = $this->factory->createStart(
$tag_name = $node->tagName, // somehow, it get's dropped
$attr
);
}
foreach ($node->childNodes as $node) {
// remember, it's an accumulator. Otherwise, we'd have
// to use array_merge
$this->tokenizeDOM($node, $tokens, true);
}
if ($collect) {
$tokens[] = $this->factory->createEnd($tag_name);
}
return true;
}
}
protected function createEndNode($node, &$tokens) {
$tokens[] = $this->factory->createEnd($node->tagName);
}
/**
* Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
*

View File

@@ -2,6 +2,14 @@
/**
* Takes tokens makes them well-formed (balance end tags, etc.)
*
* Specification of the armor attributes this strategy uses:
*
* - MakeWellFormed_TagClosedError: This armor field is used to
* suppress tag closed errors for certain tokens [TagClosedSuppress],
* in particular, if a tag was generated automatically by HTML
* Purifier, we may rely on our infrastructure to close it for us
* and shouldn't report an error to the user [TagClosedAuto].
*/
class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
{
@@ -43,6 +51,12 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// local variables
$generator = new HTMLPurifier_Generator($config, $context);
$escape_invalid_tags = $config->get('Core.EscapeInvalidTags');
// used for autoclose early abortion
$global_parent_allowed_elements = array();
if (isset($definition->info[$definition->info_parent])) {
// may be unset under testing circumstances
$global_parent_allowed_elements = $definition->info[$definition->info_parent]->child->getAllowedElements($config);
}
$e = $context->get('ErrorCollector', true);
$t = false; // token index
$i = false; // injector index
@@ -102,7 +116,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// -- end INJECTOR --
// a note on punting:
// a note on reprocessing:
// In order to reduce code duplication, whenever some code needs
// to make HTML changes in order to make things "correct", the
// new HTML gets sent through the purifier, regardless of its
@@ -149,7 +163,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
$top_nesting = array_pop($this->stack);
$this->stack[] = $top_nesting;
// send error
// send error [TagClosedSuppress]
if ($e && !isset($top_nesting->armor['MakeWellFormed_TagClosedError'])) {
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by document end', $top_nesting);
}
@@ -193,12 +207,12 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
$ok = false;
if ($type === 'empty' && $token instanceof HTMLPurifier_Token_Start) {
// claims to be a start tag but is empty
$token = new HTMLPurifier_Token_Empty($token->name, $token->attr);
$token = new HTMLPurifier_Token_Empty($token->name, $token->attr, $token->line, $token->col, $token->armor);
$ok = true;
} elseif ($type && $type !== 'empty' && $token instanceof HTMLPurifier_Token_Empty) {
// claims to be empty but really is a start tag
$this->swap(new HTMLPurifier_Token_End($token->name));
$this->insertBefore(new HTMLPurifier_Token_Start($token->name, $token->attr));
$this->insertBefore(new HTMLPurifier_Token_Start($token->name, $token->attr, $token->line, $token->col, $token->armor));
// punt (since we had to modify the input stream in a non-trivial way)
$reprocess = true;
continue;
@@ -211,6 +225,19 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
// ...unless they also have to close their parent
if (!empty($this->stack)) {
// Performance note: you might think that it's rather
// inefficient, recalculating the autoclose information
// for every tag that a token closes (since when we
// do an autoclose, we push a new token into the
// stream and then /process/ that, before
// re-processing this token.) But this is
// necessary, because an injector can make an
// arbitrary transformations to the autoclosing
// tokens we introduce, so things may have changed
// in the meantime. Also, doing the inefficient thing is
// "easy" to reason about (for certain perverse definitions
// of "easy")
$parent = array_pop($this->stack);
$this->stack[] = $parent;
@@ -243,24 +270,51 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
}
if ($autoclose) {
// errors need to be updated
$new_token = new HTMLPurifier_Token_End($parent->name);
$new_token->start = $parent;
if ($carryover) {
$element = clone $parent;
$element->armor['MakeWellFormed_TagClosedError'] = true;
$element->carryover = true;
$this->processToken(array($new_token, $token, $element));
} else {
$this->insertBefore($new_token);
}
if ($e && !isset($parent->armor['MakeWellFormed_TagClosedError'])) {
if (!$carryover) {
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', $parent);
} else {
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag carryover', $parent);
// check if this autoclose is doomed to fail
// (this rechecks $parent, which his harmless)
$autoclose_ok = isset($global_parent_allowed_elements[$token->name]);
if (!$autoclose_ok) {
foreach ($this->stack as $ancestor) {
$elements = $definition->info[$ancestor->name]->child->getAllowedElements($config);
if (isset($elements[$token->name])) {
$autoclose_ok = true;
break;
}
if ($definition->info[$token->name]->wrap) {
$wrapname = $definition->info[$token->name]->wrap;
$wrapdef = $definition->info[$wrapname];
$wrap_elements = $wrapdef->child->getAllowedElements($config);
if (isset($wrap_elements[$token->name]) && isset($elements[$wrapname])) {
$autoclose_ok = true;
break;
}
}
}
}
if ($autoclose_ok) {
// errors need to be updated
$new_token = new HTMLPurifier_Token_End($parent->name);
$new_token->start = $parent;
if ($carryover) {
$element = clone $parent;
// [TagClosedAuto]
$element->armor['MakeWellFormed_TagClosedError'] = true;
$element->carryover = true;
$this->processToken(array($new_token, $token, $element));
} else {
$this->insertBefore($new_token);
}
// [TagClosedSuppress]
if ($e && !isset($parent->armor['MakeWellFormed_TagClosedError'])) {
if (!$carryover) {
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag auto closed', $parent);
} else {
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag carryover', $parent);
}
}
} else {
$this->remove();
}
$reprocess = true;
continue;
}
@@ -366,7 +420,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
if ($e) {
for ($j = $c - 1; $j > 0; $j--) {
// notice we exclude $j == 0, i.e. the current ending tag, from
// the errors...
// the errors... [TagClosedSuppress]
if (!isset($skipped_tags[$j]->armor['MakeWellFormed_TagClosedError'])) {
$e->send(E_NOTICE, 'Strategy_MakeWellFormed: Tag closed by element end', $skipped_tags[$j]);
}
@@ -381,6 +435,7 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
$new_token->start = $skipped_tags[$j];
array_unshift($replace, $new_token);
if (isset($definition->info[$new_token->name]) && $definition->info[$new_token->name]->formatting) {
// [TagClosedAuto]
$element = clone $skipped_tags[$j];
$element->carryover = true;
$element->armor['MakeWellFormed_TagClosedError'] = true;
@@ -449,7 +504,8 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
}
/**
* Inserts a token before the current token. Cursor now points to this token
* Inserts a token before the current token. Cursor now points to
* this token. You must reprocess after this.
*/
private function insertBefore($token) {
array_splice($this->tokens, $this->t, 0, array($token));
@@ -457,14 +513,15 @@ class HTMLPurifier_Strategy_MakeWellFormed extends HTMLPurifier_Strategy
/**
* Removes current token. Cursor now points to new token occupying previously
* occupied space.
* occupied space. You must reprocess after this.
*/
private function remove() {
array_splice($this->tokens, $this->t, 1);
}
/**
* Swap current token with new token. Cursor points to new token (no change).
* Swap current token with new token. Cursor points to new token (no
* change). You must reprocess after this.
*/
private function swap($token) {
$this->tokens[$this->t] = $token;

View File

@@ -63,13 +63,15 @@ class HTMLPurifier_TagTransform_Font extends HTMLPurifier_TagTransform
// handle size transform
if (isset($attr['size'])) {
// normalize large numbers
if ($attr['size']{0} == '+' || $attr['size']{0} == '-') {
$size = (int) $attr['size'];
if ($size < -2) $attr['size'] = '-2';
if ($size > 4) $attr['size'] = '+4';
} else {
$size = (int) $attr['size'];
if ($size > 7) $attr['size'] = '7';
if ($attr['size'] !== '') {
if ($attr['size']{0} == '+' || $attr['size']{0} == '-') {
$size = (int) $attr['size'];
if ($size < -2) $attr['size'] = '-2';
if ($size > 4) $attr['size'] = '+4';
} else {
$size = (int) $attr['size'];
if ($size > 7) $attr['size'] = '7';
}
}
if (isset($this->_size_lookup[$attr['size']])) {
$prepend_style .= 'font-size:' .

View File

@@ -33,7 +33,7 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token
* @param $name String name.
* @param $attr Associative array of attributes.
*/
public function __construct($name, $attr = array(), $line = null, $col = null) {
public function __construct($name, $attr = array(), $line = null, $col = null, $armor = array()) {
$this->name = ctype_lower($name) ? $name : strtolower($name);
foreach ($attr as $key => $value) {
// normalization only necessary when key is not lowercase
@@ -50,6 +50,7 @@ class HTMLPurifier_Token_Tag extends HTMLPurifier_Token
$this->attr = $attr;
$this->line = $line;
$this->col = $col;
$this->armor = $armor;
}
}

View File

@@ -67,14 +67,6 @@ class HTMLPurifier_URI
$chars_gen_delims = ':/?#[]@';
$chars_pchar = $chars_sub_delims . ':@';
// validate scheme (MUST BE FIRST!)
if (!is_null($this->scheme) && is_null($this->host)) {
$def = $config->getDefinition('URI');
if ($def->defaultScheme === $this->scheme) {
$this->scheme = null;
}
}
// validate host
if (!is_null($this->host)) {
$host_def = new HTMLPurifier_AttrDef_URI_Host();
@@ -82,6 +74,21 @@ class HTMLPurifier_URI
if ($this->host === false) $this->host = null;
}
// validate scheme
// NOTE: It's not appropriate to check whether or not this
// scheme is in our registry, since a URIFilter may convert a
// URI that we don't allow into one we do. So instead, we just
// check if the scheme can be dropped because there is no host
// and it is our default scheme.
if (!is_null($this->scheme) && is_null($this->host) || $this->host === '') {
// support for relative paths is pretty abysmal when the
// scheme is present, so axe it when possible
$def = $config->getDefinition('URI');
if ($def->defaultScheme === $this->scheme) {
$this->scheme = null;
}
}
// validate username
if (!is_null($this->userinfo)) {
$encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
@@ -96,32 +103,48 @@ class HTMLPurifier_URI
// validate path
$path_parts = array();
$segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
if (!is_null($this->host)) {
if (!is_null($this->host)) { // this catches $this->host === ''
// path-abempty (hier and relative)
// http://www.example.com/my/path
// //www.example.com/my/path (looks odd, but works, and
// recognized by most browsers)
// (this set is valid or invalid on a scheme by scheme
// basis, so we'll deal with it later)
// file:///my/path
// ///my/path
$this->path = $segments_encoder->encode($this->path);
} elseif ($this->path !== '' && $this->path[0] === '/') {
// path-absolute (hier and relative)
if (strlen($this->path) >= 2 && $this->path[1] === '/') {
// This shouldn't ever happen!
$this->path = '';
} else {
} elseif ($this->path !== '') {
if ($this->path[0] === '/') {
// path-absolute (hier and relative)
// http:/my/path
// /my/path
if (strlen($this->path) >= 2 && $this->path[1] === '/') {
// This could happen if both the host gets stripped
// out
// http://my/path
// //my/path
$this->path = '';
} else {
$this->path = $segments_encoder->encode($this->path);
}
} elseif (!is_null($this->scheme)) {
// path-rootless (hier)
// http:my/path
// Short circuit evaluation means we don't need to check nz
$this->path = $segments_encoder->encode($this->path);
}
} elseif (!is_null($this->scheme) && $this->path !== '') {
// path-rootless (hier)
// Short circuit evaluation means we don't need to check nz
$this->path = $segments_encoder->encode($this->path);
} elseif (is_null($this->scheme) && $this->path !== '') {
// path-noscheme (relative)
// (once again, not checking nz)
$segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
$c = strpos($this->path, '/');
if ($c !== false) {
$this->path =
$segment_nc_encoder->encode(substr($this->path, 0, $c)) .
$segments_encoder->encode(substr($this->path, $c));
} else {
$this->path = $segment_nc_encoder->encode($this->path);
// path-noscheme (relative)
// my/path
// (once again, not checking nz)
$segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
$c = strpos($this->path, '/');
if ($c !== false) {
$this->path =
$segment_nc_encoder->encode(substr($this->path, 0, $c)) .
$segments_encoder->encode(substr($this->path, $c));
} else {
$this->path = $segment_nc_encoder->encode($this->path);
}
}
} else {
// path-empty (hier and relative)
@@ -150,6 +173,9 @@ class HTMLPurifier_URI
public function toString() {
// reconstruct authority
$authority = null;
// there is a rendering difference between a null authority
// (http:foo-bar) and an empty string authority
// (http:///foo-bar).
if (!is_null($this->host)) {
$authority = '';
if(!is_null($this->userinfo)) $authority .= $this->userinfo . '@';
@@ -157,7 +183,12 @@ class HTMLPurifier_URI
if(!is_null($this->port)) $authority .= ':' . $this->port;
}
// reconstruct the result
// Reconstruct the result
// One might wonder about parsing quirks from browsers after
// this reconstruction. Unfortunately, parsing behavior depends
// on what *scheme* was employed (file:///foo is handled *very*
// differently than http:///foo), so unfortunately we have to
// defer to the schemes to do the right thing.
$result = '';
if (!is_null($this->scheme)) $result .= $this->scheme . ':';
if (!is_null($authority)) $result .= '//' . $authority;

View File

@@ -3,11 +3,13 @@
/**
* Validator for the components of a URI for a specific scheme
*/
class HTMLPurifier_URIScheme
abstract class HTMLPurifier_URIScheme
{
/**
* Scheme's default port (integer)
* Scheme's default port (integer). If an explicit port number is
* specified that coincides with the default port, it will be
* elided.
*/
public $default_port = null;
@@ -24,17 +26,62 @@ class HTMLPurifier_URIScheme
public $hierarchical = false;
/**
* Validates the components of a URI
* @note This implementation should be called by children if they define
* a default port, as it does port processing.
* @param $uri Instance of HTMLPurifier_URI
* Whether or not the URI may omit a hostname when the scheme is
* explicitly specified, ala file:///path/to/file. As of writing,
* 'file' is the only scheme that browsers support his properly.
*/
public $may_omit_host = false;
/**
* Validates the components of a URI for a specific scheme.
* @param $uri Reference to a HTMLPurifier_URI object
* @param $config HTMLPurifier_Config object
* @param $context HTMLPurifier_Context object
* @return Bool success or failure
*/
public abstract function doValidate(&$uri, $config, $context);
/**
* Public interface for validating components of a URI. Performs a
* bunch of default actions. Don't overload this method.
* @param $uri Reference to a HTMLPurifier_URI object
* @param $config HTMLPurifier_Config object
* @param $context HTMLPurifier_Context object
* @return Bool success or failure
*/
public function validate(&$uri, $config, $context) {
if ($this->default_port == $uri->port) $uri->port = null;
return true;
// kludge: browsers do funny things when the scheme but not the
// authority is set
if (!$this->may_omit_host &&
// if the scheme is present, a missing host is always in error
(!is_null($uri->scheme) && ($uri->host === '' || is_null($uri->host))) ||
// if the scheme is not present, a *blank* host is in error,
// since this translates into '///path' which most browsers
// interpret as being 'http://path'.
(is_null($uri->scheme) && $uri->host === '')
) {
do {
if (is_null($uri->scheme)) {
if (substr($uri->path, 0, 2) != '//') {
$uri->host = null;
break;
}
// URI is '////path', so we cannot nullify the
// host to preserve semantics. Try expanding the
// hostname instead (fall through)
}
// first see if we can manually insert a hostname
$host = $config->get('URI.Host');
if (!is_null($host)) {
$uri->host = $host;
} else {
// we can't do anything sensible, reject the URL.
return false;
}
} while (false);
}
return $this->doValidate($uri, $config, $context);
}
}

View File

@@ -13,8 +13,11 @@ class HTMLPurifier_URIScheme_data extends HTMLPurifier_URIScheme {
'image/gif' => true,
'image/png' => true,
);
// this is actually irrelevant since we only write out the path
// component
public $may_omit_host = true;
public function validate(&$uri, $config, $context) {
public function doValidate(&$uri, $config, $context) {
$result = explode(',', $uri->path, 2);
$is_base64 = false;
$charset = null;

View File

@@ -9,8 +9,14 @@ class HTMLPurifier_URIScheme_file extends HTMLPurifier_URIScheme {
// machines, so placing them as an img src is incorrect.
public $browsable = false;
public function validate(&$uri, $config, $context) {
parent::validate($uri, $config, $context);
// Basically the *only* URI scheme for which this is true, since
// accessing files on the local machine is very common. In fact,
// browsers on some operating systems don't understand the
// authority, though I hear it is used on Windows to refer to
// network shares.
public $may_omit_host = true;
public function doValidate(&$uri, $config, $context) {
// Authentication method is not supported
$uri->userinfo = null;
// file:// makes no provisions for accessing the resource

View File

@@ -9,8 +9,7 @@ class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {
public $browsable = true; // usually
public $hierarchical = true;
public function validate(&$uri, $config, $context) {
parent::validate($uri, $config, $context);
public function doValidate(&$uri, $config, $context) {
$uri->query = null;
// typecode check

View File

@@ -9,8 +9,7 @@ class HTMLPurifier_URIScheme_http extends HTMLPurifier_URIScheme {
public $browsable = true;
public $hierarchical = true;
public function validate(&$uri, $config, $context) {
parent::validate($uri, $config, $context);
public function doValidate(&$uri, $config, $context) {
$uri->userinfo = null;
return true;
}

View File

@@ -12,9 +12,9 @@
class HTMLPurifier_URIScheme_mailto extends HTMLPurifier_URIScheme {
public $browsable = false;
public $may_omit_host = true;
public function validate(&$uri, $config, $context) {
parent::validate($uri, $config, $context);
public function doValidate(&$uri, $config, $context) {
$uri->userinfo = null;
$uri->host = null;
$uri->port = null;

View File

@@ -6,9 +6,9 @@
class HTMLPurifier_URIScheme_news extends HTMLPurifier_URIScheme {
public $browsable = false;
public $may_omit_host = true;
public function validate(&$uri, $config, $context) {
parent::validate($uri, $config, $context);
public function doValidate(&$uri, $config, $context) {
$uri->userinfo = null;
$uri->host = null;
$uri->port = null;

View File

@@ -8,8 +8,7 @@ class HTMLPurifier_URIScheme_nntp extends HTMLPurifier_URIScheme {
public $default_port = 119;
public $browsable = false;
public function validate(&$uri, $config, $context) {
parent::validate($uri, $config, $context);
public function doValidate(&$uri, $config, $context) {
$uri->userinfo = null;
$uri->query = null;
return true;