diff --git a/NEWS b/NEWS
index 04a7fa9e..f43cde3a 100644
--- a/NEWS
+++ b/NEWS
@@ -39,6 +39,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier
raw definition. Reported by ajh.
- Switch to using require_once in the Bootstrap to work around bad
interaction with Zend Debugger and APC. Reported by Antonio Parraga.
+- Fix URI handling when hostname is missing but scheme is present.
+ Reported by Neike Taika-Tessaro.
4.2.0, released 2010-09-15
! Added %Core.RemoveProcessingInstructions, which lets you remove
diff --git a/library/HTMLPurifier/AttrDef/URI/Host.php b/library/HTMLPurifier/AttrDef/URI/Host.php
index 2156c10c..feca469d 100644
--- a/library/HTMLPurifier/AttrDef/URI/Host.php
+++ b/library/HTMLPurifier/AttrDef/URI/Host.php
@@ -23,6 +23,12 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef
public function validate($string, $config, $context) {
$length = strlen($string);
+ // empty hostname is OK; it's usually semantically equivalent to an
+ // undefined one, since the default host defined by the URI scheme is used:
+ //
+ // If the URI scheme defines a default for host, then that
+ // default applies when the host subcomponent is undefined
+ // or when the registered name is empty (zero length).
if ($string === '') return '';
if ($length > 1 && $string[0] === '[' && $string[$length-1] === ']') {
//IPv6
diff --git a/library/HTMLPurifier/URI.php b/library/HTMLPurifier/URI.php
index 8b50d0d1..92bff87a 100644
--- a/library/HTMLPurifier/URI.php
+++ b/library/HTMLPurifier/URI.php
@@ -67,14 +67,6 @@ class HTMLPurifier_URI
$chars_gen_delims = ':/?#[]@';
$chars_pchar = $chars_sub_delims . ':@';
- // validate scheme (MUST BE FIRST!)
- if (!is_null($this->scheme) && is_null($this->host)) {
- $def = $config->getDefinition('URI');
- if ($def->defaultScheme === $this->scheme) {
- $this->scheme = null;
- }
- }
-
// validate host
if (!is_null($this->host)) {
$host_def = new HTMLPurifier_AttrDef_URI_Host();
@@ -82,6 +74,21 @@ class HTMLPurifier_URI
if ($this->host === false) $this->host = null;
}
+ // validate scheme
+ // NOTE: It's not appropriate to check whether or not this
+ // scheme is in our registry, since a URIFilter may convert a
+ // URI that we don't allow into one we do. So instead, we just
+ // check if the scheme can be dropped because there is no host
+ // (null or empty string) and it is our default scheme.
+ if (!is_null($this->scheme) && (is_null($this->host) || $this->host === '')) {
+ // support for relative paths is pretty abysmal when the
+ // scheme is present, so axe it when possible
+ $def = $config->getDefinition('URI');
+ if ($def->defaultScheme === $this->scheme) {
+ $this->scheme = null;
+ }
+ }
+
// validate username
if (!is_null($this->userinfo)) {
$encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
@@ -96,32 +103,48 @@ class HTMLPurifier_URI
// validate path
$path_parts = array();
$segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
- if (!is_null($this->host)) {
+ if (!is_null($this->host)) { // this catches $this->host === ''
// path-abempty (hier and relative)
+ // http://www.example.com/my/path
+ // //www.example.com/my/path (looks odd, but works, and
+ // recognized by most browsers)
+ // (this set is valid or invalid on a scheme by scheme
+ // basis, so we'll deal with it later)
+ // file:///my/path
+ // ///my/path
$this->path = $segments_encoder->encode($this->path);
- } elseif ($this->path !== '' && $this->path[0] === '/') {
- // path-absolute (hier and relative)
- if (strlen($this->path) >= 2 && $this->path[1] === '/') {
- // This shouldn't ever happen!
- $this->path = '';
- } else {
+ } elseif ($this->path !== '') {
+ if ($this->path[0] === '/') {
+ // path-absolute (hier and relative)
+ // http:/my/path
+ // /my/path
+ if (strlen($this->path) >= 2 && $this->path[1] === '/') {
+ // This could happen if the host gets stripped
+ // out
+ // http://my/path
+ // //my/path
+ $this->path = '';
+ } else {
+ $this->path = $segments_encoder->encode($this->path);
+ }
+ } elseif (!is_null($this->scheme)) {
+ // path-rootless (hier)
+ // http:my/path
+ // Short circuit evaluation means we don't need to check nz
$this->path = $segments_encoder->encode($this->path);
- }
- } elseif (!is_null($this->scheme) && $this->path !== '') {
- // path-rootless (hier)
- // Short circuit evaluation means we don't need to check nz
- $this->path = $segments_encoder->encode($this->path);
- } elseif (is_null($this->scheme) && $this->path !== '') {
- // path-noscheme (relative)
- // (once again, not checking nz)
- $segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
- $c = strpos($this->path, '/');
- if ($c !== false) {
- $this->path =
- $segment_nc_encoder->encode(substr($this->path, 0, $c)) .
- $segments_encoder->encode(substr($this->path, $c));
} else {
- $this->path = $segment_nc_encoder->encode($this->path);
+ // path-noscheme (relative)
+ // my/path
+ // (once again, not checking nz)
+ $segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
+ $c = strpos($this->path, '/');
+ if ($c !== false) {
+ $this->path =
+ $segment_nc_encoder->encode(substr($this->path, 0, $c)) .
+ $segments_encoder->encode(substr($this->path, $c));
+ } else {
+ $this->path = $segment_nc_encoder->encode($this->path);
+ }
}
} else {
// path-empty (hier and relative)
@@ -150,6 +173,9 @@ class HTMLPurifier_URI
public function toString() {
// reconstruct authority
$authority = null;
+ // there is a rendering difference between a null authority
+ // (http:foo-bar) and an empty string authority
+ // (http:///foo-bar).
if (!is_null($this->host)) {
$authority = '';
if(!is_null($this->userinfo)) $authority .= $this->userinfo . '@';
@@ -157,7 +183,12 @@ class HTMLPurifier_URI
if(!is_null($this->port)) $authority .= ':' . $this->port;
}
- // reconstruct the result
+ // Reconstruct the result
+ // One might wonder about parsing quirks from browsers after
+ // this reconstruction. Unfortunately, parsing behavior depends
+ // on what *scheme* was employed (file:///foo is handled *very*
+ // differently than http:///foo), so we have to
+ // defer to the schemes to do the right thing.
$result = '';
if (!is_null($this->scheme)) $result .= $this->scheme . ':';
if (!is_null($authority)) $result .= '//' . $authority;
diff --git a/library/HTMLPurifier/URIScheme.php b/library/HTMLPurifier/URIScheme.php
index 039710fd..25eb8410 100644
--- a/library/HTMLPurifier/URIScheme.php
+++ b/library/HTMLPurifier/URIScheme.php
@@ -3,11 +3,13 @@
/**
* Validator for the components of a URI for a specific scheme
*/
-class HTMLPurifier_URIScheme
+abstract class HTMLPurifier_URIScheme
{
/**
- * Scheme's default port (integer)
+ * Scheme's default port (integer). If an explicit port number is
+ * specified that coincides with the default port, it will be
+ * elided.
*/
public $default_port = null;
@@ -24,17 +26,62 @@ class HTMLPurifier_URIScheme
public $hierarchical = false;
/**
- * Validates the components of a URI
- * @note This implementation should be called by children if they define
- * a default port, as it does port processing.
- * @param $uri Instance of HTMLPurifier_URI
+ * Whether or not the URI may omit a hostname when the scheme is
+ * explicitly specified, a la file:///path/to/file. As of writing,
+ * 'file' is the only scheme for which browsers support this properly.
+ */
+ public $may_omit_host = false;
+
+ /**
+ * Validates the components of a URI for a specific scheme.
+ * @param $uri Reference to a HTMLPurifier_URI object
+ * @param $config HTMLPurifier_Config object
+ * @param $context HTMLPurifier_Context object
+ * @return Bool success or failure
+ */
+ public abstract function doValidate(&$uri, $config, $context);
+
+ /**
+ * Public interface for validating components of a URI. Performs a
+ * bunch of default actions. Don't overload this method.
+ * @param $uri Reference to a HTMLPurifier_URI object
* @param $config HTMLPurifier_Config object
* @param $context HTMLPurifier_Context object
* @return Bool success or failure
*/
public function validate(&$uri, $config, $context) {
if ($this->default_port == $uri->port) $uri->port = null;
- return true;
+ // kludge: browsers do funny things when the scheme but not the
+ // authority is set
+ if (!$this->may_omit_host &&
+ // if the scheme is present, a missing host is always in error
+ (!is_null($uri->scheme) && ($uri->host === '' || is_null($uri->host))) ||
+ // if the scheme is not present, a *blank* host is in error,
+ // since this translates into '///path' which most browsers
+ // interpret as being 'http://path'.
+ (is_null($uri->scheme) && $uri->host === '')
+ ) {
+ do {
+ if (is_null($uri->scheme)) {
+ if (substr($uri->path, 0, 2) != '//') {
+ $uri->host = null;
+ break;
+ }
+ // URI is '////path', so we cannot nullify the
+ // host to preserve semantics. Try expanding the
+ // hostname instead (fall through)
+ }
+ // first see if we can manually insert a hostname
+ $host = $config->get('URI.Host');
+ if (!is_null($host)) {
+ $uri->host = $host;
+ } else {
+ // we can't do anything sensible, reject the URL.
+ return false;
+ }
+ } while (false);
+ }
+ return $this->doValidate($uri, $config, $context);
}
}
diff --git a/library/HTMLPurifier/URIScheme/data.php b/library/HTMLPurifier/URIScheme/data.php
index b7f1989c..a5c43989 100644
--- a/library/HTMLPurifier/URIScheme/data.php
+++ b/library/HTMLPurifier/URIScheme/data.php
@@ -13,8 +13,11 @@ class HTMLPurifier_URIScheme_data extends HTMLPurifier_URIScheme {
'image/gif' => true,
'image/png' => true,
);
+ // this is actually irrelevant since we only write out the path
+ // component
+ public $may_omit_host = true;
- public function validate(&$uri, $config, $context) {
+ public function doValidate(&$uri, $config, $context) {
$result = explode(',', $uri->path, 2);
$is_base64 = false;
$charset = null;
diff --git a/library/HTMLPurifier/URIScheme/file.php b/library/HTMLPurifier/URIScheme/file.php
index 407b6c17..d74a3f19 100644
--- a/library/HTMLPurifier/URIScheme/file.php
+++ b/library/HTMLPurifier/URIScheme/file.php
@@ -9,8 +9,14 @@ class HTMLPurifier_URIScheme_file extends HTMLPurifier_URIScheme {
// machines, so placing them as an img src is incorrect.
public $browsable = false;
- public function validate(&$uri, $config, $context) {
- parent::validate($uri, $config, $context);
+ // Basically the *only* URI scheme for which this is true, since
+ // accessing files on the local machine is very common. In fact,
+ // browsers on some operating systems don't understand the
+ // authority, though I hear it is used on Windows to refer to
+ // network shares.
+ public $may_omit_host = true;
+
+ public function doValidate(&$uri, $config, $context) {
// Authentication method is not supported
$uri->userinfo = null;
// file:// makes no provisions for accessing the resource
diff --git a/library/HTMLPurifier/URIScheme/ftp.php b/library/HTMLPurifier/URIScheme/ftp.php
index 5849bf7f..0fb2abf6 100644
--- a/library/HTMLPurifier/URIScheme/ftp.php
+++ b/library/HTMLPurifier/URIScheme/ftp.php
@@ -9,8 +9,7 @@ class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {
public $browsable = true; // usually
public $hierarchical = true;
- public function validate(&$uri, $config, $context) {
- parent::validate($uri, $config, $context);
+ public function doValidate(&$uri, $config, $context) {
$uri->query = null;
// typecode check
diff --git a/library/HTMLPurifier/URIScheme/http.php b/library/HTMLPurifier/URIScheme/http.php
index b097a31d..959b8daf 100644
--- a/library/HTMLPurifier/URIScheme/http.php
+++ b/library/HTMLPurifier/URIScheme/http.php
@@ -9,8 +9,7 @@ class HTMLPurifier_URIScheme_http extends HTMLPurifier_URIScheme {
public $browsable = true;
public $hierarchical = true;
- public function validate(&$uri, $config, $context) {
- parent::validate($uri, $config, $context);
+ public function doValidate(&$uri, $config, $context) {
$uri->userinfo = null;
return true;
}
diff --git a/library/HTMLPurifier/URIScheme/mailto.php b/library/HTMLPurifier/URIScheme/mailto.php
index c1e2cd5a..9db4cb23 100644
--- a/library/HTMLPurifier/URIScheme/mailto.php
+++ b/library/HTMLPurifier/URIScheme/mailto.php
@@ -12,9 +12,9 @@
class HTMLPurifier_URIScheme_mailto extends HTMLPurifier_URIScheme {
public $browsable = false;
+ public $may_omit_host = true;
- public function validate(&$uri, $config, $context) {
- parent::validate($uri, $config, $context);
+ public function doValidate(&$uri, $config, $context) {
$uri->userinfo = null;
$uri->host = null;
$uri->port = null;
diff --git a/library/HTMLPurifier/URIScheme/news.php b/library/HTMLPurifier/URIScheme/news.php
index f5f54f4f..84a6748d 100644
--- a/library/HTMLPurifier/URIScheme/news.php
+++ b/library/HTMLPurifier/URIScheme/news.php
@@ -6,9 +6,9 @@
class HTMLPurifier_URIScheme_news extends HTMLPurifier_URIScheme {
public $browsable = false;
+ public $may_omit_host = true;
- public function validate(&$uri, $config, $context) {
- parent::validate($uri, $config, $context);
+ public function doValidate(&$uri, $config, $context) {
$uri->userinfo = null;
$uri->host = null;
$uri->port = null;
diff --git a/library/HTMLPurifier/URIScheme/nntp.php b/library/HTMLPurifier/URIScheme/nntp.php
index 5bf93ea7..4ccea0df 100644
--- a/library/HTMLPurifier/URIScheme/nntp.php
+++ b/library/HTMLPurifier/URIScheme/nntp.php
@@ -8,8 +8,7 @@ class HTMLPurifier_URIScheme_nntp extends HTMLPurifier_URIScheme {
public $default_port = 119;
public $browsable = false;
- public function validate(&$uri, $config, $context) {
- parent::validate($uri, $config, $context);
+ public function doValidate(&$uri, $config, $context) {
$uri->userinfo = null;
$uri->query = null;
return true;
diff --git a/tests/HTMLPurifier/AttrDef/URITest.php b/tests/HTMLPurifier/AttrDef/URITest.php
index b149d1da..3044367a 100644
--- a/tests/HTMLPurifier/AttrDef/URITest.php
+++ b/tests/HTMLPurifier/AttrDef/URITest.php
@@ -74,6 +74,15 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness
$this->assertDef('mailto:this-looks-like-a-path@example.com');
}
+ function testResolveNullSchemeAmbiguity() {
+ $this->assertDef('///foo', '/foo');
+ }
+
+ function testResolveNullSchemeDoubleAmbiguity() {
+ $this->config->set('URI.Host', 'example.com');
+ $this->assertDef('////foo', '//example.com//foo');
+ }
+
function testURIDefinitionValidation() {
$parser = new HTMLPurifier_URIParser();
$uri = $parser->parse('http://example.com');
diff --git a/tests/HTMLPurifier/HTMLT/munge.htmlt b/tests/HTMLPurifier/HTMLT/munge.htmlt
new file mode 100644
index 00000000..827c216e
--- /dev/null
+++ b/tests/HTMLPurifier/HTMLT/munge.htmlt
@@ -0,0 +1,52 @@
+--INI--
+URI.Munge = "/r/%s"
+URI.AllowedSchemes = http,ftp,file
+--HTML--
+foo
+foo
+foo
+foo
+foo
+
+foo
+foo
+foo
+foo
+foo
+
+foo
+foo
+foo
+foo
+foo
+
+foo
+foo
+foo
+foo
+foo
+--EXPECT--
+foo
+foo
+foo
+foo
+foo
+
+foo
+foo
+foo
+foo
+foo
+
+foo
+foo
+foo
+foo
+foo
+
+foo
+foo
+foo
+foo
+foo
+--# vim: et sw=4 sts=4
diff --git a/tests/HTMLPurifier/URISchemeTest.php b/tests/HTMLPurifier/URISchemeTest.php
index 23b8b203..4b43310b 100644
--- a/tests/HTMLPurifier/URISchemeTest.php
+++ b/tests/HTMLPurifier/URISchemeTest.php
@@ -172,6 +172,17 @@ class HTMLPurifier_URISchemeTest extends HTMLPurifier_URIHarness
);
}
+ function test_file_local() {
+ $this->assertValidation(
+ 'file:///foo/bar?baz#frag',
+ 'file:///foo/bar#frag'
+ );
+ }
+
+ function test_ftp_empty_host() {
+ $this->assertValidation('ftp:///example.com', false);
+ }
+
}
// vim: et sw=4 sts=4
diff --git a/tests/HTMLPurifier/URITest.php b/tests/HTMLPurifier/URITest.php
index 57aefcff..02b95013 100644
--- a/tests/HTMLPurifier/URITest.php
+++ b/tests/HTMLPurifier/URITest.php
@@ -157,7 +157,7 @@ class HTMLPurifier_URITest extends HTMLPurifier_URIHarness
}
function test_validate_invalidHostThatLooksLikeIPv6() {
- $this->assertValidation('http://[2001:0db8:85z3:08d3:1319:8a2e:0370:7334]', 'http:');
+ $this->assertValidation('http://[2001:0db8:85z3:08d3:1319:8a2e:0370:7334]', '');
}
function test_validate_removeRedundantScheme() {