diff --git a/NEWS b/NEWS index 04a7fa9e..f43cde3a 100644 --- a/NEWS +++ b/NEWS @@ -39,6 +39,8 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier raw definition. Reported by ajh. - Switch to using require_once in the Bootstrap to work around bad interaction with Zend Debugger and APC. Reported by Antonio Parraga. +- Fix URI handling when hostname is missing but scheme is present. + Reported by Neike Taika-Tessaro. 4.2.0, released 2010-09-15 ! Added %Core.RemoveProcessingInstructions, which lets you remove diff --git a/library/HTMLPurifier/AttrDef/URI/Host.php b/library/HTMLPurifier/AttrDef/URI/Host.php index 2156c10c..feca469d 100644 --- a/library/HTMLPurifier/AttrDef/URI/Host.php +++ b/library/HTMLPurifier/AttrDef/URI/Host.php @@ -23,6 +23,12 @@ class HTMLPurifier_AttrDef_URI_Host extends HTMLPurifier_AttrDef public function validate($string, $config, $context) { $length = strlen($string); + // empty hostname is OK; it's usually semantically equivalent: + // the default host as defined by a URI scheme is used: + // + // If the URI scheme defines a default for host, then that + // default applies when the host subcomponent is undefined + // or when the registered name is empty (zero length). if ($string === '') return ''; if ($length > 1 && $string[0] === '[' && $string[$length-1] === ']') { //IPv6 diff --git a/library/HTMLPurifier/URI.php b/library/HTMLPurifier/URI.php index 8b50d0d1..92bff87a 100644 --- a/library/HTMLPurifier/URI.php +++ b/library/HTMLPurifier/URI.php @@ -67,14 +67,6 @@ class HTMLPurifier_URI $chars_gen_delims = ':/?#[]@'; $chars_pchar = $chars_sub_delims . ':@'; - // validate scheme (MUST BE FIRST!) 
- if (!is_null($this->scheme) && is_null($this->host)) { - $def = $config->getDefinition('URI'); - if ($def->defaultScheme === $this->scheme) { - $this->scheme = null; - } - } - // validate host if (!is_null($this->host)) { $host_def = new HTMLPurifier_AttrDef_URI_Host(); @@ -82,6 +74,21 @@ class HTMLPurifier_URI if ($this->host === false) $this->host = null; } + // validate scheme + // NOTE: It's not appropriate to check whether or not this + // scheme is in our registry, since a URIFilter may convert a + // URI that we don't allow into one we do. So instead, we just + // check if the scheme can be dropped because there is no host + // (null or empty) and it is our default scheme. + if (!is_null($this->scheme) && (is_null($this->host) || $this->host === '')) { + // support for relative paths is pretty abysmal when the + // scheme is present, so axe it when possible + $def = $config->getDefinition('URI'); + if ($def->defaultScheme === $this->scheme) { + $this->scheme = null; + } + } + // validate username if (!is_null($this->userinfo)) { $encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':'); @@ -96,32 +103,48 @@ class HTMLPurifier_URI // validate path $path_parts = array(); $segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/'); - if (!is_null($this->host)) { + if (!is_null($this->host)) { // this catches $this->host === '' // path-abempty (hier and relative) + // http://www.example.com/my/path + // //www.example.com/my/path (looks odd, but works, and + // recognized by most browsers) + // (this set is valid or invalid on a scheme by scheme + // basis, so we'll deal with it later) + // file:///my/path + // ///my/path $this->path = $segments_encoder->encode($this->path); - } elseif ($this->path !== '' && $this->path[0] === '/') { - // path-absolute (hier and relative) - if (strlen($this->path) >= 2 && $this->path[1] === '/') { - // This shouldn't ever happen! 
- $this->path = ''; - } else { + } elseif ($this->path !== '') { + if ($this->path[0] === '/') { + // path-absolute (hier and relative) + // http:/my/path + // /my/path + if (strlen($this->path) >= 2 && $this->path[1] === '/') { + // This could happen if both the host gets stripped + // out + // http://my/path + // //my/path + $this->path = ''; + } else { + $this->path = $segments_encoder->encode($this->path); + } + } elseif (!is_null($this->scheme)) { + // path-rootless (hier) + // http:my/path + // Short circuit evaluation means we don't need to check nz $this->path = $segments_encoder->encode($this->path); - } - } elseif (!is_null($this->scheme) && $this->path !== '') { - // path-rootless (hier) - // Short circuit evaluation means we don't need to check nz - $this->path = $segments_encoder->encode($this->path); - } elseif (is_null($this->scheme) && $this->path !== '') { - // path-noscheme (relative) - // (once again, not checking nz) - $segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@'); - $c = strpos($this->path, '/'); - if ($c !== false) { - $this->path = - $segment_nc_encoder->encode(substr($this->path, 0, $c)) . - $segments_encoder->encode(substr($this->path, $c)); } else { - $this->path = $segment_nc_encoder->encode($this->path); + // path-noscheme (relative) + // my/path + // (once again, not checking nz) + $segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@'); + $c = strpos($this->path, '/'); + if ($c !== false) { + $this->path = + $segment_nc_encoder->encode(substr($this->path, 0, $c)) . + $segments_encoder->encode(substr($this->path, $c)); + } else { + $this->path = $segment_nc_encoder->encode($this->path); + } } } else { // path-empty (hier and relative) @@ -150,6 +173,9 @@ class HTMLPurifier_URI public function toString() { // reconstruct authority $authority = null; + // there is a rendering difference between a null authority + // (http:foo-bar) and an empty string authority + // (http:///foo-bar). 
if (!is_null($this->host)) { $authority = ''; if(!is_null($this->userinfo)) $authority .= $this->userinfo . '@'; @@ -157,7 +183,12 @@ class HTMLPurifier_URI if(!is_null($this->port)) $authority .= ':' . $this->port; } - // reconstruct the result + // Reconstruct the result + // One might wonder about parsing quirks from browsers after + // this reconstruction. Unfortunately, parsing behavior depends + // on what *scheme* was employed (file:///foo is handled *very* + // differently than http:///foo), so we have to + // defer to the schemes to do the right thing. $result = ''; if (!is_null($this->scheme)) $result .= $this->scheme . ':'; if (!is_null($authority)) $result .= '//' . $authority; diff --git a/library/HTMLPurifier/URIScheme.php b/library/HTMLPurifier/URIScheme.php index 039710fd..25eb8410 100644 --- a/library/HTMLPurifier/URIScheme.php +++ b/library/HTMLPurifier/URIScheme.php @@ -3,11 +3,13 @@ /** * Validator for the components of a URI for a specific scheme */ -class HTMLPurifier_URIScheme +abstract class HTMLPurifier_URIScheme { /** - * Scheme's default port (integer) + * Scheme's default port (integer). If an explicit port number is + * specified that coincides with the default port, it will be + * elided. */ public $default_port = null; @@ -24,17 +26,62 @@ class HTMLPurifier_URIScheme public $hierarchical = false; /** - * Validates the components of a URI - * @note This implementation should be called by children if they define - * a default port, as it does port processing. - * @param $uri Instance of HTMLPurifier_URI + * Whether or not the URI may omit a hostname when the scheme is + * explicitly specified, a la file:///path/to/file. As of writing, + * 'file' is the only scheme for which browsers support this properly. + */ + public $may_omit_host = false; + + /** + * Validates the components of a URI for a specific scheme. 
+ * @param $uri Reference to an HTMLPurifier_URI object + * @param $config HTMLPurifier_Config object + * @param $context HTMLPurifier_Context object + * @return Bool success or failure + */ + public abstract function doValidate(&$uri, $config, $context); + + /** + * Public interface for validating components of a URI. Performs a + * bunch of default actions. Don't override this method. + * @param $uri Reference to an HTMLPurifier_URI object * @param $config HTMLPurifier_Config object * @param $context HTMLPurifier_Context object * @return Bool success or failure */ public function validate(&$uri, $config, $context) { if ($this->default_port == $uri->port) $uri->port = null; - return true; + // kludge: browsers do funny things when the scheme but not the + // authority is set + if (!$this->may_omit_host && + // if the scheme is present, a missing host is always in error (note: && binds tighter than ||, so may_omit_host guards only this clause) + (!is_null($uri->scheme) && ($uri->host === '' || is_null($uri->host))) || + // if the scheme is not present, a *blank* host is in error, + // since this translates into '///path' which most browsers + // interpret as being 'http://path'. + (is_null($uri->scheme) && $uri->host === '') + ) { + do { + if (is_null($uri->scheme)) { + if (substr($uri->path, 0, 2) != '//') { + $uri->host = null; + break; + } + // URI is '////path', so we cannot nullify the + // host to preserve semantics. Try expanding the + // hostname instead (fall through) + } + // first see if we can manually insert a hostname + $host = $config->get('URI.Host'); + if (!is_null($host)) { + $uri->host = $host; + } else { + // we can't do anything sensible, reject the URL. 
+ return false; + } + } while (false); + } + return $this->doValidate($uri, $config, $context); } } diff --git a/library/HTMLPurifier/URIScheme/data.php b/library/HTMLPurifier/URIScheme/data.php index b7f1989c..a5c43989 100644 --- a/library/HTMLPurifier/URIScheme/data.php +++ b/library/HTMLPurifier/URIScheme/data.php @@ -13,8 +13,11 @@ class HTMLPurifier_URIScheme_data extends HTMLPurifier_URIScheme { 'image/gif' => true, 'image/png' => true, ); + // this is actually irrelevant since we only write out the path + // component + public $may_omit_host = true; - public function validate(&$uri, $config, $context) { + public function doValidate(&$uri, $config, $context) { $result = explode(',', $uri->path, 2); $is_base64 = false; $charset = null; diff --git a/library/HTMLPurifier/URIScheme/file.php b/library/HTMLPurifier/URIScheme/file.php index 407b6c17..d74a3f19 100644 --- a/library/HTMLPurifier/URIScheme/file.php +++ b/library/HTMLPurifier/URIScheme/file.php @@ -9,8 +9,14 @@ class HTMLPurifier_URIScheme_file extends HTMLPurifier_URIScheme { // machines, so placing them as an img src is incorrect. public $browsable = false; - public function validate(&$uri, $config, $context) { - parent::validate($uri, $config, $context); + // Basically the *only* URI scheme for which this is true, since + // accessing files on the local machine is very common. In fact, + // browsers on some operating systems don't understand the + // authority, though I hear it is used on Windows to refer to + // network shares. 
+ public $may_omit_host = true; + + public function doValidate(&$uri, $config, $context) { // Authentication method is not supported $uri->userinfo = null; // file:// makes no provisions for accessing the resource diff --git a/library/HTMLPurifier/URIScheme/ftp.php b/library/HTMLPurifier/URIScheme/ftp.php index 5849bf7f..0fb2abf6 100644 --- a/library/HTMLPurifier/URIScheme/ftp.php +++ b/library/HTMLPurifier/URIScheme/ftp.php @@ -9,8 +9,7 @@ class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme { public $browsable = true; // usually public $hierarchical = true; - public function validate(&$uri, $config, $context) { - parent::validate($uri, $config, $context); + public function doValidate(&$uri, $config, $context) { $uri->query = null; // typecode check diff --git a/library/HTMLPurifier/URIScheme/http.php b/library/HTMLPurifier/URIScheme/http.php index b097a31d..959b8daf 100644 --- a/library/HTMLPurifier/URIScheme/http.php +++ b/library/HTMLPurifier/URIScheme/http.php @@ -9,8 +9,7 @@ class HTMLPurifier_URIScheme_http extends HTMLPurifier_URIScheme { public $browsable = true; public $hierarchical = true; - public function validate(&$uri, $config, $context) { - parent::validate($uri, $config, $context); + public function doValidate(&$uri, $config, $context) { $uri->userinfo = null; return true; } diff --git a/library/HTMLPurifier/URIScheme/mailto.php b/library/HTMLPurifier/URIScheme/mailto.php index c1e2cd5a..9db4cb23 100644 --- a/library/HTMLPurifier/URIScheme/mailto.php +++ b/library/HTMLPurifier/URIScheme/mailto.php @@ -12,9 +12,9 @@ class HTMLPurifier_URIScheme_mailto extends HTMLPurifier_URIScheme { public $browsable = false; + public $may_omit_host = true; - public function validate(&$uri, $config, $context) { - parent::validate($uri, $config, $context); + public function doValidate(&$uri, $config, $context) { $uri->userinfo = null; $uri->host = null; $uri->port = null; diff --git a/library/HTMLPurifier/URIScheme/news.php 
b/library/HTMLPurifier/URIScheme/news.php index f5f54f4f..84a6748d 100644 --- a/library/HTMLPurifier/URIScheme/news.php +++ b/library/HTMLPurifier/URIScheme/news.php @@ -6,9 +6,9 @@ class HTMLPurifier_URIScheme_news extends HTMLPurifier_URIScheme { public $browsable = false; + public $may_omit_host = true; - public function validate(&$uri, $config, $context) { - parent::validate($uri, $config, $context); + public function doValidate(&$uri, $config, $context) { $uri->userinfo = null; $uri->host = null; $uri->port = null; diff --git a/library/HTMLPurifier/URIScheme/nntp.php b/library/HTMLPurifier/URIScheme/nntp.php index 5bf93ea7..4ccea0df 100644 --- a/library/HTMLPurifier/URIScheme/nntp.php +++ b/library/HTMLPurifier/URIScheme/nntp.php @@ -8,8 +8,7 @@ class HTMLPurifier_URIScheme_nntp extends HTMLPurifier_URIScheme { public $default_port = 119; public $browsable = false; - public function validate(&$uri, $config, $context) { - parent::validate($uri, $config, $context); + public function doValidate(&$uri, $config, $context) { $uri->userinfo = null; $uri->query = null; return true; diff --git a/tests/HTMLPurifier/AttrDef/URITest.php b/tests/HTMLPurifier/AttrDef/URITest.php index b149d1da..3044367a 100644 --- a/tests/HTMLPurifier/AttrDef/URITest.php +++ b/tests/HTMLPurifier/AttrDef/URITest.php @@ -74,6 +74,15 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness $this->assertDef('mailto:this-looks-like-a-path@example.com'); } + function testResolveNullSchemeAmbiguity() { + $this->assertDef('///foo', '/foo'); + } + + function testResolveNullSchemeDoubleAmbiguity() { + $this->config->set('URI.Host', 'example.com'); + $this->assertDef('////foo', '//example.com//foo'); + } + function testURIDefinitionValidation() { $parser = new HTMLPurifier_URIParser(); $uri = $parser->parse('http://example.com'); diff --git a/tests/HTMLPurifier/HTMLT/munge.htmlt b/tests/HTMLPurifier/HTMLT/munge.htmlt new file mode 100644 index 00000000..827c216e --- /dev/null +++ 
b/tests/HTMLPurifier/HTMLT/munge.htmlt @@ -0,0 +1,52 @@ +--INI-- +URI.Munge = "/r/%s" +URI.AllowedSchemes = http,ftp,file +--HTML-- +foo +foo +foo +foo +foo + +foo +foo +foo +foo +foo + +foo +foo +foo +foo +foo + +foo +foo +foo +foo +foo +--EXPECT-- +foo +foo +foo +foo +foo + +foo +foo +foo +foo +foo + +foo +foo +foo +foo +foo + +foo +foo +foo +foo +foo +--# vim: et sw=4 sts=4 diff --git a/tests/HTMLPurifier/URISchemeTest.php b/tests/HTMLPurifier/URISchemeTest.php index 23b8b203..4b43310b 100644 --- a/tests/HTMLPurifier/URISchemeTest.php +++ b/tests/HTMLPurifier/URISchemeTest.php @@ -172,6 +172,17 @@ class HTMLPurifier_URISchemeTest extends HTMLPurifier_URIHarness ); } + function test_file_local() { + $this->assertValidation( + 'file:///foo/bar?baz#frag', + 'file:///foo/bar#frag' + ); + } + + function test_ftp_empty_host() { + $this->assertValidation('ftp:///example.com', false); + } + } // vim: et sw=4 sts=4 diff --git a/tests/HTMLPurifier/URITest.php b/tests/HTMLPurifier/URITest.php index 57aefcff..02b95013 100644 --- a/tests/HTMLPurifier/URITest.php +++ b/tests/HTMLPurifier/URITest.php @@ -157,7 +157,7 @@ class HTMLPurifier_URITest extends HTMLPurifier_URIHarness } function test_validate_invalidHostThatLooksLikeIPv6() { - $this->assertValidation('http://[2001:0db8:85z3:08d3:1319:8a2e:0370:7334]', 'http:'); + $this->assertValidation('http://[2001:0db8:85z3:08d3:1319:8a2e:0370:7334]', ''); } function test_validate_removeRedundantScheme() {