diff --git a/library/HTMLPurifier/AttrDef/URI.php b/library/HTMLPurifier/AttrDef/URI.php
index ab8c9746..9eb39b08 100644
--- a/library/HTMLPurifier/AttrDef/URI.php
+++ b/library/HTMLPurifier/AttrDef/URI.php
@@ -3,56 +3,45 @@
 require_once 'HTMLPurifier/URIScheme.php';
 require_once 'HTMLPurifier/URISchemeRegistry.php';
 
+HTMLPurifier_ConfigDef::define(
+    'URI', 'DefaultScheme', 'http',
+    'Defines through what scheme the output will be served, in order to '.
+    'select the proper object validator when no scheme information is present.'
+);
+
 class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
 {
+
+    /**
+     * Validates a URI and rebuilds it from its cleaned-up components.
+     * @param $uri    Raw attribute value, parsed as CDATA before matching.
+     * @param $config Optional config object; a default one is created when
+     *                omitted. Its URI.DefaultScheme directive picks the
+     *                scheme object for URIs that carry no scheme.
+     * @return Rebuilt URI string, or '' if the URI does not parse or no
+     *         scheme object is registered for it.
+     */
     function validate($uri, $config = null) {
 
         // We'll write stack-based parsers later, for now, use regexps to
         // get things working as fast as possible (irony)
 
+        if (!$config) $config = HTMLPurifier_Config::createDefault();
+
         // parse as CDATA
         $uri = $this->parseCDATA($uri);
 
         // while it would be nice to use parse_url(), that's specifically
         // for HTTP and thus won't work for our generic URI parsing
 
-        // define regexps
-
-        $HEXDIG = '[A-Fa-f0-9]';
-        $unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with []
-        $sub_delims = '!$&\'()'; // needs []
-        $pct_encoded = "%$HEXDIG$HEXDIG";
-        $h16 = "{$HEXDIG}{1,4}";
-        $dec_octet = '(?:25[0-5]|2[0-4]\d|1\d\d|1\d|[0-9])';
-        $IPv4address = "$dec_octet.$dec_octet.$dec_octet.$dec_octet";
-        $ls32 = "(?:$h16:$h16|$IPv4address)";
-        $IPvFuture = "v$HEXDIG+\.[:$unreserved$sub_delims]+";
-        $IPv6Address = "(?:".
-            "(?:$h16:){6}$ls32" .
-            "|::(?:$h16:){5}$ls32" .
-            "|(?:$h16)?::(?:$h16:){4}$ls32" .
-            "|(?:(?:$h16:){1}$h16)?::(?:$h16:){3}$ls32" .
-            "|(?:(?:$h16:){2}$h16)?::(?:$h16:){2}$ls32" .
-            "|(?:(?:$h16:){3}$h16)?::(?:$h16:){1}$ls32" .
-            "|(?:(?:$h16:){4}$h16)?::$ls32" .
-            "|(?:(?:$h16:){5}$h16)?::$h16" .
-            "|(?:(?:$h16:){6}$h16)?::" .
-            ")";
-        $IP_literal = "\[(?:$IPvFuture|$IPv6Address)\]";
-
-        // the important regexps, the collide with other names, prefix with r_
-
-        $r_userinfo = "(?:[$unreserved$sub_delims:]|$pct_encoded)*";
-        $r_authority = "/^(($r_userinfo)@)?(\[$IP_literal\]|[^:]*)(:(\d*))?/";
-
         // according to the RFC... (but this cuts corners, i.e. non-validating)
         $r_URI = '!^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?!';
         //          12            3  4          5       6  7        8 9
 
-        $matches = array();
+        $matches = array();
         $result = preg_match($r_URI, $uri, $matches);
 
-        if (!$result) return ''; // wow, that's very strange
+        if (!$result) return '';
 
         // seperate out parts
         $scheme = !empty($matches[1]) ? $matches[2] : null;
@@ -61,43 +50,104 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
         $query = !empty($matches[6]) ? $matches[7] : null;
         $fragment = !empty($matches[8]) ? $matches[9] : null;
 
-        // okay, no need to validate the scheme since we do that when we
-        // retrieve the specific scheme object from the registry
-        $scheme = ctype_lower($scheme) ? $scheme : strtolower($scheme);
-        $registry = HTMLPurifier_URISchemeRegistry::instance();
-        $scheme_obj = $registry->getScheme($scheme);
-        if (!$scheme_obj) return ''; // invalid scheme, clean it out
+
+        $registry = HTMLPurifier_URISchemeRegistry::instance();
+        if ($scheme !== null) {
+            // no need to validate the scheme's fmt since we do that when we
+            // retrieve the specific scheme object from the registry
+            $scheme = ctype_lower($scheme) ? $scheme : strtolower($scheme);
+            $scheme_obj = $registry->getScheme($scheme);
+        } else {
+            $scheme_obj = $registry->getScheme($config->get('URI', 'DefaultScheme'));
+        }
+        if (!$scheme_obj) return ''; // unknown or unregistered scheme, clean it out
+
+
         if ($authority !== null) {
 
-            // validate authority
-            $matches = array();
+
+            // define regexps
+            // this stuff may need to be factored out so Email can get to it
+
+            $HEXDIG = '[A-Fa-f0-9]';
+            $unreserved = 'A-Za-z0-9-._~'; // make sure you wrap with []
+            $sub_delims = '!$&\'()*+,;='; // needs []
+            $pct_encoded = "%$HEXDIG$HEXDIG";
+            $h16 = "{$HEXDIG}{1,4}";
+            $dec_octet = '(?:25[0-5]|2[0-4]\d|1\d\d|[1-9]\d|\d)';
+            $IPv4address = "$dec_octet\.$dec_octet\.$dec_octet\.$dec_octet";
+            $ls32 = "(?:$h16:$h16|$IPv4address)";
+            $IPvFuture = "v$HEXDIG+\.[:$unreserved$sub_delims]+";
+            $IPv6Address = "(?:".
+                "(?:$h16:){6}$ls32" .
+                "|::(?:$h16:){5}$ls32" .
+                "|(?:$h16)?::(?:$h16:){4}$ls32" .
+                "|(?:(?:$h16:){0,1}$h16)?::(?:$h16:){3}$ls32" .
+                "|(?:(?:$h16:){0,2}$h16)?::(?:$h16:){2}$ls32" .
+                "|(?:(?:$h16:){0,3}$h16)?::(?:$h16:){1}$ls32" .
+                "|(?:(?:$h16:){0,4}$h16)?::$ls32" .
+                "|(?:(?:$h16:){0,5}$h16)?::$h16" .
+                "|(?:(?:$h16:){0,6}$h16)?::" .
+                ")";
+            $IP_literal = "\[(?:$IPvFuture|$IPv6Address)\]";
+            $r_userinfo = "(?:[$unreserved$sub_delims:]|$pct_encoded)*";
+            $r_authority = "/^(($r_userinfo)@)?($IP_literal|[^:]*)(:(\d*))?/"; // $IP_literal carries its own [ ]
+            $matches = array();
             preg_match($r_authority, $authority, $matches);
 
+            // overloads regexp!
             $userinfo = !empty($matches[1]) ? $matches[2] : null;
             $host = !empty($matches[3]) ? $matches[3] : null;
             $port = !empty($matches[4]) ? $matches[5] : null;
 
             // validate port
             if ($port !== null) {
-                if (!ctype_digit($port) || $port < 1 || $port > 65535) {
-                    $port = null;
-                }
+                $port = (int) $port;
+                if ($port < 1 || $port > 65535) $port = null;
             }
 
+            // userinfo and host are validated within the regexp
+
+            // regenerate authority
             $authority =
                 ($userinfo === null ? '' : ($userinfo . '@')) .
                 $host .
                 ($port === null ? '' : (':' . $port));
 
         }
 
+
+        // query and fragment are quite simple in terms of definition:
+        // *( pchar / "/" / "?" ), so define their validation routines
+        // when we start fixing percent encoding
+
+
+
+        // path gets to be validated against a hodge-podge of rules depending
+        // on the status of authority and scheme, but it's not that important,
+        // esp. since it won't be applicable to everyone
+
+
+
+        // okay, now we defer execution to the subobject for more processing
         list($authority, $path, $query, $fragment) =
             $scheme_obj->validateComponents(
                 $authority, $path, $query, $fragment
             );
 
 
+        // reconstruct the result
+        $result = '';
+        if ($scheme !== null) $result .= "$scheme:";
+        if ($authority !== null) $result .= "//$authority";
+        $result .= $path;
+        if ($query !== null) $result .= "?$query";
+        if ($fragment !== null) $result .= "#$fragment";
+
+        return $result;
+
     }
+
 }
 
 ?>
\ No newline at end of file
diff --git a/library/HTMLPurifier/ConfigDef.php b/library/HTMLPurifier/ConfigDef.php
index de548ea0..f001127a 100644
--- a/library/HTMLPurifier/ConfigDef.php
+++ b/library/HTMLPurifier/ConfigDef.php
@@ -7,6 +7,7 @@ class HTMLPurifier_ConfigDef {
     function initialize() {
         $this->defineNamespace('Core', 'Core features that are always available.');
         $this->defineNamespace('Attr', 'Features regarding attribute validation.');
+        $this->defineNamespace('URI', 'Features regarding Uniform Resource Identifiers.');
     }
 
     function &instance($prototype = null) {
diff --git a/tests/HTMLPurifier/AttrDef/URITest.php b/tests/HTMLPurifier/AttrDef/URITest.php
index 32b397fc..c7533074 100644
--- a/tests/HTMLPurifier/AttrDef/URITest.php
+++ b/tests/HTMLPurifier/AttrDef/URITest.php
@@ -110,10 +110,12 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness
         // test invalid port
         $uri[12] = 'http://example.com:foobar';
         $components[12] = array('example.com', '', null, null);
+        $expect_uri[12] = 'http://example.com';
 
         // test overlarge port (max is 65535, although this isn't official)
         $uri[13] = 'http://example.com:65536';
         $components[13] = array('example.com', '', null, null);
+        $expect_uri[13] = 'http://example.com';
 
         // some spec abnf tests
 
@@ -124,21 +126,29 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness
         $components[14] = array(null, '/this/is/path', null, null);
         $expect_uri[14] = 'http:/this/is/path'; // do not munge scheme off
 
+        // scheme munging is not being tested yet, it's an extra feature
+
         // "path-rootless" - this should not be used but is allowed
         $uri[15] = 'http:this/is/path';
         $components[15] = array(null, 'this/is/path', null, null);
-        $expect_uri[15] = 'this/is/path'; // munge scheme off
+        //$expect_uri[15] = 'this/is/path'; // munge scheme off
 
         // "path-empty" - a rather interesting case, remove the scheme
         $uri[16] = 'http:';
         $components[16] = array(null, '', null, null);
-        $expect_uri[16] = ''; // munge scheme off
+        //$expect_uri[16] = ''; // munge scheme off
 
         // test invalid scheme
         $uri[17] = 'javascript:alert("moo");';
         $components[17] = false;
         $expect_uri[17] = '';
 
+        // relative URIs
+
+        // test basic case
+        $uri[18] = '/a/b';
+        $components[18] = array(null, '/a/b', null, null);
+
         foreach ($uri as $i => $value) {
 
             // $fake_registry isn't the real mock, because due to PHP 4 weirdness
@@ -172,7 +182,7 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness
             }
             $result = $def->validate($value);
             $scheme->tally();
-            //$this->assertIdentical($expect_uri[$i], $result);
+            $this->assertIdentical($expect_uri[$i], $result);
 
         }
 