From 797b89930520c58198e846fd5c8db36828ea17f6 Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Wed, 1 Aug 2007 18:34:46 +0000 Subject: [PATCH] [2.1.0] Create new URI object and migrate URI validation systems to use it. URIScheme interface changed. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1334 48356398-32a2-884e-a903-53898d9a118a --- NEWS | 3 + library/HTMLPurifier/AttrDef/URI.php | 163 +++----------- library/HTMLPurifier/URI.php | 153 +++++++++++++ library/HTMLPurifier/URIParser.php | 8 +- library/HTMLPurifier/URIScheme.php | 16 +- library/HTMLPurifier/URIScheme/ftp.php | 29 ++- library/HTMLPurifier/URIScheme/http.php | 11 +- library/HTMLPurifier/URIScheme/mailto.php | 13 +- library/HTMLPurifier/URIScheme/news.php | 14 +- library/HTMLPurifier/URIScheme/nntp.php | 12 +- tests/HTMLPurifier/AttrDef/URITest.php | 220 +----------------- tests/HTMLPurifier/Harness.php | 10 + tests/HTMLPurifier/URIParserTest.php | 4 +- tests/HTMLPurifier/URISchemeTest.php | 260 +++++++++++----------- tests/HTMLPurifier/URITest.php | 240 ++++++++++++++++++++ tests/test_files.php | 1 + 16 files changed, 623 insertions(+), 534 deletions(-) create mode 100644 library/HTMLPurifier/URI.php create mode 100644 tests/HTMLPurifier/URITest.php diff --git a/NEWS b/NEWS index 007a3fb7..27c3188e 100644 --- a/NEWS +++ b/NEWS @@ -51,6 +51,9 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier . URI scheme is munged off if there is no authority and the scheme is the default one . All unit tests inherit from HTMLPurifier_Harness, not UnitTestCase +. Interface for URIScheme changed +. Generic URI object to hold components of URI added, most systems involved + in URI validation have been migrated to use it 2.0.1, released 2007-06-27 ! 
Tag auto-closing now based on a ChildDef heuristic rather than a diff --git a/library/HTMLPurifier/AttrDef/URI.php b/library/HTMLPurifier/AttrDef/URI.php index 8c76fbf3..4f383c57 100644 --- a/library/HTMLPurifier/AttrDef/URI.php +++ b/library/HTMLPurifier/AttrDef/URI.php @@ -93,170 +93,59 @@ HTMLPurifier_ConfigSchema::defineAlias('Attr', 'DisableURI', 'URI', 'Disable'); class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef { - var $host, $parser; - var $embeds_resource; + var $parser, $percentEncoder; + var $embedsResource; /** * @param $embeds_resource_resource Does the URI here result in an extra HTTP request? */ function HTMLPurifier_AttrDef_URI($embeds_resource = false) { - $this->host = new HTMLPurifier_AttrDef_URI_Host(); $this->parser = new HTMLPurifier_URIParser(); - $this->embeds_resource = (bool) $embeds_resource; + $this->percentEncoder = new HTMLPurifier_PercentEncoder(); + $this->embedsResource = (bool) $embeds_resource; } function validate($uri, $config, &$context) { - static $PercentEncoder = null; - if ($PercentEncoder === null) $PercentEncoder = new HTMLPurifier_PercentEncoder(); - if ($config->get('URI', 'Disable')) return false; // initial operations $uri = $this->parseCDATA($uri); - $uri = $PercentEncoder->normalize($uri); + $uri = $this->percentEncoder->normalize($uri); // parse the URI - $parsed_uri = $this->parser->parse($uri); - if ($parsed_uri === false) return false; - list($scheme, $userinfo, $host, $port, $path, $query, $fragment) = $parsed_uri; + $uri = $this->parser->parse($uri); + if ($uri === false) return false; - // retrieve the scheme object - $registry =& HTMLPurifier_URISchemeRegistry::instance(); - $default_scheme = $config->get('URI', 'DefaultScheme'); - if ($scheme !== null) { - // no need to validate the scheme's fmt since we do that when we - // retrieve the specific scheme object from the registry - $scheme = ctype_lower($scheme) ? 
$scheme : strtolower($scheme); - $scheme_obj = $registry->getScheme($scheme, $config, $context); - if (!$scheme_obj) return false; // invalid scheme, clean it out - } else { - // no scheme: retrieve the default one - $scheme_obj = $registry->getScheme($default_scheme, $config, $context); - if (!$scheme_obj) { - // something funky happened to the default scheme object - trigger_error( - 'Default scheme object "' . $config->get('URI', 'DefaultScheme') . '" was not readable', - E_USER_WARNING - ); - return false; - } - } - if ($this->embeds_resource && !$scheme_obj->browsable) { - // the URI we're processing embeds_resource a resource in the - // page, but the URI it references cannot be physically retrieved - return false; - } + // generic validation + $context->register('EmbeddedURI', $this->embedsResource); // flag + $result = $uri->validate($config, $context); + $context->destroy('EmbeddedURI'); + if (!$result) return false; - // validate host - if ($host !== null) { - // remove URI if it's absolute and we disabled externals or - // if it's absolute and embedded and we disabled external resources - unset($our_host); - if ( - $config->get('URI', 'DisableExternal') || - ( - $config->get('URI', 'DisableExternalResources') && - $this->embeds_resource - ) - ) { - $our_host = $config->get('URI', 'Host'); - if ($our_host === null) return false; - } - $host = $this->host->validate($host, $config, $context); - if ($host === false) $host = null; - - // check host against blacklist - if ($this->checkBlacklist($host, $config, $context)) return false; - - // more lenient absolute checking - if (isset($our_host)) { - $host_parts = array_reverse(explode('.', $host)); - // could be cached - $our_host_parts = array_reverse(explode('.', $our_host)); - foreach ($our_host_parts as $i => $discard) { - if (!isset($host_parts[$i])) return false; - if ($host_parts[$i] != $our_host_parts[$i]) return false; - } - } - } + // scheme-specific validation + $scheme_obj = 
$uri->getSchemeObj($config, $context); + if (!$scheme_obj) return false; + if ($this->embedsResource && !$scheme_obj->browsable) return false; + $result = $scheme_obj->validate($uri, $config, $context); + if (!$result) return false; - // validate port - if ($port !== null) { - if ($port < 1 || $port > 65535) $port = null; - } - - - // query and fragment are quite simple in terms of definition: - // *( pchar / "/" / "?" ), so define their validation routines - // when we start fixing percent encoding - - - - // path gets to be validated against a hodge-podge of rules depending - // on the status of authority and scheme, but it's not that important, - // esp. since it won't be applicable to everyone - - - - // okay, now we defer execution to the subobject for more processing - // note that $fragment is omitted - list($userinfo, $host, $port, $path, $query) = - $scheme_obj->validateComponents( - $userinfo, $host, $port, $path, $query, $config, $context - ); - - - // reconstruct authority - $authority = null; - if (!is_null($userinfo) || !is_null($host) || !is_null($port)) { - $authority = ''; - if($userinfo !== null) $authority .= $userinfo . '@'; - $authority .= $host; - if($port !== null) $authority .= ':' . 
$port; - } else { - if ($default_scheme == $scheme) $scheme = null; // munge scheme off when unnecessary - } - - // reconstruct the result - $result = ''; - if ($scheme !== null) $result .= "$scheme:"; - if ($authority !== null) $result .= "//$authority"; - $result .= $path; - if ($query !== null) $result .= "?$query"; - if ($fragment !== null) $result .= "#$fragment"; + // back to string + $result = $uri->toString(); // munge if necessary - $munge = $config->get('URI', 'Munge'); - if (!empty($scheme_obj->browsable) && $munge !== null) { - if ($authority !== null) { - $result = str_replace('%s', rawurlencode($result), $munge); - } + if ( + !is_null($uri->host) && // indicator for authority + !empty($scheme_obj->browsable) && + !is_null($munge = $config->get('URI', 'Munge')) + ) { + $result = str_replace('%s', rawurlencode($result), $munge); } return $result; } - /** - * Checks a host against an array blacklist - * @param $host Host to check - * @param $config HTMLPurifier_Config instance - * @param $context HTMLPurifier_Context instance - * @return bool Is spam? - */ - function checkBlacklist($host, &$config, &$context) { - $blacklist = $config->get('URI', 'HostBlacklist'); - if (!empty($blacklist)) { - foreach($blacklist as $blacklisted_host_fragment) { - if (strpos($host, $blacklisted_host_fragment) !== false) { - return true; - } - } - } - return false; - } - } diff --git a/library/HTMLPurifier/URI.php b/library/HTMLPurifier/URI.php new file mode 100644 index 00000000..968471b2 --- /dev/null +++ b/library/HTMLPurifier/URI.php @@ -0,0 +1,153 @@ +scheme = is_null($scheme) || ctype_lower($scheme) ? $scheme : strtolower($scheme); + $this->userinfo = $userinfo; + $this->host = $host; + $this->port = is_null($port) ? 
$port : (int) $port; + $this->path = $path; + $this->query = $query; + $this->fragment = $fragment; + } + + function getSchemeObj($config, &$context) { + $registry =& HTMLPurifier_URISchemeRegistry::instance(); + if ($this->scheme !== null) { + $scheme_obj = $registry->getScheme($this->scheme, $config, $context); + if (!$scheme_obj) return false; // invalid scheme, clean it out + } else { + // no scheme: retrieve the default one + $scheme_obj = $registry->getScheme($config->get('URI', 'DefaultScheme'), $config, $context); + if (!$scheme_obj) { + // something funky happened to the default scheme object + trigger_error( + 'Default scheme object "' . $config->get('URI', 'DefaultScheme') . '" was not readable', + E_USER_WARNING + ); + return false; + } + } + return $scheme_obj; + } + + /** + * Generic validation method applicable for all schemes + */ + function validate($config, &$context) { + + // validate host + if (!is_null($this->host)) { + // remove URI if it's absolute and we disabled externals or + // if it's absolute and embedded and we disabled external resources + unset($our_host); // ensure this variable is not set + if ( + $config->get('URI', 'DisableExternal') || + ( + $config->get('URI', 'DisableExternalResources') && + $context->get('EmbeddedURI', true) // suppress errors + ) + ) { + $our_host = $config->get('URI', 'Host'); + if ($our_host === null) return false; + } + $host_def = new HTMLPurifier_AttrDef_URI_Host(); + $this->host = $host_def->validate($this->host, $config, $context); + if ($this->host === false) $this->host = null; + + // check host against blacklist + if ($this->checkBlacklist($this->host, $config, $context)) return false; + + // more lenient absolute checking + if (isset($our_host)) { + $host_parts = array_reverse(explode('.', $this->host)); + // could be cached + $our_host_parts = array_reverse(explode('.', $our_host)); + foreach ($our_host_parts as $i => $discard) { + if (!isset($host_parts[$i])) return false; + if ($host_parts[$i] 
!= $our_host_parts[$i]) return false; + } + } + } + + // munge scheme off if necessary + if (!is_null($this->scheme) && is_null($this->host)) { + if ($config->get('URI', 'DefaultScheme') == $this->scheme) { + $this->scheme = null; + } + } + + // validate port + if (!is_null($this->port)) { + if ($this->port < 1 || $this->port > 65535) $this->port = null; + } + + // query and fragment are quite simple in terms of definition: + // *( pchar / "/" / "?" ), so define their validation routines + // when we start fixing percent encoding + + // path gets to be validated against a hodge-podge of rules depending + // on the status of authority and scheme, but it's not that important, + // esp. since it won't be applicable to everyone + + return true; + + } + + /** + * Checks a host against an array blacklist + * @param $host Host to check + * @param $config HTMLPurifier_Config instance + * @param $context HTMLPurifier_Context instance + * @return bool Is spam? + */ + function checkBlacklist($host, $config, &$context) { + $blacklist = $config->get('URI', 'HostBlacklist'); + if (!empty($blacklist)) { + foreach($blacklist as $blacklisted_host_fragment) { + if (strpos($host, $blacklisted_host_fragment) !== false) { + return true; + } + } + } + return false; + } + + /** + * Convert URI back to string + */ + function toString() { + // reconstruct authority + $authority = null; + if (!is_null($this->host)) { + $authority = ''; + if(!is_null($this->userinfo)) $authority .= $this->userinfo . '@'; + $authority .= $this->host; + if(!is_null($this->port)) $authority .= ':' . $this->port; + } + + // reconstruct the result + $result = ''; + if (!is_null($this->scheme)) $result .= $this->scheme . ':'; + if (!is_null($authority)) $result .= '//' . $authority; + $result .= $this->path; + if (!is_null($this->query)) $result .= '?' . $this->query; + if (!is_null($this->fragment)) $result .= '#' . 
$this->fragment; + + return $result; + } + +} + diff --git a/library/HTMLPurifier/URIParser.php b/library/HTMLPurifier/URIParser.php index 44a24440..dff7e28e 100644 --- a/library/HTMLPurifier/URIParser.php +++ b/library/HTMLPurifier/URIParser.php @@ -1,8 +1,11 @@ default_port == $port) $port = null; - return array($userinfo, $host, $port, $path, $query); + function validate(&$uri, $config, &$context) { + if ($this->default_port == $uri->port) $uri->port = null; + return true; } } diff --git a/library/HTMLPurifier/URIScheme/ftp.php b/library/HTMLPurifier/URIScheme/ftp.php index 3dbb1446..950fe032 100644 --- a/library/HTMLPurifier/URIScheme/ftp.php +++ b/library/HTMLPurifier/URIScheme/ftp.php @@ -10,34 +10,33 @@ class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme { var $default_port = 21; var $browsable = true; // usually - function validateComponents( - $userinfo, $host, $port, $path, $query, $config, &$context - ) { - list($userinfo, $host, $port, $path, $query) = - parent::validateComponents( - $userinfo, $host, $port, $path, $query, $config, $context ); - $semicolon_pos = strrpos($path, ';'); // reverse + function validate(&$uri, $config, &$context) { + parent::validate($uri, $config, $context); + $uri->query = null; + + // typecode check + $semicolon_pos = strrpos($uri->path, ';'); // reverse if ($semicolon_pos !== false) { - // typecode check - $type = substr($path, $semicolon_pos + 1); // no semicolon - $path = substr($path, 0, $semicolon_pos); + $type = substr($uri->path, $semicolon_pos + 1); // no semicolon + $uri->path = substr($uri->path, 0, $semicolon_pos); $type_ret = ''; if (strpos($type, '=') !== false) { // figure out whether or not the declaration is correct list($key, $typecode) = explode('=', $type, 2); if ($key !== 'type') { // invalid key, tack it back on encoded - $path .= '%3B' . $type; + $uri->path .= '%3B' . 
$type; } elseif ($typecode === 'a' || $typecode === 'i' || $typecode === 'd') { $type_ret = ";type=$typecode"; } } else { - $path .= '%3B' . $type; + $uri->path .= '%3B' . $type; } - $path = str_replace(';', '%3B', $path); - $path .= $type_ret; + $uri->path = str_replace(';', '%3B', $uri->path); + $uri->path .= $type_ret; } - return array($userinfo, $host, $port, $path, null); + + return true; } } diff --git a/library/HTMLPurifier/URIScheme/http.php b/library/HTMLPurifier/URIScheme/http.php index 18a1cf87..262e2bd9 100644 --- a/library/HTMLPurifier/URIScheme/http.php +++ b/library/HTMLPurifier/URIScheme/http.php @@ -10,13 +10,10 @@ class HTMLPurifier_URIScheme_http extends HTMLPurifier_URIScheme { var $default_port = 80; var $browsable = true; - function validateComponents( - $userinfo, $host, $port, $path, $query, $config, &$context - ) { - list($userinfo, $host, $port, $path, $query) = - parent::validateComponents( - $userinfo, $host, $port, $path, $query, $config, $context ); - return array(null, $host, $port, $path, $query); + function validate(&$uri, $config, &$context) { + parent::validate($uri, $config, $context); + $uri->userinfo = null; + return true; } } diff --git a/library/HTMLPurifier/URIScheme/mailto.php b/library/HTMLPurifier/URIScheme/mailto.php index 8e552f5c..f6acc6af 100644 --- a/library/HTMLPurifier/URIScheme/mailto.php +++ b/library/HTMLPurifier/URIScheme/mailto.php @@ -15,14 +15,13 @@ class HTMLPurifier_URIScheme_mailto extends HTMLPurifier_URIScheme { var $browsable = false; - function validateComponents( - $userinfo, $host, $port, $path, $query, $config, &$context - ) { - list($userinfo, $host, $port, $path, $query) = - parent::validateComponents( - $userinfo, $host, $port, $path, $query, $config, $context ); + function validate(&$uri, $config, &$context) { + parent::validate($uri, $config, $context); + $uri->userinfo = null; + $uri->host = null; + $uri->port = null; // we need to validate path against RFC 2368's addr-spec - return 
array(null, null, null, $path, $query); + return true; } } diff --git a/library/HTMLPurifier/URIScheme/news.php b/library/HTMLPurifier/URIScheme/news.php index 7b81834f..87bda63c 100644 --- a/library/HTMLPurifier/URIScheme/news.php +++ b/library/HTMLPurifier/URIScheme/news.php @@ -9,14 +9,14 @@ class HTMLPurifier_URIScheme_news extends HTMLPurifier_URIScheme { var $browsable = false; - function validateComponents( - $userinfo, $host, $port, $path, $query, $config, &$context - ) { - list($userinfo, $host, $port, $path, $query) = - parent::validateComponents( - $userinfo, $host, $port, $path, $query, $config, $context ); + function validate(&$uri, $config, &$context) { + parent::validate($uri, $config, $context); + $uri->userinfo = null; + $uri->host = null; + $uri->port = null; + $uri->query = null; // typecode check needed on path - return array(null, null, null, $path, null); + return true; } } diff --git a/library/HTMLPurifier/URIScheme/nntp.php b/library/HTMLPurifier/URIScheme/nntp.php index 8f513419..caa85b26 100644 --- a/library/HTMLPurifier/URIScheme/nntp.php +++ b/library/HTMLPurifier/URIScheme/nntp.php @@ -10,13 +10,11 @@ class HTMLPurifier_URIScheme_nntp extends HTMLPurifier_URIScheme { var $default_port = 119; var $browsable = false; - function validateComponents( - $userinfo, $host, $port, $path, $query, $config, &$context - ) { - list($userinfo, $host, $port, $path, $query) = - parent::validateComponents( - $userinfo, $host, $port, $path, $query, $config, $context ); - return array(null, $host, $port, $path, null); + function validate(&$uri, $config, &$context) { + parent::validate($uri, $config, $context); + $uri->userinfo = null; + $uri->query = null; + return true; } } diff --git a/tests/HTMLPurifier/AttrDef/URITest.php b/tests/HTMLPurifier/AttrDef/URITest.php index 33d058c5..d6f55ccd 100644 --- a/tests/HTMLPurifier/AttrDef/URITest.php +++ b/tests/HTMLPurifier/AttrDef/URITest.php @@ -3,162 +3,17 @@ require_once 'HTMLPurifier/AttrDefHarness.php'; 
require_once 'HTMLPurifier/AttrDef/URI.php'; -// WARNING: INCOMPLETE UNIT TESTS! -// we also need to test all the configuration directives defined by this class - -// http: is returned quite often when a URL is invalid. We have to change -// this behavior to just a plain old "FALSE"! - +/** + * @todo Aim for complete code coverage with mocks + */ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness { - var $scheme, $components, $return_components; - - var $oldRegistry; - function setUp() { - // setup ensures that any twiddling around with the registry is reverted - $this->oldRegistry = HTMLPurifier_URISchemeRegistry::instance(); - $this->def = new HTMLPurifier_AttrDef_URI(); // default + $this->def = new HTMLPurifier_AttrDef_URI(); parent::setUp(); } - function tearDown() { - HTMLPurifier_URISchemeRegistry::instance($this->oldRegistry); - } - - function &generateSchemeMock($scheme_names = array('http', 'mailto')) { - generate_mock_once('HTMLPurifier_URIScheme'); - generate_mock_once('HTMLPurifier_URISchemeRegistry'); - - // load a scheme registry mock to the singleton - $registry =& HTMLPurifier_URISchemeRegistry::instance( - new HTMLPurifier_URISchemeRegistryMock() - ); - - // add a pseudo-scheme to the registry for $scheme_names - $scheme = new HTMLPurifier_URISchemeMock(); - foreach ($scheme_names as $name) { - $registry->setReturnReference('getScheme', $scheme, array($name, '*', '*')); - } - // registry returns false if an invalid scheme is requested - $registry->setReturnValue('getScheme', false, array('*', '*', '*')); - - return $scheme; - } - - // PARSING RELATED TESTS - - function assertParsing($uri, $userinfo, $host, $port, $path, $query, $config = null, $context = null) { - - $this->prepareCommon($config, $context); - $scheme =& $this->generateSchemeMock(); - - // create components parameter list - // Config and Context are wildcards due to PHP4 reference funkiness - $components = array($userinfo, $host, $port, $path, $query, '*', '*'); 
- $scheme->expectOnce('validateComponents', $components); - - $def = new HTMLPurifier_AttrDef_URI(); - $def->validate($uri, $config, $context); - - $scheme->tally(); - - } - - function testParsingImproperPercentEncoding() { - // even though we don't resolve percent entities, we have to fix - // improper percent-encodes. Taken one at a time: - // %56 - V, which is an unreserved character - // %fc - u with an umlaut, normalize to uppercase - // %GJ - invalid characters in entity, encode % - // %5 - prematurely terminated, encode % - // %FC - u with umlaut, correct - // note that Apache doesn't do such fixing, rather, it just claims - // that the browser sent a "Bad Request". See PercentEncoder.php - // for more details - $this->assertParsing( - 'http://www.example.com/%56%fc%GJ%5%FC', - null, 'www.example.com', null, '/V%FC%25GJ%255%FC', null - ); - } - - function testParsingInvalidHostThatLooksLikeIPv6Address() { - $this->assertParsing( - 'http://[2001:0db8:85z3:08d3:1319:8a2e:0370:7334]', - null, null, null, '', null - ); - } - - function testParsingOverLargePort() { - $this->assertParsing( - 'http://example.com:65536', - null, 'example.com', null, '', null - ); - } - - // OUTPUT RELATED TESTS - // scheme is mocked to ensure only the URI is being tested - - function assertOutput($input_uri, $expect_uri, $userinfo, $host, $port, $path, $query, $config = null, $context = null) { - - // prepare mock machinery - $this->prepareCommon($config, $context); - $scheme =& $this->generateSchemeMock(); - $components = array($userinfo, $host, $port, $path, $query); - $scheme->setReturnValue('validateComponents', $components); - - $def = new HTMLPurifier_AttrDef_URI(); - $result_uri = $def->validate($input_uri, $config, $context); - if ($expect_uri === true) $expect_uri = $input_uri; - $this->assertEqual($result_uri, $expect_uri); - - } - - function testOutputRegular() { - $this->assertOutput( - 'http://user@authority.part:8080/now/the/path?query#frag', true, - 'user', 
'authority.part', 8080, '/now/the/path', 'query' - ); - } - - function testOutputEmpty() { - $this->assertOutput( - '', true, - null, null, null, '', null - ); - } - - function testOutputNullPath() { - $this->assertOutput( - '', true, - null, null, null, null, null // usually shouldn't happen - ); - } - - function testOutputPathAbsolute() { - $this->assertOutput( - 'http:/this/is/path', '/this/is/path', - null, null, null, '/this/is/path', null - ); - } - - function testOutputPathRootless() { - $this->assertOutput( - 'http:this/is/path', 'this/is/path', - null, null, null, 'this/is/path', null - ); - } - - function testOutputPathEmpty() { - $this->assertOutput( - 'http:', '', - null, null, null, '', null - ); - } - - // INTEGRATION TESTS - function testIntegration() { $this->assertDef('http://www.google.com/'); $this->assertDef('http:', ''); @@ -170,84 +25,27 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness $this->assertDef('mailto:bob@example.com'); } - function testConfigDisableExternal() { - - $this->def = new HTMLPurifier_AttrDef_URI(); - - $this->config->set('URI', 'DisableExternal', true); - $this->config->set('URI', 'Host', 'sub.example.com'); - - $this->assertDef('/foobar.txt'); - $this->assertDef('http://google.com/', false); - $this->assertDef('http://sub.example.com/alas?foo=asd'); - $this->assertDef('http://example.com/teehee', false); - $this->assertDef('http://www.example.com/#man', false); - $this->assertDef('http://go.sub.example.com/perhaps?p=foo'); - + function testIntegrationWithPercentEncoder() { + $this->assertDef( + 'http://www.example.com/%56%fc%GJ%5%FC', + 'http://www.example.com/V%FC%25GJ%255%FC' + ); } function testEmbeds() { - - // embedded URI $this->def = new HTMLPurifier_AttrDef_URI(true); - $this->assertDef('http://sub.example.com/alas?foo=asd'); $this->assertDef('mailto:foo@example.com', false); - - } - - function testConfigDisableExternalResources() { - - $this->config->set('URI', 'DisableExternalResources', 
true); - - $this->def = new HTMLPurifier_AttrDef_URI(); - $this->assertDef('http://sub.example.com/alas?foo=asd'); - $this->assertDef('/img.png'); - - $this->def = new HTMLPurifier_AttrDef_URI(true); - $this->assertDef('http://sub.example.com/alas?foo=asd', false); - $this->assertDef('/img.png'); - } function testConfigMunge() { - $this->config->set('URI', 'Munge', 'http://www.google.com/url?q=%s'); - $this->assertDef( 'http://www.example.com/', 'http://www.google.com/url?q=http%3A%2F%2Fwww.example.com%2F' ); - $this->assertDef('index.html'); $this->assertDef('javascript:foobar();', false); - - } - - function testBlacklist() { - - $this->config->set('URI', 'HostBlacklist', array('example.com', 'moo')); - - $this->assertDef('foo.txt'); - $this->assertDef('http://www.google.com/example.com/moo'); - - $this->assertDef('http://example.com/#23', false); - $this->assertDef('https://sub.domain.example.com/foobar', false); - $this->assertDef('http://example.com.example.net/?whoo=foo', false); - $this->assertDef('ftp://moo-moo.net/foo/foo/', false); - - } - - function testWhitelist() { - /* unimplemented - $this->config->set('URI', 'HostPolicy', 'DenyAll'); - $this->config->set('URI', 'HostWhitelist', array(null, 'google.com')); - - $this->assertDef('http://example.com/fo/google.com', false); - $this->assertDef('server.txt'); - $this->assertDef('ftp://www.google.com/?t=a'); - $this->assertDef('http://google.com.tricky.spamsite.net', false); - */ } } diff --git a/tests/HTMLPurifier/Harness.php b/tests/HTMLPurifier/Harness.php index ce1bb11e..5cc8c379 100644 --- a/tests/HTMLPurifier/Harness.php +++ b/tests/HTMLPurifier/Harness.php @@ -10,10 +10,20 @@ class HTMLPurifier_Harness extends UnitTestCase parent::UnitTestCase(); } + var $config, $context; + + function setUp() { + list($this->config, $this->context) = $this->createCommon(); + } + function prepareCommon(&$config, &$context) { $config = HTMLPurifier_Config::create($config); if (!$context) $context = new 
HTMLPurifier_Context(); } + function createCommon() { + return array(HTMLPurifier_Config::createDefault(), new HTMLPurifier_Context); + } + } diff --git a/tests/HTMLPurifier/URIParserTest.php b/tests/HTMLPurifier/URIParserTest.php index 1bca977d..370e90ca 100644 --- a/tests/HTMLPurifier/URIParserTest.php +++ b/tests/HTMLPurifier/URIParserTest.php @@ -1,6 +1,7 @@ prepareCommon($config, $context); $parser = new HTMLPurifier_URIParser(); $result = $parser->parse($uri, $config, $context); - $this->assertEqual($result, array($scheme, $userinfo, $host, $port, $path, $query, $fragment)); + $expect = new HTMLPurifier_URI($scheme, $userinfo, $host, $port, $path, $query, $fragment); + $this->assertEqual($result, $expect); } function testRegular() { diff --git a/tests/HTMLPurifier/URISchemeTest.php b/tests/HTMLPurifier/URISchemeTest.php index a00ca0dc..0ace9a21 100644 --- a/tests/HTMLPurifier/URISchemeTest.php +++ b/tests/HTMLPurifier/URISchemeTest.php @@ -1,6 +1,10 @@ assertIdentical( - $scheme->validateComponents( - null, 'www.example.com', null, '/', 's=foobar', $config, $context), - array(null, 'www.example.com', null, '/', 's=foobar') - ); - - // absorb default port and userinfo - $this->assertIdentical( - $scheme->validateComponents( - 'user', 'www.example.com', 80, '/', 's=foobar', $config, $context), - array(null, 'www.example.com', null, '/', 's=foobar') - ); - - // do not absorb non-default port - $this->assertIdentical( - $scheme->validateComponents( - null, 'www.example.com', 8080, '/', 's=foobar', $config, $context), - array(null, 'www.example.com', 8080, '/', 's=foobar') - ); - - // https is basically the same - - $scheme = new HTMLPurifier_URIScheme_https(); - $this->assertIdentical( - $scheme->validateComponents( - 'user', 'www.example.com', 443, '/', 's=foobar', $config, $context), - array(null, 'www.example.com', null, '/', 's=foobar') - ); - + function assertValidation($uri, $expect_uri = true) { + $parser = new HTMLPurifier_URIParser(); + if ($expect_uri 
=== true) $expect_uri = $uri; + $uri = $parser->parse($uri); + if ($expect_uri !== false) { + $expect_uri = $parser->parse($expect_uri); + } + // convenience hack: the scheme should be explicitly specified + $scheme = $uri->getSchemeObj($this->config, $this->context); + $result = $scheme->validate($uri, $this->config, $this->context); + if ($expect_uri !== false) { + $this->assertTrue($result); + $this->assertIdentical($uri, $expect_uri); + } else { + $this->assertFalse($result); + } } - function test_ftp() { - - $scheme = new HTMLPurifier_URIScheme_ftp(); - $config = HTMLPurifier_Config::createDefault(); - $context = new HTMLPurifier_Context(); - - $this->assertIdentical( - $scheme->validateComponents( - 'user', 'www.example.com', 21, '/', 's=foobar', $config, $context), - array('user', 'www.example.com', null, '/', null) - ); - - // valid typecode - $this->assertIdentical( - $scheme->validateComponents( - null, 'www.example.com', null, '/file.txt;type=a', null, $config, $context), - array(null, 'www.example.com', null, '/file.txt;type=a', null) - ); - - // remove invalid typecode - $this->assertIdentical( - $scheme->validateComponents( - null, 'www.example.com', null, '/file.txt;type=z', null, $config, $context), - array(null, 'www.example.com', null, '/file.txt', null) - ); - - // encode errant semicolons - $this->assertIdentical( - $scheme->validateComponents( - null, 'www.example.com', null, '/too;many;semicolons=1', null, $config, $context), - array(null, 'www.example.com', null, '/too%3Bmany%3Bsemicolons=1', null) - ); - - } - - function test_news() { - - $scheme = new HTMLPurifier_URIScheme_news(); - $config = HTMLPurifier_Config::createDefault(); - $context = new HTMLPurifier_Context(); - - $this->assertIdentical( - $scheme->validateComponents( - null, null, null, 'gmane.science.linguistics', null, $config, $context), - array(null, null, null, 'gmane.science.linguistics', null) - ); - - $this->assertIdentical( - $scheme->validateComponents( - null, null, 
null, '642@eagle.ATT.COM', null, $config, $context), - array(null, null, null, '642@eagle.ATT.COM', null) - ); - - // test invalid field removal - $this->assertIdentical( - $scheme->validateComponents( - 'user', 'www.google.com', 80, 'rec.music', 'path=foo', $config, $context), - array(null, null, null, 'rec.music', null) - ); - - } - - function test_nntp() { - - $scheme = new HTMLPurifier_URIScheme_nntp(); - $config = HTMLPurifier_Config::createDefault(); - $context = new HTMLPurifier_Context(); - - $this->assertIdentical( - $scheme->validateComponents( - null, 'news.example.com', null, '/alt.misc/12345', null, $config, $context), - array(null, 'news.example.com', null, '/alt.misc/12345', null) - ); - - - $this->assertIdentical( - $scheme->validateComponents( - 'user', 'news.example.com', 119, '/alt.misc/12345', 'foo=asdf', $config, $context), - array(null, 'news.example.com', null, '/alt.misc/12345', null) + function test_http_regular() { + $this->assertValidation( + 'http://example.com/?s=q#fragment' ); } - function test_mailto() { - - $scheme = new HTMLPurifier_URIScheme_mailto(); - $config = HTMLPurifier_Config::createDefault(); - $context = new HTMLPurifier_Context(); - - $this->assertIdentical( - $scheme->validateComponents( - null, null, null, 'bob@example.com', null, $config, $context), - array(null, null, null, 'bob@example.com', null) + function test_http_removeDefaultPort() { + $this->assertValidation( + 'http://example.com:80', + 'http://example.com' ); - - $this->assertIdentical( - $scheme->validateComponents( - 'user', 'example.com', 80, 'bob@example.com', 'subject=Foo!', $config, $context), - array(null, null, null, 'bob@example.com', 'subject=Foo!') + } + + function test_http_removeUserInfo() { + $this->assertValidation( + 'http://bob@example.com', + 'http://example.com' + ); + } + + function test_http_preserveNonDefaultPort() { + $this->assertValidation( + 'http://example.com:8080' + ); + } + + function test_https_regular() { + 
$this->assertValidation( + 'https://user@example.com:443/?s=q#frag', + 'https://example.com/?s=q#frag' + ); + } + + function test_ftp_regular() { + $this->assertValidation( + 'ftp://user@example.com/path' + ); + } + + function test_ftp_removeDefaultPort() { + $this->assertValidation( + 'ftp://example.com:21', + 'ftp://example.com' + ); + } + + function test_ftp_removeQueryString() { + $this->assertValidation( + 'ftp://example.com?s=q', + 'ftp://example.com' + ); + } + + function test_ftp_preserveValidTypecode() { + $this->assertValidation( + 'ftp://example.com/file.txt;type=a' + ); + } + + function test_ftp_removeInvalidTypecode() { + $this->assertValidation( + 'ftp://example.com/file.txt;type=z', + 'ftp://example.com/file.txt' + ); + } + + function test_ftp_encodeExtraSemicolons() { + $this->assertValidation( + 'ftp://example.com/too;many;semicolons=1', + 'ftp://example.com/too%3Bmany%3Bsemicolons=1' + ); + } + + function test_news_regular() { + $this->assertValidation( + 'news:gmane.science.linguistics' + ); + } + + function test_news_explicit() { + $this->assertValidation( + 'news:642@eagle.ATT.COM' + ); + } + + function test_news_removeNonPathComponents() { + $this->assertValidation( + 'news://user@example.com:80/rec.music?path=foo#frag', + 'news:/rec.music#frag' + ); + } + + function test_nntp_regular() { + $this->assertValidation( + 'nntp://news.example.com/alt.misc/42#frag' + ); + } + + function test_nntp_removalOfRedundantOrUselessComponents() { + $this->assertValidation( + 'nntp://user@news.example.com:119/alt.misc/42?s=q#frag', + 'nntp://news.example.com/alt.misc/42#frag' + ); + } + + function test_mailto_regular() { + $this->assertValidation( + 'mailto:bob@example.com' + ); + } + + function test_mailto_removalOfRedundantOrUselessComponents() { + $this->assertValidation( + 'mailto://user@example.com:80/bob@example.com?subject=Foo#frag', + 'mailto:/bob@example.com?subject=Foo#frag' ); - } } diff --git a/tests/HTMLPurifier/URITest.php 
b/tests/HTMLPurifier/URITest.php new file mode 100644 index 00000000..cebc79cf --- /dev/null +++ b/tests/HTMLPurifier/URITest.php @@ -0,0 +1,240 @@ +parse($uri); + } + + function test_construct() { + $uri1 = new HTMLPurifier_URI('HTTP', 'bob', 'example.com', '23', '/foo', 'bar=2', 'slash'); + $uri2 = new HTMLPurifier_URI('http', 'bob', 'example.com', 23, '/foo', 'bar=2', 'slash'); + $this->assertIdentical($uri1, $uri2); + } + + var $oldRegistry; + + function &setUpSchemeRegistryMock() { + $this->oldRegistry = HTMLPurifier_URISchemeRegistry::instance(); + generate_mock_once('HTMLPurifier_URIScheme'); + generate_mock_once('HTMLPurifier_URISchemeRegistry'); + $registry =& HTMLPurifier_URISchemeRegistry::instance( + new HTMLPurifier_URISchemeRegistryMock() + ); + return $registry; + } + + function &setUpSchemeMock($name) { + $registry =& $this->setUpSchemeRegistryMock(); + $scheme_mock = new HTMLPurifier_URISchemeMock(); + $registry->setReturnValue('getScheme', $scheme_mock, array($name, '*', '*')); + return $scheme_mock; + } + + function setUpNoValidSchemes() { + $registry =& $this->setUpSchemeRegistryMock(); + $registry->setReturnValue('getScheme', false, array('*', '*', '*')); + } + + function tearDownSchemeRegistryMock() { + HTMLPurifier_URISchemeRegistry::instance($this->oldRegistry); + } + + function test_getSchemeObj() { + $scheme_mock =& $this->setUpSchemeMock('http'); + + $uri = $this->createURI('http:'); + $scheme_obj = $uri->getSchemeObj($this->config, $this->context); + $this->assertIdentical($scheme_obj, $scheme_mock); + + $this->tearDownSchemeRegistryMock(); + } + + function test_getSchemeObj_invalidScheme() { + $this->setUpNoValidSchemes(); + + $uri = $this->createURI('http:'); + $result = $uri->getSchemeObj($this->config, $this->context); + $this->assertIdentical($result, false); + + $this->tearDownSchemeRegistryMock(); + } + + function test_getSchemaObj_defaultScheme() { + $scheme = 'foobar'; + + $scheme_mock =& $this->setUpSchemeMock($scheme); + 
$this->config->set('URI', 'DefaultScheme', $scheme); + + $uri = $this->createURI('hmm'); + $scheme_obj = $uri->getSchemeObj($this->config, $this->context); + $this->assertIdentical($scheme_obj, $scheme_mock); + + $this->tearDownSchemeRegistryMock(); + } + + function test_getSchemaObj_invalidDefaultScheme() { + $this->setUpNoValidSchemes(); + $this->config->set('URI', 'DefaultScheme', 'foobar'); + + $uri = $this->createURI('hmm'); + + $this->expectError('Default scheme object "foobar" was not readable'); + $result = $uri->getSchemeObj($this->config, $this->context); + $this->assertIdentical($result, false); + + $this->tearDownSchemeRegistryMock(); + } + + function assertToString($expect_uri, $scheme, $userinfo, $host, $port, $path, $query, $fragment) { + $uri = new HTMLPurifier_URI($scheme, $userinfo, $host, $port, $path, $query, $fragment); + $string = $uri->toString(); + $this->assertIdentical($string, $expect_uri); + } + + function test_toString_full() { + $this->assertToString( + 'http://bob@example.com:300/foo?bar=baz#fragment', + 'http', 'bob', 'example.com', 300, '/foo', 'bar=baz', 'fragment' + ); + } + + function test_toString_scheme() { + $this->assertToString( + 'http:', + 'http', null, null, null, '', null, null + ); + } + + function test_toString_authority() { + $this->assertToString( + '//bob@example.com:8080', + null, 'bob', 'example.com', 8080, '', null, null + ); + } + + function test_toString_path() { + $this->assertToString( + '/path/to', + null, null, null, null, '/path/to', null, null + ); + } + + function test_toString_query() { + $this->assertToString( + '?q=string', + null, null, null, null, '', 'q=string', null + ); + } + + function test_toString_fragment() { + $this->assertToString( + '#fragment', + null, null, null, null, '', null, 'fragment' + ); + } + + function assertValidation($uri, $expect_uri = true) { + if ($expect_uri === true) $expect_uri = $uri; + $uri = $this->createURI($uri); + $result = $uri->validate($this->config, 
$this->context); + if ($expect_uri === false) { + $this->assertFalse($result); + } else { + $this->assertTrue($result); + $this->assertIdentical($uri->toString(), $expect_uri); + } + } + + function test_validate_defaultSchemeRemovedInBlank() { + $this->assertValidation('http:', ''); + } + + function test_validate_defaultSchemeRemovedInRelativeURI() { + $this->assertValidation('http:/foo/bar', '/foo/bar'); + } + + function test_validate_defaultSchemeNotRemovedInAbsoluteURI() { + $this->assertValidation('http://example.com/foo/bar'); + } + + function test_validate_altSchemeNotRemoved() { + $this->assertValidation('mailto:this-looks-like-a-path@example.com'); + } + + function test_validate_overlongPort() { + $this->assertValidation('http://example.com:65536', 'http://example.com'); + } + + function test_validate_zeroPort() { + $this->assertValidation('http://example.com:00', 'http://example.com'); + } + + function test_validate_invalidHostThatLooksLikeIPv6() { + $this->assertValidation('http://[2001:0db8:85z3:08d3:1319:8a2e:0370:7334]', ''); + } + + function test_validate_configDisableExternal() { + + $this->def = new HTMLPurifier_AttrDef_URI(); + + $this->config->set('URI', 'DisableExternal', true); + $this->config->set('URI', 'Host', 'sub.example.com'); + + $this->assertValidation('/foobar.txt'); + $this->assertValidation('http://google.com/', false); + $this->assertValidation('http://sub.example.com/alas?foo=asd'); + $this->assertValidation('http://example.com/teehee', false); + $this->assertValidation('http://www.example.com/#man', false); + $this->assertValidation('http://go.sub.example.com/perhaps?p=foo'); + + } + + function test_validate_configDisableExternalResources() { + + $this->config->set('URI', 'DisableExternalResources', true); + + $this->assertValidation('http://sub.example.com/alas?foo=asd'); + $this->assertValidation('/img.png'); + + $embeds = true; // passed by reference + $this->context->register('EmbeddedURI', $embeds); + 
$this->assertValidation('http://sub.example.com/alas?foo=asd', false); + $this->assertValidation('/img.png'); + + } + + function test_validate_configBlacklist() { + + $this->config->set('URI', 'HostBlacklist', array('example.com', 'moo')); + + $this->assertValidation('foo.txt'); + $this->assertValidation('http://www.google.com/example.com/moo'); + + $this->assertValidation('http://example.com/#23', false); + $this->assertValidation('https://sub.domain.example.com/foobar', false); + $this->assertValidation('http://example.com.example.net/?whoo=foo', false); + $this->assertValidation('ftp://moo-moo.net/foo/foo/', false); + + } + + /* + function test_validate_configWhitelist() { + + $this->config->set('URI', 'HostPolicy', 'DenyAll'); + $this->config->set('URI', 'HostWhitelist', array(null, 'google.com')); + + $this->assertValidation('http://example.com/fo/google.com', false); + $this->assertValidation('server.txt'); + $this->assertValidation('ftp://www.google.com/?t=a'); + $this->assertValidation('http://google.com.tricky.spamsite.net', false); + + } + */ + +} diff --git a/tests/test_files.php b/tests/test_files.php index 845807c7..c3a78120 100644 --- a/tests/test_files.php +++ b/tests/test_files.php @@ -105,6 +105,7 @@ $test_files[] = 'HTMLPurifier/TokenTest.php'; $test_files[] = 'HTMLPurifier/URIParserTest.php'; $test_files[] = 'HTMLPurifier/URISchemeRegistryTest.php'; $test_files[] = 'HTMLPurifier/URISchemeTest.php'; +$test_files[] = 'HTMLPurifier/URITest.php'; $test_files[] = 'HTMLPurifierTest.php'; if (version_compare(PHP_VERSION, '5', '>=')) {