From d28bad648a1c831c821b786aa39fa5f947610e9f Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Sat, 12 Aug 2006 18:58:54 +0000 Subject: [PATCH] Implement URIScheme and subclasses except for mailto. Remove fragment from components, as it is scheme independent. git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@218 48356398-32a2-884e-a903-53898d9a118a --- library/HTMLPurifier/AttrDef/URI.php | 26 +++-- library/HTMLPurifier/URIScheme.php | 9 +- library/HTMLPurifier/URIScheme/ftp.php | 21 ++++ library/HTMLPurifier/URIScheme/http.php | 11 +- library/HTMLPurifier/URIScheme/https.php | 11 ++ library/HTMLPurifier/URIScheme/news.php | 19 ++++ library/HTMLPurifier/URIScheme/nntp.php | 20 ++++ tests/HTMLPurifier/AttrDef/URITest.php | 40 ++++---- tests/HTMLPurifier/URISchemeTest.php | 123 +++++++++++++++++++++++ tests/index.php | 1 + 10 files changed, 250 insertions(+), 31 deletions(-) create mode 100644 library/HTMLPurifier/URIScheme/ftp.php create mode 100644 library/HTMLPurifier/URIScheme/https.php create mode 100644 library/HTMLPurifier/URIScheme/news.php create mode 100644 library/HTMLPurifier/URIScheme/nntp.php create mode 100644 tests/HTMLPurifier/URISchemeTest.php diff --git a/library/HTMLPurifier/AttrDef/URI.php b/library/HTMLPurifier/AttrDef/URI.php index 850e953b..a44efeb6 100644 --- a/library/HTMLPurifier/AttrDef/URI.php +++ b/library/HTMLPurifier/AttrDef/URI.php @@ -49,7 +49,9 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef $scheme_obj =& $registry->getScheme($scheme, $config); if (!$scheme_obj) return ''; // invalid scheme, clean it out } else { - $scheme_obj =& $registry->getScheme($config->get('URI', 'DefaultScheme'), $config); + $scheme_obj =& $registry->getScheme( + $config->get('URI', 'DefaultScheme'), $config + ); } @@ -99,11 +101,8 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef // userinfo and host are validated within the regexp - // regenerate authority - $authority = - ($userinfo === null ? '' : ($userinfo . '@')) . - $host . - ($port === null ? '' : (':' . $port)); + } else { + $port = $host = $userinfo = null; } @@ -120,10 +119,21 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef // okay, now we defer execution to the subobject for more processing - list($authority, $path, $query, $fragment) = - $scheme_obj->validateComponents($authority, $path, $query, $fragment); + // note that $fragment is omitted + list($userinfo, $host, $port, $path, $query) = + $scheme_obj->validateComponents( + $userinfo, $host, $port, $path, $query, $config + ); + // reconstruct authority + $authority = null; + if (!is_null($userinfo) || !is_null($host) || !is_null($port)) { + $authority = ''; + if($userinfo !== null) $authority .= $userinfo . '@'; + $authority .= $host; + if($port !== null) $authority .= ':' . $port; + } // reconstruct the result $result = ''; diff --git a/library/HTMLPurifier/URIScheme.php b/library/HTMLPurifier/URIScheme.php index c6c12f53..820182f5 100644 --- a/library/HTMLPurifier/URIScheme.php +++ b/library/HTMLPurifier/URIScheme.php @@ -3,8 +3,13 @@ class HTMLPurifier_URIScheme { - function validateComponents($authority, $path, $query, $fragment) { - return array($authority, $path, $query, $fragment); + var $default_port = null; + + function validateComponents( + $userinfo, $host, $port, $path, $query, $config + ) { + if ($this->default_port == $port) $port = null; + return array($userinfo, $host, $port, $path, $query); } } diff --git a/library/HTMLPurifier/URIScheme/ftp.php b/library/HTMLPurifier/URIScheme/ftp.php new file mode 100644 index 00000000..df7837ee --- /dev/null +++ b/library/HTMLPurifier/URIScheme/ftp.php @@ -0,0 +1,21 @@ + \ No newline at end of file diff --git a/library/HTMLPurifier/URIScheme/http.php b/library/HTMLPurifier/URIScheme/http.php index 946e29ba..4ea0c706 100644 --- a/library/HTMLPurifier/URIScheme/http.php +++ b/library/HTMLPurifier/URIScheme/http.php @@ -4,8 +4,15 @@ require_once 'HTMLPurifier/URIScheme.php'; class HTMLPurifier_URIScheme_http extends HTMLPurifier_URIScheme { - function validateComponents($authority, $path, $query, $fragment) { - + var $default_port = 80; + + function validateComponents( + $userinfo, $host, $port, $path, $query, $config + ) { + list($userinfo, $host, $port, $path, $query) = + parent::validateComponents( + $userinfo, $host, $port, $path, $query, $config ); + return array(null, $host, $port, $path, $query); } } diff --git a/library/HTMLPurifier/URIScheme/https.php b/library/HTMLPurifier/URIScheme/https.php new file mode 100644 index 00000000..6798eeab --- /dev/null +++ b/library/HTMLPurifier/URIScheme/https.php @@ -0,0 +1,11 @@ + \ No newline at end of file diff --git a/library/HTMLPurifier/URIScheme/news.php b/library/HTMLPurifier/URIScheme/news.php new file mode 100644 index 00000000..f78f2e35 --- /dev/null +++ b/library/HTMLPurifier/URIScheme/news.php @@ -0,0 +1,19 @@ + \ No newline at end of file diff --git a/library/HTMLPurifier/URIScheme/nntp.php b/library/HTMLPurifier/URIScheme/nntp.php new file mode 100644 index 00000000..d5ce7d90 --- /dev/null +++ b/library/HTMLPurifier/URIScheme/nntp.php @@ -0,0 +1,20 @@ + \ No newline at end of file diff --git a/tests/HTMLPurifier/AttrDef/URITest.php b/tests/HTMLPurifier/AttrDef/URITest.php index 13c3153d..bfc2b48c 100644 --- a/tests/HTMLPurifier/AttrDef/URITest.php +++ b/tests/HTMLPurifier/AttrDef/URITest.php @@ -33,21 +33,22 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness // test a regular instance, return identical URI $uri[0] = 'http://www.example.com/webhp?q=foo#result2'; $components[0] = array( - 'www.example.com', // authority + null, // userinfo + 'www.example.com', // host + null, // port '/webhp', // path - 'q=foo', // query - 'result2' // fragment + 'q=foo' // query ); // test an amended URI (the actual logic is irrelevant) // test that user and port get parsed correctly (3.2.1 and 3.2.3) $uri[1] = 'http://user@authority.part:80/now/the/path?query#fragment'; $components[1] = array( - 'user@authority.part:80', // yes, user+port are part of authority - '/now/the/path', 'query', 'fragment' + 'user', 'authority.part', 80, + '/now/the/path', 'query' ); $return_components[1] = array( // removed port (it's standard) - 'user@authority.part', '/now/the/path', 'query', 'fragment' + 'user', 'authority.part', null, '/now/the/path', 'query' ); $expect_uri[1] = 'http://user@authority.part/now/the/path?query#fragment'; @@ -56,20 +57,20 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness // also test what happens when query/fragment are missing $uri[2] = 'http://en.wikipedia.org/wiki/Clich%C3%A9'; $components[2] = array( - 'en.wikipedia.org', '/wiki/Clich%C3%A9', null, null + null, 'en.wikipedia.org', null, '/wiki/Clich%C3%A9', null ); // test distinction between empty query and undefined query (above) $uri[3] = 'http://www.example.com/?#'; - $components[3] = array( 'www.example.com', '/', '', '' ); + $components[3] = array(null, 'www.example.com', null, '/', ''); // path is always defined, even if empty $uri[4] = 'http://www.example.com'; - $components[4] = array( 'www.example.com', '', null, null ); + $components[4] = array(null, 'www.example.com', null, '', null); // test parsing of an opaque URI $uri[5] = 'mailto:bob@example.com'; - $components[5] = array(null, 'bob@example.com', null, null); + $components[5] = array(null, null, null, 'bob@example.com', null); // even though we don't resolve percent entities, we have to fix // improper percent-encodes. Taken one at a time: @@ -86,12 +87,12 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness // test IPv4 address (behavior may vary with configuration) $uri[7] = 'http://192.0.34.166/'; - $components[7] = array('192.0.34.166', '/', null, null); + $components[7] = array(null, '192.0.34.166', null, '/', null); // while it may look like an IPv4 address, it's really a reg-name. // don't destroy it $uri[8] = 'http://333.123.32.123/'; - $components[8] = array('333.123.32.123', '/', null, null); + $components[8] = array(null, '333.123.32.123', null, '/', null); // test IPv6 address, using amended form of RFC's example //$uri[9] = 'http://[2001:db8::7]/c=GB?objectClass?one'; @@ -105,7 +106,7 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness // break the RFC a little and allow international characters // WARNING: UTF-8 encoded! $uri[10] = 'http://tūdaliņ.lv'; - $components[10] = array('tūdaliņ.lv', '', null, null); + $components[10] = array(null, 'tūdaliņ.lv', null, '', null); // test invalid IPv6 address and invalid reg-name //$uri[11] = 'http://[2001:0db8:85z3:08d3:1319:8a2e:0370:7334]'; @@ -113,12 +114,12 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness // test invalid port $uri[12] = 'http://example.com:foobar'; - $components[12] = array('example.com', '', null, null); + $components[12] = array(null, 'example.com', null, '', null); $expect_uri[12] = 'http://example.com'; // test overlarge port (max is 65535, although this isn't official) $uri[13] = 'http://example.com:65536'; - $components[13] = array('example.com', '', null, null); + $components[13] = array(null, 'example.com', null, '', null); $expect_uri[13] = 'http://example.com'; // some spec abnf tests @@ -127,19 +128,19 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness // "path-absolute", note this is different from path-rootless $uri[14] = 'http:/this/is/path'; - $components[14] = array(null, '/this/is/path', null, null); + $components[14] = array(null, null, null, '/this/is/path', null); $expect_uri[14] = 'http:/this/is/path'; // do not munge scheme off // scheme munging is not being tested yet, it's an extra feature // "path-rootless" - this should not be used but is allowed $uri[15] = 'http:this/is/path'; - $components[15] = array(null, 'this/is/path', null, null); + $components[15] = array(null, null, null, 'this/is/path', null); //$expect_uri[15] = 'this/is/path'; // munge scheme off // "path-empty" - a rather interesting case, remove the scheme $uri[16] = 'http:'; - $components[16] = array(null, '', null, null); + $components[16] = array(null, null, null, '', null); //$expect_uri[16] = ''; // munge scheme off // test invalid scheme, components shouldn't be passed @@ -150,7 +151,7 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness // test basic case $uri[18] = '/a/b'; - $components[18] = array(null, '/a/b', null, null); + $components[18] = array(null, null, null, '/a/b', null); foreach ($uri as $i => $value) { @@ -204,6 +205,7 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness if ($this->components === false) { $this->scheme->expectNever('validateComponents'); } else { + $this->components[] = $this->config; // append the configuration $this->scheme->setReturnValue( 'validateComponents', $this->return_components, $this->components); $this->scheme->expectOnce('validateComponents', $this->components); diff --git a/tests/HTMLPurifier/URISchemeTest.php b/tests/HTMLPurifier/URISchemeTest.php new file mode 100644 index 00000000..157ee956 --- /dev/null +++ b/tests/HTMLPurifier/URISchemeTest.php @@ -0,0 +1,123 @@ +assertIdentical( + $scheme->validateComponents( + null, 'www.example.com', null, '/', 's=foobar', $config), + array(null, 'www.example.com', null, '/', 's=foobar') + ); + + // absorb default port and userinfo + $this->assertIdentical( + $scheme->validateComponents( + 'user', 'www.example.com', 80, '/', 's=foobar', $config), + array(null, 'www.example.com', null, '/', 's=foobar') + ); + + // do not absorb non-default port + $this->assertIdentical( + $scheme->validateComponents( + null, 'www.example.com', 8080, '/', 's=foobar', $config), + array(null, 'www.example.com', 8080, '/', 's=foobar') + ); + + // https is basically the same + + $scheme = new HTMLPurifier_URIScheme_https(); + $this->assertIdentical( + $scheme->validateComponents( + 'user', 'www.example.com', 443, '/', 's=foobar', $config), + array(null, 'www.example.com', null, '/', 's=foobar') + ); + + } + + function test_ftp() { + + $scheme = new HTMLPurifier_URIScheme_ftp(); + $config = HTMLPurifier_Config::createDefault(); + $this->assertIdentical( + $scheme->validateComponents( + 'user', 'www.example.com', 21, '/', 's=foobar', $config), + array('user', 'www.example.com', null, '/', null) + ); + + } + + function test_news() { + + $scheme = new HTMLPurifier_URIScheme_news(); + $config = HTMLPurifier_Config::createDefault(); + + $this->assertIdentical( + $scheme->validateComponents( + null, null, null, 'gmane.science.linguistics', null, $config), + array(null, null, null, 'gmane.science.linguistics', null) + ); + + $this->assertIdentical( + $scheme->validateComponents( + null, null, null, '642@eagle.ATT.COM', null, $config), + array(null, null, null, '642@eagle.ATT.COM', null) + ); + + // test invalid field removal + $this->assertIdentical( + $scheme->validateComponents( + 'user', 'www.google.com', 80, 'rec.music', 'path=foo', $config), + array(null, null, null, 'rec.music', null) + ); + + } + + function test_nntp() { + + $scheme = new HTMLPurifier_URIScheme_nntp(); + $config = HTMLPurifier_Config::createDefault(); + + $this->assertIdentical( + $scheme->validateComponents( + null, 'news.example.com', null, '/alt.misc/12345', null, $config), + array(null, 'news.example.com', null, '/alt.misc/12345', null) + ); + + + $this->assertIdentical( + $scheme->validateComponents( + 'user', 'news.example.com', 119, '/alt.misc/12345', 'foo=asdf', $config), + array(null, 'news.example.com', null, '/alt.misc/12345', null) + ); + } + + // mailto currently isn't implemented yet + function non_test_mailto() { + + $scheme = new HTMLPurifier_URIScheme_mailto(); + $config = HTMLPurifier_Config::createDefault(); + + $this->assertIdentical( + $scheme->validateComponents( + null, null, null, 'bob@example.com', null, $config), + array(null, null, null, 'bob@example.com', null) + ); + + } + +} + +?> \ No newline at end of file diff --git a/tests/index.php b/tests/index.php index 0d2ee800..103b08b1 100644 --- a/tests/index.php +++ b/tests/index.php @@ -68,6 +68,7 @@ $test_files[] = 'TagTransformTest.php'; $test_files[] = 'AttrTransform/LangTest.php'; $test_files[] = 'AttrTransform/TextAlignTest.php'; $test_files[] = 'URISchemeRegistryTest.php'; +$test_files[] = 'URISchemeTest.php'; $test_file_lookup = array_flip($test_files);