diff --git a/NEWS b/NEWS index 7255569c..f01b4f67 100644 --- a/NEWS +++ b/NEWS @@ -10,6 +10,7 @@ NEWS ( CHANGELOG and HISTORY ) HTMLPurifier 1.2.0, unknown projected release date ! Added MODx plugin +! Added percent encoding normalization - Documentation updated + TODO added request Phalanger + TODO added request Native compression diff --git a/library/HTMLPurifier/AttrDef/URI.php b/library/HTMLPurifier/AttrDef/URI.php index c5181104..770b1282 100644 --- a/library/HTMLPurifier/AttrDef/URI.php +++ b/library/HTMLPurifier/AttrDef/URI.php @@ -4,6 +4,7 @@ require_once 'HTMLPurifier/AttrDef.php'; require_once 'HTMLPurifier/URIScheme.php'; require_once 'HTMLPurifier/URISchemeRegistry.php'; require_once 'HTMLPurifier/AttrDef/Host.php'; +require_once 'HTMLPurifier/PercentEncoder.php'; HTMLPurifier_ConfigSchema::define( 'URI', 'DefaultScheme', 'http', 'string', @@ -19,9 +20,11 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef { var $host; + var $PercentEncoder; function HTMLPurifier_AttrDef_URI() { $this->host = new HTMLPurifier_AttrDef_Host(); + $this->PercentEncoder = new HTMLPurifier_PercentEncoder(); } function validate($uri, $config, &$context) { @@ -32,6 +35,9 @@ class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef // parse as CDATA $uri = $this->parseCDATA($uri); + // fix up percent-encoding + $uri = $this->PercentEncoder->normalize($uri); + // while it would be nice to use parse_url(), that's specifically // for HTTP and thus won't work for our generic URI parsing diff --git a/library/HTMLPurifier/PercentEncoder.php b/library/HTMLPurifier/PercentEncoder.php new file mode 100644 index 00000000..f5f26527 --- /dev/null +++ b/library/HTMLPurifier/PercentEncoder.php @@ -0,0 +1,44 @@ += 48 && $int <= 57) || // digits + ($int >= 65 && $int <= 90) || // uppercase letters + ($int >= 97 && $int <= 122) || // lowercase letters + $int == 126 || $int == 45 || $int == 46 || $int == 95 // ~-._ + ) { + $ret .= chr($int) . 
$text; + continue; + } + $encoding = strtoupper($encoding); + $ret .= '%' . $encoding . $text; + } + return $ret; + } + +} + +?> \ No newline at end of file diff --git a/tests/HTMLPurifier/AttrDef/URITest.php b/tests/HTMLPurifier/AttrDef/URITest.php index cec094ec..8b178ebc 100644 --- a/tests/HTMLPurifier/AttrDef/URITest.php +++ b/tests/HTMLPurifier/AttrDef/URITest.php @@ -4,7 +4,6 @@ require_once 'HTMLPurifier/AttrDefHarness.php'; require_once 'HTMLPurifier/AttrDef/URI.php'; // WARNING: INCOMPLETE UNIT TESTS! -// we are currently abstaining percent-encode fixing unit tests // we also need to test all the configuration directives defined by this class // http: is returned quite often when a URL is invalid. We have to change @@ -83,10 +82,11 @@ class HTMLPurifier_AttrDef_URITest extends HTMLPurifier_AttrDefHarness // %5 - prematurely terminated, encode % // %FC - u with umlaut, correct // note that Apache doesn't do such fixing, rather, it just claims - // that the browser sent a "Bad Request". - //$uri[6] = 'http://www.example.com/%56%fc%GJ%5%FC'; - //$components[6] = array('www.example.com', '/V%FC%25GJ%255%FC', null, null); - //$expect_uri[6] = 'http://www.example.com/V%FC%25GJ%255%FC'; + // that the browser sent a "Bad Request". 
See PercentEncoder.php + // for more details + $uri[6] = 'http://www.example.com/%56%fc%GJ%5%FC'; + $components[6] = array(null, 'www.example.com', null, '/V%FC%25GJ%255%FC', null); + $expect_uri[6] = 'http://www.example.com/V%FC%25GJ%255%FC'; // test IPv4 address (behavior may vary with configuration) $uri[7] = 'http://192.0.34.166/'; diff --git a/tests/HTMLPurifier/PercentEncoderTest.php b/tests/HTMLPurifier/PercentEncoderTest.php new file mode 100644 index 00000000..ef5ac869 --- /dev/null +++ b/tests/HTMLPurifier/PercentEncoderTest.php @@ -0,0 +1,42 @@ +PercentEncoder = new HTMLPurifier_PercentEncoder(); + $this->func = ''; + } + + function assertDecode($string, $expect = true) { + if ($expect === true) $expect = $string; + $this->assertEqual($this->PercentEncoder->{$this->func}($string), $expect); + } + + function test_normalize() { + $this->func = 'normalize'; + + $this->assertDecode('Aw.../-$^8'); // no change + $this->assertDecode('%41%77%7E%2D%2E%5F', 'Aw~-._'); // decode unreserved chars + $this->assertDecode('%3A%2F%3F%23%5B%5D%40%21%24%26%27%28%29%2A%2B%2C%3B%3D'); // preserve reserved chars + $this->assertDecode('%2b', '%2B'); // normalize to uppercase + $this->assertDecode('%2B2B%3A3A'); // extra text + $this->assertDecode('%2b2B%4141', '%2B2BA41'); // extra text, with normalization + $this->assertDecode('%', '%25'); // normalize stray percent sign + $this->assertDecode('%5%25', '%255%25'); // prematurely terminated encoding + $this->assertDecode('%GJ', '%25GJ'); // invalid hexadecimal chars + + // contested behavior, if this changes, we'll also have to have + // outbound encoding + $this->assertDecode('%FC'); // not reserved or unreserved, preserve + + } + +} + +?> \ No newline at end of file diff --git a/tests/index.php b/tests/index.php index 0c898af1..4ceb8d96 100644 --- a/tests/index.php +++ b/tests/index.php @@ -90,6 +90,7 @@ $test_files[] = 'EncoderTest.php'; $test_files[] = 'EntityParserTest.php'; $test_files[] = 'Test.php'; $test_files[] = 
'ContextTest.php'; +$test_files[] = 'PercentEncoderTest.php'; if (version_compare(PHP_VERSION, '5', '>=')) { $test_files[] = 'TokenFactoryTest.php';