mirror of
https://github.com/ezyang/htmlpurifier.git
synced 2025-08-03 20:58:11 +02:00
[2.1.0] Create new URI object and migrate URI validation systems to use it. URIScheme interface changed.
git-svn-id: http://htmlpurifier.org/svnroot/htmlpurifier/trunk@1334 48356398-32a2-884e-a903-53898d9a118a
This commit is contained in:
@@ -93,170 +93,59 @@ HTMLPurifier_ConfigSchema::defineAlias('Attr', 'DisableURI', 'URI', 'Disable');
|
||||
class HTMLPurifier_AttrDef_URI extends HTMLPurifier_AttrDef
|
||||
{
|
||||
|
||||
var $host, $parser;
|
||||
var $embeds_resource;
|
||||
var $parser, $percentEncoder;
|
||||
var $embedsResource;
|
||||
|
||||
/**
|
||||
* @param $embeds_resource_resource Does the URI here result in an extra HTTP request?
|
||||
*/
|
||||
function HTMLPurifier_AttrDef_URI($embeds_resource = false) {
|
||||
$this->host = new HTMLPurifier_AttrDef_URI_Host();
|
||||
$this->parser = new HTMLPurifier_URIParser();
|
||||
$this->embeds_resource = (bool) $embeds_resource;
|
||||
$this->percentEncoder = new HTMLPurifier_PercentEncoder();
|
||||
$this->embedsResource = (bool) $embeds_resource;
|
||||
}
|
||||
|
||||
function validate($uri, $config, &$context) {
|
||||
|
||||
static $PercentEncoder = null;
|
||||
if ($PercentEncoder === null) $PercentEncoder = new HTMLPurifier_PercentEncoder();
|
||||
|
||||
if ($config->get('URI', 'Disable')) return false;
|
||||
|
||||
// initial operations
|
||||
$uri = $this->parseCDATA($uri);
|
||||
$uri = $PercentEncoder->normalize($uri);
|
||||
$uri = $this->percentEncoder->normalize($uri);
|
||||
|
||||
// parse the URI
|
||||
$parsed_uri = $this->parser->parse($uri);
|
||||
if ($parsed_uri === false) return false;
|
||||
list($scheme, $userinfo, $host, $port, $path, $query, $fragment) = $parsed_uri;
|
||||
$uri = $this->parser->parse($uri);
|
||||
if ($uri === false) return false;
|
||||
|
||||
// retrieve the scheme object
|
||||
$registry =& HTMLPurifier_URISchemeRegistry::instance();
|
||||
$default_scheme = $config->get('URI', 'DefaultScheme');
|
||||
if ($scheme !== null) {
|
||||
// no need to validate the scheme's fmt since we do that when we
|
||||
// retrieve the specific scheme object from the registry
|
||||
$scheme = ctype_lower($scheme) ? $scheme : strtolower($scheme);
|
||||
$scheme_obj = $registry->getScheme($scheme, $config, $context);
|
||||
if (!$scheme_obj) return false; // invalid scheme, clean it out
|
||||
} else {
|
||||
// no scheme: retrieve the default one
|
||||
$scheme_obj = $registry->getScheme($default_scheme, $config, $context);
|
||||
if (!$scheme_obj) {
|
||||
// something funky happened to the default scheme object
|
||||
trigger_error(
|
||||
'Default scheme object "' . $config->get('URI', 'DefaultScheme') . '" was not readable',
|
||||
E_USER_WARNING
|
||||
);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if ($this->embeds_resource && !$scheme_obj->browsable) {
|
||||
// the URI we're processing embeds_resource a resource in the
|
||||
// page, but the URI it references cannot be physically retrieved
|
||||
return false;
|
||||
}
|
||||
// generic validation
|
||||
$context->register('EmbeddedURI', $this->embedsResource); // flag
|
||||
$result = $uri->validate($config, $context);
|
||||
$context->destroy('EmbeddedURI');
|
||||
if (!$result) return false;
|
||||
|
||||
// validate host
|
||||
if ($host !== null) {
|
||||
// remove URI if it's absolute and we disabled externals or
|
||||
// if it's absolute and embedded and we disabled external resources
|
||||
unset($our_host);
|
||||
if (
|
||||
$config->get('URI', 'DisableExternal') ||
|
||||
(
|
||||
$config->get('URI', 'DisableExternalResources') &&
|
||||
$this->embeds_resource
|
||||
)
|
||||
) {
|
||||
$our_host = $config->get('URI', 'Host');
|
||||
if ($our_host === null) return false;
|
||||
}
|
||||
$host = $this->host->validate($host, $config, $context);
|
||||
if ($host === false) $host = null;
|
||||
|
||||
// check host against blacklist
|
||||
if ($this->checkBlacklist($host, $config, $context)) return false;
|
||||
|
||||
// more lenient absolute checking
|
||||
if (isset($our_host)) {
|
||||
$host_parts = array_reverse(explode('.', $host));
|
||||
// could be cached
|
||||
$our_host_parts = array_reverse(explode('.', $our_host));
|
||||
foreach ($our_host_parts as $i => $discard) {
|
||||
if (!isset($host_parts[$i])) return false;
|
||||
if ($host_parts[$i] != $our_host_parts[$i]) return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
// scheme-specific validation
|
||||
$scheme_obj = $uri->getSchemeObj($config, $context);
|
||||
if (!$scheme_obj) return false;
|
||||
if ($this->embedsResource && !$scheme_obj->browsable) return false;
|
||||
$result = $scheme_obj->validate($uri, $config, $context);
|
||||
if (!$result) return false;
|
||||
|
||||
// validate port
|
||||
if ($port !== null) {
|
||||
if ($port < 1 || $port > 65535) $port = null;
|
||||
}
|
||||
|
||||
|
||||
// query and fragment are quite simple in terms of definition:
|
||||
// *( pchar / "/" / "?" ), so define their validation routines
|
||||
// when we start fixing percent encoding
|
||||
|
||||
|
||||
|
||||
// path gets to be validated against a hodge-podge of rules depending
|
||||
// on the status of authority and scheme, but it's not that important,
|
||||
// esp. since it won't be applicable to everyone
|
||||
|
||||
|
||||
|
||||
// okay, now we defer execution to the subobject for more processing
|
||||
// note that $fragment is omitted
|
||||
list($userinfo, $host, $port, $path, $query) =
|
||||
$scheme_obj->validateComponents(
|
||||
$userinfo, $host, $port, $path, $query, $config, $context
|
||||
);
|
||||
|
||||
|
||||
// reconstruct authority
|
||||
$authority = null;
|
||||
if (!is_null($userinfo) || !is_null($host) || !is_null($port)) {
|
||||
$authority = '';
|
||||
if($userinfo !== null) $authority .= $userinfo . '@';
|
||||
$authority .= $host;
|
||||
if($port !== null) $authority .= ':' . $port;
|
||||
} else {
|
||||
if ($default_scheme == $scheme) $scheme = null; // munge scheme off when unnecessary
|
||||
}
|
||||
|
||||
// reconstruct the result
|
||||
$result = '';
|
||||
if ($scheme !== null) $result .= "$scheme:";
|
||||
if ($authority !== null) $result .= "//$authority";
|
||||
$result .= $path;
|
||||
if ($query !== null) $result .= "?$query";
|
||||
if ($fragment !== null) $result .= "#$fragment";
|
||||
// back to string
|
||||
$result = $uri->toString();
|
||||
|
||||
// munge if necessary
|
||||
$munge = $config->get('URI', 'Munge');
|
||||
if (!empty($scheme_obj->browsable) && $munge !== null) {
|
||||
if ($authority !== null) {
|
||||
$result = str_replace('%s', rawurlencode($result), $munge);
|
||||
}
|
||||
if (
|
||||
!is_null($uri->host) && // indicator for authority
|
||||
!empty($scheme_obj->browsable) &&
|
||||
!is_null($munge = $config->get('URI', 'Munge'))
|
||||
) {
|
||||
$result = str_replace('%s', rawurlencode($result), $munge);
|
||||
}
|
||||
|
||||
return $result;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks a host against an array blacklist
|
||||
* @param $host Host to check
|
||||
* @param $config HTMLPurifier_Config instance
|
||||
* @param $context HTMLPurifier_Context instance
|
||||
* @return bool Is spam?
|
||||
*/
|
||||
function checkBlacklist($host, &$config, &$context) {
|
||||
$blacklist = $config->get('URI', 'HostBlacklist');
|
||||
if (!empty($blacklist)) {
|
||||
foreach($blacklist as $blacklisted_host_fragment) {
|
||||
if (strpos($host, $blacklisted_host_fragment) !== false) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
|
153
library/HTMLPurifier/URI.php
Normal file
153
library/HTMLPurifier/URI.php
Normal file
@@ -0,0 +1,153 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/URIParser.php';
|
||||
|
||||
/**
|
||||
* HTML Purifier's internal representation of a URI
|
||||
*/
|
||||
class HTMLPurifier_URI
|
||||
{
|
||||
|
||||
var $scheme, $userinfo, $host, $port, $path, $query, $fragment;
|
||||
|
||||
/**
|
||||
* @note Automatically normalizes scheme and port
|
||||
*/
|
||||
function HTMLPurifier_URI($scheme, $userinfo, $host, $port, $path, $query, $fragment) {
|
||||
$this->scheme = is_null($scheme) || ctype_lower($scheme) ? $scheme : strtolower($scheme);
|
||||
$this->userinfo = $userinfo;
|
||||
$this->host = $host;
|
||||
$this->port = is_null($port) ? $port : (int) $port;
|
||||
$this->path = $path;
|
||||
$this->query = $query;
|
||||
$this->fragment = $fragment;
|
||||
}
|
||||
|
||||
function getSchemeObj($config, &$context) {
|
||||
$registry =& HTMLPurifier_URISchemeRegistry::instance();
|
||||
if ($this->scheme !== null) {
|
||||
$scheme_obj = $registry->getScheme($this->scheme, $config, $context);
|
||||
if (!$scheme_obj) return false; // invalid scheme, clean it out
|
||||
} else {
|
||||
// no scheme: retrieve the default one
|
||||
$scheme_obj = $registry->getScheme($config->get('URI', 'DefaultScheme'), $config, $context);
|
||||
if (!$scheme_obj) {
|
||||
// something funky happened to the default scheme object
|
||||
trigger_error(
|
||||
'Default scheme object "' . $config->get('URI', 'DefaultScheme') . '" was not readable',
|
||||
E_USER_WARNING
|
||||
);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return $scheme_obj;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generic validation method applicable for all schemes
|
||||
*/
|
||||
function validate($config, &$context) {
|
||||
|
||||
// validate host
|
||||
if (!is_null($this->host)) {
|
||||
// remove URI if it's absolute and we disabled externals or
|
||||
// if it's absolute and embedded and we disabled external resources
|
||||
unset($our_host); // ensure this variable is not set
|
||||
if (
|
||||
$config->get('URI', 'DisableExternal') ||
|
||||
(
|
||||
$config->get('URI', 'DisableExternalResources') &&
|
||||
$context->get('EmbeddedURI', true) // suppress errors
|
||||
)
|
||||
) {
|
||||
$our_host = $config->get('URI', 'Host');
|
||||
if ($our_host === null) return false;
|
||||
}
|
||||
$host_def = new HTMLPurifier_AttrDef_URI_Host();
|
||||
$this->host = $host_def->validate($this->host, $config, $context);
|
||||
if ($this->host === false) $this->host = null;
|
||||
|
||||
// check host against blacklist
|
||||
if ($this->checkBlacklist($this->host, $config, $context)) return false;
|
||||
|
||||
// more lenient absolute checking
|
||||
if (isset($our_host)) {
|
||||
$host_parts = array_reverse(explode('.', $this->host));
|
||||
// could be cached
|
||||
$our_host_parts = array_reverse(explode('.', $our_host));
|
||||
foreach ($our_host_parts as $i => $discard) {
|
||||
if (!isset($host_parts[$i])) return false;
|
||||
if ($host_parts[$i] != $our_host_parts[$i]) return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// munge scheme off if necessary
|
||||
if (!is_null($this->scheme) && is_null($this->host)) {
|
||||
if ($config->get('URI', 'DefaultScheme') == $this->scheme) {
|
||||
$this->scheme = null;
|
||||
}
|
||||
}
|
||||
|
||||
// validate port
|
||||
if (!is_null($this->port)) {
|
||||
if ($this->port < 1 || $this->port > 65535) $this->port = null;
|
||||
}
|
||||
|
||||
// query and fragment are quite simple in terms of definition:
|
||||
// *( pchar / "/" / "?" ), so define their validation routines
|
||||
// when we start fixing percent encoding
|
||||
|
||||
// path gets to be validated against a hodge-podge of rules depending
|
||||
// on the status of authority and scheme, but it's not that important,
|
||||
// esp. since it won't be applicable to everyone
|
||||
|
||||
return true;
|
||||
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks a host against an array blacklist
|
||||
* @param $host Host to check
|
||||
* @param $config HTMLPurifier_Config instance
|
||||
* @param $context HTMLPurifier_Context instance
|
||||
* @return bool Is spam?
|
||||
*/
|
||||
function checkBlacklist($host, $config, &$context) {
|
||||
$blacklist = $config->get('URI', 'HostBlacklist');
|
||||
if (!empty($blacklist)) {
|
||||
foreach($blacklist as $blacklisted_host_fragment) {
|
||||
if (strpos($host, $blacklisted_host_fragment) !== false) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert URI back to string
|
||||
*/
|
||||
function toString() {
|
||||
// reconstruct authority
|
||||
$authority = null;
|
||||
if (!is_null($this->host)) {
|
||||
$authority = '';
|
||||
if(!is_null($this->userinfo)) $authority .= $this->userinfo . '@';
|
||||
$authority .= $this->host;
|
||||
if(!is_null($this->port)) $authority .= ':' . $this->port;
|
||||
}
|
||||
|
||||
// reconstruct the result
|
||||
$result = '';
|
||||
if (!is_null($this->scheme)) $result .= $this->scheme . ':';
|
||||
if (!is_null($authority)) $result .= '//' . $authority;
|
||||
$result .= $this->path;
|
||||
if (!is_null($this->query)) $result .= '?' . $this->query;
|
||||
if (!is_null($this->fragment)) $result .= '#' . $this->fragment;
|
||||
|
||||
return $result;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -1,8 +1,11 @@
|
||||
<?php
|
||||
|
||||
require_once 'HTMLPurifier/URI.php';
|
||||
|
||||
/**
|
||||
* Parses a URI into the components and fragment identifier as specified
|
||||
* by RFC 2396.
|
||||
* @todo Replace regexps with a native PHP parser
|
||||
*/
|
||||
class HTMLPurifier_URIParser
|
||||
{
|
||||
@@ -10,7 +13,7 @@ class HTMLPurifier_URIParser
|
||||
/**
|
||||
* Parses a URI
|
||||
* @param $uri string URI to parse
|
||||
* @return array(userinfo, host, int port, path, query, fragment) components
|
||||
* @return HTMLPurifier_URI representation of URI
|
||||
*/
|
||||
function parse($uri) {
|
||||
$r_URI = '!'.
|
||||
@@ -51,7 +54,8 @@ class HTMLPurifier_URIParser
|
||||
$port = $host = $userinfo = null;
|
||||
}
|
||||
|
||||
return array($scheme, $userinfo, $host, $port, $path, $query, $fragment);
|
||||
return new HTMLPurifier_URI(
|
||||
$scheme, $userinfo, $host, $port, $path, $query, $fragment);
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -23,20 +23,14 @@ class HTMLPurifier_URIScheme
|
||||
* Validates the components of a URI
|
||||
* @note This implementation should be called by children if they define
|
||||
* a default port, as it does port processing.
|
||||
* @note Fragment is omitted as that is scheme independent
|
||||
* @param $userinfo User info found before at sign in authority
|
||||
* @param $host Hostname in authority
|
||||
* @param $port Port found after colon in authority
|
||||
* @param $path Path of URI
|
||||
* @param $query Query of URI, found after question mark
|
||||
* @param $uri Instance of HTMLPurifier_URI
|
||||
* @param $config HTMLPurifier_Config object
|
||||
* @param $context HTMLPurifier_Context object
|
||||
* @return Bool success or failure
|
||||
*/
|
||||
function validateComponents(
|
||||
$userinfo, $host, $port, $path, $query, $config, &$context
|
||||
) {
|
||||
if ($this->default_port == $port) $port = null;
|
||||
return array($userinfo, $host, $port, $path, $query);
|
||||
function validate(&$uri, $config, &$context) {
|
||||
if ($this->default_port == $uri->port) $uri->port = null;
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -10,34 +10,33 @@ class HTMLPurifier_URIScheme_ftp extends HTMLPurifier_URIScheme {
|
||||
var $default_port = 21;
|
||||
var $browsable = true; // usually
|
||||
|
||||
function validateComponents(
|
||||
$userinfo, $host, $port, $path, $query, $config, &$context
|
||||
) {
|
||||
list($userinfo, $host, $port, $path, $query) =
|
||||
parent::validateComponents(
|
||||
$userinfo, $host, $port, $path, $query, $config, $context );
|
||||
$semicolon_pos = strrpos($path, ';'); // reverse
|
||||
function validate(&$uri, $config, &$context) {
|
||||
parent::validate($uri, $config, $context);
|
||||
$uri->query = null;
|
||||
|
||||
// typecode check
|
||||
$semicolon_pos = strrpos($uri->path, ';'); // reverse
|
||||
if ($semicolon_pos !== false) {
|
||||
// typecode check
|
||||
$type = substr($path, $semicolon_pos + 1); // no semicolon
|
||||
$path = substr($path, 0, $semicolon_pos);
|
||||
$type = substr($uri->path, $semicolon_pos + 1); // no semicolon
|
||||
$uri->path = substr($uri->path, 0, $semicolon_pos);
|
||||
$type_ret = '';
|
||||
if (strpos($type, '=') !== false) {
|
||||
// figure out whether or not the declaration is correct
|
||||
list($key, $typecode) = explode('=', $type, 2);
|
||||
if ($key !== 'type') {
|
||||
// invalid key, tack it back on encoded
|
||||
$path .= '%3B' . $type;
|
||||
$uri->path .= '%3B' . $type;
|
||||
} elseif ($typecode === 'a' || $typecode === 'i' || $typecode === 'd') {
|
||||
$type_ret = ";type=$typecode";
|
||||
}
|
||||
} else {
|
||||
$path .= '%3B' . $type;
|
||||
$uri->path .= '%3B' . $type;
|
||||
}
|
||||
$path = str_replace(';', '%3B', $path);
|
||||
$path .= $type_ret;
|
||||
$uri->path = str_replace(';', '%3B', $uri->path);
|
||||
$uri->path .= $type_ret;
|
||||
}
|
||||
return array($userinfo, $host, $port, $path, null);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -10,13 +10,10 @@ class HTMLPurifier_URIScheme_http extends HTMLPurifier_URIScheme {
|
||||
var $default_port = 80;
|
||||
var $browsable = true;
|
||||
|
||||
function validateComponents(
|
||||
$userinfo, $host, $port, $path, $query, $config, &$context
|
||||
) {
|
||||
list($userinfo, $host, $port, $path, $query) =
|
||||
parent::validateComponents(
|
||||
$userinfo, $host, $port, $path, $query, $config, $context );
|
||||
return array(null, $host, $port, $path, $query);
|
||||
function validate(&$uri, $config, &$context) {
|
||||
parent::validate($uri, $config, $context);
|
||||
$uri->userinfo = null;
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -15,14 +15,13 @@ class HTMLPurifier_URIScheme_mailto extends HTMLPurifier_URIScheme {
|
||||
|
||||
var $browsable = false;
|
||||
|
||||
function validateComponents(
|
||||
$userinfo, $host, $port, $path, $query, $config, &$context
|
||||
) {
|
||||
list($userinfo, $host, $port, $path, $query) =
|
||||
parent::validateComponents(
|
||||
$userinfo, $host, $port, $path, $query, $config, $context );
|
||||
function validate(&$uri, $config, &$context) {
|
||||
parent::validate($uri, $config, $context);
|
||||
$uri->userinfo = null;
|
||||
$uri->host = null;
|
||||
$uri->port = null;
|
||||
// we need to validate path against RFC 2368's addr-spec
|
||||
return array(null, null, null, $path, $query);
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -9,14 +9,14 @@ class HTMLPurifier_URIScheme_news extends HTMLPurifier_URIScheme {
|
||||
|
||||
var $browsable = false;
|
||||
|
||||
function validateComponents(
|
||||
$userinfo, $host, $port, $path, $query, $config, &$context
|
||||
) {
|
||||
list($userinfo, $host, $port, $path, $query) =
|
||||
parent::validateComponents(
|
||||
$userinfo, $host, $port, $path, $query, $config, $context );
|
||||
function validate(&$uri, $config, &$context) {
|
||||
parent::validate($uri, $config, $context);
|
||||
$uri->userinfo = null;
|
||||
$uri->host = null;
|
||||
$uri->port = null;
|
||||
$uri->query = null;
|
||||
// typecode check needed on path
|
||||
return array(null, null, null, $path, null);
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
@@ -10,13 +10,11 @@ class HTMLPurifier_URIScheme_nntp extends HTMLPurifier_URIScheme {
|
||||
var $default_port = 119;
|
||||
var $browsable = false;
|
||||
|
||||
function validateComponents(
|
||||
$userinfo, $host, $port, $path, $query, $config, &$context
|
||||
) {
|
||||
list($userinfo, $host, $port, $path, $query) =
|
||||
parent::validateComponents(
|
||||
$userinfo, $host, $port, $path, $query, $config, $context );
|
||||
return array(null, $host, $port, $path, null);
|
||||
function validate(&$uri, $config, &$context) {
|
||||
parent::validate($uri, $config, $context);
|
||||
$uri->userinfo = null;
|
||||
$uri->query = null;
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
Reference in New Issue
Block a user