1
0
mirror of https://github.com/processwire/processwire.git synced 2025-08-11 09:14:58 +02:00

Fix issue processwire/processwire-issues#623 where WireHttp::download() method was not working with URLs having encoded spaces when downloading with the "fopen" option (the "curl" option was not affected). Added a couple new $options to $sanitizer->url() method for dictating how encoded characters should be handled. Added WireHttp::setValidateURLOptions() method for cases where you want to manually specify different options for validating the URL in WireHttp.

This commit is contained in:
Ryan Cramer
2018-06-27 08:30:31 -04:00
parent 637f81579e
commit 1e912c4a4d
2 changed files with 70 additions and 28 deletions

View File

@@ -1315,6 +1315,8 @@ class Sanitizer extends Wire {
* - `allowSchemes` (array): Array of allowed schemes, lowercase (default=[] any). * - `allowSchemes` (array): Array of allowed schemes, lowercase (default=[] any).
* - `disallowSchemes` (array): Array of disallowed schemes, lowercase (default=['file']). * - `disallowSchemes` (array): Array of disallowed schemes, lowercase (default=['file']).
* - `requireScheme` (bool): Specify true to require a scheme in the URL, if one not present, it will be added to non-relative URLs (default=true). * - `requireScheme` (bool): Specify true to require a scheme in the URL, if one not present, it will be added to non-relative URLs (default=true).
* - `convertEncoded` (bool): Convert most encoded hex characters (e.g. “%2F”) to non-encoded? (default=true)
* - `encodeSpace` (bool): Encode space to “%20” or allow “%20” in URL? Only useful if convertEncoded is true. (default=false)
* - `stripTags` (bool): Specify false to prevent tags from being stripped (default=true). * - `stripTags` (bool): Specify false to prevent tags from being stripped (default=true).
* - `stripQuotes` (bool): Specify false to prevent quotes from being stripped (default=true). * - `stripQuotes` (bool): Specify false to prevent quotes from being stripped (default=true).
* - `maxLength` (int): Maximum length in bytes allowed for URLs (default=4096). * - `maxLength` (int): Maximum length in bytes allowed for URLs (default=4096).
@@ -1333,6 +1335,8 @@ class Sanitizer extends Wire {
'allowSchemes' => array(), 'allowSchemes' => array(),
'disallowSchemes' => array('file', 'javascript'), 'disallowSchemes' => array('file', 'javascript'),
'requireScheme' => true, 'requireScheme' => true,
'convertEncoded' => true,
'encodeSpace' => false,
'stripTags' => true, 'stripTags' => true,
'stripQuotes' => true, 'stripQuotes' => true,
'maxLength' => 4096, 'maxLength' => 4096,
@@ -1380,15 +1384,17 @@ class Sanitizer extends Wire {
$queryString = ''; $queryString = '';
} }
$pathIsEncoded = strpos($domainPath, '%') !== false; $pathIsEncoded = $options['convertEncoded'] && strpos($domainPath, '%') !== false;
if($pathIsEncoded || filter_var($domainPath, FILTER_SANITIZE_URL) !== $domainPath) { $pathModifiedByFilter = filter_var($domainPath, FILTER_SANITIZE_URL) !== $domainPath;
if($pathIsEncoded || $pathModifiedByFilter) {
// the domain and/or path contains extended characters not supported by FILTER_SANITIZE_URL // the domain and/or path contains extended characters not supported by FILTER_SANITIZE_URL
// Example: https://de.wikipedia.org/wiki/Linkshänder // Example: https://de.wikipedia.org/wiki/Linkshänder
// OR it is already rawurlencode()'d // OR it is already rawurlencode()'d
// Example: https://de.wikipedia.org/wiki/Linksh%C3%A4nder // Example: https://de.wikipedia.org/wiki/Linksh%C3%A4nder
// we convert the URL to be FILTER_SANITIZE_URL compatible // we convert the URL to be FILTER_SANITIZE_URL compatible
// if already encoded, first remove encoding: // if already encoded, first remove encoding:
if(strpos($domainPath, '%') !== false) $domainPath = rawurldecode($domainPath); if($pathIsEncoded) $domainPath = rawurldecode($domainPath);
// Next, encode it, for example: https%3A%2F%2Fde.wikipedia.org%2Fwiki%2FLinksh%C3%A4nder // Next, encode it, for example: https%3A%2F%2Fde.wikipedia.org%2Fwiki%2FLinksh%C3%A4nder
$domainPath = rawurlencode($domainPath); $domainPath = rawurlencode($domainPath);
// restore characters allowed in domain/path // restore characters allowed in domain/path
@@ -1417,7 +1423,8 @@ class Sanitizer extends Wire {
if($slashPos === false) $slashPos = $dotPos+1; if($slashPos === false) $slashPos = $dotPos+1;
// if the first slash comes after the first dot, the dot is likely part of a domain.com/path/ // if the first slash comes after the first dot, the dot is likely part of a domain.com/path/
// if the first slash comes before the first dot, then it's likely a /path/product.html // if the first slash comes before the first dot, then it's likely a /path/product.html
if($dotPos && $slashPos > $dotPos && preg_match('{^([^\s_.]+\.)?[^-_\s.][^\s_.]+\.([a-z]{2,6})([./:#]|$)}i', $value, $matches)) { $regex = '{^([^\s_.]+\.)?[^-_\s.][^\s_.]+\.([a-z]{2,6})([./:#]|$)}i';
if($dotPos && $slashPos > $dotPos && preg_match($regex, $value, $matches)) {
// most likely a domain name // most likely a domain name
// $tld = $matches[3]; // TODO add TLD validation to confirm it's a domain name // $tld = $matches[3]; // TODO add TLD validation to confirm it's a domain name
$value = $this->filterValidateURL("http://$value", $options); // add scheme for validation $value = $this->filterValidateURL("http://$value", $options); // add scheme for validation
@@ -1461,7 +1468,7 @@ class Sanitizer extends Wire {
if($pathIsEncoded && strlen($value)) { if($pathIsEncoded && strlen($value)) {
// restore to non-encoded, UTF-8 version // restore to non-encoded, UTF-8 version
if(strpos('?', $value) !== false) { if(strpos($value, '?') !== false) {
list($domainPath, $queryString) = explode('?', $value); list($domainPath, $queryString) = explode('?', $value);
} else { } else {
$domainPath = $value; $domainPath = $value;
@@ -1469,6 +1476,7 @@ class Sanitizer extends Wire {
} }
$domainPath = rawurldecode($domainPath); $domainPath = rawurldecode($domainPath);
if(strpos($domainPath, '%') !== false) { if(strpos($domainPath, '%') !== false) {
// if any apparently encoded characters remain after rawurldecode, remove them
$domainPath = preg_replace('/%[0-9ABCDEF]{1,2}/i', '', $domainPath); $domainPath = preg_replace('/%[0-9ABCDEF]{1,2}/i', '', $domainPath);
$domainPath = str_replace('%', '', $domainPath); $domainPath = str_replace('%', '', $domainPath);
} }
@@ -1476,23 +1484,27 @@ class Sanitizer extends Wire {
$value = $domainPath . (strlen($queryString) ? "?$queryString" : ""); $value = $domainPath . (strlen($queryString) ? "?$queryString" : "");
} }
if(strlen($value)) { if(!strlen($value)) return '';
if($options['stripTags']) {
if(stripos($value, '%3') !== false) { if($options['stripTags']) {
$value = str_ireplace(array('%3C', '%3E'), array('!~!<', '>!~!'), $value); if(stripos($value, '%3') !== false) {
$value = strip_tags($value); $value = str_ireplace(array('%3C', '%3E'), array('!~!<', '>!~!'), $value); // convert encoded to placeholders to strip
$value = str_ireplace(array('!~!<', '>!~!', '!~!'), array('%3C', '%3E', ''), $value); // restore, in case valid/non-tag $value = strip_tags($value);
} else { $value = str_ireplace(array('!~!<', '>!~!', '!~!'), array('%3C', '%3E', ''), $value); // restore, in case valid/non-tag
$value = strip_tags($value); } else {
} $value = strip_tags($value);
} }
if($options['stripQuotes']) {
$value = str_replace(array('"', "'", "%22", "%27"), '', $value);
}
return $value;
} }
return ''; if($options['stripQuotes']) {
$value = str_replace(array('"', "'", "%22", "%27"), '', $value);
}
if($options['encodeSpace'] && strpos($value, ' ')) {
$value = str_replace(' ', '%20', $value);
}
return $value;
} }
/** /**

View File

@@ -22,7 +22,7 @@
* *
* Thanks to @horst for his assistance with several methods in this class. * Thanks to @horst for his assistance with several methods in this class.
* *
* ProcessWire 3.x, Copyright 2016 by Ryan Cramer * ProcessWire 3.x, Copyright 2018 by Ryan Cramer
* https://processwire.com * https://processwire.com
* *
* @method bool|string send($url, $data = array(), $method = 'POST') * @method bool|string send($url, $data = array(), $method = 'POST')
@@ -221,6 +221,22 @@ class WireHttp extends Wire {
*/ */
protected $hasFopen = false; protected $hasFopen = false;
/**
* Default options passed to $sanitizer->url($url, $options) by the WireHttp::validateURL() method
*
* Can be modified with the setValidateURLOptions() method. The `allowSchemes` option is
* not stored here; validateURL() adds it from $this->allowSchemes on each call.
*
* @var array
*
*/
protected $validateURLOptions = array(
'allowRelative' => false,
'requireScheme' => true,
'stripQuotes' => false,
'encodeSpace' => true, // have the sanitizer return spaces as "%20"
'throw' => true,
);
/** /**
* Construct/initialize * Construct/initialize
* *
@@ -1019,13 +1035,8 @@ class WireHttp extends Wire {
* *
*/ */
public function validateURL($url, $throw = false) { public function validateURL($url, $throw = false) {
$options = array( $options = $this->validateURLOptions;
'allowRelative' => false, $options['allowSchemes'] = $this->allowSchemes;
'allowSchemes' => $this->allowSchemes,
'requireScheme' => true,
'stripQuotes' => false,
'throw' => true,
);
try { try {
$url = $this->wire('sanitizer')->url($url, $options); $url = $this->wire('sanitizer')->url($url, $options);
} catch(WireException $e) { } catch(WireException $e) {
@@ -1129,6 +1140,24 @@ class WireHttp extends Wire {
return $this; return $this;
} }
/**
 * Get or set the options array that WireHttp gives to $sanitizer->url()
 *
 * You should only need this when an unusual URL fails validation with the default
 * options used by WireHttp. The “allowSchemes” option is not managed here; use the
 * setAllowSchemes() method in this class for that.
 *
 * Call with no arguments (or an empty array) to get the current options without changing them.
 *
 * @param array $options Options to merge into the current ones, see $sanitizer->url() for available options.
 * @return array The options in effect after the call
 *
 */
public function setValidateURLOptions(array $options = array()) {
	// nothing to set: behave as a getter
	if(empty($options)) return $this->validateURLOptions;
	// given options take precedence over those already present
	$merged = array_merge($this->validateURLOptions, $options);
	$this->validateURLOptions = $merged;
	return $merged;
}
/** /**
* Return array of allowed schemes * Return array of allowed schemes
* *
@@ -1174,6 +1203,7 @@ class WireHttp extends Wire {
* *
*/ */
public function _errorHandler($errno, $errstr, $errfile = '', $errline = 0, $errcontext = array()) {
	// Records an error as "errno: errstr" in $this->error.
	// NOTE(review): presumably registered through set_error_handler() — confirm against caller.
	// The trailing arguments are optional because PHP 8+ no longer passes $errcontext to
	// error handlers; requiring it would make the handler call itself fail.
	if($errfile || $errline || $errcontext) {} // arguments intentionally unused
	$this->error[] = "$errno: $errstr";
}