From 1e912c4a4df0e01d140443fc84398783fe017ac6 Mon Sep 17 00:00:00 2001 From: Ryan Cramer Date: Wed, 27 Jun 2018 08:30:31 -0400 Subject: [PATCH] Fix issue processwire/processwire-issues#623 where WireHttp::download() method was not working with URLs having encoded spaces when downloading with the "fopen" option (the "curl" option was not affected). Added a couple new $options to $sanitizer->url() method for dictating how encoded characters should be handled. Added WireHttp::setValidateURLOptions() method for cases where you want to manually specify different options for validating the URL in WireHttp. --- wire/core/Sanitizer.php | 52 +++++++++++++++++++++++++---------------- wire/core/WireHttp.php | 46 +++++++++++++++++++++++++++++------- 2 files changed, 70 insertions(+), 28 deletions(-) diff --git a/wire/core/Sanitizer.php b/wire/core/Sanitizer.php index 5d3ed842..1ad18b51 100644 --- a/wire/core/Sanitizer.php +++ b/wire/core/Sanitizer.php @@ -1315,6 +1315,8 @@ class Sanitizer extends Wire { * - `allowSchemes` (array): Array of allowed schemes, lowercase (default=[] any). * - `disallowSchemes` (array): Array of disallowed schemes, lowercase (default=['file']). * - `requireScheme` (bool): Specify true to require a scheme in the URL, if one not present, it will be added to non-relative URLs (default=true). + * - `convertEncoded` (boolean): Convert most encoded hex characters (e.g. “%2F”) to non-encoded? (default=true) + * - `encodeSpace` (boolean): Encode space to “%20” or allow “%20” in URL? Only useful if convertEncoded is true. (default=false) * - `stripTags` (bool): Specify false to prevent tags from being stripped (default=true). * - `stripQuotes` (bool): Specify false to prevent quotes from being stripped (default=true). * - `maxLength` (int): Maximum length in bytes allowed for URLs (default=4096). 
@@ -1333,6 +1335,8 @@ class Sanitizer extends Wire { 'allowSchemes' => array(), 'disallowSchemes' => array('file', 'javascript'), 'requireScheme' => true, + 'convertEncoded' => true, + 'encodeSpace' => false, 'stripTags' => true, 'stripQuotes' => true, 'maxLength' => 4096, @@ -1380,15 +1384,17 @@ class Sanitizer extends Wire { $queryString = ''; } - $pathIsEncoded = strpos($domainPath, '%') !== false; - if($pathIsEncoded || filter_var($domainPath, FILTER_SANITIZE_URL) !== $domainPath) { + $pathIsEncoded = $options['convertEncoded'] && strpos($domainPath, '%') !== false; + $pathModifiedByFilter = filter_var($domainPath, FILTER_SANITIZE_URL) !== $domainPath; + + if($pathIsEncoded || $pathModifiedByFilter) { // the domain and/or path contains extended characters not supported by FILTER_SANITIZE_URL // Example: https://de.wikipedia.org/wiki/Linkshänder // OR it is already rawurlencode()'d // Example: https://de.wikipedia.org/wiki/Linksh%C3%A4nder // we convert the URL to be FILTER_SANITIZE_URL compatible // if already encoded, first remove encoding: - if(strpos($domainPath, '%') !== false) $domainPath = rawurldecode($domainPath); + if($pathIsEncoded) $domainPath = rawurldecode($domainPath); // Next, encode it, for example: https%3A%2F%2Fde.wikipedia.org%2Fwiki%2FLinksh%C3%A4nder $domainPath = rawurlencode($domainPath); // restore characters allowed in domain/path @@ -1417,7 +1423,8 @@ class Sanitizer extends Wire { if($slashPos === false) $slashPos = $dotPos+1; // if the first slash comes after the first dot, the dot is likely part of a domain.com/path/ // if the first slash comes before the first dot, then it's likely a /path/product.html - if($dotPos && $slashPos > $dotPos && preg_match('{^([^\s_.]+\.)?[^-_\s.][^\s_.]+\.([a-z]{2,6})([./:#]|$)}i', $value, $matches)) { + $regex = '{^([^\s_.]+\.)?[^-_\s.][^\s_.]+\.([a-z]{2,6})([./:#]|$)}i'; + if($dotPos && $slashPos > $dotPos && preg_match($regex, $value, $matches)) { // most likely a domain name // $tld = $matches[3]; 
// TODO add TLD validation to confirm it's a domain name $value = $this->filterValidateURL("http://$value", $options); // add scheme for validation @@ -1461,7 +1468,7 @@ class Sanitizer extends Wire { if($pathIsEncoded && strlen($value)) { // restore to non-encoded, UTF-8 version - if(strpos('?', $value) !== false) { + if(strpos($value, '?') !== false) { list($domainPath, $queryString) = explode('?', $value); } else { $domainPath = $value; @@ -1469,6 +1476,7 @@ class Sanitizer extends Wire { } $domainPath = rawurldecode($domainPath); if(strpos($domainPath, '%') !== false) { + // if any apparently encoded characters remain after rawurldecode, remove them $domainPath = preg_replace('/%[0-9ABCDEF]{1,2}/i', '', $domainPath); $domainPath = str_replace('%', '', $domainPath); } @@ -1476,23 +1484,27 @@ class Sanitizer extends Wire { $value = $domainPath . (strlen($queryString) ? "?$queryString" : ""); } - if(strlen($value)) { - if($options['stripTags']) { - if(stripos($value, '%3') !== false) { - $value = str_ireplace(array('%3C', '%3E'), array('!~!<', '>!~!'), $value); - $value = strip_tags($value); - $value = str_ireplace(array('!~!<', '>!~!', '!~!'), array('%3C', '%3E', ''), $value); // restore, in case valid/non-tag - } else { - $value = strip_tags($value); - } + if(!strlen($value)) return ''; + + if($options['stripTags']) { + if(stripos($value, '%3') !== false) { + $value = str_ireplace(array('%3C', '%3E'), array('!~!<', '>!~!'), $value); // convert encoded to placeholders to strip + $value = strip_tags($value); + $value = str_ireplace(array('!~!<', '>!~!', '!~!'), array('%3C', '%3E', ''), $value); // restore, in case valid/non-tag + } else { + $value = strip_tags($value); } - if($options['stripQuotes']) { - $value = str_replace(array('"', "'", "%22", "%27"), '', $value); - } - return $value; } - - return ''; + + if($options['stripQuotes']) { + $value = str_replace(array('"', "'", "%22", "%27"), '', $value); + } + + if($options['encodeSpace'] && strpos($value, ' ')) { 
+ $value = str_replace(' ', '%20', $value); + } + + return $value; } /** diff --git a/wire/core/WireHttp.php b/wire/core/WireHttp.php index 2cdad7c0..34ee7c2c 100644 --- a/wire/core/WireHttp.php +++ b/wire/core/WireHttp.php @@ -22,7 +22,7 @@ * * Thanks to @horst for his assistance with several methods in this class. * - * ProcessWire 3.x, Copyright 2016 by Ryan Cramer + * ProcessWire 3.x, Copyright 2018 by Ryan Cramer * https://processwire.com * * @method bool|string send($url, $data = array(), $method = 'POST') @@ -221,6 +221,22 @@ class WireHttp extends Wire { */ protected $hasFopen = false; + /** + * Options to pass to $sanitizer->url('url', $options) in WireHttp::validateURL() method + * + * Can be modified with the setValidateURLOptions() method. + * + * @var array + * + */ + protected $validateURLOptions = array( + 'allowRelative' => false, + 'requireScheme' => true, + 'stripQuotes' => false, + 'encodeSpace' => true, + 'throw' => true, + ); + /** * Construct/initialize * @@ -1019,13 +1035,8 @@ class WireHttp extends Wire { * */ public function validateURL($url, $throw = false) { - $options = array( - 'allowRelative' => false, - 'allowSchemes' => $this->allowSchemes, - 'requireScheme' => true, - 'stripQuotes' => false, - 'throw' => true, - ); + $options = $this->validateURLOptions; + $options['allowSchemes'] = $this->allowSchemes; try { $url = $this->wire('sanitizer')->url($url, $options); } catch(WireException $e) { @@ -1129,6 +1140,24 @@ class WireHttp extends Wire { return $this; } + /** + * Set options array given to $sanitizer->url() + * + * It should not be necessary to call this unless you are dealing with an unusual URL that is causing + * errors with the default options in WireHttp. Note that the “allowSchemes” option is set separately + * with the setAllowSchemes() method in this class. + * + * To return current validate URL options, omit the $options argument. 
+ * + * @param array $options Options to set, see the $sanitizer->url() method for details on options. + * @return array Always returns current options + * + */ + public function setValidateURLOptions(array $options = array()) { + if(!empty($options)) $this->validateURLOptions = array_merge($this->validateURLOptions, $options); + return $this->validateURLOptions; + } + /** * Return array of allowed schemes * @@ -1174,6 +1203,7 @@ class WireHttp extends Wire { * */ public function _errorHandler($errno, $errstr, $errfile, $errline, $errcontext) { + if($errfile || $errline || $errcontext) {} // ignore $this->error[] = "$errno: $errstr"; }