From 7bd2e3fc7f8780e4f0cb82c243d9e8de933a7183 Mon Sep 17 00:00:00 2001 From: Ryan Cramer Date: Mon, 1 Apr 2019 11:32:04 -0400 Subject: [PATCH] Add versions of many PHP string functions to WireTextTools to abstract away the mb_string test and update Sanitizer to use them where appropriate --- wire/core/Sanitizer.php | 61 +++++---- wire/core/WireTextTools.php | 264 +++++++++++++++++++++++++++++++++--- 2 files changed, 278 insertions(+), 47 deletions(-) diff --git a/wire/core/Sanitizer.php b/wire/core/Sanitizer.php index 8fb0f916..22d6365c 100644 --- a/wire/core/Sanitizer.php +++ b/wire/core/Sanitizer.php @@ -645,6 +645,8 @@ class Sanitizer extends Wire { // if UTF8 module is not enabled then delegate this call to regular pageName sanitizer if($this->wire('config')->pageNameCharset != 'UTF8') return $this->pageName($value, false, $maxLength); + + $tt = $this->getTextTools(); // we don't allow UTF8 page names to be prefixed with "xn-" if(strpos($value, 'xn-') === 0) $value = substr($value, 3); @@ -670,10 +672,10 @@ class Sanitizer extends Wire { // validate that all characters are in our whitelist $replacements = array(); - for($n = 0; $n < mb_strlen($value); $n++) { - $c = mb_substr($value, $n, 1); - $inBlacklist = mb_strpos($blacklist, $c) !== false || strpos($blacklist, $c) !== false; - $inWhitelist = !$inBlacklist && $whitelist !== false && mb_strpos($whitelist, $c) !== false; + for($n = 0; $n < $tt->strlen($value); $n++) { + $c = $tt->substr($value, $n, 1); + $inBlacklist = $tt->strpos($blacklist, $c) !== false || strpos($blacklist, $c) !== false; + $inWhitelist = !$inBlacklist && $whitelist !== false && $tt->strpos($whitelist, $c) !== false; if($inWhitelist && !$inBlacklist) { // in whitelist } else if($inBlacklist || !strlen(trim($c)) || ctype_cntrl($c)) { @@ -681,14 +683,14 @@ class Sanitizer extends Wire { $replacements[] = $c; } else { // character that is not in whitelist, double check case variants - $cLower = mb_strtolower($c); - $cUpper = 
mb_strtoupper($c); - if($cLower !== $c && mb_strpos($whitelist, $cLower) !== false) { + $cLower = $tt->strtolower($c); + $cUpper = $tt->strtoupper($c); + if($cLower !== $c && $tt->strpos($whitelist, $cLower) !== false) { // allow character and convert to lowercase variant - $value = mb_substr($value, 0, $n) . $cLower . mb_substr($value, $n+1); - } else if($cUpper !== $c && mb_strpos($whitelist, $cUpper) !== false) { + $value = $tt->substr($value, 0, $n) . $cLower . $tt->substr($value, $n+1); + } else if($cUpper !== $c && $tt->strpos($whitelist, $cUpper) !== false) { // allow character and convert to uppercase varient - $value = mb_substr($value, 0, $n) . $cUpper . mb_substr($value, $n+1); + $value = $tt->substr($value, 0, $n) . $cUpper . $tt->substr($value, $n+1); } else { // queue character to be replaced $replacements[] = $c; @@ -709,7 +711,7 @@ class Sanitizer extends Wire { // trim off any remaining separators/extras $value = trim($value, '-_.'); - if(mb_strlen($value) > $maxLength) $value = mb_substr($value, 0, $maxLength); + if($tt->strlen($value) > $maxLength) $value = $tt->substr($value, 0, $maxLength); return $value; } @@ -762,6 +764,7 @@ class Sanitizer extends Wire { // exclude values that don't need to be converted if(strpos($value, 'xn-') === 0) return $value; if(ctype_alnum(str_replace(array('.', '-', '_'), '', $value))) return $value; + $tt = $this->getTextTools(); while(strpos($value, '__') !== false) { $value = str_replace('__', '_', $value); @@ -771,8 +774,8 @@ class Sanitizer extends Wire { $_value = $value; $parts = array(); while(strlen($_value)) { - $part = mb_substr($_value, 0, 12); - $_value = mb_substr($_value, 12); + $part = $tt->substr($_value, 0, 12); + $_value = $tt->substr($_value, 12); $parts[] = $this->punyEncodeName($part); } $value = implode('__', $parts); @@ -2099,8 +2102,8 @@ class Sanitizer extends Wire { // first, replace common entities that can possibly remain $entities = array(''' => "'"); $str = 
str_ireplace(array_keys($entities), array_values($entities), $str); - if(strpos($str, '&#') !== false) { - // manually convert decimal and hex entities + if(strpos($str, '&#') !== false && $this->multibyteSupport) { + // manually convert decimal and hex entities (when possible) $str = preg_replace_callback('/(&#[0-9A-F]+;)/i', function($matches) use($encoding) { return mb_convert_encoding($matches[1], $encoding, "HTML-ENTITIES"); }, $str); @@ -2337,8 +2340,8 @@ class Sanitizer extends Wire { */ public function trim($str, $chars = '') { - $mb = $this->multibyteSupport; - $len = $mb ? mb_strlen($str) : strlen($str); + $tt = $this->getTextTools(); + $len = $tt->strlen($str); if(!$len) return $str; if(is_array($chars) && !count($chars)) $chars = ''; $trims = array(); @@ -2355,9 +2358,9 @@ class Sanitizer extends Wire { if(is_array($chars)) { $trims = $chars; } else { - for($n = 0; $n < mb_strlen($str); $n++) { - $trim = $mb ? mb_substr($chars, $n, 1) : substr($chars, $n, 1); - $trimLen = $mb ? mb_strlen($trim) : strlen($trim); + for($n = 0; $n < $tt->strlen($str); $n++) { + $trim = $tt->substr($chars, $n, 1); + $trimLen = $tt->strlen($trim); if($trimLen) $trims[] = $trim; } } @@ -2369,7 +2372,7 @@ class Sanitizer extends Wire { $numRemovedEnd = 0; // num removed from end foreach($trims as $trimKey => $trim) { - $trimPos = $mb ? mb_strpos($str, $trim) : strpos($str, $trim); + $trimPos = $tt->strpos($str, $trim); // if trim not present anywhere in string it can be removed from our trims list if($trimPos === false) { @@ -2378,23 +2381,23 @@ class Sanitizer extends Wire { } // at this point we know the trim character is present somewhere in the string - $trimLen = $mb ? mb_strlen($trim) : strlen($trim); + $trimLen = $tt->strlen($trim); // while this trim character matches at beginning of string, remove it while($trimPos === 0) { - $str = $mb ? mb_substr($str, $trimLen) : substr($str, $trimLen); - $trimPos = $mb ? 
mb_strpos($str, $trim) : strpos($str, $trim); + $str = $tt->substr($str, $trimLen); + $trimPos = $tt->strpos($str, $trim); $numRemovedStart++; } // trim from end if($trimPos > 0) do { $x = 0; // qty removed only in this do/while iteration - $trimPos = $mb ? mb_strrpos($str, $trim) : strrpos($str, $trim); + $trimPos = $tt->strrpos($str, $trim); if($trimPos === false) break; - $strLen = $mb ? mb_strlen($str) : strlen($str); + $strLen = $tt->strlen($str); if($trimPos + $trimLen >= $strLen) { - $str = $mb ? mb_substr($str, 0, $trimPos) : substr($str, 0, $trimPos); + $str = $tt->substr($str, 0, $trimPos); $numRemovedEnd++; $x++; } @@ -2405,7 +2408,7 @@ class Sanitizer extends Wire { } // foreach - $strLen = $mb ? mb_strlen($str) : strlen($str); + $strLen = $tt->strlen($str); } while($numRemovedStart + $numRemovedEnd > 0 && $strLen > 0); @@ -2743,7 +2746,7 @@ class Sanitizer extends Wire { * */ public function chars($value, $allow = '', $replacement = '', $collapse = true, $mb = null) { - + $value = $this->string($value); $alpha = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'; $digit = '0123456789'; diff --git a/wire/core/WireTextTools.php b/wire/core/WireTextTools.php index 7da52fff..58315c3d 100644 --- a/wire/core/WireTextTools.php +++ b/wire/core/WireTextTools.php @@ -14,6 +14,23 @@ */ class WireTextTools extends Wire { + + /** + * mbstring support? + * + * @var bool + * + */ + protected $mb; + + /** + * Construct + * + */ + public function __construct() { + $this->mb = function_exists("mb_internal_encoding"); + parent::__construct(); + } /** * Convert HTML markup to readable text @@ -387,7 +404,7 @@ class WireTextTools extends Wire { $maxLength = $options['maxLength']; } else if(is_string($maxLength) && ctype_alpha($maxLength)) { $options['type'] = $maxLength; - $maxLength = isset($options['maxLength']) ? $options['maxLength'] : mb_strlen($str); + $maxLength = isset($options['maxLength']) ? 
$options['maxLength'] : $this->strlen($str); } if(!$maxLength) $maxLength = 255; @@ -406,14 +423,14 @@ class WireTextTools extends Wire { } if($type === 'block') { - if(mb_strpos($str, $blockEndChar) !== false) $str = str_replace($blockEndChar, ' ', $str); + if($this->strpos($str, $blockEndChar) !== false) $str = str_replace($blockEndChar, ' ', $str); $options['endBlocksWith'] = $blockEndChar; } // collapse whitespace and strip tags $str = $this->collapse($str, $options); - if(trim($options['collapseLinesWith']) && mb_strpos($str, $options['collapseLinesWith'])) { + if(trim($options['collapseLinesWith']) && $this->strpos($str, $options['collapseLinesWith'])) { // if lines are collapsed with something other than whitespace, avoid using that string // when the line already ends with sentence punctuation foreach($endSentenceChars as $c) { @@ -422,26 +439,26 @@ class WireTextTools extends Wire { } // if anything above reduced the length of the string enough, return it now - if(mb_strlen($str) <= $maxLength) return $str; + if($this->strlen($str) <= $maxLength) return $str; // get string at maximum possible length if($options['visible']) { // adjust for only visible length $_str = $str; - $str = mb_substr($str, 0, $maxLength); + $str = $this->substr($str, 0, $maxLength); $len = $this->getVisibleLength($str); if($len < $maxLength) { $maxLength += ($maxLength - $len); - $str = mb_substr($_str, 0, $maxLength); + $str = $this->substr($_str, 0, $maxLength); } unset($_str); } else { - $str = mb_substr($str, 0, $maxLength); + $str = $this->substr($str, 0, $maxLength); } // match to closest blocks, like paragraph(s) if($type === 'block') { - $pos = $options['maximize'] ? mb_strrpos($str, $blockEndChar) : mb_strpos($str, $blockEndChar); + $pos = $options['maximize'] ? 
$this->strrpos($str, $blockEndChar) : $this->strpos($str, $blockEndChar); if($pos === false) { $type = 'sentence'; } else { @@ -459,7 +476,7 @@ class WireTextTools extends Wire { // find punctuation closes to end of string if($type === 'punctuation') { foreach($punctuationChars as $find) { - $pos = $options['maximize'] ? mb_strrpos($str, $find) : mb_strpos($str, $find); + $pos = $options['maximize'] ? $this->strrpos($str, $find) : $this->strpos($str, $find); if($pos) $tests[] = $pos; } if(!count($tests)) $type = 'word'; @@ -467,7 +484,7 @@ class WireTextTools extends Wire { // find whitespace and last word closest to end of string if($type === 'word' || !count($tests)) { - $pos = $options['maximize'] ? mb_strrpos($str, ' ') : mb_strpos($str, ' '); + $pos = $options['maximize'] ? $this->strrpos($str, ' ') : $this->strpos($str, ' '); if($pos) $tests[] = $pos; } @@ -486,8 +503,8 @@ class WireTextTools extends Wire { // process our tests do { $pos = array_pop($tests); - $result = trim(mb_substr($str, 0, $pos + 1)); - $lastChar = mb_substr($result, -1); + $result = trim($this->substr($str, 0, $pos + 1)); + $lastChar = $this->substr($result, -1); $result = rtrim($result, $options['trim']); if($type === 'sentence' || $type === 'block') { @@ -497,7 +514,7 @@ class WireTextTools extends Wire { } else if(in_array($lastChar, $punctuationChars)) { $trims = ' '; foreach($punctuationChars as $c) { - if(mb_strpos($options['noTrim'], $c) !== false) continue; + if($this->strpos($options['noTrim'], $c) !== false) continue; if(in_array($c, $endSentenceChars)) continue; $trims .= $c; } @@ -548,18 +565,18 @@ class WireTextTools extends Wire { foreach($chars as $find) { - $pos = $options['maximize'] ? mb_strrpos($thisStr, "$find ") : mb_strpos($thisStr, "$find ", $offset); + $pos = $options['maximize'] ? 
$this->strrpos($thisStr, "$find ") : $this->strpos($thisStr, "$find ", $offset); if(!$pos) continue; if($find === '.') { - $testStr = mb_substr($thisStr, 0, $pos + 1); + $testStr = $this->substr($thisStr, 0, $pos + 1); if(preg_match($noEndRegex, $testStr, $matches)) { // ends with a disallowed word, next time try to match with a shorter string if($options['maximize']) { - $nextStr = mb_substr($testStr, 0, mb_strlen($testStr) - mb_strlen($matches[1]) - 1); + $nextStr = $this->substr($testStr, 0, $this->strlen($testStr) - $this->strlen($matches[1]) - 1); } else { - $nextOffset = mb_strlen($testStr); + $nextOffset = $this->strlen($testStr); } continue; } @@ -585,7 +602,7 @@ class WireTextTools extends Wire { if(strpos($str, '&') !== false && strpos($str, ';')) { $str = html_entity_decode($str, ENT_QUOTES, 'UTF-8'); } - return mb_strlen($str); + return $this->strlen($str); } /** @@ -755,5 +772,216 @@ class WireTextTools extends Wire { return $str; } + + /*********************************************************************************************************** + * MULTIBYTE PHP STRING FUNCTIONS THAT FALLBACK WHEN MBSTRING NOT AVAILABLE + * + * These duplicate the equivalent PHP string methods and use exactly the same arguments + * and exhibit exactly the same behavior. The only difference is that these methods use + * the multibyte string versions when they are available, and fall back to the regular PHP + * string methods when not. Use these functions only when that behavior is okay. + * + */ + + /** + * Get part of a string + * + * #pw-group-PHP-function-alternates + * + * @param string $str + * @param int $start + * @param int|null $length Max chars to use from str. If omitted or NULL, extract all characters to the end of the string. + * @return string + * @see https://www.php.net/manual/en/function.substr.php + * + */ + public function substr($str, $start, $length = null) { + return $this->mb ? 
mb_substr($str, $start, $length) : ($length === null ? substr($str, $start) : substr($str, $start, $length)); + } + + /** + * Find position of first occurrence of string in a string + * + * #pw-group-PHP-function-alternates + * + * @param string $haystack + * @param string $needle + * @param int $offset + * @return bool|false|int + * @see https://www.php.net/manual/en/function.strpos.php + * + */ + public function strpos($haystack, $needle, $offset = 0) { + return $this->mb ? mb_strpos($haystack, $needle, $offset) : strpos($haystack, $needle, $offset); + } + + /** + * Find the position of the first occurrence of a case-insensitive substring in a string + * + * #pw-group-PHP-function-alternates + * + * @param string $haystack + * @param string $needle + * @param int $offset + * @return bool|false|int + * @see https://www.php.net/manual/en/function.stripos.php + * + */ + public function stripos($haystack, $needle, $offset = 0) { + return $this->mb ? mb_stripos($haystack, $needle, $offset) : stripos($haystack, $needle, $offset); + } + + /** + * Find the position of the last occurrence of a substring in a string + * + * #pw-group-PHP-function-alternates + * + * @param string $haystack + * @param string $needle + * @param int $offset + * @return bool|false|int + * @see https://www.php.net/manual/en/function.strrpos.php + * + */ + public function strrpos($haystack, $needle, $offset = 0) { + return $this->mb ? mb_strrpos($haystack, $needle, $offset) : strrpos($haystack, $needle, $offset); + } + + /** + * Find the position of the last occurrence of a case-insensitive substring in a string + * + * #pw-group-PHP-function-alternates + * + * @param string $haystack + * @param string $needle + * @param int $offset + * @return bool|false|int + * @see https://www.php.net/manual/en/function.strripos.php + * + */ + public function strripos($haystack, $needle, $offset = 0) { + return $this->mb ? 
mb_strripos($haystack, $needle, $offset) : strripos($haystack, $needle, $offset); + } + + /** + * Get string length + * + * #pw-group-PHP-function-alternates + * + * @param string $str + * @return int + * @see https://www.php.net/manual/en/function.strlen.php + * + */ + public function strlen($str) { + return $this->mb ? mb_strlen($str) : strlen($str); + } + + /** + * Make a string lowercase + * + * #pw-group-PHP-function-alternates + * + * @param string $str + * @return string + * @see https://www.php.net/manual/en/function.strtolower.php + * + */ + public function strtolower($str) { + return $this->mb ? mb_strtolower($str) : strtolower($str); + } + + /** + * Make a string uppercase + * + * #pw-group-PHP-function-alternates + * + * @param string $str + * @return string + * @see https://www.php.net/manual/en/function.strtoupper.php + * + */ + public function strtoupper($str) { + return $this->mb ? mb_strtoupper($str) : strtoupper($str); + } + + /** + * Count the number of substring occurrences + * + * #pw-group-PHP-function-alternates + * + * @param string $haystack + * @param string $needle + * @return int + * @see https://www.php.net/manual/en/function.substr-count.php + * + */ + public function substrCount($haystack, $needle) { + return $this->mb ? mb_substr_count($haystack, $needle) : substr_count($haystack, $needle); + } + + /** + * Find the first occurrence of a string + * + * #pw-group-PHP-function-alternates + * + * @param string $haystack + * @param string $needle + * @param bool $beforeNeedle Return part of haystack before first occurrence of the needle? (default=false) + * @return false|string + * @see https://www.php.net/manual/en/function.strstr.php + * + */ + public function strstr($haystack, $needle, $beforeNeedle = false) { + return $this->mb ? 
mb_strstr($haystack, $needle, $beforeNeedle) : strstr($haystack, $needle, $beforeNeedle); + } + + /** + * Find the first occurrence of a string (case insensitive) + * + * #pw-group-PHP-function-alternates + * + * @param string $haystack + * @param string $needle + * @param bool $beforeNeedle Return part of haystack before first occurrence of the needle? (default=false) + * @return false|string + * @see https://www.php.net/manual/en/function.stristr.php + * + */ + public function stristr($haystack, $needle, $beforeNeedle = false) { + return $this->mb ? mb_stristr($haystack, $needle, $beforeNeedle) : stristr($haystack, $needle, $beforeNeedle); + } + + + /** + * Find the last occurrence of a character in a string + * + * #pw-group-PHP-function-alternates + * + * @param string $haystack + * @param string $needle Only first given character used + * @return false|string + * @see https://www.php.net/manual/en/function.strrchr.php + * + */ + public function strrchr($haystack, $needle) { + return $this->mb ? mb_strrchr($haystack, $needle) : strrchr($haystack, $needle); + } + + /** + * Strip whitespace (or other characters) from the beginning and end of a string + * + * #pw-group-PHP-function-alternates + * + * @param string $str + * @param string $chars Omit for default + * @return string + * + */ + public function trim($str, $chars = '') { + if(!$this->mb) return $chars === '' ? trim($str) : trim($str, $chars); + return $this->wire('sanitizer')->trim($str, $chars); + } + }