diff --git a/wire/core/Sanitizer.php b/wire/core/Sanitizer.php index 0ab1c3a9..427491f0 100644 --- a/wire/core/Sanitizer.php +++ b/wire/core/Sanitizer.php @@ -69,6 +69,64 @@ class Sanitizer extends Wire { */ protected $textTools = null; + /** + * UTF-8 whitespace hex codes + * + * @var array + * + */ + protected $whitespaceUTF8 = array( + '0009', // character tab + '000A', // line feed + '000B', // line tab + '000C', // form feed + '000D', // carriage return + '0020', // space + '0085', // next line + '00A0', // non-breaking space + '1680', // ogham space mark + '180E', // mongolian vowel separator + '2000', // en quad + '2001', // em quad + '2002', // en space + '2003', // em space + '2004', // three per em space + '2005', // four per em space + '2006', // six per em space + '2007', // figure space + '2008', // punctuation space + '2009', // thin space + '200A', // hair space + '200B', // zero width space + '200C', // zero width non-join + '200D', // zero width join + '2028', // line separator + '2029', // paragraph separator + '202F', // narrow non-breaking space + '205F', // medium mathematical space + '2060', // word join + '3000', // ideographic space + 'FEFF', // zero width non-breaking space + ); + /** + * HTML entities representing whitespace + * + * Note that this array is populated with all decimal/hex entities after a call to + * getWhitespaceArray() method with the $html option as true. + * + * @var array + * + */ + protected $whitespaceHTML = array( + '&nbsp;', // non-breaking space + '&ensp;', // en space + '&emsp;', // em space + '&thinsp;', // thin space + '&zwnj;', // zero width non-join + '&zwj;', // zero width join + ); + /** * Construct the sanitizer * @@ -929,10 +987,15 @@ class Sanitizer extends Wire { * - `maxLength` (int): maximum characters allowed, or 0=no max (default=255). * - `maxBytes` (int): maximum bytes allowed (default=0, which implies maxLength*4). * - `stripTags` (bool): strip markup tags? (default=true). 
- * - `stripMB4` (bool): strip emoji and other 4-byte UTF-8? (default=false). + * - `stripMB4` (bool): strip emoji and other 4-byte UTF-8? (default=false). + * - `stripSpace` (bool|string): strip whitespace? Specify true or character to replace whitespace with (default=false). + * - `reduceSpace` (bool|string): reduce consecutive whitespace to single? Specify true or character to reduce to (default=false). + * Note that the reduceSpace option is an alternative to the stripSpace option, they should not be used together. * - `allowableTags` (string): markup tags that are allowed, if stripTags is true (use same format as for PHP's `strip_tags()` function. * - `multiLine` (bool): allow multiple lines? if false, then $newlineReplacement below is applicable (default=false). - * - `newlineReplacement` (string): character to replace newlines with, OR specify boolean TRUE to remove extra lines (default=" "). + * - `convertEntities` (bool): convert HTML entities to equivalent character(s)? (default=false). + * - `newlineReplacement` (string): character to replace newlines with, OR specify boolean true to remove extra lines (default=" "). + * - `truncateTail` (bool): if truncate necessary for maxLength, truncate from end/tail? Use false to truncate head (default=true). * - `inCharset` (string): input character set (default="UTF-8"). * - `outCharset` (string): output character set (default="UTF-8"). * @return string @@ -946,17 +1009,30 @@ class Sanitizer extends Wire { 'maxBytes' => 0, // maximum bytes allowed (0 = default, which is maxLength*4) 'stripTags' => true, // strip markup tags 'stripMB4' => false, // strip Emoji and 4-byte characters? + 'stripSpace' => false, // remove/replace whitespace? If yes, specify character to replace with, or true for blank + 'reduceSpace' => false, // reduce whitespace to single? If yes, specify character to replace with or true for ' '. 
'allowableTags' => '', // tags that are allowed, if stripTags is true (use same format as for PHP's strip_tags function) 'multiLine' => false, // allow multiple lines? if false, then $newlineReplacement below is applicable + 'convertEntities' => false, // convert HTML entities to equivalent characters? 'newlineReplacement' => ' ', // character to replace newlines with, OR specify boolean TRUE to remove extra lines 'inCharset' => 'UTF-8', // input charset 'outCharset' => 'UTF-8', // output charset + 'truncateTail' => true, // if truncate necessary for maxLength/maxBytes, remove chars from tail? False to truncate from head. + 'trim' => true, // trim whitespace from beginning/end, or specify character(s) to trim, or false to disable ); - $options = array_merge($defaultOptions, $options); + $truncated = false; + $options = array_merge($defaultOptions, $options); + if(isset($options['multiline'])) $options['multiLine'] = $options['multiline']; // common case error + if(isset($options['maxlength'])) $options['maxLength'] = $options['maxlength']; // common case error if($options['maxLength'] < 0) $options['maxLength'] = 0; if($options['maxBytes'] < 0) $options['maxBytes'] = 0; + if($options['reduceSpace'] !== false && $options['stripSpace'] === false) { + // if reduceSpace option is used then provide necessary value for stripSpace option + $options['stripSpace'] = is_string($options['reduceSpace']) ? 
$options['reduceSpace'] : ' '; + } + if(!is_string($value)) $value = $this->string($value); if(!$options['multiLine']) { @@ -975,21 +1051,51 @@ class Sanitizer extends Wire { } } - if($options['stripTags']) $value = strip_tags($value, $options['allowableTags']); + if($options['stripTags']) { + $value = strip_tags($value, $options['allowableTags']); + } - if($options['inCharset'] != $options['outCharset']) $value = iconv($options['inCharset'], $options['outCharset'], $value); + if($options['inCharset'] != $options['outCharset']) { + $value = iconv($options['inCharset'], $options['outCharset'], $value); + } - if($options['stripMB4']) $value = $this->removeMB4($value); + if($options['convertEntities']) { + $value = $this->unentities($value, true, $options['outCharset']); + } + if($options['stripSpace'] !== false) { + $c = is_string($options['stripSpace']) ? $options['stripSpace'] : ''; + $allow = $options['multiLine'] ? array("\n") : array(); + $value = $this->removeWhitespace($value, array('replace' => $c, 'allow' => $allow)); + } + + if($options['stripMB4']) { + $value = $this->removeMB4($value); + } + + if($options['trim']) { + $value = is_string($options['trim']) ? 
trim($value, $options['trim']) : trim($value); + } + if($options['maxLength']) { if(empty($options['maxBytes'])) $options['maxBytes'] = $options['maxLength'] * 4; if($this->multibyteSupport) { if(mb_strlen($value, $options['outCharset']) > $options['maxLength']) { - $value = mb_substr($value, 0, $options['maxLength'], $options['outCharset']); + $truncated = true; + if($options['truncateTail']) { + $value = mb_substr($value, 0, $options['maxLength'], $options['outCharset']); + } else { + $value = mb_substr($value, -1 * $options['maxLength'], null, $options['outCharset']); + } } } else { if(strlen($value) > $options['maxLength']) { - $value = substr($value, 0, $options['maxLength']); + $truncated = true; + if($options['truncateTail']) { + $value = substr($value, 0, $options['maxLength']); + } else { + $value = substr($value, -1 * $options['maxLength']); + } } } } @@ -997,16 +1103,30 @@ class Sanitizer extends Wire { if($options['maxBytes']) { $n = $options['maxBytes']; while(strlen($value) > $options['maxBytes']) { + $truncated = true; $n--; if($this->multibyteSupport) { - $value = mb_substr($value, 0, $n, $options['outCharset']); + if($options['truncateTail']) { + $value = mb_substr($value, 0, $n, $options['outCharset']); + } else { + $value = mb_substr($value, -1 * $n, null, $options['outCharset']); // negative offset keeps the last $n characters (head is removed) + } } else { - $value = substr($value, 0, $n); + if($options['truncateTail']) { + $value = substr($value, 0, $n); + } else { + $value = substr($value, -1 * $n); // negative offset keeps the last $n bytes (head is removed) + } } } } + + if($truncated && $options['trim']) { + // secondary trim after truncation + $value = is_string($options['trim']) ? trim($value, $options['trim']) : trim($value); + } - return trim($value); + return $value; } /** @@ -1015,7 +1135,7 @@ class Sanitizer extends Wire { * - This sanitizer is useful for user-submitted text from a plain-text `