From 1f293cc4f4092883b4eaf915950ba3c7c5430a7a Mon Sep 17 00:00:00 2001 From: Ryan Cramer Date: Fri, 19 Jun 2020 12:42:14 -0400 Subject: [PATCH] Add new $sanitizer->flatArray() and $sanitizer->wordsArray() methods, plus some improvements to existing array() method --- wire/core/DatabaseQuerySelectFulltext.php | 397 +++++++++++++++++++--- wire/core/Sanitizer.php | 315 ++++++++++++++++- 2 files changed, 645 insertions(+), 67 deletions(-) diff --git a/wire/core/DatabaseQuerySelectFulltext.php b/wire/core/DatabaseQuerySelectFulltext.php index 0910f08b..b9738a54 100644 --- a/wire/core/DatabaseQuerySelectFulltext.php +++ b/wire/core/DatabaseQuerySelectFulltext.php @@ -74,11 +74,14 @@ class DatabaseQuerySelectFulltext extends Wire { */ protected $methodOperators = array( 'matchEquals' => array('=', '!=', '>', '<', '>=', '<='), - 'matchContains' => array('*='), - 'matchWords' => array('~=', '!~='), + 'matchContains' => array('*=', '*+=', '**=', '**+=', '^=', '$='), + 'matchWords' => array('~=', '!~=', '~+='), + 'matchContainsWords' => array('~*=', '~~=', '~|=', '~|*='), + 'matchWordsLIKE' => array('~%=', '~|%='), 'matchLIKE' => array('%='), - 'matchStart' => array('^=', '%^='), - 'matchEnd' => array('$=', '%$='), + 'matchStartLIKE' => array('%^='), + 'matchEndLIKE' => array('%$='), + 'matchCommands' => array('#='), ); /** @@ -153,7 +156,9 @@ class DatabaseQuerySelectFulltext extends Wire { * */ protected function escapeAGAINST($str) { - return str_replace(array('@', '+', '-', '*', '~', '<', '>', '(', ')', ':', '"', '&', '|', '=', '.'), ' ', $str); + $str = str_replace(array('@', '+', '-', '*', '~', '<', '>', '(', ')', ':', '"', '&', '|', '=', '.'), ' ', $str); + while(strpos($str, ' ')) $str = str_replace(' ', ' ', $str); + return $str; } /** @@ -218,8 +223,8 @@ class DatabaseQuerySelectFulltext extends Wire { */ protected function matchArrayValue(array $value) { - if($this->operator === '~=') { - throw new WireException("Operator ~= is not supported for $this->fieldName 
with OR value condition"); + if(strpos($this->operator, '~') !== false) { + throw new WireException("Operator $this->operator is not supported for $this->fieldName with OR value condition"); } // convert *= operator to %= to make the query possible (avoiding matchContains method) @@ -266,7 +271,7 @@ class DatabaseQuerySelectFulltext extends Wire { * @param string $value * */ - protected function matchStart($value) { + protected function matchStartLIKE($value) { $this->query->where("$this->tableField LIKE ?", $this->escapeLIKE($value) . '%'); } @@ -276,75 +281,146 @@ class DatabaseQuerySelectFulltext extends Wire { * @param string $value * */ - protected function matchEnd($value) { + protected function matchEndLIKE($value) { $this->query->where("$this->tableField LIKE ?", '%' . $this->escapeLIKE($value)); } /** - * Match words + * Match full words * * @param string $value * */ protected function matchWords($value) { - $words = preg_split('/[-\s,@]/', $value, -1, PREG_SPLIT_NO_EMPTY); + $partial = $this->operator === '~*='; + // note: ft_min_word_len is automatically changed to InnoDB’s equivalent when applicable + $minWordLength = (int) $this->database->getVariable('ft_min_word_len'); + $words = $this->words($value); + $mb = function_exists('mb_strlen'); foreach($words as $word) { - $len = function_exists('mb_strlen') ? mb_strlen($word) : strlen($word); - if(DatabaseStopwords::has($word) || $len < (int) $this->database->getVariable('ft_min_word_len')) { - // word is stop-word or has too short to use fulltext index - $this->matchWordLIKE($word); + $word = trim($word, '-.,'); + $len = $mb ? 
mb_strlen($word) : strlen($word); + if($len < (int) $minWordLength || $this->database->isStopword($word)) { + // word is stop-word or is too short to use fulltext index + $this->matchWordLIKE($word, $partial); } else { - $this->matchContains($word); + $this->matchContains($word, $partial); } } // force it not to match if no words if(!count($words)) $this->query->where("1>2"); } + /** + * Match words (plural) LIKE + * + * @param string $value + * @since 3.0.160 + * + */ + protected function matchWordsLIKE($value) { + $type = strpos($this->operator, '!') === 0 ? 'NOT LIKE' : 'LIKE'; + $any = strpos($this->operator, '|') !== false; + //$texts = preg_split('/[-\s,@]/', $value, -1, PREG_SPLIT_NO_EMPTY); + $words = $this->words($value); + $binds = array(); // used only in $any mode + $wheres = array(); // used only in $any mode + foreach($words as $word) { + $word = $this->escapeLIKE($word); + if(!strlen($word)) continue; + if($any) { + $bindKey = $this->query->getUniqueBindKey(); + $wheres[] = "($this->tableField $type $bindKey)"; + $binds[$bindKey] = "%$word%"; + } else { + $this->query->where("($this->tableField $type ?)", "%$word%"); + } + } + // force it not to match if no words + if(!count($words)) { + $this->query->where("1>2"); + } else if($any) { + $this->query->where(implode(' OR ', $wheres)); + $this->query->bindValues($binds); + } + } + + /** + * Match contains partial words + * + * @param string $value + * @since 3.0.160 + * + */ + protected function matchContainsWords($value) { + $tableField = $this->tableField(); + $operator = $this->operator; + $required = strpos($operator, '|') === false; + $partial = $operator != '~|='; + $booleanValue = $this->getBooleanQueryValueWords($value, $required, $partial); + $not = strpos($operator, '!') === 0; + $match = $not ? 
'NOT MATCH' : 'MATCH'; + $bindKey = $this->query->bindValueGetKey($booleanValue); + $where = "$match($tableField) AGAINST($bindKey IN BOOLEAN MODE)"; + $this->query->where($where); + } /** * Match contains string * * @param string $value + * @param bool|null $partial * */ - protected function matchContains($value) { + protected function matchContains($value, $partial = null) { $tableField = $this->tableField(); - $tableName = $this->tableName; - $fieldName = $this->fieldName; $operator = $this->operator; - $partial = strpos($operator, '~') === false; $not = strpos($operator, '!') === 0; $match = $not ? 'NOT MATCH' : 'MATCH'; $wheres = array(); - $against = $this->escapeAGAINST($value); - $booleanValue = $this->getBooleanQueryValue($value, true, $partial); $operator = ltrim($operator, '!'); + $scoreField = $this->getScoreFieldName(); + $expandAgainst = (strpos($operator, '+') !== false ? ' WITH QUERY EXPANSION' : ''); + $booleanValue = ''; + $required = true; $likeType = ''; $like = ''; - $n = 0; + + if($partial === null) { + $partial = strpos($operator, '~') === false || $operator === '~*=' || $operator === '~~='; + } - do { - $scoreField = "_score_{$tableName}_{$fieldName}" . 
(++$n); - // $locateField = "_locate_{$tableName}_{$fieldName}$n"; - } while(in_array($scoreField, self::$scoreFields)); + if(strpos($operator, '**') !== false || strpos($operator, '+') !== false) { + // match or expand + $value = implode(' ', $this->words($value, array('pluralize' => true))); + } else if($operator === '^=' || $operator === '$=') { + // starts with or ends with + } else { + // boolean value query + $booleanValue = $this->getBooleanQueryValueWords($value, $required, $partial); + } - self::$scoreFields[] = $scoreField; - - $bindKey = $this->query->bindValueGetKey($against); - $this->query->select("$match($tableField) AGAINST($bindKey) AS $scoreField"); + $against = $this->escapeAGAINST($value); + $bindKey = $this->query->bindValueGetKey($against); + $matchAgainst = "$match($tableField) AGAINST($bindKey$expandAgainst)"; + $select = "$matchAgainst AS $scoreField "; + $this->query->select($select); $this->query->orderby("$scoreField DESC"); //$query->select("LOCATE('$against', $tableField) AS $locateField"); //$query->orderby("$locateField=1 DESC"); - if($booleanValue) { + if($booleanValue == '') { + $wheres[] = $matchAgainst; + } else { $bindKey = $this->query->bindValueGetKey($booleanValue); $wheres[] = "$match($tableField) AGAINST($bindKey IN BOOLEAN MODE)"; } - if($operator == '^=' || $operator == '$=') { + // determine if we need to add LIKE conditions as a secondary qualifier to narrow + // search after rows have already been identified by the MATCH/AGAINST + if($operator === '^=' || $operator === '$=') { // starts or ends with $likeType = $not ? 'NOT RLIKE' : 'RLIKE'; $likeText = preg_quote($value); @@ -357,20 +433,25 @@ class DatabaseQuerySelectFulltext extends Wire { $like = $likeText . '[[:space:]]*[[:punct:]]*[[:space:]]*(<[^>]+>)*[[:space:]]*$'; } - } else if($operator === '*=' && (!count($wheres) || preg_match('/[-\s]/', $against))) { - // contains *= with word separators, or no existing where (boolean) conditions - $likeType = $not ? 
'NOT LIKE' : 'LIKE'; - $likeText = $this->escapeLIKE($value); - $like = "%$likeText%"; + } else if($operator === '*=') { + // contains phrase + if(!count($wheres) || preg_match('/[-\s]/', $against)) { + // contains *= with word separators, or no existing where (boolean) conditions + $likeType = $not ? 'NOT LIKE' : 'LIKE'; + $likeText = $this->escapeLIKE($value); + $like = "%$likeText%"; + } } if($like) { - // LIKE is used as a secondary qualifier, so it's not a bottleneck + // LIKE is used as a secondary qualifier, so it’s not a bottleneck $bindKey = $this->query->bindValueGetKey($like); $wheres[] = "($tableField $likeType $bindKey)"; } - if(count($wheres)) $this->query->where(implode(' AND ', $wheres)); + if(count($wheres)) { + $this->query->where(implode(' AND ', $wheres)); + } } @@ -381,15 +462,48 @@ class DatabaseQuerySelectFulltext extends Wire { * or for words that are stop words. It uses a slower REGEXP rather than fulltext index. * * @param string $word + * @param bool $partial * */ - protected function matchWordLIKE($word) { + protected function matchWordLIKE($word, $partial = false) { $word = preg_quote($word); - //$regex = "([[:blank:][:punct:]]|^)$v([[:blank:][:punct:]]|$)"; - $regex = "([[:blank:]]|[[:punct:]]|[[space]]|^)$word([[:blank:]]|[[:punct:]]|[[space]]|$)"; + $regex = "([[:blank:]]|[[:punct:]]|[[space]]|^)$word"; + if(!$partial) $regex .= "([[:blank:]]|[[:punct:]]|[[space]]|$)"; // match full word at boundary $type = strpos($this->operator, '!') === 0 ? 'NOT REGEXP' : 'REGEXP'; $this->query->where("($this->tableField $type ?)", $regex); } + + /** + * Match text using LIKE + * + * @param string $text + * @since 3.0.160 + * + */ + protected function matchTextLIKE($text) { + $text = $this->escapeLIKE($text); + $type = strpos($this->operator, '!') === 0 ? 
'NOT LIKE' : 'LIKE'; + $this->query->where("($this->tableField $type ?)", $text); + } + + /** + * Match text using boolean mode commands + * + * @param string $text + * @since 3.0.160 + * + */ + protected function matchCommands($text) { + $tableField = $this->tableField(); + $scoreField = $this->getScoreFieldName(); + $against = $this->getBooleanQueryValueCommands($text); + $bindKey = $this->query->bindValueGetKey($against); + $matchAgainst = "MATCH($tableField) AGAINST($bindKey IN BOOLEAN MODE) "; + $select = "$matchAgainst AS $scoreField "; + $this->query->select($select); + $this->query->orderby("$scoreField DESC"); + $this->query->where($matchAgainst); + } /** * Generate a boolean query value for use in an SQL MATCH/AGAINST statement. @@ -400,19 +514,194 @@ class DatabaseQuerySelectFulltext extends Wire { * @return string Value provided to the function with boolean operators added. * */ - protected function getBooleanQueryValue($value, $required = true, $partial = true) { - - $newValue = ''; + protected function getBooleanQueryValueWords($value, $required = true, $partial = true) { + + $operator = $this->operator; + $booleanValue = ''; $value = $this->escapeAGAINST($value); - $words = preg_split('/[\s,!?;]+/', $value); + $lastWord = ''; + $searchStopwords = false; - foreach($words as $k => $v) { - $v = trim($v); - if(!strlen($v) || DatabaseStopwords::has($v)) continue; - $newValue .= $required ? 
"+$v" : "$v"; - if($partial) $newValue .= "*"; - $newValue .= " "; + if($operator === '~~=' || $operator === '^=') { + // contains full words and partial last word (live search or starts with) + $words = $this->words($value, array()); + $lastWord = trim(array_pop($words)); + $partial = false; + $searchStopwords = true; + } else if($partial && $operator !== '*=') { + // contains partial words + $searchStopwords = true; + $words = $this->words($value, array( + 'pluralize' => strpos($operator, '+') !== false, + // 'singularize' => true + )); + } else { + $words = $this->words($value); } - return trim($newValue); + + foreach($words as $key => $word) { + //$word = trim($word, '-~+*,.<>@"\' '); + if(!strlen($word)) continue; + if(!$searchStopwords && $this->database->isStopword($word)) continue; + $booleanValue .= $required ? "+$word" : "$word"; + if($partial) $booleanValue .= "*"; + $booleanValue .= " "; + } + + if($lastWord !== '') { + if($required) $booleanValue .= '+'; + $booleanValue .= $lastWord . '*'; + } + + return trim($booleanValue); + } + + /** + * Generate boolean query value for matching exact phrase in order (no partials) + * + * @param string $value + * @param string $action Phrase action of blank, '+', '-' or '~' (default='') + * @return string + * + */ + protected function getBooleanQueryValueExactPhrase($value, $action = '') { + $value = $this->escapeAGAINST($value); + $words = $this->words($value); + $phrase = implode(' ', $words); + $booleanValue = '"' . $phrase . '"'; + if($action === '+' || $action === '-' || $action === '~') { + $booleanValue = $action . 
$booleanValue; + } + return $booleanValue; + } + + /** + * Get boolean query value where "+" and "-" and "*" and '"' are allowed in query to affect results + * + * @param string $value + * @return string + * + */ + protected function getBooleanQueryValueCommands($value) { + + $booleanValues = array(); + $value = str_replace(array('“', '”'), '"', $value); + + if(strpos($value, '"') !== false && preg_match_all('![-~+]?"([^"]+)"!', $value, $matches)) { + // find all quoted phrases + foreach($matches[0] as $key => $fullMatch) { + $action = strpos($fullMatch, '"') === 0 ? '' : substr($fullMatch, 0, 1); + $phrase = trim($matches[1][$key]); + if(empty($phrase)) continue; + $phrase = $this->getBooleanQueryValueExactPhrase($phrase, $action); + if(strlen($phrase)) $booleanValues[] = $phrase; + $value = str_replace($fullMatch, ' ', $value); + } + } + + $value = str_replace('"', '', $value); + $words = $this->words($value); + $value = " $value "; + + foreach($words as $word) { + $w = $this->escapeAGAINST($word); + $pregWord = preg_quote($w); + if(stripos($value, "+$word*")) { + $booleanValues[] = "+$w*"; + } else if(stripos($value, "+$word") && preg_match('!\+' . $pregWord . '\b!i', $value)) { + $booleanValues[] = "+$w"; + } else if(stripos($value, "-$word*")) { + $booleanValues[] = "-$w*"; + } else if(stripos($value, "-$word") && preg_match('!-' . $pregWord . '\b!i', $value)) { + $booleanValues[] = "-$w"; + } else { + $booleanValues[] = $w; // optional + } + } + + return implode(' ', $booleanValues); + } + + /** + * Get array of words from given value + * + * @param string $value + * @param array $options + * @return array + * + */ + protected function words($value, array $options = array()) { + + $defaults = array( + 'keepNumberFormat' => false, + 'singularize' => false, + 'pluralize' => false, + 'boolean' => false, // not currently used + ); + + $options = count($options) ? 
array_merge($defaults, $options) : $defaults; + $words = $this->wire()->sanitizer->wordsArray($value, $options); + $plural = strtolower($this->_('s')); // Suffix(es) that when appended to a word makes it plural // Separate multiple with a pipe "|" or to disable specify uppercase "X" + $plurals = strpos($plural, '|') ? explode('|', $plural) : array($plural); + + if($options['pluralize']) { + // add additional pluralized or singularized words + $addWords = array(); + foreach($words as $key => $word) { + $word = strtolower($word); + $wordLen = strlen($word); + foreach($plurals as $suffix) { + $suffixLen = strlen($suffix); + $w = ''; + if($wordLen > $suffixLen && substr($word, -1 * $suffixLen) === $suffix) { + if($options['singularize']) $w = substr($word, 0, $wordLen - $suffixLen); + } else { + // pluralize + $w = $word . $suffix; + } + if($w) { + if($options['boolean']) $w = "<$w"; + $addWords[$w] = $w; + } + } + } + if(count($addWords)) $words = array_merge($words, $addWords); + + } else if($options['singularize']) { + // singularize only by replacement + foreach($words as $key => $word) { + $word = strtolower($word); + $wordLen = strlen($word); + foreach($plurals as $suffix) { + if(stripos($word, $suffix) === false) continue; + $suffixLen = strlen($suffix); + if($wordLen <= $suffixLen) continue; + if(substr($word, -1 * $suffixLen) === $suffix) { + $word = substr($word, 0, $wordLen - $suffixLen); + if($options['boolean']) $word = "<$word"; + $words[$key] = $word; + } + } + } + } + + return $words; + } + + /** + * Get unique score field name + * + * @return string + * @since 3.0.160 + * + */ + protected function getScoreFieldName() { + $n = 0; + do { + $scoreField = "_score_{$this->tableName}_{$this->fieldName}" . 
(++$n); + // $locateField = "_locate_{$tableName}_{$fieldName}$n"; + } while(isset(self::$scoreFields[$scoreField])); + self::$scoreFields[$scoreField] = 1; + return $scoreField; } } diff --git a/wire/core/Sanitizer.php b/wire/core/Sanitizer.php index 146ee089..d308b172 100644 --- a/wire/core/Sanitizer.php +++ b/wire/core/Sanitizer.php @@ -3624,18 +3624,19 @@ class Sanitizer extends Wire { */ /** - * Sanitize array or CSV string to array of strings + * Sanitize array or CSV string to array of values, optionally sanitized by given method * - * If string specified, string delimiter may be pipe ("|"), or comma (","), unless overridden with the 'delimiter' - * or 'delimiters' option. + * If given a string, delimiter may be pipe ("|"), or comma (","), unless overridden with the `delimiter` + * or `delimiters` options. * * #pw-group-arrays * * @param array|string|mixed $value Accepts an array or CSV string. If given something else, it becomes first item in array. * @param string $sanitizer Optional Sanitizer method to apply to items in the array (default=null, aka none). * @param array $options Optional modifications to default behavior: - * `maxItems` (int): Maximum items allowed in array (default=0, which means no limit) - * The following options are only used if the provided $value is a string: + * - `maxItems` (int): Maximum items allowed in each array (default=0, which means no limit) + * - `maxDepth` (int): Max nested array depth (default=0, which means no nesting allowed) Since 3.0.160 + * - The following options are only used if the provided $value is a string: * - `delimiter` (string): Single delimiter to use to identify CSV strings. Overrides the 'delimiters' option when specified (default=null) * - `delimiters` (array): Delimiters to identify CSV strings. First found delimiter will be used, default=array("|", ",") * - `enclosure` (string): Enclosure to use for CSV strings (default=double quote, i.e. 
") @@ -3644,15 +3645,24 @@ class Sanitizer extends Wire { * */ public function ___array($value, $sanitizer = null, array $options = array()) { + + static $depth = 0; + $defaults = array( + 'maxItems' => 0, + 'maxDepth' => 0, 'delimiter' => null, 'delimiters' => array('|', ','), 'enclosure' => '"', - 'maxItems' => 0, ); + $options = array_merge($defaults, $options); - if(!is_array($value)) { - if(is_null($value)) return array(); + $clean = array(); + + if($value === null) { + return array(); + + } else if(!is_array($value)) { if(is_object($value)) { // value is object: convert to string or array if(method_exists($value, '__toString')) { @@ -3679,20 +3689,39 @@ class Sanitizer extends Wire { } if(!is_array($value)) $value = array($value); } - if($options['maxItems']) { - if(count($value) > $options['maxItems']) $value = array_slice($value, 0, abs($options['maxItems'])); + + $depth++; + foreach($value as $k => $v) { + if(!is_array($v)) continue; + if($depth <= $options['maxDepth']) { + // sanitize nested array recursively + $value[$k] = $this->array($v, $sanitizer, $options); + } else { + // remove nested array + unset($value[$k]); + } } - $clean = array(); - if(!is_null($sanitizer)) { + $depth--; + + if($options['maxItems'] && count($value) > $options['maxItems']) { + $value = array_slice($value, 0, abs($options['maxItems'])); + } + + if($sanitizer) { if(!method_exists($this, $sanitizer) && !method_exists($this, "___$sanitizer")) { throw new WireException("Unknown sanitizer method: $sanitizer"); } foreach($value as $k => $v) { - $clean[$k] = $this->$sanitizer($v); + if($options['maxDepth'] > 0 && is_array($v)) { + $clean[$k] = $v; // array already sanitized by recursive call + } else { + $clean[$k] = $this->$sanitizer($v); + } } } else { $clean = $value; } + return array_values($clean); } @@ -3818,6 +3847,266 @@ class Sanitizer extends Wire { return $data; } + /** + * Given a potentially multi-dimensional array, return a flat 1-dimensional array + * + * 
#pw-group-arrays + * + * @param array $value + * @param array $options + * - `preserveKeys` (bool): Preserve associative array keys where possible? (default=false) + * - `maxDepth` (int): Max depth of nested arrays to flatten into value, after which they are discarded (default=0). + * The default value of 0 removes any nested arrays, so specify 1 or higher to include them. + * @return array + * @since 3.0.160 + * + */ + public function flatArray($value, $options = array()) { + + static $depth = 0; + + $defaults = array( + 'preserveKeys' => is_bool($options) ? $options : false, + 'maxDepth' => 0, + ); + + if(!is_array($value)) return array($value); + + $flat = array(); + $isFlat = true; + $options = is_array($options) ? array_merge($defaults, $options) : $defaults; + $preserveKeys = $options['preserveKeys']; + + foreach($value as $key => $val) { + if(is_array($val)) $isFlat = false; + if(!$isFlat) break; + } + + if($isFlat) return $preserveKeys ? $value : array_values($value); + + $depth++; + + foreach($value as $key => $val) { + + $hasStringKey = $preserveKeys && is_string($key); + + if(!is_array($val)) { + // not an array value + if($hasStringKey) { + // associative key + list($n, $kk) = array(0, $key); + // this while loop likely is not needed + while(isset($flat[$kk])) $kk = "$key-" . 
(++$n); + $flat[$kk] = $val; + } else { + // integer key + $flat[] = $val; + } + continue; + } + + /** @var array $val At this point val is known to be an array */ + + if($depth > $options['maxDepth']) { + // skip over arrays when we are at the max recursion depth + continue; + } + + if(!$preserveKeys) { + // if keys are not preserved then we can take a shortcut + $flat = array_merge($flat, $this->flatArray($val, $options)); + continue; + } + + // array value with preserved keys + foreach($this->flatArray($val, $options) as $k => $v) { + if(is_int($k) || ctype_digit("$k")) { + // integer keys in nested array + $k = (int) $k; + if($hasStringKey) { + // parent array is associative and preserveKeys is true + do { + $kk = "$key.$k"; // parent key + incrementing child key + $k++; + } while(isset($flat[$kk]) || isset($value[$kk])); + $flat[$kk] = $v; + } else { + // parent array is non-associative + $flat[] = $v; + } + } else if(isset($value[$k]) || isset($flat[$k])) { + // associative key already exists + // create new key that marries parent and child keys + $n = -1; + do { + $kk = $key . '.' . $k; + // no match on first-round, start incrementing + if($n > -1) $kk .= '-' . $n; + $n++; + } while(isset($value[$kk]) || isset($flat[$kk])); + $flat[$kk] = $v; + } else { + // associative key that is not already taken + $flat[$k] = $v; + } + } + } + + $depth--; + + return $flat; + } + + /** + * Return array of all words in given value (excluding punctuation and other non-word characters) + * + * #pw-group-arrays + * + * @param string|array $value String containing words + * @param array $options + * - `keepNumberFormat` (bool): Keep minus/comma/period in numbers rather than splitting into words? (default=true) + * - `keepUnderscore` (bool): Keep underscores within words rather than splitting on them? (default=false) + * - `keepHyphen` (bool): Keep hyphenated words? 
(default=false) + * - `minWordLength` (int): Minimum word length (default=1) + * - `maxWordLength` (int): Maximum word length (default=80) + * - `maxWords` (int): Maximum number of words allowed (default=0, no limit) + * - `stripTags` (bool): Strip markup tags so they don’t contribute to returned word list? (default=true) + * @return array + * @since 3.0.160 + * + */ + public function wordsArray($value, array $options = array()) { + + $defaults = array( + 'minWordLength' => 1, + 'maxWordLength' => 80, + 'maxWords' => 0, + 'keepHyphen' => false, + 'keepUnderscore' => false, + 'keepNumberFormat' => true, + 'stripTags' => true, + 'getString' => false, + ); + + $options = array_merge($defaults, $options); + $minLength = (int) $options['minWordLength']; + $maxLength = (int) $options['maxWordLength']; + $replacements = array(); + $replacementPrefix = 'REP'; + $hasReplacements = false; + + if(is_array($value)) { + $value = $this->flatArray($value); + $value = implode(' ', $value); + } else if(!is_string($value)) { + $value = $this->string($value); + } + + // prevents non-bracketed tag names from also becoming words + if($options['stripTags']) $value = strip_tags($value); + + if(!strlen($value)) return array(); + + if($options['keepNumberFormat']) { + $replacements = $this->wordsArrayNumberReplacements($value, $replacementPrefix); + $hasReplacements = count($replacements); + } + + // https://www.php.net/manual/en/regexp.reference.unicode.php + // pZ=Separator (line, paragraph or space) + // pS=Symbol (all) + // pC=Other (control, format, surrogate) + // p{Pd}=Dash punctuation + // pP=Punctuation (all) + $splitWith = '.,;/\\\\*:+<>\s\pZ\pS\pC'; + if($options['keepHyphen']) { + // allow hyphen but not en-dash or em-dash + $splitWith .= '–—'; + } else { + // split on all types of dash and hyphen + $splitWith .= '\p{Pd}'; + } + if(!$options['keepUnderscore']) $splitWith .= '_'; + $regex = '!\pP*[' . $splitWith . 
']\pP*!u'; + $words = preg_split($regex, "$value ", -1, PREG_SPLIT_NO_EMPTY); + + if($words === false) { + $words = array(); + } else if($options['maxWords'] && count($words) > $options['maxWords']) { + $words = array_slice($words, 0, $options['maxWords']); + } + + foreach($words as $key => $word) { + if($hasReplacements && strpos($word, $replacementPrefix) !== false) { + $words[$key] = str_replace(array_keys($replacements), array_values($replacements), $word); + } + $length = $this->multibyteSupport ? mb_strlen($word) : strlen($word); + if($length < $minLength || $length > $maxLength) { + unset($words[$key]); + } + } + + return $words; + } + + /** + * Identify decimals, minus signs and commas in numbers, replace them, and return the replacements array + * + * @param string $value + * @param string $prefix + * @return array + * + */ + protected function wordsArrayNumberReplacements(&$value, $prefix = 'REP') { + + // keep floating point, negative, or thousands-separator numbers together + $replacements = array(); + $hasPeriod = strpos($value, '.') !== false; + $hasComma = strpos($value, ',') !== false; + $hasHyphen = strpos($value, '-') !== false; + $hasMinus = $hasHyphen || strpos($value, '−') !== false; + $hasNumber = ($hasPeriod || $hasComma || $hasHyphen) && preg_match('![-.,]\d!', $value); + + if(!$hasNumber) return array(); + + if($hasPeriod && preg_match_all('!(\b|\d*)\.(\d+)\b!', $value, $matches)) { + // keep floating point numbers together + list($n, $decimal) = array(0, "0{$prefix}DEC0X"); + while(strpos($value, $decimal) !== false && ++$n) $decimal = "{$n}{$prefix}DEC{$n}X"; + foreach($matches[1] as $key => $n1) { + $n2 = $matches[2][$key]; + $value = str_replace("$n1.$n2", "{$n1}$decimal{$n2}", $value); + } + $replacements[$decimal] = '.'; + } + + if($hasMinus && preg_match_all('!([-−])(\d+)!', $value, $matches)) { + // prevent negative numbers from losing their minus sign + list($n, $minus) = array(0, "0{$prefix}MIN0"); + while(strpos($value, $minus) 
!== false && ++$n) $minus = "{$n}{$prefix}MIN{$n}"; + foreach($matches[2] as $key => $digits) { + $sign = $matches[1][$key]; + $minusKey = $sign === '-' ? "{$minus}D" : "{$minus}M"; + $value = str_replace("$sign$digits", " $minusKey$digits", $value); + $replacements[$minusKey] = $sign; + } + } + + if($hasComma && preg_match_all('!(\d*,)(\d+)!', $value, $matches)) { + // keep commas that appear around digits + list($n, $comma) = array(0, "0{$prefix}COM0"); + while(strpos($value, $comma) !== false && ++$n) $comma = "{$n}{$prefix}COM{$n}"; + foreach($matches[1] as $key => $digits1) { + $digits1 = rtrim($digits1, ','); + $digits2 = $matches[2][$key]; + $value = str_replace("$digits1,$digits2", "$digits1{$comma}$digits2", $value); + $replacements[$comma] = ','; + } + } + + return $replacements; + } + /** * Return $value if it exists in $allowedValues, or null if it doesn't *