1
0
mirror of https://github.com/processwire/processwire.git synced 2025-08-13 18:24:57 +02:00

Add new $sanitizer->flatArray() and $sanitizer->wordsArray() methods, plus some improvements to existing array() method

This commit is contained in:
Ryan Cramer
2020-06-19 12:42:14 -04:00
parent d79d9286b1
commit 1f293cc4f4
2 changed files with 645 additions and 67 deletions

View File

@@ -74,11 +74,14 @@ class DatabaseQuerySelectFulltext extends Wire {
*/
protected $methodOperators = array(
'matchEquals' => array('=', '!=', '>', '<', '>=', '<='),
'matchContains' => array('*='),
'matchWords' => array('~=', '!~='),
'matchContains' => array('*=', '*+=', '**=', '**+=', '^=', '$='),
'matchWords' => array('~=', '!~=', '~+='),
'matchContainsWords' => array('~*=', '~~=', '~|=', '~|*='),
'matchWordsLIKE' => array('~%=', '~|%='),
'matchLIKE' => array('%='),
'matchStart' => array('^=', '%^='),
'matchEnd' => array('$=', '%$='),
'matchStartLIKE' => array('%^='),
'matchEndLIKE' => array('%$='),
'matchCommands' => array('#='),
);
/**
@@ -153,7 +156,9 @@ class DatabaseQuerySelectFulltext extends Wire {
*
*/
protected function escapeAGAINST($str) {
	// Replace each character in this list with a space so that none of them
	// reach the MATCH/AGAINST expression
	$str = str_replace(array('@', '+', '-', '*', '~', '<', '>', '(', ')', ':', '"', '&', '|', '=', '.'), ' ', $str);
	// Collapse the runs of spaces that the replacement above can leave behind.
	// A single regex pass is used rather than a strpos() loop: a truthiness test on
	// strpos() misses a match at offset 0 and can loop forever if nothing changes.
	$str = preg_replace('/ {2,}/', ' ', $str);
	return $str;
}
/**
@@ -218,8 +223,8 @@ class DatabaseQuerySelectFulltext extends Wire {
*/
protected function matchArrayValue(array $value) {
if($this->operator === '~=') {
throw new WireException("Operator ~= is not supported for $this->fieldName with OR value condition");
if(strpos($this->operator, '~') !== false) {
throw new WireException("Operator $this->operator is not supported for $this->fieldName with OR value condition");
}
// convert *= operator to %= to make the query possible (avoiding matchContains method)
@@ -266,7 +271,7 @@ class DatabaseQuerySelectFulltext extends Wire {
* @param string $value
*
*/
protected function matchStart($value) {
protected function matchStartLIKE($value) {
$this->query->where("$this->tableField LIKE ?", $this->escapeLIKE($value) . '%');
}
@@ -276,75 +281,146 @@ class DatabaseQuerySelectFulltext extends Wire {
* @param string $value
*
*/
protected function matchEnd($value) {
protected function matchEndLIKE($value) {
$this->query->where("$this->tableField LIKE ?", '%' . $this->escapeLIKE($value));
}
/**
* Match words
* Match full words
*
* @param string $value
*
*/
protected function matchWords($value) {
$words = preg_split('/[-\s,@]/', $value, -1, PREG_SPLIT_NO_EMPTY);
$partial = $this->operator === '~*=';
// note: ft_min_word_len is automatically changed to InnoDBs equivalent when applicable
$minWordLength = (int) $this->database->getVariable('ft_min_word_len');
$words = $this->words($value);
$mb = function_exists('mb_strlen');
foreach($words as $word) {
$len = function_exists('mb_strlen') ? mb_strlen($word) : strlen($word);
if(DatabaseStopwords::has($word) || $len < (int) $this->database->getVariable('ft_min_word_len')) {
// word is stop-word or has too short to use fulltext index
$this->matchWordLIKE($word);
$word = trim($word, '-.,');
$len = $mb ? mb_strlen($word) : strlen($word);
if($len < (int) $minWordLength || $this->database->isStopword($word)) {
// word is stop-word or is too short to use fulltext index
$this->matchWordLIKE($word, $partial);
} else {
$this->matchContains($word);
$this->matchContains($word, $partial);
}
}
// force it not to match if no words
if(!count($words)) $this->query->where("1>2");
}
/**
 * Match words (plural) LIKE
 *
 * Adds a "%word%" LIKE condition for each word found in $value:
 *
 * - When the operator contains a pipe "|", the per-word conditions are joined with OR
 *   and added as one parenthesized where() condition.
 * - Otherwise each word is added as its own where() condition.
 * - When the operator begins with "!", NOT LIKE is used rather than LIKE.
 * - When no usable words are found, the never-true condition "1>2" is added instead.
 *
 * @param string $value
 * @since 3.0.160
 *
 */
protected function matchWordsLIKE($value) {
	$type = strpos($this->operator, '!') === 0 ? 'NOT LIKE' : 'LIKE';
	$any = strpos($this->operator, '|') !== false;
	$binds = array(); // used only in $any mode
	$wheres = array(); // used only in $any mode
	$numWords = 0; // quantity of words that actually produced a condition

	foreach($this->words($value) as $word) {
		$word = $this->escapeLIKE($word);
		if(!strlen($word)) continue;
		$numWords++;
		if($any) {
			$bindKey = $this->query->getUniqueBindKey();
			$wheres[] = "($this->tableField $type $bindKey)";
			$binds[$bindKey] = "%$word%";
		} else {
			$this->query->where("($this->tableField $type ?)", "%$word%");
		}
	}

	if(!$numWords) {
		// force it not to match if no words produced a condition
		$this->query->where("1>2");
	} else if($any) {
		// parenthesized so the OR group remains a single unit when it is
		// combined with any other where conditions on the query
		$this->query->where('(' . implode(' OR ', $wheres) . ')');
		$this->query->bindValues($binds);
	}
}
/**
 * Match contains words (optionally partial) using a boolean mode fulltext query
 *
 * The current operator determines how the boolean value is built:
 *
 * - Words are required unless the operator contains a pipe "|".
 * - Words are matched partially unless the operator is "~|=".
 * - NOT MATCH is used when the operator begins with "!".
 *
 * @param string $value
 * @since 3.0.160
 *
 */
protected function matchContainsWords($value) {
	$field = $this->tableField();
	$op = $this->operator;

	$allRequired = strpos($op, '|') === false;
	$allowPartial = !($op == '~|=');
	$against = $this->getBooleanQueryValueWords($value, $allRequired, $allowPartial);

	$keyword = strpos($op, '!') === 0 ? 'NOT MATCH' : 'MATCH';
	$key = $this->query->bindValueGetKey($against);

	$this->query->where("$keyword($field) AGAINST($key IN BOOLEAN MODE)");
}
/**
* Match contains string
*
* @param string $value
* @param bool|null $partial
*
*/
protected function matchContains($value) {
protected function matchContains($value, $partial = null) {
$tableField = $this->tableField();
$tableName = $this->tableName;
$fieldName = $this->fieldName;
$operator = $this->operator;
$partial = strpos($operator, '~') === false;
$not = strpos($operator, '!') === 0;
$match = $not ? 'NOT MATCH' : 'MATCH';
$wheres = array();
$against = $this->escapeAGAINST($value);
$booleanValue = $this->getBooleanQueryValue($value, true, $partial);
$operator = ltrim($operator, '!');
$scoreField = $this->getScoreFieldName();
$expandAgainst = (strpos($operator, '+') !== false ? ' WITH QUERY EXPANSION' : '');
$booleanValue = '';
$required = true;
$likeType = '';
$like = '';
$n = 0;
if($partial === null) {
$partial = strpos($operator, '~') === false || $operator === '~*=' || $operator === '~~=';
}
do {
$scoreField = "_score_{$tableName}_{$fieldName}" . (++$n);
// $locateField = "_locate_{$tableName}_{$fieldName}$n";
} while(in_array($scoreField, self::$scoreFields));
if(strpos($operator, '**') !== false || strpos($operator, '+') !== false) {
// match or expand
$value = implode(' ', $this->words($value, array('pluralize' => true)));
} else if($operator === '^=' || $operator === '$=') {
// starts with or ends with
} else {
// boolean value query
$booleanValue = $this->getBooleanQueryValueWords($value, $required, $partial);
}
self::$scoreFields[] = $scoreField;
$bindKey = $this->query->bindValueGetKey($against);
$this->query->select("$match($tableField) AGAINST($bindKey) AS $scoreField");
$against = $this->escapeAGAINST($value);
$bindKey = $this->query->bindValueGetKey($against);
$matchAgainst = "$match($tableField) AGAINST($bindKey$expandAgainst)";
$select = "$matchAgainst AS $scoreField ";
$this->query->select($select);
$this->query->orderby("$scoreField DESC");
//$query->select("LOCATE('$against', $tableField) AS $locateField");
//$query->orderby("$locateField=1 DESC");
if($booleanValue) {
if($booleanValue == '') {
$wheres[] = $matchAgainst;
} else {
$bindKey = $this->query->bindValueGetKey($booleanValue);
$wheres[] = "$match($tableField) AGAINST($bindKey IN BOOLEAN MODE)";
}
if($operator == '^=' || $operator == '$=') {
// determine if we need to add LIKE conditions as a secondary qualifier to narrow
// search after rows have already been identified by the MATCH/AGAINST
if($operator === '^=' || $operator === '$=') {
// starts or ends with
$likeType = $not ? 'NOT RLIKE' : 'RLIKE';
$likeText = preg_quote($value);
@@ -357,20 +433,25 @@ class DatabaseQuerySelectFulltext extends Wire {
$like = $likeText . '[[:space:]]*[[:punct:]]*[[:space:]]*(<[^>]+>)*[[:space:]]*$';
}
} else if($operator === '*=' && (!count($wheres) || preg_match('/[-\s]/', $against))) {
// contains *= with word separators, or no existing where (boolean) conditions
$likeType = $not ? 'NOT LIKE' : 'LIKE';
$likeText = $this->escapeLIKE($value);
$like = "%$likeText%";
} else if($operator === '*=') {
// contains phrase
if(!count($wheres) || preg_match('/[-\s]/', $against)) {
// contains *= with word separators, or no existing where (boolean) conditions
$likeType = $not ? 'NOT LIKE' : 'LIKE';
$likeText = $this->escapeLIKE($value);
$like = "%$likeText%";
}
}
if($like) {
// LIKE is used as a secondary qualifier, so it's not a bottleneck
// LIKE is used as a secondary qualifier, so it's not a bottleneck
$bindKey = $this->query->bindValueGetKey($like);
$wheres[] = "($tableField $likeType $bindKey)";
}
if(count($wheres)) $this->query->where(implode(' AND ', $wheres));
if(count($wheres)) {
$this->query->where(implode(' AND ', $wheres));
}
}
@@ -381,15 +462,48 @@ class DatabaseQuerySelectFulltext extends Wire {
* or for words that are stop words. It uses a slower REGEXP rather than fulltext index.
*
* @param string $word
* @param bool $partial
*
*/
protected function matchWordLIKE($word) {
protected function matchWordLIKE($word, $partial = false) {
$word = preg_quote($word);
//$regex = "([[:blank:][:punct:]]|^)$v([[:blank:][:punct:]]|$)";
$regex = "([[:blank:]]|[[:punct:]]|[[space]]|^)$word([[:blank:]]|[[:punct:]]|[[space]]|$)";
$regex = "([[:blank:]]|[[:punct:]]|[[space]]|^)$word";
if(!$partial) $regex .= "([[:blank:]]|[[:punct:]]|[[space]]|$)"; // match full word at boundary
$type = strpos($this->operator, '!') === 0 ? 'NOT REGEXP' : 'REGEXP';
$this->query->where("($this->tableField $type ?)", $regex);
}
/**
 * Match text using LIKE
 *
 * The given text is escaped for LIKE and bound as-is; this method adds no wildcard
 * characters of its own. NOT LIKE is used when the operator begins with "!".
 *
 * @param string $text
 * @since 3.0.160
 *
 */
protected function matchTextLIKE($text) {
	$pattern = $this->escapeLIKE($text);
	$negate = strpos($this->operator, '!') === 0;
	$keyword = $negate ? 'NOT LIKE' : 'LIKE';
	$this->query->where("($this->tableField $keyword ?)", $pattern);
}
/**
 * Match text using boolean mode commands
 *
 * Builds a boolean mode MATCH/AGAINST expression from the given text and uses it three
 * ways: selected under a unique score field name, ordered by that score (descending),
 * and added as a where condition.
 *
 * @param string $text
 * @since 3.0.160
 *
 */
protected function matchCommands($text) {
	$field = $this->tableField();
	$score = $this->getScoreFieldName();
	$commands = $this->getBooleanQueryValueCommands($text);
	$key = $this->query->bindValueGetKey($commands);

	// the same expression (including its trailing space) serves as both score and filter
	$expression = "MATCH($field) AGAINST($key IN BOOLEAN MODE) ";

	$this->query->select($expression . " AS $score ");
	$this->query->orderby("$score DESC");
	$this->query->where($expression);
}
/**
* Generate a boolean query value for use in an SQL MATCH/AGAINST statement.
@@ -400,19 +514,194 @@ class DatabaseQuerySelectFulltext extends Wire {
* @return string Value provided to the function with boolean operators added.
*
*/
protected function getBooleanQueryValue($value, $required = true, $partial = true) {
$newValue = '';
protected function getBooleanQueryValueWords($value, $required = true, $partial = true) {
$operator = $this->operator;
$booleanValue = '';
$value = $this->escapeAGAINST($value);
$words = preg_split('/[\s,!?;]+/', $value);
$lastWord = '';
$searchStopwords = false;
foreach($words as $k => $v) {
$v = trim($v);
if(!strlen($v) || DatabaseStopwords::has($v)) continue;
$newValue .= $required ? "+$v" : "$v";
if($partial) $newValue .= "*";
$newValue .= " ";
if($operator === '~~=' || $operator === '^=') {
// contains full words and partial last word (live search or starts with)
$words = $this->words($value, array());
$lastWord = trim(array_pop($words));
$partial = false;
$searchStopwords = true;
} else if($partial && $operator !== '*=') {
// contains partial words
$searchStopwords = true;
$words = $this->words($value, array(
'pluralize' => strpos($operator, '+') !== false,
// 'singularize' => true
));
} else {
$words = $this->words($value);
}
return trim($newValue);
foreach($words as $key => $word) {
//$word = trim($word, '-~+*,.<>@"\' ');
if(!strlen($word)) continue;
if(!$searchStopwords && $this->database->isStopword($word)) continue;
$booleanValue .= $required ? "+$word" : "$word";
if($partial) $booleanValue .= "*";
$booleanValue .= " ";
}
if($lastWord !== '') {
if($required) $booleanValue .= '+';
$booleanValue .= $lastWord . '*';
}
return trim($booleanValue);
}
/**
 * Generate boolean query value for matching exact phrase in order (no partials)
 *
 * The value is escaped, reduced to its words, and the words are joined by single spaces
 * inside double quotes. A recognized action character is placed before the opening quote.
 *
 * @param string $value
 * @param string $action Phrase action of blank, '+', '-' or '~' (default=''); any other value is ignored
 * @return string
 *
 */
protected function getBooleanQueryValueExactPhrase($value, $action = '') {
	$words = $this->words($this->escapeAGAINST($value));
	$quoted = '"' . implode(' ', $words) . '"';
	$hasAction = in_array($action, array('+', '-', '~'), true);
	return $hasAction ? $action . $quoted : $quoted;
}
/**
 * Get boolean query value where "+" and "-" and "*" and '"' are allowed in query to affect results
 *
 * Quoted phrases (optionally preceded by "-", "~" or "+") are converted to exact phrase
 * values and removed from the value first. Each remaining word is then emitted as one of
 * "+word*", "+word", "-word*", "-word" or plain "word", depending on how it appears in
 * the value. All resulting terms are joined by a single space.
 *
 * @param string $value
 * @return string
 *
 */
protected function getBooleanQueryValueCommands($value) {
	$terms = array();

	// treat curly double quotes the same as straight double quotes
	$value = str_replace(array('“', '”'), '"', $value);

	$hasQuote = strpos($value, '"') !== false;
	if($hasQuote && preg_match_all('![-~+]?"([^"]+)"!', $value, $found)) {
		// pull out all quoted phrases
		foreach($found[0] as $i => $quoted) {
			$inner = trim($found[1][$i]);
			if(empty($inner)) continue;
			// when the match does not begin with the quote, its first character is the action
			$prefix = strpos($quoted, '"') === 0 ? '' : substr($quoted, 0, 1);
			$exact = $this->getBooleanQueryValueExactPhrase($inner, $prefix);
			if(strlen($exact)) $terms[] = $exact;
			$value = str_replace($quoted, ' ', $value);
		}
	}

	// any leftover (unbalanced) quotes are dropped
	$value = str_replace('"', '', $value);
	$words = $this->words($value);

	// padded with a leading space so that a stripos() match is never at offset 0,
	// which makes the truthiness checks below reliable
	$haystack = " $value ";

	// command character => its form within the regex, in the order they are checked
	$signs = array('+' => '\+', '-' => '-');

	foreach($words as $word) {
		$safe = $this->escapeAGAINST($word);
		$safePattern = preg_quote($safe);
		$term = $safe; // optional word, unless a command is found for it below
		foreach($signs as $sign => $signPattern) {
			if(stripos($haystack, "$sign$word*")) {
				$term = "$sign$safe*";
				break;
			}
			if(stripos($haystack, "$sign$word") && preg_match('!' . $signPattern . $safePattern . '\b!i', $haystack)) {
				$term = "$sign$safe";
				break;
			}
		}
		$terms[] = $term;
	}

	return implode(' ', $terms);
}
/**
 * Get array of words from given value
 *
 * Words are obtained from $sanitizer->wordsArray(). When requested, plural and/or singular
 * variations are derived using the translatable plural suffix(es):
 *
 * - With `pluralize`, a lowercase variation of each word is appended to the returned list:
 *   the word plus suffix when it does not already end with the suffix, or (only when
 *   `singularize` is also set) the word minus suffix when it does.
 * - With only `singularize`, words ending in a suffix are replaced in place by their
 *   lowercase form without the suffix.
 *
 * @param string $value
 * @param array $options
 *  - `keepNumberFormat` (bool): Passed through to $sanitizer->wordsArray() (default=false)
 *  - `singularize` (bool): Derive singular words by removing plural suffix? (default=false)
 *  - `pluralize` (bool): Add plural variations of words? (default=false)
 *  - `boolean` (bool): Prepend "<" to derived words? Not currently used (default=false)
 * @return array
 *
 */
protected function words($value, array $options = array()) {

	$defaults = array(
		'keepNumberFormat' => false,
		'singularize' => false,
		'pluralize' => false,
		'boolean' => false, // not currently used
	);

	$options = array_merge($defaults, $options);
	$words = $this->wire()->sanitizer->wordsArray($value, $options);

	// the plural suffix is only needed when one of these options is requested
	if(!$options['pluralize'] && !$options['singularize']) return $words;

	$plural = $this->_('s'); // Suffix(es) that when appended to a word makes it plural // Separate multiple with a pipe "|" or to disable specify uppercase "X"

	// uppercase "X" disables plural handling; it must be checked before lowercasing,
	// otherwise "x" would be treated as a real suffix
	if($plural === 'X') return $words;

	// skip blank suffixes, which would only add each word to the list a second time
	$plurals = array();
	foreach(explode('|', strtolower($plural)) as $suffix) {
		if(strlen($suffix)) $plurals[] = $suffix;
	}
	if(!count($plurals)) return $words;

	if($options['pluralize']) {
		// add additional pluralized or singularized words
		$addWords = array(); // keyed by word to prevent duplicate additions
		foreach($words as $word) {
			$word = strtolower($word);
			$wordLen = strlen($word);
			foreach($plurals as $suffix) {
				$suffixLen = strlen($suffix);
				$w = '';
				if($wordLen > $suffixLen && substr($word, -1 * $suffixLen) === $suffix) {
					// word already ends with suffix: optionally add singular version
					if($options['singularize']) $w = substr($word, 0, $wordLen - $suffixLen);
				} else {
					// pluralize
					$w = $word . $suffix;
				}
				// compare by length so that a derived word of "0" is not discarded
				if(!strlen($w)) continue;
				if($options['boolean']) $w = "<$w";
				$addWords[$w] = $w;
			}
		}
		// array_values() keeps the returned array a plain list, without string keys
		if(count($addWords)) $words = array_merge($words, array_values($addWords));
		return $words;
	}

	// singularize only, by replacement
	foreach($words as $key => $word) {
		$word = strtolower($word);
		$wordLen = strlen($word);
		foreach($plurals as $suffix) {
			$suffixLen = strlen($suffix);
			if($wordLen <= $suffixLen) continue;
			if(substr($word, -1 * $suffixLen) !== $suffix) continue;
			$word = substr($word, 0, $wordLen - $suffixLen);
			if($options['boolean']) $word = "<$word";
			$words[$key] = $word;
			// stop at the first matching suffix: $wordLen no longer describes $word
			break;
		}
	}

	return $words;
}
/**
 * Get unique score field name
 *
 * The name is "_score_{tableName}_{fieldName}" followed by the lowest number (starting
 * from 1) that is not yet present in self::$scoreFields. The returned name is recorded
 * in self::$scoreFields before returning.
 *
 * @return string
 * @since 3.0.160
 *
 */
protected function getScoreFieldName() {
	$base = "_score_{$this->tableName}_{$this->fieldName}";
	$n = 1;
	while(isset(self::$scoreFields[$base . $n])) $n++;
	$name = $base . $n;
	self::$scoreFields[$name] = 1;
	return $name;
}
}

View File

@@ -3624,18 +3624,19 @@ class Sanitizer extends Wire {
*/
/**
* Sanitize array or CSV string to array of strings
* Sanitize array or CSV string to array of values, optionally sanitized by given method
*
* If string specified, string delimiter may be pipe ("|"), or comma (","), unless overridden with the 'delimiter'
* or 'delimiters' option.
* If given a string, delimiter may be pipe ("|"), or comma (","), unless overridden with the `delimiter`
* or `delimiters` options.
*
* #pw-group-arrays
*
* @param array|string|mixed $value Accepts an array or CSV string. If given something else, it becomes first item in array.
* @param string $sanitizer Optional Sanitizer method to apply to items in the array (default=null, aka none).
* @param array $options Optional modifications to default behavior:
* `maxItems` (int): Maximum items allowed in array (default=0, which means no limit)
* The following options are only used if the provided $value is a string:
* - `maxItems` (int): Maximum items allowed in each array (default=0, which means no limit)
* - `maxDepth` (int): Max nested array depth (default=0, which means no nesting allowed) Since 3.0.160
* - The following options are only used if the provided $value is a string:
* - `delimiter` (string): Single delimiter to use to identify CSV strings. Overrides the 'delimiters' option when specified (default=null)
* - `delimiters` (array): Delimiters to identify CSV strings. First found delimiter will be used, default=array("|", ",")
* - `enclosure` (string): Enclosure to use for CSV strings (default=double quote, i.e. ")
@@ -3644,15 +3645,24 @@ class Sanitizer extends Wire {
*
*/
public function ___array($value, $sanitizer = null, array $options = array()) {
static $depth = 0;
$defaults = array(
'maxItems' => 0,
'maxDepth' => 0,
'delimiter' => null,
'delimiters' => array('|', ','),
'enclosure' => '"',
'maxItems' => 0,
);
$options = array_merge($defaults, $options);
if(!is_array($value)) {
if(is_null($value)) return array();
$clean = array();
if($value === null) {
return array();
} else if(!is_array($value)) {
if(is_object($value)) {
// value is object: convert to string or array
if(method_exists($value, '__toString')) {
@@ -3679,20 +3689,39 @@ class Sanitizer extends Wire {
}
if(!is_array($value)) $value = array($value);
}
if($options['maxItems']) {
if(count($value) > $options['maxItems']) $value = array_slice($value, 0, abs($options['maxItems']));
$depth++;
foreach($value as $k => $v) {
if(!is_array($v)) continue;
if($depth <= $options['maxDepth']) {
// sanitize nested array recursively
$value[$k] = $this->array($v, $sanitizer, $options);
} else {
// remove nested array
unset($value[$k]);
}
}
$clean = array();
if(!is_null($sanitizer)) {
$depth--;
if($options['maxItems'] && count($value) > $options['maxItems']) {
$value = array_slice($value, 0, abs($options['maxItems']));
}
if($sanitizer) {
if(!method_exists($this, $sanitizer) && !method_exists($this, "___$sanitizer")) {
throw new WireException("Unknown sanitizer method: $sanitizer");
}
foreach($value as $k => $v) {
$clean[$k] = $this->$sanitizer($v);
if($options['maxDepth'] > 0 && is_array($v)) {
$clean[$k] = $v; // array already sanitized by recursive call
} else {
$clean[$k] = $this->$sanitizer($v);
}
}
} else {
$clean = $value;
}
return array_values($clean);
}
@@ -3818,6 +3847,266 @@ class Sanitizer extends Wire {
return $data;
}
/**
 * Given a potentially multi-dimensional array, return a flat 1-dimensional array
 *
 * #pw-group-arrays
 *
 * @param array|mixed $value Array to flatten. If given a non-array, it is returned as the only item in a new array.
 * @param array|bool $options Array of options, or boolean to specify only the `preserveKeys` option:
 *  - `preserveKeys` (bool): Preserve associative array keys where possible? (default=false)
 *  - `maxDepth` (int): Max depth of nested arrays to flatten into value, after which they are discarded (default=0).
 *    The default value of 0 removes any nested arrays, so specify 1 or higher to include them.
 * @return array
 * @since 3.0.160
 *
 */
public function flatArray($value, $options = array()) {

	// tracks the current recursion depth across nested calls
	static $depth = 0;

	if(!is_array($value)) return array($value);

	$defaults = array(
		'preserveKeys' => is_bool($options) ? $options : false,
		'maxDepth' => 0,
	);

	$options = is_array($options) ? array_merge($defaults, $options) : $defaults;
	$preserveKeys = $options['preserveKeys'];

	// when there are no nested arrays there is nothing to flatten
	$isFlat = true;
	foreach($value as $val) {
		if(!is_array($val)) continue;
		$isFlat = false;
		break;
	}
	if($isFlat) return $preserveKeys ? $value : array_values($value);

	// Note: key collisions below are tested with array_key_exists() rather than isset(),
	// because isset() reports false for a key holding null, which would allow that
	// item to be silently overwritten.

	$flat = array();
	$depth++;

	foreach($value as $key => $val) {

		$hasStringKey = $preserveKeys && is_string($key);

		if(!is_array($val)) {
			// not an array value
			if($hasStringKey) {
				// associative key: add "-n" to the key should it already be taken
				list($n, $kk) = array(0, $key);
				while(array_key_exists($kk, $flat)) $kk = "$key-" . (++$n);
				$flat[$kk] = $val;
			} else {
				// integer key
				$flat[] = $val;
			}
			continue;
		}

		/** @var array $val At this point val is known to be an array */

		// skip over arrays when we are at the max recursion depth
		if($depth > $options['maxDepth']) continue;

		if(!$preserveKeys) {
			// if keys are not preserved then we can take a shortcut
			$flat = array_merge($flat, $this->flatArray($val, $options));
			continue;
		}

		// array value with preserved keys
		foreach($this->flatArray($val, $options) as $k => $v) {
			if(is_int($k) || ctype_digit("$k")) {
				// integer keys in nested array
				$k = (int) $k;
				if($hasStringKey) {
					// parent array is associative: use "parentKey.n", where n
					// increments until the resulting key is not already in use
					do {
						$kk = "$key.$k";
						$k++;
					} while(array_key_exists($kk, $flat) || array_key_exists($kk, $value));
					$flat[$kk] = $v;
				} else {
					// parent array is non-associative
					$flat[] = $v;
				}
			} else if(array_key_exists($k, $value) || array_key_exists($k, $flat)) {
				// associative key already exists:
				// create new key that marries parent and child keys
				$n = -1;
				do {
					$kk = $key . '.' . $k;
					// no suffix on first round, then "-0", "-1", and so on
					if($n > -1) $kk .= '-' . $n;
					$n++;
				} while(array_key_exists($kk, $value) || array_key_exists($kk, $flat));
				$flat[$kk] = $v;
			} else {
				// associative key that is not already taken
				$flat[$k] = $v;
			}
		}
	}

	$depth--;

	return $flat;
}
/**
 * Return array of all words in given value (excluding punctuation and other non-word characters)
 *
 * #pw-group-arrays
 *
 * @param string|array $value String containing words. An array is flattened and its items joined by spaces.
 * @param array $options
 *  - `keepNumberFormat` (bool): Keep minus/comma/period in numbers rather than splitting into words? (default=true)
 *  - `keepUnderscore` (bool): Keep underscores in words rather than splitting on them? (default=false)
 *  - `keepHyphen` (bool): Keep hyphenated words? (default=false)
 *  - `minWordLength` (int): Minimum word length (default=1)
 *  - `maxWordLength` (int): Maximum word length (default=80)
 *  - `maxWords` (int): Maximum number of words allowed, applied before the word length limits (default=0, no limit)
 *  - `stripTags` (bool): Strip markup tags so they don't contribute to returned word list? (default=true)
 * @return array Regular (sequentially indexed) array of words
 * @since 3.0.160
 *
 */
public function wordsArray($value, array $options = array()) {

	$defaults = array(
		'minWordLength' => 1,
		'maxWordLength' => 80,
		'maxWords' => 0,
		'keepHyphen' => false,
		'keepUnderscore' => false,
		'keepNumberFormat' => true,
		'stripTags' => true,
		'getString' => false, // not used by this method
	);

	$options = array_merge($defaults, $options);
	$minLength = (int) $options['minWordLength'];
	$maxLength = (int) $options['maxWordLength'];
	$replacements = array();
	$replacementPrefix = 'REP';
	$hasReplacements = false;

	if(is_array($value)) {
		$value = $this->flatArray($value);
		$value = implode(' ', $value);
	} else if(!is_string($value)) {
		$value = $this->string($value);
	}

	// prevents non-bracketed tag names from also becoming words
	if($options['stripTags']) $value = strip_tags($value);
	if(!strlen($value)) return array();

	if($options['keepNumberFormat']) {
		// swaps number punctuation in $value for placeholders that survive the split
		$replacements = $this->wordsArrayNumberReplacements($value, $replacementPrefix);
		$hasReplacements = count($replacements);
	}

	// https://www.php.net/manual/en/regexp.reference.unicode.php
	// pZ=Separator (line, paragraph or space)
	// pS=Symbol (all)
	// pC=Other (control, format, surrogate)
	// p{Pd}=Dash punctuation
	// pP=Punctuation (all)
	$splitWith = '.,;/\\\\*:+<>\s\pZ\pS\pC';

	if($options['keepHyphen']) {
		// allow hyphen but not en-dash or em-dash
		$splitWith .= '–—';
	} else {
		// split on all types of dash and hyphen
		$splitWith .= '\p{Pd}';
	}

	if(!$options['keepUnderscore']) $splitWith .= '_';

	// a split character along with any punctuation adjacent to it; the space appended
	// to $value gives punctuation at the very end a split character to attach to
	$regex = '!\pP*[' . $splitWith . ']\pP*!u';
	$words = preg_split($regex, "$value ", -1, PREG_SPLIT_NO_EMPTY);

	if($words === false) {
		$words = array();
	} else if($options['maxWords'] && count($words) > $options['maxWords']) {
		$words = array_slice($words, 0, $options['maxWords']);
	}

	foreach($words as $key => $word) {
		if($hasReplacements && strpos($word, $replacementPrefix) !== false) {
			// restore number punctuation before measuring, so that the length limits
			// apply to the actual word rather than to its placeholder text
			$word = str_replace(array_keys($replacements), array_values($replacements), $word);
			$words[$key] = $word;
		}
		$length = $this->multibyteSupport ? mb_strlen($word) : strlen($word);
		if($length < $minLength || $length > $maxLength) {
			unset($words[$key]);
		}
	}

	// reindex so that removed words leave no gaps in the keys
	return array_values($words);
}
/**
 * Identify decimals, minus signs and commas in numbers, replace them, and return the replacements array
 *
 * The given $value is modified directly (by reference): each decimal point, minus sign or
 * comma found next to digits is replaced by a placeholder string. The returned array maps
 * each placeholder to the character(s) it replaced, so that they can be restored later.
 * Note that a space is also inserted before the placeholder of a minus sign.
 *
 * @param string $value Value to update, passed by reference
 * @param string $prefix Text used within the generated placeholders (default='REP')
 * @return array Placeholders (keys) and the text they replaced (values), or empty array if nothing replaced
 *
 */
protected function wordsArrayNumberReplacements(&$value, $prefix = 'REP') {

	// keep floating point, negative, or thousands-separator numbers together
	$replacements = array();

	// NOTE(review): the needle and character class for the second kind of minus sign were
	// empty in the reviewed copy of this method, apparently a lost non-ASCII character.
	// Restored here as the Unicode minus sign (U+2212), written as UTF-8 bytes so that it
	// does not depend on the file encoding. Confirm against the upstream file.
	$minusSign = "\xE2\x88\x92";

	$hasPeriod = strpos($value, '.') !== false;
	$hasComma = strpos($value, ',') !== false;
	$hasHyphen = strpos($value, '-') !== false;
	$hasMinus = $hasHyphen || strpos($value, $minusSign) !== false;
	$hasNumber = ($hasPeriod || $hasComma || $hasMinus) && preg_match('!(?:[-.,]|' . $minusSign . ')\d!', $value);

	if(!$hasNumber) return array();

	if($hasPeriod && preg_match_all('!(\b|\d*)\.(\d+)\b!', $value, $matches)) {
		// keep floating point numbers together
		list($n, $decimal) = array(0, "0{$prefix}DEC0X");
		// make sure the placeholder does not already appear in the value
		while(strpos($value, $decimal) !== false && ++$n) $decimal = "{$n}{$prefix}DEC{$n}X";
		foreach($matches[1] as $key => $n1) {
			$n2 = $matches[2][$key];
			$value = str_replace("$n1.$n2", "{$n1}$decimal{$n2}", $value);
		}
		$replacements[$decimal] = '.';
	}

	// alternation (rather than a character class) keeps the multi-byte minus sign intact
	if($hasMinus && preg_match_all('!(-|' . $minusSign . ')(\d+)!', $value, $matches)) {
		// prevent negative numbers from losing their minus sign
		list($n, $minus) = array(0, "0{$prefix}MIN0");
		while(strpos($value, $minus) !== false && ++$n) $minus = "{$n}{$prefix}MIN{$n}";
		foreach($matches[2] as $key => $digits) {
			$sign = $matches[1][$key];
			// separate placeholders so that each kind of sign is restored as it was given
			$minusKey = $sign === '-' ? "{$minus}D" : "{$minus}M";
			$value = str_replace("$sign$digits", " $minusKey$digits", $value);
			$replacements[$minusKey] = $sign;
		}
	}

	if($hasComma && preg_match_all('!(\d*,)(\d+)!', $value, $matches)) {
		// keep commas that appear around digits
		list($n, $comma) = array(0, "0{$prefix}COM0");
		while(strpos($value, $comma) !== false && ++$n) $comma = "{$n}{$prefix}COM{$n}";
		foreach($matches[1] as $key => $digits1) {
			$digits1 = rtrim($digits1, ',');
			$digits2 = $matches[2][$key];
			$value = str_replace("$digits1,$digits2", "$digits1{$comma}$digits2", $value);
			$replacements[$comma] = ',';
		}
	}

	return $replacements;
}
/**
* Return $value if it exists in $allowedValues, or null if it doesn't
*