diff --git a/wire/core/Sanitizer.php b/wire/core/Sanitizer.php index 6dfa1f1c..0ab1c3a9 100644 --- a/wire/core/Sanitizer.php +++ b/wire/core/Sanitizer.php @@ -63,6 +63,12 @@ class Sanitizer extends Wire { */ protected $allowedASCII = array(); + /** + * @var null|WireTextTools + * + */ + protected $textTools = null; + /** * Construct the sanitizer * @@ -1809,9 +1815,70 @@ class Sanitizer extends Wire { * @return string String without newlines * */ - function removeNewlines($str, $replacement = ' ') { + public function removeNewlines($str, $replacement = ' ') { return str_replace(array("\r\n", "\r", "\n"), $replacement, $str); } + + /** + * Truncate string to given maximum length without breaking words + * + * This method can truncate between words, sentences, punctuation or blocks (like paragraphs). + * See the `type` option for details on how it should truncate. By default it truncates between + * words. Description of types: + * + * - word: truncate to closest word. + * - punctuation: truncate to closest punctuation within sentence. + * - sentence: truncate to closest sentence. + * - block: truncate to closest block of text (like a paragraph or headline). + * + * Note that if your specified `type` is something other than “word”, and it cannot be matched + * within the maxLength, then it will attempt a different type. For instance, if you specify + * “sentence” as the type, and it cannot match a sentence, it will try to match to “punctuation” + * instead. If it cannot match that, then it will attempt “word”. + * + * HTML will be stripped from returned string. If you want to keep some tags use the `keepTags` or `keepFormatTags` + * options to specify what tags are allowed to remain. The `keepFormatTags` option that, when true, will make it + * retain all HTML inline text formatting tags. + * + * ~~~~~~~ + * // Truncate string to closest word within 150 characters + * $s = $sanitizer->truncate($str, 150); + * + * // Truncate string to closest sentence within 300 characters + * $s = $sanitizer->truncate($str, 300, 'sentence'); + * + * // Truncate with options + * $s = $sanitizer->truncate($str, [ + * 'type' => 'punctuation', + * 'maxLength' => 300, + * 'visible' => true, + * 'more' => '…' + * ]); + * ~~~~~~~ + * + * @param string $str String to truncate + * @param int|array $maxLength Maximum length of returned string, or specify $options array here. + * @param array|string $options Options array, or specify `type` option (string). + * - `type` (string): Preferred truncation type of word, punctuation, sentence, or block. (default='word') + * This is a “preferred type”, not an absolute one, because it will adjust to match what it can within your maxLength. + * - `maxLength` (int): Max characters for truncation, used only if $options array substituted for $maxLength argument. + * - `maximize` (bool): Include as much as possible within specified type and max-length? (default=true) + * If you specify false for the maximize option, it will truncate to first word, puncutation, sentence or block. + * - `visible` (bool): When true, invisible text (markup, entities, etc.) does not count towards string length. (default=false) + * - `trim` (string): Characters to trim from returned string. (default=',;/ ') + * - `noTrim` (string): Never trim these from end of returned string. (default=')]>}”»') + * - `more` (string): Append this to truncated strings that do not end with sentence punctuation. (default='…') + * - `keepTags` (array): HTML tags that should be kept in returned string. (default=[]) + * - `keepFormatTags` (bool): Keep HTML text-formatting tags? Simpler alternative to keepTags option. (default=false) + * - `collapseLinesWith` (string): String to collapse lines with where the first is not punctuated. (default=' … ') + * - `convertEntities` (bool): Convert HTML entities to non-entity characters? (default=false) + * - `noEndSentence` (string): Strings that sentence may not end with, space-separated values (default='Mr. Mrs. …') + * @return string + * + */ + function truncate($str, $maxLength = 300, $options = array()) { + return $this->getTextTools()->truncate($str, $maxLength, $options); + } /** * Removes 4-byte UTF-8 characters (like emoji) that produce error with with MySQL regular “UTF8” encoding @@ -2447,6 +2514,20 @@ class Sanitizer extends Wire { return $results; } + /** + * Get instance of WireTextTools + * + * @return WireTextTools + * + */ + public function getTextTools() { + if(!$this->textTools) { + $this->textTools = new WireTextTools(); + $this->wire($this->textTools); + } + return $this->textTools; + } + /********************************************************************************************************************** * FILE VALIDATORS * diff --git a/wire/core/WireTextTools.php b/wire/core/WireTextTools.php new file mode 100644 index 00000000..330c137e --- /dev/null +++ b/wire/core/WireTextTools.php @@ -0,0 +1,568 @@ +` elements. (default='• ') + * - `replacements` (array): Associative array of strings to manually replace. (default=[' ' => ' ']) + * @return string + * + */ + public function markupToText($str, array $options = array()) { + + $defaults = array( + 'keepTags' => array(), + 'splitBlocks' => "\n\n", + 'convertEntities' => true, + 'listItemPrefix' => '• ', + 'replacements' => array( + ' ' => ' ' + ), + ); + + $options = array_merge($defaults, $options); + + if(strpos($str, '>') !== false) { + + // ensure tags are separated by whitespace + $str = str_replace('><', '> <', $str); + + // normalize newlines + if(strpos($str, "\r") !== false) { + $str = str_replace(array("\r\n", "\r"), "\n", $str); + } + + // normalize tabs to spaces + if(strpos($str, "\t") !== false) { + $str = str_replace("\t", " ", $str); + } + + // ensure paragraphs and headers are followed by two newlines + if(stripos($str, '

') || stripos($str, ')!i', '$1' . $options['splitBlocks'], $str); + } + + // ensure list items are on their own line and prefixed with a bullet + if(stripos($str, ']*>!i', "\n
  • $prefix", $str); + } + + // convert
    tags to be just a single newline + if(stripos($str, '', '
    ', '
    '), "
    \n", $str); + while(stripos($str, "\n
    ") !== false) $str = str_replace("\n
    ", "
    ", $str); + while(stripos($str, "
    \n\n") !== false) $str = str_replace("
    \n\n", "
    \n", $str); + } + } + + // normalize newlines and whitespace around newlines + while(strpos($str, " \n") !== false) $str = str_replace(" \n", "\n", $str); + while(strpos($str, "\n ") !== false) $str = str_replace("\n ", "\n", $str); + while(strpos($str, "\n\n\n") !== false) $str = str_replace("\n\n\n", "\n\n", $str); + + // strip tags + if(count($options['keepTags'])) { + // some tags will be allowed to remain + $keepTags = ''; + foreach($options['keepTags'] as $tag) { + $keepTags .= "<" . trim($tag, "<>") . ">"; + } + $str = strip_tags($str, $keepTags); + + } else { + // not allowing any tags + $str = strip_tags($str); + // if any possible tag characters remain, drop them now + $str = str_replace(array('<', '>'), ' ', $str); + } + + // apply any other replacements + foreach($options['replacements'] as $find => $replace) { + $str = str_ireplace($find, $replace, $str); + } + + // convert entities to plain text equivalents + if($options['convertEntities'] && strpos($str, '&') !== false) { + $str = $this->unentities($str); + } + + return trim($str); + } + + /** + * Remove (or close) unclosed HTML tags from given string + * + * Remove unclosed tags: + * --------------------- + * At present, if it finds an unclosed tag, it removes all tags of the same kind. + * This is in order to keep the function fast, by delegating what it can to strip_tags(). + * This is sufficient for our internal use here, but may not be ideal for all situations. + * + * Fix/close unclosed tags: + * ------------------------ + * When the remove option is false, it will attempt to close unclosed tags rather than + * remove them. It doesn't know exactly where they should be closed, so it appends the + * close tags to the end of the string. + * + * @param string $str + * @param bool $remove Remove unclosed tags? If false, it will attempt to close them instead. (default=true) + * @param array $options + * - `ignoreTags` (array): Tags that can be ignored because they close themselves. (default=per HTML spec) + * @return string + * + */ + public function fixUnclosedTags($str, $remove = true, $options = array()) { + + $defaults = array( + 'ignoreTags' => array( + 'area','base','br','col','command','embed','hr','img','input', + 'keygen','link','menuitem','meta','param','source','track','wbr', + ), + ); + + if(isset($options['ignoreTags'])) { + // merge user specified ignoreTags with our defaults so that both are used + $options['ignoreTags'] = array_merge($defaults['ignoreTags'], $options['ignoreTags']); + } + + $options = array_merge($defaults, $options); + $tags = array(); + $unclosed = array(); + + $n1 = substr_count($str, '>'); + $n2 = substr_count($str, '" is equal to double the quantity of "'); + if($n1 > $n2) { + // string might end with a partial tag, i.e. "|\s*/>|\s[^>]+>)!i', $str, $matches)) return $str; + + foreach($matches[1] as $key => $tag) { + if(strpos($matches[2][$key], '/>') !== false) continue; // ignore self closing tags + if(in_array(strtolower($tag), $options['ignoreTags'])) continue; + $tags[$tag] = $tag; + } + + // count appearances of found tags + foreach($tags as $tag) { + // count number of open tags of this type + $openQty = substr_count($str, "<$tag>") + substr_count($str, "<$tag "); + // count number of closing tags of this type + $closeQty = substr_count($str, ""); + // if quantities do not match, mark tag for deletion + if($openQty !== $closeQty) { + unset($tags[$tag]); + $unclosed[] = $tag; + } + } + + + if(count($unclosed)) { + if($remove) { + // strip all tags except those where open/close quantity matched + $keepTags = count($tags) ? '<' . implode('><', $tags) . '>' : ''; + $str = strip_tags($str, $keepTags); + } else { + foreach($unclosed as $tag) { + $str .= ""; + } + } + } + + return $str; + } + + /** + * Collapse string to plain text that all exists on a single long line without destroying words/punctuation. + * + * @param string $str String to collapse + * @param array $options + * - `stripTags` (bool): Strip markup tags? (default=true) + * - `keepTags` (array): Array of tag names to keep, if stripTags==true. (default=[]) + * - `collapseLinesWith` (string): String to collapse newlines with. (default=' ') + * - `endBlocksWith` (string): Character or string to insert to identify paragraph/header separation (default='') + * - `convertEntities` (bool): Convert entity-encoded characters to text? (default=true) + * @return mixed|string + * + */ + public function collapse($str, array $options = array()) { + + $defaults = array( + 'stripTags' => true, + 'keepTags' => array(), + 'collapseLinesWith' => ' ', + 'endBlocksWith' => '', + 'convertEntities' => true, + ); + + $options = array_merge($defaults, $options); + + if($options['stripTags']) { + $str = $this->markupToText($str, array( + 'convertEntities' => $options['convertEntities'], + 'keepTags' => $options['keepTags'], + )); + if(!strlen($str)) return $str; + } + + // character that we collapse lines with + $r = $options['collapseLinesWith']; + + // convert any tabs to space + if(strpos($str, "\t") !== false) { + $str = str_replace("\t", " ", $str); + } + + // convert CRs to LFs + if(strpos($str, "\r") !== false) { + $str = str_replace(array("\r\n", "\r"), "\n", $str); + } + + // collapse whitespace that appears before or after newlines + while(strpos($str, " \n") !== false) $str = str_replace(" \n", "\n", $str); + while(strpos($str, "\n ") !== false) $str = str_replace("\n ", "\n", $str); + + // convert redundant LFs to no more than double LFs + while(strpos($str, "\n\n\n") !== false) { + $str = str_replace("\n\n\n", "\n\n", $str); + } + + // add character to indicate blocks, when asked for + if(!empty($options['endBlocksWith'])) { + $str = str_replace("\n\n", "$options[endBlocksWith]\n\n", $str); + } + + // replace all types of newlines + $str = str_replace(array("\r\n", "\r", "\n\n", "\n"), $r, $str); + + // while there are consecutives of our collapse string, reduce them to one + while(strpos($str, "$r$r") !== false) { + $str = str_replace("$r$r", $r, $str); + } + + if($r !== $defaults['collapseLinesWith']) { + // replacement of whitespace with something other than another single whitespace + // so collapse consecutive spaces to one space, since this would not be already done + while(strpos($str, " ") !== false) { + $str = str_replace(" ", " ", $str); + } + // use space rather than replacement char when left side already ends with punctuation + foreach($this->getPunctuationChars() as $c) { + if(strpos($str, "$c$r")) $str = str_replace("$c$r", "$c ", $str); + } + } + + return trim($str); + } + + /** + * Truncate string to given maximum length without breaking words + * + * This method can truncate between words, sentences, punctuation or blocks (like paragraphs). + * See the `type` option for details on how it should truncate. By default it truncates between + * words. Description of types: + * + * - word: truncate to closest word. + * - punctuation: truncate to closest punctuation within sentence. + * - sentence: truncate to closest sentence. + * - block: truncate to closest block of text (like a paragraph or headline). + * + * Note that if your specified `type` is something other than “word”, and it cannot be matched + * within the maxLength, then it will attempt a different type. For instance, if you specify + * “sentence” as the type, and it cannot match a sentence, it will try to match to “punctuation” + * instead. If it cannot match that, then it will attempt “word”. + * + * HTML will be stripped from returned string. If you want to keep some tags use the `keepTags` or `keepFormatTags` + * options to specify what tags are allowed to remain. The `keepFormatTags` option that, when true, will make it + * retain all HTML inline text formatting tags. + * + * ~~~~~~~ + * // Truncate string to closest word within 150 characters + * $s = $sanitizer->truncate($str, 150); + * + * // Truncate string to closest sentence within 300 characters + * $s = $sanitizer->truncate($str, 300, 'sentence'); + * + * // Truncate with options + * $s = $sanitizer->truncate($str, [ + * 'type' => 'punctuation', + * 'maxLength' => 300, + * 'visible' => true, + * 'more' => '…' + * ]); + * ~~~~~~~ + * + * @param string $str String to truncate + * @param int|array $maxLength Maximum length of returned string, or specify $options array here. + * @param array|string $options Options array, or specify `type` option (string). + * - `type` (string): Preferred truncation type of word, punctuation, sentence, or block. (default='word') + * This is a “preferred type”, not an absolute one, because it will adjust to match what it can within your maxLength. + * - `maxLength` (int): Max characters for truncation, used only if $options array substituted for $maxLength argument. + * - `maximize` (bool): Include as much as possible within specified type and max-length? (default=true) + * If you specify false for the maximize option, it will truncate to first word, puncutation, sentence or block. + * - `visible` (bool): When true, invisible text (markup, entities, etc.) does not count towards string length. (default=false) + * - `trim` (string): Characters to trim from returned string. (default=',;/ ') + * - `noTrim` (string): Never trim these from end of returned string. (default=')]>}”»') + * - `more` (string): Append this to truncated strings that do not end with sentence punctuation. (default='…') + * - `keepTags` (array): HTML tags that should be kept in returned string. (default=[]) + * - `keepFormatTags` (bool): Keep HTML text-formatting tags? Simpler alternative to keepTags option. (default=false) + * - `collapseLinesWith` (string): String to collapse lines with where the first is not punctuated. (default=' … ') + * - `convertEntities` (bool): Convert HTML entities to non-entity characters? (default=false) + * - `noEndSentence` (string): Strings that sentence may not end with, space-separated values (default='Mr. Mrs. …') + * @return string + * + */ + function truncate($str, $maxLength, $options = array()) { + + $defaults = array( + 'type' => 'word', // word, punctuation, sentence, or block + 'maximize' => true, // include as much as possible within the type and maxLength (false=include as little as possible) + 'visible' => false, // when true, invisible text (markup, entities, etc.) does not count towards string length. (default=false) + 'trim' => $this->_(',;/') . ' ', // Trim these characters from the end of the returned string + 'noTrim' => $this->_(')]>}”»'), // Never trim these characters from end of returned string + 'more' => '…', // Append to truncated strings that do not end with sentence punctuation + 'stripTags' => true, // strip HTML tags? (currently required, see keepTags to keep some) + 'keepTags' => array(), // if strip HTML tags is true, optional array of tag names you want to keep + 'keepFormatTags' => false, // alternative to keepTags: keep just inline text format tags like strong, em, etc. + 'collapseWhitespace' => true, // collapsed whitespace (currently required) + 'collapseLinesWith' => ' ' . $this->_('…') . ' ', // String placed between joined lines (like from paragraphs) + 'convertEntities' => false, // convert entity encoded characters to non-entity equivalents? (default=false) + 'noEndSentence' => $this->_('Mr. Mrs. Ms. Dr. Hon. PhD. i.e. e.g.'), // When in sentence type, words that do not end the sentence (space-separated) + ); + + if(!strlen($str)) return ''; + + if(is_string($options) && ctype_alpha($options)) { + $defaults['type'] = $options; + $options = array(); + } + + if(is_array($maxLength)) { + $options = $maxLength; + if(!isset($options['maxLength'])) $options['maxLength'] = 0; + $maxLength = $options['maxLength']; + } else if(is_string($maxLength) && ctype_alpha($maxLength)) { + $options['type'] = $maxLength; + $maxLength = isset($options['maxLength']) ? $options['maxLength'] : mb_strlen($str); + } + + if(!$maxLength) $maxLength = 255; + $options = array_merge($defaults, $options); + $type = $options['type']; + $str = trim($str); + $blockEndChar = '¶'; + $tests = array(); + $punctuationChars = $this->getPunctuationChars(); + $endSentenceChars = $this->getPunctuationChars(true); + $noEndSentenceWords = explode(' ', $options['noEndSentence']); + + if($options['keepFormatTags']) { + $options['keepTags'] = array_merge($options['keepTags'], array( + 'abbr','acronym','b','big','cite','code','em','i','kbd', 'q','samp','small','span','strong','sub','sup','time','var', + )); + } + + if($type === 'block') { + if(mb_strpos($str, $blockEndChar) !== false) $str = str_replace($blockEndChar, ' ', $str); + $options['endBlocksWith'] = $blockEndChar; + } + + // collapse whitespace and strip tags + $str = $this->collapse($str, $options); + + if(trim($options['collapseLinesWith']) && mb_strpos($str, $options['collapseLinesWith'])) { + // if lines are collapsed with something other than whitespace, avoid using that string + // when the line already ends with sentence punctuation + foreach($endSentenceChars as $c) { + $str = str_replace("$c$options[collapseLinesWith]", "$c ", $str); + } + } + + // if anything above reduced the length of the string enough, return it now + if(mb_strlen($str) <= $maxLength) return $str; + + // get string at maximum possible length + if($options['visible']) { + // adjust for only visible length + $_str = $str; + $str = mb_substr($str, 0, $maxLength); + $len = $this->getVisibleLength($str); + if($len < $maxLength) { + $maxLength += ($maxLength - $len); + $str = mb_substr($_str, 0, $maxLength); + } + unset($_str); + } else { + $str = mb_substr($str, 0, $maxLength); + } + + // match to closest blocks, like paragraph(s) + if($type === 'block') { + $pos = $options['maximize'] ? mb_strrpos($str, $blockEndChar) : mb_strpos($str, $blockEndChar); + if($pos === false) { + $type = 'word'; + } else { + $tests[] = $pos; + $options['trim'] .= $blockEndChar; + } + } + + // find sentences closest to end + if($type === 'sentence') { + foreach($endSentenceChars as $find) { + $pos = $options['maximize'] ? mb_strrpos($str, "$find ") : mb_strpos($str, "$find "); + if($pos) $tests[] = $pos; + } + if(!count($tests)) $type = 'punctuation'; + } + + // find punctuation closes to end of string + if($type === 'punctuation') { + foreach($punctuationChars as $find) { + $pos = $options['maximize'] ? mb_strrpos($str, $find) : mb_strpos($str, $find); + if($pos) $tests[] = $pos; + } + if(!count($tests)) $type = 'word'; + } + + // find whitespace and last word closest to end of string + if($type === 'word' || !count($tests)) { + $pos = $options['maximize'] ? mb_strrpos($str, ' ') : mb_strpos($str, ' '); + if($pos) $tests[] = $pos; + } + + // if we didn't find any place to truncate, just return exact truncated string + if(!count($tests)) { + return trim($str, $options['trim']) . $options['more']; + } + + // we found somewhere to truncate, so truncate at the longest one possible + if($options['maximize']) { + sort($tests); + } else { + rsort($tests); + } + + // process our tests + do { + $pos = array_pop($tests); + $result = trim(mb_substr($str, 0, $pos + 1)); + $lastChar = mb_substr($result, -1); + $result = rtrim($result, $options['trim']); + + if($type === 'sentence') { + $pos = strrpos($result, ' '); + if(!$pos) break; + // if sentence type, make sure it doesn't end with a disallowed word + $lastWord = mb_substr($result, $pos + 1); + while(!ctype_alnum(mb_substr($lastWord, 0, 1)) && strlen($lastWord)) { + $lastWord = mb_substr($lastWord, 1); + } + foreach($noEndSentenceWords as $word) { + if($word !== $lastWord) continue; + $tests[] = $pos; + $type = 'word'; + $result = ''; + break; + } + } else if($type === 'block') { + // good to go with result as is + } else { + if(in_array($lastChar, $endSentenceChars)) { + // great, end with sentence ending punctuation + } else if(in_array($lastChar, $punctuationChars)) { + $trims = ' '; + foreach($punctuationChars as $c) { + if(mb_strpos($options['noTrim'], $c) !== false) continue; + if(in_array($c, $endSentenceChars)) continue; + $trims .= $c; + } + $result = rtrim($result, $trims) . $options['more']; + } else { + $result .= $options['more']; + } + } + + } while(!strlen($result) && count($tests)); + + // make sure we didn't break any HTML tags as a result of truncation + if(strlen($result) && count($options['keepTags']) && strpos($result, '<') !== false) { + $result = $this->fixUnclosedTags($result); + } + + return $result; + } + + /** + * Return visible length of string, which is length not counting markup or entities + * + * @param string $str + * @return int + * + */ + public function getVisibleLength($str) { + if(strpos($str, '>')) { + $str = strip_tags($str); + } + if(strpos($str, '&') !== false && strpos($str, ';')) { + $str = html_entity_decode($str, ENT_QUOTES, 'UTF-8'); + } + return mb_strlen($str); + } + + /** + * Get array of punctuation characters + * + * @param bool $sentence Get only sentence-ending punctuation + * @return array|string + * + */ + public function getPunctuationChars($sentence = false) { + if($sentence) { + $s = $this->_('. ? !'); // Sentence ending punctuation characters (must be space-separated) + } else { + $s = $this->_(', : . ? ! “ ” „ " – -- ( ) [ ] { } « »'); // All punctuation characters (must be space-separated) + } + return explode(' ', $s); + } + +}