1
0
mirror of https://github.com/processwire/processwire.git synced 2025-08-13 18:24:57 +02:00

Add $sanitizer->truncate() function per processwire/processwire-requests#163

This commit is contained in:
Ryan Cramer
2018-05-04 15:32:54 -04:00
parent 589d745caa
commit 8d38ad7c44
2 changed files with 650 additions and 1 deletions

View File

@@ -63,6 +63,12 @@ class Sanitizer extends Wire {
*/
protected $allowedASCII = array();
/**
* @var null|WireTextTools
*
*/
protected $textTools = null;
/**
* Construct the sanitizer
*
@@ -1809,9 +1815,70 @@ class Sanitizer extends Wire {
* @return string String without newlines
*
*/
function removeNewlines($str, $replacement = ' ') {
public function removeNewlines($str, $replacement = ' ') {
return str_replace(array("\r\n", "\r", "\n"), $replacement, $str);
}
/**
* Truncate string to given maximum length without breaking words
*
* This method can truncate between words, sentences, punctuation or blocks (like paragraphs).
* See the `type` option for details on how it should truncate. By default it truncates between
* words. Description of types:
*
* - word: truncate to closest word.
* - punctuation: truncate to closest punctuation within sentence.
* - sentence: truncate to closest sentence.
* - block: truncate to closest block of text (like a paragraph or headline).
*
* Note that if your specified `type` is something other than “word”, and it cannot be matched
* within the maxLength, then it will attempt a different type. For instance, if you specify
* “sentence” as the type, and it cannot match a sentence, it will try to match to “punctuation”
* instead. If it cannot match that, then it will attempt “word”.
*
* HTML will be stripped from returned string. If you want to keep some tags use the `keepTags` or `keepFormatTags`
* options to specify what tags are allowed to remain. The `keepFormatTags` option that, when true, will make it
* retain all HTML inline text formatting tags.
*
* ~~~~~~~
* // Truncate string to closest word within 150 characters
* $s = $sanitizer->truncate($str, 150);
*
* // Truncate string to closest sentence within 300 characters
* $s = $sanitizer->truncate($str, 300, 'sentence');
*
* // Truncate with options
* $s = $sanitizer->truncate($str, [
* 'type' => 'punctuation',
* 'maxLength' => 300,
* 'visible' => true,
* 'more' => '…'
* ]);
* ~~~~~~~
*
* @param string $str String to truncate
* @param int|array $maxLength Maximum length of returned string, or specify $options array here.
* @param array|string $options Options array, or specify `type` option (string).
* - `type` (string): Preferred truncation type of word, punctuation, sentence, or block. (default='word')
* This is a “preferred type”, not an absolute one, because it will adjust to match what it can within your maxLength.
* - `maxLength` (int): Max characters for truncation, used only if $options array substituted for $maxLength argument.
* - `maximize` (bool): Include as much as possible within specified type and max-length? (default=true)
* If you specify false for the maximize option, it will truncate to first word, puncutation, sentence or block.
* - `visible` (bool): When true, invisible text (markup, entities, etc.) does not count towards string length. (default=false)
* - `trim` (string): Characters to trim from returned string. (default=',;/ ')
* - `noTrim` (string): Never trim these from end of returned string. (default=')]>}”»')
* - `more` (string): Append this to truncated strings that do not end with sentence punctuation. (default='…')
* - `keepTags` (array): HTML tags that should be kept in returned string. (default=[])
* - `keepFormatTags` (bool): Keep HTML text-formatting tags? Simpler alternative to keepTags option. (default=false)
* - `collapseLinesWith` (string): String to collapse lines with where the first is not punctuated. (default=' … ')
* - `convertEntities` (bool): Convert HTML entities to non-entity characters? (default=false)
* - `noEndSentence` (string): Strings that sentence may not end with, space-separated values (default='Mr. Mrs. …')
* @return string
*
*/
function truncate($str, $maxLength = 300, $options = array()) {
return $this->getTextTools()->truncate($str, $maxLength, $options);
}
/**
* Removes 4-byte UTF-8 characters (like emoji) that produce error with with MySQL regular “UTF8” encoding
@@ -2447,6 +2514,20 @@ class Sanitizer extends Wire {
return $results;
}
/**
* Get instance of WireTextTools
*
* @return WireTextTools
*
*/
public function getTextTools() {
if(!$this->textTools) {
$this->textTools = new WireTextTools();
$this->wire($this->textTools);
}
return $this->textTools;
}
/**********************************************************************************************************************
* FILE VALIDATORS
*

568
wire/core/WireTextTools.php Normal file
View File

@@ -0,0 +1,568 @@
<?php namespace ProcessWire;
/**
* ProcessWire Text Tools
*
* #pw-summary Specific text and markup tools for ProcessWire $sanitizer and elsewhere.
*
* ProcessWire 3.x, Copyright 2018 by Ryan Cramer
* https://processwire.com
*
*
*/
class WireTextTools extends Wire {
/**
* Convert HTML markup to readable text
*
* Like PHPs strip_tags but with some small improvements in HTML-to-text conversion that
* improves the readability of the text.
*
* #pw-internal
*
* @param string $str String to convert to text
* @param array $options
* - `keepTags` (array): Tag names to keep in returned value, i.e. [ "em", "strong" ]. (default=none)
* - `splitBlocks` (string): String to split paragraph and header elements. (default="\n\n")
* - `convertEntities` (bool): Convert HTML entities to plain text equivalents? (default=true)
* - `listItemPrefix` (string): Prefix for converted list item `<li>` elements. (default='• ')
* - `replacements` (array): Associative array of strings to manually replace. (default=['&nbsp;' => ' '])
* @return string
*
*/
public function markupToText($str, array $options = array()) {
$defaults = array(
'keepTags' => array(),
'splitBlocks' => "\n\n",
'convertEntities' => true,
'listItemPrefix' => '• ',
'replacements' => array(
'&nbsp;' => ' '
),
);
$options = array_merge($defaults, $options);
if(strpos($str, '>') !== false) {
// ensure tags are separated by whitespace
$str = str_replace('><', '> <', $str);
// normalize newlines
if(strpos($str, "\r") !== false) {
$str = str_replace(array("\r\n", "\r"), "\n", $str);
}
// normalize tabs to spaces
if(strpos($str, "\t") !== false) {
$str = str_replace("\t", " ", $str);
}
// ensure paragraphs and headers are followed by two newlines
if(stripos($str, '</p>') || stripos($str, '</h')) {
$str = preg_replace('!(</(?:p|h\d)>)!i', '$1' . $options['splitBlocks'], $str);
}
// ensure list items are on their own line and prefixed with a bullet
if(stripos($str, '<li') !== false) {
$prefix = in_array('li', $options['keepTags']) ? '' : $options['listItemPrefix'];
$str = preg_replace('![\s\r\n]+<li[^>]*>!i', "\n<li>$prefix", $str);
}
// convert <br> tags to be just a single newline
if(stripos($str, '<br') !== false) {
$str = str_replace(array('<br>', '<br/>', '<br />'), "<br>\n", $str);
while(stripos($str, "\n<br>") !== false) $str = str_replace("\n<br>", "<br>", $str);
while(stripos($str, "<br>\n\n") !== false) $str = str_replace("<br>\n\n", "<br>\n", $str);
}
}
// normalize newlines and whitespace around newlines
while(strpos($str, " \n") !== false) $str = str_replace(" \n", "\n", $str);
while(strpos($str, "\n ") !== false) $str = str_replace("\n ", "\n", $str);
while(strpos($str, "\n\n\n") !== false) $str = str_replace("\n\n\n", "\n\n", $str);
// strip tags
if(count($options['keepTags'])) {
// some tags will be allowed to remain
$keepTags = '';
foreach($options['keepTags'] as $tag) {
$keepTags .= "<" . trim($tag, "<>") . ">";
}
$str = strip_tags($str, $keepTags);
} else {
// not allowing any tags
$str = strip_tags($str);
// if any possible tag characters remain, drop them now
$str = str_replace(array('<', '>'), ' ', $str);
}
// apply any other replacements
foreach($options['replacements'] as $find => $replace) {
$str = str_ireplace($find, $replace, $str);
}
// convert entities to plain text equivalents
if($options['convertEntities'] && strpos($str, '&') !== false) {
$str = $this->unentities($str);
}
return trim($str);
}
/**
* Remove (or close) unclosed HTML tags from given string
*
* Remove unclosed tags:
* ---------------------
* At present, if it finds an unclosed tag, it removes all tags of the same kind.
* This is in order to keep the function fast, by delegating what it can to strip_tags().
* This is sufficient for our internal use here, but may not be ideal for all situations.
*
* Fix/close unclosed tags:
* ------------------------
* When the remove option is false, it will attempt to close unclosed tags rather than
* remove them. It doesn't know exactly where they should be closed, so it appends the
* close tags to the end of the string.
*
* @param string $str
* @param bool $remove Remove unclosed tags? If false, it will attempt to close them instead. (default=true)
* @param array $options
* - `ignoreTags` (array): Tags that can be ignored because they close themselves. (default=per HTML spec)
* @return string
*
*/
public function fixUnclosedTags($str, $remove = true, $options = array()) {
$defaults = array(
'ignoreTags' => array(
'area','base','br','col','command','embed','hr','img','input',
'keygen','link','menuitem','meta','param','source','track','wbr',
),
);
if(isset($options['ignoreTags'])) {
// merge user specified ignoreTags with our defaults so that both are used
$options['ignoreTags'] = array_merge($defaults['ignoreTags'], $options['ignoreTags']);
}
$options = array_merge($defaults, $options);
$tags = array();
$unclosed = array();
$n1 = substr_count($str, '>');
$n2 = substr_count($str, '</');
if($n1) $n1 = $n1 / 2;
// if the quantity of ">" is equal to double the quantity of "</" then early exit
if($n1 === $n2) return $str;
// now check for string possibly ending with a partial tag, and remove if present
$n1 = strrpos($str, '<');
$n2 = strrpos($str, '>');
if($n1 > $n2) {
// string might end with a partial tag, i.e. "<span"
$test = substr($str, $n1 + 1, 1); // i.e. "s" from "<span", or "<" is last char in the string
if(ctype_alpha($test) || $test === false || $test === '') {
// going to assume this is a tag, so trucate
$str = substr($str, 0, $n1 - 1);
}
}
// find all open tags
if(!preg_match_all('!<([a-z]+[a-z0-9]*)(>|\s*/>|\s[^>]+>)!i', $str, $matches)) return $str;
foreach($matches[1] as $key => $tag) {
if(strpos($matches[2][$key], '/>') !== false) continue; // ignore self closing tags
if(in_array(strtolower($tag), $options['ignoreTags'])) continue;
$tags[$tag] = $tag;
}
// count appearances of found tags
foreach($tags as $tag) {
// count number of open tags of this type
$openQty = substr_count($str, "<$tag>") + substr_count($str, "<$tag ");
// count number of closing tags of this type
$closeQty = substr_count($str, "</$tag>");
// if quantities do not match, mark tag for deletion
if($openQty !== $closeQty) {
unset($tags[$tag]);
$unclosed[] = $tag;
}
}
if(count($unclosed)) {
if($remove) {
// strip all tags except those where open/close quantity matched
$keepTags = count($tags) ? '<' . implode('><', $tags) . '>' : '';
$str = strip_tags($str, $keepTags);
} else {
foreach($unclosed as $tag) {
$str .= "</$tag>";
}
}
}
return $str;
}
/**
* Collapse string to plain text that all exists on a single long line without destroying words/punctuation.
*
* @param string $str String to collapse
* @param array $options
* - `stripTags` (bool): Strip markup tags? (default=true)
* - `keepTags` (array): Array of tag names to keep, if stripTags==true. (default=[])
* - `collapseLinesWith` (string): String to collapse newlines with. (default=' ')
* - `endBlocksWith` (string): Character or string to insert to identify paragraph/header separation (default='')
* - `convertEntities` (bool): Convert entity-encoded characters to text? (default=true)
* @return mixed|string
*
*/
public function collapse($str, array $options = array()) {
$defaults = array(
'stripTags' => true,
'keepTags' => array(),
'collapseLinesWith' => ' ',
'endBlocksWith' => '',
'convertEntities' => true,
);
$options = array_merge($defaults, $options);
if($options['stripTags']) {
$str = $this->markupToText($str, array(
'convertEntities' => $options['convertEntities'],
'keepTags' => $options['keepTags'],
));
if(!strlen($str)) return $str;
}
// character that we collapse lines with
$r = $options['collapseLinesWith'];
// convert any tabs to space
if(strpos($str, "\t") !== false) {
$str = str_replace("\t", " ", $str);
}
// convert CRs to LFs
if(strpos($str, "\r") !== false) {
$str = str_replace(array("\r\n", "\r"), "\n", $str);
}
// collapse whitespace that appears before or after newlines
while(strpos($str, " \n") !== false) $str = str_replace(" \n", "\n", $str);
while(strpos($str, "\n ") !== false) $str = str_replace("\n ", "\n", $str);
// convert redundant LFs to no more than double LFs
while(strpos($str, "\n\n\n") !== false) {
$str = str_replace("\n\n\n", "\n\n", $str);
}
// add character to indicate blocks, when asked for
if(!empty($options['endBlocksWith'])) {
$str = str_replace("\n\n", "$options[endBlocksWith]\n\n", $str);
}
// replace all types of newlines
$str = str_replace(array("\r\n", "\r", "\n\n", "\n"), $r, $str);
// while there are consecutives of our collapse string, reduce them to one
while(strpos($str, "$r$r") !== false) {
$str = str_replace("$r$r", $r, $str);
}
if($r !== $defaults['collapseLinesWith']) {
// replacement of whitespace with something other than another single whitespace
// so collapse consecutive spaces to one space, since this would not be already done
while(strpos($str, " ") !== false) {
$str = str_replace(" ", " ", $str);
}
// use space rather than replacement char when left side already ends with punctuation
foreach($this->getPunctuationChars() as $c) {
if(strpos($str, "$c$r")) $str = str_replace("$c$r", "$c ", $str);
}
}
return trim($str);
}
/**
* Truncate string to given maximum length without breaking words
*
* This method can truncate between words, sentences, punctuation or blocks (like paragraphs).
* See the `type` option for details on how it should truncate. By default it truncates between
* words. Description of types:
*
* - word: truncate to closest word.
* - punctuation: truncate to closest punctuation within sentence.
* - sentence: truncate to closest sentence.
* - block: truncate to closest block of text (like a paragraph or headline).
*
* Note that if your specified `type` is something other than “word”, and it cannot be matched
* within the maxLength, then it will attempt a different type. For instance, if you specify
* “sentence” as the type, and it cannot match a sentence, it will try to match to “punctuation”
* instead. If it cannot match that, then it will attempt “word”.
*
* HTML will be stripped from returned string. If you want to keep some tags use the `keepTags` or `keepFormatTags`
* options to specify what tags are allowed to remain. The `keepFormatTags` option that, when true, will make it
* retain all HTML inline text formatting tags.
*
* ~~~~~~~
* // Truncate string to closest word within 150 characters
* $s = $sanitizer->truncate($str, 150);
*
* // Truncate string to closest sentence within 300 characters
* $s = $sanitizer->truncate($str, 300, 'sentence');
*
* // Truncate with options
* $s = $sanitizer->truncate($str, [
* 'type' => 'punctuation',
* 'maxLength' => 300,
* 'visible' => true,
* 'more' => '…'
* ]);
* ~~~~~~~
*
* @param string $str String to truncate
* @param int|array $maxLength Maximum length of returned string, or specify $options array here.
* @param array|string $options Options array, or specify `type` option (string).
* - `type` (string): Preferred truncation type of word, punctuation, sentence, or block. (default='word')
* This is a “preferred type”, not an absolute one, because it will adjust to match what it can within your maxLength.
* - `maxLength` (int): Max characters for truncation, used only if $options array substituted for $maxLength argument.
* - `maximize` (bool): Include as much as possible within specified type and max-length? (default=true)
* If you specify false for the maximize option, it will truncate to first word, puncutation, sentence or block.
* - `visible` (bool): When true, invisible text (markup, entities, etc.) does not count towards string length. (default=false)
* - `trim` (string): Characters to trim from returned string. (default=',;/ ')
* - `noTrim` (string): Never trim these from end of returned string. (default=')]>}”»')
* - `more` (string): Append this to truncated strings that do not end with sentence punctuation. (default='…')
* - `keepTags` (array): HTML tags that should be kept in returned string. (default=[])
* - `keepFormatTags` (bool): Keep HTML text-formatting tags? Simpler alternative to keepTags option. (default=false)
* - `collapseLinesWith` (string): String to collapse lines with where the first is not punctuated. (default=' … ')
* - `convertEntities` (bool): Convert HTML entities to non-entity characters? (default=false)
* - `noEndSentence` (string): Strings that sentence may not end with, space-separated values (default='Mr. Mrs. …')
* @return string
*
*/
function truncate($str, $maxLength, $options = array()) {
$defaults = array(
'type' => 'word', // word, punctuation, sentence, or block
'maximize' => true, // include as much as possible within the type and maxLength (false=include as little as possible)
'visible' => false, // when true, invisible text (markup, entities, etc.) does not count towards string length. (default=false)
'trim' => $this->_(',;/') . ' ', // Trim these characters from the end of the returned string
'noTrim' => $this->_(')]>}”»'), // Never trim these characters from end of returned string
'more' => '…', // Append to truncated strings that do not end with sentence punctuation
'stripTags' => true, // strip HTML tags? (currently required, see keepTags to keep some)
'keepTags' => array(), // if strip HTML tags is true, optional array of tag names you want to keep
'keepFormatTags' => false, // alternative to keepTags: keep just inline text format tags like strong, em, etc.
'collapseWhitespace' => true, // collapsed whitespace (currently required)
'collapseLinesWith' => ' ' . $this->_('…') . ' ', // String placed between joined lines (like from paragraphs)
'convertEntities' => false, // convert entity encoded characters to non-entity equivalents? (default=false)
'noEndSentence' => $this->_('Mr. Mrs. Ms. Dr. Hon. PhD. i.e. e.g.'), // When in sentence type, words that do not end the sentence (space-separated)
);
if(!strlen($str)) return '';
if(is_string($options) && ctype_alpha($options)) {
$defaults['type'] = $options;
$options = array();
}
if(is_array($maxLength)) {
$options = $maxLength;
if(!isset($options['maxLength'])) $options['maxLength'] = 0;
$maxLength = $options['maxLength'];
} else if(is_string($maxLength) && ctype_alpha($maxLength)) {
$options['type'] = $maxLength;
$maxLength = isset($options['maxLength']) ? $options['maxLength'] : mb_strlen($str);
}
if(!$maxLength) $maxLength = 255;
$options = array_merge($defaults, $options);
$type = $options['type'];
$str = trim($str);
$blockEndChar = '¶';
$tests = array();
$punctuationChars = $this->getPunctuationChars();
$endSentenceChars = $this->getPunctuationChars(true);
$noEndSentenceWords = explode(' ', $options['noEndSentence']);
if($options['keepFormatTags']) {
$options['keepTags'] = array_merge($options['keepTags'], array(
'abbr','acronym','b','big','cite','code','em','i','kbd', 'q','samp','small','span','strong','sub','sup','time','var',
));
}
if($type === 'block') {
if(mb_strpos($str, $blockEndChar) !== false) $str = str_replace($blockEndChar, ' ', $str);
$options['endBlocksWith'] = $blockEndChar;
}
// collapse whitespace and strip tags
$str = $this->collapse($str, $options);
if(trim($options['collapseLinesWith']) && mb_strpos($str, $options['collapseLinesWith'])) {
// if lines are collapsed with something other than whitespace, avoid using that string
// when the line already ends with sentence punctuation
foreach($endSentenceChars as $c) {
$str = str_replace("$c$options[collapseLinesWith]", "$c ", $str);
}
}
// if anything above reduced the length of the string enough, return it now
if(mb_strlen($str) <= $maxLength) return $str;
// get string at maximum possible length
if($options['visible']) {
// adjust for only visible length
$_str = $str;
$str = mb_substr($str, 0, $maxLength);
$len = $this->getVisibleLength($str);
if($len < $maxLength) {
$maxLength += ($maxLength - $len);
$str = mb_substr($_str, 0, $maxLength);
}
unset($_str);
} else {
$str = mb_substr($str, 0, $maxLength);
}
// match to closest blocks, like paragraph(s)
if($type === 'block') {
$pos = $options['maximize'] ? mb_strrpos($str, $blockEndChar) : mb_strpos($str, $blockEndChar);
if($pos === false) {
$type = 'word';
} else {
$tests[] = $pos;
$options['trim'] .= $blockEndChar;
}
}
// find sentences closest to end
if($type === 'sentence') {
foreach($endSentenceChars as $find) {
$pos = $options['maximize'] ? mb_strrpos($str, "$find ") : mb_strpos($str, "$find ");
if($pos) $tests[] = $pos;
}
if(!count($tests)) $type = 'punctuation';
}
// find punctuation closes to end of string
if($type === 'punctuation') {
foreach($punctuationChars as $find) {
$pos = $options['maximize'] ? mb_strrpos($str, $find) : mb_strpos($str, $find);
if($pos) $tests[] = $pos;
}
if(!count($tests)) $type = 'word';
}
// find whitespace and last word closest to end of string
if($type === 'word' || !count($tests)) {
$pos = $options['maximize'] ? mb_strrpos($str, ' ') : mb_strpos($str, ' ');
if($pos) $tests[] = $pos;
}
// if we didn't find any place to truncate, just return exact truncated string
if(!count($tests)) {
return trim($str, $options['trim']) . $options['more'];
}
// we found somewhere to truncate, so truncate at the longest one possible
if($options['maximize']) {
sort($tests);
} else {
rsort($tests);
}
// process our tests
do {
$pos = array_pop($tests);
$result = trim(mb_substr($str, 0, $pos + 1));
$lastChar = mb_substr($result, -1);
$result = rtrim($result, $options['trim']);
if($type === 'sentence') {
$pos = strrpos($result, ' ');
if(!$pos) break;
// if sentence type, make sure it doesn't end with a disallowed word
$lastWord = mb_substr($result, $pos + 1);
while(!ctype_alnum(mb_substr($lastWord, 0, 1)) && strlen($lastWord)) {
$lastWord = mb_substr($lastWord, 1);
}
foreach($noEndSentenceWords as $word) {
if($word !== $lastWord) continue;
$tests[] = $pos;
$type = 'word';
$result = '';
break;
}
} else if($type === 'block') {
// good to go with result as is
} else {
if(in_array($lastChar, $endSentenceChars)) {
// great, end with sentence ending punctuation
} else if(in_array($lastChar, $punctuationChars)) {
$trims = ' ';
foreach($punctuationChars as $c) {
if(mb_strpos($options['noTrim'], $c) !== false) continue;
if(in_array($c, $endSentenceChars)) continue;
$trims .= $c;
}
$result = rtrim($result, $trims) . $options['more'];
} else {
$result .= $options['more'];
}
}
} while(!strlen($result) && count($tests));
// make sure we didn't break any HTML tags as a result of truncation
if(strlen($result) && count($options['keepTags']) && strpos($result, '<') !== false) {
$result = $this->fixUnclosedTags($result);
}
return $result;
}
/**
* Return visible length of string, which is length not counting markup or entities
*
* @param string $str
* @return int
*
*/
public function getVisibleLength($str) {
if(strpos($str, '>')) {
$str = strip_tags($str);
}
if(strpos($str, '&') !== false && strpos($str, ';')) {
$str = html_entity_decode($str, ENT_QUOTES, 'UTF-8');
}
return mb_strlen($str);
}
/**
* Get array of punctuation characters
*
* @param bool $sentence Get only sentence-ending punctuation
* @return array|string
*
*/
public function getPunctuationChars($sentence = false) {
if($sentence) {
$s = $this->_('. ? !'); // Sentence ending punctuation characters (must be space-separated)
} else {
$s = $this->_(', : . ? ! “ ” „ " -- ( ) [ ] { } « »'); // All punctuation characters (must be space-separated)
}
return explode(' ', $s);
}
}