1
0
mirror of https://github.com/processwire/processwire.git synced 2025-08-16 11:44:42 +02:00

Several major improvements to WireTextTools::markupToText() method, plus related updates in Sanitizer and WireMail classes

This commit is contained in:
Ryan Cramer
2019-05-22 13:50:12 -04:00
parent f1d5f12835
commit 2b7f80d575
3 changed files with 161 additions and 32 deletions

View File

@@ -1095,16 +1095,21 @@ class Sanitizer extends Wire {
/** /**
* Returns a value that may be used in an email header * Returns a value that may be used in an email header
* *
* This method is designed to prevent one email header from injecting into another.
*
* #pw-group-strings * #pw-group-strings
* *
* @param string $value * @param string $value
* @param bool $headerName Sanitize a header name rather than header value? (default=false) Since 3.0.132
* @return string * @return string
* *
*/ */
public function emailHeader($value) { public function emailHeader($value, $headerName = false) {
if(!is_string($value)) return ''; if(!is_string($value)) return '';
$a = array("\n", "\r", "<CR>", "<LF>", "0x0A", "0x0D", "%0A", "%0D", 'content-type:', 'bcc:', 'cc:', 'to:', 'reply-to:'); $a = array("\n", "\r", "<CR>", "<LF>", "0x0A", "0x0D", "%0A", "%0D"); // newlines
return trim(str_ireplace($a, ' ', $value)); $value = trim(str_ireplace($a, ' ', stripslashes($value)));
if($headerName) $value = trim(preg_replace('/[^-_a-zA-Z0-9]/', '-', trim($value, ':')), '-');
return $value;
} }
/** /**
@@ -1354,6 +1359,16 @@ class Sanitizer extends Wire {
/** /**
* Convert a string containing markup or entities to be plain text * Convert a string containing markup or entities to be plain text
* *
* This is one implementation but there is also a better one that you may prefer with the
* `WireTextTools::markupToText()` method:
*
* ~~~~~
* $markup = '<html>a bunch of HTML here</html>';
* // try both to see what you prefer:
* $text1 = $sanitizer->markupToText($markup);
* $text2 = $sanitizer->getTextTools()->markupToText($markup);
* ~~~~~
*
* #pw-group-strings * #pw-group-strings
* *
* @param string $value String you want to convert * @param string $value String you want to convert
@@ -1363,6 +1378,7 @@ class Sanitizer extends Wire {
* - `entities` (bool): Entity encode returned value? (default=false). * - `entities` (bool): Entity encode returned value? (default=false).
* - `trim` (string): Character(s) to trim from beginning and end of value (default=" -,:;|\n\t"). * - `trim` (string): Character(s) to trim from beginning and end of value (default=" -,:;|\n\t").
* @return string Converted string of text * @return string Converted string of text
* @see WireTextTools::markupToText() for different though likely better (for most cases) implementation.
* *
*/ */
public function markupToText($value, array $options = array()) { public function markupToText($value, array $options = array()) {

View File

@@ -52,6 +52,8 @@
* *
* @method int send() Send email. * @method int send() Send email.
* @method string htmlToText($html) Convert HTML email body to TEXT email body. * @method string htmlToText($html) Convert HTML email body to TEXT email body.
* @method string sanitizeHeaderName($name) #pw-internal
* @method string sanitizeHeaderValue($value) #pw-internal
* *
* @property array $to To email address. * @property array $to To email address.
* @property array $toName Optional persons name to accompany “to” email address * @property array $toName Optional persons name to accompany “to” email address
@@ -68,6 +70,7 @@
* @property array $attachments Array of file attachments (if populated and where supported) #pw-advanced * @property array $attachments Array of file attachments (if populated and where supported) #pw-advanced
* @property string $newline Newline character, populated only if different from CRLF. #pw-advanced * @property string $newline Newline character, populated only if different from CRLF. #pw-advanced
* *
*
*/ */
class WireMail extends WireData implements WireMailInterface { class WireMail extends WireData implements WireMailInterface {
@@ -160,14 +163,46 @@ class WireMail extends WireData implements WireMailInterface {
} }
/** /**
* Sanitize string for use in a email header * Sanitize and normalize a header name
*
* @param string $name
* @return string
* @since 3.0.132
*
*/
protected function ___sanitizeHeaderName($name) {
	/** @var Sanitizer $sanitizer */
	$sanitizer = $this->wire('sanitizer');
	// second argument (true) asks emailHeader() to sanitize a header name rather than a value
	$clean = $sanitizer->emailHeader($name, true);
	// capitalize the first letter of each hyphen-separated word: swap hyphens for
	// spaces so that ucwords() sees word boundaries, then swap them back again
	// (ucwords only uppercases first letters; remaining letters are left as given)
	$spaced = str_replace('-', ' ', $clean);
	return str_replace(' ', '-', ucwords($spaced));
}
/**
 * Sanitize an email header value
 *
 * Passes the value through Sanitizer::emailHeader() using its default
 * (header value, not header name) mode and returns the result.
 *
 * @param string $value Header value to sanitize
 * @return string Sanitized header value
 * @since 3.0.132
 *
 */
protected function ___sanitizeHeaderValue($value) {
	/** @var Sanitizer $sanitizer */
	$sanitizer = $this->wire('sanitizer');
	$sanitized = $sanitizer->emailHeader($value);
	return $sanitized;
}
/**
* Alias of sanitizeHeaderValue() method for backwards compatibility
*
* #pw-internal
* *
* @param string $header * @param string $header
* @return string * @return string
* *
*/ */
protected function sanitizeHeader($header) { protected function sanitizeHeader($header) {
return $this->wire('sanitizer')->emailHeader($header); return $this->sanitizeHeaderValue($header);
} }
/** /**
@@ -182,7 +217,7 @@ class WireMail extends WireData implements WireMailInterface {
if(strpos($email, '<') !== false && strpos($email, '>') !== false) { if(strpos($email, '<') !== false && strpos($email, '>') !== false) {
// email has separate from name and email // email has separate from name and email
if(preg_match('/^(.*?)<([^>]+)>.*$/', $email, $matches)) { if(preg_match('/^(.*?)<([^>]+)>.*$/', $email, $matches)) {
$name = $this->sanitizeHeader($matches[1]); $name = $this->sanitizeHeaderValue($matches[1]);
$email = $matches[2]; $email = $matches[2];
} }
} }
@@ -203,7 +238,7 @@ class WireMail extends WireData implements WireMailInterface {
protected function bundleEmailAndName($email, $name) { protected function bundleEmailAndName($email, $name) {
$email = $this->sanitizeEmail($email); $email = $this->sanitizeEmail($email);
if(!strlen($name)) return $email; if(!strlen($name)) return $email;
$name = $this->sanitizeHeader($name); $name = $this->sanitizeHeaderValue($name);
$delim = ''; $delim = '';
if(strpos($name, ',') !== false) { if(strpos($name, ',') !== false) {
// name contains a comma, so quote the value // name contains a comma, so quote the value
@@ -265,7 +300,7 @@ class WireMail extends WireData implements WireMailInterface {
$toEmail = $this->sanitizeEmail($toEmail); $toEmail = $this->sanitizeEmail($toEmail);
if(strlen($toEmail)) { if(strlen($toEmail)) {
$this->mail['to'][$toEmail] = $toEmail; $this->mail['to'][$toEmail] = $toEmail;
$this->mail['toName'][$toEmail] = $this->sanitizeHeader($toName); $this->mail['toName'][$toEmail] = $this->sanitizeHeaderValue($toName);
} }
} }
@@ -289,7 +324,7 @@ class WireMail extends WireData implements WireMailInterface {
$emails = $this->mail['to']; $emails = $this->mail['to'];
if(!count($emails)) throw new WireException("Please set a 'to' address before setting a name."); if(!count($emails)) throw new WireException("Please set a 'to' address before setting a name.");
$email = end($emails); $email = end($emails);
$this->mail['toName'][$email] = $this->sanitizeHeader($name); $this->mail['toName'][$email] = $this->sanitizeHeaderValue($name);
return $this; return $this;
} }
@@ -324,7 +359,7 @@ class WireMail extends WireData implements WireMailInterface {
* *
*/ */
public function fromName($name) { public function fromName($name) {
$this->mail['fromName'] = $this->sanitizeHeader($name); $this->mail['fromName'] = $this->sanitizeHeaderValue($name);
return $this; return $this;
} }
@@ -343,7 +378,7 @@ class WireMail extends WireData implements WireMailInterface {
} else { } else {
$email = $this->sanitizeEmail($email); $email = $this->sanitizeEmail($email);
} }
if($name) $this->mail['replyToName'] = $this->sanitizeHeader($name); if($name) $this->mail['replyToName'] = $this->sanitizeHeaderValue($name);
$this->mail['replyTo'] = $email; $this->mail['replyTo'] = $email;
if(empty($name) && !empty($this->mail['replyToName'])) $name = $this->mail['replyToName']; if(empty($name) && !empty($this->mail['replyToName'])) $name = $this->mail['replyToName'];
if(strlen($name)) $email = $this->bundleEmailAndName($email, $name); if(strlen($name)) $email = $this->bundleEmailAndName($email, $name);
@@ -360,7 +395,7 @@ class WireMail extends WireData implements WireMailInterface {
*/ */
public function replyToName($name) { public function replyToName($name) {
if(strlen($this->mail['replyTo'])) return $this->replyTo($this->mail['replyTo'], $name); if(strlen($this->mail['replyTo'])) return $this->replyTo($this->mail['replyTo'], $name);
$this->mail['replyToName'] = $this->sanitizeHeader($name); $this->mail['replyToName'] = $this->sanitizeHeaderValue($name);
return $this; return $this;
} }
@@ -372,7 +407,7 @@ class WireMail extends WireData implements WireMailInterface {
* *
*/ */
public function subject($subject) { public function subject($subject) {
$this->mail['subject'] = $this->sanitizeHeader($subject); $this->mail['subject'] = $this->sanitizeHeaderValue($subject);
return $this; return $this;
} }
@@ -430,15 +465,13 @@ class WireMail extends WireData implements WireMailInterface {
if(is_array($key)) { if(is_array($key)) {
$this->headers($key); $this->headers($key);
} else { } else {
$key = $this->sanitizeHeaderName($key);
unset($this->mail['header'][$key]); unset($this->mail['header'][$key]);
} }
} else { } else {
$k = $this->wire('sanitizer')->name($this->sanitizeHeader($key)); $key = $this->sanitizeHeaderName($key);
// ensure consistent capitalization for all header keys $value = $this->sanitizeHeaderValue($value);
$k = ucwords(str_replace('-', ' ', $k)); if(strlen($key)) $this->mail['header'][$key] = $value;
$k = str_replace(' ', '-', $k);
$v = $this->sanitizeHeader($value);
$this->mail['header'][$k] = $v;
} }
return $this; return $this;
} }
@@ -761,9 +794,7 @@ class WireMail extends WireData implements WireMailInterface {
* *
*/ */
protected function ___htmlToText($html) { protected function ___htmlToText($html) {
$textTools = new WireTextTools(); $text = $this->wire('sanitizer')->getTextTools()->markupToText($html);
$this->wire($textTools);
$text = $textTools->markupToText($html);
$text = str_replace("\n", "\r\n", $text); $text = str_replace("\n", "\r\n", $text);
$text = $this->strReplace($text, $this->multipartBoundary()); $text = $this->strReplace($text, $this->multipartBoundary());
return $text; return $text;

View File

@@ -46,6 +46,10 @@ class WireTextTools extends Wire {
* - `splitBlocks` (string): String to split paragraph and header elements. (default="\n\n") * - `splitBlocks` (string): String to split paragraph and header elements. (default="\n\n")
* - `convertEntities` (bool): Convert HTML entities to plain text equivalents? (default=true) * - `convertEntities` (bool): Convert HTML entities to plain text equivalents? (default=true)
* - `listItemPrefix` (string): Prefix for converted list item `<li>` elements. (default='• ') * - `listItemPrefix` (string): Prefix for converted list item `<li>` elements. (default='• ')
* - `linksToUrls` (bool): Convert links to "(url)" rather than removing entirely? (default=true) Since 3.0.132
* - `uppercaseHeadlines` (bool): Convert headline tags to uppercase? (default=false) Since 3.0.132
* - `underlineHeadlines` (bool): Underline headlines with "=" or "-"? (default=true) Since 3.0.132
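* - `collapseSpaces` (bool): Collapse redundant spaces to a single space? (default=true) Since 3.0.132
* - `preIndent` (string): Indent to apply to each line of text within a `<pre>` element. (default='') Since 3.0.132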
* - `replacements` (array): Associative array of strings to manually replace. (default=['&nbsp;' => ' ']) * - `replacements` (array): Associative array of strings to manually replace. (default=['&nbsp;' => ' '])
* @return string * @return string
* *
@@ -54,14 +58,26 @@ class WireTextTools extends Wire {
$defaults = array( $defaults = array(
'keepTags' => array(), 'keepTags' => array(),
'linksToUrls' => true, // convert links to just URL rather than removing entirely
'splitBlocks' => "\n\n", 'splitBlocks' => "\n\n",
'uppercaseHeadlines' => false,
'underlineHeadlines' => true,
'convertEntities' => true, 'convertEntities' => true,
'listItemPrefix' => '• ', 'listItemPrefix' => '• ',
'preIndent' => '', // indent for text within a <pre>
'collapseSpaces' => true,
'replacements' => array( 'replacements' => array(
'&nbsp;' => ' ' '&nbsp;' => ' '
), ),
'finishReplacements' => array(), // replacements applied at very end (internal)
); );
// merge options using arrays
foreach(array('replacements') as $key) {
if(!isset($options[$key])) continue;
$options[$key] = array_merge($defaults[$key], $options[$key]);
}
$options = array_merge($defaults, $options); $options = array_merge($defaults, $options);
if(strpos($str, '>') !== false) { if(strpos($str, '>') !== false) {
@@ -83,22 +99,79 @@ class WireTextTools extends Wire {
} }
// ensure paragraphs and headers are followed by two newlines // ensure paragraphs and headers are followed by two newlines
if(stripos($str, '</p>') || stripos($str, '</h')) { if(stripos($str, '</p') || stripos($str, '</h') || stripos($str, '</li') || stripos($str, '</bl') || stripos($str, '</div')) {
$str = preg_replace('!(</(?:p|h\d)>)!i', '$1' . $options['splitBlocks'], $str); $str = preg_replace('!(</(?:p|h\d|ul|ol|pre|blockquote|div)>)!i', '$1' . $options['splitBlocks'], $str);
} }
// ensure list items are on their own line and prefixed with a bullet // ensure list items are on their own line and prefixed with a bullet
if(stripos($str, '<li') !== false) { if(stripos($str, '<li') !== false) {
$prefix = in_array('li', $options['keepTags']) ? '' : $options['listItemPrefix']; $prefix = in_array('li', $options['keepTags']) ? '' : $options['listItemPrefix'];
$str = preg_replace('![\s\r\n]+<li[^>]*>!i', "\n<li>$prefix", $str); $str = preg_replace('![\s\r\n]+<li[^>]*>[\s\r\n]*!i', "\n<li>$prefix", $str);
if($prefix) $options['replacements']["\n$prefix "] = "\n$prefix"; // prevent extra space
} }
// convert <br> tags to be just a single newline // convert <br> tags to be just a single newline
if(stripos($str, '<br') !== false) { if(stripos($str, '<br') !== false) {
$str = str_replace(array('<br>', '<br/>', '<br />'), "<br>\n", $str); $str = str_replace(array('<br>', '<br/>', '<br />', '</li>'), "<br>\n", $str);
while(stripos($str, "\n<br>") !== false) $str = str_replace("\n<br>", "<br>", $str); while(stripos($str, "\n<br>") !== false) $str = str_replace("\n<br>", "<br>", $str);
while(stripos($str, "<br>\n\n") !== false) $str = str_replace("<br>\n\n", "<br>\n", $str); while(stripos($str, "<br>\n\n") !== false) $str = str_replace("<br>\n\n", "<br>\n", $str);
} }
// make headlines more prominent with underlines or uppercase
if(($options['uppercaseHeadlines'] || $options['underlineHeadlines']) && stripos($str, '<h') !== false) {
$topHtag = '';
if($options['underlineHeadlines']) {
// determine which is the top level headline tag
for($n = 1; $n <= 6; $n++) {
if(stripos($str, "<h$n") === false) continue;
$topHtag = "h$n";
break;
}
}
if(preg_match_all('!<(h[123456])[^>]*>(.+?)</\1>!is', $str, $matches)) {
foreach($matches[2] as $key => $headline) {
$fullMatch = $matches[0][$key];
$tagName = strtolower($matches[1][$key]);
$underline = '';
if($options['underlineHeadlines']) {
$char = $tagName === $topHtag ? '=' : '-';
$underline = "\n" . str_repeat($char, $this->strlen($headline));
}
if($options['uppercaseHeadlines']) $headline = strtoupper($headline);
$str = str_replace($fullMatch, "<$tagName>$headline</$tagName>$underline", $str);
}
}
}
// convert "<a href='url'>text</a>" tags to "text (url)"
if($options['linksToUrls'] && stripos($str, '<a ') !== false) {
if(preg_match_all('!<a\s[^<>]*href=([^\s>]+)[^<>]*>(.+?)</a>!is', $str, $matches)) {
$links = array();
foreach($matches[0] as $key => $fullMatch) {
$href = trim($matches[1][$key], '"\'');
if(strpos($href, '#') === 0) continue; // do not convert jumplinks
$anchorText = $matches[2][$key];
$links[$fullMatch] = "$anchorText ($href)";
}
if(count($links)) {
$str = str_replace(array_keys($links), array_values($links), $str);
}
}
}
// indent within <pre>...</pre> sections
if(strlen($options['preIndent']) && strpos($str, '<pre') !== false) {
if(preg_match_all('!<pre(?:>|\s[^>]*>)(.+?)</pre>!is', $str, $matches)) {
foreach($matches[0] as $key => $fullMatch) {
$lines = explode("\n", $matches[1][$key]);
foreach($lines as $k => $line) {
$lines[$k] = ':preIndent:' . rtrim($line);
}
$str = str_replace($fullMatch, implode("\n", $lines), $str);
$options['finishReplacements'][':preIndent:'] = $options['preIndent'];
}
}
}
} }
// strip tags // strip tags
@@ -127,11 +200,20 @@ class WireTextTools extends Wire {
$str = $this->wire('sanitizer')->unentities($str); $str = $this->wire('sanitizer')->unentities($str);
} }
// collapse any redundant/extra whitespace
if($options['collapseSpaces']) {
while(strpos($str, ' ') !== false) $str = str_replace(' ', ' ', $str);
}
// normalize newlines and whitespace around newlines // normalize newlines and whitespace around newlines
while(strpos($str, " \n") !== false) $str = str_replace(" \n", "\n", $str); while(strpos($str, " \n") !== false) $str = str_replace(" \n", "\n", $str);
while(strpos($str, "\n ") !== false) $str = str_replace("\n ", "\n", $str); while(strpos($str, "\n ") !== false) $str = str_replace("\n ", "\n", $str);
while(strpos($str, "\n\n\n") !== false) $str = str_replace("\n\n\n", "\n\n", $str); while(strpos($str, "\n\n\n") !== false) $str = str_replace("\n\n\n", "\n\n", $str);
if(count($options['finishReplacements'])) {
$str = str_replace(array_keys($options['finishReplacements']), array_values($options['finishReplacements']), $str);
}
return trim($str); return trim($str);
} }