1
0
mirror of https://github.com/processwire/processwire.git synced 2025-08-15 11:14:12 +02:00

Several major improvements to WireTextTools::markupToText() method, plus related updates in Sanitizer and WireMail classes

This commit is contained in:
Ryan Cramer
2019-05-22 13:50:12 -04:00
parent f1d5f12835
commit 2b7f80d575
3 changed files with 161 additions and 32 deletions

View File

@@ -1093,18 +1093,23 @@ class Sanitizer extends Wire {
}
/**
* Returns a value that may be used in an email header
* Returns a value that may be used in an email header
*
* This method is designed to prevent one email header from injecting into another.
*
* #pw-group-strings
*
* @param string $value
* @param bool $headerName Sanitize a header name rather than header value? (default=false) Since 3.0.132
* @return string
*
*/
public function emailHeader($value) {
public function emailHeader($value, $headerName = false) {
if(!is_string($value)) return '';
$a = array("\n", "\r", "<CR>", "<LF>", "0x0A", "0x0D", "%0A", "%0D", 'content-type:', 'bcc:', 'cc:', 'to:', 'reply-to:');
return trim(str_ireplace($a, ' ', $value));
$a = array("\n", "\r", "<CR>", "<LF>", "0x0A", "0x0D", "%0A", "%0D"); // newlines
$value = trim(str_ireplace($a, ' ', stripslashes($value)));
if($headerName) $value = trim(preg_replace('/[^-_a-zA-Z0-9]/', '-', trim($value, ':')), '-');
return $value;
}
/**
@@ -1354,6 +1359,16 @@ class Sanitizer extends Wire {
/**
* Convert a string containing markup or entities to be plain text
*
* This is one implementation but there is also a better one that you may prefer with the
* `WireTextTools::markupToText()` method:
*
* ~~~~~
* $markup = '<html>a bunch of HTML here</html>';
* // try both to see what you prefer:
* $text1 = $sanitizer->markupToText($markup);
* $text2 = $sanitizer->getTextTools()->markupToText($markup);
* ~~~~~
*
* #pw-group-strings
*
* @param string $value String you want to convert
@@ -1363,6 +1378,7 @@ class Sanitizer extends Wire {
* - `entities` (bool): Entity encode returned value? (default=false).
* - `trim` (string): Character(s) to trim from beginning and end of value (default=" -,:;|\n\t").
* @return string Converted string of text
* @see WireTextTools::markupToText() for a different, though likely better (for most cases), implementation.
*
*/
public function markupToText($value, array $options = array()) {

View File

@@ -52,6 +52,8 @@
*
* @method int send() Send email.
* @method string htmlToText($html) Convert HTML email body to TEXT email body.
* @method string sanitizeHeaderName($name) #pw-internal
* @method string sanitizeHeaderValue($value) #pw-internal
*
* @property array $to To email address.
* @property array $toName Optional persons name to accompany “to” email address
@@ -67,6 +69,7 @@
* @property array $param Associative array of additional params (likely not applicable to most WireMail modules).
* @property array $attachments Array of file attachments (if populated and where supported) #pw-advanced
* @property string $newline Newline character, populated only if different from CRLF. #pw-advanced
*
*
*/
@@ -160,14 +163,46 @@ class WireMail extends WireData implements WireMailInterface {
}
/**
* Sanitize string for use in a email header
* Sanitize and normalize a header name
*
* @param string $name
* @return string
* @since 3.0.132
*
*/
protected function ___sanitizeHeaderName($name) {
	// Run the name through Sanitizer::emailHeader() in header-name mode (2nd argument true)
	/** @var Sanitizer $sanitizer */
	$sanitizer = $this->wire('sanitizer');
	$clean = $sanitizer->emailHeader($name, true);
	// Normalize capitalization: treat hyphens as word breaks, capitalize each word,
	// then restore the hyphens (e.g. "reply-to" becomes "Reply-To")
	$words = ucwords(str_replace('-', ' ', $clean));
	return str_replace(' ', '-', $words);
}
/**
* Sanitize an email header value
*
* @param string $value
* @return string
* @since 3.0.132
*
*/
protected function ___sanitizeHeaderValue($value) {
	// Delegate to Sanitizer::emailHeader() in its default (header value) mode
	/** @var Sanitizer $sanitizer */
	$sanitizer = $this->wire('sanitizer');
	return $sanitizer->emailHeader($value);
}
/**
* Alias of sanitizeHeaderValue() method for backwards compatibility
*
* #pw-internal
*
* @param string $header
* @return string
*
*
*/
protected function sanitizeHeader($header) {
return $this->wire('sanitizer')->emailHeader($header);
return $this->sanitizeHeaderValue($header);
}
/**
@@ -182,7 +217,7 @@ class WireMail extends WireData implements WireMailInterface {
if(strpos($email, '<') !== false && strpos($email, '>') !== false) {
// email has separate from name and email
if(preg_match('/^(.*?)<([^>]+)>.*$/', $email, $matches)) {
$name = $this->sanitizeHeader($matches[1]);
$name = $this->sanitizeHeaderValue($matches[1]);
$email = $matches[2];
}
}
@@ -203,7 +238,7 @@ class WireMail extends WireData implements WireMailInterface {
protected function bundleEmailAndName($email, $name) {
$email = $this->sanitizeEmail($email);
if(!strlen($name)) return $email;
$name = $this->sanitizeHeader($name);
$name = $this->sanitizeHeaderValue($name);
$delim = '';
if(strpos($name, ',') !== false) {
// name contains a comma, so quote the value
@@ -265,7 +300,7 @@ class WireMail extends WireData implements WireMailInterface {
$toEmail = $this->sanitizeEmail($toEmail);
if(strlen($toEmail)) {
$this->mail['to'][$toEmail] = $toEmail;
$this->mail['toName'][$toEmail] = $this->sanitizeHeader($toName);
$this->mail['toName'][$toEmail] = $this->sanitizeHeaderValue($toName);
}
}
@@ -289,7 +324,7 @@ class WireMail extends WireData implements WireMailInterface {
$emails = $this->mail['to'];
if(!count($emails)) throw new WireException("Please set a 'to' address before setting a name.");
$email = end($emails);
$this->mail['toName'][$email] = $this->sanitizeHeader($name);
$this->mail['toName'][$email] = $this->sanitizeHeaderValue($name);
return $this;
}
@@ -324,7 +359,7 @@ class WireMail extends WireData implements WireMailInterface {
*
*/
public function fromName($name) {
$this->mail['fromName'] = $this->sanitizeHeader($name);
$this->mail['fromName'] = $this->sanitizeHeaderValue($name);
return $this;
}
@@ -343,7 +378,7 @@ class WireMail extends WireData implements WireMailInterface {
} else {
$email = $this->sanitizeEmail($email);
}
if($name) $this->mail['replyToName'] = $this->sanitizeHeader($name);
if($name) $this->mail['replyToName'] = $this->sanitizeHeaderValue($name);
$this->mail['replyTo'] = $email;
if(empty($name) && !empty($this->mail['replyToName'])) $name = $this->mail['replyToName'];
if(strlen($name)) $email = $this->bundleEmailAndName($email, $name);
@@ -360,7 +395,7 @@ class WireMail extends WireData implements WireMailInterface {
*/
public function replyToName($name) {
if(strlen($this->mail['replyTo'])) return $this->replyTo($this->mail['replyTo'], $name);
$this->mail['replyToName'] = $this->sanitizeHeader($name);
$this->mail['replyToName'] = $this->sanitizeHeaderValue($name);
return $this;
}
@@ -372,7 +407,7 @@ class WireMail extends WireData implements WireMailInterface {
*
*/
public function subject($subject) {
$this->mail['subject'] = $this->sanitizeHeader($subject);
$this->mail['subject'] = $this->sanitizeHeaderValue($subject);
return $this;
}
@@ -430,15 +465,13 @@ class WireMail extends WireData implements WireMailInterface {
if(is_array($key)) {
$this->headers($key);
} else {
$key = $this->sanitizeHeaderName($key);
unset($this->mail['header'][$key]);
}
} else {
$k = $this->wire('sanitizer')->name($this->sanitizeHeader($key));
// ensure consistent capitalization for all header keys
$k = ucwords(str_replace('-', ' ', $k));
$k = str_replace(' ', '-', $k);
$v = $this->sanitizeHeader($value);
$this->mail['header'][$k] = $v;
} else {
$key = $this->sanitizeHeaderName($key);
$value = $this->sanitizeHeaderValue($value);
if(strlen($key)) $this->mail['header'][$key] = $value;
}
return $this;
}
@@ -761,9 +794,7 @@ class WireMail extends WireData implements WireMailInterface {
*
*/
protected function ___htmlToText($html) {
$textTools = new WireTextTools();
$this->wire($textTools);
$text = $textTools->markupToText($html);
$text = $this->wire('sanitizer')->getTextTools()->markupToText($html);
$text = str_replace("\n", "\r\n", $text);
$text = $this->strReplace($text, $this->multipartBoundary());
return $text;

View File

@@ -46,28 +46,44 @@ class WireTextTools extends Wire {
* - `splitBlocks` (string): String to split paragraph and header elements. (default="\n\n")
* - `convertEntities` (bool): Convert HTML entities to plain text equivalents? (default=true)
* - `listItemPrefix` (string): Prefix for converted list item `<li>` elements. (default='• ')
* - `linksToUrls` (bool): Convert links to "(url)" rather than removing entirely? (default=true) Since 3.0.132
* - `uppercaseHeadlines` (bool): Convert headline tags to uppercase? (default=false) Since 3.0.132
* - `underlineHeadlines` (bool): Underline headlines with "=" or "-"? (default=true) Since 3.0.132
* - `collapseSpaces` (bool): Collapse redundant/extra spaces to a single space? (default=true) Since 3.0.132
* - `replacements` (array): Associative array of strings to manually replace. (default=['&nbsp;' => ' '])
* @return string
*
*/
public function markupToText($str, array $options = array()) {
$defaults = array(
'keepTags' => array(),
'linksToUrls' => true, // convert links to just URL rather than removing entirely
'splitBlocks' => "\n\n",
'uppercaseHeadlines' => false,
'underlineHeadlines' => true,
'convertEntities' => true,
'listItemPrefix' => '• ',
'preIndent' => '', // indent for text within a <pre>
'collapseSpaces' => true,
'replacements' => array(
'&nbsp;' => ' '
),
'finishReplacements' => array(), // replacements applied at very end (internal)
);
// merge options using arrays
foreach(array('replacements') as $key) {
if(!isset($options[$key])) continue;
$options[$key] = array_merge($defaults[$key], $options[$key]);
}
$options = array_merge($defaults, $options);
if(strpos($str, '>') !== false) {
// strip out everything up to and including </head>, if present
if(strpos($str, '</head>') !== false) list(, $str) = explode('</head>', $str);
if(strpos($str, '</head>') !== false) list(, $str) = explode('</head>', $str);
// ensure tags are separated by whitespace
$str = str_replace('><', '> <', $str);
@@ -83,22 +99,79 @@ class WireTextTools extends Wire {
}
// ensure paragraphs and headers are followed by two newlines
if(stripos($str, '</p>') || stripos($str, '</h')) {
$str = preg_replace('!(</(?:p|h\d)>)!i', '$1' . $options['splitBlocks'], $str);
if(stripos($str, '</p') || stripos($str, '</h') || stripos($str, '</li') || stripos($str, '</bl') || stripos($str, '</div')) {
$str = preg_replace('!(</(?:p|h\d|ul|ol|pre|blockquote|div)>)!i', '$1' . $options['splitBlocks'], $str);
}
// ensure list items are on their own line and prefixed with a bullet
if(stripos($str, '<li') !== false) {
$prefix = in_array('li', $options['keepTags']) ? '' : $options['listItemPrefix'];
$str = preg_replace('![\s\r\n]+<li[^>]*>!i', "\n<li>$prefix", $str);
$prefix = in_array('li', $options['keepTags']) ? '' : $options['listItemPrefix'];
$str = preg_replace('![\s\r\n]+<li[^>]*>[\s\r\n]*!i', "\n<li>$prefix", $str);
if($prefix) $options['replacements']["\n$prefix "] = "\n$prefix"; // prevent extra space
}
// convert <br> tags to be just a single newline
if(stripos($str, '<br') !== false) {
$str = str_replace(array('<br>', '<br/>', '<br />'), "<br>\n", $str);
$str = str_replace(array('<br>', '<br/>', '<br />', '</li>'), "<br>\n", $str);
while(stripos($str, "\n<br>") !== false) $str = str_replace("\n<br>", "<br>", $str);
while(stripos($str, "<br>\n\n") !== false) $str = str_replace("<br>\n\n", "<br>\n", $str);
}
// make headlines more prominent with underlines or uppercase
if(($options['uppercaseHeadlines'] || $options['underlineHeadlines']) && stripos($str, '<h') !== false) {
$topHtag = '';
if($options['underlineHeadlines']) {
// determine which is the top level headline tag
for($n = 1; $n <= 6; $n++) {
if(stripos($str, "<h$n") === false) continue;
$topHtag = "h$n";
break;
}
}
if(preg_match_all('!<(h[123456])[^>]*>(.+?)</\1>!is', $str, $matches)) {
foreach($matches[2] as $key => $headline) {
$fullMatch = $matches[0][$key];
$tagName = strtolower($matches[1][$key]);
$underline = '';
if($options['underlineHeadlines']) {
$char = $tagName === $topHtag ? '=' : '-';
$underline = "\n" . str_repeat($char, $this->strlen($headline));
}
if($options['uppercaseHeadlines']) $headline = strtoupper($headline);
$str = str_replace($fullMatch, "<$tagName>$headline</$tagName>$underline", $str);
}
}
}
// convert "<a href='url'>text</a>" tags to "text (url)"
if($options['linksToUrls'] && stripos($str, '<a ') !== false) {
if(preg_match_all('!<a\s[^<>]*href=([^\s>]+)[^<>]*>(.+?)</a>!is', $str, $matches)) {
$links = array();
foreach($matches[0] as $key => $fullMatch) {
$href = trim($matches[1][$key], '"\'');
if(strpos($href, '#') === 0) continue; // do not convert jumplinks
$anchorText = $matches[2][$key];
$links[$fullMatch] = "$anchorText ($href)";
}
if(count($links)) {
$str = str_replace(array_keys($links), array_values($links), $str);
}
}
}
// indent within <pre>...</pre> sections
if(strlen($options['preIndent']) && strpos($str, '<pre') !== false) {
if(preg_match_all('!<pre(?:>|\s[^>]*>)(.+?)</pre>!is', $str, $matches)) {
foreach($matches[0] as $key => $fullMatch) {
$lines = explode("\n", $matches[1][$key]);
foreach($lines as $k => $line) {
$lines[$k] = ':preIndent:' . rtrim($line);
}
$str = str_replace($fullMatch, implode("\n", $lines), $str);
$options['finishReplacements'][':preIndent:'] = $options['preIndent'];
}
}
}
}
// strip tags
@@ -126,11 +199,20 @@ class WireTextTools extends Wire {
if($options['convertEntities'] && strpos($str, '&') !== false) {
$str = $this->wire('sanitizer')->unentities($str);
}
// collapse any redundant/extra whitespace
if($options['collapseSpaces']) {
while(strpos($str, ' ') !== false) $str = str_replace(' ', ' ', $str);
}
// normalize newlines and whitespace around newlines
while(strpos($str, " \n") !== false) $str = str_replace(" \n", "\n", $str);
while(strpos($str, "\n ") !== false) $str = str_replace("\n ", "\n", $str);
while(strpos($str, "\n\n\n") !== false) $str = str_replace("\n\n\n", "\n\n", $str);
if(count($options['finishReplacements'])) {
$str = str_replace(array_keys($options['finishReplacements']), array_values($options['finishReplacements']), $str);
}
return trim($str);
}