diff --git a/wire/core/WireTextTools.php b/wire/core/WireTextTools.php index 74456ee0..1961505a 100644 --- a/wire/core/WireTextTools.php +++ b/wire/core/WireTextTools.php @@ -40,15 +40,20 @@ class WireTextTools extends Wire { * Like PHP’s strip_tags but with some small improvements in HTML-to-text conversion that * improves the readability of the text. * + * In 3.0.197+ inner content of script, style and object tags is now removed, rather than just the tags. + * To revert this behavior or to remove content of additional tags, see the `clearTags` option. + * * #pw-internal * * @param string $str String to convert to text * @param array $options * - `keepTags` (array): Tag names to keep in returned value, i.e. [ "em", "strong" ]. (default=none) + * - `clearTags` (array): Tags that should also have their content cleared. (default=[ "script", "style", "object" ]) Since 3.0.197 * - `splitBlocks` (string): String to split paragraph and header elements. (default="\n\n") * - `convertEntities` (bool): Convert HTML entities to plain text equivalents? (default=true) * - `listItemPrefix` (string): Prefix for converted list item `
  • ` elements. (default='• ') - * - `linksToUrls` (bool): Convert links to "(url)" rather than removing entirely? (default=true) Since 3.0.132 + * - `linksToUrls` (bool): Convert links to `(url)` rather than removing? (default=true) Since 3.0.132 + * - `linksToMarkdown` (bool): Convert links to `[text](url)` rather than removing? (default=false) Since 3.0.197 * - `uppercaseHeadlines` (bool): Convert headline tags to uppercase? (default=false) Since 3.0.132 * - `underlineHeadlines` (bool): Underline headlines with "=" or "-"? (default=true) Since 3.0.132 * - `collapseSpaces` (bool): Collapse extra/redundant extra spaces to single space? (default=true) Since 3.0.132 @@ -58,9 +63,13 @@ class WireTextTools extends Wire { */ public function markupToText($str, array $options = array()) { + $sanitizer = $this->wire()->sanitizer; + $defaults = array( - 'keepTags' => array(), + 'keepTags' => array(), + 'clearTags' => array('script', 'style', 'object'), 'linksToUrls' => true, // convert links to just URL rather than removing entirely + 'linksToMarkdown' => false, // convert links to Markdown style links 'splitBlocks' => "\n\n", 'uppercaseHeadlines' => false, 'underlineHeadlines' => true, @@ -102,14 +111,20 @@ class WireTextTools extends Wire { // ensure paragraphs and headers are followed by two newlines if(stripos($str, ')!i', '$1' . $options['splitBlocks'], $str); + $str = preg_replace('!()!i', '$1' . $options['splitBlocks'], $str); } // ensure list items are on their own line and prefixed with a bullet if(stripos($str, ']*>[\s\r\n]*!i', "\n
  • $prefix", $str); - if($prefix) $options['replacements']["\n$prefix "] = "\n$prefix"; // prevent extra space + if($prefix) { + $options['replacements']["\n$prefix "] = "\n$prefix"; // prevent extra space + $prefix = trim($prefix); + $options['finishReplacements']["\n$prefix\n$prefix"] = ""; // prevent blank items + $options['finishReplacements']["\n$prefix\n"] = ""; + + } } // convert
    tags to be just a single newline @@ -135,29 +150,31 @@ class WireTextTools extends Wire { $fullMatch = $matches[0][$key]; $tagName = strtolower($matches[1][$key]); $underline = ''; + //$headline = trim($headline); if($options['underlineHeadlines']) { $char = $tagName === $topHtag ? '=' : '-'; - $underline = "\n" . str_repeat($char, $this->strlen($headline)); + $underline = "\n" . str_repeat($char, $this->strlen(trim(strip_tags($headline)))); } if($options['uppercaseHeadlines']) $headline = strtoupper($headline); - $str = str_replace($fullMatch, "<$tagName>$headline$underline", $str); + $str = str_replace($fullMatch, "\n\n<$tagName>$headline$underline", $str); } } } // convert "text" tags to "text (url)" - if($options['linksToUrls'] && stripos($str, ']*href=([^\s>]+)[^<>]*>(.+?)!is', $str, $matches)) { $links = array(); foreach($matches[0] as $key => $fullMatch) { $href = trim($matches[1][$key], '"\''); if(strpos($href, '#') === 0) continue; // do not convert jumplinks - $anchorText = $matches[2][$key]; - $links[$fullMatch] = "$anchorText ($href)"; + $anchorText = trim($matches[2][$key]); + $links[$fullMatch] = "[$anchorText]($href)"; } if(count($links)) { $str = str_replace(array_keys($links), array_values($links), $str); } + unset($links); } } @@ -171,11 +188,30 @@ class WireTextTools extends Wire { } $str = str_replace($fullMatch, implode("\n", $lines), $str); $options['finishReplacements'][':preIndent:'] = $options['preIndent']; + unset($lines); } } } - } + // strip tags AND their contents for specified tags + foreach($options['clearTags'] as $s) { + $s = strtolower($s); + if(stripos($str, "<$s") === false) continue; + $str = str_ireplace(array("<$s", " $part) { + if(strpos($part, "") === false) { + if($key > 0) unset($parts[$key]); // remove nested inner content + } else { + $endparts = explode("", $part); + $parts[$key] = array_pop($endparts); // convert to content after last + } + } + $str = implode("", $parts); + unset($parts, $endparts, $s); + } + } + // strip tags if(count($options['keepTags'])) { // some tags will be allowed to remain @@ -199,7 +235,7 @@ class WireTextTools extends Wire { // convert entities to plain text equivalents if($options['convertEntities'] && strpos($str, '&') !== false) { - $str = $this->wire('sanitizer')->unentities($str); + $str = $sanitizer->unentities($str); } // collapse any redundant/extra whitespace @@ -211,7 +247,19 @@ class WireTextTools extends Wire { while(strpos($str, " \n") !== false) $str = str_replace(" \n", "\n", $str); while(strpos($str, "\n ") !== false) $str = str_replace("\n ", "\n", $str); while(strpos($str, "\n\n\n") !== false) $str = str_replace("\n\n\n", "\n\n", $str); - + + if(strpos($str, '](')) { + // contains links + if(strpos($str, '[](') !== false || strpos($str, '[ ](') !== false) { + // remove links that lack anchor text + $str = preg_replace('!\[\s*\]\([^)]*\)!', '', $str); + } + if($options['linksToUrls']) { + // convert markdown style "[text](url)" to "text (url)" + if(!$options['linksToMarkdown']) $str = preg_replace('!\[\s*(.+?)\]\(!', '$1 (', $str); + } + } + if(count($options['finishReplacements'])) { $str = str_replace(array_keys($options['finishReplacements']), array_values($options['finishReplacements']), $str); }