From da060996ec667373ac08ea69033699fb24ed2b2f Mon Sep 17 00:00:00 2001 From: Ryan Cramer Date: Thu, 24 Mar 2022 09:44:04 -0400 Subject: [PATCH] Improvements to WireTextTools::markupToText() method with new options: The 'clearTags' removes inner content of specified tags, in addition to the tags themselves, default value removes inner content of script, styel and object tags (this is to accommodate feature requested in processwire/processwire-issues#1546). The 'linksToMarkdown' option converts HTML links to Markdown style links like "[text](url)" rather than the existing "text (url)" style links that it uses by default. This version also has additional logic to prevent empty list items and links. --- wire/core/WireTextTools.php | 72 ++++++++++++++++++++++++++++++------- 1 file changed, 60 insertions(+), 12 deletions(-) diff --git a/wire/core/WireTextTools.php b/wire/core/WireTextTools.php index 74456ee0..1961505a 100644 --- a/wire/core/WireTextTools.php +++ b/wire/core/WireTextTools.php @@ -40,15 +40,20 @@ class WireTextTools extends Wire { * Like PHP’s strip_tags but with some small improvements in HTML-to-text conversion that * improves the readability of the text. * + * In 3.0.197+ inner content of script, style and object tags is now removed, rather than just the tags. + * To revert this behavior or to remove content of additional tags, see the `clearTags` option. + * * #pw-internal * * @param string $str String to convert to text * @param array $options * - `keepTags` (array): Tag names to keep in returned value, i.e. [ "em", "strong" ]. (default=none) + * - `clearTags` (array): Tags that should also have their content cleared. (default=[ "script", "style", "object" ]) Since 3.0.197 * - `splitBlocks` (string): String to split paragraph and header elements. (default="\n\n") * - `convertEntities` (bool): Convert HTML entities to plain text equivalents? (default=true) * - `listItemPrefix` (string): Prefix for converted list item `
  • ` elements. (default='• ') - * - `linksToUrls` (bool): Convert links to "(url)" rather than removing entirely? (default=true) Since 3.0.132 + * - `linksToUrls` (bool): Convert links to `(url)` rather than removing? (default=true) Since 3.0.132 + * - `linksToMarkdown` (bool): Convert links to `[text](url)` rather than removing? (default=false) Since 3.0.197 * - `uppercaseHeadlines` (bool): Convert headline tags to uppercase? (default=false) Since 3.0.132 * - `underlineHeadlines` (bool): Underline headlines with "=" or "-"? (default=true) Since 3.0.132 * - `collapseSpaces` (bool): Collapse extra/redundant extra spaces to single space? (default=true) Since 3.0.132 @@ -58,9 +63,13 @@ class WireTextTools extends Wire { */ public function markupToText($str, array $options = array()) { + $sanitizer = $this->wire()->sanitizer; + $defaults = array( - 'keepTags' => array(), + 'keepTags' => array(), + 'clearTags' => array('script', 'style', 'object'), 'linksToUrls' => true, // convert links to just URL rather than removing entirely + 'linksToMarkdown' => false, // convert links to Markdown style links 'splitBlocks' => "\n\n", 'uppercaseHeadlines' => false, 'underlineHeadlines' => true, @@ -102,14 +111,20 @@ class WireTextTools extends Wire { // ensure paragraphs and headers are followed by two newlines if(stripos($str, ')!i', '$1' . $options['splitBlocks'], $str); + $str = preg_replace('!()!i', '$1' . $options['splitBlocks'], $str); } // ensure list items are on their own line and prefixed with a bullet if(stripos($str, ']*>[\s\r\n]*!i', "\n
  • $prefix", $str); - if($prefix) $options['replacements']["\n$prefix "] = "\n$prefix"; // prevent extra space + if($prefix) { + $options['replacements']["\n$prefix "] = "\n$prefix"; // prevent extra space + $prefix = trim($prefix); + $options['finishReplacements']["\n$prefix\n$prefix"] = ""; // prevent blank items + $options['finishReplacements']["\n$prefix\n"] = ""; + + } } // convert
    tags to be just a single newline @@ -135,29 +150,31 @@ class WireTextTools extends Wire { $fullMatch = $matches[0][$key]; $tagName = strtolower($matches[1][$key]); $underline = ''; + //$headline = trim($headline); if($options['underlineHeadlines']) { $char = $tagName === $topHtag ? '=' : '-'; - $underline = "\n" . str_repeat($char, $this->strlen($headline)); + $underline = "\n" . str_repeat($char, $this->strlen(trim(strip_tags($headline)))); } if($options['uppercaseHeadlines']) $headline = strtoupper($headline); - $str = str_replace($fullMatch, "<$tagName>$headline$underline", $str); + $str = str_replace($fullMatch, "\n\n<$tagName>$headline$underline", $str); } } } // convert "text" tags to "text (url)" - if($options['linksToUrls'] && stripos($str, ']*href=([^\s>]+)[^<>]*>(.+?)!is', $str, $matches)) { $links = array(); foreach($matches[0] as $key => $fullMatch) { $href = trim($matches[1][$key], '"\''); if(strpos($href, '#') === 0) continue; // do not convert jumplinks - $anchorText = $matches[2][$key]; - $links[$fullMatch] = "$anchorText ($href)"; + $anchorText = trim($matches[2][$key]); + $links[$fullMatch] = "[$anchorText]($href)"; } if(count($links)) { $str = str_replace(array_keys($links), array_values($links), $str); } + unset($links); } } @@ -171,11 +188,30 @@ class WireTextTools extends Wire { } $str = str_replace($fullMatch, implode("\n", $lines), $str); $options['finishReplacements'][':preIndent:'] = $options['preIndent']; + unset($lines); } } } - } + // strip tags AND their contents for specified tags + foreach($options['clearTags'] as $s) { + $s = strtolower($s); + if(stripos($str, "<$s") === false) continue; + $str = str_ireplace(array("<$s", " $part) { + if(strpos($part, "") === false) { + if($key > 0) unset($parts[$key]); // remove nested inner content + } else { + $endparts = explode("", $part); + $parts[$key] = array_pop($endparts); // convert to content after last + } + } + $str = implode("", $parts); + unset($parts, $endparts, $s); + } + } + // strip tags if(count($options['keepTags'])) { // some tags will be allowed to remain @@ -199,7 +235,7 @@ class WireTextTools extends Wire { // convert entities to plain text equivalents if($options['convertEntities'] && strpos($str, '&') !== false) { - $str = $this->wire('sanitizer')->unentities($str); + $str = $sanitizer->unentities($str); } // collapse any redundant/extra whitespace @@ -211,7 +247,19 @@ class WireTextTools extends Wire { while(strpos($str, " \n") !== false) $str = str_replace(" \n", "\n", $str); while(strpos($str, "\n ") !== false) $str = str_replace("\n ", "\n", $str); while(strpos($str, "\n\n\n") !== false) $str = str_replace("\n\n\n", "\n\n", $str); - + + if(strpos($str, '](')) { + // contains links + if(strpos($str, '[](') !== false || strpos($str, '[ ](') !== false) { + // remove links that lack anchor text + $str = preg_replace('!\[\s*\]\([^)]*\)!', '', $str); + } + if($options['linksToUrls']) { + // convert markdown style "[text](url)" to "text (url)" + if(!$options['linksToMarkdown']) $str = preg_replace('!\[\s*(.+?)\]\(!', '$1 (', $str); + } + } + if(count($options['finishReplacements'])) { $str = str_replace(array_keys($options['finishReplacements']), array_values($options['finishReplacements']), $str); }