From da060996ec667373ac08ea69033699fb24ed2b2f Mon Sep 17 00:00:00 2001
From: Ryan Cramer
Date: Thu, 24 Mar 2022 09:44:04 -0400
Subject: [PATCH] Improvements to WireTextTools::markupToText() method with new
options: The 'clearTags' removes inner content of specified tags, in addition
to the tags themselves, default value removes inner content of script, style
and object tags (this is to accommodate feature requested in
processwire/processwire-issues#1546). The 'linksToMarkdown' option converts
HTML links to Markdown style links like "[text](url)" rather than the
existing "text (url)" style links that it uses by default. This version also
has additional logic to prevent empty list items and links.
---
wire/core/WireTextTools.php | 72 ++++++++++++++++++++++++++++++-------
1 file changed, 60 insertions(+), 12 deletions(-)
diff --git a/wire/core/WireTextTools.php b/wire/core/WireTextTools.php
index 74456ee0..1961505a 100644
--- a/wire/core/WireTextTools.php
+++ b/wire/core/WireTextTools.php
@@ -40,15 +40,20 @@ class WireTextTools extends Wire {
* Like PHP’s strip_tags but with some small improvements in HTML-to-text conversion that
* improves the readability of the text.
*
+ * In 3.0.197+ inner content of script, style and object tags is now removed, rather than just the tags.
+ * To revert this behavior or to remove content of additional tags, see the `clearTags` option.
+ *
* #pw-internal
*
* @param string $str String to convert to text
* @param array $options
* - `keepTags` (array): Tag names to keep in returned value, i.e. [ "em", "strong" ]. (default=none)
+ * - `clearTags` (array): Tags that should also have their content cleared. (default=[ "script", "style", "object" ]) Since 3.0.197
* - `splitBlocks` (string): String to split paragraph and header elements. (default="\n\n")
* - `convertEntities` (bool): Convert HTML entities to plain text equivalents? (default=true)
* - `listItemPrefix` (string): Prefix for converted list item `<li>` elements. (default='• ')
- * - `linksToUrls` (bool): Convert links to "(url)" rather than removing entirely? (default=true) Since 3.0.132
+ * - `linksToUrls` (bool): Convert links to `(url)` rather than removing? (default=true) Since 3.0.132
+ * - `linksToMarkdown` (bool): Convert links to `[text](url)` rather than removing? (default=false) Since 3.0.197
* - `uppercaseHeadlines` (bool): Convert headline tags to uppercase? (default=false) Since 3.0.132
* - `underlineHeadlines` (bool): Underline headlines with "=" or "-"? (default=true) Since 3.0.132
* - `collapseSpaces` (bool): Collapse extra/redundant extra spaces to single space? (default=true) Since 3.0.132
@@ -58,9 +63,13 @@ class WireTextTools extends Wire {
*/
public function markupToText($str, array $options = array()) {
+ $sanitizer = $this->wire()->sanitizer;
+
$defaults = array(
- 'keepTags' => array(),
+ 'keepTags' => array(),
+ 'clearTags' => array('script', 'style', 'object'),
'linksToUrls' => true, // convert links to just URL rather than removing entirely
+ 'linksToMarkdown' => false, // convert links to Markdown style links
'splitBlocks' => "\n\n",
'uppercaseHeadlines' => false,
'underlineHeadlines' => true,
@@ -102,14 +111,20 @@ class WireTextTools extends Wire {
// ensure paragraphs and headers are followed by two newlines
if(stripos($str, '
)!i', '$1' . $options['splitBlocks'], $str);
+ $str = preg_replace('!(?(?:p|h\d|ul|ol|pre|blockquote|div)>)!i', '$1' . $options['splitBlocks'], $str);
}
// ensure list items are on their own line and prefixed with a bullet
if(stripos($str, ']*>[\s\r\n]*!i', "\n$prefix", $str);
- if($prefix) $options['replacements']["\n$prefix "] = "\n$prefix"; // prevent extra space
+ if($prefix) {
+ $options['replacements']["\n$prefix "] = "\n$prefix"; // prevent extra space
+ $prefix = trim($prefix);
+ $options['finishReplacements']["\n$prefix\n$prefix"] = ""; // prevent blank items
+ $options['finishReplacements']["\n$prefix\n"] = "";
+
+ }
}
// convert <br> tags to be just a single newline
@@ -135,29 +150,31 @@ class WireTextTools extends Wire {
$fullMatch = $matches[0][$key];
$tagName = strtolower($matches[1][$key]);
$underline = '';
+ //$headline = trim($headline);
if($options['underlineHeadlines']) {
$char = $tagName === $topHtag ? '=' : '-';
- $underline = "\n" . str_repeat($char, $this->strlen($headline));
+ $underline = "\n" . str_repeat($char, $this->strlen(trim(strip_tags($headline))));
}
if($options['uppercaseHeadlines']) $headline = strtoupper($headline);
- $str = str_replace($fullMatch, "<$tagName>$headline$tagName>$underline", $str);
+ $str = str_replace($fullMatch, "\n\n<$tagName>$headline$tagName>$underline", $str);
}
}
}
// convert "<a href='url'>text</a>" tags to "text (url)"
- if($options['linksToUrls'] && stripos($str, ']*href=([^\s>]+)[^<>]*>(.+?)!is', $str, $matches)) {
$links = array();
foreach($matches[0] as $key => $fullMatch) {
$href = trim($matches[1][$key], '"\'');
if(strpos($href, '#') === 0) continue; // do not convert jumplinks
- $anchorText = $matches[2][$key];
- $links[$fullMatch] = "$anchorText ($href)";
+ $anchorText = trim($matches[2][$key]);
+ $links[$fullMatch] = "[$anchorText]($href)";
}
if(count($links)) {
$str = str_replace(array_keys($links), array_values($links), $str);
}
+ unset($links);
}
}
@@ -171,11 +188,30 @@ class WireTextTools extends Wire {
}
$str = str_replace($fullMatch, implode("\n", $lines), $str);
$options['finishReplacements'][':preIndent:'] = $options['preIndent'];
+ unset($lines);
}
}
}
- }
+ // strip tags AND their contents for specified tags
+ foreach($options['clearTags'] as $s) {
+ $s = strtolower($s);
+ if(stripos($str, "<$s") === false) continue;
+ $str = str_ireplace(array("<$s", "$s"), array("<$s", "$s"), $str); // adjust case
+ $parts = explode("<$s", $str);
+ foreach($parts as $key => $part) {
+ if(strpos($part, "$s>") === false) {
+ if($key > 0) unset($parts[$key]); // remove nested inner content
+ } else {
+ $endparts = explode("$s>", $part);
+ $parts[$key] = array_pop($endparts); // convert to content after last
+ }
+ }
+ $str = implode("", $parts);
+ unset($parts, $endparts, $s);
+ }
+ }
+
// strip tags
if(count($options['keepTags'])) {
// some tags will be allowed to remain
@@ -199,7 +235,7 @@ class WireTextTools extends Wire {
// convert entities to plain text equivalents
if($options['convertEntities'] && strpos($str, '&') !== false) {
- $str = $this->wire('sanitizer')->unentities($str);
+ $str = $sanitizer->unentities($str);
}
// collapse any redundant/extra whitespace
@@ -211,7 +247,19 @@ class WireTextTools extends Wire {
while(strpos($str, " \n") !== false) $str = str_replace(" \n", "\n", $str);
while(strpos($str, "\n ") !== false) $str = str_replace("\n ", "\n", $str);
while(strpos($str, "\n\n\n") !== false) $str = str_replace("\n\n\n", "\n\n", $str);
-
+
+ if(strpos($str, '](')) {
+ // contains links
+ if(strpos($str, '[](') !== false || strpos($str, '[ ](') !== false) {
+ // remove links that lack anchor text
+ $str = preg_replace('!\[\s*\]\([^)]*\)!', '', $str);
+ }
+ if($options['linksToUrls']) {
+ // convert markdown style "[text](url)" to "text (url)"
+ if(!$options['linksToMarkdown']) $str = preg_replace('!\[\s*(.+?)\]\(!', '$1 (', $str);
+ }
+ }
+
if(count($options['finishReplacements'])) {
$str = str_replace(array_keys($options['finishReplacements']), array_values($options['finishReplacements']), $str);
}