Fix Plainify edge cases

This commit replaces the main part of `helpers.StripHTML` with the tag-stripping implementation from Go's `html/template` package.

It's a little slower, but correctness is more important:

```bash
BenchmarkStripHTMLOld-10    	  680316	      1764 ns/op	     728 B/op	       4 allocs/op
BenchmarkStripHTMLNew-10    	  384520	      3099 ns/op	    2089 B/op	      10 allocs/op
```

Fixes #9199
Fixes #9909
Closes #9410
This commit is contained in:
Bjørn Erik Pedersen
2022-05-25 10:56:14 +02:00
parent cd0112a05a
commit 3854a6fa6c
10 changed files with 103 additions and 85 deletions

View File

@@ -18,9 +18,14 @@ import (
"io"
"reflect"
"regexp"
"strings"
"unicode"
bp "github.com/gohugoio/hugo/bufferpool"
"github.com/gohugoio/hugo/output"
htmltemplate "github.com/gohugoio/hugo/tpl/internal/go_templates/htmltemplate"
texttemplate "github.com/gohugoio/hugo/tpl/internal/go_templates/texttemplate"
)
@@ -163,3 +168,44 @@ func GetHasLockFromContext(ctx context.Context) bool {
// SetHasLockInContext returns a context derived from ctx that carries hasLock
// under the texttemplate.HasLockContextKey key.
func SetHasLockInContext(ctx context.Context, hasLock bool) context.Context {
	key := texttemplate.HasLockContextKey
	return context.WithValue(ctx, key, hasLock)
}
// hugoNewLinePlaceholder is a temporary marker substituted for block-ending
// tags before tag stripping; StripHTML turns it back into "\n" afterwards.
const hugoNewLinePlaceholder = "___hugonl_"

var (
	// stripHTMLReplacerPre prepares input for tag stripping: literal newlines
	// become spaces, while "</p>", "<br>" and "<br />" become the newline
	// placeholder.
	stripHTMLReplacerPre = strings.NewReplacer(
		"\n", " ",
		"</p>", hugoNewLinePlaceholder,
		"<br>", hugoNewLinePlaceholder,
		"<br />", hugoNewLinePlaceholder,
	)

	// whitespaceRe matches one or more consecutive whitespace characters.
	whitespaceRe = regexp.MustCompile(`\s+`)
)
// StripHTML removes all HTML tags from s and returns the remaining text.
//
// Input containing neither '<' nor '>' is returned unchanged. Otherwise,
// literal newlines are turned into spaces, "</p>", "<br>" and "<br />" are
// turned into newlines, the tags are stripped, and every run of consecutive
// whitespace is reduced to the first whitespace rune of that run.
func StripHTML(s string) string {
	// Fast path: nothing that could be a tag.
	if !strings.ContainsAny(s, "<>") {
		return s
	}

	prepared := stripHTMLReplacerPre.Replace(s)
	hadReplacements := prepared != s

	stripped := htmltemplate.StripTags(prepared)
	if hadReplacements {
		// Restore the line breaks that were hidden behind the placeholder.
		stripped = strings.ReplaceAll(stripped, hugoNewLinePlaceholder, "\n")
	}

	buf := bp.GetBuffer()
	defer bp.PutBuffer(buf)

	// Collapse whitespace runs, keeping only the first rune of each run.
	prevSpace := false
	for _, r := range stripped {
		curSpace := unicode.IsSpace(r)
		if curSpace && prevSpace {
			continue
		}
		buf.WriteRune(r)
		prevSpace = curSpace
	}

	if buf.Len() == 0 {
		return stripped
	}
	return buf.String()
}