Fix Plainify edge cases

This commit replaces the main part of `helpers.StripHTML` with Go's implementation in its html/template package.

It's a little slower, but correctness is more important:

```bash
BenchmarkStripHTMLOld-10    	  680316	      1764 ns/op	     728 B/op	       4 allocs/op
BenchmarkStripHTMLNew-10    	  384520	      3099 ns/op	    2089 B/op	      10 allocs/op
```

Fixes #9199
Fixes #9909
Closes #9410
This commit is contained in:
Bjørn Erik Pedersen
2022-05-25 10:56:14 +02:00
parent cd0112a05a
commit 3854a6fa6c
10 changed files with 103 additions and 85 deletions

View File

@@ -34,7 +34,6 @@ import (
"github.com/gohugoio/hugo/markup"
bp "github.com/gohugoio/hugo/bufferpool"
"github.com/gohugoio/hugo/config"
)
@@ -104,45 +103,6 @@ func NewContentSpec(cfg config.Provider, logger loggers.Logger, contentFs afero.
return spec, nil
}
// stripHTMLReplacer normalizes the input before tags are removed: raw
// newlines become spaces, while closing paragraph tags and the common
// spellings of the line-break tag (<br>, <br/>, <br />) become newlines.
var stripHTMLReplacer = strings.NewReplacer(
	"\n", " ",
	"</p>", "\n",
	"<br>", "\n",
	"<br/>", "\n",
	"<br />", "\n",
)

// StripHTML accepts a string, strips out all HTML tags and returns it.
//
// A string containing neither '<' nor '>' is returned unchanged. Otherwise the
// input is first passed through stripHTMLReplacer, then everything between a
// '<' and the next '>' is dropped and consecutive whitespace runes in the
// remaining text are collapsed, keeping the first one. A '>' that does not
// close a tag is ordinary text and is preserved.
func StripHTML(s string) string {
	// Shortcut strings with no tags in them
	if !strings.ContainsAny(s, "<>") {
		return s
	}
	s = stripHTMLReplacer.Replace(s)

	// Walk through the string removing all tags
	b := bp.GetBuffer()
	defer bp.PutBuffer(b)

	var inTag, isSpace, wasSpace bool
	for _, r := range s {
		// isSpace is only reset outside tags, so whitespace seen inside a tag
		// still counts as "previous whitespace" for the first rune after it.
		if !inTag {
			isSpace = false
		}

		switch {
		case r == '<':
			inTag = true
		case r == '>' && inTag:
			// Only a '>' that actually closes a tag is swallowed; a stray '>'
			// in text falls through to the default branch and is written.
			inTag = false
		case unicode.IsSpace(r):
			isSpace = true
			fallthrough
		default:
			// Emit text outside tags, skipping whitespace that directly
			// follows other whitespace.
			if !inTag && (!isSpace || !wasSpace) {
				b.WriteRune(r)
			}
		}

		wasSpace = isSpace
	}
	return b.String()
}
// stripEmptyNav strips out empty <nav> tags from content.
func stripEmptyNav(in []byte) []byte {
return bytes.Replace(in, []byte("<nav>\n</nav>\n\n"), []byte(``), -1)

View File

@@ -52,44 +52,6 @@ func TestTrimShortHTML(t *testing.T) {
}
}
// TestStripHTML runs StripHTML over a table of inputs and reports every case
// whose output differs from the expected string.
func TestStripHTML(t *testing.T) {
	cases := []struct {
		input, expected string
	}{
		{"<h1>strip h1 tag <h1>", "strip h1 tag "},
		{"<p> strip p tag </p>", " strip p tag "},
		{"</br> strip br<br>", " strip br\n"},
		{"</br> strip br2<br />", " strip br2\n"},
		{"This <strong>is</strong> a\nnewline", "This is a newline"},
		{"No Tags", "No Tags"},
		{`<p>Summary Next Line.
<figure >
<img src="/not/real" />
</figure>
.
More text here.</p>
<p>Some more text</p>`, "Summary Next Line. . More text here.\nSome more text\n"},
	}

	for idx, tc := range cases {
		// Use Errorf rather than Fatalf so all failing cases are reported.
		if got := StripHTML(tc.input); got != tc.expected {
			t.Errorf("Test %d failed. Expected %q got %q", idx, tc.expected, got)
		}
	}
}
// BenchmarkStripHTML measures StripHTML on the tstHTMLContent fixture,
// calling it b.N times after resetting the benchmark timer.
func BenchmarkStripHTML(b *testing.B) {
	b.ResetTimer()
	for remaining := b.N; remaining > 0; remaining-- {
		StripHTML(tstHTMLContent)
	}
}
func TestStripEmptyNav(t *testing.T) {
c := qt.New(t)
cleaned := stripEmptyNav([]byte("do<nav>\n</nav>\n\nbedobedo"))