mirror of
https://github.com/gohugoio/hugo.git
synced 2025-08-29 22:29:56 +02:00
Don't count HTML markup in auto summaries
This commit also fixes a bug where a `</picture>` end tag was wrongly used to detect the end of a paragraph. This should be very rare, though. Closes #12837
This commit is contained in:
@@ -161,6 +161,16 @@ func (s *HtmlSummary) resolveParagraphTagAndSetWrapper(mt media.Type) tagReStart
|
||||
return ptag
|
||||
}
|
||||
|
||||
// Avoid counting words that are most likely HTML tokens.
//
// The patterns are applied to single whitespace-delimited words, so each one
// only has to recognize a fragment of markup, not a complete element.
var (
	// isProbablyHTMLTag matches a lone start or end tag, with or without the
	// closing bracket, e.g. "<p>", "</p>", "<img", "<h2>" or "<br/>".
	// The name must start with a letter and may continue with letters,
	// digits and hyphens (h1-h6, custom elements).
	isProbablyHTMLTag = regexp.MustCompile(`^</?[A-Za-z][A-Za-z0-9-]*/?>?$`)

	// isProbablyHTMLAttribute matches the beginning of a quoted attribute,
	// e.g. `width="32"`, `alt='1'/>` or `data-id="3"`. Only the prefix is
	// anchored because the quoted value may contain anything.
	isProbablyHTMLAttribute = regexp.MustCompile(`^[A-Za-z][A-Za-z0-9:_.-]*=["']`)
)

// isProbablyHTMLToken reports whether s, a single whitespace-delimited word,
// looks like a piece of HTML markup (a tag, a quoted attribute, or a dangling
// ">" / "/>" that closes a tag) rather than text that should be counted as a
// word. The empty string is not a token.
func isProbablyHTMLToken(s string) bool {
	switch s {
	case ">", "/>":
		// Closing bracket separated from its tag by whitespace,
		// as in `<img src="a.jpg" />`.
		return true
	}
	return isProbablyHTMLTag.MatchString(s) || isProbablyHTMLAttribute.MatchString(s)
}
|
||||
|
||||
// ExtractSummaryFromHTML extracts a summary from the given HTML content.
|
||||
func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK bool) (result HtmlSummary) {
|
||||
result.source = input
|
||||
@@ -173,6 +183,14 @@ func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK boo
|
||||
var count int
|
||||
|
||||
countWord := func(word string) int {
|
||||
word = strings.TrimSpace(word)
|
||||
if len(word) == 0 {
|
||||
return 0
|
||||
}
|
||||
if isProbablyHTMLToken(word) {
|
||||
return 0
|
||||
}
|
||||
|
||||
if isCJK {
|
||||
word = tpl.StripHTML(word)
|
||||
runeCount := utf8.RuneCountInString(word)
|
||||
@@ -193,7 +211,7 @@ func ExtractSummaryFromHTML(mt media.Type, input string, numWords int, isCJK boo
|
||||
|
||||
for j := result.WrapperStart.High; j < high; {
|
||||
s := input[j:]
|
||||
closingIndex := strings.Index(s, "</"+ptag.tagName)
|
||||
closingIndex := strings.Index(s, "</"+ptag.tagName+">")
|
||||
|
||||
if closingIndex == -1 {
|
||||
break
|
||||
|
@@ -49,6 +49,46 @@ func TestExtractSummaryFromHTML(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
// See https://discourse.gohugo.io/t/automatic-summarys-summarylength-seems-broken-in-the-case-of-plainify/51466/4
|
||||
// Also issue 12837
|
||||
func TestExtractSummaryFromHTMLLotsOfHTMLInSummary(t *testing.T) {
|
||||
c := qt.New(t)
|
||||
|
||||
input := `
|
||||
<p>
|
||||
<div>
|
||||
<picture>
|
||||
<img src="imgs/1.jpg" alt="1"/>
|
||||
</picture>
|
||||
<picture>
|
||||
<img src="imgs/2.jpg" alt="2"/>
|
||||
</picture>
|
||||
<picture>
|
||||
<img src="imgs/3.jpg" alt="3"/>
|
||||
</picture>
|
||||
<picture>
|
||||
<img src="imgs/4.jpg" alt="4"/>
|
||||
</picture>
|
||||
<picture>
|
||||
<img src="imgs/5.jpg" alt="5"/>
|
||||
</picture>
|
||||
</div>
|
||||
</p>
|
||||
<p>
|
||||
This is a story about a cat.
|
||||
</p>
|
||||
<p>
|
||||
The cat was white and fluffy.
|
||||
</p>
|
||||
<p>
|
||||
And it liked milk.
|
||||
</p>
|
||||
`
|
||||
|
||||
summary := ExtractSummaryFromHTML(media.Builtin.MarkdownType, input, 10, false)
|
||||
c.Assert(strings.HasSuffix(summary.Summary(), "<p>\nThis is a story about a cat.\n</p>\n<p>\nThe cat was white and fluffy.\n</p>"), qt.IsTrue)
|
||||
}
|
||||
|
||||
func TestExtractSummaryFromHTMLWithDivider(t *testing.T) {
|
||||
c := qt.New(t)
|
||||
|
||||
@@ -114,6 +154,23 @@ func TestExpandDivider(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestIsProbablyHTMLToken(t *testing.T) {
|
||||
c := qt.New(t)
|
||||
|
||||
for i, test := range []struct {
|
||||
input string
|
||||
expect bool
|
||||
}{
|
||||
{"<p>", true},
|
||||
{"<p", true},
|
||||
{"width=\"32\"", true},
|
||||
{"width='32'", true},
|
||||
{"<p>Æøå", false},
|
||||
} {
|
||||
c.Assert(isProbablyHTMLToken(test.input), qt.Equals, test.expect, qt.Commentf("[%d] Test.expect %q", i, test.input))
|
||||
}
|
||||
}
|
||||
|
||||
func BenchmarkSummaryFromHTML(b *testing.B) {
|
||||
b.StopTimer()
|
||||
input := "<p>First paragraph</p><p>Second paragraph</p>"
|
||||
|
Reference in New Issue
Block a user