Revert "publisher: Make the HTML element collector more robust"

This reverts commit ef0f1a7269.
2025-08-20 21:31:32 +02:00 · 2021-05-19 03:45:36 +02:00
parent 3f515f0e33
commit dc6b7a75ff
3 changed files with 169 additions and 302 deletions
--- a/publisher/htmlElementsCollector.go
+++ b/publisher/htmlElementsCollector.go
@@ -19,51 +19,12 @@ import (
 	"sort"
 	"strings"
 	"sync"
-	"unicode"
-	"unicode/utf8"

 	"golang.org/x/net/html"

 	"github.com/gohugoio/hugo/helpers"
 )

-const eof = -1
-
-var (
-	htmlJsonFixer = strings.NewReplacer(", ", "\n")
-	jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)
-	classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)
-
-	skipInnerElementRe = regexp.MustCompile(`(?i)^(pre|textarea|script|style)`)
-	skipAllElementRe   = regexp.MustCompile(`(?i)^!DOCTYPE`)
-	endTagRe           = regexp.MustCompile(`(?i)<\/\s*([a-zA-Z]+)\s*>$`)
-
-	exceptionList = map[string]bool{
-		"thead": true,
-		"tbody": true,
-		"tfoot": true,
-		"td":    true,
-		"tr":    true,
-	}
-)
-
-func newHTMLElementsCollector() *htmlElementsCollector {
-	return &htmlElementsCollector{
-		elementSet: make(map[string]bool),
-	}
-}
-
-func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
-	w := &htmlElementsCollectorWriter{
-		collector: collector,
-		state:     htmlLexStart,
-	}
-
-	w.defaultLexElementInside = w.lexElementInside(htmlLexStart)
-
-	return w
-}
-
 // HTMLElements holds lists of tags and attribute values for classes and id.
 type HTMLElements struct {
 	Tags    []string `json:"tags"`
@@ -87,12 +48,6 @@ func (h *HTMLElements) Sort() {
 	sort.Strings(h.IDs)
 }

-type htmlElement struct {
-	Tag     string
-	Classes []string
-	IDs     []string
-}
-
 type htmlElementsCollector struct {
 	// Contains the raw HTML string. We will get the same element
 	// several times, and want to avoid costly reparsing when this
@@ -104,6 +59,12 @@ type htmlElementsCollector struct {
 	mu sync.RWMutex
 }

+func newHTMLElementsCollector() *htmlElementsCollector {
+	return &htmlElementsCollector{
+		elementSet: make(map[string]bool),
+	}
+}
+
 func (c *htmlElementsCollector) getHTMLElements() HTMLElements {
 	var (
 		classes []string
@@ -132,118 +93,114 @@ func (c *htmlElementsCollector) getHTMLElements() HTMLElements {

 type htmlElementsCollectorWriter struct {
 	collector *htmlElementsCollector
+	buff      bytes.Buffer

-	r     rune   // Current rune
-	width int    // The width in bytes of r
-	input []byte // The current slice written to Write
-	pos   int    // The current position in input
+	isCollecting bool
+	inPreTag     string

-	err error
-
-	inQuote rune
-
-	buff bytes.Buffer
-
-	// Current state
-	state htmlCollectorStateFunc
-
-	// Precompiled state funcs
-	defaultLexElementInside htmlCollectorStateFunc
+	inQuote    bool
+	quoteValue byte
 }

-// Write collects HTML elements from p.
+func newHTMLElementsCollectorWriter(collector *htmlElementsCollector) *htmlElementsCollectorWriter {
+	return &htmlElementsCollectorWriter{
+		collector: collector,
+	}
+}
+
+// Write splits the incoming stream into single HTML elements.
 func (w *htmlElementsCollectorWriter) Write(p []byte) (n int, err error) {
 	n = len(p)
-	w.input = p
-	w.pos = 0
+	i := 0

-	for {
-		w.r = w.next()
-		if w.r == eof {
-			return
-		}
-		w.state = w.state(w)
-	}
-}
-
-func (l *htmlElementsCollectorWriter) backup() {
-	l.pos -= l.width
-	l.r, _ = utf8.DecodeRune(l.input[l.pos:])
-}
-
-func (w *htmlElementsCollectorWriter) consumeBuffUntil(condition func() bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
-	var s htmlCollectorStateFunc
-	s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
-		w.buff.WriteRune(w.r)
-		if condition() {
-			w.buff.Reset()
-			return resolve
-		}
-		return s
-	}
-	return s
-}
-
-func (w *htmlElementsCollectorWriter) consumeRuneUntil(condition func(r rune) bool, resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
-	var s htmlCollectorStateFunc
-	s = func(*htmlElementsCollectorWriter) htmlCollectorStateFunc {
-		if condition(w.r) {
-			return resolve
-		}
-		return s
-	}
-	return s
-}
-
-// Starts with e.g. "<body " or "<div"
-func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStateFunc) htmlCollectorStateFunc {
-	var s htmlCollectorStateFunc
-	s = func(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
-		w.buff.WriteRune(w.r)
-
-		// Skip any text inside a quote.
-		if w.r == '\'' || w.r == '"' {
-			if w.inQuote == w.r {
-				w.inQuote = 0
-			} else if w.inQuote == 0 {
-				w.inQuote = w.r
+	for i < len(p) {
+		// If we are not collecting, cycle through byte stream until start bracket "<" is found.
+		if !w.isCollecting {
+			for ; i < len(p); i++ {
+				b := p[i]
+				if b == '<' {
+					w.startCollecting()
+					break
+				}
 			}
 		}

-		if w.inQuote != 0 {
-			return s
+		if w.isCollecting {
+			// If we are collecting, cycle through byte stream until end bracket ">" is found,
+			// disregard any ">" if within a quote,
+			// write bytes until found to buffer.
+			for ; i < len(p); i++ {
+				b := p[i]
+				w.toggleIfQuote(b)
+				w.buff.WriteByte(b)
+
+				if !w.inQuote && b == '>' {
+					w.endCollecting()
+					break
+				}
+			}
 		}

-		if w.r == '>' {
+		// If no end bracket ">" was found while collecting but the input ended,
+		// we may have received only a chunk of the stream (e.g. from the minify
+		// functionality); in that case the next if block is skipped.
+
+		// At this point we have collected an element line between angle brackets "<" and ">".
+		if !w.isCollecting {
+			if w.buff.Len() == 0 {
+				continue
+			}
+
+			if w.inPreTag != "" { // within preformatted code block
+				s := w.buff.String()
+				w.buff.Reset()
+				if tagName, isEnd := parseEndTag(s); isEnd && w.inPreTag == tagName {
+					w.inPreTag = ""
+				}
+				continue
+			}
+
+			// First check if we have processed this element before.
+			w.collector.mu.RLock()

 			// Work with the bytes slice as long as it's practical,
 			// to save memory allocations.
 			b := w.buff.Bytes()

-			defer func() {
-				w.buff.Reset()
-			}()
-
-			// First check if we have processed this element before.
-			w.collector.mu.RLock()
-
+			// See https://github.com/dominikh/go-tools/issues/723
+			//lint:ignore S1030 This construct avoids memory allocation for the string.
 			seen := w.collector.elementSet[string(b)]
 			w.collector.mu.RUnlock()
 			if seen {
-				return resolve
+				w.buff.Reset()
+				continue
+			}
+
+			// Filter out unwanted tags
+			// if within preformatted code blocks <pre>, <textarea>, <script>, <style>
+			// comments and doctype tags
+			// end tags.
+			switch {
+			case bytes.HasPrefix(b, []byte("<!")): // comment or doctype tag
+				w.buff.Reset()
+				continue
+			case bytes.HasPrefix(b, []byte("</")): // end tag
+				w.buff.Reset()
+				continue
 			}

 			s := w.buff.String()
+			w.buff.Reset()

-			if s == "" {
-				return resolve
+			// Check if a preformatted code block started.
+			if tagName, isStart := parseStartTag(s); isStart && isPreFormatted(tagName) {
+				w.inPreTag = tagName
 			}

 			// Parse each collected element.
 			el, err := parseHTMLElement(s)
 			if err != nil {
-				w.err = err
-				return resolve
+				return n, err
 			}

 			// Write this tag to the element set.
@@ -251,137 +208,109 @@ func (w *htmlElementsCollectorWriter) lexElementInside(resolve htmlCollectorStat
 			w.collector.elementSet[s] = true
 			w.collector.elements = append(w.collector.elements, el)
 			w.collector.mu.Unlock()
-
-			return resolve
-
-		}
-
-		return s
-	}
-
-	return s
-}
-
-func (l *htmlElementsCollectorWriter) next() rune {
-	if l.pos >= len(l.input) {
-		l.width = 0
-		return eof
-	}
-
-	runeValue, runeWidth := utf8.DecodeRune(l.input[l.pos:])
-	l.width = runeWidth
-	l.pos += l.width
-	return runeValue
-}
-
-// returns the next state in HTML element scanner.
-type htmlCollectorStateFunc func(*htmlElementsCollectorWriter) htmlCollectorStateFunc
-
-// At "<", buffer empty.
-// Potentially starting a HTML element.
-func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
-	if w.r == '>' || unicode.IsSpace(w.r) {
-		if w.buff.Len() < 2 || bytes.HasPrefix(w.buff.Bytes(), []byte("</")) {
-			w.buff.Reset()
-			return htmlLexStart
-		}
-
-		tagName := w.buff.Bytes()[1:]
-
-		switch {
-		case skipInnerElementRe.Match(tagName):
-			// pre, script etc. We collect classes etc. on the surrounding
-			// element, but skip the inner content.
-			w.backup()
-
-			// tagName will be overwritten, so make a copy.
-			tagNameCopy := make([]byte, len(tagName))
-			copy(tagNameCopy, tagName)
-
-			return w.lexElementInside(
-				w.consumeBuffUntil(
-					func() bool {
-						if w.r != '>' {
-							return false
-						}
-						m := endTagRe.FindSubmatch(w.buff.Bytes())
-						if m == nil {
-							return false
-						}
-						return bytes.EqualFold(m[1], tagNameCopy)
-					},
-					htmlLexStart,
-				))
-		case skipAllElementRe.Match(tagName):
-			// E.g. "<!DOCTYPE ..."
-			w.buff.Reset()
-			return w.consumeRuneUntil(func(r rune) bool {
-				return r == '>'
-			}, htmlLexStart)
-		default:
-			w.backup()
-			return w.defaultLexElementInside
 		}
 	}

-	w.buff.WriteRune(w.r)
-
-	// If it's a comment, skip to its end.
-	if w.r == '-' && bytes.Equal(w.buff.Bytes(), []byte("<!--")) {
-		w.buff.Reset()
-		return htmlLexToEndOfComment
-	}
-
-	return htmlLexElementStart
+	return
 }

-// Entry state func.
-// Looks for a opening bracket, '<'.
-func htmlLexStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
-	if w.r == '<' {
-		w.backup()
-		w.buff.Reset()
-		return htmlLexElementStart
-	}
-
-	return htmlLexStart
+func (c *htmlElementsCollectorWriter) startCollecting() {
+	c.isCollecting = true
 }

-// After "<!--", buff empty.
-func htmlLexToEndOfComment(w *htmlElementsCollectorWriter) htmlCollectorStateFunc {
-	w.buff.WriteRune(w.r)
+func (c *htmlElementsCollectorWriter) endCollecting() {
+	c.isCollecting = false
+	c.inQuote = false
+}

-	if w.r == '>' && bytes.HasSuffix(w.buff.Bytes(), []byte("-->")) {
-		// Done, start looking for HTML elements again.
-		return htmlLexStart
+func (c *htmlElementsCollectorWriter) toggleIfQuote(b byte) {
+	if isQuote(b) {
+		if c.inQuote && b == c.quoteValue {
+			c.inQuote = false
+		} else if !c.inQuote {
+			c.inQuote = true
+			c.quoteValue = b
+		}
+	}
+}
+
+func isQuote(b byte) bool {
+	return b == '"' || b == '\''
+}
+
+func parseStartTag(s string) (string, bool) {
+	s = strings.TrimPrefix(s, "<")
+	s = strings.TrimSuffix(s, ">")
+
+	spaceIndex := strings.Index(s, " ")
+	if spaceIndex != -1 {
+		s = s[:spaceIndex]
 	}

-	return htmlLexToEndOfComment
+	return strings.ToLower(strings.TrimSpace(s)), true
 }

+func parseEndTag(s string) (string, bool) {
+	if !strings.HasPrefix(s, "</") {
+		return "", false
+	}
+
+	s = strings.TrimPrefix(s, "</")
+	s = strings.TrimSuffix(s, ">")
+
+	return strings.ToLower(strings.TrimSpace(s)), true
+}
+
+// No need to look inside these for HTML elements.
+func isPreFormatted(s string) bool {
+	return s == "pre" || s == "textarea" || s == "script" || s == "style"
+}
+
+type htmlElement struct {
+	Tag     string
+	Classes []string
+	IDs     []string
+}
+
+var (
+	htmlJsonFixer = strings.NewReplacer(", ", "\n")
+	jsonAttrRe    = regexp.MustCompile(`'?(.*?)'?:.*`)
+	classAttrRe   = regexp.MustCompile(`(?i)^class$|transition`)
+
+	exceptionList = map[string]bool{
+		"thead": true,
+		"tbody": true,
+		"tfoot": true,
+		"td":    true,
+		"tr":    true,
+	}
+)
+
 func parseHTMLElement(elStr string) (el htmlElement, err error) {
+	var tagBuffer string = ""

-	tagName := parseStartTag(elStr)
-
-	el.Tag = strings.ToLower(tagName)
-	tagNameToParse := el.Tag
+	tagName, ok := parseStartTag(elStr)
+	if !ok {
+		return
+	}

 	// The net/html parser does not handle single table elements as input, e.g. tbody.
 	// We only care about the element/class/ids, so just store away the original tag name
 	// and pretend it's a <div>.
-	if exceptionList[el.Tag] {
+	if exceptionList[tagName] {
+		tagBuffer = tagName
 		elStr = strings.Replace(elStr, tagName, "div", 1)
-		tagNameToParse = "div"
 	}

 	n, err := html.Parse(strings.NewReader(elStr))
 	if err != nil {
 		return
 	}
-
 	var walk func(*html.Node)
 	walk = func(n *html.Node) {
-		if n.Type == html.ElementNode && n.Data == tagNameToParse {
+		if n.Type == html.ElementNode && strings.Contains(elStr, n.Data) {
+			el.Tag = n.Data
+
 			for _, a := range n.Attr {
 				switch {
 				case strings.EqualFold(a.Key, "id"):
@@ -416,20 +345,10 @@ func parseHTMLElement(elStr string) (el htmlElement, err error) {

 	walk(n)

-	return
-}
-
-// Variants of s
-//    <body class="b a">
-//    <div>
-func parseStartTag(s string) string {
-	spaceIndex := strings.IndexFunc(s, func(r rune) bool {
-		return unicode.IsSpace(r)
-	})
-
-	if spaceIndex == -1 {
-		return s[1 : len(s)-1]
+	// Did we replace the start tag?
+	if tagBuffer != "" {
+		el.Tag = tagBuffer
 	}

-	return s[1:spaceIndex]
+	return
 }