Make the HTML collector parsing more robust

Most notably, better handling of self-closing elements. Closes #10698
2025-08-25 22:00:58 +02:00 · 2023-02-06 17:29:12 +01:00
parent 2a364cca64
commit d33a7ebcc1
2 changed files with 17 additions and 4 deletions
--- a/publisher/htmlElementsCollector.go
+++ b/publisher/htmlElementsCollector.go
@@ -294,9 +294,10 @@ func htmlLexElementStart(w *htmlElementsCollectorWriter) htmlCollectorStateFunc
 		}

 		tagName := w.buff.Bytes()[1:]
+		isSelfClosing := tagName[len(tagName)-1] == '/'

 		switch {
-		case skipInnerElementRe.Match(tagName):
+		case !isSelfClosing && skipInnerElementRe.Match(tagName):
 			// pre, script etc. We collect classes etc. on the surrounding
 			// element, but skip the inner content.
 			w.backup()
@@ -432,10 +433,18 @@ func parseStartTag(s string) string {
 	})

 	if spaceIndex == -1 {
-		return s[1 : len(s)-1]
+		s = s[1 : len(s)-1]
+	} else {
+		s = s[1:spaceIndex]
 	}

-	return s[1:spaceIndex]
+	if s[len(s)-1] == '/' {
+		// Self closing.
+		s = s[:len(s)-1]
+	}
+
+	return s
+
 }

 // isClosedByTag reports whether b ends with a closing tag for tagName.
@@ -487,7 +496,7 @@ LOOP:
 		}
 	}

-	if state != 2 {
+	if state != 2 || lo >= hi {
 		return false
 	}