Fix handling of HTML files without front matter

This means that any HTML file inside /content will be treated as a regular file. If you want it processes with shortcodes and a layout, add front matter. The defintion of an HTML file here is: * File with extension .htm or .html * With first non-whitespace character "<" that isn't a HTML comment. This is in line with the documentation. Fixes #7030 Fixes #7028 See #6789
2025-08-21 21:35:28 +02:00 · 2020-03-09 12:04:33 +01:00
parent 8279d2e227
commit ffcb4aeb8e
18 changed files with 168 additions and 309 deletions
--- a/hugofs/files/classifier.go
+++ b/hugofs/files/classifier.go
@@ -14,10 +14,16 @@
 package files

 import (
+	"bufio"
+	"fmt"
+	"io"
 	"os"
 	"path/filepath"
 	"sort"
 	"strings"
+	"unicode"
+
+	"github.com/spf13/afero"
 )

 var (
@@ -32,6 +38,11 @@ var (
 		"pandoc", "pdc"}

 	contentFileExtensionsSet map[string]bool
+
+	htmlFileExtensions = []string{
+		"html", "htm"}
+
+	htmlFileExtensionsSet map[string]bool
 )

 func init() {
@@ -39,12 +50,20 @@ func init() {
 	for _, ext := range contentFileExtensions {
 		contentFileExtensionsSet[ext] = true
 	}
+	htmlFileExtensionsSet = make(map[string]bool)
+	for _, ext := range htmlFileExtensions {
+		htmlFileExtensionsSet[ext] = true
+	}
 }

 func IsContentFile(filename string) bool {
 	return contentFileExtensionsSet[strings.TrimPrefix(filepath.Ext(filename), ".")]
 }

+func IsHTMLFile(filename string) bool {
+	return htmlFileExtensionsSet[strings.TrimPrefix(filepath.Ext(filename), ".")]
+}
+
 func IsContentExt(ext string) bool {
 	return contentFileExtensionsSet[ext]
 }
@@ -62,10 +81,33 @@ func (c ContentClass) IsBundle() bool {
 	return c == ContentClassLeaf || c == ContentClassBranch
 }

-func ClassifyContentFile(filename string) ContentClass {
+func ClassifyContentFile(filename string, open func() (afero.File, error)) ContentClass {
 	if !IsContentFile(filename) {
 		return ContentClassFile
 	}
+
+	if IsHTMLFile(filename) {
+		// We need to look inside the file. If the first non-whitespace
+		// character is a "<", then we treat it as a regular file.
+		// Eearlier we created pages for these files, but that had all sorts
+		// of troubles, and isn't what it says in the documentation.
+		// See https://github.com/gohugoio/hugo/issues/7030
+		if open == nil {
+			panic(fmt.Sprintf("no file opener provided for %q", filename))
+		}
+
+		f, err := open()
+		if err != nil {
+			return ContentClassFile
+		}
+		ishtml := isHTMLContent(f)
+		f.Close()
+		if ishtml {
+			return ContentClassFile
+		}
+
+	}
+
 	if strings.HasPrefix(filename, "_index.") {
 		return ContentClassBranch
 	}
@@ -77,6 +119,40 @@ func ClassifyContentFile(filename string) ContentClass {
 	return ContentClassContent
 }

+var htmlComment = []rune{'<', '!', '-', '-'}
+
+func isHTMLContent(r io.Reader) bool {
+	br := bufio.NewReader(r)
+	i := 0
+	for {
+		c, _, err := br.ReadRune()
+		if err != nil {
+			break
+		}
+
+		if i > 0 {
+			if i >= len(htmlComment) {
+				return false
+			}
+
+			if c != htmlComment[i] {
+				return true
+			}
+
+			i++
+			continue
+		}
+
+		if !unicode.IsSpace(c) {
+			if i == 0 && c != '<' {
+				return false
+			}
+			i++
+		}
+	}
+	return true
+}
+
 const (
 	ComponentFolderArchetypes = "archetypes"
 	ComponentFolderStatic     = "static"