parser/pageparser: Don't store the byte slices

On its own this change doesn't do any magic, but this is part of a bigger picture about making Hugo leaner in the
memory usage department.
This commit is contained in:
Bjørn Erik Pedersen
2022-07-07 16:11:47 +02:00
parent 72b0ccdb01
commit 223bf28004
13 changed files with 385 additions and 198 deletions

View File

@@ -15,6 +15,7 @@ package pageparser
import (
"bytes"
"errors"
"fmt"
"io"
"io/ioutil"
@@ -33,9 +34,6 @@ type Result interface {
var _ Result = (*pageLexer)(nil)
// Parse parses the page in the given reader according to the given Config.
// TODO(bep) now that we have improved the "lazy order" init, there *may* be
// some potential saving in doing a buffered approach where the first pass does
// the frontmatter only.
func Parse(r io.Reader, cfg Config) (Result, error) {
return parseSection(r, cfg, lexIntroSection)
}
@@ -63,12 +61,12 @@ func ParseFrontMatterAndContent(r io.Reader) (ContentFrontMatter, error) {
walkFn := func(item Item) bool {
if frontMatterSource != nil {
// The rest is content.
cf.Content = psr.Input()[item.Pos:]
cf.Content = psr.Input()[item.low:]
// Done
return false
} else if item.IsFrontMatter() {
cf.FrontMatterFormat = FormatFromFrontMatterType(item.Type)
frontMatterSource = item.Val
frontMatterSource = item.Val(psr.Input())
}
return true
}
@@ -113,10 +111,15 @@ func parseBytes(b []byte, cfg Config, start stateFunc) (Result, error) {
return lexer, nil
}
// NewIterator creates a new Iterator over the given items.
// The returned iterator is positioned before the first item.
func NewIterator(items Items) *Iterator {
	iter := &Iterator{
		items:   items,
		lastPos: -1, // -1: no item has been returned yet
	}
	return iter
}
// An Iterator has methods to iterate a parsed page with support going back
// if needed.
type Iterator struct {
l *pageLexer
items Items
lastPos int // position of the last item returned by nextItem
}
@@ -126,19 +129,14 @@ func (t *Iterator) Next() Item {
return t.Current()
}
// Input returns the input source.
func (t *Iterator) Input() []byte {
return t.l.Input()
}
var errIndexOutOfBounds = Item{tError, 0, []byte("no more tokens"), true}
var errIndexOutOfBounds = Item{Type: tError, Err: errors.New("no more tokens")}
// Current will repeatably return the current item.
func (t *Iterator) Current() Item {
if t.lastPos >= len(t.l.items) {
if t.lastPos >= len(t.items) {
return errIndexOutOfBounds
}
return t.l.items[t.lastPos]
return t.items[t.lastPos]
}
// backs up one token.
@@ -163,14 +161,14 @@ func (t *Iterator) IsValueNext() bool {
// look at, but do not consume, the next item
// repeated, sequential calls will return the same item
func (t *Iterator) Peek() Item {
return t.l.items[t.lastPos+1]
return t.items[t.lastPos+1]
}
// PeekWalk will feed the next items in the iterator to walkFn
// until it returns false.
func (t *Iterator) PeekWalk(walkFn func(item Item) bool) {
for i := t.lastPos + 1; i < len(t.l.items); i++ {
item := t.l.items[i]
for i := t.lastPos + 1; i < len(t.items); i++ {
item := t.items[i]
if !walkFn(item) {
break
}
@@ -190,6 +188,49 @@ func (t *Iterator) Consume(cnt int) {
}
// LineNumber returns the current line number. Used for logging.
func (t *Iterator) LineNumber() int {
return bytes.Count(t.l.input[:t.Current().Pos], lf) + 1
func (t *Iterator) LineNumber(source []byte) int {
return bytes.Count(source[:t.Current().low], lf) + 1
}
// IsProbablySourceOfItems returns true if the given source looks like the
// original source of the items.
// There may be some false positives, but that is highly unlikely and good enough
// for the planned purpose.
// It returns false if items is empty (including when both source and items are
// empty) and false if the last item is not EOF (error situations).
func IsProbablySourceOfItems(source []byte, items Items) bool {
	// An empty items slice can never vouch for source; this also covers the
	// case where both source and items are empty.
	if len(items) == 0 {
		return false
	}

	// A successfully parsed source always ends in an EOF item that points
	// exactly at the end of the source.
	last := items[len(items)-1]
	if last.Type != tEOF {
		return false
	}
	if last.Pos() != len(source) {
		return false
	}

	for _, item := range items {
		if item.Type == tError {
			return false
		}
		if item.Type == tEOF {
			return true
		}

		if item.Pos() >= len(source) {
			return false
		}

		// Cheap sanity check: the recorded first byte of every item must
		// match the byte at its position in source.
		if item.firstByte != source[item.Pos()] {
			return false
		}
	}

	return true
}