hugolib: Integrate new page parser

See #5324
This commit is contained in:
Bjørn Erik Pedersen
2018-10-18 10:21:23 +02:00
parent 1b7ecfc2e1
commit 1e3e34002d
23 changed files with 729 additions and 356 deletions

View File

@@ -33,8 +33,8 @@ const eof = -1
// stateFunc is a single lexer state: it is called with the lexer and returns
// the state to run next. Returning nil ends the run loop and thereby
// terminates the parser.
type stateFunc func(*pageLexer) stateFunc
type lexerShortcodeState struct {
currLeftDelimItem itemType
currRightDelimItem itemType
currLeftDelimItem ItemType
currRightDelimItem ItemType
currShortcodeName string // is only set when a shortcode is in opened state
closingState int // > 0 = on its way to be closed
elementStepNum int // step number in element
@@ -50,14 +50,24 @@ type pageLexer struct {
pos pos // input position
start pos // item start position
width pos // width of last element
lastPos pos // position of the last item returned by nextItem
contentSections int
// Set when we have parsed any summary divider
summaryDividerChecked bool
lexerShortcodeState
// items delivered to client
items []Item
items Items
}
// Iterator is part of pageLexer's implementation of the Result interface.
// Every call builds a new Iterator for this lexer via newIterator.
func (l *pageLexer) Iterator() *Iterator {
	it := l.newIterator()
	return it
}
// Input returns the lexer's input bytes. The slice is handed back as-is,
// without copying, so the caller shares it with the lexer.
func (l *pageLexer) Input() []byte {
	src := l.input
	return src
}
// note: the input position here is normally 0 (start), but
@@ -79,6 +89,10 @@ func newPageLexer(input []byte, inputPosition pos, stateStart stateFunc) *pageLe
return lexer
}
// newIterator allocates an Iterator bound to this lexer with lastPos set
// to -1; all other Iterator fields keep their zero values.
func (l *pageLexer) newIterator() *Iterator {
	it := new(Iterator)
	it.l = l
	it.lastPos = -1
	return it
}
// main loop
func (l *pageLexer) run() *pageLexer {
for l.state = l.stateStart; l.state != nil; {
@@ -89,6 +103,7 @@ func (l *pageLexer) run() *pageLexer {
// Shortcode syntax
var (
leftDelimSc = []byte("{{")
leftDelimScNoMarkup = []byte("{{<")
rightDelimScNoMarkup = []byte(">}}")
leftDelimScWithMarkup = []byte("{{%")
@@ -99,11 +114,14 @@ var (
// Page syntax: the markers the lexer matches against the input while
// detecting front matter and scanning the main content section.
var (
// The Unicode byte order mark (U+FEFF) as a rune; lexIntroSection emits it
// as TypeIgnore when it is read from the input.
byteOrderMark = '\ufeff'
// HTML-comment style summary divider; lexMainSection emits
// TypeLeadSummaryDivider for it.
summaryDivider = []byte("<!--more-->")
// Org-mode style summary divider; lexMainSection emits
// TypeSummaryDividerOrg for it.
summaryDividerOrg = []byte("# more")
// Delimiter passed to lexFrontMatterSection for TOML front matter.
delimTOML = []byte("+++")
// Delimiter passed to lexFrontMatterSection for YAML front matter.
delimYAML = []byte("---")
// Line prefix that lexFrontMatterOrgMode checks for on front matter lines.
delimOrg = []byte("#+")
// Opening and closing markers of an HTML comment; lexIntroSection uses them
// to emit a leading comment as TypeHTMLComment.
// NOTE(review): the "COmment" capitalisation looks like a typo; renaming it
// means updating every use outside this block as well.
htmlCOmmentStart = []byte("<!--")
htmlCOmmentEnd = []byte("-->")
)
func (l *pageLexer) next() rune {
@@ -131,13 +149,13 @@ func (l *pageLexer) backup() {
}
// sends an item back to the client.
func (l *pageLexer) emit(t itemType) {
func (l *pageLexer) emit(t ItemType) {
l.items = append(l.items, Item{t, l.start, l.input[l.start:l.pos]})
l.start = l.pos
}
// special case, do not send '\\' back to client
func (l *pageLexer) ignoreEscapesAndEmit(t itemType) {
func (l *pageLexer) ignoreEscapesAndEmit(t ItemType) {
val := bytes.Map(func(r rune) rune {
if r == '\\' {
return -1
@@ -160,25 +178,12 @@ func (l *pageLexer) ignore() {
var lf = []byte("\n")
// lineNum reports the 1-based line number of lastPos, computed by counting
// the line feeds in the input that precede it. Handy for error logs.
func (l *pageLexer) lineNum() int {
	consumed := l.input[:l.lastPos]
	return 1 + bytes.Count(consumed, lf)
}
// errorf appends a tError item whose value is the formatted message and
// whose position is the current item start. It always returns nil, and a
// nil state terminates the parser.
func (l *pageLexer) errorf(format string, args ...interface{}) stateFunc {
	msg := fmt.Sprintf(format, args...)
	failure := Item{tError, l.start, []byte(msg)}
	l.items = append(l.items, failure)
	return nil
}
// nextItem removes the first queued item, remembers its position in lastPos
// and returns it. It panics when no item is queued, because it indexes the
// head of the slice unconditionally.
func (l *pageLexer) nextItem() Item {
	head, rest := l.items[0], l.items[1:]
	l.items = rest
	l.lastPos = head.pos
	return head
}
func (l *pageLexer) consumeCRLF() bool {
var consumed bool
for _, r := range crLf {
@@ -192,12 +197,28 @@ func (l *pageLexer) consumeCRLF() bool {
}
func lexMainSection(l *pageLexer) stateFunc {
// Fast forward as far as possible.
var l1, l2, l3 int
if !l.summaryDividerChecked {
// TODO(bep) 2errors make the summary divider per type
l1 = l.index(summaryDivider)
l2 = l.index(summaryDividerOrg)
if l1 == -1 && l2 == -1 {
l.summaryDividerChecked = true
}
}
l3 = l.index(leftDelimSc)
skip := minPositiveIndex(l1, l2, l3)
if skip > 0 {
l.pos += pos(skip)
}
for {
if l.isShortCodeStart() {
if l.pos > l.start {
l.emit(tText)
}
if bytes.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) {
if l.hasPrefix(leftDelimScWithMarkup) {
l.currLeftDelimItem = tLeftDelimScWithMarkup
l.currRightDelimItem = tRightDelimScWithMarkup
} else {
@@ -207,21 +228,21 @@ func lexMainSection(l *pageLexer) stateFunc {
return lexShortcodeLeftDelim
}
if l.contentSections <= 1 {
if bytes.HasPrefix(l.input[l.pos:], summaryDivider) {
if !l.summaryDividerChecked {
if l.hasPrefix(summaryDivider) {
if l.pos > l.start {
l.emit(tText)
}
l.contentSections++
l.summaryDividerChecked = true
l.pos += pos(len(summaryDivider))
l.emit(tSummaryDivider)
} else if bytes.HasPrefix(l.input[l.pos:], summaryDividerOrg) {
l.emit(TypeLeadSummaryDivider)
} else if l.hasPrefix(summaryDividerOrg) {
if l.pos > l.start {
l.emit(tText)
}
l.contentSections++
l.summaryDividerChecked = true
l.pos += pos(len(summaryDividerOrg))
l.emit(tSummaryDividerOrg)
l.emit(TypeSummaryDividerOrg)
}
}
@@ -237,7 +258,7 @@ func lexMainSection(l *pageLexer) stateFunc {
}
func (l *pageLexer) isShortCodeStart() bool {
return bytes.HasPrefix(l.input[l.pos:], leftDelimScWithMarkup) || bytes.HasPrefix(l.input[l.pos:], leftDelimScNoMarkup)
return l.hasPrefix(leftDelimScWithMarkup) || l.hasPrefix(leftDelimScNoMarkup)
}
func lexIntroSection(l *pageLexer) stateFunc {
@@ -250,28 +271,37 @@ LOOP:
switch {
case r == '+':
return l.lexFrontMatterSection(tFrontMatterTOML, r, "TOML", delimTOML)
return l.lexFrontMatterSection(TypeFrontMatterTOML, r, "TOML", delimTOML)
case r == '-':
return l.lexFrontMatterSection(tFrontMatterYAML, r, "YAML", delimYAML)
return l.lexFrontMatterSection(TypeFrontMatterYAML, r, "YAML", delimYAML)
case r == '{':
return lexFrontMatterJSON
case r == '#':
return lexFrontMatterOrgMode
case r == byteOrderMark:
l.emit(TypeIgnore)
case !isSpace(r) && !isEndOfLine(r):
// No front matter.
if r == '<' {
l.emit(tHTMLLead)
// No need to look further. Hugo treats this as plain HTML,
// no front matter, no shortcodes, no nothing.
l.pos = pos(len(l.input))
l.emit(tText)
break LOOP
l.backup()
if l.hasPrefix(htmlCOmmentStart) {
right := l.index(htmlCOmmentEnd)
if right == -1 {
return l.errorf("starting HTML comment with no end")
}
l.pos += pos(right) + pos(len(htmlCOmmentEnd))
l.emit(TypeHTMLComment)
} else {
// No need to look further. Hugo treats this as plain HTML,
// no front matter, no shortcodes, no nothing.
l.pos = pos(len(l.input))
l.emit(TypeHTMLDocument)
}
}
return l.errorf("failed to detect front matter type; got unknown identifier %q", r)
break LOOP
}
}
l.contentSections = 1
// Now move on to the shortcodes.
return lexMainSection
}
@@ -324,7 +354,7 @@ func lexFrontMatterJSON(l *pageLexer) stateFunc {
}
l.consumeCRLF()
l.emit(tFrontMatterJSON)
l.emit(TypeFrontMatterJSON)
return lexMainSection
}
@@ -338,7 +368,7 @@ func lexFrontMatterOrgMode(l *pageLexer) stateFunc {
l.backup()
if !bytes.HasPrefix(l.input[l.pos:], delimOrg) {
if !l.hasPrefix(delimOrg) {
// TODO(bep) consider error
return lexMainSection
}
@@ -351,7 +381,7 @@ LOOP:
switch {
case r == '\n':
if !bytes.HasPrefix(l.input[l.pos:], delimOrg) {
if !l.hasPrefix(delimOrg) {
break LOOP
}
case r == eof:
@@ -360,24 +390,25 @@ LOOP:
}
}
l.emit(tFrontMatterORG)
l.emit(TypeFrontMatterORG)
return lexMainSection
}
// printCurrentInput is a debugging aid: it writes the current position and
// the quoted remainder of the input to standard output.
func (l *pageLexer) printCurrentInput() {
	remaining := string(l.input[l.pos:])
	fmt.Printf("input[%d:]: %q", l.pos, remaining)
}
// Handle YAML or TOML front matter.
func (l *pageLexer) lexFrontMatterSection(tp itemType, delimr rune, name string, delim []byte) stateFunc {
func (l *pageLexer) lexFrontMatterSection(tp ItemType, delimr rune, name string, delim []byte) stateFunc {
for i := 0; i < 2; i++ {
if r := l.next(); r != delimr {
return l.errorf("invalid %s delimiter", name)
}
}
if !l.consumeCRLF() {
return l.errorf("invalid %s delimiter", name)
}
// We don't care about the delimiters.
l.ignore()
@@ -387,7 +418,7 @@ func (l *pageLexer) lexFrontMatterSection(tp itemType, delimr rune, name string,
return l.errorf("EOF looking for end %s front matter delimiter", name)
}
if isEndOfLine(r) {
if bytes.HasPrefix(l.input[l.pos:], delim) {
if l.hasPrefix(delim) {
l.emit(tp)
l.pos += 3
l.consumeCRLF()
@@ -402,7 +433,7 @@ func (l *pageLexer) lexFrontMatterSection(tp itemType, delimr rune, name string,
func lexShortcodeLeftDelim(l *pageLexer) stateFunc {
l.pos += pos(len(l.currentLeftShortcodeDelim()))
if bytes.HasPrefix(l.input[l.pos:], leftComment) {
if l.hasPrefix(leftComment) {
return lexShortcodeComment
}
l.emit(l.currentLeftShortcodeDelimItem())
@@ -412,7 +443,7 @@ func lexShortcodeLeftDelim(l *pageLexer) stateFunc {
}
func lexShortcodeComment(l *pageLexer) stateFunc {
posRightComment := bytes.Index(l.input[l.pos:], append(rightComment, l.currentRightShortcodeDelim()...))
posRightComment := l.index(append(rightComment, l.currentRightShortcodeDelim()...))
if posRightComment <= 1 {
return l.errorf("comment must be closed")
}
@@ -493,7 +524,7 @@ func lexShortcodeParam(l *pageLexer, escapedQuoteStart bool) stateFunc {
}
func lexShortcodeQuotedParamVal(l *pageLexer, escapedQuotedValuesAllowed bool, typ itemType) stateFunc {
func lexShortcodeQuotedParamVal(l *pageLexer, escapedQuotedValuesAllowed bool, typ ItemType) stateFunc {
openQuoteFound := false
escapedInnerQuoteFound := false
escapedQuoteState := 0
@@ -592,7 +623,7 @@ Loop:
}
func lexEndOfShortcode(l *pageLexer) stateFunc {
if bytes.HasPrefix(l.input[l.pos:], l.currentRightShortcodeDelim()) {
if l.hasPrefix(l.currentRightShortcodeDelim()) {
return lexShortcodeRightDelim
}
switch r := l.next(); {
@@ -606,7 +637,7 @@ func lexEndOfShortcode(l *pageLexer) stateFunc {
// scans the elements inside shortcode tags
func lexInsideShortcode(l *pageLexer) stateFunc {
if bytes.HasPrefix(l.input[l.pos:], l.currentRightShortcodeDelim()) {
if l.hasPrefix(l.currentRightShortcodeDelim()) {
return lexShortcodeRightDelim
}
switch r := l.next(); {
@@ -643,11 +674,19 @@ func lexInsideShortcode(l *pageLexer) stateFunc {
// state helpers
func (l *pageLexer) currentLeftShortcodeDelimItem() itemType {
// index returns the offset, relative to the current position, of the first
// occurrence of sep in the input from l.pos onward, or -1 if sep does not
// occur there.
func (l *pageLexer) index(sep []byte) int {
	rest := l.input[l.pos:]
	return bytes.Index(rest, sep)
}
// hasPrefix reports whether the input from the current position onward
// begins with prefix.
func (l *pageLexer) hasPrefix(prefix []byte) bool {
	rest := l.input[l.pos:]
	return bytes.HasPrefix(rest, prefix)
}
// currentLeftShortcodeDelimItem returns the item type recorded for the left
// delimiter in the embedded shortcode state.
func (l *pageLexer) currentLeftShortcodeDelimItem() ItemType {
	return l.lexerShortcodeState.currLeftDelimItem
}
func (l *pageLexer) currentRightShortcodeDelimItem() itemType {
// currentRightShortcodeDelimItem returns the item type recorded for the
// right delimiter in the embedded shortcode state.
func (l *pageLexer) currentRightShortcodeDelimItem() ItemType {
	return l.lexerShortcodeState.currRightDelimItem
}
@@ -668,6 +707,23 @@ func (l *pageLexer) currentRightShortcodeDelim() []byte {
// helper functions
// minPositiveIndex returns the smallest strictly positive value among
// indices. Zero and negative values are ignored, so an index of 0 counts as
// "nothing to report" just like -1 does. When no value qualifies, including
// when called with no arguments, the result is -1.
func minPositiveIndex(indices ...int) int {
	smallest := -1
	for _, idx := range indices {
		switch {
		case idx <= 0:
			// Not a usable forward offset; skip it.
		case smallest == -1, idx < smallest:
			smallest = idx
		}
	}
	return smallest
}
// isSpace reports whether r is a space or a horizontal tab. Line endings
// and other whitespace runes do not count.
func isSpace(r rune) bool {
	switch r {
	case ' ', '\t':
		return true
	}
	return false
}