Move the emoji parsing to pageparser

This avoids double parsing the page content when `enableEmoji=true`.

This commit also adds some general improvements to the parser that make it much faster overall:

```bash
benchmark                     old ns/op     new ns/op     delta
BenchmarkShortcodeLexer-4     90258         101730        +12.71%
BenchmarkParse-4              148940        15037         -89.90%

benchmark                     old allocs     new allocs     delta
BenchmarkShortcodeLexer-4     456            700            +53.51%
BenchmarkParse-4              28             33             +17.86%

benchmark                     old bytes     new bytes     delta
BenchmarkShortcodeLexer-4     69875         81014         +15.94%
BenchmarkParse-4              8128          8304          +2.17%
```

Running some site benchmarks with Emoji support turned on:

```bash
benchmark                                                                                     old ns/op     new ns/op     delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4     924556797     818115620     -11.51%

benchmark                                                                                     old allocs     new allocs     delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4     4112613        4133787        +0.51%

benchmark                                                                                     old bytes     new bytes     delta
BenchmarkSiteBuilding/TOML,num_langs=3,num_pages=5000,tags_per_page=5,shortcodes,render-4     426982864     424363832     -0.61%
```

Fixes #5534
This commit is contained in:
Bjørn Erik Pedersen
2018-12-17 21:03:23 +01:00
parent a8853f1c5a
commit 9cd54cab20
13 changed files with 388 additions and 71 deletions

View File

@@ -37,6 +37,12 @@ type pageLexer struct {
start int // item start position
width int // width of last element
// Contains lexers for shortcodes and other main section
// elements.
sectionHandlers *sectionHandlers
cfg Config
// The summary divider to look for.
summaryDivider []byte
// Set when we have parsed any summary divider
@@ -60,13 +66,17 @@ func (l *pageLexer) Input() []byte {
}
// Config holds the settings handed to the page lexer when it is created.
type Config struct {
// EnableEmoji enables the section handler that lexes ':'-delimited emoji.
EnableEmoji bool
}
// note: the input position here is normally 0 (start), but
// can be set if position of first shortcode is known
func newPageLexer(input []byte, inputPosition int, stateStart stateFunc) *pageLexer {
func newPageLexer(input []byte, stateStart stateFunc, cfg Config) *pageLexer {
lexer := &pageLexer{
input: input,
pos: inputPosition,
stateStart: stateStart,
cfg: cfg,
lexerShortcodeState: lexerShortcodeState{
currLeftDelimItem: tLeftDelimScNoMarkup,
currRightDelimItem: tRightDelimScNoMarkup,
@@ -75,6 +85,8 @@ func newPageLexer(input []byte, inputPosition int, stateStart stateFunc) *pageLe
items: make([]Item, 0, 5),
}
lexer.sectionHandlers = createSectionHandlers(lexer)
return lexer
}
@@ -100,6 +112,8 @@ var (
delimOrg = []byte("#+")
htmlCommentStart = []byte("<!--")
htmlCommentEnd = []byte("-->")
emojiDelim = byte(':')
)
func (l *pageLexer) next() rune {
@@ -132,6 +146,10 @@ func (l *pageLexer) emit(t ItemType) {
l.start = l.pos
}
// isEOF reports whether the current position is at or beyond the end of
// the input.
func (l *pageLexer) isEOF() bool {
	remaining := len(l.input) - l.pos
	return remaining <= 0
}
// special case, do not send '\\' back to client
func (l *pageLexer) ignoreEscapesAndEmit(t ItemType) {
val := bytes.Map(func(r rune) rune {
@@ -193,30 +211,80 @@ func (l *pageLexer) consumeSpace() {
}
}
func lexMainSection(l *pageLexer) stateFunc {
if l.isInHTMLComment {
return lexEndFromtMatterHTMLComment
}
// lex a string starting at ":"
func lexEmoji(l *pageLexer) stateFunc {
pos := l.pos + 1
valid := false
// Fast forward as far as possible.
var l1, l2 int
if !l.summaryDividerChecked && l.summaryDivider != nil {
l1 = l.index(l.summaryDivider)
if l1 == -1 {
l.summaryDividerChecked = true
for i := pos; i < len(l.input); i++ {
if i > pos && l.input[i] == emojiDelim {
pos = i + 1
valid = true
break
}
r, _ := utf8.DecodeRune(l.input[i:])
if !isAlphaNumeric(r) {
break
}
}
l2 = l.index(leftDelimSc)
skip := minIndex(l1, l2)
if skip > 0 {
l.pos += skip
if valid {
l.pos = pos
l.emit(TypeEmoji)
} else {
l.pos++
l.emit(tText)
}
for {
if l.isShortCodeStart() {
return lexMainSection
}
// sectionHandlers groups the section handlers of a lexer so they can be
// queried and run together.
type sectionHandlers struct {
// The lexer the handlers belong to.
l *pageLexer
// Set when none of the sections are found so we
// can safely stop looking and skip to the end.
skipAll bool
// The individual handlers; lex tries them in slice order.
handlers []*sectionHandler
// Scratch slice reused by skip to collect the match indexes
// reported by the handlers.
skipIndexes []int
}
// skip asks every handler for the index of its next match and returns the
// result of minIndex over the indexes found. When no handler reports a
// match, it records that in skipAll and returns -1; once skipAll is set,
// it returns -1 without consulting the handlers again.
func (s *sectionHandlers) skip() int {
	if s.skipAll {
		return -1
	}

	// Truncate but keep the backing array so it is reused between calls.
	s.skipIndexes = s.skipIndexes[:0]
	for _, h := range s.handlers {
		if pos := h.skip(); pos != -1 {
			s.skipIndexes = append(s.skipIndexes, pos)
		}
	}

	if len(s.skipIndexes) == 0 {
		// Nothing left to find for any handler.
		s.skipAll = true
		return -1
	}

	return minIndex(s.skipIndexes...)
}
func createSectionHandlers(l *pageLexer) *sectionHandlers {
shortCodeHandler := &sectionHandler{
l: l,
skipFunc: func(l *pageLexer) int {
return l.index(leftDelimSc)
},
lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
if !l.isShortCodeStart() {
return origin, false
}
if l.isInline {
// If we're inside an inline shortcode, the only valid shortcode markup is
// the markup which closes it.
@@ -225,14 +293,11 @@ func lexMainSection(l *pageLexer) stateFunc {
if end != len(l.input)-1 {
b = bytes.TrimSpace(b[end+1:])
if end == -1 || !bytes.HasPrefix(b, []byte(l.currShortcodeName+" ")) {
return l.errorf("inline shortcodes do not support nesting")
return l.errorf("inline shortcodes do not support nesting"), true
}
}
}
if l.pos > l.start {
l.emit(tText)
}
if l.hasPrefix(leftDelimScWithMarkup) {
l.currLeftDelimItem = tLeftDelimScWithMarkup
l.currRightDelimItem = tRightDelimScWithMarkup
@@ -240,32 +305,139 @@ func lexMainSection(l *pageLexer) stateFunc {
l.currLeftDelimItem = tLeftDelimScNoMarkup
l.currRightDelimItem = tRightDelimScNoMarkup
}
return lexShortcodeLeftDelim
}
if !l.summaryDividerChecked && l.summaryDivider != nil {
if l.hasPrefix(l.summaryDivider) {
if l.pos > l.start {
l.emit(tText)
}
l.summaryDividerChecked = true
l.pos += len(l.summaryDivider)
// This makes it a little easier to reason about later.
l.consumeSpace()
l.emit(TypeLeadSummaryDivider)
// We have already moved to the next.
continue
}
}
r := l.next()
if r == eof {
break
}
return lexShortcodeLeftDelim, true
},
}
summaryDividerHandler := &sectionHandler{
l: l,
skipFunc: func(l *pageLexer) int {
if l.summaryDividerChecked || l.summaryDivider == nil {
return -1
}
return l.index(l.summaryDivider)
},
lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
if !l.hasPrefix(l.summaryDivider) {
return origin, false
}
l.summaryDividerChecked = true
l.pos += len(l.summaryDivider)
// This makes it a little easier to reason about later.
l.consumeSpace()
l.emit(TypeLeadSummaryDivider)
return origin, true
},
}
handlers := []*sectionHandler{shortCodeHandler, summaryDividerHandler}
if l.cfg.EnableEmoji {
emojiHandler := &sectionHandler{
l: l,
skipFunc: func(l *pageLexer) int {
return l.indexByte(emojiDelim)
},
lexFunc: func(origin stateFunc, l *pageLexer) (stateFunc, bool) {
return lexEmoji, true
},
}
handlers = append(handlers, emojiHandler)
}
return &sectionHandlers{
l: l,
handlers: handlers,
skipIndexes: make([]int, len(handlers)),
}
}
// lex emits any pending text as a tText item and then offers the current
// position to each active handler in order. The state returned by the first
// handler that either handles the section or returns a nil state is passed
// back to the caller. If no handler takes it, the position is advanced by
// one byte and origin is returned. It returns nil when skipAll is set.
func (s *sectionHandlers) lex(origin stateFunc) stateFunc {
	if s.skipAll {
		return nil
	}

	lexer := s.l
	if lexer.start < lexer.pos {
		lexer.emit(tText)
	}

	for i := range s.handlers {
		h := s.handlers[i]
		if h.skipAll {
			continue
		}

		state, done := h.lexFunc(origin, h.l)
		if done || state == nil {
			return state
		}
	}

	// Not handled by any of the handlers.
	lexer.pos++

	return origin
}
// sectionHandler finds and lexes one kind of section via its skipFunc and
// lexFunc callbacks.
type sectionHandler struct {
// The lexer passed to skipFunc and lexFunc.
l *pageLexer
// No more sections of this type.
skipAll bool
// Returns the index of the next match, -1 if none found.
skipFunc func(l *pageLexer) int
// Lex lexes the current section and returns the next state func and
// a bool telling if this section was handled.
// Note that returning nil as the next state will terminate the
// lexer.
lexFunc func(origin stateFunc, l *pageLexer) (stateFunc, bool)
}
// skip returns the index reported by skipFunc for the next match of this
// section type. When skipFunc reports -1, skipAll is set so that later calls
// return -1 without calling skipFunc again.
func (s *sectionHandler) skip() int {
	if !s.skipAll {
		if next := s.skipFunc(s.l); next != -1 {
			return next
		}
		// No more sections of this type.
		s.skipAll = true
	}
	return -1
}
// lexMainSection is the state function for the main content. It moves the
// position forward to the nearest match reported by the section handlers and
// lets them lex from there. When the handlers report nothing more to find, or
// lexing yields no next state, the position is moved to the end of the input
// and lexDone is returned.
func lexMainSection(l *pageLexer) stateFunc {
	switch {
	case l.isEOF():
		return lexDone
	case l.isInHTMLComment:
		return lexEndFromtMatterHTMLComment
	}

	// Fast forward as far as possible.
	offset := l.sectionHandlers.skip()
	if offset == -1 {
		l.pos = len(l.input)
		return lexDone
	}
	if offset > 0 {
		l.pos += offset
	}

	if state := l.sectionHandlers.lex(lexMainSection); state != nil {
		return state
	}

	l.pos = len(l.input)
	return lexDone
}
@@ -297,10 +469,22 @@ func (l *pageLexer) index(sep []byte) int {
return bytes.Index(l.input[l.pos:], sep)
}
// indexByte returns the index of the first occurrence of sep in the input,
// counted from the current position, or -1 if sep is not present there.
func (l *pageLexer) indexByte(sep byte) int {
	rest := l.input[l.pos:]
	return bytes.IndexByte(rest, sep)
}
// hasPrefix reports whether the input, from the current position onwards,
// begins with prefix.
func (l *pageLexer) hasPrefix(prefix []byte) bool {
	rest := l.input[l.pos:]
	return bytes.HasPrefix(rest, prefix)
}
// hasPrefixByte reports whether the byte at the current position equals
// prefix. It returns false when no input remains.
func (l *pageLexer) hasPrefixByte(prefix byte) bool {
	rest := l.input[l.pos:]
	return len(rest) > 0 && rest[0] == prefix
}
// helper functions
// returns the min index >= 0