transform/urlreplacers: Support unquoted URLs in canonifyURLs replacer

Fixes #5529
This commit is contained in:
Bjørn Erik Pedersen
2018-12-17 14:25:00 +01:00
parent d5a0b6bbbc
commit efe0b4e5c0
2 changed files with 130 additions and 104 deletions

View File

@@ -16,6 +16,7 @@ package urlreplacers
import ( import (
"bytes" "bytes"
"io" "io"
"unicode"
"unicode/utf8" "unicode/utf8"
"github.com/gohugoio/hugo/transform" "github.com/gohugoio/hugo/transform"
@@ -43,7 +44,7 @@ type absurllexer struct {
start int // item start position start int // item start position
width int // width of last element width int // width of last element
matchers []absURLMatcher quotes [][]byte
ms matchState ms matchState
matches [3]bool // track matches of the 3 prefixes matches [3]bool // track matches of the 3 prefixes
@@ -140,84 +141,115 @@ func (l *absurllexer) emit() {
l.start = l.pos l.start = l.pos
} }
// relURLPrefix is the leading "/" that the candidate checks look for at the
// current position; relURLPrefixLen is its byte length, used when stepping
// past the prefix.
var (
relURLPrefix = []byte("/")
relURLPrefixLen = len(relURLPrefix)
)
// consumeQuote checks whether the content at the current position starts
// with one of the configured quote sequences. On the first match it moves
// the position past that quote, calls emit, and returns the matched quote.
// It returns nil when none of the quotes match, leaving the lexer untouched.
func (l *absurllexer) consumeQuote() []byte {
	for i := range l.quotes {
		quote := l.quotes[i]
		if !bytes.HasPrefix(l.content[l.pos:], quote) {
			continue
		}

		// Step over the opening quote and hand it to emit together with
		// whatever is pending before it.
		l.pos += len(quote)
		l.emit()
		return quote
	}

	return nil
}
// handle URLs in src and href. // handle URLs in src and href.
func checkCandidateBase(l *absurllexer) { func checkCandidateBase(l *absurllexer) {
for _, m := range l.matchers { l.consumeQuote()
if !bytes.HasPrefix(l.content[l.pos:], m.match) {
continue if !bytes.HasPrefix(l.content[l.pos:], relURLPrefix) {
} return
// check for schemaless URLs
posAfter := l.pos + len(m.match)
if posAfter >= len(l.content) {
return
}
r, _ := utf8.DecodeRune(l.content[posAfter:])
if r == '/' {
// schemaless: skip
return
}
if l.pos > l.start {
l.emit()
}
l.pos += len(m.match)
l.w.Write(m.quote)
l.w.Write(l.path)
l.start = l.pos
} }
// check for schemaless URLs
posAfter := l.pos + relURLPrefixLen
if posAfter >= len(l.content) {
return
}
r, _ := utf8.DecodeRune(l.content[posAfter:])
if r == '/' {
// schemaless: skip
return
}
if l.pos > l.start {
l.emit()
}
l.pos += relURLPrefixLen
l.w.Write(l.path)
l.start = l.pos
}
// posAfterURL returns the offset, relative to l.pos, of the terminator of the
// value that starts at the current position. When q is non-empty the
// terminator is the next occurrence of q (the closing quote); otherwise it is
// the first '>' or whitespace rune. The result is -1 when no terminator is
// found in the remaining content.
func (l *absurllexer) posAfterURL(q []byte) int {
if len(q) > 0 {
// look for end quote
return bytes.Index(l.content[l.pos:], q)
}
// No quote to match: stop at the first '>' or whitespace rune.
return bytes.IndexFunc(l.content[l.pos:], func(r rune) bool {
return r == '>' || unicode.IsSpace(r)
})
} }
// handle URLs in srcset. // handle URLs in srcset.
func checkCandidateSrcset(l *absurllexer) { func checkCandidateSrcset(l *absurllexer) {
// special case, not frequent (me think) q := l.consumeQuote()
for _, m := range l.matchers { if q == nil {
if !bytes.HasPrefix(l.content[l.pos:], m.match) { // srcset needs to be quoted.
continue return
}
// check for schemaless URLs
posAfter := l.pos + len(m.match)
if posAfter >= len(l.content) {
return
}
r, _ := utf8.DecodeRune(l.content[posAfter:])
if r == '/' {
// schemaless: skip
continue
}
posLastQuote := bytes.Index(l.content[l.pos+1:], m.quote)
// safe guard
if posLastQuote < 0 || posLastQuote > 2000 {
return
}
if l.pos > l.start {
l.emit()
}
section := l.content[l.pos+len(m.quote) : l.pos+posLastQuote+1]
fields := bytes.Fields(section)
l.w.Write(m.quote)
for i, f := range fields {
if f[0] == '/' {
l.w.Write(l.path)
l.w.Write(f[1:])
} else {
l.w.Write(f)
}
if i < len(fields)-1 {
l.w.Write([]byte(" "))
}
}
l.w.Write(m.quote)
l.pos += len(section) + (len(m.quote) * 2)
l.start = l.pos
} }
// special case, not frequent (me think)
if !bytes.HasPrefix(l.content[l.pos:], relURLPrefix) {
return
}
// check for schemaless URLs
posAfter := l.pos + relURLPrefixLen
if posAfter >= len(l.content) {
return
}
r, _ := utf8.DecodeRune(l.content[posAfter:])
if r == '/' {
// schemaless: skip
return
}
posEnd := l.posAfterURL(q)
// safe guard
if posEnd < 0 || posEnd > 2000 {
return
}
if l.pos > l.start {
l.emit()
}
section := l.content[l.pos : l.pos+posEnd+1]
fields := bytes.Fields(section)
for i, f := range fields {
if f[0] == '/' {
l.w.Write(l.path)
l.w.Write(f[1:])
} else {
l.w.Write(f)
}
if i < len(fields)-1 {
l.w.Write([]byte(" "))
}
}
l.pos += len(section)
l.start = l.pos
} }
// main loop // main loop
@@ -262,53 +294,32 @@ func (l *absurllexer) replace() {
} }
} }
func doReplace(path string, ct transform.FromTo, matchers []absURLMatcher) { func doReplace(path string, ct transform.FromTo, quotes [][]byte) {
lexer := &absurllexer{ lexer := &absurllexer{
content: ct.From().Bytes(), content: ct.From().Bytes(),
w: ct.To(), w: ct.To(),
path: []byte(path), path: []byte(path),
matchers: matchers} quotes: quotes}
lexer.replace() lexer.replace()
} }
type absURLReplacer struct { type absURLReplacer struct {
htmlMatchers []absURLMatcher htmlQuotes [][]byte
xmlMatchers []absURLMatcher xmlQuotes [][]byte
} }
func newAbsURLReplacer() *absURLReplacer { func newAbsURLReplacer() *absURLReplacer {
// HTML
dqHTMLMatch := []byte("\"/")
sqHTMLMatch := []byte("'/")
// XML
dqXMLMatch := []byte("&#34;/")
sqXMLMatch := []byte("&#39;/")
dqHTML := []byte("\"")
sqHTML := []byte("'")
dqXML := []byte("&#34;")
sqXML := []byte("&#39;")
return &absURLReplacer{ return &absURLReplacer{
htmlMatchers: []absURLMatcher{ htmlQuotes: [][]byte{[]byte("\""), []byte("'")},
{dqHTMLMatch, dqHTML}, xmlQuotes: [][]byte{[]byte("&#34;"), []byte("&#39;")}}
{sqHTMLMatch, sqHTML},
},
xmlMatchers: []absURLMatcher{
{dqXMLMatch, dqXML},
{sqXMLMatch, sqXML},
}}
} }
func (au *absURLReplacer) replaceInHTML(path string, ct transform.FromTo) { func (au *absURLReplacer) replaceInHTML(path string, ct transform.FromTo) {
doReplace(path, ct, au.htmlMatchers) doReplace(path, ct, au.htmlQuotes)
} }
func (au *absURLReplacer) replaceInXML(path string, ct transform.FromTo) { func (au *absURLReplacer) replaceInXML(path string, ct transform.FromTo) {
doReplace(path, ct, au.xmlMatchers) doReplace(path, ct, au.xmlQuotes)
} }

View File

@@ -156,6 +156,21 @@ func TestAbsURL(t *testing.T) {
} }
func TestAbsURLUnqoted(t *testing.T) {
tr := transform.New(NewAbsURLTransformer(testBaseURL))
apply(t.Errorf, tr, []test{
test{
content: `Link: <a href=/asdf>ASDF</a>`,
expected: `Link: <a href=http://base/asdf>ASDF</a>`,
},
test{
content: `Link: <a href=/asdf >ASDF</a>`,
expected: `Link: <a href=http://base/asdf >ASDF</a>`,
},
})
}
func TestRelativeURL(t *testing.T) { func TestRelativeURL(t *testing.T) {
tr := transform.New(NewAbsURLTransformer(helpers.GetDottedRelativePath(filepath.FromSlash("/post/sub/")))) tr := transform.New(NewAbsURLTransformer(helpers.GetDottedRelativePath(filepath.FromSlash("/post/sub/"))))
@@ -176,7 +191,7 @@ func TestAbsXMLURLSrcSet(t *testing.T) {
} }
func BenchmarkXMLAbsURL(b *testing.B) { func BenchmarkXMLAbsURL(b *testing.B) {
tr := transform.New(NewAbsURLInXMLTransformer("")) tr := transform.New(NewAbsURLInXMLTransformer(testBaseURL))
b.ResetTimer() b.ResetTimer()
for i := 0; i < b.N; i++ { for i := 0; i < b.N; i++ {