Add a consolidated file cache

This commits reworks how file caching is performed in Hugo. Now there is only one way, and it can be configured.

This is the default configuration:

```toml
[caches]
[caches.getjson]
dir = ":cacheDir"
maxAge = -1
[caches.getcsv]
dir = ":cacheDir"
maxAge = -1
[caches.images]
dir = ":resourceDir/_gen"
maxAge = -1
[caches.assets]
dir = ":resourceDir/_gen"
maxAge = -1
```

You can override any of these cache setting in your own `config.toml`.

The placeholders explained:

`:cacheDir`: This is the value of the `cacheDir` config option if set (can also be set via OS env variable `HUGO_CACHEDIR`). It will fall back to `/opt/build/cache/hugo_cache/` on Netlify, or a `hugo_cache` directory below the OS temp dir for the others.
`:resourceDir`: This is the value of the `resourceDir` config option.

`maxAge` is the time in seconds before a cache entry will be evicted, -1 means forever and 0 effectively turns that particular cache off.

This means that if you run your builds on Netlify, all caches configured with `:cacheDir` will be saved and restored on the next build. For other CI vendors, please read their documentation. For an CircleCI example, see 6c3960a8f4/.circleci/config.yml

Fixes #5404
This commit is contained in:
Bjørn Erik Pedersen
2018-11-08 10:24:13 +01:00
parent 7d78a2afd3
commit f7aeaa6129
26 changed files with 1192 additions and 543 deletions

View File

@@ -1,85 +0,0 @@
// Copyright 2017 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package data
import (
"crypto/md5"
"encoding/hex"
"errors"
"sync"
"github.com/gohugoio/hugo/config"
"github.com/gohugoio/hugo/helpers"
"github.com/spf13/afero"
)
var cacheMu sync.RWMutex
// getCacheFileID returns the cache ID for a string.
func getCacheFileID(cfg config.Provider, id string) string {
hash := md5.Sum([]byte(id))
return cfg.GetString("cacheDir") + hex.EncodeToString(hash[:])
}
// getCache returns the content for an ID from the file cache or an error.
// If the ID is not found, return nil,nil.
func getCache(id string, fs afero.Fs, cfg config.Provider, ignoreCache bool) ([]byte, error) {
if ignoreCache {
return nil, nil
}
cacheMu.RLock()
defer cacheMu.RUnlock()
fID := getCacheFileID(cfg, id)
isExists, err := helpers.Exists(fID, fs)
if err != nil {
return nil, err
}
if !isExists {
return nil, nil
}
return afero.ReadFile(fs, fID)
}
// writeCache writes bytes associated with an ID into the file cache.
func writeCache(id string, c []byte, fs afero.Fs, cfg config.Provider, ignoreCache bool) error {
if ignoreCache {
return nil
}
cacheMu.Lock()
defer cacheMu.Unlock()
fID := getCacheFileID(cfg, id)
f, err := fs.Create(fID)
if err != nil {
return errors.New("Error: " + err.Error() + ". Failed to create file: " + fID)
}
defer f.Close()
n, err := f.Write(c)
if err != nil {
return errors.New("Error: " + err.Error() + ". Failed to write to file: " + fID)
}
if n == 0 {
return errors.New("No bytes written to file: " + fID)
}
return nil
}
func deleteCache(id string, fs afero.Fs, cfg config.Provider) error {
return fs.Remove(getCacheFileID(cfg, id))
}

View File

@@ -1,63 +0,0 @@
// Copyright 2017 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package data
import (
"fmt"
"testing"
"github.com/spf13/afero"
"github.com/spf13/viper"
"github.com/stretchr/testify/assert"
)
func TestCache(t *testing.T) {
t.Parallel()
fs := new(afero.MemMapFs)
for i, test := range []struct {
path string
content []byte
ignore bool
}{
{"http://Foo.Bar/foo_Bar-Foo", []byte(`T€st Content 123`), false},
{"fOO,bar:foo%bAR", []byte(`T€st Content 123 fOO,bar:foo%bAR`), false},
{"FOo/BaR.html", []byte(`FOo/BaR.html T€st Content 123`), false},
{"трям/трям", []byte(`T€st трям/трям Content 123`), false},
{"은행", []byte(`T€st C은행ontent 123`), false},
{"Банковский кассир", []byte(`Банковский кассир T€st Content 123`), false},
{"Банковский кассир", []byte(`Банковский кассир T€st Content 456`), true},
} {
msg := fmt.Sprintf("Test #%d: %v", i, test)
cfg := viper.New()
c, err := getCache(test.path, fs, cfg, test.ignore)
assert.NoError(t, err, msg)
assert.Nil(t, c, msg)
err = writeCache(test.path, test.content, fs, cfg, test.ignore)
assert.NoError(t, err, msg)
c, err = getCache(test.path, fs, cfg, test.ignore)
assert.NoError(t, err, msg)
if test.ignore {
assert.Nil(t, c, msg)
} else {
assert.Equal(t, string(test.content), string(c))
}
}
}

View File

@@ -20,17 +20,20 @@ import (
"errors"
"net/http"
"strings"
"time"
"github.com/gohugoio/hugo/cache/filecache"
"github.com/gohugoio/hugo/deps"
_errors "github.com/pkg/errors"
)
// New returns a new instance of the data-namespaced template functions.
func New(deps *deps.Deps) *Namespace {
return &Namespace{
deps: deps,
client: http.DefaultClient,
deps: deps,
cacheGetCSV: deps.FileCaches.GetCSVCache(),
cacheGetJSON: deps.FileCaches.GetJSONCache(),
client: http.DefaultClient,
}
}
@@ -38,6 +41,9 @@ func New(deps *deps.Deps) *Namespace {
type Namespace struct {
deps *deps.Deps
cacheGetJSON *filecache.Cache
cacheGetCSV *filecache.Cache
client *http.Client
}
@@ -48,40 +54,34 @@ type Namespace struct {
// GetCSV returns nil or a slice slice to use in a short code.
func (ns *Namespace) GetCSV(sep string, urlParts ...string) (d [][]string, err error) {
url := strings.Join(urlParts, "")
cache := ns.cacheGetCSV
var clearCacheSleep = func(i int, u string) {
ns.deps.Log.INFO.Printf("Retry #%d for %s and sleeping for %s", i, url, resSleep)
time.Sleep(resSleep)
deleteCache(url, ns.deps.Fs.Source, ns.deps.Cfg)
}
for i := 0; i <= resRetries; i++ {
var req *http.Request
req, err = http.NewRequest("GET", url, nil)
if err != nil {
return nil, _errors.Wrapf(err, "failed to create request for getCSV for resource %s", url)
unmarshal := func(b []byte) (error, bool) {
if !bytes.Contains(b, []byte(sep)) {
return _errors.Errorf("cannot find separator %s in CSV for %s", sep, url), false
}
req.Header.Add("Accept", "text/csv")
req.Header.Add("Accept", "text/plain")
var c []byte
c, err = ns.getResource(req)
if err != nil {
return nil, _errors.Wrapf(err, "failed to read CSV resource %q", url)
}
if !bytes.Contains(c, []byte(sep)) {
return nil, _errors.Errorf("cannot find separator %s in CSV for %s", sep, url)
}
if d, err = parseCSV(c, sep); err != nil {
if d, err = parseCSV(b, sep); err != nil {
err = _errors.Wrapf(err, "failed to parse CSV file %s", url)
clearCacheSleep(i, url)
continue
return err, true
}
break
return nil, false
}
var req *http.Request
req, err = http.NewRequest("GET", url, nil)
if err != nil {
return nil, _errors.Wrapf(err, "failed to create request for getCSV for resource %s", url)
}
req.Header.Add("Accept", "text/csv")
req.Header.Add("Accept", "text/plain")
err = ns.getResource(cache, unmarshal, req)
if err != nil {
return nil, _errors.Wrapf(err, "failed to read CSV resource %q", url)
}
return
@@ -90,38 +90,34 @@ func (ns *Namespace) GetCSV(sep string, urlParts ...string) (d [][]string, err e
// GetJSON expects one or n-parts of a URL to a resource which can either be a local or a remote one.
// If you provide multiple parts they will be joined together to the final URL.
// GetJSON returns nil or parsed JSON to use in a short code.
func (ns *Namespace) GetJSON(urlParts ...string) (v interface{}, err error) {
func (ns *Namespace) GetJSON(urlParts ...string) (interface{}, error) {
var v interface{}
url := strings.Join(urlParts, "")
cache := ns.cacheGetJSON
for i := 0; i <= resRetries; i++ {
var req *http.Request
req, err = http.NewRequest("GET", url, nil)
if err != nil {
return nil, _errors.Wrapf(err, "Failed to create request for getJSON resource %s", url)
}
req.Header.Add("Accept", "application/json")
var c []byte
c, err = ns.getResource(req)
if err != nil {
return nil, _errors.Wrapf(err, "failed to get getJSON resource %q", url)
}
err = json.Unmarshal(c, &v)
if err != nil {
ns.deps.Log.INFO.Printf("Cannot read JSON from resource %s: %s", url, err)
ns.deps.Log.INFO.Printf("Retry #%d for %s and sleeping for %s", i, url, resSleep)
time.Sleep(resSleep)
deleteCache(url, ns.deps.Fs.Source, ns.deps.Cfg)
continue
}
break
req, err := http.NewRequest("GET", url, nil)
if err != nil {
return nil, _errors.Wrapf(err, "Failed to create request for getJSON resource %s", url)
}
unmarshal := func(b []byte) (error, bool) {
err := json.Unmarshal(b, &v)
if err != nil {
return err, true
}
return nil, false
}
req.Header.Add("Accept", "application/json")
err = ns.getResource(cache, unmarshal, req)
if err != nil {
return nil, _errors.Wrapf(err, "failed to get getJSON resource %q", url)
}
return
return v, nil
}
// parseCSV parses bytes of CSV data into a slice slice string or an error

View File

@@ -16,8 +16,8 @@ package data
import (
"testing"
"github.com/gohugoio/hugo/deps"
"github.com/gohugoio/hugo/tpl/internal"
"github.com/spf13/viper"
"github.com/stretchr/testify/require"
)
@@ -25,8 +25,11 @@ func TestInit(t *testing.T) {
var found bool
var ns *internal.TemplateFuncsNamespace
v := viper.New()
v.Set("contentDir", "content")
for _, nsf := range internal.TemplateFuncsNamespaceRegistry {
ns = nsf(&deps.Deps{})
ns = nsf(newDeps(v))
if ns.Name == name {
found = true
break

View File

@@ -14,102 +14,81 @@
package data
import (
"fmt"
"io/ioutil"
"net/http"
"path/filepath"
"sync"
"time"
"github.com/pkg/errors"
"github.com/gohugoio/hugo/cache/filecache"
"github.com/gohugoio/hugo/config"
"github.com/gohugoio/hugo/helpers"
"github.com/spf13/afero"
jww "github.com/spf13/jwalterweatherman"
)
var (
remoteURLLock = &remoteLock{m: make(map[string]*sync.Mutex)}
resSleep = time.Second * 2 // if JSON decoding failed sleep for n seconds before retrying
resRetries = 1 // number of retries to load the JSON from URL or local file system
resSleep = time.Second * 2 // if JSON decoding failed sleep for n seconds before retrying
resRetries = 1 // number of retries to load the JSON from URL
)
type remoteLock struct {
sync.RWMutex
m map[string]*sync.Mutex
}
// URLLock locks an URL during download
func (l *remoteLock) URLLock(url string) {
var (
lock *sync.Mutex
ok bool
)
l.Lock()
if lock, ok = l.m[url]; !ok {
lock = &sync.Mutex{}
l.m[url] = lock
}
l.Unlock()
lock.Lock()
}
// URLUnlock unlocks an URL when the download has been finished. Use only in defer calls.
func (l *remoteLock) URLUnlock(url string) {
l.RLock()
defer l.RUnlock()
if um, ok := l.m[url]; ok {
um.Unlock()
}
}
// getRemote loads the content of a remote file. This method is thread safe.
func getRemote(req *http.Request, fs afero.Fs, cfg config.Provider, hc *http.Client) ([]byte, error) {
func (ns *Namespace) getRemote(cache *filecache.Cache, unmarshal func([]byte) (error, bool), req *http.Request) error {
url := req.URL.String()
id := helpers.MD5String(url)
var handled bool
var retry bool
_, b, err := cache.GetOrCreateBytes(id, func() ([]byte, error) {
var err error
handled = true
for i := 0; i <= resRetries; i++ {
ns.deps.Log.INFO.Printf("Downloading: %s ...", url)
var res *http.Response
res, err = ns.client.Do(req)
if err != nil {
return nil, err
}
if isHTTPError(res) {
return nil, errors.Errorf("Failed to retrieve remote file: %s", http.StatusText(res.StatusCode))
}
var b []byte
b, err = ioutil.ReadAll(res.Body)
if err != nil {
return nil, err
}
res.Body.Close()
err, retry = unmarshal(b)
if err == nil {
// Return it so it can be cached.
return b, nil
}
if !retry {
return nil, err
}
ns.deps.Log.INFO.Printf("Cannot read remote resource %s: %s", url, err)
ns.deps.Log.INFO.Printf("Retry #%d for %s and sleeping for %s", i+1, url, resSleep)
time.Sleep(resSleep)
}
c, err := getCache(url, fs, cfg, cfg.GetBool("ignoreCache"))
if err != nil {
return nil, err
}
if c != nil {
return c, nil
})
if !handled {
// This is cached content and should be correct.
err, _ = unmarshal(b)
}
// avoid race condition with locks, block other goroutines if the current url is processing
remoteURLLock.URLLock(url)
defer func() { remoteURLLock.URLUnlock(url) }()
// avoid multiple locks due to calling getCache twice
c, err = getCache(url, fs, cfg, cfg.GetBool("ignoreCache"))
if err != nil {
return nil, err
}
if c != nil {
return c, nil
}
jww.INFO.Printf("Downloading: %s ...", url)
res, err := hc.Do(req)
if err != nil {
return nil, err
}
if res.StatusCode < 200 || res.StatusCode > 299 {
return nil, fmt.Errorf("Failed to retrieve remote file: %s", http.StatusText(res.StatusCode))
}
c, err = ioutil.ReadAll(res.Body)
res.Body.Close()
if err != nil {
return nil, err
}
err = writeCache(url, c, fs, cfg, cfg.GetBool("ignoreCache"))
if err != nil {
return nil, err
}
jww.INFO.Printf("... and cached to: %s", getCacheFileID(cfg, url))
return c, nil
return err
}
// getLocal loads the content of a local file
@@ -123,12 +102,22 @@ func getLocal(url string, fs afero.Fs, cfg config.Provider) ([]byte, error) {
}
// getResource loads the content of a local or remote file
func (ns *Namespace) getResource(req *http.Request) ([]byte, error) {
// getResource loads the content of a local or remote file and returns its content and the
// cache ID used, if relevant.
func (ns *Namespace) getResource(cache *filecache.Cache, unmarshal func(b []byte) (error, bool), req *http.Request) error {
switch req.URL.Scheme {
case "":
return getLocal(req.URL.String(), ns.deps.Fs.Source, ns.deps.Cfg)
b, err := getLocal(req.URL.String(), ns.deps.Fs.Source, ns.deps.Cfg)
if err != nil {
return err
}
err, _ = unmarshal(b)
return err
default:
return getRemote(req, ns.deps.Fs.Source, ns.deps.Cfg, ns.client)
return ns.getRemote(cache, unmarshal, req)
}
}
func isHTTPError(res *http.Response) bool {
return res.StatusCode < 200 || res.StatusCode > 299
}

View File

@@ -23,6 +23,9 @@ import (
"testing"
"time"
"github.com/gohugoio/hugo/hugolib/paths"
"github.com/gohugoio/hugo/cache/filecache"
"github.com/gohugoio/hugo/common/loggers"
"github.com/gohugoio/hugo/config"
"github.com/gohugoio/hugo/deps"
@@ -85,16 +88,16 @@ func getTestServer(handler func(w http.ResponseWriter, r *http.Request)) (*httpt
func TestScpGetRemote(t *testing.T) {
t.Parallel()
fs := new(afero.MemMapFs)
cache := filecache.NewCache(fs, 100)
tests := []struct {
path string
content []byte
ignore bool
}{
{"http://Foo.Bar/foo_Bar-Foo", []byte(`T€st Content 123`), false},
{"http://Doppel.Gänger/foo_Bar-Foo", []byte(`T€st Cont€nt 123`), false},
{"http://Doppel.Gänger/Fizz_Bazz-Foo", []byte(`T€st Банковский кассир Cont€nt 123`), false},
{"http://Doppel.Gänger/Fizz_Bazz-Bar", []byte(`T€st Банковский кассир Cont€nt 456`), true},
{"http://Foo.Bar/foo_Bar-Foo", []byte(`T€st Content 123`)},
{"http://Doppel.Gänger/foo_Bar-Foo", []byte(`T€st Cont€nt 123`)},
{"http://Doppel.Gänger/Fizz_Bazz-Foo", []byte(`T€st Банковский кассир Cont€nt 123`)},
{"http://Doppel.Gänger/Fizz_Bazz-Bar", []byte(`T€st Банковский кассир Cont€nt 456`)},
}
for _, test := range tests {
@@ -108,53 +111,64 @@ func TestScpGetRemote(t *testing.T) {
})
defer func() { srv.Close() }()
cfg := viper.New()
ns := newTestNs()
ns.client = cl
c, err := getRemote(req, fs, cfg, cl)
var c []byte
f := func(b []byte) (error, bool) {
c = b
return nil, false
}
err = ns.getRemote(cache, f, req)
require.NoError(t, err, msg)
assert.Equal(t, string(test.content), string(c))
c, err = getCache(req.URL.String(), fs, cfg, test.ignore)
require.NoError(t, err, msg)
assert.Equal(t, string(test.content), string(c))
if test.ignore {
assert.Empty(t, c, msg)
} else {
assert.Equal(t, string(test.content), string(c))
}
}
}
func TestScpGetRemoteParallel(t *testing.T) {
t.Parallel()
ns := newTestNs()
content := []byte(`T€st Content 123`)
srv, cl := getTestServer(func(w http.ResponseWriter, r *http.Request) {
w.Write(content)
})
defer func() { srv.Close() }()
url := "http://Foo.Bar/foo_Bar-Foo"
req, err := http.NewRequest("GET", url, nil)
require.NoError(t, err)
for _, ignoreCache := range []bool{false, true} {
for _, ignoreCache := range []bool{false} {
cfg := viper.New()
cfg.Set("ignoreCache", ignoreCache)
cfg.Set("contentDir", "content")
ns := New(newDeps(cfg))
ns.client = cl
var wg sync.WaitGroup
for i := 0; i < 50; i++ {
for i := 0; i < 1; i++ {
wg.Add(1)
go func(gor int) {
defer wg.Done()
for j := 0; j < 10; j++ {
c, err := getRemote(req, ns.deps.Fs.Source, ns.deps.Cfg, cl)
var c []byte
f := func(b []byte) (error, bool) {
c = b
return nil, false
}
err := ns.getRemote(ns.cacheGetJSON, f, req)
assert.NoError(t, err)
assert.Equal(t, string(content), string(c))
if string(content) != string(c) {
t.Fatalf("expected\n%q\ngot\n%q", content, c)
}
time.Sleep(23 * time.Millisecond)
}
@@ -173,11 +187,16 @@ func newDeps(cfg config.Provider) *deps.Deps {
panic(err)
}
fs := hugofs.NewMem(l)
logger := loggers.NewErrorLogger()
p, _ := paths.New(fs, cfg)
fileCaches, _ := filecache.NewCachesFromPaths(p)
return &deps.Deps{
Cfg: cfg,
Fs: hugofs.NewMem(l),
Fs: fs,
FileCaches: fileCaches,
ContentSpec: cs,
Log: logger,
DistinctErrorLog: helpers.NewDistinctLogger(logger.ERROR),