Improve caching (#19130)

* Remove lock for cached operations. * Rename "Relax" to `ReturnLastGood`. * Add `CacheError` to allow caching values even on errors. * Add NoWait that will return current value with async fetching if within 2xTTL. * Make benchmark somewhat representative. ``` Before: BenchmarkCache-12 16408370 63.12 ns/op 0 B/op After: BenchmarkCache-12 428282187 2.789 ns/op 0 B/op ``` * Remove `storageRESTClient.scanning`. Nonsensical - RPC clients will not have any idea about scanning. * Always fetch remote diskinfo metrics and cache them. Seems most calls are requesting metrics. * Do async fetching of usage caches.
2025-07-13 02:51:05 -04:00 · 2024-02-26 10:49:19 -08:00 · 2024-02-26 10:49:19 -08:00 · 2b5e4b853c
commit 2b5e4b853c
parent 85bcb5874a
8 changed files with 84 additions and 67 deletions
--- a/cmd/admin-handlers-users.go
+++ b/cmd/admin-handlers-users.go
@ -1203,7 +1203,8 @@ func (a adminAPIHandlers) AccountInfoHandler(w http.ResponseWriter, r *http.Requ
 		bucketStorageCache.TTL = 10 * time.Second

 		// Rely on older value if usage loading fails from disk.
-		bucketStorageCache.Relax = true
+		bucketStorageCache.ReturnLastGood = true
+		bucketStorageCache.NoWait = true
 		bucketStorageCache.Update = func() (DataUsageInfo, error) {
 			ctx, done := context.WithTimeout(context.Background(), 2*time.Second)
 			defer done()
--- a/cmd/bucket-quota.go
+++ b/cmd/bucket-quota.go
@ -52,7 +52,8 @@ func (sys *BucketQuotaSys) Init(objAPI ObjectLayer) {
 		// does not update the bucket usage values frequently.
 		bucketStorageCache.TTL = 10 * time.Second
 		// Rely on older value if usage loading fails from disk.
-		bucketStorageCache.Relax = true
+		bucketStorageCache.ReturnLastGood = true
+		bucketStorageCache.NoWait = true
 		bucketStorageCache.Update = func() (DataUsageInfo, error) {
 			ctx, done := context.WithTimeout(context.Background(), 2*time.Second)
 			defer done()
--- a/cmd/data-usage.go
+++ b/cmd/data-usage.go
@ -81,7 +81,8 @@ func loadPrefixUsageFromBackend(ctx context.Context, objAPI ObjectLayer, bucket
 		prefixUsageCache.TTL = 30 * time.Second

 		// No need to fail upon Update() error, fallback to old value.
-		prefixUsageCache.Relax = true
+		prefixUsageCache.ReturnLastGood = true
+		prefixUsageCache.NoWait = true
 		prefixUsageCache.Update = func() (map[string]uint64, error) {
 			m := make(map[string]uint64)
 			for _, pool := range z.serverPools {
--- a/cmd/erasure-server-pool.go
+++ b/cmd/erasure-server-pool.go
@ -1851,7 +1851,8 @@ func (z *erasureServerPools) ListBuckets(ctx context.Context, opts BucketOptions
 		listBucketsCache.Once.Do(func() {
 			listBucketsCache.TTL = time.Second

-			listBucketsCache.Relax = true
+			listBucketsCache.ReturnLastGood = true
+			listBucketsCache.NoWait = true
 			listBucketsCache.Update = func() ([]BucketInfo, error) {
 				ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 				buckets, err = z.s3Peer.ListBuckets(ctx, opts)
--- a/cmd/metrics-v2.go
+++ b/cmd/metrics-v2.go
@ -357,7 +357,7 @@ type MetricsGroupOpts struct {
 func (g *MetricsGroup) RegisterRead(read func(ctx context.Context) []Metric) {
 	g.metricsCache = cachevalue.New[[]Metric]()
 	g.metricsCache.Once.Do(func() {
-		g.metricsCache.Relax = true
+		g.metricsCache.ReturnLastGood = true
 		g.metricsCache.TTL = g.cacheInterval
 		g.metricsCache.Update = func() ([]Metric, error) {
 			if g.metricsGroupOpts.dependGlobalObjectAPI {
--- a/cmd/storage-rest-client.go
+++ b/cmd/storage-rest-client.go
@ -31,7 +31,6 @@ import (
 	"strconv"
 	"strings"
 	"sync"
-	"sync/atomic"
 	"time"

 	"github.com/minio/madmin-go/v3"
@ -157,9 +156,6 @@ func toStorageErr(err error) error {

 // Abstracts a remote disk.
 type storageRESTClient struct {
-	// Indicate of NSScanner is in progress in this disk
-	scanning int32
-
 	endpoint    Endpoint
 	restClient  *rest.Client
 	gridConn    *grid.Subroute
@ -236,8 +232,6 @@ func (client *storageRESTClient) Healing() *healingTracker {
 }

 func (client *storageRESTClient) NSScanner(ctx context.Context, cache dataUsageCache, updates chan<- dataUsageEntry, scanMode madmin.HealScanMode, _ func() bool) (dataUsageCache, error) {
-	atomic.AddInt32(&client.scanning, 1)
-	defer atomic.AddInt32(&client.scanning, -1)
 	defer xioutil.SafeClose(updates)

 	st, err := storageNSScannerRPC.Call(ctx, client.gridConn, &nsScannerOptions{
@ -310,8 +304,8 @@ func (client *storageRESTClient) DiskInfo(ctx context.Context, opts DiskInfoOpti
 		return info, errDiskNotFound
 	}

-	// if metrics was asked, or it was a NoOp we do not need to cache the value.
-	if opts.Metrics || opts.NoOp {
+	// if 'NoOp' we do not cache the value.
+	if opts.NoOp {
 		ctx, cancel := context.WithTimeout(ctx, 5*time.Second)
 		defer cancel()

@ -325,17 +319,17 @@ func (client *storageRESTClient) DiskInfo(ctx context.Context, opts DiskInfoOpti
 		if info.Error != "" {
 			return info, toStorageErr(errors.New(info.Error))
 		}
-		info.Scanning = atomic.LoadInt32(&client.scanning) == 1
 		return info, nil
 	} // In all other cases cache the value upto 1sec.

 	client.diskInfoCache.Once.Do(func() {
 		client.diskInfoCache.TTL = time.Second
+		client.diskInfoCache.CacheError = true
 		client.diskInfoCache.Update = func() (info DiskInfo, err error) {
 			ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 			defer cancel()

-			nopts := DiskInfoOptions{DiskID: client.diskID}
+			nopts := DiskInfoOptions{DiskID: client.diskID, Metrics: true}
 			infop, err := storageDiskInfoRPC.Call(ctx, client.gridConn, &nopts)
 			if err != nil {
 				return info, toStorageErr(err)
@ -348,9 +342,7 @@ func (client *storageRESTClient) DiskInfo(ctx context.Context, opts DiskInfoOpti
 		}
 	})

-	info, err = client.diskInfoCache.Get()
-	info.Scanning = atomic.LoadInt32(&client.scanning) == 1
-	return info, err
+	return client.diskInfoCache.Get()
 }

 // MakeVolBulk - create multiple volumes in a bulk operation.
--- a/internal/cachevalue/cache.go
+++ b/internal/cachevalue/cache.go
@ -19,19 +19,20 @@ package cachevalue

 import (
 	"sync"
+	"sync/atomic"
 	"time"
 )

 // Cache contains a synchronized value that is considered valid
 // for a specific amount of time.
 // An Update function must be set to provide an updated value when needed.
-type Cache[I any] struct {
+type Cache[T any] struct {
 	// Update must return an updated value.
 	// If an error is returned the cached value is not set.
 	// Only one caller will call this function at any time, others will be blocking.
 	// The returned value can no longer be modified once returned.
 	// Should be set before calling Get().
-	Update func() (item I, err error)
+	Update func() (T, error)

 	// TTL for a cached value.
 	// If not set 1 second TTL is assumed.
@ -39,18 +40,31 @@ type Cache[I any] struct {
 	TTL time.Duration

 	// When set to true, return the last cached value
-	// even if updating the value errors out
-	Relax bool
+	// even if updating the value errors out.
+	// Returns the last good value AND the error.
+	ReturnLastGood bool
+
+	// If CacheError is set, errors will be cached as well
+	// and not continuously try to update.
+	// Should not be combined with ReturnLastGood.
+	CacheError bool
+
+	// If NoWait is set, Get() will return the last good value,
+	// if TTL has expired but 2x TTL has not yet passed,
+	// but will fetch a new value in the background.
+	NoWait bool

 	// Once can be used to initialize values for lazy initialization.
 	// Should be set before calling Get().
 	Once sync.Once

 	// Managed values.
-	value      I         // our cached value
-	valueSet   bool      // 'true' if the value 'I' has a value
-	lastUpdate time.Time // indicates when value 'I' was updated last, used for invalidation.
-	mu         sync.RWMutex
+	valErr atomic.Pointer[struct {
+		v T
+		e error
+	}]
+	lastUpdateMs atomic.Int64
+	updating     sync.Mutex
 }

 // New initializes a new cached value instance.
@ -61,30 +75,42 @@ func New[I any]() *Cache[I] {
 // Get will return a cached value or fetch a new one.
 // If the Update function returns an error the value is forwarded as is and not cached.
 func (t *Cache[I]) Get() (item I, err error) {
-	item, ok := t.get(t.ttl())
-	if ok {
-		return item, nil
-	}
-
-	item, err = t.Update()
-	if err != nil {
-		if t.Relax {
-			// if update fails, return current
-			// cached value along with error.
-			//
-			// Let the caller decide if they want
-			// to use the returned value based
-			// on error.
-			item, ok = t.get(0)
-			if ok {
-				return item, err
-			}
+	v := t.valErr.Load()
+	ttl := t.ttl()
+	vTime := t.lastUpdateMs.Load()
+	tNow := time.Now().UnixMilli()
+	if v != nil && tNow-vTime < ttl.Milliseconds() {
+		if v.e == nil {
+			return v.v, nil
+		}
+		if v.e != nil && t.CacheError || t.ReturnLastGood {
+			return v.v, v.e
 		}
-		return item, err
 	}

-	t.update(item)
-	return item, nil
+	// Fetch new value.
+	if t.NoWait && v != nil && tNow-vTime < ttl.Milliseconds()*2 && (v.e == nil || t.CacheError) {
+		if t.updating.TryLock() {
+			go func() {
+				defer t.updating.Unlock()
+				t.update()
+			}()
+		}
+		return v.v, v.e
+	}
+
+	// Get lock. Either we get it or we wait for it.
+	t.updating.Lock()
+	if time.Since(time.UnixMilli(t.lastUpdateMs.Load())) < ttl {
+		// There is a new value, release lock and return it.
+		v = t.valErr.Load()
+		t.updating.Unlock()
+		return v.v, v.e
+	}
+	t.update()
+	v = t.valErr.Load()
+	t.updating.Unlock()
+	return v.v, v.e
 }

 func (t *Cache[_]) ttl() time.Duration {
@ -95,25 +121,20 @@ func (t *Cache[_]) ttl() time.Duration {
 	return ttl
 }

-func (t *Cache[I]) get(ttl time.Duration) (item I, ok bool) {
-	t.mu.RLock()
-	defer t.mu.RUnlock()
-	if t.valueSet {
-		item = t.value
-		if ttl <= 0 {
-			return item, true
-		}
-		if time.Since(t.lastUpdate) < ttl {
-			return item, true
+func (t *Cache[T]) update() {
+	val, err := t.Update()
+	if err != nil {
+		if t.ReturnLastGood {
+			// Keep last good value.
+			v := t.valErr.Load()
+			if v != nil {
+				val = v.v
+			}
 		}
 	}
-	return item, false
-}
-
-func (t *Cache[I]) update(item I) {
-	t.mu.Lock()
-	defer t.mu.Unlock()
-	t.value = item
-	t.valueSet = true
-	t.lastUpdate = time.Now()
+	t.valErr.Store(&struct {
+		v T
+		e error
+	}{v: val, e: err})
+	t.lastUpdateMs.Store(time.Now().UnixMilli())
 }
--- a/internal/cachevalue/cache_test.go
+++ b/internal/cachevalue/cache_test.go
@ -51,7 +51,7 @@ func TestCache(t *testing.T) {
 func BenchmarkCache(b *testing.B) {
 	cache := New[time.Time]()
 	cache.Once.Do(func() {
-		cache.TTL = 1 * time.Microsecond
+		cache.TTL = 1 * time.Millisecond
 		cache.Update = func() (time.Time, error) {
 			return time.Now(), nil
 		}