diff --git a/cmd/disk-cache-backend.go b/cmd/disk-cache-backend.go
index d73fa5542..afcaf5188 100644
--- a/cmd/disk-cache-backend.go
+++ b/cmd/disk-cache-backend.go
@@ -125,12 +125,14 @@ func (m *cacheMeta) ToObjectInfo(bucket, object string) (o ObjectInfo) {
 
 // represents disk cache struct
 type diskCache struct {
-	gcCounter uint64 // ref: https://golang.org/pkg/sync/atomic/#pkg-note-BUG
 	// is set to 0 if drive is offline
-	online uint32
+	online       uint32 // ref: https://golang.org/pkg/sync/atomic/#pkg-note-BUG
+	purgeRunning int32
 
-	dir      string // caching directory
-	quotaPct int    // max usage in %
+	triggerGC chan struct{}
+	dir       string         // caching directory
+	stats     CacheDiskStats // disk cache stats for prometheus
+	quotaPct  int            // max usage in %
 	pool         sync.Pool
 	after        int // minimum accesses before an object is cached.
 	lowWatermark int
@@ -142,12 +144,14 @@ type diskCache struct {
 }
 
 // Inits the disk cache dir if it is not initialized already.
-func newDiskCache(dir string, quotaPct, after, lowWatermark, highWatermark int) (*diskCache, error) {
+func newDiskCache(ctx context.Context, dir string, quotaPct, after, lowWatermark, highWatermark int) (*diskCache, error) {
 	if err := os.MkdirAll(dir, 0777); err != nil {
 		return nil, fmt.Errorf("Unable to initialize '%s' dir, %w", dir, err)
 	}
 	cache := diskCache{
 		dir:           dir,
+		triggerGC:     make(chan struct{}),
+		stats:         CacheDiskStats{Dir: dir},
 		quotaPct:      quotaPct,
 		after:         after,
 		lowWatermark:  lowWatermark,
@@ -161,6 +165,8 @@ func newDiskCache(dir string, quotaPct, after, lowWatermark, highWatermark int)
 		},
 		nsMutex: newNSLock(false),
 	}
+	go cache.purgeWait(ctx)
+	cache.diskUsageHigh() // update if cache usage is already high.
 	cache.NewNSLockFn = func(ctx context.Context, cachePath string) RWLocker {
 		return cache.nsMutex.NewNSLock(ctx, nil, cachePath, "")
 	}
@@ -181,7 +187,12 @@ func (c *diskCache) diskUsageLow() bool {
 		return false
 	}
 	usedPercent := (di.Total - di.Free) * 100 / di.Total
-	return int(usedPercent) < gcStopPct
+	low := int(usedPercent) < gcStopPct
+	atomic.StoreUint64(&c.stats.UsagePercent, usedPercent)
+	if low {
+		atomic.StoreInt32(&c.stats.UsageState, 0)
+	}
+	return low
 }
 
 // Returns if the disk usage reaches high water mark w.r.t the configured cache quota.
@@ -196,7 +207,12 @@ func (c *diskCache) diskUsageHigh() bool {
 		return false
 	}
 	usedPercent := (di.Total - di.Free) * 100 / di.Total
-	return int(usedPercent) >= gcTriggerPct
+	high := int(usedPercent) >= gcTriggerPct
+	atomic.StoreUint64(&c.stats.UsagePercent, usedPercent)
+	if high {
+		atomic.StoreInt32(&c.stats.UsageState, 1)
+	}
+	return high
 }
 
 // Returns if size space can be allocated without exceeding
@@ -230,24 +246,36 @@ var (
 	errDoneForNow = errors.New("done for now")
 )
 
+func (c *diskCache) purgeWait(ctx context.Context) {
+	for {
+		select {
+		case <-ctx.Done():
+		case <-c.triggerGC: // wait here until someone triggers.
+			c.purge(ctx)
+		}
+	}
+}
+
 // Purge cache entries that were not accessed.
 func (c *diskCache) purge(ctx context.Context) {
-	if c.diskUsageLow() {
+	if atomic.LoadInt32(&c.purgeRunning) == 1 || c.diskUsageLow() {
 		return
 	}
+
 	toFree := c.toClear()
 	if toFree == 0 {
 		return
 	}
+
+	atomic.StoreInt32(&c.purgeRunning, 1) // do not run concurrent purge()
+	defer atomic.StoreInt32(&c.purgeRunning, 0)
+
 	// expiry for cleaning up old cache.json files that
 	// need to be cleaned up.
 	expiry := UTCNow().Add(-cacheExpiryDays)
 	// defaulting max hits count to 100
-	scorer, err := newFileScorer(toFree, time.Now().Unix(), 100)
-	if err != nil {
-		logger.LogIf(ctx, err)
-		return
-	}
+	// ignore the error; we know what values we are passing.
+	scorer, _ := newFileScorer(toFree, time.Now().Unix(), 100)
 
 	// this function returns FileInfo for cached range files and cache data file.
 	fiStatFn := func(ranges map[string]string, dataFile, pathPrefix string) map[string]os.FileInfo {
@@ -326,27 +354,20 @@ func (c *diskCache) purge(ctx context.Context) {
 		return
 	}
 
-	for _, path := range scorer.fileNames() {
-		removeAll(path)
-		slashIdx := strings.LastIndex(path, SlashSeparator)
-		pathPrefix := path[0:slashIdx]
-		fname := path[slashIdx+1:]
-		if fname == cacheDataFile {
-			removeAll(pathPrefix)
+	scorer.purgeFunc(func(qfile queuedFile) {
+		fileName := qfile.name
+		removeAll(fileName)
+		slashIdx := strings.LastIndex(fileName, SlashSeparator)
+		if slashIdx >= 0 {
+			fileNamePrefix := fileName[0:slashIdx]
+			fname := fileName[slashIdx+1:]
+			if fname == cacheDataFile {
+				removeAll(fileNamePrefix)
+			}
 		}
-	}
-}
+	})
 
-func (c *diskCache) incGCCounter() {
-	atomic.AddUint64(&c.gcCounter, 1)
-}
-
-func (c *diskCache) resetGCCounter() {
-	atomic.StoreUint64(&c.gcCounter, 0)
-}
-
-func (c *diskCache) gcCount() uint64 {
-	return atomic.LoadUint64(&c.gcCounter)
+	scorer.reset()
 }
 
 // sets cache drive status
@@ -630,7 +651,7 @@ func newCacheEncryptMetadata(bucket, object string, metadata map[string]string)
 // Caches the object to disk
 func (c *diskCache) Put(ctx context.Context, bucket, object string, data io.Reader, size int64, rs *HTTPRangeSpec, opts ObjectOptions, incHitsOnly bool) error {
 	if c.diskUsageHigh() {
-		c.incGCCounter()
+		c.triggerGC <- struct{}{}
 		io.Copy(ioutil.Discard, data)
 		return errDiskFull
 	}
diff --git a/cmd/disk-cache-stats.go b/cmd/disk-cache-stats.go
index 6542a6911..2cf15fa2d 100644
--- a/cmd/disk-cache-stats.go
+++ b/cmd/disk-cache-stats.go
@@ -1,5 +1,5 @@
 /*
- * MinIO Cloud Storage, (C) 2019 MinIO, Inc.
+ * MinIO Cloud Storage, (C) 2019, 2020 MinIO, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,14 +16,27 @@
 
 package cmd
 
-import "sync/atomic"
+import (
+	"sync/atomic"
+)
+
+// CacheDiskStats represents cache disk statistics
+// such as current disk usage and available.
+type CacheDiskStats struct {
+	// indicates if usage is high or low; if high the value is '1', if low it is '0'
+	UsageState int32
+	// indicates the current usage percentage of this cache disk
+	UsagePercent uint64
+	Dir          string
+}
 
 // CacheStats - represents bytes served from cache,
 // cache hits and cache misses.
 type CacheStats struct {
-	BytesServed uint64
-	Hits        uint64
-	Misses      uint64
+	BytesServed  uint64
+	Hits         uint64
+	Misses       uint64
+	GetDiskStats func() []CacheDiskStats
 }
 
 // Increase total bytes served from cache
diff --git a/cmd/disk-cache-utils.go b/cmd/disk-cache-utils.go
index 335af10fc..e927dd4f7 100644
--- a/cmd/disk-cache-utils.go
+++ b/cmd/disk-cache-utils.go
@@ -439,6 +439,14 @@ func (f *fileScorer) fileObjInfos() []ObjectInfo {
 	return res
 }
 
+func (f *fileScorer) purgeFunc(p func(qfile queuedFile)) {
+	e := f.queue.Front()
+	for e != nil {
+		p(e.Value.(queuedFile))
+		e = e.Next()
+	}
+}
+
 // fileNames returns all queued file names.
 func (f *fileScorer) fileNames() []string {
 	res := make([]string, 0, f.queue.Len())
diff --git a/cmd/disk-cache.go b/cmd/disk-cache.go
index b9bbd11ef..3d1657bd3 100644
--- a/cmd/disk-cache.go
+++ b/cmd/disk-cache.go
@@ -24,6 +24,7 @@ import (
 	"net/http"
 	"strings"
 	"sync"
+	"sync/atomic"
 	"time"
 
 	"github.com/minio/minio/cmd/config/cache"
@@ -282,10 +283,11 @@ func (c *cacheObjects) GetObjectNInfo(ctx context.Context, bucket, object string
 
 	// Reaching here implies cache miss
 	c.cacheStats.incMiss()
+
 	// Since we got here, we are serving the request from backend,
 	// and also adding the object to the cache.
 	if dcache.diskUsageHigh() {
-		dcache.incGCCounter()
+		dcache.triggerGC <- struct{}{} // blocks if a purge is already in progress
 	}
 
 	bkReader, bkErr := c.GetObjectNInfoFn(ctx, bucket, object, rs, h, lockType, opts)
@@ -544,7 +546,7 @@ func newCache(config cache.Config) ([]*diskCache, bool, error) {
 		if quota == 0 {
 			quota = config.Quota
 		}
-		cache, err := newDiskCache(dir, quota, config.After, config.WatermarkLow, config.WatermarkHigh)
+		cache, err := newDiskCache(ctx, dir, quota, config.After, config.WatermarkLow, config.WatermarkHigh)
 		if err != nil {
 			return nil, false, err
 		}
@@ -677,7 +679,17 @@ func newServerCacheObjects(ctx context.Context, config cache.Config) (CacheObjec
 			return newObjectLayerFn().CopyObject(ctx, srcBucket, srcObject, destBucket, destObject, srcInfo, srcOpts, dstOpts)
 		},
 	}
-
+	c.cacheStats.GetDiskStats = func() []CacheDiskStats {
+		cacheDiskStats := make([]CacheDiskStats, len(c.cache))
+		for i := range c.cache {
+			cacheDiskStats[i] = CacheDiskStats{
+				Dir: c.cache[i].stats.Dir,
+			}
+			atomic.StoreInt32(&cacheDiskStats[i].UsageState, atomic.LoadInt32(&c.cache[i].stats.UsageState))
+			atomic.StoreUint64(&cacheDiskStats[i].UsagePercent, atomic.LoadUint64(&c.cache[i].stats.UsagePercent))
+		}
+		return cacheDiskStats
+	}
 	if migrateSw {
 		go c.migrateCacheFromV1toV2(ctx)
 	}
@@ -697,19 +709,9 @@ func (c *cacheObjects) gc(ctx context.Context) {
 			if c.migrating {
 				continue
 			}
-			var wg sync.WaitGroup
 			for _, dcache := range c.cache {
-				if dcache.gcCount() == 0 {
-					continue
-				}
-				wg.Add(1)
-				go func(d *diskCache) {
-					defer wg.Done()
-					d.resetGCCounter()
-					d.purge(ctx)
-				}(dcache)
+				dcache.triggerGC <- struct{}{}
 			}
-			wg.Wait()
 		}
 	}
 }
diff --git a/cmd/metrics.go b/cmd/metrics.go
index b5c8ec87b..0821378f0 100644
--- a/cmd/metrics.go
+++ b/cmd/metrics.go
@@ -260,6 +260,27 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) {
 		prometheus.CounterValue,
 		float64(cacheObjLayer.CacheStats().getBytesServed()),
 	)
+	for _, cdStats := range cacheObjLayer.CacheStats().GetDiskStats() {
+		// Cache disk usage percentage
+		ch <- prometheus.MustNewConstMetric(
+			prometheus.NewDesc(
+				prometheus.BuildFQName("cache", "usage", "percent"),
+				"Total percentage cache usage",
+				[]string{"disk"}, nil),
+			prometheus.GaugeValue,
+			float64(cdStats.UsagePercent),
+			cdStats.Dir,
+		)
+		// Cache disk usage state (high/low)
+		ch <- prometheus.MustNewConstMetric(
+			prometheus.NewDesc(
+				prometheus.BuildFQName("cache", "usage", "state"),
+				"Indicates cache usage is high or low, relative to current cache 'quota' settings",
+				[]string{"disk"}, nil),
+			prometheus.GaugeValue,
+			float64(cdStats.UsageState),
+			cdStats.Dir,
+		)
+	}
 }
 
 // collects http metrics for MinIO server in Prometheus specific format
diff --git a/docs/metrics/prometheus/README.md b/docs/metrics/prometheus/README.md
index 5f1af5bcb..25e8a6073 100644
--- a/docs/metrics/prometheus/README.md
+++ b/docs/metrics/prometheus/README.md
@@ -110,7 +110,7 @@ These are the new set of metrics which will be in effect after `RELEASE.2019-10-
 
 - Metrics that records the http statistics and latencies are labeled to their respective APIs (putobject,getobject etc).
 - Disk usage metrics are distributed and labeled to the respective disk paths.
 
-For more details, please check the `Migration guide for the new set of metrics`.
+For more details, please check the `Migration guide for the new set of metrics`.
 
 The list of metrics and its definition are as follows. (NOTE: instance here is one MinIO node)
@@ -178,14 +178,23 @@ All metrics are labeled by `bucket`, each metric is displayed per bucket. `bucke
 
 MinIO Gateway instances enabled with Disk-Caching expose caching related metrics.
 
-| name                 | description                              |
-|:---------------------|:-----------------------------------------|
-| `cache_data_served`  | Total number of bytes served from cache  |
-| `cache_hits_total`   | Total number of cache hits               |
-| `cache_misses_total` | Total number of cache misses             |
+#### Global cache metrics
+| name                 | description                              |
+|:---------------------|:-----------------------------------------|
+| `cache_hits_total`   | Total number of cache hits               |
+| `cache_misses_total` | Total number of cache misses             |
+| `cache_data_served`  | Total number of bytes served from cache  |
 
-### Gateway & Cache specific metrics
+#### Per disk cache metrics
+| name                   | description                                                                       |
+|:-----------------------|:----------------------------------------------------------------------------------|
+| `cache_usage_percent`  | Total percentage cache usage                                                      |
+| `cache_usage_state`    | Indicates cache usage is high or low, relative to current cache 'quota' settings |
+`cache_usage_state` holds only two states:
+
+- '1' indicates high disk usage
+- '0' indicates low disk usage
+
+### Gateway specific metrics
 
 MinIO Gateway instance exposes metrics related to Gateway communication with the cloud backend (S3, Azure & GCS Gateway). `<gateway_type>` changes based on the gateway in use can be 's3', 'gcs' or 'azure'. Other metrics are labeled with `method` that identifies HTTP GET, HEAD, PUT and POST requests to the backend.
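
At its core, the patch swaps the old `gcCounter` bookkeeping for an unbuffered `triggerGC` channel drained by a long-lived `purgeWait` goroutine, guarded by a `purgeRunning` flag so purges never overlap. The sketch below is a minimal, self-contained illustration of that pattern; the `gcWorker`/`requestGC` names and the `select`/`default` non-blocking send are illustrative additions, not part of the patch (the patch sends on the channel directly, so a trigger waits whenever a purge is still running, and its wait loop keeps running after context cancellation).

```go
package main

import (
	"context"
	"fmt"
	"sync/atomic"
	"time"
)

// gcWorker mirrors the shape of diskCache.purgeWait/purge in the patch:
// an unbuffered trigger channel drives one long-lived goroutine, and an
// atomic flag prevents overlapping purge runs. Names here are illustrative.
type gcWorker struct {
	triggerGC    chan struct{}
	purgeRunning int32
}

func newGCWorker(ctx context.Context) *gcWorker {
	w := &gcWorker{triggerGC: make(chan struct{})}
	go w.purgeWait(ctx)
	return w
}

// purgeWait blocks until someone triggers a purge, then runs it.
// Unlike the patch, this sketch returns once the context is cancelled.
func (w *gcWorker) purgeWait(ctx context.Context) {
	for {
		select {
		case <-ctx.Done():
			return
		case <-w.triggerGC:
			w.purge(ctx)
		}
	}
}

func (w *gcWorker) purge(_ context.Context) {
	if !atomic.CompareAndSwapInt32(&w.purgeRunning, 0, 1) {
		return // a purge is already in progress
	}
	defer atomic.StoreInt32(&w.purgeRunning, 0)
	fmt.Println("purging...")
	time.Sleep(100 * time.Millisecond) // stand-in for real eviction work
}

// requestGC asks for a purge without blocking the caller: if the worker is
// busy (or nobody is listening), the request is simply dropped. The patch
// instead sends on the channel directly, which waits while a purge runs.
func (w *gcWorker) requestGC() {
	select {
	case w.triggerGC <- struct{}{}:
	default:
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	w := newGCWorker(ctx)
	w.requestGC() // e.g. on a cache miss while disk usage is high
	w.requestGC() // dropped if the first purge is still running
	time.Sleep(300 * time.Millisecond)
}
```

The non-blocking variant simply drops a trigger when the worker is busy, which is usually acceptable for this kind of GC: the next cache miss under high disk usage will trigger it again.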