fix: export prometheus metrics for cache GC triggers (#9815)
Bonus change: use a channel to serialize GC triggers instead of atomic variables, a more efficient synchronization mechanism.

Co-authored-by: Nitish Tiwari <nitish@minio.io>
This commit is contained in:
parent 2073b79633
commit f9aa239973
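The change replaces per-disk atomic GC counters with a dedicated goroutine that waits on a triggerGC channel, so purges are naturally serialized. Below is a minimal, self-contained sketch of that pattern; it is illustrative only, not MinIO code, and the names (cache, gcWorker, the toy purge) and timings are assumptions.

// Illustrative sketch only (not MinIO code): a lone worker goroutine
// receives on an unbuffered channel and runs GC serially; callers ask
// for a GC run with a plain channel send instead of bumping a counter.
package main

import (
    "context"
    "fmt"
    "time"
)

type cache struct {
    triggerGC chan struct{}
}

// gcWorker serializes GC: however many sends arrive, at most one purge
// runs at a time, and nothing has to be reset afterwards.
func (c *cache) gcWorker(ctx context.Context) {
    for {
        select {
        case <-ctx.Done():
            return
        case <-c.triggerGC: // wait here until someone triggers
            c.purge()
        }
    }
}

func (c *cache) purge() {
    fmt.Println("purging least recently used entries...")
    time.Sleep(100 * time.Millisecond) // stand-in for real eviction work
}

func main() {
    ctx, cancel := context.WithCancel(context.Background())
    defer cancel()

    c := &cache{triggerGC: make(chan struct{})}
    go c.gcWorker(ctx)

    // The send hands off to the worker; it blocks only until the worker
    // receives it, so the caller does not wait for the purge itself.
    c.triggerGC <- struct{}{}
    time.Sleep(200 * time.Millisecond) // give the toy purge time to finish
}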
@@ -125,12 +125,14 @@ func (m *cacheMeta) ToObjectInfo(bucket, object string) (o ObjectInfo) {
 
 // represents disk cache struct
 type diskCache struct {
-    gcCounter uint64 // ref: https://golang.org/pkg/sync/atomic/#pkg-note-BUG
     // is set to 0 if drive is offline
-    online   uint32
+    online       uint32 // ref: https://golang.org/pkg/sync/atomic/#pkg-note-BUG
+    purgeRunning int32
 
-    dir      string // caching directory
-    quotaPct int    // max usage in %
+    triggerGC chan struct{}
+    dir       string         // caching directory
+    stats     CacheDiskStats // disk cache stats for prometheus
+    quotaPct  int            // max usage in %
     pool         sync.Pool
     after        int // minimum accesses before an object is cached.
     lowWatermark int
@@ -142,12 +144,14 @@ type diskCache struct {
 }
 
 // Inits the disk cache dir if it is not initialized already.
-func newDiskCache(dir string, quotaPct, after, lowWatermark, highWatermark int) (*diskCache, error) {
+func newDiskCache(ctx context.Context, dir string, quotaPct, after, lowWatermark, highWatermark int) (*diskCache, error) {
     if err := os.MkdirAll(dir, 0777); err != nil {
         return nil, fmt.Errorf("Unable to initialize '%s' dir, %w", dir, err)
     }
     cache := diskCache{
         dir:           dir,
+        triggerGC:     make(chan struct{}),
+        stats:         CacheDiskStats{Dir: dir},
         quotaPct:      quotaPct,
         after:         after,
         lowWatermark:  lowWatermark,
@@ -161,6 +165,8 @@ func newDiskCache(dir string, quotaPct, after, lowWatermark, highWatermark int)
         },
         nsMutex: newNSLock(false),
     }
+    go cache.purgeWait(ctx)
+    cache.diskUsageHigh() // update if cache usage is already high.
     cache.NewNSLockFn = func(ctx context.Context, cachePath string) RWLocker {
         return cache.nsMutex.NewNSLock(ctx, nil, cachePath, "")
     }
@@ -181,7 +187,12 @@ func (c *diskCache) diskUsageLow() bool {
         return false
     }
     usedPercent := (di.Total - di.Free) * 100 / di.Total
-    return int(usedPercent) < gcStopPct
+    low := int(usedPercent) < gcStopPct
+    atomic.StoreUint64(&c.stats.UsagePercent, usedPercent)
+    if low {
+        atomic.StoreInt32(&c.stats.UsageState, 0)
+    }
+    return low
 }
 
 // Returns if the disk usage reaches high water mark w.r.t the configured cache quota.
@@ -196,7 +207,12 @@ func (c *diskCache) diskUsageHigh() bool {
         return false
     }
     usedPercent := (di.Total - di.Free) * 100 / di.Total
-    return int(usedPercent) >= gcTriggerPct
+    high := int(usedPercent) >= gcTriggerPct
+    atomic.StoreUint64(&c.stats.UsagePercent, usedPercent)
+    if high {
+        atomic.StoreInt32(&c.stats.UsageState, 1)
+    }
+    return high
 }
 
 // Returns if size space can be allocated without exceeding
@@ -230,24 +246,36 @@ var (
     errDoneForNow = errors.New("done for now")
 )
 
+func (c *diskCache) purgeWait(ctx context.Context) {
+    for {
+        select {
+        case <-ctx.Done():
+        case <-c.triggerGC: // wait here until someone triggers.
+            c.purge(ctx)
+        }
+    }
+}
+
 // Purge cache entries that were not accessed.
 func (c *diskCache) purge(ctx context.Context) {
-    if c.diskUsageLow() {
+    if atomic.LoadInt32(&c.purgeRunning) == 1 || c.diskUsageLow() {
         return
     }
 
     toFree := c.toClear()
     if toFree == 0 {
         return
     }
 
+    atomic.StoreInt32(&c.purgeRunning, 1) // do not run concurrent purge()
+    defer atomic.StoreInt32(&c.purgeRunning, 0)
+
     // expiry for cleaning up old cache.json files that
     // need to be cleaned up.
     expiry := UTCNow().Add(-cacheExpiryDays)
     // defaulting max hits count to 100
-    scorer, err := newFileScorer(toFree, time.Now().Unix(), 100)
-    if err != nil {
-        logger.LogIf(ctx, err)
-        return
-    }
+    // ignore error we know what value we are passing.
+    scorer, _ := newFileScorer(toFree, time.Now().Unix(), 100)
 
     // this function returns FileInfo for cached range files and cache data file.
     fiStatFn := func(ranges map[string]string, dataFile, pathPrefix string) map[string]os.FileInfo {
@@ -326,27 +354,20 @@ func (c *diskCache) purge(ctx context.Context) {
         return
     }
 
-    for _, path := range scorer.fileNames() {
-        removeAll(path)
-        slashIdx := strings.LastIndex(path, SlashSeparator)
-        pathPrefix := path[0:slashIdx]
-        fname := path[slashIdx+1:]
-        if fname == cacheDataFile {
-            removeAll(pathPrefix)
-        }
-    }
-}
-
-func (c *diskCache) incGCCounter() {
-    atomic.AddUint64(&c.gcCounter, 1)
-}
-
-func (c *diskCache) resetGCCounter() {
-    atomic.StoreUint64(&c.gcCounter, 0)
-}
-
-func (c *diskCache) gcCount() uint64 {
-    return atomic.LoadUint64(&c.gcCounter)
-}
+    scorer.purgeFunc(func(qfile queuedFile) {
+        fileName := qfile.name
+        removeAll(fileName)
+        slashIdx := strings.LastIndex(fileName, SlashSeparator)
+        if slashIdx >= 0 {
+            fileNamePrefix := fileName[0:slashIdx]
+            fname := fileName[slashIdx+1:]
+            if fname == cacheDataFile {
+                removeAll(fileNamePrefix)
+            }
+        }
+    })
+
+    scorer.reset()
+}
 
 // sets cache drive status
@@ -630,7 +651,7 @@ func newCacheEncryptMetadata(bucket, object string, metadata map[string]string)
 // Caches the object to disk
 func (c *diskCache) Put(ctx context.Context, bucket, object string, data io.Reader, size int64, rs *HTTPRangeSpec, opts ObjectOptions, incHitsOnly bool) error {
     if c.diskUsageHigh() {
-        c.incGCCounter()
+        c.triggerGC <- struct{}{}
         io.Copy(ioutil.Discard, data)
         return errDiskFull
     }
@@ -1,5 +1,5 @@
 /*
- * MinIO Cloud Storage, (C) 2019 MinIO, Inc.
+ * MinIO Cloud Storage, (C) 2019, 2020 MinIO, Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,14 +16,27 @@
 
 package cmd
 
-import "sync/atomic"
+import (
+    "sync/atomic"
+)
+
+// CacheDiskStats represents cache disk statistics
+// such as current disk usage and available.
+type CacheDiskStats struct {
+    // indicates if usage is high or low, if high value is '1', if low its '0'
+    UsageState int32
+    // indicates the current usage percentage of this cache disk
+    UsagePercent uint64
+    Dir          string
+}
 
 // CacheStats - represents bytes served from cache,
 // cache hits and cache misses.
 type CacheStats struct {
     BytesServed  uint64
     Hits         uint64
     Misses       uint64
+    GetDiskStats func() []CacheDiskStats
 }
 
 // Increase total bytes served from cache
@@ -439,6 +439,14 @@ func (f *fileScorer) fileObjInfos() []ObjectInfo {
     return res
 }
 
+func (f *fileScorer) purgeFunc(p func(qfile queuedFile)) {
+    e := f.queue.Front()
+    for e != nil {
+        p(e.Value.(queuedFile))
+        e = e.Next()
+    }
+}
+
 // fileNames returns all queued file names.
 func (f *fileScorer) fileNames() []string {
     res := make([]string, 0, f.queue.Len())
@@ -24,6 +24,7 @@ import (
     "net/http"
     "strings"
     "sync"
+    "sync/atomic"
     "time"
 
     "github.com/minio/minio/cmd/config/cache"
@@ -282,10 +283,11 @@ func (c *cacheObjects) GetObjectNInfo(ctx context.Context, bucket, object string
 
     // Reaching here implies cache miss
     c.cacheStats.incMiss()
+
     // Since we got here, we are serving the request from backend,
     // and also adding the object to the cache.
     if dcache.diskUsageHigh() {
-        dcache.incGCCounter()
+        dcache.triggerGC <- struct{}{} // this is non-blocking
     }
 
     bkReader, bkErr := c.GetObjectNInfoFn(ctx, bucket, object, rs, h, lockType, opts)
@@ -544,7 +546,7 @@ func newCache(config cache.Config) ([]*diskCache, bool, error) {
         if quota == 0 {
             quota = config.Quota
         }
-        cache, err := newDiskCache(dir, quota, config.After, config.WatermarkLow, config.WatermarkHigh)
+        cache, err := newDiskCache(ctx, dir, quota, config.After, config.WatermarkLow, config.WatermarkHigh)
         if err != nil {
             return nil, false, err
         }
@@ -677,7 +679,17 @@ func newServerCacheObjects(ctx context.Context, config cache.Config) (CacheObjec
             return newObjectLayerFn().CopyObject(ctx, srcBucket, srcObject, destBucket, destObject, srcInfo, srcOpts, dstOpts)
         },
     }
+    c.cacheStats.GetDiskStats = func() []CacheDiskStats {
+        cacheDiskStats := make([]CacheDiskStats, len(c.cache))
+        for i := range c.cache {
+            cacheDiskStats[i] = CacheDiskStats{
+                Dir: c.cache[i].stats.Dir,
+            }
+            atomic.StoreInt32(&cacheDiskStats[i].UsageState, atomic.LoadInt32(&c.cache[i].stats.UsageState))
+            atomic.StoreUint64(&cacheDiskStats[i].UsagePercent, atomic.LoadUint64(&c.cache[i].stats.UsagePercent))
+        }
+        return cacheDiskStats
+    }
     if migrateSw {
         go c.migrateCacheFromV1toV2(ctx)
     }
@@ -697,19 +709,9 @@ func (c *cacheObjects) gc(ctx context.Context) {
             if c.migrating {
                 continue
             }
-            var wg sync.WaitGroup
             for _, dcache := range c.cache {
-                if dcache.gcCount() == 0 {
-                    continue
-                }
-                wg.Add(1)
-                go func(d *diskCache) {
-                    defer wg.Done()
-                    d.resetGCCounter()
-                    d.purge(ctx)
-                }(dcache)
+                dcache.triggerGC <- struct{}{}
             }
-            wg.Wait()
         }
     }
 }
@@ -260,6 +260,27 @@ func cacheMetricsPrometheus(ch chan<- prometheus.Metric) {
         prometheus.CounterValue,
         float64(cacheObjLayer.CacheStats().getBytesServed()),
     )
+    for _, cdStats := range cacheObjLayer.CacheStats().GetDiskStats() {
+        // Cache disk usage percentage
+        ch <- prometheus.MustNewConstMetric(
+            prometheus.NewDesc(
+                prometheus.BuildFQName("cache", "usage", "percent"),
+                "Total percentage cache usage",
+                []string{"disk"}, nil),
+            prometheus.GaugeValue,
+            float64(cdStats.UsagePercent),
+            cdStats.Dir,
+        )
+        ch <- prometheus.MustNewConstMetric(
+            prometheus.NewDesc(
+                prometheus.BuildFQName("cache", "usage", "high"),
+                "Indicates cache usage is high or low, relative to current cache 'quota' settings",
+                []string{"disk"}, nil),
+            prometheus.GaugeValue,
+            float64(cdStats.UsageState),
+            cdStats.Dir,
+        )
+    }
 }
 
 // collects http metrics for MinIO server in Prometheus specific format
|
@ -110,7 +110,7 @@ These are the new set of metrics which will be in effect after `RELEASE.2019-10-
|
|||||||
- Metrics that records the http statistics and latencies are labeled to their respective APIs (putobject,getobject etc).
|
- Metrics that records the http statistics and latencies are labeled to their respective APIs (putobject,getobject etc).
|
||||||
- Disk usage metrics are distributed and labeled to the respective disk paths.
|
- Disk usage metrics are distributed and labeled to the respective disk paths.
|
||||||
|
|
||||||
For more details, please check the `Migration guide for the new set of metrics`.
|
For more details, please check the `Migration guide for the new set of metrics`.
|
||||||
|
|
||||||
The list of metrics and its definition are as follows. (NOTE: instance here is one MinIO node)
|
The list of metrics and its definition are as follows. (NOTE: instance here is one MinIO node)
|
||||||
|
|
||||||
@@ -178,14 +178,23 @@ All metrics are labeled by `bucket`, each metric is displayed per bucket. `bucke
 
 MinIO Gateway instances enabled with Disk-Caching expose caching related metrics.
 
-| name                 | description                              |
-|:---------------------|:-----------------------------------------|
-| `cache_data_served`  | Total number of bytes served from cache  |
+#### Global cache metrics
+| name                 | description                                        |
+|:---------------------|:---------------------------------------------------|
 | `cache_hits_total`   | Total number of cache hits                         |
 | `cache_misses_total` | Total number of cache misses                       |
+| `cache_data_served`  | Total number of bytes served from cache            |
 
-### Gateway & Cache specific metrics
+#### Per disk cache metrics
+| `cache_usage_percent` | Total percentage cache usage                                                      |
+| `cache_usage_state`   | Indicates cache usage is high or low, relative to current cache 'quota' settings |
+
+`cache_usage_state` holds only two states
+
+- '1' indicates high disk usage
+- '0' indicates low disk usage
+
+### Gateway specific metrics
 MinIO Gateway instance exposes metrics related to Gateway communication with the cloud backend (S3, Azure & GCS Gateway).
 
 `<gateway_type>` changes based on the gateway in use can be 's3', 'gcs' or 'azure'. Other metrics are labeled with `method` that identifies HTTP GET, HEAD, PUT and POST requests to the backend.
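For readers wiring these metrics up, a small hypothetical Go scraper is sketched below; it is not part of this commit. The endpoint URL, port and disk paths are assumptions for illustration, and a real deployment may additionally require the Prometheus bearer token MinIO can be configured to expect.

// Hypothetical consumer (not part of this commit): fetch a metrics endpoint
// and print the per-disk cache usage gauge documented above.
package main

import (
    "fmt"
    "net/http"

    "github.com/prometheus/common/expfmt"
)

func main() {
    // Assumed endpoint; adjust host, port and auth for your deployment.
    resp, err := http.Get("http://localhost:9000/minio/prometheus/metrics")
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    var parser expfmt.TextParser
    families, err := parser.TextToMetricFamilies(resp.Body)
    if err != nil {
        panic(err)
    }

    // cache_usage_percent is labeled by "disk", one sample per cache drive.
    if mf, ok := families["cache_usage_percent"]; ok {
        for _, m := range mf.GetMetric() {
            disk := ""
            for _, lp := range m.GetLabel() {
                if lp.GetName() == "disk" {
                    disk = lp.GetValue()
                }
            }
            fmt.Printf("%s: %.0f%% used\n", disk, m.GetGauge().GetValue())
        }
    }
}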