re-implement data usage crawler to be more efficient (#9075)

Implementation overview: 

https://gist.github.com/klauspost/1801c858d5e0df391114436fdad6987b
This commit is contained in:
Klaus Post
2020-03-19 00:19:29 +01:00
committed by GitHub
parent 7fdeb44372
commit 8d98662633
61 changed files with 2895 additions and 543 deletions

View File

@@ -37,12 +37,11 @@ import (
"github.com/minio/minio/cmd/config"
xhttp "github.com/minio/minio/cmd/http"
"github.com/minio/minio/cmd/logger"
bucketsse "github.com/minio/minio/pkg/bucket/encryption"
"github.com/minio/minio/pkg/bucket/lifecycle"
"github.com/minio/minio/pkg/bucket/object/tagging"
"github.com/minio/minio/pkg/bucket/policy"
"github.com/minio/minio/pkg/color"
"github.com/minio/minio/pkg/lock"
"github.com/minio/minio/pkg/madmin"
"github.com/minio/minio/pkg/mimedb"
@@ -112,7 +111,7 @@ func initMetaVolumeFS(fsPath, fsUUID string) error {
return err
}
if err := os.MkdirAll(pathJoin(fsPath, minioMetaBackgroundOpsBucket), 0777); err != nil {
if err := os.MkdirAll(pathJoin(fsPath, dataUsageBucket), 0777); err != nil {
return err
}
@@ -235,9 +234,26 @@ func (fs *FSObjects) waitForLowActiveIO() {
}
// CrawlAndGetDataUsage returns data usage stats of the current FS deployment
func (fs *FSObjects) CrawlAndGetDataUsage(ctx context.Context, endCh <-chan struct{}) DataUsageInfo {
dataUsageInfo := updateUsage(fs.fsPath, endCh, fs.waitForLowActiveIO, func(item Item) (int64, error) {
// Get file size, symlinks which cannot bex
func (fs *FSObjects) CrawlAndGetDataUsage(ctx context.Context, updates chan<- DataUsageInfo) error {
// Load bucket totals
var oldCache dataUsageCache
err := oldCache.load(ctx, fs, dataUsageCacheName)
if err != nil {
return err
}
if oldCache.Info.Name == "" {
oldCache.Info.Name = dataUsageRoot
}
if dataUsageDebug {
logger.Info(color.Green("FSObjects.CrawlAndGetDataUsage:") + " Start crawl cycle")
}
buckets, err := fs.ListBuckets(ctx)
if err != nil {
return err
}
t := time.Now()
cache, err := updateUsage(ctx, fs.fsPath, oldCache, fs.waitForLowActiveIO, func(item Item) (int64, error) {
// Get file size, symlinks which cannot be
// followed are automatically filtered by fastwalk.
fi, err := os.Stat(item.Path)
if err != nil {
@@ -245,11 +261,16 @@ func (fs *FSObjects) CrawlAndGetDataUsage(ctx context.Context, endCh <-chan stru
}
return fi.Size(), nil
})
if dataUsageDebug {
logger.Info(color.Green("FSObjects.CrawlAndGetDataUsage:")+" Crawl time: %v", time.Since(t))
}
// Even if there was an error, the new cache may have better info.
if cache.Info.LastUpdate.After(oldCache.Info.LastUpdate) {
logger.LogIf(ctx, cache.save(ctx, fs, dataUsageCacheName))
updates <- cache.dui(dataUsageRoot, buckets)
}
dataUsageInfo.LastUpdate = UTCNow()
atomic.StoreUint64(&fs.totalUsed, dataUsageInfo.ObjectsTotalSize)
return dataUsageInfo
return err
}
/// Bucket operations