re-implement data usage crawler to be more efficient (#9075)

Implementation overview: 

https://gist.github.com/klauspost/1801c858d5e0df391114436fdad6987b
This commit is contained in:
Klaus Post
2020-03-19 00:19:29 +01:00
committed by GitHub
parent 7fdeb44372
commit 8d98662633
61 changed files with 2895 additions and 543 deletions

View File

@@ -18,11 +18,15 @@ package cmd
import (
"context"
"fmt"
"path"
"sort"
"sync"
"time"
"github.com/minio/minio/cmd/logger"
"github.com/minio/minio/pkg/bpool"
"github.com/minio/minio/pkg/color"
"github.com/minio/minio/pkg/dsync"
"github.com/minio/minio/pkg/madmin"
"github.com/minio/minio/pkg/sync/errgroup"
@@ -195,45 +199,177 @@ func (xl xlObjects) GetMetrics(ctx context.Context) (*Metrics, error) {
return &Metrics{}, NotImplemented{}
}
// CrawlAndGetDataUsage picks three random disks to crawl and get data usage
func (xl xlObjects) CrawlAndGetDataUsage(ctx context.Context, endCh <-chan struct{}) DataUsageInfo {
var randomDisks []StorageAPI
// CrawlAndGetDataUsage will start crawling buckets and send updated totals as they are traversed.
// Updates are sent on a regular basis and the caller *must* consume them.
func (xl xlObjects) CrawlAndGetDataUsage(ctx context.Context, updates chan<- DataUsageInfo) error {
cache := make(chan dataUsageCache, 1)
defer close(cache)
buckets, err := xl.ListBuckets(ctx)
if err != nil {
return err
}
go func() {
for update := range cache {
updates <- update.dui(update.Info.Name, buckets)
}
}()
return xl.crawlAndGetDataUsage(ctx, buckets, cache)
}
// CrawlAndGetDataUsage will start crawling buckets and send updated totals as they are traversed.
// Updates are sent on a regular basis and the caller *must* consume them.
func (xl xlObjects) crawlAndGetDataUsage(ctx context.Context, buckets []BucketInfo, updates chan<- dataUsageCache) error {
var disks []StorageAPI
for _, d := range xl.getLoadBalancedDisks() {
if d == nil || !d.IsOnline() {
continue
}
randomDisks = append(randomDisks, d)
if len(randomDisks) >= 3 {
break
disks = append(disks, d)
}
if len(disks) == 0 || len(buckets) == 0 {
return nil
}
// Load bucket totals
oldCache := dataUsageCache{}
err := oldCache.load(ctx, xl, dataUsageCacheName)
if err != nil {
return err
}
// New cache..
cache := dataUsageCache{
Info: dataUsageCacheInfo{
Name: dataUsageRoot,
NextCycle: oldCache.Info.NextCycle,
},
Cache: make(map[dataUsageHash]dataUsageEntry, len(oldCache.Cache)),
}
// Put all buckets into channel.
bucketCh := make(chan BucketInfo, len(buckets))
// Add new buckets first
for _, b := range buckets {
if oldCache.find(b.Name) == nil {
bucketCh <- b
}
}
// Add existing buckets.
for _, b := range buckets {
e := oldCache.find(b.Name)
if e != nil {
bucketCh <- b
cache.replace(b.Name, dataUsageRoot, *e)
}
}
var dataUsageResults = make([]DataUsageInfo, len(randomDisks))
close(bucketCh)
buckets = nil
bucketResults := make(chan dataUsageEntryInfo, len(disks))
var wg sync.WaitGroup
for i := 0; i < len(randomDisks); i++ {
wg.Add(1)
go func(index int, disk StorageAPI) {
defer wg.Done()
var err error
dataUsageResults[index], err = disk.CrawlAndGetDataUsage(endCh)
if err != nil {
logger.LogIf(ctx, err)
// Start async collector/saver.
// This goroutine owns the cache.
var saverWg sync.WaitGroup
saverWg.Add(1)
go func() {
const updateTime = 30 * time.Second
t := time.NewTicker(updateTime)
defer t.Stop()
defer saverWg.Done()
var lastSave time.Time
saveLoop:
for {
select {
case <-ctx.Done():
// Return without saving.
return
case <-t.C:
if cache.Info.LastUpdate.Equal(lastSave) {
continue
}
logger.LogIf(ctx, cache.save(ctx, xl, dataUsageCacheName))
updates <- cache.clone()
lastSave = cache.Info.LastUpdate
case v, ok := <-bucketResults:
if !ok {
break saveLoop
}
cache.replace(v.Name, v.Parent, v.Entry)
cache.Info.LastUpdate = time.Now()
}
}(i, randomDisks[i])
}
// Save final state...
cache.Info.NextCycle++
cache.Info.LastUpdate = time.Now()
logger.LogIf(ctx, cache.save(ctx, xl, dataUsageCacheName))
updates <- cache
}()
// Start one crawler per disk
var wg sync.WaitGroup
wg.Add(len(disks))
for i := range disks {
go func(i int) {
defer wg.Done()
disk := disks[i]
for bucket := range bucketCh {
select {
case <-ctx.Done():
return
default:
}
if dataUsageDebug {
logger.Info(color.Green("crawlAndGetDataUsage:")+" Scanning bucket %v.", bucket.Name)
}
// Load cache for bucket
cacheName := path.Join(dataUsageBucketCacheDir, bucket.Name+".bin")
cache := dataUsageCache{}
logger.LogIf(ctx, cache.load(ctx, xl, cacheName))
if cache.Info.Name == "" {
cache.Info.Name = bucket.Name
}
if cache.Info.Name != bucket.Name {
logger.LogIf(ctx, fmt.Errorf("cache name mismatch: %s != %s", cache.Info.Name, bucket.Name))
cache.Info = dataUsageCacheInfo{
Name: bucket.Name,
LastUpdate: time.Time{},
NextCycle: 0,
}
}
// Calc usage
before := cache.Info.LastUpdate
cache, err = disk.CrawlAndGetDataUsage(ctx, cache)
if err != nil {
logger.LogIf(ctx, err)
if cache.Info.LastUpdate.After(before) {
logger.LogIf(ctx, cache.save(ctx, xl, cacheName))
}
continue
}
var root dataUsageEntry
if r := cache.root(); r != nil {
root = cache.flatten(*r)
}
bucketResults <- dataUsageEntryInfo{
Name: cache.Info.Name,
Parent: dataUsageRoot,
Entry: root,
}
// Save cache
logger.LogIf(ctx, cache.save(ctx, xl, cacheName))
}
}(i)
}
wg.Wait()
close(bucketResults)
saverWg.Wait()
var dataUsageInfo = dataUsageResults[0]
// Pick the crawling result of the disk which has the most
// number of objects in it.
for i := 1; i < len(dataUsageResults); i++ {
if dataUsageResults[i].ObjectsCount > dataUsageInfo.ObjectsCount {
dataUsageInfo = dataUsageResults[i]
}
}
return dataUsageInfo
return nil
}
// IsReady - No Op.