add backups for usage-caches to fall back to on error (#18029)

This allows the scanner to avoid lengthy rescans, skip entries
appropriately, and avoid losing metrics when the primary usage
cache cannot be read.

Also reduce the overly long deadlines for usage-cache loads/saves
to match the disk timeout, which is now 2 minutes per IOP.
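
In outline, the read side follows the sketch below. This is illustrative only, not the code from this commit; loadWithBackup and readCache are hypothetical stand-ins for the real GetObjectNInfo + deserialize path in dataUsageCache.load further down.

package usagecache

import (
    "context"
    "time"
)

// loadWithBackup is a hypothetical helper showing the shape of the change:
// read the primary usage-cache object under a bounded deadline, then fall
// back to its ".bkp" copy under a shorter one.
func loadWithBackup(ctx context.Context, name string,
    readCache func(context.Context, string) ([]byte, error),
) ([]byte, error) {
    // Drive IOPs time out after ~2 minutes, so waiting longer than a minute
    // here only stalls the scanner.
    pctx, cancel := context.WithTimeout(ctx, time.Minute)
    defer cancel()
    if buf, err := readCache(pctx, name); err == nil {
        return buf, nil
    }

    // Primary unreadable: try the last backup with an even shorter deadline
    // so metrics are not lost to a corrupted or missing primary object.
    bctx, bcancel := context.WithTimeout(ctx, 30*time.Second)
    defer bcancel()
    return readCache(bctx, name+".bkp")
}

Writes mirror this: each save also refreshes the ".bkp" object (see the deferred save(name+".bkp", ...) in the diff below), so the fallback read has a recent copy to work from.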
Authored by Harshavardhana on 2023-09-14 11:53:52 -07:00, committed by GitHub
parent 822cbd4b43
commit a2aabfabd9
3 changed files with 87 additions and 55 deletions

File 1 of 3

@@ -1501,9 +1501,18 @@ func replicateObjectWithMultipart(ctx context.Context, c *minio.Core, bucket, ob
     var uploadedParts []minio.CompletePart
     // new multipart must not set mtime as it may lead to erroneous cleanups at various intervals.
     opts.Internal.SourceMTime = time.Time{} // this value is saved properly in CompleteMultipartUpload()
-    nctx, cancel := context.WithTimeout(ctx, 5*time.Minute)
-    defer cancel()
-    uploadID, err := c.NewMultipartUpload(nctx, bucket, object, opts)
+    var uploadID string
+    attempts := 1
+    for attempts <= 3 {
+        nctx, cancel := context.WithTimeout(ctx, time.Minute)
+        uploadID, err = c.NewMultipartUpload(nctx, bucket, object, opts)
+        cancel()
+        if err == nil {
+            break
+        }
+        attempts++
+        time.Sleep(time.Duration(rand.Int63n(int64(time.Second))))
+    }
     if err != nil {
         return err
     }

@@ -1524,7 +1533,7 @@ func replicateObjectWithMultipart(ctx context.Context, c *minio.Core, bucket, ob
                 fmt.Errorf("trying %s: Unable to cleanup failed multipart replication %s on remote %s/%s: %w - this may consume space on remote cluster",
                     humanize.Ordinal(attempts), uploadID, bucket, object, aerr))
                 attempts++
-                time.Sleep(time.Second)
+                time.Sleep(time.Duration(rand.Int63n(int64(time.Second))))
             }
         }
     }()

File 2 of 3

@@ -38,6 +38,7 @@ import (
     "github.com/minio/minio/internal/hash"
     "github.com/minio/minio/internal/logger"
     "github.com/tinylib/msgp/msgp"
+    "github.com/valyala/bytebufferpool"
 )

 //go:generate msgp -file $GOFILE -unexported
@@ -927,34 +928,42 @@ type objectIO interface {
 // The loader is optimistic and has no locking, but tries 5 times before giving up.
 // If the object is not found or unable to deserialize d is cleared and nil error is returned.
 func (d *dataUsageCache) load(ctx context.Context, store objectIO, name string) error {
-    // Abandon if more than 5 minutes, so we don't hold up scanner.
-    ctx, cancel := context.WithTimeout(ctx, 5*time.Minute)
+    load := func(name string, timeout time.Duration) (bool, error) {
+        // Abandon if more than time.Minute, so we don't hold up scanner.
+        // drive timeout by default is 2 minutes, we do not need to wait longer.
+        ctx, cancel := context.WithTimeout(ctx, timeout)
         defer cancel()

-    // Caches are read+written without locks,
-    retries := 0
-    for retries < 5 {
         r, err := store.GetObjectNInfo(ctx, dataUsageBucket, name, nil, http.Header{}, ObjectOptions{NoLock: true})
         if err != nil {
             switch err.(type) {
             case ObjectNotFound, BucketNotFound:
             case InsufficientReadQuorum, StorageErr:
-                retries++
-                time.Sleep(time.Duration(rand.Int63n(int64(time.Second))))
-                continue
-            default:
-                return toObjectErr(err, dataUsageBucket, name)
+                return true, nil
             }
-            *d = dataUsageCache{}
-            return nil
+            return false, toObjectErr(err, dataUsageBucket, name)
         }
-        if err := d.deserialize(r); err != nil {
-            r.Close()
+        err = d.deserialize(r)
+        r.Close()
+        return err != nil, nil
+    }
+
+    // Caches are read+written without locks,
+    retries := 0
+    for retries < 5 {
+        retry, err := load(name, time.Minute)
+        if err != nil {
+            return err
+        }
+        if retry {
+            retry, _ = load(name+".bkp", 30*time.Second)
+            if !retry {
+                break
+            }
             retries++
             time.Sleep(time.Duration(rand.Int63n(int64(time.Second))))
             continue
         }
-        r.Close()
         return nil
     }
     *d = dataUsageCache{}
@@ -967,38 +976,38 @@ var maxConcurrentScannerSaves = make(chan struct{}, 4)
 // save the content of the cache to minioMetaBackgroundOpsBucket with the provided name.
 // Note that no locking is done when saving.
 func (d *dataUsageCache) save(ctx context.Context, store objectIO, name string) error {
-    var r io.Reader
-    maxConcurrentScannerSaves <- struct{}{}
+    select {
+    case <-ctx.Done():
+        return ctx.Err()
+    case maxConcurrentScannerSaves <- struct{}{}:
+    }
     defer func() {
-        <-maxConcurrentScannerSaves
+        select {
+        case <-ctx.Done():
+        case <-maxConcurrentScannerSaves:
+        }
     }()

-    // If big, do streaming...
-    size := int64(-1)
-    if len(d.Cache) > 10000 {
-        pr, pw := io.Pipe()
-        go func() {
-            pw.CloseWithError(d.serializeTo(pw))
-        }()
-        defer pr.Close()
-        r = pr
-    } else {
-        var buf bytes.Buffer
-        err := d.serializeTo(&buf)
-        if err != nil {
-            return err
-        }
-        r = &buf
-        size = int64(buf.Len())
-    }
+    buf := bytebufferpool.Get()
+    defer func() {
+        buf.Reset()
+        bytebufferpool.Put(buf)
+    }()
+
+    if err := d.serializeTo(buf); err != nil {
+        return err
+    }

-    hr, err := hash.NewReader(r, size, "", "", size)
+    hr, err := hash.NewReader(bytes.NewReader(buf.Bytes()), int64(buf.Len()), "", "", int64(buf.Len()))
     if err != nil {
         return err
     }

-    // Abandon if more than 5 minutes, so we don't hold up scanner.
-    ctx, cancel := context.WithTimeout(ctx, 5*time.Minute)
+    save := func(name string, timeout time.Duration) error {
+        // Abandon if more than a minute, so we don't hold up scanner.
+        // drive timeout by default is 2 minutes, we do not need to wait longer.
+        ctx, cancel := context.WithTimeout(ctx, timeout)
         defer cancel()

         _, err = store.PutObject(ctx,
             dataUsageBucket,
             name,

@@ -1009,6 +1018,11 @@ func (d *dataUsageCache) save(ctx context.Context, store objectIO, name string)
         }
         return err
     }
+    defer save(name+".bkp", 30*time.Second) // Keep a backup as well
+
+    // drive timeout by default is 2 minutes, we do not need to wait longer.
+    return save(name, time.Minute)
+}

 // dataUsageCacheVer indicates the cache version.
 // Bumping the cache version will drop data from previous versions

File 3 of 3

@@ -42,6 +42,7 @@ const (

 // storeDataUsageInBackend will store all objects sent on the gui channel until closed.
 func storeDataUsageInBackend(ctx context.Context, objAPI ObjectLayer, dui <-chan DataUsageInfo) {
+    attempts := 1
     for dataUsageInfo := range dui {
         json := jsoniter.ConfigCompatibleWithStandardLibrary
         dataUsageJSON, err := json.Marshal(dataUsageInfo)

@@ -49,9 +50,14 @@ func storeDataUsageInBackend(ctx context.Context, objAPI ObjectLayer, dui <-chan DataUsageInfo) {
             logger.LogIf(ctx, err)
             continue
         }
+        if attempts > 10 {
+            saveConfig(ctx, objAPI, dataUsageObjNamePath+".bkp", dataUsageJSON) // Save a backup every 10th update.
+            attempts = 1
+        }
+
         if err = saveConfig(ctx, objAPI, dataUsageObjNamePath, dataUsageJSON); err != nil {
             logger.LogIf(ctx, err)
         }
+        attempts++
     }
 }
@@ -93,12 +99,15 @@ func loadPrefixUsageFromBackend(ctx context.Context, objAPI ObjectLayer, bucket
 func loadDataUsageFromBackend(ctx context.Context, objAPI ObjectLayer) (DataUsageInfo, error) {
     buf, err := readConfig(ctx, objAPI, dataUsageObjNamePath)
+    if err != nil {
+        buf, err = readConfig(ctx, objAPI, dataUsageObjNamePath+".bkp")
         if err != nil {
             if errors.Is(err, errConfigNotFound) {
                 return DataUsageInfo{}, nil
             }
             return DataUsageInfo{}, toObjectErr(err, minioMetaBucket, dataUsageObjNamePath)
         }
+    }

     var dataUsageInfo DataUsageInfo
     json := jsoniter.ConfigCompatibleWithStandardLibrary