heal: Enable periodic bitrot scan configuration (#14464)

This commit is contained in:
Anis Elleuch
2022-04-07 16:10:40 +01:00
committed by GitHub
parent ee49a23220
commit 16431d222c
20 changed files with 192 additions and 50 deletions

View File

@@ -21,6 +21,7 @@ import (
"bytes"
"context"
"encoding/binary"
"encoding/json"
"errors"
"fmt"
"io/fs"
@@ -103,6 +104,63 @@ func (s *safeDuration) Get() time.Duration {
return s.t
}
func getCycleScanMode(currentCycle, bitrotStartCycle uint64, bitrotStartTime time.Time) madmin.HealScanMode {
bitrotCycle := globalHealConfig.BitrotScanCycle()
switch bitrotCycle {
case -1:
return madmin.HealNormalScan
case 0:
return madmin.HealDeepScan
}
if currentCycle-bitrotStartCycle < healObjectSelectProb {
return madmin.HealDeepScan
}
if time.Since(bitrotStartTime) > bitrotCycle {
return madmin.HealDeepScan
}
return madmin.HealNormalScan
}
type backgroundHealInfo struct {
BitrotStartTime time.Time `json:"bitrotStartTime"`
BitrotStartCycle uint64 `json:"bitrotStartCycle"`
CurrentScanMode madmin.HealScanMode `json:"currentScanMode"`
}
func readBackgroundHealInfo(ctx context.Context, objAPI ObjectLayer) backgroundHealInfo {
// Get last healing information
buf, err := readConfig(ctx, objAPI, backgroundHealInfoPath)
if err != nil {
if !errors.Is(err, errConfigNotFound) {
logger.LogIf(ctx, err)
}
return backgroundHealInfo{}
}
var info backgroundHealInfo
err = json.Unmarshal(buf, &info)
if err != nil {
logger.LogIf(ctx, err)
return backgroundHealInfo{}
}
return info
}
func saveBackgroundHealInfo(ctx context.Context, objAPI ObjectLayer, info backgroundHealInfo) {
b, err := json.Marshal(info)
if err != nil {
logger.LogIf(ctx, err)
return
}
// Get last healing information
err = saveConfig(ctx, objAPI, backgroundHealInfoPath, b)
if err != nil {
logger.LogIf(ctx, err)
}
}
// runDataScanner will start a data scanner.
// The function will block until the context is canceled.
// There should only ever be one scanner running per cluster.
@@ -145,12 +203,24 @@ func runDataScanner(pctx context.Context, objAPI ObjectLayer) {
console.Debugln("starting scanner cycle")
}
bgHealInfo := readBackgroundHealInfo(ctx, objAPI)
scanMode := getCycleScanMode(nextBloomCycle, bgHealInfo.BitrotStartCycle, bgHealInfo.BitrotStartTime)
if bgHealInfo.CurrentScanMode != scanMode {
newHealInfo := bgHealInfo
newHealInfo.CurrentScanMode = scanMode
if scanMode == madmin.HealDeepScan {
newHealInfo.BitrotStartTime = time.Now().UTC()
newHealInfo.BitrotStartCycle = nextBloomCycle
}
saveBackgroundHealInfo(ctx, objAPI, newHealInfo)
}
// Wait before starting next cycle and wait on startup.
results := make(chan DataUsageInfo, 1)
go storeDataUsageInBackend(ctx, objAPI, results)
bf, err := globalNotificationSys.updateBloomFilter(ctx, nextBloomCycle)
logger.LogIf(ctx, err)
err = objAPI.NSScanner(ctx, bf, results, uint32(nextBloomCycle))
err = objAPI.NSScanner(ctx, bf, results, uint32(nextBloomCycle), scanMode)
logger.LogIf(ctx, err)
if err == nil {
// Store new cycle...
@@ -182,6 +252,7 @@ type folderScanner struct {
dataUsageScannerDebug bool
healFolderInclude uint32 // Include a clean folder one in n cycles.
healObjectSelect uint32 // Do a heal check on an object once every n cycles. Must divide into healFolderInclude
scanMode madmin.HealScanMode
disks []StorageAPI
disksQuorum int
@@ -250,7 +321,7 @@ var globalScannerStats scannerStats
// The returned cache will always be valid, but may not be updated from the existing.
// Before each operation sleepDuration is called which can be used to temporarily halt the scanner.
// If the supplied context is canceled the function will return at the first chance.
func scanDataFolder(ctx context.Context, poolIdx, setIdx int, basePath string, cache dataUsageCache, getSize getSizeFn) (dataUsageCache, error) {
func scanDataFolder(ctx context.Context, poolIdx, setIdx int, basePath string, cache dataUsageCache, getSize getSizeFn, scanMode madmin.HealScanMode) (dataUsageCache, error) {
t := UTCNow()
logPrefix := color.Green("data-usage: ")
@@ -279,6 +350,7 @@ func scanDataFolder(ctx context.Context, poolIdx, setIdx int, basePath string, c
dataUsageScannerDebug: intDataUpdateTracker.debug,
healFolderInclude: 0,
healObjectSelect: 0,
scanMode: scanMode,
updates: cache.Info.updates,
}
@@ -482,12 +554,15 @@ func (f *folderScanner) scanFolder(ctx context.Context, folder cachedFolder, int
debug: f.dataUsageScannerDebug,
lifeCycle: activeLifeCycle,
replication: replicationCfg,
heal: thisHash.modAlt(f.oldCache.Info.NextCycle/folder.objectHealProbDiv, f.healObjectSelect/folder.objectHealProbDiv) && globalIsErasure,
}
item.heal.enabled = thisHash.modAlt(f.oldCache.Info.NextCycle/folder.objectHealProbDiv, f.healObjectSelect/folder.objectHealProbDiv) && globalIsErasure
item.heal.bitrot = f.scanMode == madmin.HealDeepScan
// if the drive belongs to an erasure set
// that is already being healed, skip the
// healing attempt on this drive.
item.heal = item.heal && f.healObjectSelect > 0
item.heal.enabled = item.heal.enabled && f.healObjectSelect > 0
sz, err := f.getSize(item)
if err != nil {
@@ -821,8 +896,11 @@ type scannerItem struct {
replication replicationConfig
lifeCycle *lifecycle.Lifecycle
Typ fs.FileMode
heal bool // Has the object been selected for heal check?
debug bool
heal struct {
enabled bool
bitrot bool
} // Has the object been selected for heal check?
debug bool
}
type sizeSummary struct {
@@ -874,9 +952,13 @@ func (i *scannerItem) applyHealing(ctx context.Context, o ObjectLayer, oi Object
console.Debugf(applyActionsLogPrefix+" heal checking: %v/%v\n", i.bucket, i.objectPath())
}
}
scanMode := madmin.HealNormalScan
if i.heal.bitrot {
scanMode = madmin.HealDeepScan
}
healOpts := madmin.HealOpts{
Remove: healDeleteDangling,
ScanMode: globalHealConfig.ScanMode(),
ScanMode: scanMode,
}
res, err := o.HealObject(ctx, i.bucket, i.objectPath(), oi.VersionID, healOpts)
if err != nil && !errors.Is(err, NotImplemented{}) {
@@ -1040,7 +1122,7 @@ func (i *scannerItem) applyActions(ctx context.Context, o ObjectLayer, oi Object
// from the current deployment, which means we don't have to call healing
// routine even if we are asked to do via heal flag.
if !applied {
if i.heal {
if i.heal.enabled {
size = i.applyHealing(ctx, o, oi)
}
// replicate only if lifecycle rules are not applied.