fix: refactor background heal for cluster health (#10225)

This commit is contained in:
Harshavardhana 2020-08-07 19:43:06 -07:00 committed by GitHub
parent 8049184dcc
commit 2a9819aff8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 91 additions and 70 deletions

View File

@ -802,8 +802,15 @@ func (a adminAPIHandlers) HealHandler(w http.ResponseWriter, r *http.Request) {
func getAggregatedBackgroundHealState(ctx context.Context, failOnErr bool) (madmin.BgHealState, error) { func getAggregatedBackgroundHealState(ctx context.Context, failOnErr bool) (madmin.BgHealState, error) {
var bgHealStates []madmin.BgHealState var bgHealStates []madmin.BgHealState
localHealState, ok := getLocalBackgroundHealStatus()
if !ok {
if failOnErr {
return madmin.BgHealState{}, errServerNotInitialized
}
}
// Get local heal status first // Get local heal status first
bgHealStates = append(bgHealStates, getLocalBackgroundHealStatus()) bgHealStates = append(bgHealStates, localHealState)
if globalIsDistErasure { if globalIsDistErasure {
// Get heal status from other peers // Get heal status from other peers

View File

@ -112,11 +112,11 @@ func (ahs *allHealState) getHealLocalDisks() []Endpoints {
return healLocalDisks return healLocalDisks
} }
func (ahs *allHealState) updateHealLocalDisks(eps []Endpoints) { func (ahs *allHealState) updateHealLocalDisks(healLocalDisks []Endpoints) {
ahs.Lock() ahs.Lock()
defer ahs.Unlock() defer ahs.Unlock()
ahs.healLocalDisks = eps ahs.healLocalDisks = healLocalDisks
} }
func (ahs *allHealState) periodicHealSeqsClean(ctx context.Context) { func (ahs *allHealState) periodicHealSeqsClean(ctx context.Context) {
@ -502,6 +502,10 @@ func (h *healSequence) isQuitting() bool {
// check if the heal sequence has ended // check if the heal sequence has ended
func (h *healSequence) hasEnded() bool { func (h *healSequence) hasEnded() bool {
h.mutex.RLock() h.mutex.RLock()
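// background heal never ends
// NOTE(review): this early return skips the h.mutex.RUnlock() below, so the read lock taken above is never released on this path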
if h.clientToken == bgHealingUUID {
return false
}
ended := len(h.currentStatus.Items) == 0 || h.currentStatus.Summary == healStoppedStatus || h.currentStatus.Summary == healFinishedStatus ended := len(h.currentStatus.Items) == 0 || h.currentStatus.Summary == healStoppedStatus || h.currentStatus.Summary == healFinishedStatus
h.mutex.RUnlock() h.mutex.RUnlock()
return ended return ended

View File

@ -90,9 +90,6 @@ func (h *healRoutine) run(ctx context.Context, objAPI ObjectLayer) {
case task.bucket == nopHeal: case task.bucket == nopHeal:
continue continue
case task.bucket == SlashSeparator: case task.bucket == SlashSeparator:
// Quickly check if drives need healing upon start-up
globalBackgroundHealState.updateHealLocalDisks(getLocalDisksToHeal(objAPI))
res, err = healDiskFormat(ctx, objAPI, task.opts) res, err = healDiskFormat(ctx, objAPI, task.opts)
case task.bucket != "" && task.object == "": case task.bucket != "" && task.object == "":
res, err = objAPI.HealBucket(ctx, task.bucket, task.opts.DryRun, task.opts.Remove) res, err = objAPI.HealBucket(ctx, task.bucket, task.opts.DryRun, task.opts.Remove)
@ -119,24 +116,6 @@ func newHealRoutine() *healRoutine {
} }
func initBackgroundHealing(ctx context.Context, objAPI ObjectLayer) {
// Run the background healer
globalBackgroundHealRoutine = newHealRoutine()
go globalBackgroundHealRoutine.run(ctx, objAPI)
nh := newBgHealSequence()
// Heal any disk format and metadata early, if possible.
if err := nh.healDiskMeta(); err != nil {
if newObjectLayerFn() != nil {
// log only in situations, when object layer
// has fully initialized.
logger.LogIf(nh.ctx, err)
}
}
globalBackgroundHealState.LaunchNewHealSequence(nh)
}
// healDiskFormat - heals format.json, return value indicates if a // healDiskFormat - heals format.json, return value indicates if a
// failure error occurred. // failure error occurred.
func healDiskFormat(ctx context.Context, objAPI ObjectLayer, opts madmin.HealOpts) (madmin.HealResultItem, error) { func healDiskFormat(ctx context.Context, objAPI ObjectLayer, opts madmin.HealOpts) (madmin.HealResultItem, error) {

View File

@ -27,8 +27,46 @@ import (
const defaultMonitorNewDiskInterval = time.Minute * 3 const defaultMonitorNewDiskInterval = time.Minute * 3
func initLocalDisksAutoHeal(ctx context.Context, objAPI ObjectLayer) { func initAutoHeal(ctx context.Context, objAPI ObjectLayer) {
go monitorLocalDisksAndHeal(ctx, objAPI) z, ok := objAPI.(*erasureZones)
if !ok {
return
}
initBackgroundHealing(ctx, objAPI) // start quick background healing
localDisksInZoneHeal := getLocalDisksToHeal(objAPI)
globalBackgroundHealState.updateHealLocalDisks(localDisksInZoneHeal)
drivesToHeal := getDrivesToHealCount(localDisksInZoneHeal)
if drivesToHeal != 0 {
logger.Info(fmt.Sprintf("Found drives to heal %d, waiting until %s to heal the content...",
drivesToHeal, defaultMonitorNewDiskInterval))
}
var bgSeq *healSequence
var found bool
for {
bgSeq, found = globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID)
if found {
break
}
time.Sleep(time.Second)
}
if drivesToHeal != 0 {
// Heal any disk format and metadata early, if possible.
if err := bgSeq.healDiskMeta(); err != nil {
if newObjectLayerFn() != nil {
// log only once the object layer
// has fully initialized.
logger.LogIf(bgSeq.ctx, err)
}
}
}
go monitorLocalDisksAndHeal(ctx, z, drivesToHeal, localDisksInZoneHeal, bgSeq)
} }
func getLocalDisksToHeal(objAPI ObjectLayer) []Endpoints { func getLocalDisksToHeal(objAPI ObjectLayer) []Endpoints {
@ -71,36 +109,18 @@ func getDrivesToHealCount(localDisksInZoneHeal []Endpoints) int {
return drivesToHeal return drivesToHeal
} }
func initBackgroundHealing(ctx context.Context, objAPI ObjectLayer) {
// Run the background healer
globalBackgroundHealRoutine = newHealRoutine()
go globalBackgroundHealRoutine.run(ctx, objAPI)
globalBackgroundHealState.LaunchNewHealSequence(newBgHealSequence())
}
// monitorLocalDisksAndHeal - ensures that detected new disks are healed // monitorLocalDisksAndHeal - ensures that detected new disks are healed
// 1. Only the concerned erasure set will be listed and healed // 1. Only the concerned erasure set will be listed and healed
// 2. Only the node hosting the disk is responsible to perform the heal // 2. Only the node hosting the disk is responsible to perform the heal
func monitorLocalDisksAndHeal(ctx context.Context, objAPI ObjectLayer) { func monitorLocalDisksAndHeal(ctx context.Context, z *erasureZones, drivesToHeal int, localDisksInZoneHeal []Endpoints, bgSeq *healSequence) {
z, ok := objAPI.(*erasureZones)
if !ok {
return
}
var bgSeq *healSequence
var found bool
for {
bgSeq, found = globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID)
if found {
break
}
time.Sleep(time.Second)
}
localDisksInZoneHeal := globalBackgroundHealState.getHealLocalDisks()
drivesToHeal := getDrivesToHealCount(localDisksInZoneHeal)
if drivesToHeal != 0 {
logger.Info(fmt.Sprintf("Found drives to heal %d, waiting until %s to heal the content...",
drivesToHeal, defaultMonitorNewDiskInterval))
}
firstTime := true
// Perform automatic disk healing when a disk is replaced locally. // Perform automatic disk healing when a disk is replaced locally.
for { for {
select { select {
@ -109,7 +129,6 @@ func monitorLocalDisksAndHeal(ctx context.Context, objAPI ObjectLayer) {
case <-time.After(defaultMonitorNewDiskInterval): case <-time.After(defaultMonitorNewDiskInterval):
// heal only if new disks found. // heal only if new disks found.
if drivesToHeal == 0 { if drivesToHeal == 0 {
firstTime = false
localDisksInZoneHeal = getLocalDisksToHeal(z) localDisksInZoneHeal = getLocalDisksToHeal(z)
drivesToHeal = getDrivesToHealCount(localDisksInZoneHeal) drivesToHeal = getDrivesToHealCount(localDisksInZoneHeal)
if drivesToHeal == 0 { if drivesToHeal == 0 {
@ -118,9 +137,10 @@ func monitorLocalDisksAndHeal(ctx context.Context, objAPI ObjectLayer) {
continue continue
} }
globalBackgroundHealState.updateHealLocalDisks(localDisksInZoneHeal) globalBackgroundHealState.updateHealLocalDisks(localDisksInZoneHeal)
}
if !firstTime { logger.Info(fmt.Sprintf("Found drives to heal %d, proceeding to heal content...",
drivesToHeal))
// Reformat disks // Reformat disks
bgSeq.sourceCh <- healSource{bucket: SlashSeparator} bgSeq.sourceCh <- healSource{bucket: SlashSeparator}

View File

@ -2099,6 +2099,10 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes
} }
} }
if len(aggHealStateResult.HealDisks) > 0 {
logger.LogIf(ctx, fmt.Errorf("Total drives to be healed %d", len(aggHealStateResult.HealDisks)))
}
healthy := len(aggHealStateResult.HealDisks) == 0 healthy := len(aggHealStateResult.HealDisks) == 0
return HealthResult{ return HealthResult{

View File

@ -53,7 +53,9 @@ func newBgHealSequence() *healSequence {
respCh: make(chan healResult), respCh: make(chan healResult),
startTime: UTCNow(), startTime: UTCNow(),
clientToken: bgHealingUUID, clientToken: bgHealingUUID,
settings: hs, // run-background heal with reserved bucket
bucket: minioReservedBucket,
settings: hs,
currentStatus: healSequenceStatus{ currentStatus: healSequenceStatus{
Summary: healNotStartedStatus, Summary: healNotStartedStatus,
HealSettings: hs, HealSettings: hs,
@ -67,10 +69,10 @@ func newBgHealSequence() *healSequence {
} }
} }
func getLocalBackgroundHealStatus() madmin.BgHealState { func getLocalBackgroundHealStatus() (madmin.BgHealState, bool) {
bgSeq, ok := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID) bgSeq, ok := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID)
if !ok { if !ok {
return madmin.BgHealState{} return madmin.BgHealState{}, false
} }
var healDisks []string var healDisks []string
@ -85,7 +87,7 @@ func getLocalBackgroundHealStatus() madmin.BgHealState {
LastHealActivity: bgSeq.lastHealActivity, LastHealActivity: bgSeq.lastHealActivity,
HealDisks: healDisks, HealDisks: healDisks,
NextHealRound: UTCNow().Add(durationToNextHealRound(bgSeq.lastHealActivity)), NextHealRound: UTCNow().Add(durationToNextHealRound(bgSeq.lastHealActivity)),
} }, true
} }
// healErasureSet lists and heals all objects in a specific erasure set // healErasureSet lists and heals all objects in a specific erasure set
@ -172,13 +174,14 @@ func healErasureSet(ctx context.Context, setIndex int, xlObj *erasureObjects, dr
// deepHealObject heals given object path in deep to fix bitrot. // deepHealObject heals given object path in deep to fix bitrot.
func deepHealObject(bucket, object, versionID string) { func deepHealObject(bucket, object, versionID string) {
// Get background heal sequence to send elements to heal // Get background heal sequence to send elements to heal
bgSeq, _ := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID) bgSeq, ok := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID)
if ok {
bgSeq.sourceCh <- healSource{ bgSeq.sourceCh <- healSource{
bucket: bucket, bucket: bucket,
object: object, object: object,
versionID: versionID, versionID: versionID,
opts: &madmin.HealOpts{ScanMode: madmin.HealDeepScan}, opts: &madmin.HealOpts{ScanMode: madmin.HealDeepScan},
}
} }
} }

View File

@ -276,6 +276,7 @@ func (sys *NotificationSys) BackgroundHealStatus() ([]madmin.BgHealState, []Noti
if client == nil { if client == nil {
continue continue
} }
idx := idx
client := client client := client
ng.Go(GlobalContext, func() error { ng.Go(GlobalContext, func() error {
st, err := client.BackgroundHealStatus() st, err := client.BackgroundHealStatus()

View File

@ -976,7 +976,11 @@ func (s *peerRESTServer) BackgroundHealStatusHandler(w http.ResponseWriter, r *h
ctx := newContext(r, w, "BackgroundHealStatus") ctx := newContext(r, w, "BackgroundHealStatus")
state := getLocalBackgroundHealStatus() state, ok := getLocalBackgroundHealStatus()
if !ok {
s.writeErrorResponse(w, errServerNotInitialized)
return
}
defer w.(http.Flusher).Flush() defer w.(http.Flusher).Flush()
logger.LogIf(ctx, gob.NewEncoder(w).Encode(state)) logger.LogIf(ctx, gob.NewEncoder(w).Encode(state))

View File

@ -221,8 +221,7 @@ func initSafeMode(ctx context.Context, newObject ObjectLayer) (err error) {
// Enable healing to heal drives if possible // Enable healing to heal drives if possible
if globalIsErasure { if globalIsErasure {
initBackgroundHealing(ctx, newObject) initAutoHeal(ctx, newObject)
initLocalDisksAutoHeal(ctx, newObject)
} }
// **** WARNING **** // **** WARNING ****