minio/cmd/global-heal.go

191 lines
4.7 KiB
Go

/*
* MinIO Cloud Storage, (C) 2019 MinIO, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cmd
import (
"context"
"fmt"
"sync"
"time"
"github.com/minio/minio/cmd/logger"
"github.com/minio/minio/pkg/madmin"
)
const (
// bgHealingUUID is the client token given to the background heal sequence
// by newBgHealSequence; the same token is used to look the sequence up in
// globalBackgroundHealState.
bgHealingUUID = "0000-0000-0000-0000"
// leaderTick is how long execLeaderTasks sleeps after a failed attempt to
// acquire the leader lock before trying again.
leaderTick = time.Hour
// healInterval is the spacing between background healing rounds (30 days);
// durationToNextHealRound adds it to the time of the last heal.
healInterval = 30 * 24 * time.Hour
)
// leaderLockTimeout is the timeout handed to GetLock when execLeaderTasks
// tries to acquire the leader lock.
var leaderLockTimeout = newDynamicTimeout(time.Minute, time.Minute)
// newBgHealSequence creates a background healing sequence
// operation which crawls all objects and heals them.
//
// numDisks is recorded in the sequence's initial status. The returned
// sequence carries the fixed bgHealingUUID client token, a request context
// tagged with the "BackgroundHeal" API name, unbuffered channels for heal
// sources, completion and stop signalling, and has progress reporting
// switched off.
func newBgHealSequence(numDisks int) *healSequence {
	// Normal scan mode; objects that do not have read-quorum are removed.
	healOpts := madmin.HealOpts{
		ScanMode: madmin.HealNormalScan,
		Remove:   true,
	}

	// The sequence starts out in the "not started" state.
	initialStatus := healSequenceStatus{
		Summary:      healNotStartedStatus,
		HealSettings: healOpts,
		NumDisks:     numDisks,
		updateLock:   &sync.RWMutex{},
	}

	seq := &healSequence{
		sourceCh:              make(chan string),
		startTime:             UTCNow(),
		clientToken:           bgHealingUUID,
		settings:              healOpts,
		currentStatus:         initialStatus,
		traverseAndHealDoneCh: make(chan error),
		stopSignalCh:          make(chan struct{}),
		ctx: logger.SetReqInfo(
			context.Background(),
			&logger.ReqInfo{API: "BackgroundHeal"},
		),
		reportProgress: false,
	}
	return seq
}
// getLocalBackgroundHealStatus reports the state of the background heal
// sequence registered under bgHealingUUID: the number of scanned items, the
// time of the last heal activity and the time the next heal round is due.
// When no such sequence is registered the zero madmin.BgHealState is returned.
func getLocalBackgroundHealStatus() madmin.BgHealState {
	var state madmin.BgHealState

	seq, found := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID)
	if found {
		state.ScannedItemsCount = seq.scannedItemsCount
		state.LastHealActivity = seq.lastHealActivity
		// The next round is expressed as an absolute time, measured from now.
		untilNext := durationToNextHealRound(seq.lastHealActivity)
		state.NextHealRound = UTCNow().Add(untilNext)
	}
	return state
}
// healErasureSet lists and heals all objects in a specific erasure set.
//
// It lists the buckets visible through xlObj, waits for the background heal
// sequence (registered under bgHealingUUID) to become available and then
// queues every bucket name, followed by every entry produced by a recursive
// tree walk of that bucket, on the sequence's source channel.
//
// setIndex is currently unused; it is kept so the signature stays stable for
// callers.
//
// Both the wait for the heal sequence and every send on the source channel
// honour ctx: if ctx is cancelled the function stops and returns ctx.Err().
// With a context that is never cancelled the behaviour is unchanged. An error
// from ListBuckets is returned as is.
func healErasureSet(ctx context.Context, setIndex int, xlObj *xlObjects) error {
	buckets, err := xlObj.ListBuckets(ctx)
	if err != nil {
		return err
	}

	// Get background heal sequence to send elements to heal. It may not be
	// registered yet, so poll once a second until it shows up or ctx ends.
	var bgSeq *healSequence
	for {
		seq, ok := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID)
		if ok {
			bgSeq = seq
			break
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(time.Second):
		}
	}

	// enqueue hands path to the heal sequence. The source channel send blocks
	// until a receiver is ready, so it is raced against ctx cancellation to
	// avoid blocking forever.
	enqueue := func(path string) error {
		select {
		case bgSeq.sourceCh <- path:
			return nil
		case <-ctx.Done():
			return ctx.Err()
		}
	}

	// healBucket queues the bucket itself and then every object found in it.
	healBucket := func(bucket string) error {
		// Heal current bucket
		if err := enqueue(bucket); err != nil {
			return err
		}

		// NOTE(review): assumes startTreeWalk stops producing results once
		// its end-walk channel is closed, so that an early return here does
		// not leave the walker blocked on walkResultCh — confirm.
		endWalkCh := make(chan struct{})
		defer close(endWalkCh)

		// List all objects in the current bucket and heal them
		listDir := listDirFactory(ctx, xlObj.getLoadBalancedDisks()...)
		walkResultCh := startTreeWalk(ctx, bucket, "", "", true, listDir, endWalkCh)
		for walkEntry := range walkResultCh {
			if err := enqueue(pathJoin(bucket, walkEntry.entry)); err != nil {
				return err
			}
		}
		return nil
	}

	// Heal all buckets with all objects
	for _, bucket := range buckets {
		if err := healBucket(bucket.Name); err != nil {
			return err
		}
	}
	return nil
}
// durationToNextHealRound returns how long to wait before the next background
// healing round, given the time of the last heal.
//
// The next round is due healInterval after lastHeal; a zero lastHeal is
// replaced by globalBootTime. If that point is already in the past, one
// second is returned instead of a negative duration.
func durationToNextHealRound(lastHeal time.Time) time.Duration {
	reference := lastHeal
	if reference.IsZero() {
		reference = globalBootTime
	}

	nextRound := reference.Add(healInterval)
	if remaining := nextRound.Sub(UTCNow()); remaining >= 0 {
		return remaining
	}
	// Overdue: schedule the round almost immediately.
	return time.Second
}
// execLeaderTasks makes the calling server the healing leader, in charge of
// healing all erasure sets of z.
//
// It blocks until the "leader" lock is acquired, retrying every leaderTick,
// and then loops forever: sleep until the next heal round is due, then walk
// every zone and heal its sets one by one, each under its own lock. Errors
// from locking or healing a set are logged and the set is skipped for that
// round. The function never returns and the leader lock is never released.
func execLeaderTasks(z *xlZones) {
	ctx := context.Background()

	// Hold a lock so only one server performs auto-healing
	leaderLock := z.NewNSLock(ctx, minioMetaBucket, "leader")
	for leaderLock.GetLock(leaderLockTimeout) != nil {
		time.Sleep(leaderTick)
	}

	// Timeout used when taking the per-set heal lock: one millisecond.
	const setLockWait = time.Millisecond
	setLockTimeout := newDynamicTimeout(setLockWait, setLockWait)

	// The clock starts now, so that the first round is not run immediately
	// but one heal interval from now; it is restarted after every round.
	for lastScanTime := time.Now(); ; lastScanTime = time.Now() {
		time.Sleep(durationToNextHealRound(lastScanTime))

		for _, zone := range z.zones {
			// Heal set by set
			for i, set := range zone.sets {
				// NOTE(review): the lock is always created through
				// z.zones[0] and named after the set index only, so sets
				// with the same index in different zones share a lock
				// name — confirm this is intended.
				lockName := fmt.Sprintf("erasure-set-heal-%d", i)
				setLock := z.zones[0].NewNSLock(ctx, "system", lockName)
				if lockErr := setLock.GetLock(setLockTimeout); lockErr != nil {
					logger.LogIf(ctx, lockErr)
					continue
				}

				// Release the set lock before looking at the outcome.
				healErr := healErasureSet(ctx, i, set)
				setLock.Unlock()
				if healErr != nil {
					logger.LogIf(ctx, healErr)
				}
			}
		}
	}
}
// startGlobalHeal waits for the object layer to become available, polling
// once a second, and then runs the healing leader tasks on it. If the object
// layer is not an *xlZones it returns without doing anything. Otherwise the
// call does not return, because execLeaderTasks loops forever.
func startGlobalHeal() {
	var objAPI ObjectLayer
	for objAPI = newObjectLayerWithoutSafeModeFn(); objAPI == nil; objAPI = newObjectLayerWithoutSafeModeFn() {
		time.Sleep(time.Second)
	}

	if zones, isZones := objAPI.(*xlZones); isZones {
		execLeaderTasks(zones)
	}
}
// initGlobalHeal launches startGlobalHeal in its own goroutine and returns
// immediately.
func initGlobalHeal() {
go startGlobalHeal()
}