add cluster maintenance healthcheck drive heal affinity (#10218)

This commit is contained in:
Harshavardhana 2020-08-07 13:22:53 -07:00 committed by GitHub
parent 19c4f3082b
commit 6c6137b2e7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 194 additions and 96 deletions

View File

@ -799,6 +799,52 @@ func (a adminAPIHandlers) HealHandler(w http.ResponseWriter, r *http.Request) {
keepConnLive(w, r, respCh) keepConnLive(w, r, respCh)
} }
// getAggregatedBackgroundHealState returns the background heal status of the
// whole cluster by merging the local node's status with the statuses reported
// by all remote peers (distributed erasure mode only).
//
// When failOnErr is true, the first peer error aborts aggregation and is
// returned to the caller; otherwise peer errors are only logged and the
// remaining states are still merged.
func getAggregatedBackgroundHealState(ctx context.Context, failOnErr bool) (madmin.BgHealState, error) {
	// The local status is always present, so states[0] below is guaranteed
	// to exist.
	states := []madmin.BgHealState{getLocalBackgroundHealStatus()}

	if globalIsDistErasure {
		// Collect heal status from every remote peer.
		peerStates, nerrs := globalNotificationSys.BackgroundHealStatus()
		for _, nerr := range nerrs {
			if nerr.Err == nil {
				continue
			}
			if failOnErr {
				return madmin.BgHealState{}, nerr.Err
			}
			logger.LogIf(ctx, nerr.Err)
		}
		states = append(states, peerStates...)
	}

	// Seed the aggregate with the local state, then fold in the peers.
	aggregated := madmin.BgHealState{
		ScannedItemsCount: states[0].ScannedItemsCount,
		LastHealActivity:  states[0].LastHealActivity,
		NextHealRound:     states[0].NextHealRound,
		HealDisks:         states[0].HealDisks,
	}
	for _, state := range states[1:] {
		aggregated.ScannedItemsCount += state.ScannedItemsCount
		aggregated.HealDisks = append(aggregated.HealDisks, state.HealDisks...)
		if !state.LastHealActivity.IsZero() && aggregated.LastHealActivity.Before(state.LastHealActivity) {
			aggregated.LastHealActivity = state.LastHealActivity
			// The node with the most recent heal activity is the one
			// orchestrating self-healing operations, and therefore the
			// one that decides when the next round will run.
			aggregated.NextHealRound = state.NextHealRound
		}
	}
	return aggregated, nil
}
func (a adminAPIHandlers) BackgroundHealStatusHandler(w http.ResponseWriter, r *http.Request) { func (a adminAPIHandlers) BackgroundHealStatusHandler(w http.ResponseWriter, r *http.Request) {
ctx := newContext(r, w, "HealBackgroundStatus") ctx := newContext(r, w, "HealBackgroundStatus")
@ -815,39 +861,8 @@ func (a adminAPIHandlers) BackgroundHealStatusHandler(w http.ResponseWriter, r *
return return
} }
var bgHealStates []madmin.BgHealState aggregateHealStateResult, _ := getAggregatedBackgroundHealState(r.Context(), false)
if err := json.NewEncoder(w).Encode(aggregateHealStateResult); err != nil {
// Get local heal status first
bgHealStates = append(bgHealStates, getLocalBackgroundHealStatus())
if globalIsDistErasure {
// Get heal status from other peers
peersHealStates := globalNotificationSys.BackgroundHealStatus()
bgHealStates = append(bgHealStates, peersHealStates...)
}
// Aggregate healing result
var aggregatedHealStateResult = madmin.BgHealState{
ScannedItemsCount: bgHealStates[0].ScannedItemsCount,
LastHealActivity: bgHealStates[0].LastHealActivity,
NextHealRound: bgHealStates[0].NextHealRound,
}
bgHealStates = bgHealStates[1:]
for _, state := range bgHealStates {
aggregatedHealStateResult.ScannedItemsCount += state.ScannedItemsCount
if !state.LastHealActivity.IsZero() && aggregatedHealStateResult.LastHealActivity.Before(state.LastHealActivity) {
aggregatedHealStateResult.LastHealActivity = state.LastHealActivity
// The node which has the last heal activity means its
// is the node that is orchestrating self healing operations,
// which also means it is the same node which decides when
// the next self healing operation will be done.
aggregatedHealStateResult.NextHealRound = state.NextHealRound
}
}
if err := json.NewEncoder(w).Encode(aggregatedHealStateResult); err != nil {
writeErrorResponseJSON(ctx, w, toAdminAPIErr(ctx, err), r.URL) writeErrorResponseJSON(ctx, w, toAdminAPIErr(ctx, err), r.URL)
return return
} }

View File

@ -88,7 +88,8 @@ type allHealState struct {
sync.Mutex sync.Mutex
// map of heal path to heal sequence // map of heal path to heal sequence
healSeqMap map[string]*healSequence healSeqMap map[string]*healSequence
healLocalDisks []Endpoints
} }
// newHealState - initialize global heal state management // newHealState - initialize global heal state management
@ -102,6 +103,22 @@ func newHealState() *allHealState {
return healState return healState
} }
// getHealLocalDisks returns a snapshot of the endpoints currently queued for
// local healing. The outer slice is copied under the lock so callers may
// iterate it without racing concurrent updates.
//
// NOTE(review): this is a shallow copy — the inner Endpoints slices are still
// shared with the state; callers must treat them as read-only.
func (ahs *allHealState) getHealLocalDisks() []Endpoints {
	ahs.Lock()
	defer ahs.Unlock()

	return append(make([]Endpoints, 0, len(ahs.healLocalDisks)), ahs.healLocalDisks...)
}
// updateHealLocalDisks replaces, under the state lock, the set of local disk
// endpoints that are pending heal.
func (ahs *allHealState) updateHealLocalDisks(eps []Endpoints) {
	ahs.Lock()
	ahs.healLocalDisks = eps
	ahs.Unlock()
}
func (ahs *allHealState) periodicHealSeqsClean(ctx context.Context) { func (ahs *allHealState) periodicHealSeqsClean(ctx context.Context) {
// Launch clean-up routine to remove this heal sequence (after // Launch clean-up routine to remove this heal sequence (after
// it ends) from the global state after timeout has elapsed. // it ends) from the global state after timeout has elapsed.

View File

@ -90,6 +90,9 @@ func (h *healRoutine) run(ctx context.Context, objAPI ObjectLayer) {
case task.bucket == nopHeal: case task.bucket == nopHeal:
continue continue
case task.bucket == SlashSeparator: case task.bucket == SlashSeparator:
// Quickly check if drives need healing upon start-up
globalBackgroundHealState.updateHealLocalDisks(getLocalDisksToHeal(objAPI))
res, err = healDiskFormat(ctx, objAPI, task.opts) res, err = healDiskFormat(ctx, objAPI, task.opts)
case task.bucket != "" && task.object == "": case task.bucket != "" && task.object == "":
res, err = objAPI.HealBucket(ctx, task.bucket, task.opts.DryRun, task.opts.Remove) res, err = objAPI.HealBucket(ctx, task.bucket, task.opts.DryRun, task.opts.Remove)

View File

@ -18,6 +18,7 @@ package cmd
import ( import (
"context" "context"
"fmt"
"time" "time"
"github.com/dustin/go-humanize" "github.com/dustin/go-humanize"
@ -30,6 +31,46 @@ func initLocalDisksAutoHeal(ctx context.Context, objAPI ObjectLayer) {
go monitorLocalDisksAndHeal(ctx, objAPI) go monitorLocalDisksAndHeal(ctx, objAPI)
} }
// getLocalDisksToHeal probes every zone's local endpoints and returns, one
// slot per zone, the endpoints whose drives are currently unformatted and
// therefore need healing. Zones with nothing to heal keep a nil slot.
// Returns nil when objAPI is not backed by erasure zones.
func getLocalDisksToHeal(objAPI ObjectLayer) []Endpoints {
	z, ok := objAPI.(*erasureZones)
	if !ok {
		return nil
	}

	// One (possibly nil) entry per zone, indexed in zone order.
	disksToHeal := make([]Endpoints, len(z.zones))
	for i, ep := range globalEndpoints {
		var pending Endpoints
		for _, endpoint := range ep.Endpoints {
			if !endpoint.IsLocal {
				continue
			}
			// Probe the endpoint; an errUnformattedDisk reply means the
			// drive is fresh or replaced and must be reformatted/healed.
			if _, _, err := connectEndpoint(endpoint); err == errUnformattedDisk {
				pending = append(pending, endpoint)
			}
		}
		if len(pending) > 0 {
			disksToHeal[i] = pending
		}
	}
	return disksToHeal
}
// getDrivesToHealCount returns the total number of endpoints, across all
// zones, that are waiting to be healed.
func getDrivesToHealCount(localDisksInZoneHeal []Endpoints) int {
	var drivesToHeal int
	for _, eps := range localDisksInZoneHeal {
		// len() replaces the original element-by-element counting loop
		// (`for range eps { drivesToHeal++ }`) — same result, O(1) per zone.
		drivesToHeal += len(eps)
	}
	return drivesToHeal
}
// monitorLocalDisksAndHeal - ensures that detected new disks are healed // monitorLocalDisksAndHeal - ensures that detected new disks are healed
// 1. Only the concerned erasure set will be listed and healed // 1. Only the concerned erasure set will be listed and healed
// 2. Only the node hosting the disk is responsible to perform the heal // 2. Only the node hosting the disk is responsible to perform the heal
@ -50,53 +91,43 @@ func monitorLocalDisksAndHeal(ctx context.Context, objAPI ObjectLayer) {
time.Sleep(time.Second) time.Sleep(time.Second)
} }
localDisksInZoneHeal := globalBackgroundHealState.getHealLocalDisks()
drivesToHeal := getDrivesToHealCount(localDisksInZoneHeal)
if drivesToHeal != 0 {
logger.Info(fmt.Sprintf("Found drives to heal %d, waiting until %s to heal the content...",
drivesToHeal, defaultMonitorNewDiskInterval))
}
firstTime := true
// Perform automatic disk healing when a disk is replaced locally. // Perform automatic disk healing when a disk is replaced locally.
for { for {
select { select {
case <-ctx.Done(): case <-ctx.Done():
return return
case <-time.After(defaultMonitorNewDiskInterval): case <-time.After(defaultMonitorNewDiskInterval):
// Attempt a heal as the server starts-up first. // heal only if new disks found.
localDisksInZoneHeal := make([]Endpoints, len(z.zones)) if drivesToHeal == 0 {
var healNewDisks bool firstTime = false
for i, ep := range globalEndpoints { localDisksInZoneHeal = getLocalDisksToHeal(z)
localDisksToHeal := Endpoints{} drivesToHeal = getDrivesToHealCount(localDisksInZoneHeal)
for _, endpoint := range ep.Endpoints { if drivesToHeal == 0 {
if !endpoint.IsLocal { // No drives to heal.
continue globalBackgroundHealState.updateHealLocalDisks(nil)
}
// Try to connect to the current endpoint
// and reformat if the current disk is not formatted
_, _, err := connectEndpoint(endpoint)
if err == errUnformattedDisk {
localDisksToHeal = append(localDisksToHeal, endpoint)
}
}
if len(localDisksToHeal) == 0 {
continue continue
} }
localDisksInZoneHeal[i] = localDisksToHeal globalBackgroundHealState.updateHealLocalDisks(localDisksInZoneHeal)
healNewDisks = true
} }
// Reformat disks only if needed. if !firstTime {
if !healNewDisks { // Reformat disks
continue bgSeq.sourceCh <- healSource{bucket: SlashSeparator}
// Ensure that reformatting disks is finished
bgSeq.sourceCh <- healSource{bucket: nopHeal}
} }
logger.Info("New unformatted drives detected attempting to heal...")
for i, disks := range localDisksInZoneHeal {
for _, disk := range disks {
logger.Info("Healing disk '%s' on %s zone", disk, humanize.Ordinal(i+1))
}
}
// Reformat disks
bgSeq.sourceCh <- healSource{bucket: SlashSeparator}
// Ensure that reformatting disks is finished
bgSeq.sourceCh <- healSource{bucket: nopHeal}
var erasureSetInZoneToHeal = make([][]int, len(localDisksInZoneHeal)) var erasureSetInZoneToHeal = make([][]int, len(localDisksInZoneHeal))
// Compute the list of erasure set to heal // Compute the list of erasure set to heal
for i, localDisksToHeal := range localDisksInZoneHeal { for i, localDisksToHeal := range localDisksInZoneHeal {
@ -108,6 +139,7 @@ func monitorLocalDisksAndHeal(ctx context.Context, objAPI ObjectLayer) {
printEndpointError(endpoint, err, true) printEndpointError(endpoint, err, true)
continue continue
} }
// Calculate the set index where the current endpoint belongs // Calculate the set index where the current endpoint belongs
setIndex, _, err := findDiskIndex(z.zones[i].format, format) setIndex, _, err := findDiskIndex(z.zones[i].format, format)
if err != nil { if err != nil {
@ -120,6 +152,13 @@ func monitorLocalDisksAndHeal(ctx context.Context, objAPI ObjectLayer) {
erasureSetInZoneToHeal[i] = erasureSetToHeal erasureSetInZoneToHeal[i] = erasureSetToHeal
} }
logger.Info("New unformatted drives detected attempting to heal the content...")
for i, disks := range localDisksInZoneHeal {
for _, disk := range disks {
logger.Info("Healing disk '%s' on %s zone", disk, humanize.Ordinal(i+1))
}
}
// Heal all erasure sets that need // Heal all erasure sets that need
for i, erasureSetToHeal := range erasureSetInZoneToHeal { for i, erasureSetToHeal := range erasureSetInZoneToHeal {
for _, setIndex := range erasureSetToHeal { for _, setIndex := range erasureSetToHeal {
@ -127,6 +166,11 @@ func monitorLocalDisksAndHeal(ctx context.Context, objAPI ObjectLayer) {
if err != nil { if err != nil {
logger.LogIf(ctx, err) logger.LogIf(ctx, err)
} }
// Only upon success reduce the counter
if err == nil {
drivesToHeal--
}
} }
} }
} }

View File

@ -1414,6 +1414,11 @@ func (s *erasureSets) HealFormat(ctx context.Context, dryRun bool) (res madmin.H
return madmin.HealResultItem{}, err return madmin.HealResultItem{}, err
} }
refFormat, err = getFormatErasureInQuorum(tmpNewFormats)
if err != nil {
return madmin.HealResultItem{}, err
}
// kill the monitoring loop such that we stop writing // kill the monitoring loop such that we stop writing
// to indicate that we will re-initialize everything // to indicate that we will re-initialize everything
// with new format. // with new format.

View File

@ -2032,6 +2032,7 @@ type HealthOptions struct {
// was queried // was queried
type HealthResult struct { type HealthResult struct {
Healthy bool Healthy bool
HealingDrives int
ZoneID, SetID int ZoneID, SetID int
WriteQuorum int WriteQuorum int
} }
@ -2086,8 +2087,23 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes
} }
} }
} }
// check if local disks are being healed, if they are being healed
// we need to tell healthy status as 'false' so that this server
// is not taken down for maintenance
aggHealStateResult, err := getAggregatedBackgroundHealState(ctx, true)
if err != nil {
logger.LogIf(ctx, fmt.Errorf("Unable to verify global heal status: %w", err))
return HealthResult{
Healthy: false,
}
}
healthy := len(aggHealStateResult.HealDisks) == 0
return HealthResult{ return HealthResult{
Healthy: true, Healthy: healthy,
HealingDrives: len(aggHealStateResult.HealDisks),
} }
} }

View File

@ -73,9 +73,17 @@ func getLocalBackgroundHealStatus() madmin.BgHealState {
return madmin.BgHealState{} return madmin.BgHealState{}
} }
var healDisks []string
for _, eps := range globalBackgroundHealState.getHealLocalDisks() {
for _, ep := range eps {
healDisks = append(healDisks, ep.String())
}
}
return madmin.BgHealState{ return madmin.BgHealState{
ScannedItemsCount: bgSeq.getScannedItemsCount(), ScannedItemsCount: bgSeq.getScannedItemsCount(),
LastHealActivity: bgSeq.lastHealActivity, LastHealActivity: bgSeq.lastHealActivity,
HealDisks: healDisks,
NextHealRound: UTCNow().Add(durationToNextHealRound(bgSeq.lastHealActivity)), NextHealRound: UTCNow().Add(durationToNextHealRound(bgSeq.lastHealActivity)),
} }
} }

View File

@ -269,21 +269,25 @@ func (sys *NotificationSys) LoadServiceAccount(accessKey string) []NotificationP
} }
// BackgroundHealStatus - returns background heal status of all peers // BackgroundHealStatus - returns background heal status of all peers
func (sys *NotificationSys) BackgroundHealStatus() []madmin.BgHealState { func (sys *NotificationSys) BackgroundHealStatus() ([]madmin.BgHealState, []NotificationPeerErr) {
ng := WithNPeers(len(sys.peerClients))
states := make([]madmin.BgHealState, len(sys.peerClients)) states := make([]madmin.BgHealState, len(sys.peerClients))
for idx, client := range sys.peerClients { for idx, client := range sys.peerClients {
if client == nil { if client == nil {
continue continue
} }
st, err := client.BackgroundHealStatus() client := client
if err != nil { ng.Go(GlobalContext, func() error {
logger.LogIf(GlobalContext, err) st, err := client.BackgroundHealStatus()
} else { if err != nil {
return err
}
states[idx] = st states[idx] = st
} return nil
}, idx, *client.host)
} }
return states return states, ng.Wait()
} }
// StartProfiling - start profiling on remote peers, by initiating a remote RPC. // StartProfiling - start profiling on remote peers, by initiating a remote RPC.

View File

@ -2103,9 +2103,6 @@ func (s *xlStorage) RenameData(srcVolume, srcPath, dataDir, dstVolume, dstPath s
legacyDataPath := pathJoin(dstVolumeDir, dstPath, legacyDataDir) legacyDataPath := pathJoin(dstVolumeDir, dstPath, legacyDataDir)
// legacy data dir means its old content, honor system umask. // legacy data dir means its old content, honor system umask.
if err = os.MkdirAll(legacyDataPath, 0777); err != nil { if err = os.MkdirAll(legacyDataPath, 0777); err != nil {
if isSysErrIO(err) {
return errFaultyDisk
}
return osErrToFileErr(err) return osErrToFileErr(err)
} }
@ -2116,14 +2113,11 @@ func (s *xlStorage) RenameData(srcVolume, srcPath, dataDir, dstVolume, dstPath s
for _, entry := range entries { for _, entry := range entries {
// Skip xl.meta renames further, also ignore any directories such as `legacyDataDir` // Skip xl.meta renames further, also ignore any directories such as `legacyDataDir`
if entry == xlStorageFormatFile || strings.HasPrefix(entry, SlashSeparator) { if entry == xlStorageFormatFile || strings.HasSuffix(entry, slashSeparator) {
continue continue
} }
if err = os.Rename(pathJoin(currentDataPath, entry), pathJoin(legacyDataPath, entry)); err != nil { if err = os.Rename(pathJoin(currentDataPath, entry), pathJoin(legacyDataPath, entry)); err != nil {
if isSysErrIO(err) {
return errFaultyDisk
}
return osErrToFileErr(err) return osErrToFileErr(err)
} }
} }
@ -2159,20 +2153,14 @@ func (s *xlStorage) RenameData(srcVolume, srcPath, dataDir, dstVolume, dstPath s
} }
if err = renameAll(srcFilePath, dstFilePath); err != nil { if err = renameAll(srcFilePath, dstFilePath); err != nil {
if isSysErrIO(err) { return osErrToFileErr(err)
return errFaultyDisk
}
return err
} }
if srcDataPath != "" { if srcDataPath != "" {
removeAll(oldDstDataPath) removeAll(oldDstDataPath)
removeAll(dstDataPath) removeAll(dstDataPath)
if err = renameAll(srcDataPath, dstDataPath); err != nil { if err = renameAll(srcDataPath, dstDataPath); err != nil {
if isSysErrIO(err) { return osErrToFileErr(err)
return errFaultyDisk
}
return err
} }
} }
@ -2265,10 +2253,7 @@ func (s *xlStorage) RenameFile(srcVolume, srcPath, dstVolume, dstPath string) (e
} }
if err = renameAll(srcFilePath, dstFilePath); err != nil { if err = renameAll(srcFilePath, dstFilePath); err != nil {
if isSysErrIO(err) { return osErrToFileErr(err)
return errFaultyDisk
}
return err
} }
// Remove parent dir of the source file if empty // Remove parent dir of the source file if empty

View File

@ -294,6 +294,7 @@ type BgHealState struct {
ScannedItemsCount int64 ScannedItemsCount int64
LastHealActivity time.Time LastHealActivity time.Time
NextHealRound time.Time NextHealRound time.Time
HealDisks []string
} }
// BackgroundHealStatus returns the background heal status of the // BackgroundHealStatus returns the background heal status of the