From 7b579caf686110d96349ec4e90beacf7eed5af89 Mon Sep 17 00:00:00 2001 From: Anis Elleuch Date: Fri, 16 Nov 2018 23:59:51 +0100 Subject: [PATCH] heal: Fix heal sequences cleanup process (#6780) The current code starts a per-sequence timer to clean up a heal sequence from healSeqMap, but when that timer fires we cannot tell whether the user has since launched a new healing sequence with the same path, so the newer sequence could be removed by mistake. Add endTime to the healSequence struct and add a periodic heal-sequence cleaner that removes a heal sequence only if it ended more than 10 minutes ago. --- cmd/admin-heal-ops.go | 66 ++++++++++++++++++++----------------------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/cmd/admin-heal-ops.go b/cmd/admin-heal-ops.go index 86e3b5677..b4344a3e2 100644 --- a/cmd/admin-heal-ops.go +++ b/cmd/admin-heal-ops.go @@ -113,6 +113,32 @@ func initAllHealState(isErasureMode bool) { globalAllHealState = allHealState{ healSeqMap: make(map[string]*healSequence), } + + go globalAllHealState.periodicHealSeqsClean() +} + +func (ahs *allHealState) periodicHealSeqsClean() { + // Launch clean-up routine to remove this heal sequence (after + // it ends) from the global state after timeout has elapsed. + ticker := time.NewTicker(time.Minute * 5) + defer ticker.Stop() + for { + select { + case <-ticker.C: + now := UTCNow() + ahs.Lock() + for path, h := range ahs.healSeqMap { + if h.hasEnded() && h.endTime.Add(keepHealSeqStateDuration).Before(now) { + delete(ahs.healSeqMap, path) + } + } + ahs.Unlock() + case <-globalServiceDoneCh: + // server could be restarting - need + // to exit immediately + return + } + } } // getHealSequence - Retrieve a heal sequence by path. The second @@ -213,41 +239,6 @@ func (ahs *allHealState) LaunchNewHealSequence(h *healSequence) ( // Launch top-level background heal go-routine go h.healSequenceStart() - // Launch clean-up routine to remove this heal sequence (after - // it ends) from the global state after timeout has elapsed. 
- go func() { - var keepStateTimeout <-chan time.Time - ticker := time.NewTicker(time.Minute) - defer ticker.Stop() - everyMinute := ticker.C - for { - select { - // Check every minute if heal sequence has ended. - case <-everyMinute: - if h.hasEnded() { - keepStateTimeout = time.After(keepHealSeqStateDuration) - everyMinute = nil - } - - // This case does not fire until the heal - // sequence completes. - case <-keepStateTimeout: - // Heal sequence has ended, keep - // results state duration has elapsed, - // so purge state. - ahs.Lock() - defer ahs.Unlock() - delete(ahs.healSeqMap, h.path) - return - - case <-globalServiceDoneCh: - // server could be restarting - need - // to exit immediately - return - } - } - }() - b, err := json.Marshal(madmin.HealStartSuccess{ ClientToken: h.clientToken, ClientAddress: h.clientAddress, @@ -321,6 +312,9 @@ type healSequence struct { // time at which heal sequence was started startTime time.Time + // time at which heal sequence has ended + endTime time.Time + // Heal client info clientToken, clientAddress string @@ -498,6 +492,7 @@ func (h *healSequence) healSequenceStart() { select { case err, ok := <-h.traverseAndHealDoneCh: + h.endTime = UTCNow() h.currentStatus.updateLock.Lock() defer h.currentStatus.updateLock.Unlock() // Heal traversal is complete. @@ -511,6 +506,7 @@ func (h *healSequence) healSequenceStart() { } case <-h.stopSignalCh: + h.endTime = UTCNow() h.currentStatus.updateLock.Lock() h.currentStatus.Summary = healStoppedStatus h.currentStatus.FailureDetail = errHealStopSignalled.Error()