From 7c9ef76f662d5142ea2746ac3f08f7557fa95c3c Mon Sep 17 00:00:00 2001
From: Harshavardhana
Date: Thu, 17 Dec 2020 12:35:02 -0800
Subject: [PATCH] fix: timer deadlock on expired timers (#11124)

The issue was introduced in #11106 with the following pattern:

    <-t.C // timer fired

    if !t.Stop() {
        <-t.C // timer hangs
    }

The code hangs at the last `<-t.C` line: a timer that has already fired
cannot be Stop()ed anymore, so `t.Stop()` returns `false`, and since the
first receive already drained `t.C`, the second receive blocks forever.

Refactor the code so that timers are used appropriately, with the exact
requirements in place.
---
 cmd/admin-heal-ops.go               |  6 ++++-
 cmd/background-newdisks-heal-ops.go |  7 ++++-
 cmd/data-crawler.go                 |  8 +++++-
 cmd/erasure-sets.go                 | 41 ++++++++++++++++++-----------
 4 files changed, 43 insertions(+), 19 deletions(-)

diff --git a/cmd/admin-heal-ops.go b/cmd/admin-heal-ops.go
index b9fd84461..a7c18cb1a 100644
--- a/cmd/admin-heal-ops.go
+++ b/cmd/admin-heal-ops.go
@@ -145,9 +145,13 @@ func (ahs *allHealState) pushHealLocalDisks(healLocalDisks ...Endpoint) {
 func (ahs *allHealState) periodicHealSeqsClean(ctx context.Context) {
 	// Launch clean-up routine to remove this heal sequence (after
 	// it ends) from the global state after timeout has elapsed.
+	periodicTimer := time.NewTimer(time.Minute * 5)
+	defer periodicTimer.Stop()
+
 	for {
 		select {
-		case <-time.After(time.Minute * 5):
+		case <-periodicTimer.C:
+			periodicTimer.Reset(time.Minute * 5)
 			now := UTCNow()
 			ahs.Lock()
 			for path, h := range ahs.healSeqMap {
diff --git a/cmd/background-newdisks-heal-ops.go b/cmd/background-newdisks-heal-ops.go
index 445e3e307..77242ad60 100644
--- a/cmd/background-newdisks-heal-ops.go
+++ b/cmd/background-newdisks-heal-ops.go
@@ -110,12 +110,17 @@ func initBackgroundHealing(ctx context.Context, objAPI ObjectLayer) {
 // 2. Only the node hosting the disk is responsible to perform the heal
 func monitorLocalDisksAndHeal(ctx context.Context, z *erasureServerPools, bgSeq *healSequence) {
 	// Perform automatic disk healing when a disk is replaced locally.
+	diskCheckTimer := time.NewTimer(defaultMonitorNewDiskInterval)
+	defer diskCheckTimer.Stop()
 wait:
 	for {
 		select {
 		case <-ctx.Done():
 			return
-		case <-time.After(defaultMonitorNewDiskInterval):
+		case <-diskCheckTimer.C:
+			// Reset to next interval.
+			diskCheckTimer.Reset(defaultMonitorNewDiskInterval)
+
 			var erasureSetInZoneDisksToHeal []map[int][]StorageAPI
 
 			healDisks := globalBackgroundHealState.getHealLocalDisks()
diff --git a/cmd/data-crawler.go b/cmd/data-crawler.go
index 5486a18ca..f7cbdaf1f 100644
--- a/cmd/data-crawler.go
+++ b/cmd/data-crawler.go
@@ -99,11 +99,17 @@ func runDataCrawler(ctx context.Context, objAPI ObjectLayer) {
 		}
 	}
 
+	crawlTimer := time.NewTimer(dataCrawlStartDelay)
+	defer crawlTimer.Stop()
+
 	for {
 		select {
 		case <-ctx.Done():
 			return
-		case <-time.After(dataCrawlStartDelay):
+		case <-crawlTimer.C:
+			// Reset the timer for next cycle.
+			crawlTimer.Reset(dataCrawlStartDelay)
+
 			// Wait before starting next cycle and wait on startup.
 			results := make(chan DataUsageInfo, 1)
 			go storeDataUsageInBackend(ctx, objAPI, results)
diff --git a/cmd/erasure-sets.go b/cmd/erasure-sets.go
index c2d7808ee..cf390a799 100644
--- a/cmd/erasure-sets.go
+++ b/cmd/erasure-sets.go
@@ -278,14 +278,11 @@ func (s *erasureSets) monitorAndConnectEndpoints(ctx context.Context, monitorInt
 		case <-ctx.Done():
 			return
 		case <-monitor.C:
+			// Reset the timer once fired for required interval.
+			monitor.Reset(monitorInterval)
+
 			s.connectDisks()
 		}
-
-		if !monitor.Stop() {
-			<-monitor.C
-		}
-
-		monitor.Reset(monitorInterval)
 	}
 }
 
@@ -1369,6 +1366,26 @@ func (s *erasureSets) maintainMRFList() {
 	}
 }
 
+func toSourceChTimed(t *time.Timer, sourceCh chan healSource, u healSource) {
+	t.Reset(100 * time.Millisecond)
+
+	// No defer, as we don't know which
+	// case will be selected
+
+	select {
+	case sourceCh <- u:
+	case <-t.C:
+		return
+	}
+
+	// We still need to check the return value
+	// of Stop, because t could have fired
+	// between the send on sourceCh and this line.
+	if !t.Stop() {
+		<-t.C
+	}
+}
+
 // healMRFRoutine monitors new disks connection, sweep the MRF list
 // to find objects related to the new disk that needs to be healed.
 func (s *erasureSets) healMRFRoutine() {
@@ -1392,16 +1409,8 @@ func (s *erasureSets) healMRFRoutine() {
 
 		// Heal objects
 		for _, u := range mrfOperations {
-			// Send an object to be healed with a timeout
-			select {
-			case bgSeq.sourceCh <- u:
-			case <-idler.C:
-			}
-
-			if !idler.Stop() {
-				<-idler.C
-			}
-			idler.Reset(100 * time.Millisecond)
+			// Send an object to background heal
+			toSourceChTimed(idler, bgSeq.sourceCh, u)
 
 			s.mrfMU.Lock()
 			delete(s.mrfOperations, u)
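
For reference, below is a minimal, self-contained sketch of the two timer idioms
this patch settles on (illustrative only; tickWithTimer, sendWithTimeout and work
are made-up names, not MinIO APIs): a periodic loop that resets the timer right
after its channel fires, and a bounded send that mirrors the shape of
toSourceChTimed, checking the return value of Stop() before draining.

// Sketch of the timer idioms described above; not part of the patch.
package main

import (
	"context"
	"fmt"
	"time"
)

// tickWithTimer runs work() every interval until ctx is canceled.
// The timer is reset right after its channel fires, so Stop() is never
// called on an already-expired timer (the pattern that hangs on the
// second <-t.C).
func tickWithTimer(ctx context.Context, interval time.Duration, work func()) {
	t := time.NewTimer(interval)
	defer t.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-t.C:
			// The receive above drained t.C, so Reset safely
			// re-arms the timer for the next cycle.
			t.Reset(interval)
			work()
		}
	}
}

// sendWithTimeout mirrors the shape of toSourceChTimed: it re-arms a
// shared timer, tries to send, and on a successful send stops the timer,
// draining its channel only when Stop reports the timer already fired.
func sendWithTimeout(t *time.Timer, ch chan<- string, v string) bool {
	t.Reset(100 * time.Millisecond)

	select {
	case ch <- v:
	case <-t.C:
		return false // timed out; timer channel already drained
	}

	// The send won the race, but the timer may have fired in the
	// meantime; drain it so the next Reset starts from a clean state.
	if !t.Stop() {
		<-t.C
	}
	return true
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 350*time.Millisecond)
	defer cancel()
	tickWithTimer(ctx, 100*time.Millisecond, func() { fmt.Println("tick") })

	idler := time.NewTimer(100 * time.Millisecond)
	defer idler.Stop()
	ch := make(chan string, 1)
	fmt.Println("sent:", sendWithTimeout(idler, ch, "heal-me")) // buffered channel: succeeds
}

The invariant both helpers keep is that the timer channel is drained exactly once
per firing, so every later Reset starts from a clean timer and never blocks.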