From 7c9ef76f662d5142ea2746ac3f08f7557fa95c3c Mon Sep 17 00:00:00 2001
From: Harshavardhana
Date: Thu, 17 Dec 2020 12:35:02 -0800
Subject: [PATCH] fix: timer deadlock on expired timers (#11124)

The issue was introduced in #11106 with the following pattern:

    <-t.C // timer fired

    if !t.Stop() {
        <-t.C // timer hangs
    }

The code hangs at the last `<-t.C` line: a timer that has already fired
cannot be Stop()ed anymore, so `t.Stop()` returns `false`, and since the
first receive already drained `t.C`, the second receive blocks forever.

Refactor the code so that timers are used appropriately, with the exact
requirements in place.
---
 cmd/admin-heal-ops.go               |  6 ++++-
 cmd/background-newdisks-heal-ops.go |  7 ++++-
 cmd/data-crawler.go                 |  8 +++++-
 cmd/erasure-sets.go                 | 41 ++++++++++++++++++-----------
 4 files changed, 43 insertions(+), 19 deletions(-)

diff --git a/cmd/admin-heal-ops.go b/cmd/admin-heal-ops.go
index b9fd84461..a7c18cb1a 100644
--- a/cmd/admin-heal-ops.go
+++ b/cmd/admin-heal-ops.go
@@ -145,9 +145,13 @@ func (ahs *allHealState) pushHealLocalDisks(healLocalDisks ...Endpoint) {
 func (ahs *allHealState) periodicHealSeqsClean(ctx context.Context) {
 	// Launch clean-up routine to remove this heal sequence (after
 	// it ends) from the global state after timeout has elapsed.
+	periodicTimer := time.NewTimer(time.Minute * 5)
+	defer periodicTimer.Stop()
+
 	for {
 		select {
-		case <-time.After(time.Minute * 5):
+		case <-periodicTimer.C:
+			periodicTimer.Reset(time.Minute * 5)
 			now := UTCNow()
 			ahs.Lock()
 			for path, h := range ahs.healSeqMap {
diff --git a/cmd/background-newdisks-heal-ops.go b/cmd/background-newdisks-heal-ops.go
index 445e3e307..77242ad60 100644
--- a/cmd/background-newdisks-heal-ops.go
+++ b/cmd/background-newdisks-heal-ops.go
@@ -110,12 +110,17 @@ func initBackgroundHealing(ctx context.Context, objAPI ObjectLayer) {
 // 2. Only the node hosting the disk is responsible to perform the heal
 func monitorLocalDisksAndHeal(ctx context.Context, z *erasureServerPools, bgSeq *healSequence) {
 	// Perform automatic disk healing when a disk is replaced locally.
+	diskCheckTimer := time.NewTimer(defaultMonitorNewDiskInterval)
+	defer diskCheckTimer.Stop()
 wait:
 	for {
 		select {
 		case <-ctx.Done():
 			return
-		case <-time.After(defaultMonitorNewDiskInterval):
+		case <-diskCheckTimer.C:
+			// Reset to next interval.
+			diskCheckTimer.Reset(defaultMonitorNewDiskInterval)
+
 			var erasureSetInZoneDisksToHeal []map[int][]StorageAPI
 
 			healDisks := globalBackgroundHealState.getHealLocalDisks()
diff --git a/cmd/data-crawler.go b/cmd/data-crawler.go
index 5486a18ca..f7cbdaf1f 100644
--- a/cmd/data-crawler.go
+++ b/cmd/data-crawler.go
@@ -99,11 +99,17 @@ func runDataCrawler(ctx context.Context, objAPI ObjectLayer) {
 		}
 	}
 
+	crawlTimer := time.NewTimer(dataCrawlStartDelay)
+	defer crawlTimer.Stop()
+
 	for {
 		select {
 		case <-ctx.Done():
 			return
-		case <-time.After(dataCrawlStartDelay):
+		case <-crawlTimer.C:
+			// Reset the timer for next cycle.
+			crawlTimer.Reset(dataCrawlStartDelay)
+
 			// Wait before starting next cycle and wait on startup.
 			results := make(chan DataUsageInfo, 1)
 			go storeDataUsageInBackend(ctx, objAPI, results)
diff --git a/cmd/erasure-sets.go b/cmd/erasure-sets.go
index c2d7808ee..cf390a799 100644
--- a/cmd/erasure-sets.go
+++ b/cmd/erasure-sets.go
@@ -278,14 +278,11 @@ func (s *erasureSets) monitorAndConnectEndpoints(ctx context.Context, monitorInt
 		case <-ctx.Done():
 			return
 		case <-monitor.C:
+			// Reset the timer once fired for required interval.
+			monitor.Reset(monitorInterval)
+
 			s.connectDisks()
 		}
-
-		if !monitor.Stop() {
-			<-monitor.C
-		}
-
-		monitor.Reset(monitorInterval)
 	}
 }
 
@@ -1369,6 +1366,26 @@ func (s *erasureSets) maintainMRFList() {
 	}
 }
 
+func toSourceChTimed(t *time.Timer, sourceCh chan healSource, u healSource) {
+	t.Reset(100 * time.Millisecond)
+
+	// No defer, as we don't know which
+	// case will be selected
+
+	select {
+	case sourceCh <- u:
+	case <-t.C:
+		return
+	}
+
+	// We still need to check the return value
+	// of Stop, because t could have fired
+	// between the send on sourceCh and this line.
+	if !t.Stop() {
+		<-t.C
+	}
+}
+
 // healMRFRoutine monitors new disks connection, sweep the MRF list
 // to find objects related to the new disk that needs to be healed.
 func (s *erasureSets) healMRFRoutine() {
@@ -1392,16 +1409,8 @@ func (s *erasureSets) healMRFRoutine() {
 
 		// Heal objects
 		for _, u := range mrfOperations {
-			// Send an object to be healed with a timeout
-			select {
-			case bgSeq.sourceCh <- u:
-			case <-idler.C:
-			}
-
-			if !idler.Stop() {
-				<-idler.C
-			}
-			idler.Reset(100 * time.Millisecond)
+			// Send an object to background heal
+			toSourceChTimed(idler, bgSeq.sourceCh, u)
 
 			s.mrfMU.Lock()
 			delete(s.mrfOperations, u)
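
For reference, below is a minimal, self-contained sketch of the two timer idioms
this patch settles on (illustrative only; tickWithTimer, sendWithTimeout and work
are made-up names, not MinIO APIs): a periodic loop that resets the timer right
after its channel fires, and a bounded send that mirrors the shape of
toSourceChTimed, checking the return value of Stop() before draining.

// Sketch of the timer idioms described above; not part of the patch.
package main

import (
	"context"
	"fmt"
	"time"
)

// tickWithTimer runs work() every interval until ctx is canceled.
// The timer is reset right after its channel fires, so Stop() is never
// called on an already-expired timer (the pattern that hangs on the
// second <-t.C).
func tickWithTimer(ctx context.Context, interval time.Duration, work func()) {
	t := time.NewTimer(interval)
	defer t.Stop()

	for {
		select {
		case <-ctx.Done():
			return
		case <-t.C:
			// The receive above drained t.C, so Reset safely
			// re-arms the timer for the next cycle.
			t.Reset(interval)
			work()
		}
	}
}

// sendWithTimeout mirrors the shape of toSourceChTimed: it re-arms a
// shared timer, tries to send, and on a successful send stops the timer,
// draining its channel only when Stop reports the timer already fired.
func sendWithTimeout(t *time.Timer, ch chan<- string, v string) bool {
	t.Reset(100 * time.Millisecond)

	select {
	case ch <- v:
	case <-t.C:
		return false // timed out; timer channel already drained
	}

	// The send won the race, but the timer may have fired in the
	// meantime; drain it so the next Reset starts from a clean state.
	if !t.Stop() {
		<-t.C
	}
	return true
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 350*time.Millisecond)
	defer cancel()
	tickWithTimer(ctx, 100*time.Millisecond, func() { fmt.Println("tick") })

	idler := time.NewTimer(100 * time.Millisecond)
	defer idler.Stop()
	ch := make(chan string, 1)
	fmt.Println("sent:", sendWithTimeout(idler, ch, "heal-me")) // buffered channel: succeeds
}

The invariant both helpers keep is that the timer channel is drained exactly once
per firing, so every later Reset starts from a clean timer and never blocks.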