fix: timer deadlock on expired timers (#11124)

issue was introduced in #11106 the following pattern <-t.C // timer fired if !t.Stop() { <-t.C // timer hangs } Seems to hang at the last `t.C` line, this issue happens because a fired timer cannot be Stopped() anymore and t.Stop() returns `false` leading to confusing state of usage. Refactor the code such that use timers appropriately with exact requirements in place.
2025-11-07 21:02:58 -05:00 · 2020-12-17 12:35:02 -08:00
parent cffdb01279
commit 7c9ef76f66
4 changed files with 43 additions and 19 deletions
--- a/cmd/erasure-sets.go
+++ b/cmd/erasure-sets.go
@@ -278,14 +278,11 @@ func (s *erasureSets) monitorAndConnectEndpoints(ctx context.Context, monitorInt
 		case <-ctx.Done():
 			return
 		case <-monitor.C:
+			// Reset the timer once fired for required interval.
+			monitor.Reset(monitorInterval)
+
 			s.connectDisks()
 		}
-
-		if !monitor.Stop() {
-			<-monitor.C
-		}
-
-		monitor.Reset(monitorInterval)
 	}
 }

@@ -1369,6 +1366,26 @@ func (s *erasureSets) maintainMRFList() {
 	}
 }

+func toSourceChTimed(t *time.Timer, sourceCh chan healSource, u healSource) {
+	t.Reset(100 * time.Millisecond)
+
+	// No defer, as we don't know which
+	// case will be selected
+
+	select {
+	case sourceCh <- u:
+	case <-t.C:
+		return
+	}
+
+	// We still need to check the return value
+	// of Stop, because t could have fired
+	// between the send on sourceCh and this line.
+	if !t.Stop() {
+		<-t.C
+	}
+}
+
 // healMRFRoutine monitors new disks connection, sweep the MRF list
 // to find objects related to the new disk that needs to be healed.
 func (s *erasureSets) healMRFRoutine() {
@@ -1392,16 +1409,8 @@ func (s *erasureSets) healMRFRoutine() {

 		// Heal objects
 		for _, u := range mrfOperations {
-			// Send an object to be healed with a timeout
-			select {
-			case bgSeq.sourceCh <- u:
-			case <-idler.C:
-			}
-
-			if !idler.Stop() {
-				<-idler.C
-			}
-			idler.Reset(100 * time.Millisecond)
+			// Send an object to background heal
+			toSourceChTimed(idler, bgSeq.sourceCh, u)

 			s.mrfMU.Lock()
 			delete(s.mrfOperations, u)