From 14d89eaae45bdbc0461fbd008aed1534df49ae1a Mon Sep 17 00:00:00 2001
From: Anis Elleuch <vadmeste@users.noreply.github.com>
Date: Thu, 18 Mar 2021 19:19:02 +0100
Subject: [PATCH] mrf: Enhance behavior for better results (#11788)

MRF was starting to heal when it receives a disk connection event, which
is not good when a node having multiple disks reconnects to the cluster.

Besides, MRF needs Remove healing option to remove stale files.
---
 cmd/erasure-sets.go | 80 +++++++++++++++++++--------------------------
 1 file changed, 34 insertions(+), 46 deletions(-)

diff --git a/cmd/erasure-sets.go b/cmd/erasure-sets.go
index 8461aa531..a8457d6da 100644
--- a/cmd/erasure-sets.go
+++ b/cmd/erasure-sets.go
@@ -45,11 +45,6 @@ import (
 // setsDsyncLockers is encapsulated type for Close()
 type setsDsyncLockers [][]dsync.NetLocker
 
-// Information of a new disk connection
-type diskConnectInfo struct {
-	setIndex int
-}
-
 const envMinioDeleteCleanupInterval = "MINIO_DELETE_CLEANUP_INTERVAL"
 
 // erasureSets implements ObjectLayer combining a static list of erasure coded
@@ -89,7 +84,9 @@ type erasureSets struct {
 
 	poolIndex int
 
-	disksConnectEvent chan diskConnectInfo
+	// A channel to send the set index to the MRF when
+	// any disk belonging to that set is connected
+	setReconnectEvent chan int
 
 	// Distribution algorithm of choice.
 	distributionAlgo string
@@ -199,6 +196,7 @@ func findDiskIndex(refFormat, format *formatErasureV3) (int, int, error) {
 // and re-arranges the disks in proper position.
 func (s *erasureSets) connectDisks() {
 	var wg sync.WaitGroup
+	var setsJustConnected = make([]bool, s.setCount)
 	diskMap := s.getDiskMap()
 	for _, endpoint := range s.endpoints {
 		diskPath := endpoint.String()
@@ -253,19 +251,29 @@ func (s *erasureSets) connectDisks() {
 			disk.SetDiskLoc(s.poolIndex, setIndex, diskIndex)
 			s.endpointStrings[setIndex*s.setDriveCount+diskIndex] = disk.String()
 			s.erasureDisksMu.Unlock()
-			go func(setIndex int) {
-				idler := time.NewTimer(100 * time.Millisecond)
-				defer idler.Stop()
-
-				// Send a new disk connect event with a timeout
-				select {
-				case s.disksConnectEvent <- diskConnectInfo{setIndex: setIndex}:
-				case <-idler.C:
-				}
-			}(setIndex)
+			setsJustConnected[setIndex] = true
 		}(endpoint)
 	}
+
 	wg.Wait()
+
+	go func() {
+		idler := time.NewTimer(100 * time.Millisecond)
+		defer idler.Stop()
+
+		for setIndex, justConnected := range setsJustConnected {
+			if !justConnected {
+				continue
+			}
+
+			// Send a new set connect event with a timeout
+			idler.Reset(100 * time.Millisecond)
+			select {
+			case s.setReconnectEvent <- setIndex:
+			case <-idler.C:
+			}
+		}
+	}()
 }
 
 // monitorAndConnectEndpoints this is a monitoring loop to keep track of disconnected
@@ -354,7 +362,7 @@ func newErasureSets(ctx context.Context, endpoints Endpoints, storageDisks []Sto
 		setDriveCount:      setDriveCount,
 		defaultParityCount: defaultParityCount,
 		format:             format,
-		disksConnectEvent:  make(chan diskConnectInfo),
+		setReconnectEvent:  make(chan int),
 		distributionAlgo:   format.Erasure.DistributionAlgo,
 		deploymentID:       uuid.MustParse(format.ID),
 		mrfOperations:      make(map[healSource]int),
@@ -659,12 +667,12 @@ func (s *erasureSets) Shutdown(ctx context.Context) error {
 		}
 	}
 	select {
-	case _, ok := <-s.disksConnectEvent:
+	case _, ok := <-s.setReconnectEvent:
 		if ok {
-			close(s.disksConnectEvent)
+			close(s.setReconnectEvent)
 		}
 	default:
-		close(s.disksConnectEvent)
+		close(s.setReconnectEvent)
 	}
 	return nil
 }
@@ -1353,47 +1361,25 @@ func (s *erasureSets) maintainMRFList() {
 			bucket:    fOp.bucket,
 			object:    fOp.object,
 			versionID: fOp.versionID,
+			opts:      &madmin.HealOpts{Remove: true},
 		}] = fOp.failedSet
 		s.mrfMU.Unlock()
 	}
 }
 
-func toSourceChTimed(t *time.Timer, sourceCh chan healSource, u healSource) {
-	t.Reset(100 * time.Millisecond)
-
-	// No defer, as we don't know which
-	// case will be selected
-
-	select {
-	case sourceCh <- u:
-	case <-t.C:
-		return
-	}
-
-	// We still need to check the return value
-	// of Stop, because t could have fired
-	// between the send on sourceCh and this line.
-	if !t.Stop() {
-		<-t.C
-	}
-}
-
 // healMRFRoutine monitors new disks connection, sweep the MRF list
 // to find objects related to the new disk that needs to be healed.
 func (s *erasureSets) healMRFRoutine() {
 	// Wait until background heal state is initialized
 	bgSeq := mustGetHealSequence(GlobalContext)
 
-	idler := time.NewTimer(100 * time.Millisecond)
-	defer idler.Stop()
-
-	for e := range s.disksConnectEvent {
+	for setIndex := range s.setReconnectEvent {
 		// Get the list of objects related the er.set
 		// to which the connected disk belongs.
 		var mrfOperations []healSource
 		s.mrfMU.Lock()
 		for k, v := range s.mrfOperations {
-			if v == e.setIndex {
+			if v == setIndex {
 				mrfOperations = append(mrfOperations, k)
 			}
 		}
@@ -1401,8 +1387,10 @@ func (s *erasureSets) healMRFRoutine() {
 
 		// Heal objects
 		for _, u := range mrfOperations {
+			waitForLowHTTPReq(globalHealConfig.IOCount, globalHealConfig.Sleep)
+
 			// Send an object to background heal
-			toSourceChTimed(idler, bgSeq.sourceCh, u)
+			bgSeq.sourceCh <- u
 
 			s.mrfMU.Lock()
 			delete(s.mrfOperations, u)