fix: crawler to skip healing the drives in a set being healed (#11274)

If an erasure set had a drive replacement recently, we don't
need to attempt healing on another drive within the same erasure
set - this ensures we do not double-heal the same content and
also prioritizes the usage calculation for such an erasure set
so it completes sooner.
Harshavardhana
2021-01-19 02:40:52 -08:00
committed by GitHub
parent e8ce348da1
commit e0055609bb
7 changed files with 129 additions and 91 deletions
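Seen from the crawler's side, the behaviour described above can be sketched minimally: when the erasure set already has a drive being healed, the per-object heal check is skipped and only usage is computed. The scanObject helper, healQueue channel, and skipHealing parameter below are illustrative names for this sketch, not MinIO's API.

package main

import "fmt"

// scanObject is a sketch of the per-object decision the crawler makes while
// computing usage: when the erasure set already has a drive being healed,
// the object is not queued for healing again, so the scan stays cheap and
// the set's usage is reported sooner.
func scanObject(skipHealing bool, object string, healQueue chan<- string) {
	if skipHealing {
		// A drive in this set is already being healed; the ongoing heal
		// will rewrite this content, so queueing it again would double
		// the work.
		return
	}
	select {
	case healQueue <- object:
		// queued for a background heal check
	default:
		// queue full; the next crawl cycle will pick it up
	}
}

func main() {
	healQueue := make(chan string, 1)
	scanObject(true, "bucket/object-1", healQueue)  // set is healing: nothing queued
	scanObject(false, "bucket/object-2", healQueue) // healthy set: queued
	fmt.Println(len(healQueue)) // 1
}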


@@ -223,6 +223,51 @@ func (er erasureObjects) StorageInfo(ctx context.Context) (StorageInfo, []error)
	return getStorageInfo(disks, endpoints)
}

// getOnlineDisksWithHealing reports whether any disk in the set is currently
// being healed and returns the remaining disks to be used for crawling.
func (er erasureObjects) getOnlineDisksWithHealing() (newDisks []StorageAPI, healing bool) {
	var wg sync.WaitGroup
	disks := er.getDisks()
	infos := make([]DiskInfo, len(disks))
	for _, i := range hashOrder(UTCNow().String(), len(disks)) {
		i := i
		wg.Add(1)
		go func() {
			defer wg.Done()
			disk := disks[i-1]
			if disk == nil {
				return
			}
			di, err := disk.DiskInfo(context.Background())
			if err != nil {
				// - Do not consume disks which are not reachable,
				//   unformatted, or simply not accessible for some reason.
				//
				// - Future: skip busy disks
				return
			}
			infos[i-1] = di
		}()
	}
	wg.Wait()

	for i, info := range infos {
		// Check if one of the drives in the set is being healed.
		// This information is used by the crawler to skip healing
		// this erasure set while it calculates the usage.
		if info.Healing {
			healing = true
			continue
		}
		newDisks = append(newDisks, disks[i])
	}
	return newDisks, healing
}
// crawlAndGetDataUsage will start crawling buckets and send updated totals as they are traversed.
// Updates are sent on a regular basis and the caller *must* consume them.
func (er erasureObjects) crawlAndGetDataUsage(ctx context.Context, buckets []BucketInfo, bf *bloomFilter, updates chan<- dataUsageCache) error {
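The helper added above probes every disk concurrently (in a hash-randomized order via hashOrder) and then partitions the results. Below is a simplified, self-contained sketch of the same probe-and-partition pattern; diskStatus, checkDisk, and splitOnlineAndHealing are stand-ins for MinIO's StorageAPI/DiskInfo, the randomized ordering is omitted, and offline probes are dropped explicitly rather than via DiskInfo errors.

package main

import (
	"fmt"
	"sync"
)

// diskStatus stands in for the few DiskInfo fields the helper looks at.
type diskStatus struct {
	online  bool
	healing bool
}

// splitOnlineAndHealing probes every disk concurrently, drops disks that are
// offline or currently healing, and reports whether any disk in the set has
// an ongoing heal.
func splitOnlineAndHealing(disks []string, checkDisk func(string) diskStatus) (online []string, healing bool) {
	infos := make([]diskStatus, len(disks))
	var wg sync.WaitGroup
	for i := range disks {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			infos[i] = checkDisk(disks[i]) // each disk is probed in its own goroutine
		}(i)
	}
	wg.Wait()

	for i, info := range infos {
		if !info.online {
			continue // unreachable or unformatted: not usable for the crawl
		}
		if info.healing {
			healing = true // remember the set has an ongoing heal
			continue       // and keep the healing disk out of the crawl
		}
		online = append(online, disks[i])
	}
	return online, healing
}

func main() {
	status := map[string]diskStatus{
		"disk1": {online: true},
		"disk2": {online: true, healing: true},
		"disk3": {},
	}
	online, healing := splitOnlineAndHealing([]string{"disk1", "disk2", "disk3"},
		func(d string) diskStatus { return status[d] })
	fmt.Println(online, healing) // [disk1] true
}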
@@ -231,8 +276,8 @@ func (er erasureObjects) crawlAndGetDataUsage(ctx context.Context, buckets []Buc
		return nil
	}

	// Collect disks we can use, sorted by least inode usage.
	disks := er.getOnlineDisksSortedByUsedInodes()
	// Collect disks we can use.
	disks, healing := er.getOnlineDisksWithHealing()
	if len(disks) == 0 {
		logger.Info(color.Green("data-crawl:") + " all disks are offline or being healed, skipping crawl")
		return nil
@@ -247,6 +292,11 @@ func (er erasureObjects) crawlAndGetDataUsage(ctx context.Context, buckets []Buc
			continue
		}
		id, _ := disk.GetDiskID()
		if id == "" {
			// it's possible that the disk is unformatted
			// or just went offline
			continue
		}
		allDiskIDs = append(allDiskIDs, id)
	}
@@ -348,6 +398,7 @@ func (er erasureObjects) crawlAndGetDataUsage(ctx context.Context, buckets []Buc
			cache.Info.Name = bucket.Name
		}
		cache.Info.BloomFilter = bloom
		cache.Info.SkipHealing = healing
		cache.Disks = allDiskIDs
		if cache.Info.Name != bucket.Name {
			logger.LogIf(ctx, fmt.Errorf("cache name mismatch: %s != %s", cache.Info.Name, bucket.Name))
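Putting the pieces together, a hedged sketch of how the healing flag might be wired into a per-bucket cache before the crawl starts; cacheInfo, crawlBucket, and the object loop are illustrative names, and only the SkipHealing field mirrors what the diff above adds.

package main

import "fmt"

// cacheInfo carries only the field this change is about: whether healing
// should be skipped while the bucket's usage is being calculated.
type cacheInfo struct {
	Name        string
	SkipHealing bool
}

// crawlBucket is a sketch of the per-bucket entry point: the healing flag
// returned by the disk probe is recorded on the cache, and every object
// scan consults it before attempting any heal work.
func crawlBucket(bucket string, healing bool, objects []string) cacheInfo {
	cache := cacheInfo{Name: bucket, SkipHealing: healing}
	for _, obj := range objects {
		if cache.SkipHealing {
			continue // usage only; the ongoing heal already covers this set
		}
		_ = obj // here the real crawler would queue obj for a heal check
	}
	return cache
}

func main() {
	cache := crawlBucket("photos", true, []string{"a.jpg", "b.jpg"})
	fmt.Println(cache.Name, cache.SkipHealing) // photos true
}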