mirror of
https://github.com/minio/minio.git
synced 2025-11-07 12:52:58 -05:00
add healing retries when there are failed heal attempts (#19986)
Transient errors are normal for long-running tasks, so allow the drive to retry healing up to 3 times before giving up on healing the drive.
This commit is contained in:
@@ -88,6 +88,8 @@ type healingTracker struct {
|
||||
|
||||
ItemsSkipped uint64
|
||||
BytesSkipped uint64
|
||||
|
||||
RetryAttempts uint64
|
||||
// Add future tracking capabilities
|
||||
// Be sure that they are included in toHealingDisk
|
||||
}
|
||||
@@ -382,6 +384,8 @@ func getLocalDisksToHeal() (disksToHeal Endpoints) {
|
||||
|
||||
var newDiskHealingTimeout = newDynamicTimeout(30*time.Second, 10*time.Second)
|
||||
|
||||
var errRetryHealing = errors.New("some items failed to heal, we will retry healing this drive again")
|
||||
|
||||
func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint) error {
|
||||
poolIdx, setIdx := endpoint.PoolIdx, endpoint.SetIdx
|
||||
disk := getStorageViaEndpoint(endpoint)
|
||||
@@ -451,8 +455,27 @@ func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint
|
||||
return err
|
||||
}
|
||||
|
||||
healingLogEvent(ctx, "Healing of drive '%s' is finished (healed: %d, skipped: %d, failed: %d).", disk, tracker.ItemsHealed, tracker.ItemsSkipped, tracker.ItemsFailed)
|
||||
// If objects have failed healing, we retry healing the drive up to 3 times before giving up.
|
||||
if tracker.ItemsFailed > 0 && tracker.RetryAttempts < 4 {
|
||||
tracker.RetryAttempts++
|
||||
bugLogIf(ctx, tracker.update(ctx))
|
||||
|
||||
healingLogEvent(ctx, "Healing of drive '%s' is incomplete, retrying %s time (healed: %d, skipped: %d, failed: %d).", disk,
|
||||
humanize.Ordinal(int(tracker.RetryAttempts)), tracker.ItemsHealed, tracker.ItemsSkipped, tracker.ItemsFailed)
|
||||
return errRetryHealing
|
||||
}
|
||||
|
||||
if tracker.ItemsFailed > 0 {
|
||||
healingLogEvent(ctx, "Healing of drive '%s' is incomplete, retried %d times (healed: %d, skipped: %d, failed: %d).", disk,
|
||||
tracker.RetryAttempts-1, tracker.ItemsHealed, tracker.ItemsSkipped, tracker.ItemsFailed)
|
||||
} else {
|
||||
if tracker.RetryAttempts > 0 {
|
||||
healingLogEvent(ctx, "Healing of drive '%s' is complete, retried %d times (healed: %d, skipped: %d).", disk,
|
||||
tracker.RetryAttempts-1, tracker.ItemsHealed, tracker.ItemsSkipped)
|
||||
} else {
|
||||
healingLogEvent(ctx, "Healing of drive '%s' is finished (healed: %d, skipped: %d).", disk, tracker.ItemsHealed, tracker.ItemsSkipped)
|
||||
}
|
||||
}
|
||||
if serverDebugLog {
|
||||
tracker.printTo(os.Stdout)
|
||||
fmt.Printf("\n")
|
||||
@@ -524,7 +547,7 @@ func monitorLocalDisksAndHeal(ctx context.Context, z *erasureServerPools) {
|
||||
if err := healFreshDisk(ctx, z, disk); err != nil {
|
||||
globalBackgroundHealState.setDiskHealingStatus(disk, false)
|
||||
timedout := OperationTimedOut{}
|
||||
if !errors.Is(err, context.Canceled) && !errors.As(err, &timedout) {
|
||||
if !errors.Is(err, context.Canceled) && !errors.As(err, &timedout) && !errors.Is(err, errRetryHealing) {
|
||||
printEndpointError(disk, err, false)
|
||||
}
|
||||
return
|
||||
|
||||
Reference in New Issue
Block a user