mirror of https://github.com/minio/minio.git
add healing retries when there are failed heal attempts (#19986)
Transient errors are normal for long-running tasks, so allow healing of a drive to be retried up to 3 times before giving up on healing the drive.
This commit is contained in:
parent
41f508765d
commit
22c5a5b91b
|
@ -88,6 +88,8 @@ type healingTracker struct {
|
||||||
|
|
||||||
ItemsSkipped uint64
|
ItemsSkipped uint64
|
||||||
BytesSkipped uint64
|
BytesSkipped uint64
|
||||||
|
|
||||||
|
RetryAttempts uint64
|
||||||
// Add future tracking capabilities
|
// Add future tracking capabilities
|
||||||
// Be sure that they are included in toHealingDisk
|
// Be sure that they are included in toHealingDisk
|
||||||
}
|
}
|
||||||
|
@ -382,6 +384,8 @@ func getLocalDisksToHeal() (disksToHeal Endpoints) {
|
||||||
|
|
||||||
var newDiskHealingTimeout = newDynamicTimeout(30*time.Second, 10*time.Second)
|
var newDiskHealingTimeout = newDynamicTimeout(30*time.Second, 10*time.Second)
|
||||||
|
|
||||||
|
var errRetryHealing = errors.New("some items failed to heal, we will retry healing this drive again")
|
||||||
|
|
||||||
func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint) error {
|
func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint) error {
|
||||||
poolIdx, setIdx := endpoint.PoolIdx, endpoint.SetIdx
|
poolIdx, setIdx := endpoint.PoolIdx, endpoint.SetIdx
|
||||||
disk := getStorageViaEndpoint(endpoint)
|
disk := getStorageViaEndpoint(endpoint)
|
||||||
|
@ -451,8 +455,27 @@ func healFreshDisk(ctx context.Context, z *erasureServerPools, endpoint Endpoint
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
healingLogEvent(ctx, "Healing of drive '%s' is finished (healed: %d, skipped: %d, failed: %d).", disk, tracker.ItemsHealed, tracker.ItemsSkipped, tracker.ItemsFailed)
|
// if objects have failed healing, we attempt a retry to heal the drive up to 3 times before giving up.
|
||||||
|
if tracker.ItemsFailed > 0 && tracker.RetryAttempts < 4 {
|
||||||
|
tracker.RetryAttempts++
|
||||||
|
bugLogIf(ctx, tracker.update(ctx))
|
||||||
|
|
||||||
|
healingLogEvent(ctx, "Healing of drive '%s' is incomplete, retrying %s time (healed: %d, skipped: %d, failed: %d).", disk,
|
||||||
|
humanize.Ordinal(int(tracker.RetryAttempts)), tracker.ItemsHealed, tracker.ItemsSkipped, tracker.ItemsFailed)
|
||||||
|
return errRetryHealing
|
||||||
|
}
|
||||||
|
|
||||||
|
if tracker.ItemsFailed > 0 {
|
||||||
|
healingLogEvent(ctx, "Healing of drive '%s' is incomplete, retried %d times (healed: %d, skipped: %d, failed: %d).", disk,
|
||||||
|
tracker.RetryAttempts-1, tracker.ItemsHealed, tracker.ItemsSkipped, tracker.ItemsFailed)
|
||||||
|
} else {
|
||||||
|
if tracker.RetryAttempts > 0 {
|
||||||
|
healingLogEvent(ctx, "Healing of drive '%s' is complete, retried %d times (healed: %d, skipped: %d).", disk,
|
||||||
|
tracker.RetryAttempts-1, tracker.ItemsHealed, tracker.ItemsSkipped)
|
||||||
|
} else {
|
||||||
|
healingLogEvent(ctx, "Healing of drive '%s' is finished (healed: %d, skipped: %d).", disk, tracker.ItemsHealed, tracker.ItemsSkipped)
|
||||||
|
}
|
||||||
|
}
|
||||||
if serverDebugLog {
|
if serverDebugLog {
|
||||||
tracker.printTo(os.Stdout)
|
tracker.printTo(os.Stdout)
|
||||||
fmt.Printf("\n")
|
fmt.Printf("\n")
|
||||||
|
@ -524,7 +547,7 @@ func monitorLocalDisksAndHeal(ctx context.Context, z *erasureServerPools) {
|
||||||
if err := healFreshDisk(ctx, z, disk); err != nil {
|
if err := healFreshDisk(ctx, z, disk); err != nil {
|
||||||
globalBackgroundHealState.setDiskHealingStatus(disk, false)
|
globalBackgroundHealState.setDiskHealingStatus(disk, false)
|
||||||
timedout := OperationTimedOut{}
|
timedout := OperationTimedOut{}
|
||||||
if !errors.Is(err, context.Canceled) && !errors.As(err, &timedout) {
|
if !errors.Is(err, context.Canceled) && !errors.As(err, &timedout) && !errors.Is(err, errRetryHealing) {
|
||||||
printEndpointError(disk, err, false)
|
printEndpointError(disk, err, false)
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
|
|
|
@ -200,6 +200,12 @@ func (z *healingTracker) DecodeMsg(dc *msgp.Reader) (err error) {
|
||||||
err = msgp.WrapError(err, "BytesSkipped")
|
err = msgp.WrapError(err, "BytesSkipped")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
case "RetryAttempts":
|
||||||
|
z.RetryAttempts, err = dc.ReadUint64()
|
||||||
|
if err != nil {
|
||||||
|
err = msgp.WrapError(err, "RetryAttempts")
|
||||||
|
return
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
err = dc.Skip()
|
err = dc.Skip()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -213,9 +219,9 @@ func (z *healingTracker) DecodeMsg(dc *msgp.Reader) (err error) {
|
||||||
|
|
||||||
// EncodeMsg implements msgp.Encodable
|
// EncodeMsg implements msgp.Encodable
|
||||||
func (z *healingTracker) EncodeMsg(en *msgp.Writer) (err error) {
|
func (z *healingTracker) EncodeMsg(en *msgp.Writer) (err error) {
|
||||||
// map header, size 25
|
// map header, size 26
|
||||||
// write "ID"
|
// write "ID"
|
||||||
err = en.Append(0xde, 0x0, 0x19, 0xa2, 0x49, 0x44)
|
err = en.Append(0xde, 0x0, 0x1a, 0xa2, 0x49, 0x44)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
@ -478,15 +484,25 @@ func (z *healingTracker) EncodeMsg(en *msgp.Writer) (err error) {
|
||||||
err = msgp.WrapError(err, "BytesSkipped")
|
err = msgp.WrapError(err, "BytesSkipped")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
// write "RetryAttempts"
|
||||||
|
err = en.Append(0xad, 0x52, 0x65, 0x74, 0x72, 0x79, 0x41, 0x74, 0x74, 0x65, 0x6d, 0x70, 0x74, 0x73)
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
err = en.WriteUint64(z.RetryAttempts)
|
||||||
|
if err != nil {
|
||||||
|
err = msgp.WrapError(err, "RetryAttempts")
|
||||||
|
return
|
||||||
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// MarshalMsg implements msgp.Marshaler
|
// MarshalMsg implements msgp.Marshaler
|
||||||
func (z *healingTracker) MarshalMsg(b []byte) (o []byte, err error) {
|
func (z *healingTracker) MarshalMsg(b []byte) (o []byte, err error) {
|
||||||
o = msgp.Require(b, z.Msgsize())
|
o = msgp.Require(b, z.Msgsize())
|
||||||
// map header, size 25
|
// map header, size 26
|
||||||
// string "ID"
|
// string "ID"
|
||||||
o = append(o, 0xde, 0x0, 0x19, 0xa2, 0x49, 0x44)
|
o = append(o, 0xde, 0x0, 0x1a, 0xa2, 0x49, 0x44)
|
||||||
o = msgp.AppendString(o, z.ID)
|
o = msgp.AppendString(o, z.ID)
|
||||||
// string "PoolIndex"
|
// string "PoolIndex"
|
||||||
o = append(o, 0xa9, 0x50, 0x6f, 0x6f, 0x6c, 0x49, 0x6e, 0x64, 0x65, 0x78)
|
o = append(o, 0xa9, 0x50, 0x6f, 0x6f, 0x6c, 0x49, 0x6e, 0x64, 0x65, 0x78)
|
||||||
|
@ -566,6 +582,9 @@ func (z *healingTracker) MarshalMsg(b []byte) (o []byte, err error) {
|
||||||
// string "BytesSkipped"
|
// string "BytesSkipped"
|
||||||
o = append(o, 0xac, 0x42, 0x79, 0x74, 0x65, 0x73, 0x53, 0x6b, 0x69, 0x70, 0x70, 0x65, 0x64)
|
o = append(o, 0xac, 0x42, 0x79, 0x74, 0x65, 0x73, 0x53, 0x6b, 0x69, 0x70, 0x70, 0x65, 0x64)
|
||||||
o = msgp.AppendUint64(o, z.BytesSkipped)
|
o = msgp.AppendUint64(o, z.BytesSkipped)
|
||||||
|
// string "RetryAttempts"
|
||||||
|
o = append(o, 0xad, 0x52, 0x65, 0x74, 0x72, 0x79, 0x41, 0x74, 0x74, 0x65, 0x6d, 0x70, 0x74, 0x73)
|
||||||
|
o = msgp.AppendUint64(o, z.RetryAttempts)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -763,6 +782,12 @@ func (z *healingTracker) UnmarshalMsg(bts []byte) (o []byte, err error) {
|
||||||
err = msgp.WrapError(err, "BytesSkipped")
|
err = msgp.WrapError(err, "BytesSkipped")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
case "RetryAttempts":
|
||||||
|
z.RetryAttempts, bts, err = msgp.ReadUint64Bytes(bts)
|
||||||
|
if err != nil {
|
||||||
|
err = msgp.WrapError(err, "RetryAttempts")
|
||||||
|
return
|
||||||
|
}
|
||||||
default:
|
default:
|
||||||
bts, err = msgp.Skip(bts)
|
bts, err = msgp.Skip(bts)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -785,6 +810,6 @@ func (z *healingTracker) Msgsize() (s int) {
|
||||||
for za0002 := range z.HealedBuckets {
|
for za0002 := range z.HealedBuckets {
|
||||||
s += msgp.StringPrefixSize + len(z.HealedBuckets[za0002])
|
s += msgp.StringPrefixSize + len(z.HealedBuckets[za0002])
|
||||||
}
|
}
|
||||||
s += 7 + msgp.StringPrefixSize + len(z.HealID) + 13 + msgp.Uint64Size + 13 + msgp.Uint64Size
|
s += 7 + msgp.StringPrefixSize + len(z.HealID) + 13 + msgp.Uint64Size + 13 + msgp.Uint64Size + 14 + msgp.Uint64Size
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue