mirror of
https://github.com/minio/minio.git
synced 2025-01-11 23:13:23 -05:00
retry and resume decom operation upon retriable failures (#15244)
it is possible in a k8s-like system reading pool.bin might not have quorum during startup, however, add a way to retry after this failure.
This commit is contained in:
parent
c1901f4e12
commit
5802df4365
@ -22,6 +22,7 @@ import (
|
||||
"encoding/binary"
|
||||
"errors"
|
||||
"fmt"
|
||||
"math/rand"
|
||||
"net/http"
|
||||
"sort"
|
||||
"strconv"
|
||||
@ -523,14 +524,26 @@ func (z *erasureServerPools) Init(ctx context.Context) error {
|
||||
}
|
||||
if globalEndpoints[idx].Endpoints[0].IsLocal {
|
||||
go func(pool PoolStatus) {
|
||||
switch err := z.Decommission(ctx, pool.ID); err {
|
||||
case nil:
|
||||
r := rand.New(rand.NewSource(time.Now().UnixNano()))
|
||||
for {
|
||||
if err := z.Decommission(ctx, pool.ID); err != nil {
|
||||
switch err {
|
||||
// we already started decommission
|
||||
case errDecommissionAlreadyRunning:
|
||||
// A previous decommission running found restart it.
|
||||
z.doDecommissionInRoutine(ctx, idx)
|
||||
return
|
||||
default:
|
||||
if configRetriableErrors(err) {
|
||||
logger.LogIf(ctx, fmt.Errorf("Unable to resume decommission of pool %v: %w: retrying..", pool, err))
|
||||
time.Sleep(time.Second + time.Duration(r.Float64()*float64(5*time.Second)))
|
||||
continue
|
||||
}
|
||||
logger.LogIf(ctx, fmt.Errorf("Unable to resume decommission of pool %v: %w", pool, err))
|
||||
return
|
||||
}
|
||||
}
|
||||
break
|
||||
}
|
||||
}(pool)
|
||||
}
|
||||
@ -984,7 +997,9 @@ func (z *erasureServerPools) DecommissionCancel(ctx context.Context, idx int) (e
|
||||
defer z.poolMetaMutex.Unlock()
|
||||
|
||||
if z.poolMeta.DecommissionCancel(idx) {
|
||||
defer z.decommissionCancelers[idx]() // cancel any active thread.
|
||||
if fn := z.decommissionCancelers[idx]; fn != nil {
|
||||
defer fn() // cancel any active thread.
|
||||
}
|
||||
if err = z.poolMeta.save(ctx, z.serverPools); err != nil {
|
||||
return err
|
||||
}
|
||||
@ -1006,7 +1021,9 @@ func (z *erasureServerPools) DecommissionFailed(ctx context.Context, idx int) (e
|
||||
defer z.poolMetaMutex.Unlock()
|
||||
|
||||
if z.poolMeta.DecommissionFailed(idx) {
|
||||
defer z.decommissionCancelers[idx]() // cancel any active thread.
|
||||
if fn := z.decommissionCancelers[idx]; fn != nil {
|
||||
defer fn() // cancel any active thread.
|
||||
}
|
||||
if err = z.poolMeta.save(ctx, z.serverPools); err != nil {
|
||||
return err
|
||||
}
|
||||
@ -1028,7 +1045,9 @@ func (z *erasureServerPools) CompleteDecommission(ctx context.Context, idx int)
|
||||
defer z.poolMetaMutex.Unlock()
|
||||
|
||||
if z.poolMeta.DecommissionComplete(idx) {
|
||||
defer z.decommissionCancelers[idx]() // cancel any active thread.
|
||||
if fn := z.decommissionCancelers[idx]; fn != nil {
|
||||
defer fn() // cancel any active thread.
|
||||
}
|
||||
if err = z.poolMeta.save(ctx, z.serverPools); err != nil {
|
||||
return err
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user