From b7f319b62ab605e4f0ab3dd0105cadcbf4c76875 Mon Sep 17 00:00:00 2001
From: Anis Eleuch
Date: Thu, 25 Jul 2024 00:30:33 +0100
Subject: [PATCH] properly reload a fresh drive when found in a failed state
 during startup (#20145)

When a drive is in a failed state while a single-node, multiple-drive
deployment is started, a fresh replacement disk will not be properly
healed unless the user restarts the node.

Fix this by always adding the new fresh disk to globalLocalDrivesMap.
Also remove globalLocalDrives for simplification; a map can still be
used to store a node's local drives, since the order of a node's local
drives is not defined.
---
 cmd/background-newdisks-heal-ops.go |  2 +-
 cmd/bucket-replication.go           |  4 ++--
 cmd/erasure-server-pool.go          |  2 +-
 cmd/erasure-sets.go                 | 16 ++--------------
 cmd/globals.go                      |  7 +++----
 cmd/metrics-resource.go             |  2 +-
 cmd/peer-rest-server.go             |  2 +-
 cmd/peer-s3-server.go               | 20 +++++++++++---------
 cmd/storage-rest-server.go          |  2 +-
 9 files changed, 23 insertions(+), 34 deletions(-)

diff --git a/cmd/background-newdisks-heal-ops.go b/cmd/background-newdisks-heal-ops.go
index 8cf612c1d..bd17a5e1f 100644
--- a/cmd/background-newdisks-heal-ops.go
+++ b/cmd/background-newdisks-heal-ops.go
@@ -362,7 +362,7 @@ func initAutoHeal(ctx context.Context, objAPI ObjectLayer) {
 
 func getLocalDisksToHeal() (disksToHeal Endpoints) {
 	globalLocalDrivesMu.RLock()
-	localDrives := cloneDrives(globalLocalDrives)
+	localDrives := cloneDrives(globalLocalDrivesMap)
 	globalLocalDrivesMu.RUnlock()
 	for _, disk := range localDrives {
 		_, err := disk.DiskInfo(context.Background(), DiskInfoOptions{})
diff --git a/cmd/bucket-replication.go b/cmd/bucket-replication.go
index c71a5fc17..93bef14cd 100644
--- a/cmd/bucket-replication.go
+++ b/cmd/bucket-replication.go
@@ -3553,7 +3553,7 @@ func (p *ReplicationPool) persistToDrive(ctx context.Context, v MRFReplicateEntries) {
 	}
 
 	globalLocalDrivesMu.RLock()
-	localDrives := cloneDrives(globalLocalDrives)
+	localDrives := cloneDrives(globalLocalDrivesMap)
 	globalLocalDrivesMu.RUnlock()
 
 	for _, localDrive := range localDrives {
@@ -3620,7 +3620,7 @@ func (p *ReplicationPool) loadMRF() (mrfRec MRFReplicateEntries, err error) {
 	}
 
 	globalLocalDrivesMu.RLock()
-	localDrives := cloneDrives(globalLocalDrives)
+	localDrives := cloneDrives(globalLocalDrivesMap)
 	globalLocalDrivesMu.RUnlock()
 
 	for _, localDrive := range localDrives {
diff --git a/cmd/erasure-server-pool.go b/cmd/erasure-server-pool.go
index 687b04f4d..abf1ff602 100644
--- a/cmd/erasure-server-pool.go
+++ b/cmd/erasure-server-pool.go
@@ -168,7 +168,7 @@ func newErasureServerPools(ctx context.Context, endpointServerPools EndpointServerPools) (ObjectLayer, error) {
 
 	if !globalIsDistErasure {
 		globalLocalDrivesMu.Lock()
-		globalLocalDrives = localDrives
+		globalLocalDrivesMap = make(map[string]StorageAPI, len(localDrives))
 		for _, drive := range localDrives {
 			globalLocalDrivesMap[drive.Endpoint().String()] = drive
 		}
diff --git a/cmd/erasure-sets.go b/cmd/erasure-sets.go
index b35ff797a..089fad9f2 100644
--- a/cmd/erasure-sets.go
+++ b/cmd/erasure-sets.go
@@ -262,13 +262,7 @@ func (s *erasureSets) connectDisks(log bool) {
 			if globalIsDistErasure {
 				globalLocalSetDrives[s.poolIndex][setIndex][diskIndex] = disk
 			}
-			for i, ldisk := range globalLocalDrives {
-				_, k, l := ldisk.GetDiskLoc()
-				if k == setIndex && l == diskIndex {
-					globalLocalDrives[i] = disk
-					break
-				}
-			}
+			globalLocalDrivesMap[disk.Endpoint().String()] = disk
 			globalLocalDrivesMu.Unlock()
 		}
 		s.erasureDisksMu.Unlock()
@@ -1135,13 +1129,7 @@ func (s *erasureSets) HealFormat(ctx context.Context, dryRun bool) (res madmin.HealResultItem, err error) {
 				if globalIsDistErasure {
 					globalLocalSetDrives[s.poolIndex][m][n] = disk
 				}
-				for i, ldisk := range globalLocalDrives {
-					_, k, l := ldisk.GetDiskLoc()
-					if k == m && l == n {
-						globalLocalDrives[i] = disk
-						break
-					}
-				}
+				globalLocalDrivesMap[disk.Endpoint().String()] = disk
 				globalLocalDrivesMu.Unlock()
 			}
 		}
diff --git a/cmd/globals.go b/cmd/globals.go
index 82489b3d6..e5bea1fba 100644
--- a/cmd/globals.go
+++ b/cmd/globals.go
@@ -414,10 +414,9 @@ var (
 	globalServiceFreezeCnt int32
 	globalServiceFreezeMu  sync.Mutex // Updates.
 
-	// List of local drives to this node, this is only set during server startup,
-	// and is only mutated by HealFormat. Hold globalLocalDrivesMu to access.
-	globalLocalDrives    []StorageAPI
-	globalLocalDrivesMap = make(map[string]StorageAPI)
+	// Map of local drives to this node, this is set during server startup,
+	// disk reconnect and mutated by HealFormat. Hold globalLocalDrivesMu to access.
+	globalLocalDrivesMap map[string]StorageAPI
 	globalLocalDrivesMu  sync.RWMutex
 
 	globalDriveMonitoring = env.Get("_MINIO_DRIVE_ACTIVE_MONITORING", config.EnableOn) == config.EnableOn
diff --git a/cmd/metrics-resource.go b/cmd/metrics-resource.go
index e3ae408eb..a34d8091b 100644
--- a/cmd/metrics-resource.go
+++ b/cmd/metrics-resource.go
@@ -262,7 +262,7 @@ func collectDriveMetrics(m madmin.RealtimeMetrics) {
 	latestDriveStatsMu.Unlock()
 
 	globalLocalDrivesMu.RLock()
-	localDrives := cloneDrives(globalLocalDrives)
+	localDrives := cloneDrives(globalLocalDrivesMap)
 	globalLocalDrivesMu.RUnlock()
 
 	for _, d := range localDrives {
diff --git a/cmd/peer-rest-server.go b/cmd/peer-rest-server.go
index 00005d07b..ef18d0aea 100644
--- a/cmd/peer-rest-server.go
+++ b/cmd/peer-rest-server.go
@@ -664,7 +664,7 @@ var errUnsupportedSignal = fmt.Errorf("unsupported signal")
 
 func waitingDrivesNode() map[string]madmin.DiskMetrics {
 	globalLocalDrivesMu.RLock()
-	localDrives := cloneDrives(globalLocalDrives)
+	localDrives := cloneDrives(globalLocalDrivesMap)
 	globalLocalDrivesMu.RUnlock()
 
 	errs := make([]error, len(localDrives))
diff --git a/cmd/peer-s3-server.go b/cmd/peer-s3-server.go
index 8e9443c2a..5fc254305 100644
--- a/cmd/peer-s3-server.go
+++ b/cmd/peer-s3-server.go
@@ -34,7 +34,7 @@ const (
 
 func healBucketLocal(ctx context.Context, bucket string, opts madmin.HealOpts) (res madmin.HealResultItem, err error) {
 	globalLocalDrivesMu.RLock()
-	localDrives := cloneDrives(globalLocalDrives)
+	localDrives := cloneDrives(globalLocalDrivesMap)
 	globalLocalDrivesMu.RUnlock()
 
 	// Initialize sync waitgroup.
@@ -158,7 +158,7 @@ func healBucketLocal(ctx context.Context, bucket string, opts madmin.HealOpts) (res madmin.HealResultItem, err error) {
 
 func listBucketsLocal(ctx context.Context, opts BucketOptions) (buckets []BucketInfo, err error) {
 	globalLocalDrivesMu.RLock()
-	localDrives := cloneDrives(globalLocalDrives)
+	localDrives := cloneDrives(globalLocalDrivesMap)
 	globalLocalDrivesMu.RUnlock()
 
 	quorum := (len(localDrives) / 2)
@@ -204,15 +204,17 @@ func listBucketsLocal(ctx context.Context, opts BucketOptions) (buckets []BucketInfo, err error) {
 	return buckets, nil
 }
 
-func cloneDrives(drives []StorageAPI) []StorageAPI {
-	newDrives := make([]StorageAPI, len(drives))
-	copy(newDrives, drives)
-	return newDrives
+func cloneDrives(drives map[string]StorageAPI) []StorageAPI {
+	copyDrives := make([]StorageAPI, 0, len(drives))
+	for _, drive := range drives {
+		copyDrives = append(copyDrives, drive)
+	}
+	return copyDrives
 }
 
 func getBucketInfoLocal(ctx context.Context, bucket string, opts BucketOptions) (BucketInfo, error) {
 	globalLocalDrivesMu.RLock()
-	localDrives := cloneDrives(globalLocalDrives)
+	localDrives := cloneDrives(globalLocalDrivesMap)
 	globalLocalDrivesMu.RUnlock()
 
 	g := errgroup.WithNErrs(len(localDrives)).WithConcurrency(32)
@@ -261,7 +263,7 @@ func getBucketInfoLocal(ctx context.Context, bucket string, opts BucketOptions) (BucketInfo, error) {
 
 func deleteBucketLocal(ctx context.Context, bucket string, opts DeleteBucketOptions) error {
 	globalLocalDrivesMu.RLock()
-	localDrives := cloneDrives(globalLocalDrives)
+	localDrives := cloneDrives(globalLocalDrivesMap)
 	globalLocalDrivesMu.RUnlock()
 
 	g := errgroup.WithNErrs(len(localDrives)).WithConcurrency(32)
@@ -299,7 +301,7 @@ func deleteBucketLocal(ctx context.Context, bucket string, opts DeleteBucketOptions) error {
 
 func makeBucketLocal(ctx context.Context, bucket string, opts MakeBucketOptions) error {
 	globalLocalDrivesMu.RLock()
-	localDrives := cloneDrives(globalLocalDrives)
+	localDrives := cloneDrives(globalLocalDrivesMap)
 	globalLocalDrivesMu.RUnlock()
 
 	g := errgroup.WithNErrs(len(localDrives)).WithConcurrency(32)
diff --git a/cmd/storage-rest-server.go b/cmd/storage-rest-server.go
index 75fad25f0..ab34552c9 100644
--- a/cmd/storage-rest-server.go
+++ b/cmd/storage-rest-server.go
@@ -1340,6 +1340,7 @@ func registerStorageRESTHandlers(router *mux.Router, endpointServerPools EndpointServerPools, gm *grid.Manager) {
 		return collectInternodeStats(httpTraceHdrs(f))
 	}
 
+	globalLocalDrivesMap = make(map[string]StorageAPI)
 	globalLocalSetDrives = make([][][]StorageAPI, len(endpointServerPools))
 	for pool := range globalLocalSetDrives {
 		globalLocalSetDrives[pool] = make([][]StorageAPI, endpointServerPools[pool].SetCount)
@@ -1413,7 +1414,6 @@ func registerStorageRESTHandlers(router *mux.Router, endpointServerPools EndpointServerPools, gm *grid.Manager) {
 			globalLocalDrivesMu.Lock()
 			defer globalLocalDrivesMu.Unlock()
 
-			globalLocalDrives = append(globalLocalDrives, storage)
 			globalLocalDrivesMap[endpoint.String()] = storage
 			globalLocalSetDrives[endpoint.PoolIdx][endpoint.SetIdx][endpoint.DiskIdx] = storage
 			return true
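
The pattern every call site settles on after this change is snapshot-under-read-lock: take globalLocalDrivesMu.RLock(), flatten the map into a slice with cloneDrives, release the lock, then iterate the snapshot; drive registration writes by endpoint key, so a fresh disk replacing a failed one overwrites its own slot instead of requiring a positional scan of a slice. The sketch below is a minimal, self-contained illustration of that idea and is not MinIO code: StorageAPI is trimmed to a single method, and fakeDrive and registerDrive are made-up stand-ins; only the names cloneDrives, globalLocalDrivesMap, and globalLocalDrivesMu mirror the patch.

// Minimal sketch of the snapshot-under-read-lock pattern used above.
// Illustrative only: StorageAPI is trimmed to one method, fakeDrive
// and registerDrive are hypothetical stand-ins; only cloneDrives,
// globalLocalDrivesMap, and globalLocalDrivesMu mirror the patch.
package main

import (
	"fmt"
	"sync"
)

// StorageAPI stands in for MinIO's storage interface; for this sketch
// a drive only needs to report its endpoint.
type StorageAPI interface {
	String() string
}

type fakeDrive struct{ endpoint string }

func (d *fakeDrive) String() string { return d.endpoint }

var (
	// Keyed by endpoint, so a replaced or reconnected drive simply
	// overwrites its own slot; no positional scan is needed.
	globalLocalDrivesMap = make(map[string]StorageAPI)
	globalLocalDrivesMu  sync.RWMutex
)

// cloneDrives flattens the map into a slice while the caller holds the
// read lock, as at every call site in the patch. Go map iteration order
// is undefined, which is fine here: the commit message notes that the
// order of a node's local drives carries no meaning.
func cloneDrives(drives map[string]StorageAPI) []StorageAPI {
	copyDrives := make([]StorageAPI, 0, len(drives))
	for _, drive := range drives {
		copyDrives = append(copyDrives, drive)
	}
	return copyDrives
}

// registerDrive is a hypothetical helper modeled on the
// globalLocalDrivesMap[endpoint.String()] = storage sites in the patch.
func registerDrive(d StorageAPI) {
	globalLocalDrivesMu.Lock()
	defer globalLocalDrivesMu.Unlock()
	globalLocalDrivesMap[d.String()] = d
}

func main() {
	registerDrive(&fakeDrive{endpoint: "http://node1/mnt/drive1"})
	registerDrive(&fakeDrive{endpoint: "http://node1/mnt/drive2"})

	// A fresh disk replacing a failed one re-registers under the same
	// endpoint key, overwriting the stale handle in place.
	registerDrive(&fakeDrive{endpoint: "http://node1/mnt/drive1"})

	globalLocalDrivesMu.RLock()
	localDrives := cloneDrives(globalLocalDrivesMap)
	globalLocalDrivesMu.RUnlock()

	fmt.Println("drives visible to callers:", len(localDrives)) // 2
}

Because registration by key is idempotent, a reconnected or replaced drive always lands in the map regardless of when it appears, which is why the heal-on-startup case in the commit message no longer needs a node restart; the cost is giving up slice order, which was never meaningful for a node's local drives.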