properly reload a fresh drive when found in a failed state during startup (#20145)

When a single node, multiple drives deployment is started while one of its
drives is in a failed state, a fresh replacement disk is not properly
healed unless the user restarts the node.

Fix this by always adding a fresh disk to globalLocalDrivesMap. Also
remove globalLocalDrives to simplify the code; a map is sufficient to
store a node's local drives, since the order of local drives on a node is
not defined anyway.
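As a rough illustration only (not MinIO code; the drive type, helper names, and endpoint strings below are simplified stand-ins), the sketch shows why a map keyed by the drive endpoint is enough here: a reconnecting or freshly replaced disk always inserts or overwrites its own entry, whereas the old slice had to be scanned for a matching set/disk location and missed drives that were never registered at startup.

// Sketch only: simplified stand-ins for MinIO's StorageAPI and endpoint types.
package main

import (
	"fmt"
	"sync"
)

type drive struct{ endpoint string }

var (
	localDrivesMu  sync.RWMutex
	localDrivesMap = make(map[string]*drive) // keyed by endpoint, mirrors the idea of globalLocalDrivesMap
)

// registerDrive runs at startup and whenever a disk (re)connects. With a map,
// a replacement disk is inserted or overwritten unconditionally, even if its
// slot was in a failed state at startup and never made it into any slice.
func registerDrive(d *drive) {
	localDrivesMu.Lock()
	defer localDrivesMu.Unlock()
	localDrivesMap[d.endpoint] = d
}

// snapshotDrives copies the map values into a slice for iteration; the order
// of a node's local drives is not defined, so a map loses nothing.
func snapshotDrives() []*drive {
	localDrivesMu.RLock()
	defer localDrivesMu.RUnlock()
	out := make([]*drive, 0, len(localDrivesMap))
	for _, d := range localDrivesMap {
		out = append(out, d)
	}
	return out
}

func main() {
	registerDrive(&drive{endpoint: "http://node1/disk1"}) // initial connect
	registerDrive(&drive{endpoint: "http://node1/disk1"}) // reconnect of a fresh disk: overwrite, no slice scan
	fmt.Println(len(snapshotDrives()))                    // 1
}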
Anis Eleuch 2024-07-25 00:30:33 +01:00 committed by GitHub
parent 33c101544d
commit b7f319b62a
9 changed files with 23 additions and 34 deletions

View File

@@ -362,7 +362,7 @@ func initAutoHeal(ctx context.Context, objAPI ObjectLayer) {
 func getLocalDisksToHeal() (disksToHeal Endpoints) {
 	globalLocalDrivesMu.RLock()
-	localDrives := cloneDrives(globalLocalDrives)
+	localDrives := cloneDrives(globalLocalDrivesMap)
 	globalLocalDrivesMu.RUnlock()
 	for _, disk := range localDrives {
 		_, err := disk.DiskInfo(context.Background(), DiskInfoOptions{})

View File

@@ -3553,7 +3553,7 @@ func (p *ReplicationPool) persistToDrive(ctx context.Context, v MRFReplicateEntr
 	}
 	globalLocalDrivesMu.RLock()
-	localDrives := cloneDrives(globalLocalDrives)
+	localDrives := cloneDrives(globalLocalDrivesMap)
 	globalLocalDrivesMu.RUnlock()
 	for _, localDrive := range localDrives {
@@ -3620,7 +3620,7 @@ func (p *ReplicationPool) loadMRF() (mrfRec MRFReplicateEntries, err error) {
 	}
 	globalLocalDrivesMu.RLock()
-	localDrives := cloneDrives(globalLocalDrives)
+	localDrives := cloneDrives(globalLocalDrivesMap)
 	globalLocalDrivesMu.RUnlock()
 	for _, localDrive := range localDrives {

View File

@@ -168,7 +168,7 @@ func newErasureServerPools(ctx context.Context, endpointServerPools EndpointServ
 	if !globalIsDistErasure {
 		globalLocalDrivesMu.Lock()
-		globalLocalDrives = localDrives
+		globalLocalDrivesMap = make(map[string]StorageAPI, len(localDrives))
 		for _, drive := range localDrives {
 			globalLocalDrivesMap[drive.Endpoint().String()] = drive
 		}

View File

@@ -262,13 +262,7 @@ func (s *erasureSets) connectDisks(log bool) {
 			if globalIsDistErasure {
 				globalLocalSetDrives[s.poolIndex][setIndex][diskIndex] = disk
 			}
-			for i, ldisk := range globalLocalDrives {
-				_, k, l := ldisk.GetDiskLoc()
-				if k == setIndex && l == diskIndex {
-					globalLocalDrives[i] = disk
-					break
-				}
-			}
+			globalLocalDrivesMap[disk.Endpoint().String()] = disk
 			globalLocalDrivesMu.Unlock()
 		}
 		s.erasureDisksMu.Unlock()
@@ -1135,13 +1129,7 @@ func (s *erasureSets) HealFormat(ctx context.Context, dryRun bool) (res madmin.H
 			if globalIsDistErasure {
 				globalLocalSetDrives[s.poolIndex][m][n] = disk
 			}
-			for i, ldisk := range globalLocalDrives {
-				_, k, l := ldisk.GetDiskLoc()
-				if k == m && l == n {
-					globalLocalDrives[i] = disk
-					break
-				}
-			}
+			globalLocalDrivesMap[disk.Endpoint().String()] = disk
 			globalLocalDrivesMu.Unlock()
 		}
 	}

View File

@@ -414,10 +414,9 @@ var (
 	globalServiceFreezeCnt int32
 	globalServiceFreezeMu  sync.Mutex // Updates.

-	// List of local drives to this node, this is only set during server startup,
-	// and is only mutated by HealFormat. Hold globalLocalDrivesMu to access.
-	globalLocalDrives    []StorageAPI
-	globalLocalDrivesMap = make(map[string]StorageAPI)
+	// Map of local drives to this node, this is set during server startup,
+	// disk reconnect and mutated by HealFormat. Hold globalLocalDrivesMu to access.
+	globalLocalDrivesMap map[string]StorageAPI
 	globalLocalDrivesMu  sync.RWMutex

 	globalDriveMonitoring = env.Get("_MINIO_DRIVE_ACTIVE_MONITORING", config.EnableOn) == config.EnableOn

View File

@@ -262,7 +262,7 @@ func collectDriveMetrics(m madmin.RealtimeMetrics) {
 	latestDriveStatsMu.Unlock()

 	globalLocalDrivesMu.RLock()
-	localDrives := cloneDrives(globalLocalDrives)
+	localDrives := cloneDrives(globalLocalDrivesMap)
 	globalLocalDrivesMu.RUnlock()
 	for _, d := range localDrives {

View File

@@ -664,7 +664,7 @@ var errUnsupportedSignal = fmt.Errorf("unsupported signal")
 func waitingDrivesNode() map[string]madmin.DiskMetrics {
 	globalLocalDrivesMu.RLock()
-	localDrives := cloneDrives(globalLocalDrives)
+	localDrives := cloneDrives(globalLocalDrivesMap)
 	globalLocalDrivesMu.RUnlock()

 	errs := make([]error, len(localDrives))

View File

@@ -34,7 +34,7 @@ const (
 func healBucketLocal(ctx context.Context, bucket string, opts madmin.HealOpts) (res madmin.HealResultItem, err error) {
 	globalLocalDrivesMu.RLock()
-	localDrives := cloneDrives(globalLocalDrives)
+	localDrives := cloneDrives(globalLocalDrivesMap)
 	globalLocalDrivesMu.RUnlock()

 	// Initialize sync waitgroup.
@@ -158,7 +158,7 @@ func healBucketLocal(ctx context.Context, bucket string, opts madmin.HealOpts) (
 func listBucketsLocal(ctx context.Context, opts BucketOptions) (buckets []BucketInfo, err error) {
 	globalLocalDrivesMu.RLock()
-	localDrives := cloneDrives(globalLocalDrives)
+	localDrives := cloneDrives(globalLocalDrivesMap)
 	globalLocalDrivesMu.RUnlock()

 	quorum := (len(localDrives) / 2)
@@ -204,15 +204,17 @@ func listBucketsLocal(ctx context.Context, opts BucketOptions) (buckets []Bucket
 	return buckets, nil
 }

-func cloneDrives(drives []StorageAPI) []StorageAPI {
-	newDrives := make([]StorageAPI, len(drives))
-	copy(newDrives, drives)
-	return newDrives
+func cloneDrives(drives map[string]StorageAPI) []StorageAPI {
+	copyDrives := make([]StorageAPI, 0, len(drives))
+	for _, drive := range drives {
+		copyDrives = append(copyDrives, drive)
+	}
+	return copyDrives
 }

 func getBucketInfoLocal(ctx context.Context, bucket string, opts BucketOptions) (BucketInfo, error) {
 	globalLocalDrivesMu.RLock()
-	localDrives := cloneDrives(globalLocalDrives)
+	localDrives := cloneDrives(globalLocalDrivesMap)
 	globalLocalDrivesMu.RUnlock()

 	g := errgroup.WithNErrs(len(localDrives)).WithConcurrency(32)
@@ -261,7 +263,7 @@ func getBucketInfoLocal(ctx context.Context, bucket string, opts BucketOptions)
 func deleteBucketLocal(ctx context.Context, bucket string, opts DeleteBucketOptions) error {
 	globalLocalDrivesMu.RLock()
-	localDrives := cloneDrives(globalLocalDrives)
+	localDrives := cloneDrives(globalLocalDrivesMap)
 	globalLocalDrivesMu.RUnlock()

 	g := errgroup.WithNErrs(len(localDrives)).WithConcurrency(32)
@@ -299,7 +301,7 @@ func deleteBucketLocal(ctx context.Context, bucket string, opts DeleteBucketOpti
 func makeBucketLocal(ctx context.Context, bucket string, opts MakeBucketOptions) error {
 	globalLocalDrivesMu.RLock()
-	localDrives := cloneDrives(globalLocalDrives)
+	localDrives := cloneDrives(globalLocalDrivesMap)
 	globalLocalDrivesMu.RUnlock()

 	g := errgroup.WithNErrs(len(localDrives)).WithConcurrency(32)

View File

@@ -1340,6 +1340,7 @@ func registerStorageRESTHandlers(router *mux.Router, endpointServerPools Endpoin
 		return collectInternodeStats(httpTraceHdrs(f))
 	}

+	globalLocalDrivesMap = make(map[string]StorageAPI)
 	globalLocalSetDrives = make([][][]StorageAPI, len(endpointServerPools))
 	for pool := range globalLocalSetDrives {
 		globalLocalSetDrives[pool] = make([][]StorageAPI, endpointServerPools[pool].SetCount)
@@ -1413,7 +1414,6 @@ func registerStorageRESTHandlers(router *mux.Router, endpointServerPools Endpoin
 			globalLocalDrivesMu.Lock()
 			defer globalLocalDrivesMu.Unlock()

-			globalLocalDrives = append(globalLocalDrives, storage)
 			globalLocalDrivesMap[endpoint.String()] = storage
 			globalLocalSetDrives[endpoint.PoolIdx][endpoint.SetIdx][endpoint.DiskIdx] = storage
 			return true