avoid caching metrics for timeout errors per drive (#18584)

Bonus: combine the loop for drive/REST registration.
Harshavardhana 2023-12-04 11:54:13 -08:00 committed by GitHub
parent 8fdfcfb562
commit 05bb655efc
2 changed files with 24 additions and 22 deletions
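In short: the two per-drive error counters become atomic.Uint64 values and are read on every getMetrics() call instead of being captured inside the TTL-cached update function, so the timeout/availability totals are never served stale. A minimal self-contained sketch of that pattern, under simplified assumptions (driveChecker, diskMetrics and the mutex-based TTL cache below are illustrative stand-ins, not MinIO's metricsCache):

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"time"
)

// diskMetrics holds an expensive-to-compute part (cached with a TTL) and
// live error counters that should never be served stale.
type diskMetrics struct {
	APICalls                map[string]uint64
	TotalErrorsTimeout      uint64
	TotalErrorsAvailability uint64
}

type driveChecker struct {
	totalErrsTimeout      atomic.Uint64 // timeout-only errors
	totalErrsAvailability atomic.Uint64 // all availability errors (superset of timeouts)

	mu     sync.Mutex
	cached diskMetrics
	expiry time.Time
}

// getMetrics returns the cached expensive part, then merges in the live
// counters after the cache lookup so they are always current.
func (d *driveChecker) getMetrics() diskMetrics {
	d.mu.Lock()
	if time.Now().After(d.expiry) {
		// Recompute only the expensive part; counters are added below.
		d.cached = diskMetrics{APICalls: map[string]uint64{"ReadXL": 1}}
		d.expiry = time.Now().Add(5 * time.Second)
	}
	m := d.cached
	d.mu.Unlock()

	// Do not cache these values; read the live atomic counters every call.
	m.TotalErrorsTimeout = d.totalErrsTimeout.Load()
	m.TotalErrorsAvailability = d.totalErrsAvailability.Load()
	return m
}

func main() {
	var d driveChecker
	d.totalErrsAvailability.Add(1)
	d.totalErrsTimeout.Add(1)
	fmt.Printf("%+v\n", d.getMetrics())
}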

View File

@@ -1348,6 +1348,7 @@ func registerStorageRESTHandlers(router *mux.Router, endpointServerPools Endpoin
             if !endpoint.IsLocal {
                 continue
             }
             driveHandlers[pool][set] = &storageRESTServer{}
             server := driveHandlers[pool][set]
@@ -1392,15 +1393,8 @@ func registerStorageRESTHandlers(router *mux.Router, endpointServerPools Endpoin
                 Handle:      server.WalkDirHandler,
                 OutCapacity: 1,
             }), "unable to register handler")
-        }
-    }
-    for pool, serverPool := range endpointServerPools {
-        for set, endpoint := range serverPool.Endpoints {
-            if !endpoint.IsLocal {
-                continue
-            }
-            createStorage := func(pool, set int, endpoint Endpoint) bool {
+            createStorage := func(server *storageRESTServer) bool {
                 xl, err := newXLStorage(endpoint, false)
                 if err != nil {
                     // if supported errors don't fail, we proceed to
@@ -1410,21 +1404,22 @@ func registerStorageRESTHandlers(router *mux.Router, endpointServerPools Endpoin
                 }
                 storage := newXLStorageDiskIDCheck(xl, true)
                 storage.SetDiskID(xl.diskID)
-                driveHandlers[pool][set].setStorage(storage)
+                server.setStorage(storage)
                 return true
             }
-            if createStorage(pool, set, endpoint) {
+            if createStorage(server) {
                 continue
             }
             // Start async goroutine to create storage.
-            go func(pool, set int, endpoint Endpoint) {
+            go func(server *storageRESTServer) {
                 for {
-                    time.Sleep(time.Minute)
+                    time.Sleep(5 * time.Second)
-                    if createStorage(pool, set, endpoint) {
+                    if createStorage(server) {
                         return
                     }
                 }
-            }(pool, set, endpoint)
+            }(server)
         }
     }
 }
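The registration change above merges what used to be two passes over the endpoints into one: each local endpoint gets its storageRESTServer, and if the drive cannot be opened right away, a background goroutine retries every 5 seconds (previously every minute) and attaches the storage once it succeeds. A rough stand-alone sketch of that retry pattern, with hypothetical names (driveServer, initDrive and openDrive are illustrative, not MinIO's API):

package main

import (
	"errors"
	"fmt"
	"time"
)

// driveServer stands in for the per-drive REST handler; setStorage attaches
// the storage backend once the drive is usable.
type driveServer struct{ storage any }

func (s *driveServer) setStorage(v any) { s.storage = v }

// openDrive is a placeholder that fails while the drive is not mounted yet.
func openDrive(path string) (any, error) {
	return nil, errors.New("drive not mounted")
}

// initDrive tries to open the drive once and wires it into the server,
// reporting whether it succeeded (a stand-in for createStorage in the diff).
func initDrive(s *driveServer, path string) bool {
	st, err := openDrive(path)
	if err != nil {
		return false // not ready yet; caller retries
	}
	s.setStorage(st)
	return true
}

func main() {
	srv := &driveServer{}
	if !initDrive(srv, "/mnt/drive1") {
		// Retry in the background until the drive comes up, mirroring the
		// 5-second retry loop the commit uses after registering handlers.
		go func(s *driveServer) {
			for {
				time.Sleep(5 * time.Second)
				if initDrive(s, "/mnt/drive1") {
					return
				}
			}
		}(srv)
	}
	fmt.Println("handlers stay registered; storage attaches once the drive is ready")
}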

View File

@@ -78,8 +78,9 @@ const (
 // Detects change in underlying disk.
 type xlStorageDiskIDCheck struct {
-    totalErrsAvailability uint64 // Captures all data availability errors such as permission denied, faulty disk and timeout errors.
-    totalErrsTimeout      uint64 // Captures all timeout only errors
+    totalErrsTimeout      atomic.Uint64 // Captures all timeout only errors
+    totalErrsAvailability atomic.Uint64 // Captures all data availability errors such as permission denied, faulty disk and timeout errors.
     // apiCalls should be placed first so alignment is guaranteed for atomic operations.
     apiCalls     [storageMetricLast]uint64
     apiLatencies [storageMetricLast]*lockedLastMinuteLatency
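Switching the counters from plain uint64 fields updated with atomic.AddUint64 to atomic.Uint64 also sidesteps the manual alignment concern called out in the apiCalls comment: since Go 1.19, atomic.Uint64 is guaranteed 64-bit alignment wherever it sits in a struct. A tiny illustration (the counters struct below is hypothetical, not MinIO's):

package main

import (
	"fmt"
	"sync/atomic"
)

// counters uses atomic.Uint64, which the compiler always keeps 64-bit
// aligned, so the fields can sit anywhere in the struct. A plain uint64
// updated with atomic.AddUint64 would have to be placed first to guarantee
// alignment on 32-bit platforms.
type counters struct {
	name       string
	errTimeout atomic.Uint64
	errAvail   atomic.Uint64
}

func main() {
	var c counters
	c.errAvail.Add(1)
	c.errTimeout.Add(1)
	fmt.Println(c.name, c.errTimeout.Load(), c.errAvail.Load())
}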
@@ -102,7 +103,7 @@ type xlStorageDiskIDCheck struct {
 func (p *xlStorageDiskIDCheck) getMetrics() DiskMetrics {
     p.metricsCache.Once.Do(func() {
-        p.metricsCache.TTL = 1 * time.Second
+        p.metricsCache.TTL = 5 * time.Second
         p.metricsCache.Update = func() (interface{}, error) {
             diskMetric := DiskMetrics{
                 LastMinute: make(map[string]AccElem, len(p.apiLatencies)),
@@ -114,13 +115,19 @@ func (p *xlStorageDiskIDCheck) getMetrics() DiskMetrics {
             for i := range p.apiCalls {
                 diskMetric.APICalls[storageMetric(i).String()] = atomic.LoadUint64(&p.apiCalls[i])
             }
-            diskMetric.TotalErrorsAvailability = atomic.LoadUint64(&p.totalErrsAvailability)
-            diskMetric.TotalErrorsTimeout = atomic.LoadUint64(&p.totalErrsTimeout)
             return diskMetric, nil
         }
     })
     m, _ := p.metricsCache.Get()
-    return m.(DiskMetrics)
+    diskMetric := DiskMetrics{}
+    if m != nil {
+        diskMetric = m.(DiskMetrics)
+    }
+    // Do not need this value to be cached.
+    diskMetric.TotalErrorsTimeout = p.totalErrsTimeout.Load()
+    diskMetric.TotalErrorsAvailability = p.totalErrsAvailability.Load()
+    return diskMetric
 }

 // lockedLastMinuteLatency accumulates totals lockless for each second.
@@ -746,9 +753,9 @@ func (p *xlStorageDiskIDCheck) updateStorageMetrics(s storageMetric, paths ...st
         context.DeadlineExceeded,
         context.Canceled,
     }...) {
-        atomic.AddUint64(&p.totalErrsAvailability, 1)
+        p.totalErrsAvailability.Add(1)
         if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
-            atomic.AddUint64(&p.totalErrsTimeout, 1)
+            p.totalErrsTimeout.Add(1)
         }
     }
     p.apiLatencies[s].add(duration)
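For context, the counting logic touched above bumps the availability counter for every error already classified as an availability error, and additionally bumps the timeout counter when the error is a context deadline or cancellation. A small stand-alone illustration of that classification (countAvailabilityError is a hypothetical helper, not part of MinIO):

package main

import (
	"context"
	"errors"
	"fmt"
	"sync/atomic"
)

var (
	totalErrsAvailability atomic.Uint64 // all availability-class errors
	totalErrsTimeout      atomic.Uint64 // deadline/cancellation errors only
)

// countAvailabilityError assumes err has already been classified as an
// availability error (permission denied, faulty disk, timeout, ...); it
// always bumps the availability counter and additionally bumps the timeout
// counter for deadline or cancellation errors, like the branch in the diff.
func countAvailabilityError(err error) {
	totalErrsAvailability.Add(1)
	if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
		totalErrsTimeout.Add(1)
	}
}

func main() {
	countAvailabilityError(errors.New("faulty disk"))
	countAvailabilityError(context.DeadlineExceeded)
	fmt.Println(totalErrsAvailability.Load(), totalErrsTimeout.Load()) // 2 1
}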