avoid caching metrics for timeout errors per drive (#18584)

Bonus: combine the loop for drive/REST registration.
Harshavardhana 2023-12-04 11:54:13 -08:00 committed by GitHub
parent 8fdfcfb562
commit 05bb655efc
2 changed files with 24 additions and 22 deletions

cmd/storage-rest-server.go

@@ -1348,6 +1348,7 @@ func registerStorageRESTHandlers(router *mux.Router, endpointServerPools EndpointServerPools) {
 			if !endpoint.IsLocal {
 				continue
 			}
+			driveHandlers[pool][set] = &storageRESTServer{}
 			server := driveHandlers[pool][set]
@@ -1392,15 +1393,8 @@ func registerStorageRESTHandlers(router *mux.Router, endpointServerPools EndpointServerPools) {
 				Handle:      server.WalkDirHandler,
 				OutCapacity: 1,
 			}), "unable to register handler")
-		}
-	}
-	for pool, serverPool := range endpointServerPools {
-		for set, endpoint := range serverPool.Endpoints {
-			if !endpoint.IsLocal {
-				continue
-			}
-			createStorage := func(pool, set int, endpoint Endpoint) bool {
+			createStorage := func(server *storageRESTServer) bool {
 				xl, err := newXLStorage(endpoint, false)
 				if err != nil {
 					// if supported errors don't fail, we proceed to
@@ -1410,21 +1404,22 @@ func registerStorageRESTHandlers(router *mux.Router, endpointServerPools EndpointServerPools) {
 				}
 				storage := newXLStorageDiskIDCheck(xl, true)
 				storage.SetDiskID(xl.diskID)
-				driveHandlers[pool][set].setStorage(storage)
+				server.setStorage(storage)
 				return true
 			}
-			if createStorage(pool, set, endpoint) {
+			if createStorage(server) {
 				continue
 			}
 			// Start async goroutine to create storage.
-			go func(pool, set int, endpoint Endpoint) {
+			go func(server *storageRESTServer) {
 				for {
-					time.Sleep(time.Minute)
-					if createStorage(pool, set, endpoint) {
+					time.Sleep(5 * time.Second)
+					if createStorage(server) {
 						return
 					}
 				}
-			}(pool, set, endpoint)
+			}(server)
 		}
 	}
 }
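
The combined loop now attempts storage creation once per local drive and, on failure, retries in a background goroutine every 5 seconds (previously every minute, in a separate loop). Below is a minimal, self-contained sketch of that retry pattern; driveServer and initDrive are hypothetical stand-ins for *storageRESTServer and createStorage, not MinIO code:

```go
package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// driveServer is a hypothetical stand-in for *storageRESTServer.
type driveServer struct {
	ready atomic.Bool
}

var attempts atomic.Int32

// initDrive stands in for createStorage: it simulates storage
// initialization that fails on the first two attempts (e.g. the
// drive is not mounted yet) and succeeds on the third.
func initDrive(s *driveServer) bool {
	if attempts.Add(1) < 3 {
		return false
	}
	s.ready.Store(true)
	return true
}

func main() {
	s := &driveServer{}
	if !initDrive(s) {
		// Start async goroutine to create storage, mirroring the
		// diff: retry every 5 seconds until the drive comes up.
		go func(s *driveServer) {
			for {
				time.Sleep(5 * time.Second)
				if initDrive(s) {
					return
				}
			}
		}(s)
	}
	time.Sleep(11 * time.Second) // long enough for two retries
	fmt.Println("ready:", s.ready.Load(), "attempts:", attempts.Load())
}
```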

cmd/xl-storage-disk-id-check.go

@@ -78,8 +78,9 @@ const (
 
 // Detects change in underlying disk.
 type xlStorageDiskIDCheck struct {
-	totalErrsAvailability uint64 // Captures all data availability errors such as permission denied, faulty disk and timeout errors.
-	totalErrsTimeout      uint64 // Captures all timeout only errors
+	totalErrsTimeout      atomic.Uint64 // Captures all timeout only errors
+	totalErrsAvailability atomic.Uint64 // Captures all data availability errors such as permission denied, faulty disk and timeout errors.
+
 	// apiCalls should be placed first so alignment is guaranteed for atomic operations.
 	apiCalls     [storageMetricLast]uint64
 	apiLatencies [storageMetricLast]*lockedLastMinuteLatency
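
The counters switch from plain uint64 fields updated via atomic.AddUint64 to the atomic.Uint64 type (Go 1.19+). Besides cleaner call sites, atomic.Uint64 is guaranteed 64-bit aligned even inside structs on 32-bit platforms, so correctness no longer depends on field ordering the way the apiCalls comment describes. A small sketch with a hypothetical counters struct (not MinIO code):

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// counters uses the field style the diff moves to: atomic.Uint64
// is always safely usable for atomic access, even on 32-bit
// platforms, regardless of where it sits in the struct.
type counters struct {
	totalErrsTimeout      atomic.Uint64
	totalErrsAvailability atomic.Uint64
}

func main() {
	var c counters
	var wg sync.WaitGroup
	for i := 0; i < 100; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			c.totalErrsAvailability.Add(1)
			if i%2 == 0 { // pretend every other error is a timeout
				c.totalErrsTimeout.Add(1)
			}
		}(i)
	}
	wg.Wait()
	fmt.Println("availability:", c.totalErrsAvailability.Load(),
		"timeout:", c.totalErrsTimeout.Load()) // 100, 50
}
```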
@@ -102,7 +103,7 @@ type xlStorageDiskIDCheck struct {
 
 func (p *xlStorageDiskIDCheck) getMetrics() DiskMetrics {
 	p.metricsCache.Once.Do(func() {
-		p.metricsCache.TTL = 1 * time.Second
+		p.metricsCache.TTL = 5 * time.Second
 		p.metricsCache.Update = func() (interface{}, error) {
 			diskMetric := DiskMetrics{
 				LastMinute: make(map[string]AccElem, len(p.apiLatencies)),
@@ -114,13 +115,19 @@ func (p *xlStorageDiskIDCheck) getMetrics() DiskMetrics {
 			for i := range p.apiCalls {
 				diskMetric.APICalls[storageMetric(i).String()] = atomic.LoadUint64(&p.apiCalls[i])
 			}
-			diskMetric.TotalErrorsAvailability = atomic.LoadUint64(&p.totalErrsAvailability)
-			diskMetric.TotalErrorsTimeout = atomic.LoadUint64(&p.totalErrsTimeout)
 			return diskMetric, nil
 		}
 	})
 	m, _ := p.metricsCache.Get()
-	return m.(DiskMetrics)
+	diskMetric := DiskMetrics{}
+	if m != nil {
+		diskMetric = m.(DiskMetrics)
+	}
+	// Do not need this value to be cached.
+	diskMetric.TotalErrorsTimeout = p.totalErrsTimeout.Load()
+	diskMetric.TotalErrorsAvailability = p.totalErrsAvailability.Load()
+	return diskMetric
 }
 
 // lockedLastMinuteLatency accumulates totals lockless for each second.
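
getMetrics now serves the bulk of the metrics from a cache with a 5-second TTL, but reads the two error counters directly on every call, so they are never reported up to a TTL stale. A sketch of the same overlay pattern, with a hypothetical ttlCache standing in for p.metricsCache (MinIO's actual cache has a different API):

```go
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
	"time"
)

// ttlCache is a hypothetical stand-in for p.metricsCache: it
// recomputes its value at most once per TTL.
type ttlCache struct {
	mu     sync.Mutex
	ttl    time.Duration
	last   time.Time
	value  any
	update func() any
}

func (c *ttlCache) get() any {
	c.mu.Lock()
	defer c.mu.Unlock()
	if c.value == nil || time.Since(c.last) > c.ttl {
		c.value = c.update()
		c.last = time.Now()
	}
	return c.value
}

// diskMetrics loosely mirrors the shape of DiskMetrics.
type diskMetrics struct {
	APICalls           uint64
	TotalErrorsTimeout uint64
}

func main() {
	var apiCalls, errsTimeout atomic.Uint64
	cache := &ttlCache{
		ttl: 5 * time.Second,
		update: func() any {
			// The expensive snapshot is cached for up to 5 seconds.
			return diskMetrics{APICalls: apiCalls.Load()}
		},
	}

	apiCalls.Store(10)
	errsTimeout.Store(1)

	m := cache.get().(diskMetrics) // m is a copy of the cached value
	// Overlay the error counter on every call so it is never served
	// stale, as the rewritten getMetrics does.
	m.TotalErrorsTimeout = errsTimeout.Load()
	fmt.Printf("%+v\n", m)
}
```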
@@ -746,9 +753,9 @@ func (p *xlStorageDiskIDCheck) updateStorageMetrics(s storageMetric, paths ...string) func(err *error) {
 		context.DeadlineExceeded,
 		context.Canceled,
 	}...) {
-		atomic.AddUint64(&p.totalErrsAvailability, 1)
+		p.totalErrsAvailability.Add(1)
 		if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
-			atomic.AddUint64(&p.totalErrsTimeout, 1)
+			p.totalErrsTimeout.Add(1)
 		}
 	}
 	p.apiLatencies[s].add(duration)
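
updateStorageMetrics counts every matching availability error and additionally counts context deadline/cancel errors as timeouts, using errors.Is so wrapped errors match too. A simplified sketch; record is hypothetical and assumes the error already passed the availability filter shown above:

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"sync/atomic"
)

var (
	totalErrsAvailability atomic.Uint64
	totalErrsTimeout      atomic.Uint64
)

// record is a simplified version of the accounting above: every
// availability error bumps the general counter, and deadline/cancel
// errors also bump the timeout counter.
func record(err error) {
	totalErrsAvailability.Add(1)
	// errors.Is also matches errors wrapped via fmt.Errorf("%w", ...).
	if errors.Is(err, context.DeadlineExceeded) || errors.Is(err, context.Canceled) {
		totalErrsTimeout.Add(1)
	}
}

func main() {
	record(fmt.Errorf("ReadXL: %w", context.DeadlineExceeded))
	record(errors.New("faulty disk"))
	fmt.Println("availability:", totalErrsAvailability.Load(),
		"timeout:", totalErrsTimeout.Load()) // availability: 2 timeout: 1
}
```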