fix: batch status reporting after complete (#17852)

Batch status can wait perpetually after completion
due to a race between MetricsHandler(), which returns
the active metrics at 1-second intervals, and the
deletion of the metrics after job completion.

This PR ensures that we keep the 'status' around
for a while, i.e. for at least 24 hours after the last
update (purged by a sweep that runs every 6 hours),
for all batch jobs.
This commit is contained in:
Harshavardhana 2023-08-15 12:22:30 -07:00 committed by GitHub
parent c4ca0a5a57
commit 3ba927edae
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 53 additions and 14 deletions

View File

@ -1413,7 +1413,6 @@ func (j BatchJobRequest) delete(ctx context.Context, api ObjectLayer) {
case j.KeyRotate != nil:
deleteConfig(ctx, api, pathJoin(j.Location, batchKeyRotationName))
}
globalBatchJobsMetrics.delete(j.ID)
deleteConfig(ctx, api, j.Location)
}
@ -1815,10 +1814,6 @@ type batchJobMetrics struct {
metrics map[string]*batchJobInfo
}
var globalBatchJobsMetrics = batchJobMetrics{
metrics: make(map[string]*batchJobInfo),
}
//msgp:ignore batchJobMetric
//go:generate stringer -type=batchJobMetric -trimprefix=batchJobMetric $GOFILE
type batchJobMetric uint8
@ -1858,9 +1853,17 @@ func (m *batchJobMetrics) report(jobID string) (metrics *madmin.BatchJobMetrics)
metrics = &madmin.BatchJobMetrics{CollectedAt: time.Now(), Jobs: make(map[string]madmin.JobMetric)}
m.RLock()
defer m.RUnlock()
match := true
for id, job := range m.metrics {
match := jobID != "" && id == jobID
metrics.Jobs[id] = madmin.JobMetric{
if jobID != "" {
match = id == jobID
}
if !match {
continue
}
m := madmin.JobMetric{
JobID: job.JobID,
JobType: job.JobType,
StartTime: job.StartTime,
@ -1868,28 +1871,58 @@ func (m *batchJobMetrics) report(jobID string) (metrics *madmin.BatchJobMetrics)
RetryAttempts: job.RetryAttempts,
Complete: job.Complete,
Failed: job.Failed,
Replicate: &madmin.ReplicateInfo{
}
switch job.JobType {
case string(madmin.BatchJobReplicate):
m.Replicate = &madmin.ReplicateInfo{
Bucket: job.Bucket,
Object: job.Object,
Objects: job.Objects,
ObjectsFailed: job.ObjectsFailed,
BytesTransferred: job.BytesTransferred,
BytesFailed: job.BytesFailed,
},
KeyRotate: &madmin.KeyRotationInfo{
}
case string(madmin.BatchJobKeyRotate):
m.KeyRotate = &madmin.KeyRotationInfo{
Bucket: job.Bucket,
Object: job.Object,
Objects: job.Objects,
ObjectsFailed: job.ObjectsFailed,
},
}
if match {
break
}
}
metrics.Jobs[id] = m
}
return metrics
}
// purgeJobMetrics periodically drops the metrics of finished batch jobs.
//
// Metrics of jobs marked Complete or Failed are retained until more than
// 24 hours have passed since their LastUpdate, so that the results of older
// jobs can still be looked at for a while. The sweep runs every 6 hours and
// the loop exits once GlobalContext is done.
func (m *batchJobMetrics) purgeJobMetrics() {
	ticker := time.NewTicker(6 * time.Hour)
	defer ticker.Stop()

	for {
		select {
		case <-GlobalContext.Done():
			return
		case <-ticker.C:
		}

		// Gather the stale job IDs under the read lock only; m.delete
		// takes the write lock itself, so it has to run after RUnlock.
		var stale []string
		m.RLock()
		for jobID, info := range m.metrics {
			finished := info.Complete || info.Failed
			if finished && time.Since(info.LastUpdate) > 24*time.Hour {
				stale = append(stale, jobID)
			}
		}
		m.RUnlock()

		for i := range stale {
			m.delete(stale[i])
		}
	}
}
func (m *batchJobMetrics) delete(jobID string) {
m.Lock()
defer m.Unlock()

View File

@ -95,6 +95,9 @@ func init() {
initGlobalContext()
globalBatchJobsMetrics = batchJobMetrics{metrics: make(map[string]*batchJobInfo)}
go globalBatchJobsMetrics.purgeJobMetrics()
t, _ := minioVersionToReleaseTime(Version)
if !t.IsZero() {
globalVersionUnix = uint64(t.Unix())

View File

@ -399,6 +399,9 @@ var (
// Set last client perf extra time (get lock, and validate)
globalLastClientPerfExtraTime int64
// Captures all batch jobs metrics globally
globalBatchJobsMetrics batchJobMetrics
// Add new variable global values here.
)