mirror of
https://github.com/minio/minio.git
synced 2025-04-08 21:55:44 -04:00
Add batch status API (#19679)
Currently, the status of a completed or failed batch job is held only in memory, so a simple restart loses it and the user no longer has any visibility into a job that may have been running for a long time. In addition to the metrics, add a new API that reads the batch status from the drives. A batch job will be cleaned up three days after completion. Also add the batch type to the batch ID: the batch job request is removed as soon as the job finishes, after which the type of the batch job is no longer known, which makes the job report difficult to locate.
This commit is contained in:
parent
b35acb3dbc
commit
757cf413cb
@ -341,6 +341,9 @@ func registerAdminRouter(router *mux.Router, enableConfigOps bool) {
|
|||||||
adminRouter.Methods(http.MethodGet).Path(adminVersion + "/list-jobs").HandlerFunc(
|
adminRouter.Methods(http.MethodGet).Path(adminVersion + "/list-jobs").HandlerFunc(
|
||||||
adminMiddleware(adminAPI.ListBatchJobs))
|
adminMiddleware(adminAPI.ListBatchJobs))
|
||||||
|
|
||||||
|
adminRouter.Methods(http.MethodGet).Path(adminVersion + "/status-job").HandlerFunc(
|
||||||
|
adminMiddleware(adminAPI.BatchJobStatus))
|
||||||
|
|
||||||
adminRouter.Methods(http.MethodGet).Path(adminVersion + "/describe-job").HandlerFunc(
|
adminRouter.Methods(http.MethodGet).Path(adminVersion + "/describe-job").HandlerFunc(
|
||||||
adminMiddleware(adminAPI.DescribeBatchJob))
|
adminMiddleware(adminAPI.DescribeBatchJob))
|
||||||
adminRouter.Methods(http.MethodDelete).Path(adminVersion + "/cancel-job").HandlerFunc(
|
adminRouter.Methods(http.MethodDelete).Path(adminVersion + "/cancel-job").HandlerFunc(
|
||||||
|
@ -514,7 +514,7 @@ func (r *BatchJobExpire) Start(ctx context.Context, api ObjectLayer, job BatchJo
|
|||||||
JobType: string(job.Type()),
|
JobType: string(job.Type()),
|
||||||
StartTime: job.Started,
|
StartTime: job.Started,
|
||||||
}
|
}
|
||||||
if err := ri.load(ctx, api, job); err != nil {
|
if err := ri.loadOrInit(ctx, api, job); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -28,6 +28,7 @@ import (
|
|||||||
"math/rand"
|
"math/rand"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/url"
|
"net/url"
|
||||||
|
"path/filepath"
|
||||||
"runtime"
|
"runtime"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
@ -57,6 +58,11 @@ import (
|
|||||||
|
|
||||||
var globalBatchConfig batch.Config
|
var globalBatchConfig batch.Config
|
||||||
|
|
||||||
|
const (
|
||||||
|
// Keep the completed/failed job stats 3 days before removing it
|
||||||
|
oldJobsExpiration = 3 * 24 * time.Hour
|
||||||
|
)
|
||||||
|
|
||||||
// BatchJobRequest this is an internal data structure not for external consumption.
|
// BatchJobRequest this is an internal data structure not for external consumption.
|
||||||
type BatchJobRequest struct {
|
type BatchJobRequest struct {
|
||||||
ID string `yaml:"-" json:"name"`
|
ID string `yaml:"-" json:"name"`
|
||||||
@ -262,7 +268,7 @@ func (r *BatchJobReplicateV1) StartFromSource(ctx context.Context, api ObjectLay
|
|||||||
JobType: string(job.Type()),
|
JobType: string(job.Type()),
|
||||||
StartTime: job.Started,
|
StartTime: job.Started,
|
||||||
}
|
}
|
||||||
if err := ri.load(ctx, api, job); err != nil {
|
if err := ri.loadOrInit(ctx, api, job); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if ri.Complete {
|
if ri.Complete {
|
||||||
@ -722,54 +728,43 @@ const (
|
|||||||
batchReplJobDefaultRetryDelay = 250 * time.Millisecond
|
batchReplJobDefaultRetryDelay = 250 * time.Millisecond
|
||||||
)
|
)
|
||||||
|
|
||||||
func getJobReportPath(job BatchJobRequest) string {
|
|
||||||
var fileName string
|
|
||||||
switch {
|
|
||||||
case job.Replicate != nil:
|
|
||||||
fileName = batchReplName
|
|
||||||
case job.KeyRotate != nil:
|
|
||||||
fileName = batchKeyRotationName
|
|
||||||
case job.Expire != nil:
|
|
||||||
fileName = batchExpireName
|
|
||||||
}
|
|
||||||
return pathJoin(batchJobReportsPrefix, job.ID, fileName)
|
|
||||||
}
|
|
||||||
|
|
||||||
func getJobPath(job BatchJobRequest) string {
|
func getJobPath(job BatchJobRequest) string {
|
||||||
return pathJoin(batchJobPrefix, job.ID)
|
return pathJoin(batchJobPrefix, job.ID)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (ri *batchJobInfo) load(ctx context.Context, api ObjectLayer, job BatchJobRequest) error {
|
func (ri *batchJobInfo) getJobReportPath() (string, error) {
|
||||||
var format, version uint16
|
var fileName string
|
||||||
switch {
|
switch madmin.BatchJobType(ri.JobType) {
|
||||||
case job.Replicate != nil:
|
case madmin.BatchJobReplicate:
|
||||||
version = batchReplVersionV1
|
fileName = batchReplName
|
||||||
format = batchReplFormat
|
case madmin.BatchJobKeyRotate:
|
||||||
case job.KeyRotate != nil:
|
fileName = batchKeyRotationName
|
||||||
version = batchKeyRotateVersionV1
|
case madmin.BatchJobExpire:
|
||||||
format = batchKeyRotationFormat
|
fileName = batchExpireName
|
||||||
case job.Expire != nil:
|
|
||||||
version = batchExpireVersionV1
|
|
||||||
format = batchExpireFormat
|
|
||||||
default:
|
default:
|
||||||
return errors.New("no supported batch job request specified")
|
return "", fmt.Errorf("unknown job type: %v", ri.JobType)
|
||||||
}
|
}
|
||||||
data, err := readConfig(ctx, api, getJobReportPath(job))
|
return pathJoin(batchJobReportsPrefix, ri.JobID, fileName), nil
|
||||||
if err != nil {
|
}
|
||||||
if errors.Is(err, errConfigNotFound) || isErrObjectNotFound(err) {
|
|
||||||
ri.Version = int(version)
|
func (ri *batchJobInfo) loadOrInit(ctx context.Context, api ObjectLayer, job BatchJobRequest) error {
|
||||||
|
err := ri.load(ctx, api, job)
|
||||||
|
if errors.Is(err, errNoSuchJob) {
|
||||||
switch {
|
switch {
|
||||||
case job.Replicate != nil:
|
case job.Replicate != nil:
|
||||||
|
ri.Version = batchReplVersionV1
|
||||||
ri.RetryAttempts = batchReplJobDefaultRetries
|
ri.RetryAttempts = batchReplJobDefaultRetries
|
||||||
if job.Replicate.Flags.Retry.Attempts > 0 {
|
if job.Replicate.Flags.Retry.Attempts > 0 {
|
||||||
ri.RetryAttempts = job.Replicate.Flags.Retry.Attempts
|
ri.RetryAttempts = job.Replicate.Flags.Retry.Attempts
|
||||||
}
|
}
|
||||||
case job.KeyRotate != nil:
|
case job.KeyRotate != nil:
|
||||||
|
ri.Version = batchKeyRotateVersionV1
|
||||||
ri.RetryAttempts = batchKeyRotateJobDefaultRetries
|
ri.RetryAttempts = batchKeyRotateJobDefaultRetries
|
||||||
if job.KeyRotate.Flags.Retry.Attempts > 0 {
|
if job.KeyRotate.Flags.Retry.Attempts > 0 {
|
||||||
ri.RetryAttempts = job.KeyRotate.Flags.Retry.Attempts
|
ri.RetryAttempts = job.KeyRotate.Flags.Retry.Attempts
|
||||||
}
|
}
|
||||||
case job.Expire != nil:
|
case job.Expire != nil:
|
||||||
|
ri.Version = batchExpireVersionV1
|
||||||
ri.RetryAttempts = batchExpireJobDefaultRetries
|
ri.RetryAttempts = batchExpireJobDefaultRetries
|
||||||
if job.Expire.Retry.Attempts > 0 {
|
if job.Expire.Retry.Attempts > 0 {
|
||||||
ri.RetryAttempts = job.Expire.Retry.Attempts
|
ri.RetryAttempts = job.Expire.Retry.Attempts
|
||||||
@ -779,6 +774,39 @@ func (ri *batchJobInfo) load(ctx context.Context, api ObjectLayer, job BatchJobR
|
|||||||
}
|
}
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (ri *batchJobInfo) load(ctx context.Context, api ObjectLayer, job BatchJobRequest) error {
|
||||||
|
path, err := job.getJobReportPath()
|
||||||
|
if err != nil {
|
||||||
|
batchLogIf(ctx, err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return ri.loadByPath(ctx, api, path)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (ri *batchJobInfo) loadByPath(ctx context.Context, api ObjectLayer, path string) error {
|
||||||
|
var format, version uint16
|
||||||
|
switch filepath.Base(path) {
|
||||||
|
case batchReplName:
|
||||||
|
version = batchReplVersionV1
|
||||||
|
format = batchReplFormat
|
||||||
|
case batchKeyRotationName:
|
||||||
|
version = batchKeyRotateVersionV1
|
||||||
|
format = batchKeyRotationFormat
|
||||||
|
case batchExpireName:
|
||||||
|
version = batchExpireVersionV1
|
||||||
|
format = batchExpireFormat
|
||||||
|
default:
|
||||||
|
return errors.New("no supported batch job request specified")
|
||||||
|
}
|
||||||
|
|
||||||
|
data, err := readConfig(ctx, api, path)
|
||||||
|
if err != nil {
|
||||||
|
if errors.Is(err, errConfigNotFound) || isErrObjectNotFound(err) {
|
||||||
|
return errNoSuchJob
|
||||||
|
}
|
||||||
|
return err
|
||||||
|
}
|
||||||
if len(data) == 0 {
|
if len(data) == 0 {
|
||||||
// Seems to be empty create a new batchRepl object.
|
// Seems to be empty create a new batchRepl object.
|
||||||
return nil
|
return nil
|
||||||
@ -919,7 +947,12 @@ func (ri *batchJobInfo) updateAfter(ctx context.Context, api ObjectLayer, durati
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
return saveConfig(ctx, api, getJobReportPath(job), buf)
|
path, err := ri.getJobReportPath()
|
||||||
|
if err != nil {
|
||||||
|
batchLogIf(ctx, err)
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return saveConfig(ctx, api, path, buf)
|
||||||
}
|
}
|
||||||
ri.mu.Unlock()
|
ri.mu.Unlock()
|
||||||
return nil
|
return nil
|
||||||
@ -971,7 +1004,7 @@ func (r *BatchJobReplicateV1) Start(ctx context.Context, api ObjectLayer, job Ba
|
|||||||
JobType: string(job.Type()),
|
JobType: string(job.Type()),
|
||||||
StartTime: job.Started,
|
StartTime: job.Started,
|
||||||
}
|
}
|
||||||
if err := ri.load(ctx, api, job); err != nil {
|
if err := ri.loadOrInit(ctx, api, job); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if ri.Complete {
|
if ri.Complete {
|
||||||
@ -1434,10 +1467,24 @@ func (j BatchJobRequest) Validate(ctx context.Context, o ObjectLayer) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func (j BatchJobRequest) delete(ctx context.Context, api ObjectLayer) {
|
func (j BatchJobRequest) delete(ctx context.Context, api ObjectLayer) {
|
||||||
deleteConfig(ctx, api, getJobReportPath(j))
|
|
||||||
deleteConfig(ctx, api, getJobPath(j))
|
deleteConfig(ctx, api, getJobPath(j))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (j BatchJobRequest) getJobReportPath() (string, error) {
|
||||||
|
var fileName string
|
||||||
|
switch {
|
||||||
|
case j.Replicate != nil:
|
||||||
|
fileName = batchReplName
|
||||||
|
case j.KeyRotate != nil:
|
||||||
|
fileName = batchKeyRotationName
|
||||||
|
case j.Expire != nil:
|
||||||
|
fileName = batchExpireName
|
||||||
|
default:
|
||||||
|
return "", errors.New("unknown job type")
|
||||||
|
}
|
||||||
|
return pathJoin(batchJobReportsPrefix, j.ID, fileName), nil
|
||||||
|
}
|
||||||
|
|
||||||
func (j *BatchJobRequest) save(ctx context.Context, api ObjectLayer) error {
|
func (j *BatchJobRequest) save(ctx context.Context, api ObjectLayer) error {
|
||||||
if j.Replicate == nil && j.KeyRotate == nil && j.Expire == nil {
|
if j.Replicate == nil && j.KeyRotate == nil && j.Expire == nil {
|
||||||
return errInvalidArgument
|
return errInvalidArgument
|
||||||
@ -1540,6 +1587,55 @@ func (a adminAPIHandlers) ListBatchJobs(w http.ResponseWriter, r *http.Request)
|
|||||||
batchLogIf(ctx, json.NewEncoder(w).Encode(&listResult))
|
batchLogIf(ctx, json.NewEncoder(w).Encode(&listResult))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// BatchJobStatus - returns the status of a batch job saved in the disk
|
||||||
|
func (a adminAPIHandlers) BatchJobStatus(w http.ResponseWriter, r *http.Request) {
|
||||||
|
ctx := r.Context()
|
||||||
|
|
||||||
|
objectAPI, _ := validateAdminReq(ctx, w, r, policy.ListBatchJobsAction)
|
||||||
|
if objectAPI == nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
jobID := r.Form.Get("jobId")
|
||||||
|
if jobID == "" {
|
||||||
|
writeErrorResponseJSON(ctx, w, toAPIError(ctx, errInvalidArgument), r.URL)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
req := BatchJobRequest{ID: jobID}
|
||||||
|
if i := strings.Index(jobID, "-"); i > 0 {
|
||||||
|
switch madmin.BatchJobType(jobID[:i]) {
|
||||||
|
case madmin.BatchJobReplicate:
|
||||||
|
req.Replicate = &BatchJobReplicateV1{}
|
||||||
|
case madmin.BatchJobKeyRotate:
|
||||||
|
req.KeyRotate = &BatchJobKeyRotateV1{}
|
||||||
|
case madmin.BatchJobExpire:
|
||||||
|
req.Expire = &BatchJobExpire{}
|
||||||
|
default:
|
||||||
|
writeErrorResponseJSON(ctx, w, toAPIError(ctx, errors.New("job ID format unrecognized")), r.URL)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
ri := &batchJobInfo{}
|
||||||
|
if err := ri.load(ctx, objectAPI, req); err != nil {
|
||||||
|
if !errors.Is(err, errNoSuchJob) {
|
||||||
|
batchLogIf(ctx, err)
|
||||||
|
}
|
||||||
|
writeErrorResponseJSON(ctx, w, toAPIError(ctx, err), r.URL)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
buf, err := json.Marshal(madmin.BatchJobStatus{LastMetric: ri.metric()})
|
||||||
|
if err != nil {
|
||||||
|
batchLogIf(ctx, err)
|
||||||
|
writeErrorResponseJSON(ctx, w, toAPIError(ctx, err), r.URL)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
w.Write(buf)
|
||||||
|
}
|
||||||
|
|
||||||
var errNoSuchJob = errors.New("no such job")
|
var errNoSuchJob = errors.New("no such job")
|
||||||
|
|
||||||
// DescribeBatchJob returns the currently active batch job definition
|
// DescribeBatchJob returns the currently active batch job definition
|
||||||
@ -1631,7 +1727,7 @@ func (a adminAPIHandlers) StartBatchJob(w http.ResponseWriter, r *http.Request)
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
job.ID = fmt.Sprintf("%s%s%d", shortuuid.New(), getKeySeparator(), GetProxyEndpointLocalIndex(globalProxyEndpoints))
|
job.ID = fmt.Sprintf("%s-%s%s%d", job.Type(), shortuuid.New(), getKeySeparator(), GetProxyEndpointLocalIndex(globalProxyEndpoints))
|
||||||
job.User = user
|
job.User = user
|
||||||
job.Started = time.Now()
|
job.Started = time.Now()
|
||||||
|
|
||||||
@ -1720,9 +1816,54 @@ func newBatchJobPool(ctx context.Context, o ObjectLayer, workers int) *BatchJobP
|
|||||||
}
|
}
|
||||||
jpool.ResizeWorkers(workers)
|
jpool.ResizeWorkers(workers)
|
||||||
jpool.resume()
|
jpool.resume()
|
||||||
|
|
||||||
|
go jpool.cleanupReports()
|
||||||
|
|
||||||
return jpool
|
return jpool
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (j *BatchJobPool) cleanupReports() {
|
||||||
|
randomWait := func() time.Duration {
|
||||||
|
// randomWait depends on the number of nodes to avoid triggering the cleanup at the same time
|
||||||
|
return time.Duration(rand.Float64() * float64(time.Duration(globalEndpoints.NEndpoints())*time.Hour))
|
||||||
|
}
|
||||||
|
|
||||||
|
t := time.NewTimer(randomWait())
|
||||||
|
defer t.Stop()
|
||||||
|
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-GlobalContext.Done():
|
||||||
|
return
|
||||||
|
case <-t.C:
|
||||||
|
results := make(chan itemOrErr[ObjectInfo], 100)
|
||||||
|
ctx, cancel := context.WithCancel(j.ctx)
|
||||||
|
defer cancel()
|
||||||
|
if err := j.objLayer.Walk(ctx, minioMetaBucket, batchJobReportsPrefix, results, WalkOptions{}); err != nil {
|
||||||
|
batchLogIf(j.ctx, err)
|
||||||
|
t.Reset(randomWait())
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
for result := range results {
|
||||||
|
if result.Err != nil {
|
||||||
|
batchLogIf(j.ctx, result.Err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
ri := &batchJobInfo{}
|
||||||
|
if err := ri.loadByPath(ctx, j.objLayer, result.Item.Name); err != nil {
|
||||||
|
batchLogIf(ctx, err)
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if (ri.Complete || ri.Failed) && time.Since(ri.LastUpdate) > oldJobsExpiration {
|
||||||
|
deleteConfig(ctx, j.objLayer, result.Item.Name)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
t.Reset(randomWait())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (j *BatchJobPool) resume() {
|
func (j *BatchJobPool) resume() {
|
||||||
results := make(chan itemOrErr[ObjectInfo], 100)
|
results := make(chan itemOrErr[ObjectInfo], 100)
|
||||||
ctx, cancel := context.WithCancel(j.ctx)
|
ctx, cancel := context.WithCancel(j.ctx)
|
||||||
@ -1986,7 +2127,7 @@ func (m *batchJobMetrics) purgeJobMetrics() {
|
|||||||
var toDeleteJobMetrics []string
|
var toDeleteJobMetrics []string
|
||||||
m.RLock()
|
m.RLock()
|
||||||
for id, metrics := range m.metrics {
|
for id, metrics := range m.metrics {
|
||||||
if time.Since(metrics.LastUpdate) > 24*time.Hour && (metrics.Complete || metrics.Failed) {
|
if time.Since(metrics.LastUpdate) > oldJobsExpiration && (metrics.Complete || metrics.Failed) {
|
||||||
toDeleteJobMetrics = append(toDeleteJobMetrics, id)
|
toDeleteJobMetrics = append(toDeleteJobMetrics, id)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -257,7 +257,7 @@ func (r *BatchJobKeyRotateV1) Start(ctx context.Context, api ObjectLayer, job Ba
|
|||||||
JobType: string(job.Type()),
|
JobType: string(job.Type()),
|
||||||
StartTime: job.Started,
|
StartTime: job.Started,
|
||||||
}
|
}
|
||||||
if err := ri.load(ctx, api, job); err != nil {
|
if err := ri.loadOrInit(ctx, api, job); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
if ri.Complete {
|
if ri.Complete {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user