replication: make large workers configurable (#20077)

This PR also improves throttling by reducing the number of tokens requested
from the rate limiter to what is currently available, so that throttle waits
do not exceed their deadlines.
Author: Poorna
Date: 2024-07-12 07:57:31 -07:00 (committed by GitHub)
parent ef802f2b2c
commit 989c318a28
4 changed files with 129 additions and 55 deletions
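
The throttling change can be illustrated with a small standalone sketch, assuming a golang.org/x/time/rate-style limiter (the Tokens/WaitN calls in the MonitoredReader hunk below match that API): before calling WaitN, clamp the request to the tokens currently available so a large read cannot force a wait that overruns the context deadline. The limiter rate and sizes here are illustrative, not MinIO defaults.

// Sketch only: clamp the requested tokens to what the limiter can grant now,
// mirroring the MonitoredReader change below.
package main

import (
    "context"
    "fmt"
    "time"

    "golang.org/x/time/rate"
)

func main() {
    throttle := rate.NewLimiter(rate.Limit(1<<20), 4<<20) // 1 MiB/s, 4 MiB burst

    want := 4 << 20 // bytes we would like to read in this iteration
    tokens := want

    // Reduce the request according to availability, as the PR does.
    if av := int(throttle.Tokens()); av > 0 && av < tokens {
        tokens = av
    }

    ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
    defer cancel()
    if err := throttle.WaitN(ctx, tokens); err != nil {
        fmt.Println("throttle wait failed:", err)
        return
    }
    fmt.Println("granted", tokens, "of", want, "requested bytes")
}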


@@ -1803,12 +1803,15 @@ var (
 type ReplicationPool struct {
     // atomic ops:
     activeWorkers    int32
+    activeLrgWorkers int32
     activeMRFWorkers int32
     objLayer         ObjectLayer
     ctx              context.Context
     priority         string
     maxWorkers       int
+    maxLWorkers      int
     mu               sync.RWMutex
     mrfMU            sync.Mutex
     resyncer         *replicationResyncer
@@ -1882,9 +1885,13 @@ func NewReplicationPool(ctx context.Context, o ObjectLayer, opts replicationPool
     if maxWorkers > 0 && failedWorkers > maxWorkers {
         failedWorkers = maxWorkers
     }
+    maxLWorkers := LargeWorkerCount
+    if opts.MaxLWorkers > 0 {
+        maxLWorkers = opts.MaxLWorkers
+    }
     pool := &ReplicationPool{
         workers:         make([]chan ReplicationWorkerOperation, 0, workers),
-        lrgworkers:      make([]chan ReplicationWorkerOperation, 0, LargeWorkerCount),
+        lrgworkers:      make([]chan ReplicationWorkerOperation, 0, maxLWorkers),
         mrfReplicaCh:    make(chan ReplicationWorkerOperation, 100000),
         mrfWorkerKillCh: make(chan struct{}, failedWorkers),
         resyncer:        newresyncer(),
@@ -1894,9 +1901,10 @@ func NewReplicationPool(ctx context.Context, o ObjectLayer, opts replicationPool
         objLayer:    o,
         priority:    priority,
         maxWorkers:  maxWorkers,
+        maxLWorkers: maxLWorkers,
     }
-    pool.AddLargeWorkers()
+    pool.ResizeLrgWorkers(maxLWorkers, 0)
     pool.ResizeWorkers(workers, 0)
     pool.ResizeFailedWorkers(failedWorkers)
     go pool.resyncer.PersistToDisk(ctx, o)
@@ -1975,23 +1983,8 @@ func (p *ReplicationPool) AddWorker(input <-chan ReplicationWorkerOperation, opT
     }
 }
 
-// AddLargeWorkers adds a static number of workers to handle large uploads
-func (p *ReplicationPool) AddLargeWorkers() {
-    for i := 0; i < LargeWorkerCount; i++ {
-        p.lrgworkers = append(p.lrgworkers, make(chan ReplicationWorkerOperation, 100000))
-        i := i
-        go p.AddLargeWorker(p.lrgworkers[i])
-    }
-    go func() {
-        <-p.ctx.Done()
-        for i := 0; i < LargeWorkerCount; i++ {
-            xioutil.SafeClose(p.lrgworkers[i])
-        }
-    }()
-}
-
 // AddLargeWorker adds a replication worker to the static pool for large uploads.
-func (p *ReplicationPool) AddLargeWorker(input <-chan ReplicationWorkerOperation) {
+func (p *ReplicationPool) AddLargeWorker(input <-chan ReplicationWorkerOperation, opTracker *int32) {
     for {
         select {
         case <-p.ctx.Done():
@@ -2002,11 +1995,23 @@ func (p *ReplicationPool) AddLargeWorker(input <-chan ReplicationWorkerOperation
             }
             switch v := oi.(type) {
             case ReplicateObjectInfo:
+                if opTracker != nil {
+                    atomic.AddInt32(opTracker, 1)
+                }
                 globalReplicationStats.incQ(v.Bucket, v.Size, v.DeleteMarker, v.OpType)
                 replicateObject(p.ctx, v, p.objLayer)
                 globalReplicationStats.decQ(v.Bucket, v.Size, v.DeleteMarker, v.OpType)
+                if opTracker != nil {
+                    atomic.AddInt32(opTracker, -1)
+                }
             case DeletedObjectReplicationInfo:
+                if opTracker != nil {
+                    atomic.AddInt32(opTracker, 1)
+                }
                 replicateDelete(p.ctx, v, p.objLayer)
+                if opTracker != nil {
+                    atomic.AddInt32(opTracker, -1)
+                }
             default:
                 bugLogIf(p.ctx, fmt.Errorf("unknown replication type: %T", oi), "unknown-replicate-type")
             }
@@ -2014,6 +2019,30 @@ func (p *ReplicationPool) AddLargeWorker(input <-chan ReplicationWorkerOperation
     }
 }
 
+// ResizeLrgWorkers sets replication workers pool for large transfers(>=128MiB) to new size.
+// checkOld can be set to an expected value.
+// If the worker count changed
+func (p *ReplicationPool) ResizeLrgWorkers(n, checkOld int) {
+    p.mu.Lock()
+    defer p.mu.Unlock()
+    if (checkOld > 0 && len(p.lrgworkers) != checkOld) || n == len(p.lrgworkers) || n < 1 {
+        // Either already satisfied or worker count changed while we waited for the lock.
+        return
+    }
+    for len(p.lrgworkers) < n {
+        input := make(chan ReplicationWorkerOperation, 100000)
+        p.lrgworkers = append(p.lrgworkers, input)
+        go p.AddLargeWorker(input, &p.activeLrgWorkers)
+    }
+    for len(p.lrgworkers) > n {
+        worker := p.lrgworkers[len(p.lrgworkers)-1]
+        p.lrgworkers = p.lrgworkers[:len(p.lrgworkers)-1]
+        xioutil.SafeClose(worker)
+    }
+}
+
 // ActiveWorkers returns the number of active workers handling replication traffic.
 func (p *ReplicationPool) ActiveWorkers() int {
     return int(atomic.LoadInt32(&p.activeWorkers))
@@ -2024,6 +2053,11 @@ func (p *ReplicationPool) ActiveMRFWorkers() int {
     return int(atomic.LoadInt32(&p.activeMRFWorkers))
 }
 
+// ActiveLrgWorkers returns the number of active workers handling traffic > 128MiB object size.
+func (p *ReplicationPool) ActiveLrgWorkers() int {
+    return int(atomic.LoadInt32(&p.activeLrgWorkers))
+}
+
 // ResizeWorkers sets replication workers pool to new size.
 // checkOld can be set to an expected value.
 // If the worker count changed
@@ -2049,7 +2083,7 @@ func (p *ReplicationPool) ResizeWorkers(n, checkOld int) {
 }
 
 // ResizeWorkerPriority sets replication failed workers pool size
-func (p *ReplicationPool) ResizeWorkerPriority(pri string, maxWorkers int) {
+func (p *ReplicationPool) ResizeWorkerPriority(pri string, maxWorkers, maxLWorkers int) {
     var workers, mrfWorkers int
     p.mu.Lock()
     switch pri {
@@ -2076,11 +2110,15 @@ func (p *ReplicationPool) ResizeWorkerPriority(pri string, maxWorkers int) {
     if maxWorkers > 0 && mrfWorkers > maxWorkers {
         mrfWorkers = maxWorkers
     }
+    if maxLWorkers <= 0 {
+        maxLWorkers = LargeWorkerCount
+    }
     p.priority = pri
     p.maxWorkers = maxWorkers
     p.mu.Unlock()
     p.ResizeWorkers(workers, 0)
     p.ResizeFailedWorkers(mrfWorkers)
+    p.ResizeLrgWorkers(maxLWorkers, 0)
 }
 
 // ResizeFailedWorkers sets replication failed workers pool size
@@ -2127,6 +2165,15 @@ func (p *ReplicationPool) queueReplicaTask(ri ReplicateObjectInfo) {
         case p.lrgworkers[h%LargeWorkerCount] <- ri:
         default:
             globalReplicationPool.queueMRFSave(ri.ToMRFEntry())
+            p.mu.RLock()
+            maxLWorkers := p.maxLWorkers
+            existing := len(p.lrgworkers)
+            p.mu.RUnlock()
+            maxLWorkers = min(maxLWorkers, LargeWorkerCount)
+            if p.ActiveLrgWorkers() < maxLWorkers {
+                workers := min(existing+1, maxLWorkers)
+                p.ResizeLrgWorkers(workers, existing)
+            }
         }
         return
     }
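
Taken together, the hunks above give the large-object queue an opportunistic scale-up: when a worker channel is full the item is parked in the MRF queue, and the pool grows by one worker only while the busy count is below the configured cap; passing the current size as checkOld makes ResizeLrgWorkers a no-op if another goroutine resized first. A standalone sketch of that compare-then-grow pattern, with illustrative names rather than MinIO's:

// Sketch only: grow by one, but back off if someone else already resized.
package main

import (
    "fmt"
    "sync"
)

type pool struct {
    mu      sync.RWMutex
    workers []chan int
}

// resize grows or shrinks the pool to n; if checkOld > 0 and the current
// size no longer matches it, another caller got there first and we do nothing.
func (p *pool) resize(n, checkOld int) {
    p.mu.Lock()
    defer p.mu.Unlock()
    if (checkOld > 0 && len(p.workers) != checkOld) || n == len(p.workers) || n < 1 {
        return
    }
    for len(p.workers) < n {
        p.workers = append(p.workers, make(chan int, 16))
    }
    for len(p.workers) > n {
        close(p.workers[len(p.workers)-1])
        p.workers = p.workers[:len(p.workers)-1]
    }
}

func main() {
    p := &pool{}
    p.resize(2, 0)

    existing := 2
    p.resize(existing+1, existing) // grows to 3
    p.resize(existing+1, existing) // no-op: size is now 3, not the expected 2
    fmt.Println(len(p.workers))    // 3
}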
@ -2231,6 +2278,7 @@ func (p *ReplicationPool) queueReplicaDeleteTask(doi DeletedObjectReplicationInf
type replicationPoolOpts struct { type replicationPoolOpts struct {
Priority string Priority string
MaxWorkers int MaxWorkers int
MaxLWorkers int
} }
func initBackgroundReplication(ctx context.Context, objectAPI ObjectLayer) { func initBackgroundReplication(ctx context.Context, objectAPI ObjectLayer) {


@@ -46,6 +46,7 @@ type apiConfig struct {
     corsAllowOrigins       []string
     replicationPriority    string
     replicationMaxWorkers  int
+    replicationMaxLWorkers int
     transitionWorkers      int
 
     staleUploadsExpiry time.Duration
@@ -170,11 +171,12 @@ func (t *apiConfig) init(cfg api.Config, setDriveCounts []int, legacy bool) {
     }
     t.listQuorum = listQuorum
     if globalReplicationPool != nil &&
-        (cfg.ReplicationPriority != t.replicationPriority || cfg.ReplicationMaxWorkers != t.replicationMaxWorkers) {
-        globalReplicationPool.ResizeWorkerPriority(cfg.ReplicationPriority, cfg.ReplicationMaxWorkers)
+        (cfg.ReplicationPriority != t.replicationPriority || cfg.ReplicationMaxWorkers != t.replicationMaxWorkers || cfg.ReplicationMaxLWorkers != t.replicationMaxLWorkers) {
+        globalReplicationPool.ResizeWorkerPriority(cfg.ReplicationPriority, cfg.ReplicationMaxWorkers, cfg.ReplicationMaxLWorkers)
     }
     t.replicationPriority = cfg.ReplicationPriority
     t.replicationMaxWorkers = cfg.ReplicationMaxWorkers
+    t.replicationMaxLWorkers = cfg.ReplicationMaxLWorkers
 
     // N B api.transition_workers will be deprecated
     if globalTransitionState != nil {
@@ -383,12 +385,14 @@ func (t *apiConfig) getReplicationOpts() replicationPoolOpts {
         return replicationPoolOpts{
             Priority:    "auto",
             MaxWorkers:  WorkerMaxLimit,
+            MaxLWorkers: LargeWorkerCount,
         }
     }
 
     return replicationPoolOpts{
         Priority:    t.replicationPriority,
         MaxWorkers:  t.replicationMaxWorkers,
+        MaxLWorkers: t.replicationMaxLWorkers,
     }
 }


@@ -74,12 +74,16 @@ func (r *MonitoredReader) Read(buf []byte) (n int, err error) {
         need = int(math.Min(float64(b), float64(need)))
         tokens = need
     }
 
+    // reduce tokens requested according to availability
+    av := int(r.throttle.Tokens())
+    if av < tokens && av > 0 {
+        tokens = av
+        need = int(math.Min(float64(tokens), float64(need)))
+    }
     err = r.throttle.WaitN(r.ctx, tokens)
     if err != nil {
         return
     }
 
     n, err = r.r.Read(buf[:need])
     if err != nil {
         r.lastErr = err


@@ -40,6 +40,7 @@ const (
     apiListQuorum                  = "list_quorum"
     apiReplicationPriority         = "replication_priority"
     apiReplicationMaxWorkers       = "replication_max_workers"
+    apiReplicationMaxLWorkers      = "replication_max_lrg_workers"
     apiTransitionWorkers           = "transition_workers"
     apiStaleUploadsCleanupInterval = "stale_uploads_cleanup_interval"
@@ -62,6 +63,8 @@ const (
     EnvAPISecureCiphers               = "MINIO_API_SECURE_CIPHERS" // default config.EnableOn
     EnvAPIReplicationPriority         = "MINIO_API_REPLICATION_PRIORITY"
     EnvAPIReplicationMaxWorkers       = "MINIO_API_REPLICATION_MAX_WORKERS"
+    EnvAPIReplicationMaxLWorkers      = "MINIO_API_REPLICATION_MAX_LRG_WORKERS"
     EnvAPIStaleUploadsCleanupInterval = "MINIO_API_STALE_UPLOADS_CLEANUP_INTERVAL"
     EnvAPIStaleUploadsExpiry          = "MINIO_API_STALE_UPLOADS_EXPIRY"
     EnvAPIDeleteCleanupInterval       = "MINIO_API_DELETE_CLEANUP_INTERVAL"
@ -117,6 +120,10 @@ var (
Key: apiReplicationMaxWorkers, Key: apiReplicationMaxWorkers,
Value: "500", Value: "500",
}, },
config.KV{
Key: apiReplicationMaxLWorkers,
Value: "10",
},
config.KV{ config.KV{
Key: apiTransitionWorkers, Key: apiTransitionWorkers,
Value: "100", Value: "100",
@@ -171,6 +178,7 @@ type Config struct {
     ListQuorum                  string        `json:"list_quorum"`
     ReplicationPriority         string        `json:"replication_priority"`
     ReplicationMaxWorkers       int           `json:"replication_max_workers"`
+    ReplicationMaxLWorkers      int           `json:"replication_max_lrg_workers"`
     TransitionWorkers           int           `json:"transition_workers"`
     StaleUploadsCleanupInterval time.Duration `json:"stale_uploads_cleanup_interval"`
     StaleUploadsExpiry          time.Duration `json:"stale_uploads_expiry"`
@@ -280,11 +288,21 @@ func LookupConfig(kvs config.KVS) (cfg Config, err error) {
     if err != nil {
         return cfg, err
     }
     if replicationMaxWorkers <= 0 || replicationMaxWorkers > 500 {
         return cfg, config.ErrInvalidReplicationWorkersValue(nil).Msg("Number of replication workers should be between 1 and 500")
     }
     cfg.ReplicationMaxWorkers = replicationMaxWorkers
 
+    replicationMaxLWorkers, err := strconv.Atoi(env.Get(EnvAPIReplicationMaxLWorkers, kvs.GetWithDefault(apiReplicationMaxLWorkers, DefaultKVS)))
+    if err != nil {
+        return cfg, err
+    }
+    if replicationMaxLWorkers <= 0 || replicationMaxLWorkers > 10 {
+        return cfg, config.ErrInvalidReplicationWorkersValue(nil).Msg("Number of replication workers for transfers >=128MiB should be between 1 and 10 per node")
+    }
+    cfg.ReplicationMaxLWorkers = replicationMaxLWorkers
+
     transitionWorkers, err := strconv.Atoi(env.Get(EnvAPITransitionWorkers, kvs.GetWithDefault(apiTransitionWorkers, DefaultKVS)))
     if err != nil {
         return cfg, err
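
For reference, the new knob follows the same environment-over-config pattern as replication_max_workers: MINIO_API_REPLICATION_MAX_LRG_WORKERS overrides the replication_max_lrg_workers key (default 10), and values outside 1-10 are rejected. A standalone sketch of that lookup, using a hypothetical helper rather than the MinIO implementation:

// Sketch only: env value wins over the default and must stay within 1..10,
// mirroring the LookupConfig validation above. lookupLrgWorkers is hypothetical.
package main

import (
    "fmt"
    "os"
    "strconv"
)

func lookupLrgWorkers(defaultValue int) (int, error) {
    v := os.Getenv("MINIO_API_REPLICATION_MAX_LRG_WORKERS")
    if v == "" {
        return defaultValue, nil
    }
    n, err := strconv.Atoi(v)
    if err != nil {
        return 0, err
    }
    if n <= 0 || n > 10 {
        return 0, fmt.Errorf("replication_max_lrg_workers should be between 1 and 10 per node, got %d", n)
    }
    return n, nil
}

func main() {
    os.Setenv("MINIO_API_REPLICATION_MAX_LRG_WORKERS", "4")
    n, err := lookupLrgWorkers(10)
    fmt.Println(n, err) // 4 <nil>
}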