mirror of
https://github.com/minio/minio.git
synced 2025-01-25 21:53:16 -05:00
add support for configurable replication MRF workers (#12125)
Just like the regular replication workers, the workers that retry recently failed replications (MRF workers) are now configurable, so that in situations such as a DR site failure, replication can catch up sooner once the DR site is back online. Signed-off-by: Harshavardhana <harsha@minio.io>
This commit is contained in:
parent
014e419151
commit
82dc6aff1c
@ -36,9 +36,9 @@ func (b *BucketReplicationStats) hasReplicationUsage() bool {
|
|||||||
|
|
||||||
// ReplicationStats holds the global in-memory replication stats
|
// ReplicationStats holds the global in-memory replication stats
|
||||||
type ReplicationStats struct {
|
type ReplicationStats struct {
|
||||||
sync.RWMutex
|
|
||||||
Cache map[string]*BucketReplicationStats
|
Cache map[string]*BucketReplicationStats
|
||||||
UsageCache map[string]*BucketReplicationStats // initial usage
|
UsageCache map[string]*BucketReplicationStats
|
||||||
|
sync.RWMutex
|
||||||
}
|
}
|
||||||
|
|
||||||
// Delete deletes in-memory replication statistics for a bucket.
|
// Delete deletes in-memory replication statistics for a bucket.
|
||||||
|
@ -819,32 +819,34 @@ var (
|
|||||||
|
|
||||||
// ReplicationPool describes replication pool
|
// ReplicationPool describes replication pool
|
||||||
type ReplicationPool struct {
|
type ReplicationPool struct {
|
||||||
once sync.Once
|
objLayer ObjectLayer
|
||||||
mu sync.Mutex
|
ctx context.Context
|
||||||
size int
|
mrfWorkerKillCh chan struct{}
|
||||||
|
workerKillCh chan struct{}
|
||||||
|
mrfReplicaDeleteCh chan DeletedObjectVersionInfo
|
||||||
replicaCh chan ReplicateObjectInfo
|
replicaCh chan ReplicateObjectInfo
|
||||||
replicaDeleteCh chan DeletedObjectVersionInfo
|
replicaDeleteCh chan DeletedObjectVersionInfo
|
||||||
mrfReplicaCh chan ReplicateObjectInfo
|
mrfReplicaCh chan ReplicateObjectInfo
|
||||||
mrfReplicaDeleteCh chan DeletedObjectVersionInfo
|
workerSize int
|
||||||
killCh chan struct{}
|
mrfWorkerSize int
|
||||||
wg sync.WaitGroup
|
workerWg sync.WaitGroup
|
||||||
ctx context.Context
|
mrfWorkerWg sync.WaitGroup
|
||||||
objLayer ObjectLayer
|
once sync.Once
|
||||||
|
mu sync.Mutex
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewReplicationPool creates a pool of replication workers of specified size
|
// NewReplicationPool creates a pool of replication workers of specified size
|
||||||
func NewReplicationPool(ctx context.Context, o ObjectLayer, sz int) *ReplicationPool {
|
func NewReplicationPool(ctx context.Context, o ObjectLayer, opts replicationPoolOpts) *ReplicationPool {
|
||||||
pool := &ReplicationPool{
|
pool := &ReplicationPool{
|
||||||
replicaCh: make(chan ReplicateObjectInfo, 1000),
|
replicaCh: make(chan ReplicateObjectInfo, 10000),
|
||||||
replicaDeleteCh: make(chan DeletedObjectVersionInfo, 1000),
|
replicaDeleteCh: make(chan DeletedObjectVersionInfo, 10000),
|
||||||
mrfReplicaCh: make(chan ReplicateObjectInfo, 100000),
|
mrfReplicaCh: make(chan ReplicateObjectInfo, 100000),
|
||||||
mrfReplicaDeleteCh: make(chan DeletedObjectVersionInfo, 100000),
|
mrfReplicaDeleteCh: make(chan DeletedObjectVersionInfo, 100000),
|
||||||
ctx: ctx,
|
ctx: ctx,
|
||||||
objLayer: o,
|
objLayer: o,
|
||||||
}
|
}
|
||||||
pool.Resize(sz)
|
pool.ResizeWorkers(opts.Workers)
|
||||||
// add long running worker for handling most recent failures/pending replications
|
pool.ResizeFailedWorkers(opts.FailedWorkers)
|
||||||
go pool.AddMRFWorker()
|
|
||||||
return pool
|
return pool
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -871,7 +873,7 @@ func (p *ReplicationPool) AddMRFWorker() {
|
|||||||
|
|
||||||
// AddWorker adds a replication worker to the pool
|
// AddWorker adds a replication worker to the pool
|
||||||
func (p *ReplicationPool) AddWorker() {
|
func (p *ReplicationPool) AddWorker() {
|
||||||
defer p.wg.Done()
|
defer p.workerWg.Done()
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case <-p.ctx.Done():
|
case <-p.ctx.Done():
|
||||||
@ -886,26 +888,42 @@ func (p *ReplicationPool) AddWorker() {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
replicateDelete(p.ctx, doi, p.objLayer)
|
replicateDelete(p.ctx, doi, p.objLayer)
|
||||||
case <-p.killCh:
|
case <-p.workerKillCh:
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//Resize replication pool to new size
|
// ResizeWorkers sets replication workers pool to new size
|
||||||
func (p *ReplicationPool) Resize(n int) {
|
func (p *ReplicationPool) ResizeWorkers(n int) {
|
||||||
p.mu.Lock()
|
p.mu.Lock()
|
||||||
defer p.mu.Unlock()
|
defer p.mu.Unlock()
|
||||||
|
|
||||||
for p.size < n {
|
for p.workerSize < n {
|
||||||
p.size++
|
p.workerSize++
|
||||||
p.wg.Add(1)
|
p.workerWg.Add(1)
|
||||||
go p.AddWorker()
|
go p.AddWorker()
|
||||||
}
|
}
|
||||||
for p.size > n {
|
for p.workerSize > n {
|
||||||
p.size--
|
p.workerSize--
|
||||||
go func() { p.killCh <- struct{}{} }()
|
go func() { p.workerKillCh <- struct{}{} }()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ResizeFailedWorkers sets replication failed workers pool size
|
||||||
|
func (p *ReplicationPool) ResizeFailedWorkers(n int) {
|
||||||
|
p.mu.Lock()
|
||||||
|
defer p.mu.Unlock()
|
||||||
|
|
||||||
|
for p.mrfWorkerSize < n {
|
||||||
|
p.mrfWorkerSize++
|
||||||
|
p.mrfWorkerWg.Add(1)
|
||||||
|
go p.AddMRFWorker()
|
||||||
|
}
|
||||||
|
for p.mrfWorkerSize > n {
|
||||||
|
p.mrfWorkerSize--
|
||||||
|
go func() { p.mrfWorkerKillCh <- struct{}{} }()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -943,8 +961,16 @@ func (p *ReplicationPool) queueReplicaDeleteTask(ctx context.Context, doi Delete
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type replicationPoolOpts struct {
|
||||||
|
Workers int
|
||||||
|
FailedWorkers int
|
||||||
|
}
|
||||||
|
|
||||||
func initBackgroundReplication(ctx context.Context, objectAPI ObjectLayer) {
|
func initBackgroundReplication(ctx context.Context, objectAPI ObjectLayer) {
|
||||||
globalReplicationPool = NewReplicationPool(ctx, objectAPI, globalAPIConfig.getReplicationWorkers())
|
globalReplicationPool = NewReplicationPool(ctx, objectAPI, replicationPoolOpts{
|
||||||
|
Workers: globalAPIConfig.getReplicationWorkers(),
|
||||||
|
FailedWorkers: globalAPIConfig.getReplicationFailedWorkers(),
|
||||||
|
})
|
||||||
globalReplicationStats = NewReplicationStats(ctx, objectAPI)
|
globalReplicationStats = NewReplicationStats(ctx, objectAPI)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -30,24 +30,26 @@ import (
|
|||||||
|
|
||||||
// API sub-system constants
|
// API sub-system constants
|
||||||
const (
|
const (
|
||||||
apiRequestsMax = "requests_max"
|
apiRequestsMax = "requests_max"
|
||||||
apiRequestsDeadline = "requests_deadline"
|
apiRequestsDeadline = "requests_deadline"
|
||||||
apiClusterDeadline = "cluster_deadline"
|
apiClusterDeadline = "cluster_deadline"
|
||||||
apiCorsAllowOrigin = "cors_allow_origin"
|
apiCorsAllowOrigin = "cors_allow_origin"
|
||||||
apiRemoteTransportDeadline = "remote_transport_deadline"
|
apiRemoteTransportDeadline = "remote_transport_deadline"
|
||||||
apiListQuorum = "list_quorum"
|
apiListQuorum = "list_quorum"
|
||||||
apiExtendListCacheLife = "extend_list_cache_life"
|
apiExtendListCacheLife = "extend_list_cache_life"
|
||||||
apiReplicationWorkers = "replication_workers"
|
apiReplicationWorkers = "replication_workers"
|
||||||
|
apiReplicationFailedWorkers = "replication_failed_workers"
|
||||||
|
|
||||||
EnvAPIRequestsMax = "MINIO_API_REQUESTS_MAX"
|
EnvAPIRequestsMax = "MINIO_API_REQUESTS_MAX"
|
||||||
EnvAPIRequestsDeadline = "MINIO_API_REQUESTS_DEADLINE"
|
EnvAPIRequestsDeadline = "MINIO_API_REQUESTS_DEADLINE"
|
||||||
EnvAPIClusterDeadline = "MINIO_API_CLUSTER_DEADLINE"
|
EnvAPIClusterDeadline = "MINIO_API_CLUSTER_DEADLINE"
|
||||||
EnvAPICorsAllowOrigin = "MINIO_API_CORS_ALLOW_ORIGIN"
|
EnvAPICorsAllowOrigin = "MINIO_API_CORS_ALLOW_ORIGIN"
|
||||||
EnvAPIRemoteTransportDeadline = "MINIO_API_REMOTE_TRANSPORT_DEADLINE"
|
EnvAPIRemoteTransportDeadline = "MINIO_API_REMOTE_TRANSPORT_DEADLINE"
|
||||||
EnvAPIListQuorum = "MINIO_API_LIST_QUORUM"
|
EnvAPIListQuorum = "MINIO_API_LIST_QUORUM"
|
||||||
EnvAPIExtendListCacheLife = "MINIO_API_EXTEND_LIST_CACHE_LIFE"
|
EnvAPIExtendListCacheLife = "MINIO_API_EXTEND_LIST_CACHE_LIFE"
|
||||||
EnvAPISecureCiphers = "MINIO_API_SECURE_CIPHERS"
|
EnvAPISecureCiphers = "MINIO_API_SECURE_CIPHERS"
|
||||||
EnvAPIReplicationWorkers = "MINIO_API_REPLICATION_WORKERS"
|
EnvAPIReplicationWorkers = "MINIO_API_REPLICATION_WORKERS"
|
||||||
|
EnvAPIReplicationFailedWorkers = "MINIO_API_REPLICATION_FAILED_WORKERS"
|
||||||
)
|
)
|
||||||
|
|
||||||
// Deprecated key and ENVs
|
// Deprecated key and ENVs
|
||||||
@ -91,19 +93,24 @@ var (
|
|||||||
Key: apiReplicationWorkers,
|
Key: apiReplicationWorkers,
|
||||||
Value: "500",
|
Value: "500",
|
||||||
},
|
},
|
||||||
|
config.KV{
|
||||||
|
Key: apiReplicationFailedWorkers,
|
||||||
|
Value: "4",
|
||||||
|
},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
// Config storage class configuration
|
// Config storage class configuration
|
||||||
type Config struct {
|
type Config struct {
|
||||||
RequestsMax int `json:"requests_max"`
|
RequestsMax int `json:"requests_max"`
|
||||||
RequestsDeadline time.Duration `json:"requests_deadline"`
|
RequestsDeadline time.Duration `json:"requests_deadline"`
|
||||||
ClusterDeadline time.Duration `json:"cluster_deadline"`
|
ClusterDeadline time.Duration `json:"cluster_deadline"`
|
||||||
CorsAllowOrigin []string `json:"cors_allow_origin"`
|
CorsAllowOrigin []string `json:"cors_allow_origin"`
|
||||||
RemoteTransportDeadline time.Duration `json:"remote_transport_deadline"`
|
RemoteTransportDeadline time.Duration `json:"remote_transport_deadline"`
|
||||||
ListQuorum string `json:"list_strict_quorum"`
|
ListQuorum string `json:"list_strict_quorum"`
|
||||||
ExtendListLife time.Duration `json:"extend_list_cache_life"`
|
ExtendListLife time.Duration `json:"extend_list_cache_life"`
|
||||||
ReplicationWorkers int `json:"replication_workers"`
|
ReplicationWorkers int `json:"replication_workers"`
|
||||||
|
ReplicationFailedWorkers int `json:"replication_failed_workers"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// UnmarshalJSON - Validate SS and RRS parity when unmarshalling JSON.
|
// UnmarshalJSON - Validate SS and RRS parity when unmarshalling JSON.
|
||||||
@ -190,14 +197,24 @@ func LookupConfig(kvs config.KVS) (cfg Config, err error) {
|
|||||||
return cfg, config.ErrInvalidReplicationWorkersValue(nil).Msg("Minimum number of replication workers should be 1")
|
return cfg, config.ErrInvalidReplicationWorkersValue(nil).Msg("Minimum number of replication workers should be 1")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
replicationFailedWorkers, err := strconv.Atoi(env.Get(EnvAPIReplicationFailedWorkers, kvs.Get(apiReplicationFailedWorkers)))
|
||||||
|
if err != nil {
|
||||||
|
return cfg, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if replicationFailedWorkers <= 0 {
|
||||||
|
return cfg, config.ErrInvalidReplicationWorkersValue(nil).Msg("Minimum number of replication failed workers should be 1")
|
||||||
|
}
|
||||||
|
|
||||||
return Config{
|
return Config{
|
||||||
RequestsMax: requestsMax,
|
RequestsMax: requestsMax,
|
||||||
RequestsDeadline: requestsDeadline,
|
RequestsDeadline: requestsDeadline,
|
||||||
ClusterDeadline: clusterDeadline,
|
ClusterDeadline: clusterDeadline,
|
||||||
CorsAllowOrigin: corsAllowOrigin,
|
CorsAllowOrigin: corsAllowOrigin,
|
||||||
RemoteTransportDeadline: remoteTransportDeadline,
|
RemoteTransportDeadline: remoteTransportDeadline,
|
||||||
ListQuorum: listQuorum,
|
ListQuorum: listQuorum,
|
||||||
ExtendListLife: listLife,
|
ExtendListLife: listLife,
|
||||||
ReplicationWorkers: replicationWorkers,
|
ReplicationWorkers: replicationWorkers,
|
||||||
|
ReplicationFailedWorkers: replicationFailedWorkers,
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
@ -52,5 +52,11 @@ var (
|
|||||||
Optional: true,
|
Optional: true,
|
||||||
Type: "number",
|
Type: "number",
|
||||||
},
|
},
|
||||||
|
config.HelpKV{
|
||||||
|
Key: apiReplicationFailedWorkers,
|
||||||
|
Description: `set the number of replication workers for recently failed replicas, defaults to 4`,
|
||||||
|
Optional: true,
|
||||||
|
Type: "number",
|
||||||
|
},
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
@ -37,8 +37,9 @@ type apiConfig struct {
|
|||||||
extendListLife time.Duration
|
extendListLife time.Duration
|
||||||
corsAllowOrigins []string
|
corsAllowOrigins []string
|
||||||
// total drives per erasure set across pools.
|
// total drives per erasure set across pools.
|
||||||
totalDriveCount int
|
totalDriveCount int
|
||||||
replicationWorkers int
|
replicationWorkers int
|
||||||
|
replicationFailedWorkers int
|
||||||
}
|
}
|
||||||
|
|
||||||
func (t *apiConfig) init(cfg api.Config, setDriveCounts []int) {
|
func (t *apiConfig) init(cfg api.Config, setDriveCounts []int) {
|
||||||
@ -83,8 +84,10 @@ func (t *apiConfig) init(cfg api.Config, setDriveCounts []int) {
|
|||||||
t.extendListLife = cfg.ExtendListLife
|
t.extendListLife = cfg.ExtendListLife
|
||||||
if globalReplicationPool != nil &&
|
if globalReplicationPool != nil &&
|
||||||
cfg.ReplicationWorkers != t.replicationWorkers {
|
cfg.ReplicationWorkers != t.replicationWorkers {
|
||||||
globalReplicationPool.Resize(cfg.ReplicationWorkers)
|
globalReplicationPool.ResizeFailedWorkers(cfg.ReplicationFailedWorkers)
|
||||||
|
globalReplicationPool.ResizeWorkers(cfg.ReplicationWorkers)
|
||||||
}
|
}
|
||||||
|
t.replicationFailedWorkers = cfg.ReplicationFailedWorkers
|
||||||
t.replicationWorkers = cfg.ReplicationWorkers
|
t.replicationWorkers = cfg.ReplicationWorkers
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -166,6 +169,13 @@ func maxClients(f http.HandlerFunc) http.HandlerFunc {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (t *apiConfig) getReplicationFailedWorkers() int {
|
||||||
|
t.mu.RLock()
|
||||||
|
defer t.mu.RUnlock()
|
||||||
|
|
||||||
|
return t.replicationFailedWorkers
|
||||||
|
}
|
||||||
|
|
||||||
func (t *apiConfig) getReplicationWorkers() int {
|
func (t *apiConfig) getReplicationWorkers() int {
|
||||||
t.mu.RLock()
|
t.mu.RLock()
|
||||||
defer t.mu.RUnlock()
|
defer t.mu.RUnlock()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user