mirror of
https://github.com/minio/minio.git
synced 2024-12-24 22:25:54 -05:00
replication: centralize healthcheck for remote targets (#15516)
This PR moves health check from minio-go client to being managed on the server. Additionally integrating health check into site replication
This commit is contained in:
parent
48640b1de2
commit
21fe14201f
@ -249,7 +249,7 @@ func (a adminAPIHandlers) SetRemoteTargetHandler(w http.ResponseWriter, r *http.
|
||||
}
|
||||
if err = globalBucketTargetSys.SetTarget(ctx, bucket, &target, update); err != nil {
|
||||
switch err.(type) {
|
||||
case BucketRemoteConnectionErr:
|
||||
case RemoteTargetConnectionErr:
|
||||
writeErrorResponseJSON(ctx, w, errorCodes.ToAPIErrWithErr(ErrReplicationRemoteConnectionError, err), r.URL)
|
||||
default:
|
||||
writeErrorResponseJSON(ctx, w, toAPIError(ctx, err), r.URL)
|
||||
|
@ -888,7 +888,7 @@ var errorCodes = errorCodeMap{
|
||||
},
|
||||
ErrReplicationRemoteConnectionError: {
|
||||
Code: "XMinioAdminReplicationRemoteConnectionError",
|
||||
Description: "Remote service connection error - please check remote service credentials and target bucket",
|
||||
Description: "Remote service connection error",
|
||||
HTTPStatusCode: http.StatusNotFound,
|
||||
},
|
||||
ErrReplicationBandwidthLimitError: {
|
||||
@ -2074,7 +2074,7 @@ func toAPIErrorCode(ctx context.Context, err error) (apiErr APIErrorCode) {
|
||||
apiErr = ErrRemoteDestinationNotFoundError
|
||||
case BucketRemoteTargetNotFound:
|
||||
apiErr = ErrRemoteTargetNotFoundError
|
||||
case BucketRemoteConnectionErr:
|
||||
case RemoteTargetConnectionErr:
|
||||
apiErr = ErrReplicationRemoteConnectionError
|
||||
case BucketRemoteAlreadyExists:
|
||||
apiErr = ErrBucketRemoteAlreadyExists
|
||||
|
@ -310,7 +310,7 @@ func (api objectAPIHandlers) ResetBucketReplicationStartHandler(w http.ResponseW
|
||||
rinfo.Targets = append(rinfo.Targets, ResyncTarget{Arn: tgtArns[0], ResetID: target.ResetID})
|
||||
if err = globalBucketTargetSys.SetTarget(ctx, bucket, &target, true); err != nil {
|
||||
switch err.(type) {
|
||||
case BucketRemoteConnectionErr:
|
||||
case RemoteTargetConnectionErr:
|
||||
writeErrorResponseJSON(ctx, w, errorCodes.ToAPIErrWithErr(ErrReplicationRemoteConnectionError, err), r.URL)
|
||||
default:
|
||||
writeErrorResponseJSON(ctx, w, toAPIError(ctx, err), r.URL)
|
||||
|
@ -534,7 +534,7 @@ func replicateDeleteToTarget(ctx context.Context, dobj DeletedObjectReplicationI
|
||||
if dobj.VersionID != "" && rinfo.VersionPurgeStatus == Complete {
|
||||
return
|
||||
}
|
||||
if tgt.IsOffline() {
|
||||
if globalBucketTargetSys.isOffline(tgt.EndpointURL()) {
|
||||
logger.LogIf(ctx, fmt.Errorf("remote target is offline for bucket:%s arn:%s", dobj.Bucket, tgt.ARN))
|
||||
sendEvent(eventArgs{
|
||||
BucketName: dobj.Bucket,
|
||||
@ -1039,7 +1039,7 @@ func replicateObjectToTarget(ctx context.Context, ri ReplicateObjectInfo, object
|
||||
rinfo.ReplicationResynced = true
|
||||
return
|
||||
}
|
||||
if tgt.IsOffline() {
|
||||
if globalBucketTargetSys.isOffline(tgt.EndpointURL()) {
|
||||
logger.LogIf(ctx, fmt.Errorf("remote target is offline for bucket:%s arn:%s", bucket, tgt.ARN))
|
||||
sendEvent(eventArgs{
|
||||
EventName: event.ObjectReplicationNotTracked,
|
||||
@ -1659,7 +1659,7 @@ func proxyHeadToRepTarget(ctx context.Context, bucket, object string, rs *HTTPRa
|
||||
}
|
||||
for _, t := range proxyTargets.Targets {
|
||||
tgt = globalBucketTargetSys.GetRemoteTargetClient(ctx, t.Arn)
|
||||
if tgt == nil || tgt.IsOffline() {
|
||||
if tgt == nil || globalBucketTargetSys.isOffline(tgt.EndpointURL()) {
|
||||
continue
|
||||
}
|
||||
// if proxying explicitly disabled on remote target
|
||||
|
@ -19,6 +19,8 @@ package cmd
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/url"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@ -35,7 +37,9 @@ import (
|
||||
)
|
||||
|
||||
const (
|
||||
defaultHealthCheckDuration = 30 * time.Second
|
||||
defaultHealthCheckDuration = 5 * time.Second
|
||||
// default interval for reload of all remote target endpoints
|
||||
defaultHealthCheckReloadDuration = 30 * time.Minute
|
||||
)
|
||||
|
||||
// BucketTargetSys represents bucket targets subsystem
|
||||
@ -43,6 +47,113 @@ type BucketTargetSys struct {
|
||||
sync.RWMutex
|
||||
arnRemotesMap map[string]*TargetClient
|
||||
targetsMap map[string][]madmin.BucketTarget
|
||||
hMutex sync.RWMutex
|
||||
hc map[string]epHealth
|
||||
hcClient *madmin.AnonymousClient
|
||||
}
|
||||
|
||||
// epHealth struct represents health of a replication target endpoint.
|
||||
type epHealth struct {
|
||||
Endpoint string
|
||||
Scheme string
|
||||
Online bool
|
||||
}
|
||||
|
||||
// isOffline returns current liveness result of remote target. Add endpoint to
|
||||
// healthcheck map if missing and default to online status
|
||||
func (sys *BucketTargetSys) isOffline(ep *url.URL) bool {
|
||||
sys.hMutex.RLock()
|
||||
defer sys.hMutex.RUnlock()
|
||||
if h, ok := sys.hc[ep.Host]; ok {
|
||||
return !h.Online
|
||||
}
|
||||
go sys.initHC(ep)
|
||||
return false
|
||||
}
|
||||
|
||||
func (sys *BucketTargetSys) initHC(ep *url.URL) {
|
||||
sys.hMutex.Lock()
|
||||
sys.hc[ep.Host] = epHealth{
|
||||
Endpoint: ep.Host,
|
||||
Scheme: ep.Scheme,
|
||||
Online: true,
|
||||
}
|
||||
sys.hMutex.Unlock()
|
||||
}
|
||||
|
||||
// newHCClient initializes an anonymous client for performing health check on the remote endpoints
|
||||
func newHCClient() *madmin.AnonymousClient {
|
||||
clnt, e := madmin.NewAnonymousClientNoEndpoint()
|
||||
if e != nil {
|
||||
logger.LogOnceIf(GlobalContext, fmt.Errorf("WARNING: Unable to initialize health check client"), string(replicationSubsystem))
|
||||
return nil
|
||||
}
|
||||
clnt.SetCustomTransport(globalRemoteTargetTransport)
|
||||
return clnt
|
||||
}
|
||||
|
||||
// heartBeat performs liveness check on remote endpoints.
|
||||
func (sys *BucketTargetSys) heartBeat(ctx context.Context) {
|
||||
hcTimer := time.NewTimer(defaultHealthCheckDuration)
|
||||
defer hcTimer.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-hcTimer.C:
|
||||
sys.hMutex.RLock()
|
||||
var eps []madmin.ServerProperties
|
||||
for _, ep := range sys.hc {
|
||||
eps = append(eps, madmin.ServerProperties{Endpoint: ep.Endpoint, Scheme: ep.Scheme})
|
||||
}
|
||||
sys.hMutex.RUnlock()
|
||||
|
||||
if len(eps) > 0 {
|
||||
cctx, cancel := context.WithTimeout(ctx, 30*time.Second)
|
||||
defer cancel()
|
||||
m := map[string]epHealth{}
|
||||
for result := range sys.hcClient.Alive(cctx, madmin.AliveOpts{}, eps...) {
|
||||
var online bool
|
||||
if result.Error == nil {
|
||||
online = result.Online
|
||||
}
|
||||
m[result.Endpoint.Host] = epHealth{
|
||||
Endpoint: result.Endpoint.Host,
|
||||
Scheme: result.Endpoint.Scheme,
|
||||
Online: online,
|
||||
}
|
||||
}
|
||||
sys.hMutex.Lock()
|
||||
sys.hc = m
|
||||
sys.hMutex.Unlock()
|
||||
}
|
||||
hcTimer.Reset(defaultHealthCheckDuration)
|
||||
case <-ctx.Done():
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// periodically rebuild the healthcheck map from list of targets to clear
|
||||
// out stale endpoints
|
||||
func (sys *BucketTargetSys) reloadHealthCheckers(ctx context.Context) {
|
||||
m := make(map[string]epHealth)
|
||||
tgts := sys.ListTargets(ctx, "", "")
|
||||
for _, t := range tgts {
|
||||
if _, ok := m[t.Endpoint]; !ok {
|
||||
scheme := "http"
|
||||
if t.Secure {
|
||||
scheme = "https"
|
||||
}
|
||||
m[t.Endpoint] = epHealth{
|
||||
Online: true,
|
||||
Endpoint: t.Endpoint,
|
||||
Scheme: scheme,
|
||||
}
|
||||
}
|
||||
}
|
||||
sys.hMutex.Lock()
|
||||
// swap out the map
|
||||
sys.hc = m
|
||||
sys.hMutex.Unlock()
|
||||
}
|
||||
|
||||
// ListTargets lists bucket targets across tenant or for individual bucket, and returns
|
||||
@ -91,9 +202,6 @@ func (sys *BucketTargetSys) Delete(bucket string) {
|
||||
return
|
||||
}
|
||||
for _, t := range tgts {
|
||||
if tgt, ok := sys.arnRemotesMap[t.Arn]; ok && tgt.healthCancelFn != nil {
|
||||
tgt.healthCancelFn()
|
||||
}
|
||||
delete(sys.arnRemotesMap, t.Arn)
|
||||
}
|
||||
delete(sys.targetsMap, bucket)
|
||||
@ -116,7 +224,7 @@ func (sys *BucketTargetSys) SetTarget(ctx context.Context, bucket string, tgt *m
|
||||
if minio.ToErrorResponse(err).Code == "NoSuchBucket" {
|
||||
return BucketRemoteTargetNotFound{Bucket: tgt.TargetBucket}
|
||||
}
|
||||
return BucketRemoteConnectionErr{Bucket: tgt.TargetBucket, Err: err}
|
||||
return RemoteTargetConnectionErr{Bucket: tgt.TargetBucket, Err: err}
|
||||
}
|
||||
if tgt.Type == madmin.ReplicationService {
|
||||
if !globalBucketVersioningSys.Enabled(bucket) {
|
||||
@ -124,7 +232,7 @@ func (sys *BucketTargetSys) SetTarget(ctx context.Context, bucket string, tgt *m
|
||||
}
|
||||
vcfg, err := clnt.GetBucketVersioning(ctx, tgt.TargetBucket)
|
||||
if err != nil {
|
||||
return BucketRemoteConnectionErr{Bucket: tgt.TargetBucket, Err: err}
|
||||
return RemoteTargetConnectionErr{Bucket: tgt.TargetBucket, Err: err}
|
||||
}
|
||||
if vcfg.Status != string(versioning.Enabled) {
|
||||
return BucketRemoteTargetNotVersioned{Bucket: tgt.TargetBucket}
|
||||
@ -152,10 +260,7 @@ func (sys *BucketTargetSys) SetTarget(ctx context.Context, bucket string, tgt *m
|
||||
if !found && !update {
|
||||
newtgts = append(newtgts, *tgt)
|
||||
}
|
||||
// cancel health check for previous target client to avoid leak.
|
||||
if prevClnt, ok := sys.arnRemotesMap[tgt.Arn]; ok && prevClnt.healthCancelFn != nil {
|
||||
prevClnt.healthCancelFn()
|
||||
}
|
||||
|
||||
sys.targetsMap[bucket] = newtgts
|
||||
sys.arnRemotesMap[tgt.Arn] = clnt
|
||||
sys.updateBandwidthLimit(bucket, tgt.BandwidthLimit)
|
||||
@ -227,9 +332,6 @@ func (sys *BucketTargetSys) RemoveTarget(ctx context.Context, bucket, arnStr str
|
||||
return BucketRemoteTargetNotFound{Bucket: bucket}
|
||||
}
|
||||
sys.targetsMap[bucket] = targets
|
||||
if tgt, ok := sys.arnRemotesMap[arnStr]; ok && tgt.healthCancelFn != nil {
|
||||
tgt.healthCancelFn()
|
||||
}
|
||||
delete(sys.arnRemotesMap, arnStr)
|
||||
sys.updateBandwidthLimit(bucket, 0)
|
||||
return nil
|
||||
@ -258,11 +360,29 @@ func (sys *BucketTargetSys) GetRemoteBucketTargetByArn(ctx context.Context, buck
|
||||
}
|
||||
|
||||
// NewBucketTargetSys - creates new replication system.
|
||||
func NewBucketTargetSys() *BucketTargetSys {
|
||||
return &BucketTargetSys{
|
||||
func NewBucketTargetSys(ctx context.Context) *BucketTargetSys {
|
||||
sys := &BucketTargetSys{
|
||||
arnRemotesMap: make(map[string]*TargetClient),
|
||||
targetsMap: make(map[string][]madmin.BucketTarget),
|
||||
hc: make(map[string]epHealth),
|
||||
hcClient: newHCClient(),
|
||||
}
|
||||
// reload healthcheck endpoints map periodically to remove stale endpoints from the map.
|
||||
go func() {
|
||||
rTimer := time.NewTimer(defaultHealthCheckReloadDuration)
|
||||
defer rTimer.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-rTimer.C:
|
||||
sys.reloadHealthCheckers(ctx)
|
||||
rTimer.Reset(defaultHealthCheckReloadDuration)
|
||||
case <-ctx.Done():
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
go sys.heartBeat(ctx)
|
||||
return sys
|
||||
}
|
||||
|
||||
// UpdateAllTargets updates target to reflect metadata updates
|
||||
@ -276,9 +396,6 @@ func (sys *BucketTargetSys) UpdateAllTargets(bucket string, tgts *madmin.BucketT
|
||||
// Remove existingtarget and arn association
|
||||
if tgts, ok := sys.targetsMap[bucket]; ok {
|
||||
for _, t := range tgts {
|
||||
if tgt, ok := sys.arnRemotesMap[t.Arn]; ok && tgt.healthCancelFn != nil {
|
||||
tgt.healthCancelFn()
|
||||
}
|
||||
delete(sys.arnRemotesMap, t.Arn)
|
||||
}
|
||||
delete(sys.targetsMap, bucket)
|
||||
@ -345,10 +462,6 @@ func (sys *BucketTargetSys) getRemoteTargetClient(tcfg *madmin.BucketTarget) (*T
|
||||
if tcfg.HealthCheckDuration >= 1 { // require minimum health check duration of 1 sec.
|
||||
hcDuration = tcfg.HealthCheckDuration
|
||||
}
|
||||
cancelFn, err := api.HealthCheck(hcDuration)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
tc := &TargetClient{
|
||||
Client: api,
|
||||
healthCheckDuration: hcDuration,
|
||||
@ -356,9 +469,10 @@ func (sys *BucketTargetSys) getRemoteTargetClient(tcfg *madmin.BucketTarget) (*T
|
||||
Bucket: tcfg.TargetBucket,
|
||||
StorageClass: tcfg.StorageClass,
|
||||
disableProxy: tcfg.DisableProxy,
|
||||
healthCancelFn: cancelFn,
|
||||
ARN: tcfg.Arn,
|
||||
ResetID: tcfg.ResetID,
|
||||
Endpoint: tcfg.Endpoint,
|
||||
Secure: tcfg.Secure,
|
||||
}
|
||||
return tc, nil
|
||||
}
|
||||
@ -432,7 +546,8 @@ type TargetClient struct {
|
||||
replicateSync bool
|
||||
StorageClass string // storage class on remote
|
||||
disableProxy bool
|
||||
healthCancelFn context.CancelFunc // cancellation function for client healthcheck
|
||||
ARN string // ARN to uniquely identify remote target
|
||||
ARN string // ARN to uniquely identify remote target
|
||||
ResetID string
|
||||
Endpoint string
|
||||
Secure bool
|
||||
}
|
||||
|
@ -422,11 +422,18 @@ func (e BucketRemoteTargetNotFound) Error() string {
|
||||
return "Remote target not found: " + e.Bucket
|
||||
}
|
||||
|
||||
// BucketRemoteConnectionErr remote target connection failure.
|
||||
type BucketRemoteConnectionErr GenericError
|
||||
// RemoteTargetConnectionErr remote target connection failure.
|
||||
type RemoteTargetConnectionErr struct {
|
||||
Err error
|
||||
Bucket string
|
||||
Endpoint string
|
||||
}
|
||||
|
||||
func (e BucketRemoteConnectionErr) Error() string {
|
||||
return fmt.Sprintf("Remote service endpoint or target bucket not available: %s \n\t%s", e.Bucket, e.Err.Error())
|
||||
func (e RemoteTargetConnectionErr) Error() string {
|
||||
if e.Bucket != "" {
|
||||
return fmt.Sprintf("Remote service endpoint offline or target bucket/remote service credentials invalid: %s \n\t%s", e.Bucket, e.Err.Error())
|
||||
}
|
||||
return fmt.Sprintf("Remote service endpoint %s not available\n\t%s", e.Endpoint, e.Err.Error())
|
||||
}
|
||||
|
||||
// BucketRemoteAlreadyExists remote already exists for this target type.
|
||||
|
@ -290,7 +290,7 @@ func initAllSubsystems() {
|
||||
}
|
||||
|
||||
// Create new bucket replication subsytem
|
||||
globalBucketTargetSys = NewBucketTargetSys()
|
||||
globalBucketTargetSys = NewBucketTargetSys(GlobalContext)
|
||||
|
||||
// Create new ILM tier configuration subsystem
|
||||
globalTierConfigMgr = NewTierConfigMgr()
|
||||
|
@ -2217,6 +2217,9 @@ func getAdminClient(endpoint, accessKey, secretKey string) (*madmin.AdminClient,
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if globalBucketTargetSys.isOffline(epURL) {
|
||||
return nil, RemoteTargetConnectionErr{Endpoint: epURL.String(), Err: fmt.Errorf("remote target is offline for endpoint %s", epURL.String())}
|
||||
}
|
||||
client, err := madmin.New(epURL.Host, accessKey, secretKey, epURL.Scheme == "https")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@ -2230,6 +2233,10 @@ func getS3Client(pc madmin.PeerSite) (*minioClient.Client, error) {
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if globalBucketTargetSys.isOffline(ep) {
|
||||
return nil, RemoteTargetConnectionErr{Endpoint: ep.String(), Err: fmt.Errorf("remote target is offline for endpoint %s", ep.String())}
|
||||
}
|
||||
|
||||
return minioClient.New(ep.Host, &minioClient.Options{
|
||||
Creds: credentials.NewStaticV4(pc.AccessKey, pc.SecretKey, ""),
|
||||
Secure: ep.Scheme == "https",
|
||||
|
2
go.mod
2
go.mod
@ -48,7 +48,7 @@ require (
|
||||
github.com/minio/dperf v0.4.2
|
||||
github.com/minio/highwayhash v1.0.2
|
||||
github.com/minio/kes v0.20.0
|
||||
github.com/minio/madmin-go v1.4.17
|
||||
github.com/minio/madmin-go v1.4.20
|
||||
github.com/minio/minio-go/v7 v7.0.34
|
||||
github.com/minio/pkg v1.3.0
|
||||
github.com/minio/selfupdate v0.5.0
|
||||
|
4
go.sum
4
go.sum
@ -622,8 +622,8 @@ github.com/minio/highwayhash v1.0.2/go.mod h1:BQskDq+xkJ12lmlUUi7U0M5Swg3EWR+dLT
|
||||
github.com/minio/kes v0.20.0 h1:1tyC51Rr8zTregTESuT/QN/iebNMX7B9t7d3xLNMEpE=
|
||||
github.com/minio/kes v0.20.0/go.mod h1:3FW1BQkMGQW78yhy+69tUq5bdcf5rnXJizyeKB9a/tc=
|
||||
github.com/minio/madmin-go v1.3.5/go.mod h1:vGKGboQgGIWx4DuDUaXixjlIEZOCIp6ivJkQoiVaACc=
|
||||
github.com/minio/madmin-go v1.4.17 h1:o8b42lrJW9qyIthIa+24IzygZjmPm04iSUSBITcvzYU=
|
||||
github.com/minio/madmin-go v1.4.17/go.mod h1:ez87VmMtsxP7DRxjKJKD4RDNW+nhO2QF9KSzwxBDQ98=
|
||||
github.com/minio/madmin-go v1.4.20 h1:OpPxc8uIaevJMpBGSZ2TXJvFGvPSDn0IT3VJPh7w90M=
|
||||
github.com/minio/madmin-go v1.4.20/go.mod h1:ez87VmMtsxP7DRxjKJKD4RDNW+nhO2QF9KSzwxBDQ98=
|
||||
github.com/minio/mc v0.0.0-20220805080128-351d021b924b h1:ikMXncKqNE/0acH6us6yy3v+gJBP7nGv/3Rc9F7vRio=
|
||||
github.com/minio/mc v0.0.0-20220805080128-351d021b924b/go.mod h1:YUXIqqgGfFknByv0eeJSMBQl/WGuEN0XkpW68/ghBm0=
|
||||
github.com/minio/md5-simd v1.1.0/go.mod h1:XpBqgZULrMYD3R+M28PcmP0CkI7PEMzB3U77ZrKZ0Gw=
|
||||
|
Loading…
Reference in New Issue
Block a user