replication: centralize healthcheck for remote targets (#15516)

This PR moves health check from minio-go client to being
managed on the server.

Additionally integrating health check into site replication
This commit is contained in:
Poorna 2022-08-16 17:46:22 -07:00 committed by GitHub
parent 48640b1de2
commit 21fe14201f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 169 additions and 40 deletions

View File

@ -249,7 +249,7 @@ func (a adminAPIHandlers) SetRemoteTargetHandler(w http.ResponseWriter, r *http.
}
if err = globalBucketTargetSys.SetTarget(ctx, bucket, &target, update); err != nil {
switch err.(type) {
case BucketRemoteConnectionErr:
case RemoteTargetConnectionErr:
writeErrorResponseJSON(ctx, w, errorCodes.ToAPIErrWithErr(ErrReplicationRemoteConnectionError, err), r.URL)
default:
writeErrorResponseJSON(ctx, w, toAPIError(ctx, err), r.URL)

View File

@ -888,7 +888,7 @@ var errorCodes = errorCodeMap{
},
ErrReplicationRemoteConnectionError: {
Code: "XMinioAdminReplicationRemoteConnectionError",
Description: "Remote service connection error - please check remote service credentials and target bucket",
Description: "Remote service connection error",
HTTPStatusCode: http.StatusNotFound,
},
ErrReplicationBandwidthLimitError: {
@ -2074,7 +2074,7 @@ func toAPIErrorCode(ctx context.Context, err error) (apiErr APIErrorCode) {
apiErr = ErrRemoteDestinationNotFoundError
case BucketRemoteTargetNotFound:
apiErr = ErrRemoteTargetNotFoundError
case BucketRemoteConnectionErr:
case RemoteTargetConnectionErr:
apiErr = ErrReplicationRemoteConnectionError
case BucketRemoteAlreadyExists:
apiErr = ErrBucketRemoteAlreadyExists

View File

@ -310,7 +310,7 @@ func (api objectAPIHandlers) ResetBucketReplicationStartHandler(w http.ResponseW
rinfo.Targets = append(rinfo.Targets, ResyncTarget{Arn: tgtArns[0], ResetID: target.ResetID})
if err = globalBucketTargetSys.SetTarget(ctx, bucket, &target, true); err != nil {
switch err.(type) {
case BucketRemoteConnectionErr:
case RemoteTargetConnectionErr:
writeErrorResponseJSON(ctx, w, errorCodes.ToAPIErrWithErr(ErrReplicationRemoteConnectionError, err), r.URL)
default:
writeErrorResponseJSON(ctx, w, toAPIError(ctx, err), r.URL)

View File

@ -534,7 +534,7 @@ func replicateDeleteToTarget(ctx context.Context, dobj DeletedObjectReplicationI
if dobj.VersionID != "" && rinfo.VersionPurgeStatus == Complete {
return
}
if tgt.IsOffline() {
if globalBucketTargetSys.isOffline(tgt.EndpointURL()) {
logger.LogIf(ctx, fmt.Errorf("remote target is offline for bucket:%s arn:%s", dobj.Bucket, tgt.ARN))
sendEvent(eventArgs{
BucketName: dobj.Bucket,
@ -1039,7 +1039,7 @@ func replicateObjectToTarget(ctx context.Context, ri ReplicateObjectInfo, object
rinfo.ReplicationResynced = true
return
}
if tgt.IsOffline() {
if globalBucketTargetSys.isOffline(tgt.EndpointURL()) {
logger.LogIf(ctx, fmt.Errorf("remote target is offline for bucket:%s arn:%s", bucket, tgt.ARN))
sendEvent(eventArgs{
EventName: event.ObjectReplicationNotTracked,
@ -1659,7 +1659,7 @@ func proxyHeadToRepTarget(ctx context.Context, bucket, object string, rs *HTTPRa
}
for _, t := range proxyTargets.Targets {
tgt = globalBucketTargetSys.GetRemoteTargetClient(ctx, t.Arn)
if tgt == nil || tgt.IsOffline() {
if tgt == nil || globalBucketTargetSys.isOffline(tgt.EndpointURL()) {
continue
}
// if proxying explicitly disabled on remote target

View File

@ -19,6 +19,8 @@ package cmd
import (
"context"
"fmt"
"net/url"
"sync"
"time"
@ -35,7 +37,9 @@ import (
)
const (
defaultHealthCheckDuration = 30 * time.Second
defaultHealthCheckDuration = 5 * time.Second
// default interval for reload of all remote target endpoints
defaultHealthCheckReloadDuration = 30 * time.Minute
)
// BucketTargetSys represents bucket targets subsystem
@ -43,6 +47,113 @@ type BucketTargetSys struct {
sync.RWMutex
arnRemotesMap map[string]*TargetClient
targetsMap map[string][]madmin.BucketTarget
hMutex sync.RWMutex
hc map[string]epHealth
hcClient *madmin.AnonymousClient
}
// epHealth struct represents health of a replication target endpoint.
type epHealth struct {
Endpoint string
Scheme string
Online bool
}
// isOffline returns current liveness result of remote target. Add endpoint to
// healthcheck map if missing and default to online status
func (sys *BucketTargetSys) isOffline(ep *url.URL) bool {
sys.hMutex.RLock()
defer sys.hMutex.RUnlock()
if h, ok := sys.hc[ep.Host]; ok {
return !h.Online
}
go sys.initHC(ep)
return false
}
func (sys *BucketTargetSys) initHC(ep *url.URL) {
sys.hMutex.Lock()
sys.hc[ep.Host] = epHealth{
Endpoint: ep.Host,
Scheme: ep.Scheme,
Online: true,
}
sys.hMutex.Unlock()
}
// newHCClient initializes an anonymous client for performing health check on the remote endpoints
func newHCClient() *madmin.AnonymousClient {
clnt, e := madmin.NewAnonymousClientNoEndpoint()
if e != nil {
logger.LogOnceIf(GlobalContext, fmt.Errorf("WARNING: Unable to initialize health check client"), string(replicationSubsystem))
return nil
}
clnt.SetCustomTransport(globalRemoteTargetTransport)
return clnt
}
// heartBeat performs liveness check on remote endpoints.
func (sys *BucketTargetSys) heartBeat(ctx context.Context) {
hcTimer := time.NewTimer(defaultHealthCheckDuration)
defer hcTimer.Stop()
for {
select {
case <-hcTimer.C:
sys.hMutex.RLock()
var eps []madmin.ServerProperties
for _, ep := range sys.hc {
eps = append(eps, madmin.ServerProperties{Endpoint: ep.Endpoint, Scheme: ep.Scheme})
}
sys.hMutex.RUnlock()
if len(eps) > 0 {
cctx, cancel := context.WithTimeout(ctx, 30*time.Second)
defer cancel()
m := map[string]epHealth{}
for result := range sys.hcClient.Alive(cctx, madmin.AliveOpts{}, eps...) {
var online bool
if result.Error == nil {
online = result.Online
}
m[result.Endpoint.Host] = epHealth{
Endpoint: result.Endpoint.Host,
Scheme: result.Endpoint.Scheme,
Online: online,
}
}
sys.hMutex.Lock()
sys.hc = m
sys.hMutex.Unlock()
}
hcTimer.Reset(defaultHealthCheckDuration)
case <-ctx.Done():
return
}
}
}
// periodically rebuild the healthcheck map from list of targets to clear
// out stale endpoints
func (sys *BucketTargetSys) reloadHealthCheckers(ctx context.Context) {
m := make(map[string]epHealth)
tgts := sys.ListTargets(ctx, "", "")
for _, t := range tgts {
if _, ok := m[t.Endpoint]; !ok {
scheme := "http"
if t.Secure {
scheme = "https"
}
m[t.Endpoint] = epHealth{
Online: true,
Endpoint: t.Endpoint,
Scheme: scheme,
}
}
}
sys.hMutex.Lock()
// swap out the map
sys.hc = m
sys.hMutex.Unlock()
}
// ListTargets lists bucket targets across tenant or for individual bucket, and returns
@ -91,9 +202,6 @@ func (sys *BucketTargetSys) Delete(bucket string) {
return
}
for _, t := range tgts {
if tgt, ok := sys.arnRemotesMap[t.Arn]; ok && tgt.healthCancelFn != nil {
tgt.healthCancelFn()
}
delete(sys.arnRemotesMap, t.Arn)
}
delete(sys.targetsMap, bucket)
@ -116,7 +224,7 @@ func (sys *BucketTargetSys) SetTarget(ctx context.Context, bucket string, tgt *m
if minio.ToErrorResponse(err).Code == "NoSuchBucket" {
return BucketRemoteTargetNotFound{Bucket: tgt.TargetBucket}
}
return BucketRemoteConnectionErr{Bucket: tgt.TargetBucket, Err: err}
return RemoteTargetConnectionErr{Bucket: tgt.TargetBucket, Err: err}
}
if tgt.Type == madmin.ReplicationService {
if !globalBucketVersioningSys.Enabled(bucket) {
@ -124,7 +232,7 @@ func (sys *BucketTargetSys) SetTarget(ctx context.Context, bucket string, tgt *m
}
vcfg, err := clnt.GetBucketVersioning(ctx, tgt.TargetBucket)
if err != nil {
return BucketRemoteConnectionErr{Bucket: tgt.TargetBucket, Err: err}
return RemoteTargetConnectionErr{Bucket: tgt.TargetBucket, Err: err}
}
if vcfg.Status != string(versioning.Enabled) {
return BucketRemoteTargetNotVersioned{Bucket: tgt.TargetBucket}
@ -152,10 +260,7 @@ func (sys *BucketTargetSys) SetTarget(ctx context.Context, bucket string, tgt *m
if !found && !update {
newtgts = append(newtgts, *tgt)
}
// cancel health check for previous target client to avoid leak.
if prevClnt, ok := sys.arnRemotesMap[tgt.Arn]; ok && prevClnt.healthCancelFn != nil {
prevClnt.healthCancelFn()
}
sys.targetsMap[bucket] = newtgts
sys.arnRemotesMap[tgt.Arn] = clnt
sys.updateBandwidthLimit(bucket, tgt.BandwidthLimit)
@ -227,9 +332,6 @@ func (sys *BucketTargetSys) RemoveTarget(ctx context.Context, bucket, arnStr str
return BucketRemoteTargetNotFound{Bucket: bucket}
}
sys.targetsMap[bucket] = targets
if tgt, ok := sys.arnRemotesMap[arnStr]; ok && tgt.healthCancelFn != nil {
tgt.healthCancelFn()
}
delete(sys.arnRemotesMap, arnStr)
sys.updateBandwidthLimit(bucket, 0)
return nil
@ -258,11 +360,29 @@ func (sys *BucketTargetSys) GetRemoteBucketTargetByArn(ctx context.Context, buck
}
// NewBucketTargetSys - creates new replication system.
func NewBucketTargetSys() *BucketTargetSys {
return &BucketTargetSys{
func NewBucketTargetSys(ctx context.Context) *BucketTargetSys {
sys := &BucketTargetSys{
arnRemotesMap: make(map[string]*TargetClient),
targetsMap: make(map[string][]madmin.BucketTarget),
hc: make(map[string]epHealth),
hcClient: newHCClient(),
}
// reload healthcheck endpoints map periodically to remove stale endpoints from the map.
go func() {
rTimer := time.NewTimer(defaultHealthCheckReloadDuration)
defer rTimer.Stop()
for {
select {
case <-rTimer.C:
sys.reloadHealthCheckers(ctx)
rTimer.Reset(defaultHealthCheckReloadDuration)
case <-ctx.Done():
return
}
}
}()
go sys.heartBeat(ctx)
return sys
}
// UpdateAllTargets updates target to reflect metadata updates
@ -276,9 +396,6 @@ func (sys *BucketTargetSys) UpdateAllTargets(bucket string, tgts *madmin.BucketT
// Remove existingtarget and arn association
if tgts, ok := sys.targetsMap[bucket]; ok {
for _, t := range tgts {
if tgt, ok := sys.arnRemotesMap[t.Arn]; ok && tgt.healthCancelFn != nil {
tgt.healthCancelFn()
}
delete(sys.arnRemotesMap, t.Arn)
}
delete(sys.targetsMap, bucket)
@ -345,10 +462,6 @@ func (sys *BucketTargetSys) getRemoteTargetClient(tcfg *madmin.BucketTarget) (*T
if tcfg.HealthCheckDuration >= 1 { // require minimum health check duration of 1 sec.
hcDuration = tcfg.HealthCheckDuration
}
cancelFn, err := api.HealthCheck(hcDuration)
if err != nil {
return nil, err
}
tc := &TargetClient{
Client: api,
healthCheckDuration: hcDuration,
@ -356,9 +469,10 @@ func (sys *BucketTargetSys) getRemoteTargetClient(tcfg *madmin.BucketTarget) (*T
Bucket: tcfg.TargetBucket,
StorageClass: tcfg.StorageClass,
disableProxy: tcfg.DisableProxy,
healthCancelFn: cancelFn,
ARN: tcfg.Arn,
ResetID: tcfg.ResetID,
Endpoint: tcfg.Endpoint,
Secure: tcfg.Secure,
}
return tc, nil
}
@ -432,7 +546,8 @@ type TargetClient struct {
replicateSync bool
StorageClass string // storage class on remote
disableProxy bool
healthCancelFn context.CancelFunc // cancellation function for client healthcheck
ARN string // ARN to uniquely identify remote target
ARN string // ARN to uniquely identify remote target
ResetID string
Endpoint string
Secure bool
}

View File

@ -422,11 +422,18 @@ func (e BucketRemoteTargetNotFound) Error() string {
return "Remote target not found: " + e.Bucket
}
// BucketRemoteConnectionErr remote target connection failure.
type BucketRemoteConnectionErr GenericError
// RemoteTargetConnectionErr remote target connection failure.
type RemoteTargetConnectionErr struct {
Err error
Bucket string
Endpoint string
}
func (e BucketRemoteConnectionErr) Error() string {
return fmt.Sprintf("Remote service endpoint or target bucket not available: %s \n\t%s", e.Bucket, e.Err.Error())
func (e RemoteTargetConnectionErr) Error() string {
if e.Bucket != "" {
return fmt.Sprintf("Remote service endpoint offline or target bucket/remote service credentials invalid: %s \n\t%s", e.Bucket, e.Err.Error())
}
return fmt.Sprintf("Remote service endpoint %s not available\n\t%s", e.Endpoint, e.Err.Error())
}
// BucketRemoteAlreadyExists remote already exists for this target type.

View File

@ -290,7 +290,7 @@ func initAllSubsystems() {
}
// Create new bucket replication subsytem
globalBucketTargetSys = NewBucketTargetSys()
globalBucketTargetSys = NewBucketTargetSys(GlobalContext)
// Create new ILM tier configuration subsystem
globalTierConfigMgr = NewTierConfigMgr()

View File

@ -2217,6 +2217,9 @@ func getAdminClient(endpoint, accessKey, secretKey string) (*madmin.AdminClient,
if err != nil {
return nil, err
}
if globalBucketTargetSys.isOffline(epURL) {
return nil, RemoteTargetConnectionErr{Endpoint: epURL.String(), Err: fmt.Errorf("remote target is offline for endpoint %s", epURL.String())}
}
client, err := madmin.New(epURL.Host, accessKey, secretKey, epURL.Scheme == "https")
if err != nil {
return nil, err
@ -2230,6 +2233,10 @@ func getS3Client(pc madmin.PeerSite) (*minioClient.Client, error) {
if err != nil {
return nil, err
}
if globalBucketTargetSys.isOffline(ep) {
return nil, RemoteTargetConnectionErr{Endpoint: ep.String(), Err: fmt.Errorf("remote target is offline for endpoint %s", ep.String())}
}
return minioClient.New(ep.Host, &minioClient.Options{
Creds: credentials.NewStaticV4(pc.AccessKey, pc.SecretKey, ""),
Secure: ep.Scheme == "https",

2
go.mod
View File

@ -48,7 +48,7 @@ require (
github.com/minio/dperf v0.4.2
github.com/minio/highwayhash v1.0.2
github.com/minio/kes v0.20.0
github.com/minio/madmin-go v1.4.17
github.com/minio/madmin-go v1.4.20
github.com/minio/minio-go/v7 v7.0.34
github.com/minio/pkg v1.3.0
github.com/minio/selfupdate v0.5.0

4
go.sum
View File

@ -622,8 +622,8 @@ github.com/minio/highwayhash v1.0.2/go.mod h1:BQskDq+xkJ12lmlUUi7U0M5Swg3EWR+dLT
github.com/minio/kes v0.20.0 h1:1tyC51Rr8zTregTESuT/QN/iebNMX7B9t7d3xLNMEpE=
github.com/minio/kes v0.20.0/go.mod h1:3FW1BQkMGQW78yhy+69tUq5bdcf5rnXJizyeKB9a/tc=
github.com/minio/madmin-go v1.3.5/go.mod h1:vGKGboQgGIWx4DuDUaXixjlIEZOCIp6ivJkQoiVaACc=
github.com/minio/madmin-go v1.4.17 h1:o8b42lrJW9qyIthIa+24IzygZjmPm04iSUSBITcvzYU=
github.com/minio/madmin-go v1.4.17/go.mod h1:ez87VmMtsxP7DRxjKJKD4RDNW+nhO2QF9KSzwxBDQ98=
github.com/minio/madmin-go v1.4.20 h1:OpPxc8uIaevJMpBGSZ2TXJvFGvPSDn0IT3VJPh7w90M=
github.com/minio/madmin-go v1.4.20/go.mod h1:ez87VmMtsxP7DRxjKJKD4RDNW+nhO2QF9KSzwxBDQ98=
github.com/minio/mc v0.0.0-20220805080128-351d021b924b h1:ikMXncKqNE/0acH6us6yy3v+gJBP7nGv/3Rc9F7vRio=
github.com/minio/mc v0.0.0-20220805080128-351d021b924b/go.mod h1:YUXIqqgGfFknByv0eeJSMBQl/WGuEN0XkpW68/ghBm0=
github.com/minio/md5-simd v1.1.0/go.mod h1:XpBqgZULrMYD3R+M28PcmP0CkI7PEMzB3U77ZrKZ0Gw=