From 1250312287a8cb4cfff33fda56e64c9767d8b135 Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Fri, 3 Sep 2021 17:05:41 -0700 Subject: [PATCH] fail ready/liveness if etcd is unhealthy in gateway mode (#13146) --- cmd/gateway-main.go | 3 --- cmd/healthcheck-handler.go | 23 +++++++++++++++++++++++ docs/metrics/healthcheck/README.md | 19 ++++++++++++++++++- 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/cmd/gateway-main.go b/cmd/gateway-main.go index 1dae42de1..4e84ef2d1 100644 --- a/cmd/gateway-main.go +++ b/cmd/gateway-main.go @@ -301,9 +301,6 @@ func StartGateway(ctx *cli.Context, gw Gateway) { logger.FatalIf(globalNotificationSys.Init(GlobalContext, buckets, newObject), "Unable to initialize notification system") } - // Initialize users credentials and policies in background. - globalIAMSys.InitStore(newObject) - go globalIAMSys.Init(GlobalContext, newObject) if globalCacheConfig.Enabled { diff --git a/cmd/healthcheck-handler.go b/cmd/healthcheck-handler.go index 221e2368b..ff37ce34b 100644 --- a/cmd/healthcheck-handler.go +++ b/cmd/healthcheck-handler.go @@ -95,6 +95,17 @@ func ReadinessCheckHandler(w http.ResponseWriter, r *http.Request) { w.Header().Set(xhttp.MinIOServerStatus, unavailable) } + if globalIsGateway && globalEtcdClient != nil { + // Borrowed from https://github.com/etcd-io/etcd/blob/main/etcdctl/ctlv3/command/ep_command.go#L118 + ctx, cancel := context.WithTimeout(r.Context(), defaultContextTimeout) + defer cancel() + // etcd unreachable throw an error for readiness. 
+ if _, err := globalEtcdClient.Get(ctx, "health"); err != nil { + writeErrorResponse(r.Context(), w, toAPIError(r.Context(), err), r.URL) + return + } + } + writeResponse(w, http.StatusOK, nil, mimeNone) } @@ -104,5 +115,17 @@ func LivenessCheckHandler(w http.ResponseWriter, r *http.Request) { // Service not initialized yet w.Header().Set(xhttp.MinIOServerStatus, unavailable) } + + if globalIsGateway && globalEtcdClient != nil { + // Borrowed from https://github.com/etcd-io/etcd/blob/main/etcdctl/ctlv3/command/ep_command.go#L118 + ctx, cancel := context.WithTimeout(r.Context(), defaultContextTimeout) + defer cancel() + // etcd unreachable throw an error for liveness. + if _, err := globalEtcdClient.Get(ctx, "health"); err != nil { + writeErrorResponse(r.Context(), w, toAPIError(r.Context(), err), r.URL) + return + } + } + writeResponse(w, http.StatusOK, nil, mimeNone) } diff --git a/docs/metrics/healthcheck/README.md b/docs/metrics/healthcheck/README.md index 2f055807c..b5930b1af 100644 --- a/docs/metrics/healthcheck/README.md +++ b/docs/metrics/healthcheck/README.md @@ -4,7 +4,7 @@ MinIO server exposes three un-authenticated, healthcheck endpoints liveness prob ### Liveness probe -This probe always responds with '200 OK'. When liveness probe fails, Kubernetes like platforms restart the container. +This probe responds with '200 OK' unless 'etcd' is configured and unreachable, in which case it fails. This failure case is specific to gateway mode. When the liveness probe fails, Kubernetes-like platforms restart the container. ``` livenessProbe: @@ -13,6 +13,23 @@ livenessProbe: port: 9000 scheme: HTTP initialDelaySeconds: 120 + periodSeconds: 30 + timeoutSeconds: 10 + successThreshold: 1 + failureThreshold: 3 +``` + +### Readiness probe + +This probe responds with '200 OK' unless 'etcd' is configured and unreachable, in which case it fails. This failure case is specific to gateway mode. When the readiness probe fails, Kubernetes-like platforms turn off routing to the container.
+ +``` +readinessProbe: + httpGet: + path: /minio/health/ready + port: 9000 + scheme: HTTP + initialDelaySeconds: 120 periodSeconds: 15 timeoutSeconds: 10 successThreshold: 1