read-health check endpoint returns success if cluster can serve read requests (#11310)

2025-07-14 11:21:52 -04:00 · 2021-02-09 01:00:44 -08:00 · 2021-02-09 01:00:44 -08:00 · 876b79b8d8
commit 876b79b8d8
parent 3d74efa6b1
8 changed files with 98 additions and 7 deletions
--- a/cmd/erasure-server-pool.go
+++ b/cmd/erasure-server-pool.go
@ -1461,6 +1461,40 @@ type HealthResult struct {
 	WriteQuorum   int
 }
 // ReadHealth reports whether every erasure set of every server pool has at
 // least a read quorum of known disk IDs. It returns false as soon as one set
 // falls short, true otherwise.
 func (z *erasureServerPools) ReadHealth(ctx context.Context) bool {
 	// onlinePerSet[pool][set] counts the disk IDs that resolve to that set.
 	onlinePerSet := make([][]int, len(z.serverPools))
 	for p := range onlinePerSet {
 		onlinePerSet[p] = make([]int, len(z.serverPools[p].sets))
 	}

 	// Disk IDs gathered through the notification system, followed by the
 	// ones returned by getLocalDiskIDs for this instance.
 	allIDs := append(globalNotificationSys.GetLocalDiskIDs(ctx), getLocalDiskIDs(z))
 	for _, ids := range allIDs {
 		for _, diskID := range ids {
 			p, s, err := z.getPoolAndSet(diskID)
 			if err == nil {
 				onlinePerSet[p][s]++
 				continue
 			}
 			// An ID that cannot be mapped to a pool/set is logged and not counted.
 			logger.LogIf(ctx, err)
 		}
 	}

 	// The required count is the first entry of StandardSCData.
 	// NOTE(review): the same value is applied to every pool — confirm that
 	// all pools are expected to share the first entry's quorum.
 	quorum := z.BackendInfo().StandardSCData[0]
 	for _, sets := range onlinePerSet {
 		for _, online := range sets {
 			if online < quorum {
 				return false
 			}
 		}
 	}
 	return true
 }
 // Health - returns current status of the object layer health,
 // provides if write access exists across sets, additionally
 // can be used to query scenarios if health may be lost
--- a/cmd/fs-v1.go
+++ b/cmd/fs-v1.go
@ -1623,3 +1623,9 @@ func (fs *FSObjects) Health(ctx context.Context, opts HealthOptions) HealthResul
 		Healthy: newObjectLayerFn() != nil,
 	}
 }
 // ReadHealth reports the "read" health of the object layer: it is true when
 // os.Stat on fs.fsPath succeeds and false when it returns any error.
 // The context argument is not used.
 func (fs *FSObjects) ReadHealth(ctx context.Context) bool {
 	if _, statErr := os.Stat(fs.fsPath); statErr != nil {
 		return false
 	}
 	return true
 }
--- a/cmd/gateway-unsupported.go
+++ b/cmd/gateway-unsupported.go
@ -254,3 +254,8 @@ func (a GatewayUnsupported) IsCompressionSupported() bool {
 func (a GatewayUnsupported) Health(_ context.Context, _ HealthOptions) HealthResult {
 	return HealthResult{}
 }
 // ReadHealth - No Op. It ignores the context and always reports true.
 func (a GatewayUnsupported) ReadHealth(_ context.Context) bool {
 	return true
 }
--- a/cmd/generic-handlers.go
+++ b/cmd/generic-handlers.go
@ -216,7 +216,8 @@ func guessIsHealthCheckReq(req *http.Request) bool {
 	return aType == authTypeAnonymous && (req.Method == http.MethodGet || req.Method == http.MethodHead) &&
 		(req.URL.Path == healthCheckPathPrefix+healthCheckLivenessPath ||
 			req.URL.Path == healthCheckPathPrefix+healthCheckReadinessPath ||
-			req.URL.Path == healthCheckPathPrefix+healthCheckClusterPath)
+			req.URL.Path == healthCheckPathPrefix+healthCheckClusterPath ||
 			req.URL.Path == healthCheckPathPrefix+healthCheckClusterReadPath)
 }
 // guessIsMetricsReq - returns true if incoming request looks
--- a/cmd/healthcheck-handler.go
+++ b/cmd/healthcheck-handler.go
@ -64,6 +64,29 @@ func ClusterCheckHandler(w http.ResponseWriter, r *http.Request) {
 	writeResponse(w, http.StatusOK, nil, mimeNone)
 }
 // ClusterReadCheckHandler reports whether the cluster can serve read requests.
 // It replies '200 OK' when the object layer's ReadHealth check passes and
 // '503 Service Unavailable' otherwise. The health check is bounded by the
 // deadline returned from globalAPIConfig.getClusterDeadline().
 func ClusterReadCheckHandler(w http.ResponseWriter, r *http.Request) {
 	ctx := newContext(r, w, "ClusterReadCheckHandler")
 	if shouldProxy() {
 		w.Header().Set(xhttp.MinIOServerStatus, unavailable)
 		writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone)
 		return
 	}
 	objLayer := newObjectLayerFn()
 	// newObjectLayerFn may return nil; answer "unavailable" instead of
 	// calling a method on a nil object layer.
 	if objLayer == nil {
 		w.Header().Set(xhttp.MinIOServerStatus, unavailable)
 		writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone)
 		return
 	}
 	ctx, cancel := context.WithTimeout(ctx, globalAPIConfig.getClusterDeadline())
 	defer cancel()
 	if !objLayer.ReadHealth(ctx) {
 		writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone)
 		return
 	}
 	writeResponse(w, http.StatusOK, nil, mimeNone)
 }
 // ReadinessCheckHandler Checks if the process is up. Always returns success.
 func ReadinessCheckHandler(w http.ResponseWriter, r *http.Request) {
 	if shouldProxy() {
--- a/cmd/healthcheck-router.go
+++ b/cmd/healthcheck-router.go
@ -23,11 +23,12 @@ import (
 )
 const (
-	healthCheckPath          = "/health"
+	healthCheckPath            = "/health"
-	healthCheckLivenessPath  = "/live"
+	healthCheckLivenessPath    = "/live"
-	healthCheckReadinessPath = "/ready"
+	healthCheckReadinessPath   = "/ready"
-	healthCheckClusterPath   = "/cluster"
+	healthCheckClusterPath     = "/cluster"
-	healthCheckPathPrefix    = minioReservedBucketPath + healthCheckPath
+	healthCheckClusterReadPath = "/cluster/read"
 	healthCheckPathPrefix      = minioReservedBucketPath + healthCheckPath
 )
 // registerHealthCheckRouter - add handler functions for liveness and readiness routes.
@ -38,6 +39,7 @@ func registerHealthCheckRouter(router *mux.Router) {
 	// Cluster check handler to verify cluster is active
 	healthRouter.Methods(http.MethodGet).Path(healthCheckClusterPath).HandlerFunc(httpTraceAll(ClusterCheckHandler))
 	healthRouter.Methods(http.MethodGet).Path(healthCheckClusterReadPath).HandlerFunc(httpTraceAll(ClusterReadCheckHandler))
 	// Liveness handler
 	healthRouter.Methods(http.MethodGet).Path(healthCheckLivenessPath).HandlerFunc(httpTraceAll(LivenessCheckHandler))
--- a/cmd/object-api-interface.go
+++ b/cmd/object-api-interface.go
@ -155,6 +155,7 @@ type ObjectLayer interface {
 	// Returns health of the backend
 	Health(ctx context.Context, opts HealthOptions) HealthResult
 	ReadHealth(ctx context.Context) bool
 	// ObjectTagging operations
 	PutObjectTags(context.Context, string, string, string, ObjectOptions) (ObjectInfo, error)
--- a/docs/metrics/healthcheck/README.md
+++ b/docs/metrics/healthcheck/README.md
@ -20,7 +20,8 @@ livenessProbe:
 ```
 ### Cluster probe
-This probe is not useful in almost all cases, this is meant for administrators to see if quorum is available in any given cluster. The reply is '200 OK' if cluster has quorum if not it returns '503 Service Unavailable'.
+#### Cluster-writeable probe
 This probe is not useful in almost all cases; it is meant for administrators to see if write quorum is available in any given cluster. The reply is '200 OK' if the cluster has write quorum; otherwise it returns '503 Service Unavailable'.
 ```
 curl http://minio1:9001/minio/health/cluster
@ -37,6 +38,24 @@ X-Xss-Protection: 1; mode=block
 Date: Tue, 21 Jul 2020 00:36:14 GMT
 ```
 #### Cluster-readable probe
 This probe is not useful in almost all cases; it is meant for administrators to see if read quorum is available in any given cluster. The reply is '200 OK' if the cluster has read quorum; otherwise it returns '503 Service Unavailable'.
 ```
 curl http://minio1:9001/minio/health/cluster/read
 HTTP/1.1 503 Service Unavailable
 Accept-Ranges: bytes
 Content-Length: 0
 Content-Security-Policy: block-all-mixed-content
 Server: MinIO/GOGET.GOGET
 Vary: Origin
 X-Amz-Bucket-Region: us-east-1
 X-Minio-Write-Quorum: 3
 X-Amz-Request-Id: 16239D6AB80EBECF
 X-Xss-Protection: 1; mode=block
 Date: Tue, 21 Jul 2020 00:36:14 GMT
 ```
 #### Checking cluster health for maintenance
 You may query the cluster probe endpoint to check if the node which received the request can be taken down for maintenance. If the server replies with '412 Precondition Failed', this means you will lose HA; '200 OK' means you are okay to proceed.