read-health check endpoint returns success if cluster can serve read requests (#11310)

This commit is contained in:
Krishna Srinivas 2021-02-09 01:00:44 -08:00 committed by GitHub
parent 3d74efa6b1
commit 876b79b8d8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 98 additions and 7 deletions

View File

@ -1461,6 +1461,40 @@ type HealthResult struct {
WriteQuorum int WriteQuorum int
} }
// ReadHealth reports whether the cluster can serve read requests.
//
// It tallies, per pool and per erasure set, the disk IDs returned by
// globalNotificationSys.GetLocalDiskIDs together with those returned by
// getLocalDiskIDs(z), and returns true only when every set's tally is at
// least the read quorum taken from z.BackendInfo().
func (z *erasureServerPools) ReadHealth(ctx context.Context) bool {
	// upPerSet[pool][set] is the number of disk IDs seen for that set.
	upPerSet := make([][]int, len(z.serverPools))
	for p := range upPerSet {
		upPerSet[p] = make([]int, len(z.serverPools[p].sets))
	}

	allDiskIDs := append(globalNotificationSys.GetLocalDiskIDs(ctx), getLocalDiskIDs(z))
	for _, group := range allDiskIDs {
		for _, diskID := range group {
			p, s, err := z.getPoolAndSet(diskID)
			if err == nil {
				upPerSet[p][s]++
				continue
			}
			// An ID that cannot be mapped to a pool/set is logged and
			// simply not counted.
			logger.LogIf(ctx, err)
		}
	}

	// NOTE(review): the first StandardSCData entry is used as the threshold
	// for every pool - confirm this is intended if entries can differ per pool.
	quorum := z.BackendInfo().StandardSCData[0]

	for _, sets := range upPerSet {
		for _, up := range sets {
			if up < quorum {
				return false
			}
		}
	}
	return true
}
// Health - returns current status of the object layer health, // Health - returns current status of the object layer health,
// provides if write access exists across sets, additionally // provides if write access exists across sets, additionally
// can be used to query scenarios if health may be lost // can be used to query scenarios if health may be lost

View File

@ -1623,3 +1623,9 @@ func (fs *FSObjects) Health(ctx context.Context, opts HealthOptions) HealthResul
Healthy: newObjectLayerFn() != nil, Healthy: newObjectLayerFn() != nil,
} }
} }
// ReadHealth returns the "read" health of the object layer: true when
// os.Stat on fs.fsPath succeeds, false when it returns any error.
func (fs *FSObjects) ReadHealth(ctx context.Context) bool {
	if _, statErr := os.Stat(fs.fsPath); statErr != nil {
		return false
	}
	return true
}

View File

@ -254,3 +254,8 @@ func (a GatewayUnsupported) IsCompressionSupported() bool {
func (a GatewayUnsupported) Health(_ context.Context, _ HealthOptions) HealthResult { func (a GatewayUnsupported) Health(_ context.Context, _ HealthOptions) HealthResult {
return HealthResult{} return HealthResult{}
} }
// ReadHealth - No Op. It ignores the context and always reports
// read health as true.
func (a GatewayUnsupported) ReadHealth(_ context.Context) bool {
return true
}

View File

@ -216,7 +216,8 @@ func guessIsHealthCheckReq(req *http.Request) bool {
return aType == authTypeAnonymous && (req.Method == http.MethodGet || req.Method == http.MethodHead) && return aType == authTypeAnonymous && (req.Method == http.MethodGet || req.Method == http.MethodHead) &&
(req.URL.Path == healthCheckPathPrefix+healthCheckLivenessPath || (req.URL.Path == healthCheckPathPrefix+healthCheckLivenessPath ||
req.URL.Path == healthCheckPathPrefix+healthCheckReadinessPath || req.URL.Path == healthCheckPathPrefix+healthCheckReadinessPath ||
req.URL.Path == healthCheckPathPrefix+healthCheckClusterPath) req.URL.Path == healthCheckPathPrefix+healthCheckClusterPath ||
req.URL.Path == healthCheckPathPrefix+healthCheckClusterReadPath)
} }
// guessIsMetricsReq - returns true if incoming request looks // guessIsMetricsReq - returns true if incoming request looks

View File

@ -64,6 +64,29 @@ func ClusterCheckHandler(w http.ResponseWriter, r *http.Request) {
writeResponse(w, http.StatusOK, nil, mimeNone) writeResponse(w, http.StatusOK, nil, mimeNone)
} }
// ClusterReadCheckHandler reports whether the cluster can serve read requests.
//
// It replies '503 Service Unavailable' (with the MinIOServerStatus header set
// to unavailable) when shouldProxy() is true, '503 Service Unavailable' when
// the object layer's ReadHealth returns false, and '200 OK' otherwise. The
// ReadHealth call is bounded by globalAPIConfig.getClusterDeadline().
func ClusterReadCheckHandler(w http.ResponseWriter, r *http.Request) {
	ctx := newContext(r, w, "ClusterReadCheckHandler")

	if shouldProxy() {
		w.Header().Set(xhttp.MinIOServerStatus, unavailable)
		writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone)
		return
	}

	layer := newObjectLayerFn()

	deadlineCtx, cancel := context.WithTimeout(ctx, globalAPIConfig.getClusterDeadline())
	defer cancel()

	status := http.StatusOK
	if !layer.ReadHealth(deadlineCtx) {
		status = http.StatusServiceUnavailable
	}
	writeResponse(w, status, nil, mimeNone)
}
// ReadinessCheckHandler Checks if the process is up. Always returns success. // ReadinessCheckHandler Checks if the process is up. Always returns success.
func ReadinessCheckHandler(w http.ResponseWriter, r *http.Request) { func ReadinessCheckHandler(w http.ResponseWriter, r *http.Request) {
if shouldProxy() { if shouldProxy() {

View File

@ -23,11 +23,12 @@ import (
) )
const ( const (
healthCheckPath = "/health" healthCheckPath = "/health"
healthCheckLivenessPath = "/live" healthCheckLivenessPath = "/live"
healthCheckReadinessPath = "/ready" healthCheckReadinessPath = "/ready"
healthCheckClusterPath = "/cluster" healthCheckClusterPath = "/cluster"
healthCheckPathPrefix = minioReservedBucketPath + healthCheckPath healthCheckClusterReadPath = "/cluster/read"
healthCheckPathPrefix = minioReservedBucketPath + healthCheckPath
) )
// registerHealthCheckRouter - add handler functions for liveness and readiness routes. // registerHealthCheckRouter - add handler functions for liveness and readiness routes.
@ -38,6 +39,7 @@ func registerHealthCheckRouter(router *mux.Router) {
// Cluster check handler to verify cluster is active // Cluster check handler to verify cluster is active
healthRouter.Methods(http.MethodGet).Path(healthCheckClusterPath).HandlerFunc(httpTraceAll(ClusterCheckHandler)) healthRouter.Methods(http.MethodGet).Path(healthCheckClusterPath).HandlerFunc(httpTraceAll(ClusterCheckHandler))
healthRouter.Methods(http.MethodGet).Path(healthCheckClusterReadPath).HandlerFunc(httpTraceAll(ClusterReadCheckHandler))
// Liveness handler // Liveness handler
healthRouter.Methods(http.MethodGet).Path(healthCheckLivenessPath).HandlerFunc(httpTraceAll(LivenessCheckHandler)) healthRouter.Methods(http.MethodGet).Path(healthCheckLivenessPath).HandlerFunc(httpTraceAll(LivenessCheckHandler))

View File

@ -155,6 +155,7 @@ type ObjectLayer interface {
// Returns health of the backend // Returns health of the backend
Health(ctx context.Context, opts HealthOptions) HealthResult Health(ctx context.Context, opts HealthOptions) HealthResult
ReadHealth(ctx context.Context) bool
// ObjectTagging operations // ObjectTagging operations
PutObjectTags(context.Context, string, string, string, ObjectOptions) (ObjectInfo, error) PutObjectTags(context.Context, string, string, string, ObjectOptions) (ObjectInfo, error)

View File

@ -20,7 +20,8 @@ livenessProbe:
``` ```
### Cluster probe ### Cluster probe
This probe is not useful in almost all cases, this is meant for administrators to see if quorum is available in any given cluster. The reply is '200 OK' if cluster has quorum if not it returns '503 Service Unavailable'. #### Cluster-writeable probe
This probe is rarely needed for routine monitoring; it is meant for administrators to check whether write quorum is available in a given cluster. The reply is '200 OK' if the cluster has write quorum; otherwise it returns '503 Service Unavailable'.
``` ```
curl http://minio1:9001/minio/health/cluster curl http://minio1:9001/minio/health/cluster
@ -37,6 +38,24 @@ X-Xss-Protection: 1; mode=block
Date: Tue, 21 Jul 2020 00:36:14 GMT Date: Tue, 21 Jul 2020 00:36:14 GMT
``` ```
#### Cluster-readable probe
This probe is rarely needed for routine monitoring; it is meant for administrators to check whether read quorum is available in a given cluster. The reply is '200 OK' if the cluster has read quorum; otherwise it returns '503 Service Unavailable'.
```
curl http://minio1:9001/minio/health/cluster/read
HTTP/1.1 503 Service Unavailable
Accept-Ranges: bytes
Content-Length: 0
Content-Security-Policy: block-all-mixed-content
Server: MinIO/GOGET.GOGET
Vary: Origin
X-Amz-Bucket-Region: us-east-1
X-Minio-Write-Quorum: 3
X-Amz-Request-Id: 16239D6AB80EBECF
X-Xss-Protection: 1; mode=block
Date: Tue, 21 Jul 2020 00:36:14 GMT
```
#### Checking cluster health for maintenance #### Checking cluster health for maintenance
You may query the cluster probe endpoint to check if the node which received the request can be taken down for maintenance, if the server replies back '412 Precondition Failed' this means you will lose HA. '200 OK' means you are okay to proceed. You may query the cluster probe endpoint to check if the node which received the request can be taken down for maintenance, if the server replies back '412 Precondition Failed' this means you will lose HA. '200 OK' means you are okay to proceed.