Add healthcheck endpoints (#5543)

This PR adds readiness and liveness endpoints to probe Minio server
instance health. Endpoints can only be accessed without authentication
and the paths are /minio/health/live and /minio/health/ready for
liveness and readiness respectively.

The new healthcheck liveness endpoint is used for Docker healthcheck
now.

Fixes #5357
Fixes #5514
This commit is contained in:
Nitish Tiwari 2018-03-12 11:46:53 +05:30 committed by GitHub
parent d90985b6d8
commit 10b01ac836
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 217 additions and 11 deletions

View File

@ -193,6 +193,18 @@ func guessIsBrowserReq(req *http.Request) bool {
return strings.Contains(req.Header.Get("User-Agent"), "Mozilla")
}
// guessIsHealthCheckReq - returns true if incoming request looks
// like healthcheck request
func guessIsHealthCheckReq(req *http.Request) bool {
if req == nil {
return false
}
aType := getRequestAuthType(req)
return req.Method == http.MethodGet && aType == authTypeAnonymous &&
(req.URL.Path == healthCheckPathPrefix+healthCheckLivenessPath ||
req.URL.Path == healthCheckPathPrefix+healthCheckReadinessPath)
}
// guessIsRPCReq - returns true if the request is for an RPC endpoint.
func guessIsRPCReq(req *http.Request) bool {
if req == nil {
@ -263,7 +275,7 @@ func setReservedBucketHandler(h http.Handler) http.Handler {
func (h minioReservedBucketHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
switch {
case guessIsRPCReq(r), guessIsBrowserReq(r), isAdminReq(r):
case guessIsRPCReq(r), guessIsBrowserReq(r), guessIsHealthCheckReq(r), isAdminReq(r):
// Allow access to reserved buckets
default:
// For all other requests reject access to reserved

View File

@ -0,0 +1,69 @@
/*
* Minio Cloud Storage, (C) 2018 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cmd
import (
"fmt"
"net/http"
"runtime"
)
const (
minioHealthGoroutineThreshold = 1000
)
// ReadinessCheckHandler -- checks if there are more than threshold number of goroutines running,
// returns service unavailable.
// Readiness probes are used to detect situations where application is under heavy load
// and temporarily unable to serve. In a orchestrated setup like Kubernetes, containers reporting
// that they are not ready do not receive traffic through Kubernetes Services.
func ReadinessCheckHandler(w http.ResponseWriter, r *http.Request) {
if err := goroutineCountCheck(minioHealthGoroutineThreshold); err != nil {
writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone)
return
}
writeResponse(w, http.StatusOK, nil, mimeNone)
}
// LivenessCheckHandler -- checks if server can ListBuckets internally. If not, server is
// considered to have failed and needs to be restarted.
// Liveness probes are used to detect situations where application (minio)
// has gone into a state where it can not recover except by being restarted.
func LivenessCheckHandler(w http.ResponseWriter, r *http.Request) {
objLayer := newObjectLayerFn()
// Service not initialized yet
if objLayer == nil {
writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone)
return
}
// List buckets is unsuccessful, means server is having issues, send 503 service unavailable
if _, err := objLayer.ListBuckets(); err != nil {
writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone)
return
}
writeResponse(w, http.StatusOK, nil, mimeNone)
}
// checks threshold against total number of go-routines in the system and throws error if
// more than threshold go-routines are running.
func goroutineCountCheck(threshold int) error {
count := runtime.NumGoroutine()
if count > threshold {
return fmt.Errorf("too many goroutines (%d > %d)", count, threshold)
}
return nil
}

View File

@ -0,0 +1,44 @@
/*
* Minio Cloud Storage, (C) 2018 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cmd
import (
"testing"
"time"
)
func TestGoroutineCountCheck(t *testing.T) {
tests := []struct {
threshold int
wantErr bool
}{
{5000, false},
{5, true},
{6, true},
}
for _, tt := range tests {
// Make goroutines -- to make sure number of go-routines is higher than threshold
if tt.threshold == 5 || tt.threshold == 6 {
for i := 0; i < 6; i++ {
go time.Sleep(5)
}
}
if err := goroutineCountCheck(tt.threshold); (err != nil) != tt.wantErr {
t.Errorf("goroutineCountCheck() error = %v, wantErr %v", err, tt.wantErr)
}
}
}

43
cmd/healthcheck-router.go Normal file
View File

@ -0,0 +1,43 @@
/*
* Minio Cloud Storage, (C) 2018 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cmd
import (
"net/http"
router "github.com/gorilla/mux"
)
const (
healthCheckPath = "/health"
healthCheckLivenessPath = "/live"
healthCheckReadinessPath = "/ready"
healthCheckPathPrefix = minioReservedBucketPath + healthCheckPath
)
// registerHealthCheckRouter - add handler functions for liveness and readiness routes.
func registerHealthCheckRouter(mux *router.Router) {
// Healthcheck router
healthRouter := mux.NewRoute().PathPrefix(healthCheckPathPrefix).Subrouter()
// Liveness handler
healthRouter.Methods(http.MethodGet).Path(healthCheckLivenessPath).HandlerFunc(LivenessCheckHandler)
// Readiness handler
healthRouter.Methods(http.MethodGet).Path(healthCheckReadinessPath).HandlerFunc(ReadinessCheckHandler)
}

View File

@ -73,6 +73,9 @@ func configureServerHandler(endpoints EndpointList) (http.Handler, error) {
// Add Admin router.
registerAdminRouter(mux)
// Add healthcheck router
registerHealthCheckRouter(mux)
// Register web router when its enabled.
if globalIsBrowserEnabled {
if err := registerWebRouter(mux); err != nil {

View File

@ -20,7 +20,7 @@ set -x
_init () {
scheme="http://"
address="$(netstat -nplt 2>/dev/null | awk ' /(.*\/minio)/ { gsub(":::","127.0.0.1:",$4); print $4}')"
resource="/minio/index.html"
resource="/minio/health/live"
start=$(stat -c "%Y" /proc/1)
}
@ -34,11 +34,10 @@ healthcheck_main () {
exit 0
else
# Get the http response code
http_response=$(curl -H "User-Agent: Mozilla" -s -k -o /dev/null -I -w "%{http_code}" \
${scheme}${address}${resource})
http_response=$(curl -s -k -o /dev/null -I -w "%{http_code}" ${scheme}${address}${resource})
# Get the http response body
http_response_body=$(curl -H "User-Agent: Mozilla" -k -s ${scheme}${address}${resource})
http_response_body=$(curl -k -s ${scheme}${address}${resource})
# server returns response 403 and body "SSL required" if non-TLS
# connection is attempted on a TLS-configured server. Change
@ -46,14 +45,11 @@ healthcheck_main () {
if [ "$http_response" = "403" ] && \
[ "$http_response_body" = "SSL required" ]; then
scheme="https://"
http_response=$(curl -H "User-Agent: Mozilla" -s -k -o /dev/null -I -w "%{http_code}" \
${scheme}${address}${resource})
http_response=$(curl -s -k -o /dev/null -I -w "%{http_code}" ${scheme}${address}${resource})
fi
# If http_repsonse is 200 - server is up. When MINIO_BROWSER is
# set to off, curl responds with 404. We assume that the server
# is up
[ "$http_response" = "200" ] || [ "$http_response" = "404" ]
# If http_repsonse is 200 - server is up.
[ "$http_response" = "200" ]
fi
}

View File

@ -0,0 +1,39 @@
## Minio Healthcheck
Minio server exposes two un-authenticated, healthcheck endpoints - liveness probe and readiness probe at `/minio/health/live` and `/minio/health/ready` respectively.
### Liveness probe
This probe is used to identify situations where the server is running but may not behave optimally, i.e. sluggish response or corrupt backend. Such problems can be *only* fixed by a restart.
Internally, Minio liveness probe handler does a ListBuckets call. If successful, the server returns 200 OK, otherwise 503 Service Unavailable.
When liveness probe fails, Kubernetes like platforms restart the container.
Sample configuration in a Kubernetes `yaml` file.
```yaml
livenessProbe:
httpGet:
path: /minio/health/live
port: 9000
initialDelaySeconds: 10
periodSeconds: 20
```
### Readiness probe
This probe is used to identify situations where the server is not ready to accept requests yet. In most cases, such conditions recover in some time.
Internally, Minio readiness probe handler checks for total go-routines. If the number of go-routines is less than 1000 (threshold), the server returns 200 OK, otherwise 503 Service Unavailable.
Platforms like Kubernetes *do not* forward traffic to a pod until its readiness probe is successful.
Sample configuration in a Kubernetes `yaml` file.
```yaml
livenessProbe:
httpGet:
path: /minio/health/ready
port: 9000
initialDelaySeconds: 10
periodSeconds: 20
```