rest: healthcheck should not update failure metrics (#12458)

Otherwise, we can see high numbers of networking issues when a node is
down.
This commit is contained in:
Anis Elleuch 2021-06-08 22:09:26 +01:00 committed by GitHub
parent 9a2102f5ed
commit 6c8be64cdb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 14 additions and 3 deletions

View File

@ -161,6 +161,7 @@ func newlockRESTClient(endpoint Endpoint) *lockRESTClient {
// Use a separate client to avoid recursive calls.
healthClient := rest.NewClient(serverURL, globalInternodeTransport, newAuthToken)
healthClient.ExpectTimeouts = true
healthClient.NoMetrics = true
restClient.HealthCheckFn = func() bool {
ctx, cancel := context.WithTimeout(context.Background(), restClient.HealthCheckTimeout)
defer cancel()

View File

@ -919,6 +919,7 @@ func newPeerRESTClient(peer *xnet.Host) *peerRESTClient {
// Use a separate client to avoid recursive calls.
healthClient := rest.NewClient(serverURL, globalInternodeTransport, newAuthToken)
healthClient.ExpectTimeouts = true
healthClient.NoMetrics = true
// Construct a new health function.
restClient.HealthCheckFn = func() bool {

View File

@ -704,6 +704,7 @@ func newStorageRESTClient(endpoint Endpoint, healthcheck bool) *storageRESTClien
// Use a separate client to avoid recursive calls.
healthClient := rest.NewClient(serverURL, globalInternodeTransport, newAuthToken)
healthClient.ExpectTimeouts = true
healthClient.NoMetrics = true
restClient.HealthCheckFn = func() bool {
ctx, cancel := context.WithTimeout(context.Background(), restClient.HealthCheckTimeout)
defer cancel()

View File

@ -99,6 +99,9 @@ type Client struct {
// This will not mark the client offline in these cases.
ExpectTimeouts bool
// Avoid metrics update if set to true
NoMetrics bool
httpClient *http.Client
url *url.URL
newAuthToken func(audience string) string
@ -136,8 +139,10 @@ func (c *Client) Call(ctx context.Context, method string, values url.Values, bod
}
resp, err := c.httpClient.Do(req)
if err != nil {
if c.HealthCheckFn != nil && xnet.IsNetworkOrHostDown(err, c.ExpectTimeouts) {
atomic.AddUint64(&networkErrsCounter, 1)
if xnet.IsNetworkOrHostDown(err, c.ExpectTimeouts) {
if !c.NoMetrics {
atomic.AddUint64(&networkErrsCounter, 1)
}
if c.MarkOffline() {
logger.LogIf(ctx, fmt.Errorf("Marking %s temporary offline; caused by %w", c.url.String(), err))
}
@ -169,7 +174,10 @@ func (c *Client) Call(ctx context.Context, method string, values url.Values, bod
// Limit the ReadAll(), just in case, because of a bug, the server responds with large data.
b, err := ioutil.ReadAll(io.LimitReader(resp.Body, c.MaxErrResponseSize))
if err != nil {
if c.HealthCheckFn != nil && xnet.IsNetworkOrHostDown(err, c.ExpectTimeouts) {
if xnet.IsNetworkOrHostDown(err, c.ExpectTimeouts) {
if !c.NoMetrics {
atomic.AddUint64(&networkErrsCounter, 1)
}
if c.MarkOffline() {
logger.LogIf(ctx, fmt.Errorf("Marking %s temporary offline; caused by %w", c.url.String(), err))
}