From e0086c1be7deb8d2e2cb9db051270883adbc4af5 Mon Sep 17 00:00:00 2001 From: Harshavardhana Date: Thu, 5 Jan 2023 02:32:43 -0800 Subject: [PATCH] reduce startup delays on kubernetes (#16356) --- cmd/common-main.go | 3 ++- cmd/endpoint.go | 45 +++++++++++++++++++++++++++++++++++++++++++-- cmd/server-main.go | 8 ++++++-- 3 files changed, 51 insertions(+), 5 deletions(-) diff --git a/cmd/common-main.go b/cmd/common-main.go index b0cec4205..9ec492f48 100644 --- a/cmd/common-main.go +++ b/cmd/common-main.go @@ -117,7 +117,8 @@ func init() { go func() { var t *time.Ticker if containers { - t = time.NewTicker(1 * time.Minute) + // k8s DNS TTL is 30s (Attempt a refresh only after) + t = time.NewTicker(30 * time.Second) } else { t = time.NewTicker(10 * time.Minute) } diff --git a/cmd/endpoint.go b/cmd/endpoint.go index 4ed51dd33..e5cbe262d 100644 --- a/cmd/endpoint.go +++ b/cmd/endpoint.go @@ -417,6 +417,27 @@ func (endpoints Endpoints) atleastOneEndpointLocal() bool { return false } +func hostResolveToLocalhost(endpoint Endpoint) bool { + hostIPs, err := getHostIP(endpoint.Hostname()) + if err != nil { + // Log the message to console about the host resolving + reqInfo := (&logger.ReqInfo{}).AppendTags( + "host", + endpoint.Hostname(), + ) + ctx := logger.SetReqInfo(GlobalContext, reqInfo) + logger.LogOnceIf(ctx, err, endpoint.Hostname(), logger.Application) + return false + } + var loopback int + for _, hostIP := range hostIPs.ToSlice() { + if net.ParseIP(hostIP).IsLoopback() { + loopback++ + } + } + return loopback == len(hostIPs) +} + // UpdateIsLocal - resolves the host and discovers the local host. 
func (endpoints Endpoints) UpdateIsLocal(foundPrevLocal bool) error { orchestrated := IsDocker() || IsKubernetes() @@ -452,6 +473,26 @@ func (endpoints Endpoints) UpdateIsLocal(foundPrevLocal bool) error { endpoints[i].Hostname(), ) + if orchestrated && hostResolveToLocalhost(endpoints[i]) { + // time elapsed + timeElapsed := time.Since(startTime) + // log error only if more than a second has elapsed + if timeElapsed > time.Second { + reqInfo.AppendTags("elapsedTime", + humanize.RelTime(startTime, + startTime.Add(timeElapsed), + "elapsed", + "", + )) + ctx := logger.SetReqInfo(GlobalContext, + reqInfo) + logger.LogOnceIf(ctx, fmt.Errorf("%s resolves to localhost in a containerized deployment, waiting for it to resolve to a valid IP", + endpoints[i].Hostname()), endpoints[i].Hostname(), logger.Application) + } + + continue + } + // return err if not Docker or Kubernetes // We use IsDocker() to check for Docker environment // We use IsKubernetes() to check for Kubernetes environment @@ -465,7 +506,7 @@ func (endpoints Endpoints) UpdateIsLocal(foundPrevLocal bool) error { if err != nil { // time elapsed timeElapsed := time.Since(startTime) - // log error only if more than 1s elapsed + // log error only if more than a second has elapsed if timeElapsed > time.Second { reqInfo.AppendTags("elapsedTime", humanize.RelTime(startTime, @@ -475,7 +516,7 @@ func (endpoints Endpoints) UpdateIsLocal(foundPrevLocal bool) error { )) ctx := logger.SetReqInfo(GlobalContext, reqInfo) - logger.LogIf(ctx, err, logger.Application) + logger.LogOnceIf(ctx, err, endpoints[i].Hostname(), logger.Application) } } else { resolvedList[i] = true diff --git a/cmd/server-main.go b/cmd/server-main.go index 6c0b138f2..98c77c302 100644 --- a/cmd/server-main.go +++ b/cmd/server-main.go @@ -399,10 +399,14 @@ func initServer(ctx context.Context, newObject ObjectLayer) error { lkctx, err := txnLk.GetLock(ctx, lockTimeout) if err != nil { logger.Info("Waiting for all MinIO sub-systems to be initialized.. 
trying to acquire lock") - waitDuration := time.Duration(r.Float64() * 2 * float64(time.Second)) + waitDuration := time.Duration(r.Float64() * 5 * float64(time.Second)) bootstrapTrace(fmt.Sprintf("lock not available. error: %v. sleeping for %v before retry", err, waitDuration)) - // Sleep 0 -> 2 seconds to average 1 second retry interval. + // Sleep 0 -> 5 seconds, provide a higher range such that sleeps() + // and retries for lock are more spread out, needed because orchestrated + // systems take 30s minimum to respond to DNS resolvers. + // + // Do not change this value. time.Sleep(waitDuration) continue }