reduce startup delays on kubernetes (#16356)

This commit is contained in:
Harshavardhana 2023-01-05 02:32:43 -08:00 committed by GitHub
parent b29e159604
commit e0086c1be7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 51 additions and 5 deletions

View File

@ -117,7 +117,8 @@ func init() {
go func() {
var t *time.Ticker
if containers {
t = time.NewTicker(1 * time.Minute)
// k8s DNS TTL is 30s (attempt a refresh only after it has expired)
t = time.NewTicker(30 * time.Second)
} else {
t = time.NewTicker(10 * time.Minute)
}

View File

@ -417,6 +417,27 @@ func (endpoints Endpoints) atleastOneEndpointLocal() bool {
return false
}
// hostResolveToLocalhost reports whether every IP address returned by
// getHostIP for the endpoint's hostname is a loopback address.
//
// When the lookup fails, the error is handed to logger.LogOnceIf (tagged
// with the hostname) and false is returned. An empty result set yields
// true, because no non-loopback address was encountered.
func hostResolveToLocalhost(endpoint Endpoint) bool {
	host := endpoint.Hostname()

	resolved, err := getHostIP(host)
	if err != nil {
		// Surface the resolution failure on the console, tagged with the host.
		tags := (&logger.ReqInfo{}).AppendTags("host", host)
		logCtx := logger.SetReqInfo(GlobalContext, tags)
		logger.LogOnceIf(logCtx, err, host, logger.Application)
		return false
	}

	// A single non-loopback address is enough to answer "no".
	for _, addr := range resolved.ToSlice() {
		if !net.ParseIP(addr).IsLoopback() {
			return false
		}
	}
	return true
}
// UpdateIsLocal - resolves the host and discovers the local host.
func (endpoints Endpoints) UpdateIsLocal(foundPrevLocal bool) error {
orchestrated := IsDocker() || IsKubernetes()
@ -452,6 +473,26 @@ func (endpoints Endpoints) UpdateIsLocal(foundPrevLocal bool) error {
endpoints[i].Hostname(),
)
if orchestrated && hostResolveToLocalhost(endpoints[i]) {
// time elapsed
timeElapsed := time.Since(startTime)
// log error only if more than a second has elapsed
if timeElapsed > time.Second {
reqInfo.AppendTags("elapsedTime",
humanize.RelTime(startTime,
startTime.Add(timeElapsed),
"elapsed",
"",
))
ctx := logger.SetReqInfo(GlobalContext,
reqInfo)
logger.LogOnceIf(ctx, fmt.Errorf("%s resolves to localhost in a containerized deployment, waiting for it to resolve to a valid IP",
endpoints[i].Hostname()), endpoints[i].Hostname(), logger.Application)
}
continue
}
// return err if not Docker or Kubernetes
// We use IsDocker() to check for Docker environment
// We use IsKubernetes() to check for Kubernetes environment
@ -465,7 +506,7 @@ func (endpoints Endpoints) UpdateIsLocal(foundPrevLocal bool) error {
if err != nil {
// time elapsed
timeElapsed := time.Since(startTime)
// log error only if more than 1s elapsed
// log error only if more than a second has elapsed
if timeElapsed > time.Second {
reqInfo.AppendTags("elapsedTime",
humanize.RelTime(startTime,
@ -475,7 +516,7 @@ func (endpoints Endpoints) UpdateIsLocal(foundPrevLocal bool) error {
))
ctx := logger.SetReqInfo(GlobalContext,
reqInfo)
logger.LogIf(ctx, err, logger.Application)
logger.LogOnceIf(ctx, err, endpoints[i].Hostname(), logger.Application)
}
} else {
resolvedList[i] = true

View File

@ -399,10 +399,14 @@ func initServer(ctx context.Context, newObject ObjectLayer) error {
lkctx, err := txnLk.GetLock(ctx, lockTimeout)
if err != nil {
logger.Info("Waiting for all MinIO sub-systems to be initialized.. trying to acquire lock")
waitDuration := time.Duration(r.Float64() * 2 * float64(time.Second))
waitDuration := time.Duration(r.Float64() * 5 * float64(time.Second))
bootstrapTrace(fmt.Sprintf("lock not available. error: %v. sleeping for %v before retry", err, waitDuration))
// Sleep 0 -> 2 seconds to average 1 second retry interval.
// Sleep 0 -> 5 seconds, provide a higher range such that sleeps()
// and retries for lock are more spread out, needed since orchestrated
// systems take 30s minimum to respond to DNS resolvers.
//
// Do not change this value.
time.Sleep(waitDuration)
continue
}