Retry name lookup for kubernetes and docker swarm environment (#4800)

Wait for remote hosts to resolve instead of failing on first host
resolution error, when running in Kubernetes or Docker environment.

Note that

- Waiting is based on an exponential back-off mechanism
- When run as a plain binary (outside Docker or Kubernetes), the server
  still fails immediately if a remote host is not resolvable

This is needed because in orchestration platforms like Kubernetes, remote
hosts are started sequentially, so not all hosts are up initially,
though they are expected to come up within a short time frame.
It is difficult to identify a cap on the waiting time due to the
non-deterministic nature of infrastructure platforms, so the server waits
indefinitely for the hosts to come up, while logging the error messages to
the console.

Fixes: https://github.com/minio/minio/issues/4669
This commit is contained in:
Nitish Tiwari 2017-08-14 01:33:06 +05:30 committed by Harshavardhana
parent 53f84d6084
commit d4b107adf4
2 changed files with 38 additions and 42 deletions

View File

@ -22,41 +22,6 @@ if [ "${1}" != "minio" ]; then
fi
fi
# Wait for all the hosts to come online and have
# their DNS entries populated properly.
#
# Arguments: the complete command line handed to the entrypoint. Only
# arguments that start with http:// or https:// are treated as remote
# hosts; every other argument is ignored.
#
# Returns 0 immediately when no such argument is present. Otherwise it
# blocks, retrying once per second with no upper bound, until a single
# ping to every host succeeds.
docker_wait_hosts() {
    num_hosts=0
    # Count number of hosts in arguments.
    for url in "$@"; do
        case "$url" in
            http://*|https://*) num_hosts=$((num_hosts+1)) ;;
        esac
    done

    # Nothing to wait for (e.g. single node "minio server /data").
    [ "$num_hosts" -gt 0 ] || return 0

    printf '%s' "Waiting for all hosts to resolve..."
    while true; do
        x=0
        for url in "$@"; do
            case "$url" in
                http://*|https://*) ;;
                *) continue ;;
            esac

            # Extract the domain: drop the scheme, then everything
            # from the first "/" onwards.
            host=${url#*://}
            host=${host%%/*}

            # Drop a trailing ":port" as well. ping only understands a
            # bare host name, so leaving the port in place would make
            # the lookup fail forever. The suffix is stripped only when
            # a colon is present and everything after the last colon is
            # numeric (or empty).
            case "${host##*:}" in
                "$host"|*[!0-9]*) ;;
                *) host=${host%:*} ;;
            esac

            printf .
            if ping -c 1 "$host" >/dev/null 2>&1; then
                x=$((x+1))
            else
                echo "Failed to lookup $host"
            fi
        done

        # Provided hosts same as successful hosts, should break out.
        [ "$x" -eq "$num_hosts" ] && break

        echo "Failed to resolve hosts.. retrying after 1 second."
        sleep 1
    done
    echo "All hosts are resolving proceeding to initialize Minio."
}
## Look for docker secrets in default documented location.
docker_secrets_env() {
local MINIO_ACCESS_KEY_FILE="/run/secrets/access_key"
@ -75,7 +40,4 @@ docker_secrets_env() {
## Set access env from secrets if necessary.
docker_secrets_env
## Wait for all the hosts to come online.
## The full entrypoint command line is passed through unchanged.
docker_wait_hosts "$@"
## Replace this shell process with the requested command.
exec "$@"

View File

@ -26,7 +26,9 @@ import (
"strconv"
"strings"
"syscall"
"time"
humanize "github.com/dustin/go-humanize"
"github.com/minio/minio-go/pkg/set"
)
@ -65,12 +67,44 @@ func mustGetLocalIP4() (ipList set.StringSet) {
// getHostIP4 returns IPv4 address of given host.
func getHostIP4(host string) (ipList set.StringSet, err error) {
ipList = set.NewStringSet()
ips, err := net.LookupIP(host)
if err != nil {
return ipList, err
var ips []net.IP
if ips, err = net.LookupIP(host); err != nil {
// return err if not Docker or Kubernetes
// We use IsDocker() method to check for Docker Swarm environment
// as there is no reliable way to clearly identify Swarm from
// Docker environment.
if !IsDocker() && !IsKubernetes() {
return ipList, err
}
// channel to indicate completion of host resolution
doneCh := make(chan struct{})
// Indicate retry routine to exit cleanly, upon this function return.
defer close(doneCh)
// Mark the starting time
startTime := time.Now()
// wait for hosts to resolve in exponentialbackoff manner
for _ = range newRetryTimerSimple(doneCh) {
// Retry infinitely on Kubernetes and Docker swarm.
// This is needed as the remote hosts are sometime
// not available immediately.
if ips, err = net.LookupIP(host); err == nil {
break
}
// time elapsed
timeElapsed := time.Since(startTime)
// log error only if more than 1s elapsed
if timeElapsed > time.Second {
// log the message to console about the host not being
// resolveable.
errorIf(err, "Unable to resolve host %s (%s)", host,
humanize.RelTime(startTime, startTime.Add(timeElapsed), "elapsed", ""))
}
}
}
ipList = set.NewStringSet()
for _, ip := range ips {
if ip.To4() != nil {
ipList.Add(ip.String())