diff --git a/cmd/erasure-server-pool.go b/cmd/erasure-server-pool.go
index 386d03256..03230a0a5 100644
--- a/cmd/erasure-server-pool.go
+++ b/cmd/erasure-server-pool.go
@@ -2425,6 +2425,7 @@ const (
 type HealthOptions struct {
 	Maintenance    bool
 	DeploymentType string
+	Startup        bool
 }
 
 // HealthResult returns the current state of the system, also
@@ -2449,6 +2450,24 @@ type HealthResult struct {
 	UsingDefaults bool
 }
 
+func (hr HealthResult) String() string {
+	var str strings.Builder
+	for i, es := range hr.ESHealth {
+		str.WriteString("(Pool: ")
+		str.WriteString(strconv.Itoa(es.PoolID))
+		str.WriteString(" Set: ")
+		str.WriteString(strconv.Itoa(es.SetID))
+		str.WriteString(" Healthy: ")
+		str.WriteString(strconv.FormatBool(es.Healthy))
+		if i == 0 {
+			str.WriteString(")")
+		} else {
+			str.WriteString("), ")
+		}
+	}
+	return str.String()
+}
+
 // Health - returns current status of the object layer health,
 // provides if write access exists across sets, additionally
 // can be used to query scenarios if health may be lost
@@ -2567,17 +2586,29 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea
 
 		healthy := erasureSetUpCount[poolIdx][setIdx].online >= poolWriteQuorums[poolIdx]
 		if !healthy {
-			storageLogIf(logger.SetReqInfo(ctx, reqInfo),
-				fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
-					poolIdx, setIdx, poolWriteQuorums[poolIdx]), logger.FatalKind)
+			if opts.Startup {
+				storageLogIf(logger.SetReqInfo(ctx, reqInfo),
+					fmt.Errorf("Write quorum was not established on pool: %d, set: %d, expected write quorum: %d",
+						poolIdx, setIdx, poolWriteQuorums[poolIdx]), logger.FatalKind)
+			} else {
+				storageLogIf(logger.SetReqInfo(ctx, reqInfo),
+					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
+						poolIdx, setIdx, poolWriteQuorums[poolIdx]), logger.FatalKind)
+			}
 		}
 		result.Healthy = result.Healthy && healthy
 
 		healthyRead := erasureSetUpCount[poolIdx][setIdx].online >= poolReadQuorums[poolIdx]
 		if !healthyRead {
-			storageLogIf(logger.SetReqInfo(ctx, reqInfo),
-				fmt.Errorf("Read quorum may be lost on pool: %d, set: %d, expected read quorum: %d",
-					poolIdx, setIdx, poolReadQuorums[poolIdx]))
+			if opts.Startup {
+				storageLogIf(logger.SetReqInfo(ctx, reqInfo),
+					fmt.Errorf("Read quorum was not established on pool: %d, set: %d, expected read quorum: %d",
+						poolIdx, setIdx, poolReadQuorums[poolIdx]))
+			} else {
+				storageLogIf(logger.SetReqInfo(ctx, reqInfo),
+					fmt.Errorf("Read quorum may be lost on pool: %d, set: %d, expected read quorum: %d",
+						poolIdx, setIdx, poolReadQuorums[poolIdx]))
+			}
 		}
 		result.HealthyRead = result.HealthyRead && healthyRead
 	}
diff --git a/cmd/server-main.go b/cmd/server-main.go
index 938d6176e..b8c5808f7 100644
--- a/cmd/server-main.go
+++ b/cmd/server-main.go
@@ -740,6 +740,8 @@ func initializeLogRotate(ctx *cli.Context) (io.WriteCloser, error) {
 
 // serverMain handler called for 'minio server' command.
 func serverMain(ctx *cli.Context) {
+	r := rand.New(rand.NewSource(time.Now().UnixNano()))
+
 	var warnings []string
 
 	signal.Notify(globalOSSignalCh, os.Interrupt, syscall.SIGTERM, syscall.SIGQUIT)
@@ -920,6 +922,16 @@ func serverMain(ctx *cli.Context) {
 		globalNodeNamesHex[hex.EncodeToString(nodeNameSum[:])] = struct{}{}
 	}
 
+	bootstrapTrace("waitForQuorum", func() {
+		result := newObject.Health(context.Background(), HealthOptions{Startup: true})
+		for !result.Healthy {
+			d := time.Duration(r.Float64() * float64(time.Second))
+			logger.Info("Waiting for quorum healthcheck to succeed.. possible cause unhealthy sets (%s), retrying in %s", result, d)
+			time.Sleep(d)
+			result = newObject.Health(context.Background(), HealthOptions{})
+		}
+	})
+
 	var err error
 	bootstrapTrace("initServerConfig", func() {
 		if err = initServerConfig(GlobalContext, newObject); err != nil {
@@ -986,8 +998,6 @@ func serverMain(ctx *cli.Context) {
 	}()
 
 	go func() {
-		r := rand.New(rand.NewSource(time.Now().UnixNano()))
-
 		if !globalDisableFreezeOnBoot {
 			defer bootstrapTrace("unfreezeServices", unfreezeServices)
 			t := time.AfterFunc(5*time.Minute, func() {
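
Note on the waitForQuorum hunk above: the startup loop polls Health and sleeps a random sub-second duration between attempts, so retries from nodes that boot at the same time do not all fire in lockstep. Below is a minimal, self-contained sketch of that jittered-retry pattern; the waitUntilHealthy helper, the checkHealthy closure, and the one-second jitter cap are illustrative assumptions for this sketch, not names from the patch.

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// waitUntilHealthy polls checkHealthy until it reports true, sleeping a
// random duration of up to maxJitter between attempts. The randomized
// sleep spreads retries out when many nodes start simultaneously.
func waitUntilHealthy(checkHealthy func() bool, maxJitter time.Duration) {
	r := rand.New(rand.NewSource(time.Now().UnixNano()))
	for !checkHealthy() {
		d := time.Duration(r.Float64() * float64(maxJitter))
		fmt.Printf("not healthy yet, retrying in %s\n", d)
		time.Sleep(d)
	}
}

func main() {
	attempts := 0
	waitUntilHealthy(func() bool {
		attempts++
		return attempts >= 3 // pretend quorum is reached on the third check
	}, time.Second)
	fmt.Println("quorum established")
}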