mirror of
https://github.com/minio/minio.git
synced 2025-02-03 01:46:00 -05:00
perform healthchecks before initializing everything fully (#19953)
adds more informative logs that provide details on which erasure set is losing quorum etc.
This commit is contained in:
parent
9ba39d7fad
commit
ee48f9f206
@ -2425,6 +2425,7 @@ const (
|
||||
// HealthOptions carries the caller-supplied options for a Health check
// (it is the opts parameter of erasureServerPools.Health).
type HealthOptions struct {
|
||||
// Maintenance is not read in the visible part of this file.
// NOTE(review): presumably asks whether health would be kept if this
// node were taken down for maintenance - confirm against Health.
Maintenance bool
|
||||
// DeploymentType is not read in the visible part of this file.
// NOTE(review): meaning and valid values to be confirmed.
DeploymentType string
|
||||
// Startup marks a check performed while the server is starting. When set,
// Health logs a missing read/write quorum as "was not established"
// instead of "may be lost".
Startup bool
|
||||
}
|
||||
|
||||
// HealthResult returns the current state of the system, also
|
||||
@ -2449,6 +2450,24 @@ type HealthResult struct {
|
||||
UsingDefaults bool
|
||||
}
|
||||
|
||||
func (hr HealthResult) String() string {
|
||||
var str strings.Builder
|
||||
for i, es := range hr.ESHealth {
|
||||
str.WriteString("(Pool: ")
|
||||
str.WriteString(strconv.Itoa(es.PoolID))
|
||||
str.WriteString(" Set: ")
|
||||
str.WriteString(strconv.Itoa(es.SetID))
|
||||
str.WriteString(" Healthy: ")
|
||||
str.WriteString(strconv.FormatBool(es.Healthy))
|
||||
if i == 0 {
|
||||
str.WriteString(")")
|
||||
} else {
|
||||
str.WriteString("), ")
|
||||
}
|
||||
}
|
||||
return str.String()
|
||||
}
|
||||
|
||||
// Health - returns current status of the object layer health,
|
||||
// provides if write access exists across sets, additionally
|
||||
// can be used to query scenarios if health may be lost
|
||||
@ -2567,17 +2586,29 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea
|
||||
|
||||
healthy := erasureSetUpCount[poolIdx][setIdx].online >= poolWriteQuorums[poolIdx]
|
||||
if !healthy {
|
||||
storageLogIf(logger.SetReqInfo(ctx, reqInfo),
|
||||
fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
|
||||
poolIdx, setIdx, poolWriteQuorums[poolIdx]), logger.FatalKind)
|
||||
if opts.Startup {
|
||||
storageLogIf(logger.SetReqInfo(ctx, reqInfo),
|
||||
fmt.Errorf("Write quorum was not established on pool: %d, set: %d, expected write quorum: %d",
|
||||
poolIdx, setIdx, poolWriteQuorums[poolIdx]), logger.FatalKind)
|
||||
} else {
|
||||
storageLogIf(logger.SetReqInfo(ctx, reqInfo),
|
||||
fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
|
||||
poolIdx, setIdx, poolWriteQuorums[poolIdx]), logger.FatalKind)
|
||||
}
|
||||
}
|
||||
result.Healthy = result.Healthy && healthy
|
||||
|
||||
healthyRead := erasureSetUpCount[poolIdx][setIdx].online >= poolReadQuorums[poolIdx]
|
||||
if !healthyRead {
|
||||
storageLogIf(logger.SetReqInfo(ctx, reqInfo),
|
||||
fmt.Errorf("Read quorum may be lost on pool: %d, set: %d, expected read quorum: %d",
|
||||
poolIdx, setIdx, poolReadQuorums[poolIdx]))
|
||||
if opts.Startup {
|
||||
storageLogIf(logger.SetReqInfo(ctx, reqInfo),
|
||||
fmt.Errorf("Read quorum was not established on pool: %d, set: %d, expected read quorum: %d",
|
||||
poolIdx, setIdx, poolReadQuorums[poolIdx]))
|
||||
} else {
|
||||
storageLogIf(logger.SetReqInfo(ctx, reqInfo),
|
||||
fmt.Errorf("Read quorum may be lost on pool: %d, set: %d, expected read quorum: %d",
|
||||
poolIdx, setIdx, poolReadQuorums[poolIdx]))
|
||||
}
|
||||
}
|
||||
result.HealthyRead = result.HealthyRead && healthyRead
|
||||
}
|
||||
|
@ -740,6 +740,8 @@ func initializeLogRotate(ctx *cli.Context) (io.WriteCloser, error) {
|
||||
|
||||
// serverMain handler called for 'minio server' command.
|
||||
func serverMain(ctx *cli.Context) {
|
||||
r := rand.New(rand.NewSource(time.Now().UnixNano()))
|
||||
|
||||
var warnings []string
|
||||
|
||||
signal.Notify(globalOSSignalCh, os.Interrupt, syscall.SIGTERM, syscall.SIGQUIT)
|
||||
@ -920,6 +922,16 @@ func serverMain(ctx *cli.Context) {
|
||||
globalNodeNamesHex[hex.EncodeToString(nodeNameSum[:])] = struct{}{}
|
||||
}
|
||||
|
||||
bootstrapTrace("waitForQuorum", func() {
|
||||
result := newObject.Health(context.Background(), HealthOptions{Startup: true})
|
||||
for !result.Healthy {
|
||||
d := time.Duration(r.Float64() * float64(time.Second))
|
||||
logger.Info("Waiting for quorum healthcheck to succeed.. possible cause unhealthy sets (%s), retrying in %s", result, d)
|
||||
time.Sleep(d)
|
||||
result = newObject.Health(context.Background(), HealthOptions{})
|
||||
}
|
||||
})
|
||||
|
||||
var err error
|
||||
bootstrapTrace("initServerConfig", func() {
|
||||
if err = initServerConfig(GlobalContext, newObject); err != nil {
|
||||
@ -986,8 +998,6 @@ func serverMain(ctx *cli.Context) {
|
||||
}()
|
||||
|
||||
go func() {
|
||||
r := rand.New(rand.NewSource(time.Now().UnixNano()))
|
||||
|
||||
if !globalDisableFreezeOnBoot {
|
||||
defer bootstrapTrace("unfreezeServices", unfreezeServices)
|
||||
t := time.AfterFunc(5*time.Minute, func() {
|
||||
|
Loading…
x
Reference in New Issue
Block a user