perform healthchecks before initializing everything fully (#19953)

adds more informative logs that provide details on which
erasure set is losing quorum etc.
This commit is contained in:
Harshavardhana 2024-06-19 07:33:40 -07:00 committed by GitHub
parent 9ba39d7fad
commit ee48f9f206
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 49 additions and 8 deletions

View File

@ -2425,6 +2425,7 @@ const (
type HealthOptions struct { type HealthOptions struct {
Maintenance bool Maintenance bool
DeploymentType string DeploymentType string
Startup bool
} }
// HealthResult returns the current state of the system, also // HealthResult returns the current state of the system, also
@ -2449,6 +2450,24 @@ type HealthResult struct {
UsingDefaults bool UsingDefaults bool
} }
// String renders the per-erasure-set entries of hr.ESHealth as a single
// human-readable line, for example:
//
//	(Pool: 0 Set: 0 Healthy: true), (Pool: 0 Set: 1 Healthy: false)
//
// Entries are separated by ", " with no leading or trailing separator.
// An empty string is returned when hr.ESHealth has no entries.
func (hr HealthResult) String() string {
	var str strings.Builder
	for i, es := range hr.ESHealth {
		// Emit the separator before every entry except the first. Keying the
		// separator on "i == 0" after the entry (as done previously) glued the
		// first two entries together and left a dangling ", " at the end.
		if i > 0 {
			str.WriteString(", ")
		}
		str.WriteString("(Pool: ")
		str.WriteString(strconv.Itoa(es.PoolID))
		str.WriteString(" Set: ")
		str.WriteString(strconv.Itoa(es.SetID))
		str.WriteString(" Healthy: ")
		str.WriteString(strconv.FormatBool(es.Healthy))
		str.WriteString(")")
	}
	return str.String()
}
// Health - returns current status of the object layer health, // Health - returns current status of the object layer health,
// provides if write access exists across sets, additionally // provides if write access exists across sets, additionally
// can be used to query scenarios if health may be lost // can be used to query scenarios if health may be lost
@ -2567,18 +2586,30 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea
healthy := erasureSetUpCount[poolIdx][setIdx].online >= poolWriteQuorums[poolIdx] healthy := erasureSetUpCount[poolIdx][setIdx].online >= poolWriteQuorums[poolIdx]
if !healthy { if !healthy {
if opts.Startup {
storageLogIf(logger.SetReqInfo(ctx, reqInfo),
fmt.Errorf("Write quorum was not established on pool: %d, set: %d, expected write quorum: %d",
poolIdx, setIdx, poolWriteQuorums[poolIdx]), logger.FatalKind)
} else {
storageLogIf(logger.SetReqInfo(ctx, reqInfo), storageLogIf(logger.SetReqInfo(ctx, reqInfo),
fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d", fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
poolIdx, setIdx, poolWriteQuorums[poolIdx]), logger.FatalKind) poolIdx, setIdx, poolWriteQuorums[poolIdx]), logger.FatalKind)
} }
}
result.Healthy = result.Healthy && healthy result.Healthy = result.Healthy && healthy
healthyRead := erasureSetUpCount[poolIdx][setIdx].online >= poolReadQuorums[poolIdx] healthyRead := erasureSetUpCount[poolIdx][setIdx].online >= poolReadQuorums[poolIdx]
if !healthyRead { if !healthyRead {
if opts.Startup {
storageLogIf(logger.SetReqInfo(ctx, reqInfo),
fmt.Errorf("Read quorum was not established on pool: %d, set: %d, expected read quorum: %d",
poolIdx, setIdx, poolReadQuorums[poolIdx]))
} else {
storageLogIf(logger.SetReqInfo(ctx, reqInfo), storageLogIf(logger.SetReqInfo(ctx, reqInfo),
fmt.Errorf("Read quorum may be lost on pool: %d, set: %d, expected read quorum: %d", fmt.Errorf("Read quorum may be lost on pool: %d, set: %d, expected read quorum: %d",
poolIdx, setIdx, poolReadQuorums[poolIdx])) poolIdx, setIdx, poolReadQuorums[poolIdx]))
} }
}
result.HealthyRead = result.HealthyRead && healthyRead result.HealthyRead = result.HealthyRead && healthyRead
} }
} }

View File

@ -740,6 +740,8 @@ func initializeLogRotate(ctx *cli.Context) (io.WriteCloser, error) {
// serverMain handler called for 'minio server' command. // serverMain handler called for 'minio server' command.
func serverMain(ctx *cli.Context) { func serverMain(ctx *cli.Context) {
r := rand.New(rand.NewSource(time.Now().UnixNano()))
var warnings []string var warnings []string
signal.Notify(globalOSSignalCh, os.Interrupt, syscall.SIGTERM, syscall.SIGQUIT) signal.Notify(globalOSSignalCh, os.Interrupt, syscall.SIGTERM, syscall.SIGQUIT)
@ -920,6 +922,16 @@ func serverMain(ctx *cli.Context) {
globalNodeNamesHex[hex.EncodeToString(nodeNameSum[:])] = struct{}{} globalNodeNamesHex[hex.EncodeToString(nodeNameSum[:])] = struct{}{}
} }
bootstrapTrace("waitForQuorum", func() {
result := newObject.Health(context.Background(), HealthOptions{Startup: true})
for !result.Healthy {
d := time.Duration(r.Float64() * float64(time.Second))
logger.Info("Waiting for quorum healthcheck to succeed.. possible cause unhealthy sets (%s), retrying in %s", result, d)
time.Sleep(d)
result = newObject.Health(context.Background(), HealthOptions{})
}
})
var err error var err error
bootstrapTrace("initServerConfig", func() { bootstrapTrace("initServerConfig", func() {
if err = initServerConfig(GlobalContext, newObject); err != nil { if err = initServerConfig(GlobalContext, newObject); err != nil {
@ -986,8 +998,6 @@ func serverMain(ctx *cli.Context) {
}() }()
go func() { go func() {
r := rand.New(rand.NewSource(time.Now().UnixNano()))
if !globalDisableFreezeOnBoot { if !globalDisableFreezeOnBoot {
defer bootstrapTrace("unfreezeServices", unfreezeServices) defer bootstrapTrace("unfreezeServices", unfreezeServices)
t := time.AfterFunc(5*time.Minute, func() { t := time.AfterFunc(5*time.Minute, func() {