mirror of
https://github.com/minio/minio.git
synced 2025-02-03 09:55:59 -05:00
perform healthchecks before initializing everything fully (#19953)
adds more informative logs that provide details on which erasure set is losing quorum etc.
This commit is contained in:
parent
9ba39d7fad
commit
ee48f9f206
@ -2425,6 +2425,7 @@ const (
|
|||||||
// HealthOptions carries the caller-supplied flags for
// erasureServerPools.Health.
// NOTE(review): how Maintenance and DeploymentType are consumed is not
// visible in this diff - confirm against the full Health implementation.
type HealthOptions struct {
|
type HealthOptions struct {
|
||||||
Maintenance bool
|
Maintenance bool
|
||||||
DeploymentType string
|
DeploymentType string
|
||||||
|
// Startup is set by serverMain for the initial waitForQuorum probe; when
// true, Health logs "quorum was not established" instead of the
// "quorum may be lost" wording.
Startup bool
|
||||||
}
|
}
|
||||||
|
|
||||||
// HealthResult returns the current state of the system, also
|
// HealthResult returns the current state of the system, also
|
||||||
@ -2449,6 +2450,24 @@ type HealthResult struct {
|
|||||||
UsingDefaults bool
|
UsingDefaults bool
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func (hr HealthResult) String() string {
|
||||||
|
var str strings.Builder
|
||||||
|
for i, es := range hr.ESHealth {
|
||||||
|
str.WriteString("(Pool: ")
|
||||||
|
str.WriteString(strconv.Itoa(es.PoolID))
|
||||||
|
str.WriteString(" Set: ")
|
||||||
|
str.WriteString(strconv.Itoa(es.SetID))
|
||||||
|
str.WriteString(" Healthy: ")
|
||||||
|
str.WriteString(strconv.FormatBool(es.Healthy))
|
||||||
|
if i == 0 {
|
||||||
|
str.WriteString(")")
|
||||||
|
} else {
|
||||||
|
str.WriteString("), ")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return str.String()
|
||||||
|
}
|
||||||
|
|
||||||
// Health - returns current status of the object layer health,
|
// Health - returns current status of the object layer health,
|
||||||
// provides if write access exists across sets, additionally
|
// provides if write access exists across sets, additionally
|
||||||
// can be used to query scenarios if health may be lost
|
// can be used to query scenarios if health may be lost
|
||||||
@ -2567,18 +2586,30 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea
|
|||||||
|
|
||||||
healthy := erasureSetUpCount[poolIdx][setIdx].online >= poolWriteQuorums[poolIdx]
|
healthy := erasureSetUpCount[poolIdx][setIdx].online >= poolWriteQuorums[poolIdx]
|
||||||
if !healthy {
|
if !healthy {
|
||||||
|
if opts.Startup {
|
||||||
|
storageLogIf(logger.SetReqInfo(ctx, reqInfo),
|
||||||
|
fmt.Errorf("Write quorum was not established on pool: %d, set: %d, expected write quorum: %d",
|
||||||
|
poolIdx, setIdx, poolWriteQuorums[poolIdx]), logger.FatalKind)
|
||||||
|
} else {
|
||||||
storageLogIf(logger.SetReqInfo(ctx, reqInfo),
|
storageLogIf(logger.SetReqInfo(ctx, reqInfo),
|
||||||
fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
|
fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
|
||||||
poolIdx, setIdx, poolWriteQuorums[poolIdx]), logger.FatalKind)
|
poolIdx, setIdx, poolWriteQuorums[poolIdx]), logger.FatalKind)
|
||||||
}
|
}
|
||||||
|
}
|
||||||
result.Healthy = result.Healthy && healthy
|
result.Healthy = result.Healthy && healthy
|
||||||
|
|
||||||
healthyRead := erasureSetUpCount[poolIdx][setIdx].online >= poolReadQuorums[poolIdx]
|
healthyRead := erasureSetUpCount[poolIdx][setIdx].online >= poolReadQuorums[poolIdx]
|
||||||
if !healthyRead {
|
if !healthyRead {
|
||||||
|
if opts.Startup {
|
||||||
|
storageLogIf(logger.SetReqInfo(ctx, reqInfo),
|
||||||
|
fmt.Errorf("Read quorum was not established on pool: %d, set: %d, expected read quorum: %d",
|
||||||
|
poolIdx, setIdx, poolReadQuorums[poolIdx]))
|
||||||
|
} else {
|
||||||
storageLogIf(logger.SetReqInfo(ctx, reqInfo),
|
storageLogIf(logger.SetReqInfo(ctx, reqInfo),
|
||||||
fmt.Errorf("Read quorum may be lost on pool: %d, set: %d, expected read quorum: %d",
|
fmt.Errorf("Read quorum may be lost on pool: %d, set: %d, expected read quorum: %d",
|
||||||
poolIdx, setIdx, poolReadQuorums[poolIdx]))
|
poolIdx, setIdx, poolReadQuorums[poolIdx]))
|
||||||
}
|
}
|
||||||
|
}
|
||||||
result.HealthyRead = result.HealthyRead && healthyRead
|
result.HealthyRead = result.HealthyRead && healthyRead
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -740,6 +740,8 @@ func initializeLogRotate(ctx *cli.Context) (io.WriteCloser, error) {
|
|||||||
|
|
||||||
// serverMain handler called for 'minio server' command.
|
// serverMain handler called for 'minio server' command.
|
||||||
func serverMain(ctx *cli.Context) {
|
func serverMain(ctx *cli.Context) {
|
||||||
|
r := rand.New(rand.NewSource(time.Now().UnixNano()))
|
||||||
|
|
||||||
var warnings []string
|
var warnings []string
|
||||||
|
|
||||||
signal.Notify(globalOSSignalCh, os.Interrupt, syscall.SIGTERM, syscall.SIGQUIT)
|
signal.Notify(globalOSSignalCh, os.Interrupt, syscall.SIGTERM, syscall.SIGQUIT)
|
||||||
@ -920,6 +922,16 @@ func serverMain(ctx *cli.Context) {
|
|||||||
globalNodeNamesHex[hex.EncodeToString(nodeNameSum[:])] = struct{}{}
|
globalNodeNamesHex[hex.EncodeToString(nodeNameSum[:])] = struct{}{}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bootstrapTrace("waitForQuorum", func() {
|
||||||
|
result := newObject.Health(context.Background(), HealthOptions{Startup: true})
|
||||||
|
for !result.Healthy {
|
||||||
|
d := time.Duration(r.Float64() * float64(time.Second))
|
||||||
|
logger.Info("Waiting for quorum healthcheck to succeed.. possible cause unhealthy sets (%s), retrying in %s", result, d)
|
||||||
|
time.Sleep(d)
|
||||||
|
result = newObject.Health(context.Background(), HealthOptions{})
|
||||||
|
}
|
||||||
|
})
|
||||||
|
|
||||||
var err error
|
var err error
|
||||||
bootstrapTrace("initServerConfig", func() {
|
bootstrapTrace("initServerConfig", func() {
|
||||||
if err = initServerConfig(GlobalContext, newObject); err != nil {
|
if err = initServerConfig(GlobalContext, newObject); err != nil {
|
||||||
@ -986,8 +998,6 @@ func serverMain(ctx *cli.Context) {
|
|||||||
}()
|
}()
|
||||||
|
|
||||||
go func() {
|
go func() {
|
||||||
r := rand.New(rand.NewSource(time.Now().UnixNano()))
|
|
||||||
|
|
||||||
if !globalDisableFreezeOnBoot {
|
if !globalDisableFreezeOnBoot {
|
||||||
defer bootstrapTrace("unfreezeServices", unfreezeServices)
|
defer bootstrapTrace("unfreezeServices", unfreezeServices)
|
||||||
t := time.AfterFunc(5*time.Minute, func() {
|
t := time.AfterFunc(5*time.Minute, func() {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user