Cluster healthcheck improvements (#10408)
- do not fail the healthcheck if heal status could not be obtained from one of the nodes; if many nodes fail to report, treat this as a catastrophic error.
- add an "x-minio-write-quorum" value to match the write tolerance supported by the server.
- admin info now states whether a drive is healing: madmin.Disk.Healing is set to true while madmin.Disk.State remains "ok".
Parent: 650dccfa9e
Commit: 8a291e1dc0
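To illustrate the user-facing part of this change, here is a minimal client-side sketch, assuming a deployment reachable at localhost:9000 and MinIO's cluster healthcheck path /minio/health/cluster; per the diff below, X-Minio-Write-Quorum is set whenever a quorum is known and X-Minio-Healing-Drives only appears on the unhealthy path:

package main

import (
	"fmt"
	"log"
	"net/http"
)

func main() {
	// Hypothetical endpoint; adjust host/port for your deployment.
	resp, err := http.Get("http://localhost:9000/minio/health/cluster")
	if err != nil {
		log.Fatal(err)
	}
	defer resp.Body.Close()

	// x-minio-write-quorum reports the write tolerance of the cluster.
	fmt.Println("write quorum  :", resp.Header.Get("X-Minio-Write-Quorum"))
	// x-minio-healing-drives is only set when the cluster reports unhealthy.
	fmt.Println("healing drives:", resp.Header.Get("X-Minio-Healing-Drives"))
	// The healthy path responds with 200 OK.
	fmt.Println("healthy       :", resp.StatusCode == http.StatusOK)
}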
@@ -799,15 +799,13 @@ func (a adminAPIHandlers) HealHandler(w http.ResponseWriter, r *http.Request) {
 	keepConnLive(w, r, respCh)
 }
 
-func getAggregatedBackgroundHealState(ctx context.Context, failOnErr bool) (madmin.BgHealState, error) {
+func getAggregatedBackgroundHealState(ctx context.Context) (madmin.BgHealState, error) {
 	var bgHealStates []madmin.BgHealState
 
 	localHealState, ok := getLocalBackgroundHealStatus()
 	if !ok {
-		if failOnErr {
-			return madmin.BgHealState{}, errServerNotInitialized
-		}
+		return madmin.BgHealState{}, errServerNotInitialized
 	}
 
 	// Get local heal status first
 	bgHealStates = append(bgHealStates, localHealState)
@@ -815,14 +813,16 @@ func getAggregatedBackgroundHealState(ctx context.Context, failOnErr bool) (madm
 	if globalIsDistErasure {
 		// Get heal status from other peers
 		peersHealStates, nerrs := globalNotificationSys.BackgroundHealStatus()
+		var errCount int
 		for _, nerr := range nerrs {
 			if nerr.Err != nil {
-				if failOnErr {
-					return madmin.BgHealState{}, nerr.Err
-				}
 				logger.LogIf(ctx, nerr.Err)
+				errCount++
 			}
 		}
+		if errCount == len(nerrs) {
+			return madmin.BgHealState{}, fmt.Errorf("all remote servers failed to report heal status, cluster is unhealthy")
+		}
 		bgHealStates = append(bgHealStates, peersHealStates...)
 	}
 
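A self-contained sketch of the aggregation behaviour introduced above: individual peer failures are logged and tolerated, and only the case where every peer fails to report is treated as fatal. The healState and peerResult types are illustrative stand-ins, not the real madmin or notification-system types:

package main

import (
	"errors"
	"fmt"
)

// healState is a stand-in for madmin.BgHealState.
type healState struct{ HealDisks []string }

// peerResult is a stand-in for a single peer's reply.
type peerResult struct {
	state healState
	err   error
}

// aggregate mirrors the tolerant aggregation: count individual failures,
// give up only when every peer failed to report.
func aggregate(local healState, peers []peerResult) (healState, error) {
	merged := local
	var errCount int
	for _, p := range peers {
		if p.err != nil {
			errCount++
			continue
		}
		merged.HealDisks = append(merged.HealDisks, p.state.HealDisks...)
	}
	if len(peers) > 0 && errCount == len(peers) {
		return healState{}, errors.New("all remote servers failed to report heal status, cluster is unhealthy")
	}
	return merged, nil
}

func main() {
	local := healState{HealDisks: []string{"http://node1/drive1"}}
	peers := []peerResult{
		{state: healState{HealDisks: []string{"http://node2/drive3"}}},
		{err: errors.New("node3 unreachable")}, // tolerated: not all peers failed
	}
	merged, err := aggregate(local, peers)
	fmt.Println(merged.HealDisks, err)
}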
@@ -868,7 +868,12 @@ func (a adminAPIHandlers) BackgroundHealStatusHandler(w http.ResponseWriter, r *
 		return
 	}
 
-	aggregateHealStateResult, _ := getAggregatedBackgroundHealState(r.Context(), false)
+	aggregateHealStateResult, err := getAggregatedBackgroundHealState(r.Context())
+	if err != nil {
+		writeErrorResponseJSON(ctx, w, toAdminAPIErr(ctx, err), r.URL)
+		return
+	}
+
 	if err := json.NewEncoder(w).Encode(aggregateHealStateResult); err != nil {
 		writeErrorResponseJSON(ctx, w, toAdminAPIErr(ctx, err), r.URL)
 		return
@@ -1489,20 +1494,34 @@ func (a adminAPIHandlers) ServerInfoHandler(w http.ResponseWriter, r *http.Reque
 		Notifications: notifyTarget,
 	}
 
+	// Collect any disk healing.
+	healing, _ := getAggregatedBackgroundHealState(ctx)
+	healDisks := make(map[string]struct{}, len(healing.HealDisks))
+	for _, disk := range healing.HealDisks {
+		healDisks[disk] = struct{}{}
+	}
+
 	// find all disks which belong to each respective endpoints
 	for i := range servers {
 		for _, disk := range storageInfo.Disks {
 			if strings.Contains(disk.Endpoint, servers[i].Endpoint) {
+				if _, ok := healDisks[disk.Endpoint]; ok {
+					disk.Healing = true
+				}
 				servers[i].Disks = append(servers[i].Disks, disk)
 			}
 		}
 	}
 
 	// add all the disks local to this server.
 	for _, disk := range storageInfo.Disks {
 		if disk.DrivePath == "" && disk.Endpoint == "" {
 			continue
 		}
 		if disk.Endpoint == disk.DrivePath {
+			if _, ok := healDisks[disk.Endpoint]; ok {
+				disk.Healing = true
+			}
			servers[len(servers)-1].Disks = append(servers[len(servers)-1].Disks, disk)
 		}
 	}
 
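The disk-marking step in isolation: the aggregated heal state yields a set of healing endpoints, and disks whose endpoint is in that set get Healing set to true while their State stays "ok". The diskInfo type below is a hypothetical stand-in for madmin.Disk:

package main

import "fmt"

// diskInfo is an illustrative stand-in for madmin.Disk.
type diskInfo struct {
	Endpoint string
	Healing  bool
	State    string
}

func main() {
	// Endpoints reported by the aggregated background heal state.
	healingEndpoints := []string{"http://node2:9000/data/drive2"}

	// Build a set for cheap membership checks, as in the handler above.
	healDisks := make(map[string]struct{}, len(healingEndpoints))
	for _, ep := range healingEndpoints {
		healDisks[ep] = struct{}{}
	}

	disks := []diskInfo{
		{Endpoint: "http://node1:9000/data/drive1", State: "ok"},
		{Endpoint: "http://node2:9000/data/drive2", State: "ok"},
	}
	// Flag disks that are currently being healed; State stays "ok".
	for i := range disks {
		if _, ok := healDisks[disks[i].Endpoint]; ok {
			disks[i].Healing = true
		}
	}
	for _, d := range disks {
		fmt.Printf("%s healing=%v state=%s\n", d.Endpoint, d.Healing, d.State)
	}
}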
@@ -2045,9 +2045,8 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes
 
 	reqInfo := (&logger.ReqInfo{}).AppendTags("maintenance", strconv.FormatBool(opts.Maintenance))
 
-	for zoneIdx := range erasureSetUpCount {
 	parityDrives := globalStorageClass.GetParityForSC(storageclass.STANDARD)
-	diskCount := z.zones[zoneIdx].setDriveCount
+	diskCount := z.SetDriveCount()
 	if parityDrives == 0 {
 		parityDrives = getDefaultParityBlocks(diskCount)
 	}
@@ -2056,6 +2055,8 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes
 	if dataDrives == parityDrives {
 		writeQuorum++
 	}
+
+	for zoneIdx := range erasureSetUpCount {
 		for setIdx := range erasureSetUpCount[zoneIdx] {
 			if erasureSetUpCount[zoneIdx][setIdx] < writeQuorum {
 				logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
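A worked example of the quorum arithmetic: with a 16-drive set and the default parity of half the drives, dataDrives = 16 - 8 = 8, and because dataDrives == parityDrives the write quorum is bumped to 9. The sketch below assumes writeQuorum starts at the data-drive count, an initializer that is not shown in this hunk:

package main

import "fmt"

// writeQuorumFor mirrors the quorum logic above: write quorum is assumed to
// start at the number of data drives (not shown in the hunk), bumped by one
// when data and parity drive counts are equal.
func writeQuorumFor(setDriveCount, parityDrives int) int {
	dataDrives := setDriveCount - parityDrives
	writeQuorum := dataDrives
	if dataDrives == parityDrives {
		writeQuorum++
	}
	return writeQuorum
}

func main() {
	fmt.Println(writeQuorumFor(16, 8)) // 9
	fmt.Println(writeQuorumFor(16, 4)) // 12
}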
@@ -2076,13 +2077,14 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes
 	if !opts.Maintenance {
 		return HealthResult{
 			Healthy: true,
+			WriteQuorum: writeQuorum,
 		}
 	}
 
 	// check if local disks are being healed, if they are being healed
 	// we need to tell healthy status as 'false' so that this server
 	// is not taken down for maintenance
-	aggHealStateResult, err := getAggregatedBackgroundHealState(ctx, true)
+	aggHealStateResult, err := getAggregatedBackgroundHealState(ctx)
 	if err != nil {
 		logger.LogIf(logger.SetReqInfo(ctx, reqInfo), fmt.Errorf("Unable to verify global heal status: %w", err))
 		return HealthResult{
@@ -2094,11 +2096,10 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes
 		logger.LogIf(logger.SetReqInfo(ctx, reqInfo), fmt.Errorf("Total drives to be healed %d", len(aggHealStateResult.HealDisks)))
 	}
 
-	healthy := len(aggHealStateResult.HealDisks) == 0
-
 	return HealthResult{
-		Healthy:       healthy,
+		Healthy:       len(aggHealStateResult.HealDisks) == 0,
 		HealingDrives: len(aggHealStateResult.HealDisks),
+		WriteQuorum:   writeQuorum,
 	}
 }
 
@@ -266,10 +266,11 @@ func (er erasureObjects) crawlAndGetDataUsage(ctx context.Context, buckets []Buc
 	}
 
 	// Collect any disk healing.
-	healing, err := getAggregatedBackgroundHealState(ctx, true)
+	healing, err := getAggregatedBackgroundHealState(ctx)
 	if err != nil {
 		return err
 	}
+
 	healDisks := make(map[string]struct{}, len(healing.HealDisks))
 	for _, disk := range healing.HealDisks {
 		healDisks[disk] = struct{}{}
@@ -38,6 +38,9 @@ func ClusterCheckHandler(w http.ResponseWriter, r *http.Request) {
 
 	opts := HealthOptions{Maintenance: r.URL.Query().Get("maintenance") == "true"}
 	result := objLayer.Health(ctx, opts)
+	if result.WriteQuorum > 0 {
+		w.Header().Set("X-Minio-Write-Quorum", strconv.Itoa(result.WriteQuorum))
+	}
 	if !result.Healthy {
 		// return how many drives are being healed if any
 		w.Header().Set("X-Minio-Healing-Drives", strconv.Itoa(result.HealingDrives))
@@ -51,7 +54,6 @@ func ClusterCheckHandler(w http.ResponseWriter, r *http.Request) {
 		}
 		return
 	}
 
 	writeResponse(w, http.StatusOK, nil, mimeNone)
 }
 
|
@ -272,6 +272,7 @@ type Disk struct {
|
|||||||
Endpoint string `json:"endpoint,omitempty"`
|
Endpoint string `json:"endpoint,omitempty"`
|
||||||
RootDisk bool `json:"rootDisk,omitempty"`
|
RootDisk bool `json:"rootDisk,omitempty"`
|
||||||
DrivePath string `json:"path,omitempty"`
|
DrivePath string `json:"path,omitempty"`
|
||||||
|
Healing bool `json:"healing,omitempty"`
|
||||||
State string `json:"state,omitempty"`
|
State string `json:"state,omitempty"`
|
||||||
UUID string `json:"uuid,omitempty"`
|
UUID string `json:"uuid,omitempty"`
|
||||||
Model string `json:"model,omitempty"`
|
Model string `json:"model,omitempty"`
|
||||||
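With the Healing field added to madmin.Disk, an admin client can list drives that are online but still being healed, matching the commit description (Healing true, State "ok"). A hedged sketch, assuming the pkg/madmin constructor and ServerInfo signature of this release; credentials and endpoint are placeholders:

package main

import (
	"context"
	"fmt"
	"log"

	"github.com/minio/minio/pkg/madmin"
)

func main() {
	// Placeholder endpoint and credentials; replace for a real deployment.
	admClient, err := madmin.New("localhost:9000", "ACCESS_KEY", "SECRET_KEY", false)
	if err != nil {
		log.Fatal(err)
	}

	info, err := admClient.ServerInfo(context.Background())
	if err != nil {
		log.Fatal(err)
	}

	// List drives that are online ("ok") but still being healed.
	for _, srv := range info.Servers {
		for _, disk := range srv.Disks {
			if disk.Healing && disk.State == "ok" {
				fmt.Println("healing:", disk.Endpoint)
			}
		}
	}
}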