converge clusterRead health into cluster health (#19063)

Harshavardhana 2024-02-15 16:48:36 -08:00 committed by GitHub
parent 68dde2359f
commit 607cafadbc
GPG Key ID: B5690EEEBB952194
9 changed files with 69 additions and 149 deletions

View File

@@ -2280,55 +2280,23 @@ type HealthOptions struct {
 // was queried
 type HealthResult struct {
     Healthy       bool
+    HealthyRead   bool
     HealingDrives int
     ESHealth      []struct {
         Maintenance   bool
         PoolID, SetID int
         Healthy       bool
+        HealthyRead   bool
         HealthyDrives int
         HealingDrives int
         ReadQuorum    int
         WriteQuorum   int
     }
     WriteQuorum   int
+    ReadQuorum    int
     UsingDefaults bool
 }
 
-// ReadHealth returns if the cluster can serve read requests
-func (z *erasureServerPools) ReadHealth(ctx context.Context) bool {
-    erasureSetUpCount := make([][]int, len(z.serverPools))
-    for i := range z.serverPools {
-        erasureSetUpCount[i] = make([]int, len(z.serverPools[i].sets))
-    }
-
-    diskIDs := globalNotificationSys.GetLocalDiskIDs(ctx)
-    diskIDs = append(diskIDs, getLocalDiskIDs(z))
-
-    for _, localDiskIDs := range diskIDs {
-        for _, id := range localDiskIDs {
-            poolIdx, setIdx, _, err := z.getPoolAndSet(id)
-            if err != nil {
-                logger.LogIf(ctx, err)
-                continue
-            }
-            erasureSetUpCount[poolIdx][setIdx]++
-        }
-    }
-
-    b := z.BackendInfo()
-    poolReadQuorums := make([]int, len(b.StandardSCData))
-    copy(poolReadQuorums, b.StandardSCData)
-
-    for poolIdx := range erasureSetUpCount {
-        for setIdx := range erasureSetUpCount[poolIdx] {
-            if erasureSetUpCount[poolIdx][setIdx] < poolReadQuorums[poolIdx] {
-                return false
-            }
-        }
-    }
-
-    return true
-}
-
 // Health - returns current status of the object layer health,
 // provides if write access exists across sets, additionally
 // can be used to query scenarios if health may be lost
@@ -2397,9 +2365,20 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea
         }
     }
 
+    var maximumReadQuorum int
+    for _, readQuorum := range poolReadQuorums {
+        if maximumReadQuorum == 0 {
+            maximumReadQuorum = readQuorum
+        }
+        if readQuorum > maximumReadQuorum {
+            maximumReadQuorum = readQuorum
+        }
+    }
+
     result := HealthResult{
         Healthy:       true,
         WriteQuorum:   maximumWriteQuorum,
+        ReadQuorum:    maximumReadQuorum,
         UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
     }
@@ -2409,6 +2388,7 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea
             Maintenance   bool
             PoolID, SetID int
             Healthy       bool
+            HealthyRead   bool
             HealthyDrives, HealingDrives int
             ReadQuorum, WriteQuorum      int
         }{
@@ -2416,6 +2396,7 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea
             SetID:         setIdx,
             PoolID:        poolIdx,
             Healthy:       erasureSetUpCount[poolIdx][setIdx].online >= poolWriteQuorums[poolIdx],
+            HealthyRead:   erasureSetUpCount[poolIdx][setIdx].online >= poolReadQuorums[poolIdx],
             HealthyDrives: erasureSetUpCount[poolIdx][setIdx].online,
             HealingDrives: erasureSetUpCount[poolIdx][setIdx].healing,
             ReadQuorum:    poolReadQuorums[poolIdx],
@@ -2428,6 +2409,12 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) Hea
                 fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
                     poolIdx, setIdx, poolWriteQuorums[poolIdx]))
             }
+            result.HealthyRead = erasureSetUpCount[poolIdx][setIdx].online >= poolReadQuorums[poolIdx]
+            if !result.HealthyRead {
+                logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
+                    fmt.Errorf("Read quorum may be lost on pool: %d, set: %d, expected read quorum: %d",
+                        poolIdx, setIdx, poolReadQuorums[poolIdx]))
+            }
         }
     }
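
For context, a minimal standalone sketch of the read-health rule the hunks above apply: a set can serve reads while its online drives meet the pool's read quorum, and the cluster-level ReadQuorum reported to callers is the largest read quorum across pools. This is not MinIO code; the pool layout, drive counts, and quorum values below are invented for illustration.

package main

import "fmt"

func main() {
    // Hypothetical cluster: 2 pools, each with 2 erasure sets.
    onlineDrives := [][]int{{10, 7}, {12, 12}} // online drives per [pool][set]
    poolReadQuorums := []int{8, 12}            // read quorum per pool

    healthyRead := true
    maximumReadQuorum := 0
    for poolIdx, sets := range onlineDrives {
        // Report the largest read quorum in use across pools.
        if poolReadQuorums[poolIdx] > maximumReadQuorum {
            maximumReadQuorum = poolReadQuorums[poolIdx]
        }
        for setIdx, online := range sets {
            // A set is read-healthy only while online drives meet read quorum.
            if online < poolReadQuorums[poolIdx] {
                healthyRead = false
                fmt.Printf("read quorum may be lost on pool %d, set %d (online=%d, need=%d)\n",
                    poolIdx, setIdx, online, poolReadQuorums[poolIdx])
            }
        }
    }
    fmt.Println("HealthyRead:", healthyRead, "ReadQuorum:", maximumReadQuorum)
}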

View File

@@ -81,12 +81,28 @@ func ClusterReadCheckHandler(w http.ResponseWriter, r *http.Request) {
     ctx, cancel := context.WithTimeout(ctx, globalAPIConfig.getClusterDeadline())
     defer cancel()
 
-    result := objLayer.ReadHealth(ctx)
-    if !result {
+    opts := HealthOptions{
+        Maintenance:    r.Form.Get("maintenance") == "true",
+        DeploymentType: r.Form.Get("deployment-type"),
+    }
+    result := objLayer.Health(ctx, opts)
+    w.Header().Set(xhttp.MinIOReadQuorum, strconv.Itoa(result.ReadQuorum))
+    w.Header().Set(xhttp.MinIOStorageClassDefaults, strconv.FormatBool(result.UsingDefaults))
+    // return how many drives are being healed if any
+    if result.HealingDrives > 0 {
+        w.Header().Set(xhttp.MinIOHealingDrives, strconv.Itoa(result.HealingDrives))
+    }
+    if !result.HealthyRead {
+        // As a maintenance call we are purposefully asked to be taken
+        // down, this is for orchestrators to know if we can safely
+        // take this server down, return appropriate error.
+        if opts.Maintenance {
+            writeResponse(w, http.StatusPreconditionFailed, nil, mimeNone)
+        } else {
             writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone)
+        }
         return
     }
     writeResponse(w, http.StatusOK, nil, mimeNone)
 }
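
For orchestrators, a rough sketch of how the updated check might be consumed. The host, port, and endpoint path are assumptions, not taken from this diff (ClusterReadCheckHandler is conventionally mounted at /minio/health/cluster/read); x-minio-read-quorum is the response header this commit adds.

package main

import (
    "fmt"
    "net/http"
    "time"
)

func main() {
    client := &http.Client{Timeout: 5 * time.Second}
    // Assumed probe target; append ?maintenance=true when deciding whether a
    // node can be taken down, in which case 412 means read quorum would be lost.
    resp, err := client.Get("http://minio.example.com:9000/minio/health/cluster/read")
    if err != nil {
        fmt.Println("probe failed:", err)
        return
    }
    defer resp.Body.Close()

    switch resp.StatusCode {
    case http.StatusOK:
        // Cluster can serve reads; the expected read quorum is surfaced as a header.
        fmt.Println("readable, read quorum:", resp.Header.Get("x-minio-read-quorum"))
    case http.StatusPreconditionFailed:
        fmt.Println("maintenance check failed: do not take this node down")
    default:
        fmt.Println("read quorum may be lost, status:", resp.StatusCode)
    }
}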

View File

@@ -1104,24 +1104,6 @@ func (sys *NotificationSys) ServerInfo(metrics bool) []madmin.ServerProperties {
     return reply
 }
 
-// GetLocalDiskIDs - return disk ids of the local disks of the peers.
-func (sys *NotificationSys) GetLocalDiskIDs(ctx context.Context) (localDiskIDs [][]string) {
-    localDiskIDs = make([][]string, len(sys.peerClients))
-    var wg sync.WaitGroup
-    for idx, client := range sys.peerClients {
-        if client == nil {
-            continue
-        }
-        wg.Add(1)
-        go func(idx int, client *peerRESTClient) {
-            defer wg.Done()
-            localDiskIDs[idx] = client.GetLocalDiskIDs(ctx)
-        }(idx, client)
-    }
-    wg.Wait()
-    return localDiskIDs
-}
-
 // returns all the peers that are currently online.
 func (sys *NotificationSys) getOnlinePeers() []*peerRESTClient {
     var peerClients []*peerRESTClient

View File

@@ -287,7 +287,6 @@ type ObjectLayer interface {
     // Returns health of the backend
     Health(ctx context.Context, opts HealthOptions) HealthResult
-    ReadHealth(ctx context.Context) bool
 
     // Metadata operations
     PutObjectMetadata(context.Context, string, string, ObjectOptions) (ObjectInfo, error)
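
With ReadHealth removed from the ObjectLayer interface, callers derive read health from the converged Health call instead. A minimal sketch of that migration, assuming it lives alongside the types above in the cmd package (with context imported); the helper name is hypothetical and not part of this commit:

// clusterCanServeReads illustrates the caller-side migration: the removed
// ReadHealth(ctx) is replaced by reading HealthyRead off Health().
func clusterCanServeReads(ctx context.Context, objLayer ObjectLayer) bool {
    return objLayer.Health(ctx, HealthOptions{}).HealthyRead
}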

View File

@@ -601,21 +601,6 @@ func (client *peerRESTClient) BackgroundHealStatus() (madmin.BgHealState, error)
     return state, err
 }
 
-// GetLocalDiskIDs - get a peer's local disks' IDs.
-func (client *peerRESTClient) GetLocalDiskIDs(ctx context.Context) (diskIDs []string) {
-    conn := client.gridConn()
-    if conn == nil {
-        return
-    }
-    resp, err := getLocalDiskIDsHandler.Call(ctx, conn, grid.NewMSS())
-    if err != nil {
-        return
-    }
-    return resp.IDs
-}
-
 // GetMetacacheListing - get a new or existing metacache.
 func (client *peerRESTClient) GetMetacacheListing(ctx context.Context, o listPathOptions) (*metacache, error) {
     if client == nil {

View File

@@ -715,59 +715,11 @@ func (s *peerRESTServer) PutBucketNotificationHandler(w http.ResponseWriter, r *
     globalEventNotifier.AddRulesMap(bucketName, rulesMap)
 }
 
-// Return disk IDs of all the local disks.
-func getLocalDiskIDs(z *erasureServerPools) []string {
-    var ids []string
-    for poolIdx := range z.serverPools {
-        for _, set := range z.serverPools[poolIdx].sets {
-            disks := set.getDisks()
-            for _, disk := range disks {
-                if disk == nil {
-                    continue
-                }
-                if disk.IsLocal() {
-                    id, err := disk.GetDiskID()
-                    if err != nil {
-                        continue
-                    }
-                    if id == "" {
-                        continue
-                    }
-                    ids = append(ids, id)
-                }
-            }
-        }
-    }
-    return ids
-}
-
 // HealthHandler - returns true of health
 func (s *peerRESTServer) HealthHandler(w http.ResponseWriter, r *http.Request) {
     s.IsValid(w, r)
 }
 
-var getLocalDiskIDsHandler = grid.NewSingleHandler[*grid.MSS, *LocalDiskIDs](grid.HandlerGetLocalDiskIDs, grid.NewMSS, func() *LocalDiskIDs {
-    return &LocalDiskIDs{}
-})
-
-// GetLocalDiskIDs - Return disk IDs of all the local disks.
-func (s *peerRESTServer) GetLocalDiskIDs(mss *grid.MSS) (*LocalDiskIDs, *grid.RemoteErr) {
-    objLayer := newObjectLayerFn()
-    // Service not initialized yet
-    if objLayer == nil {
-        return nil, grid.NewRemoteErr(errServerNotInitialized)
-    }
-
-    z, ok := objLayer.(*erasureServerPools)
-    if !ok {
-        return nil, grid.NewRemoteErr(errServerNotInitialized)
-    }
-
-    return &LocalDiskIDs{IDs: getLocalDiskIDs(z)}, nil
-}
-
 // VerifyBinary - verifies the downloaded binary is in-tact
 func (s *peerRESTServer) VerifyBinaryHandler(w http.ResponseWriter, r *http.Request) {
     if !s.IsValid(w, r) {
@@ -1591,7 +1543,6 @@ func registerPeerRESTHandlers(router *mux.Router, gm *grid.Manager) {
     logger.FatalIf(reloadSiteReplicationConfigHandler.Register(gm, server.ReloadSiteReplicationConfigHandler), "unable to register handler")
     logger.FatalIf(loadBucketMetadataHandler.Register(gm, server.LoadBucketMetadataHandler), "unable to register handler")
     logger.FatalIf(deleteBucketMetadataHandler.Register(gm, server.DeleteBucketMetadataHandler), "unable to register handler")
-    logger.FatalIf(getLocalDiskIDsHandler.Register(gm, server.GetLocalDiskIDs), "unable to register handler")
     logger.FatalIf(listenHandler.RegisterNoInput(gm, server.ListenHandler), "unable to register handler")
     logger.FatalIf(gm.RegisterStreamingHandler(grid.HandlerTrace, grid.StreamHandler{
         Handle: server.TraceHandler,

View File

@@ -62,7 +62,6 @@ const (
     HandlerServerVerify
     HandlerTrace
     HandlerListen
-    HandlerGetLocalDiskIDs
     HandlerDeleteBucketMetadata
     HandlerLoadBucketMetadata
     HandlerReloadSiteReplicationConfig
@@ -119,7 +118,6 @@ var handlerPrefixes = [handlerLast]string{
     HandlerServerVerify:                bootstrapPrefix,
     HandlerTrace:                       peerPrefix,
     HandlerListen:                      peerPrefix,
-    HandlerGetLocalDiskIDs:             peerPrefix,
     HandlerDeleteBucketMetadata:        peerPrefix,
     HandlerLoadBucketMetadata:          peerPrefix,
     HandlerReloadSiteReplicationConfig: peerPrefix,

View File

@@ -32,34 +32,33 @@ func _() {
     _ = x[HandlerServerVerify-21]
     _ = x[HandlerTrace-22]
     _ = x[HandlerListen-23]
-    _ = x[HandlerGetLocalDiskIDs-24]
-    _ = x[HandlerDeleteBucketMetadata-25]
-    _ = x[HandlerLoadBucketMetadata-26]
-    _ = x[HandlerReloadSiteReplicationConfig-27]
-    _ = x[HandlerReloadPoolMeta-28]
-    _ = x[HandlerStopRebalance-29]
-    _ = x[HandlerLoadRebalanceMeta-30]
-    _ = x[HandlerLoadTransitionTierConfig-31]
-    _ = x[HandlerDeletePolicy-32]
-    _ = x[HandlerLoadPolicy-33]
-    _ = x[HandlerLoadPolicyMapping-34]
-    _ = x[HandlerDeleteServiceAccount-35]
-    _ = x[HandlerLoadServiceAccount-36]
-    _ = x[HandlerDeleteUser-37]
-    _ = x[HandlerLoadUser-38]
-    _ = x[HandlerLoadGroup-39]
-    _ = x[HandlerHealBucket-40]
-    _ = x[HandlerMakeBucket-41]
-    _ = x[HandlerHeadBucket-42]
-    _ = x[HandlerDeleteBucket-43]
-    _ = x[handlerTest-44]
-    _ = x[handlerTest2-45]
-    _ = x[handlerLast-46]
+    _ = x[HandlerDeleteBucketMetadata-24]
+    _ = x[HandlerLoadBucketMetadata-25]
+    _ = x[HandlerReloadSiteReplicationConfig-26]
+    _ = x[HandlerReloadPoolMeta-27]
+    _ = x[HandlerStopRebalance-28]
+    _ = x[HandlerLoadRebalanceMeta-29]
+    _ = x[HandlerLoadTransitionTierConfig-30]
+    _ = x[HandlerDeletePolicy-31]
+    _ = x[HandlerLoadPolicy-32]
+    _ = x[HandlerLoadPolicyMapping-33]
+    _ = x[HandlerDeleteServiceAccount-34]
+    _ = x[HandlerLoadServiceAccount-35]
+    _ = x[HandlerDeleteUser-36]
+    _ = x[HandlerLoadUser-37]
+    _ = x[HandlerLoadGroup-38]
+    _ = x[HandlerHealBucket-39]
+    _ = x[HandlerMakeBucket-40]
+    _ = x[HandlerHeadBucket-41]
+    _ = x[HandlerDeleteBucket-42]
+    _ = x[handlerTest-43]
+    _ = x[handlerTest2-44]
+    _ = x[handlerLast-45]
 }
 
-const _HandlerID_name = "handlerInvalidLockLockLockRLockLockUnlockLockRUnlockLockRefreshLockForceUnlockWalkDirStatVolDiskInfoNSScannerReadXLReadVersionDeleteFileDeleteVersionUpdateMetadataWriteMetadataCheckPartsRenameDataRenameFileReadAllServerVerifyTraceListenGetLocalDiskIDsDeleteBucketMetadataLoadBucketMetadataReloadSiteReplicationConfigReloadPoolMetaStopRebalanceLoadRebalanceMetaLoadTransitionTierConfigDeletePolicyLoadPolicyLoadPolicyMappingDeleteServiceAccountLoadServiceAccountDeleteUserLoadUserLoadGroupHealBucketMakeBucketHeadBucketDeleteBuckethandlerTesthandlerTest2handlerLast"
+const _HandlerID_name = "handlerInvalidLockLockLockRLockLockUnlockLockRUnlockLockRefreshLockForceUnlockWalkDirStatVolDiskInfoNSScannerReadXLReadVersionDeleteFileDeleteVersionUpdateMetadataWriteMetadataCheckPartsRenameDataRenameFileReadAllServerVerifyTraceListenDeleteBucketMetadataLoadBucketMetadataReloadSiteReplicationConfigReloadPoolMetaStopRebalanceLoadRebalanceMetaLoadTransitionTierConfigDeletePolicyLoadPolicyLoadPolicyMappingDeleteServiceAccountLoadServiceAccountDeleteUserLoadUserLoadGroupHealBucketMakeBucketHeadBucketDeleteBuckethandlerTesthandlerTest2handlerLast"
 
-var _HandlerID_index = [...]uint16{0, 14, 22, 31, 41, 52, 63, 78, 85, 92, 100, 109, 115, 126, 136, 149, 163, 176, 186, 196, 206, 213, 225, 230, 236, 251, 271, 289, 316, 330, 343, 360, 384, 396, 406, 423, 443, 461, 471, 479, 488, 498, 508, 518, 530, 541, 553, 564}
+var _HandlerID_index = [...]uint16{0, 14, 22, 31, 41, 52, 63, 78, 85, 92, 100, 109, 115, 126, 136, 149, 163, 176, 186, 196, 206, 213, 225, 230, 236, 256, 274, 301, 315, 328, 345, 369, 381, 391, 408, 428, 446, 456, 464, 473, 483, 493, 503, 515, 526, 538, 549}
 
 func (i HandlerID) String() string {
     if i >= HandlerID(len(_HandlerID_index)-1) {

View File

@@ -192,6 +192,9 @@ const (
     // Writes expected write quorum
     MinIOWriteQuorum = "x-minio-write-quorum"
 
+    // Reads expected read quorum
+    MinIOReadQuorum = "x-minio-read-quorum"
+
     // Indicates if we are using default storage class and there was problem loading config
     // if this header is set to "true"
     MinIOStorageClassDefaults = "x-minio-storage-class-defaults"