Mirror of https://github.com/minio/minio.git (synced 2025-01-11 15:03:22 -05:00)

converge clusterRead health into cluster health (#19063)

Commit 607cafadbc (parent 68dde2359f)
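
For callers, the effect of this commit is that the separate ReadHealth(ctx) bool on the object layer goes away and read health is reported through the same Health() call, via the HealthyRead and ReadQuorum fields of HealthResult. Below is a minimal sketch of consuming the converged result; it assumes it lives in the same package as the ObjectLayer, HealthOptions and HealthResult types shown in this diff, and the helper name is hypothetical, not part of the commit.

	import (
		"context"
		"log"
	)

	// logClusterHealth is an illustrative helper only: it reads the converged
	// HealthResult fields instead of the removed ReadHealth(ctx) bool.
	func logClusterHealth(ctx context.Context, objLayer ObjectLayer) {
		result := objLayer.Health(ctx, HealthOptions{})
		if !result.HealthyRead {
			log.Printf("read quorum may be lost, expected read quorum: %d", result.ReadQuorum)
		}
		if !result.Healthy {
			log.Printf("write quorum may be lost, expected write quorum: %d", result.WriteQuorum)
		}
	}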
@@ -2280,55 +2280,23 @@ type HealthOptions struct {
// was queried
type HealthResult struct {
	Healthy       bool
	HealthyRead   bool
	HealingDrives int
	ESHealth      []struct {
		Maintenance   bool
		PoolID, SetID int
		Healthy       bool
		HealthyRead   bool
		HealthyDrives int
		HealingDrives int
		ReadQuorum    int
		WriteQuorum   int
	}
	WriteQuorum   int
	ReadQuorum    int
	UsingDefaults bool
}

// ReadHealth returns if the cluster can serve read requests
func (z *erasureServerPools) ReadHealth(ctx context.Context) bool {
	erasureSetUpCount := make([][]int, len(z.serverPools))
	for i := range z.serverPools {
		erasureSetUpCount[i] = make([]int, len(z.serverPools[i].sets))
	}

	diskIDs := globalNotificationSys.GetLocalDiskIDs(ctx)
	diskIDs = append(diskIDs, getLocalDiskIDs(z))

	for _, localDiskIDs := range diskIDs {
		for _, id := range localDiskIDs {
			poolIdx, setIdx, _, err := z.getPoolAndSet(id)
			if err != nil {
				logger.LogIf(ctx, err)
				continue
			}
			erasureSetUpCount[poolIdx][setIdx]++
		}
	}

	b := z.BackendInfo()
	poolReadQuorums := make([]int, len(b.StandardSCData))
	copy(poolReadQuorums, b.StandardSCData)

	for poolIdx := range erasureSetUpCount {
		for setIdx := range erasureSetUpCount[poolIdx] {
			if erasureSetUpCount[poolIdx][setIdx] < poolReadQuorums[poolIdx] {
				return false
			}
		}
	}
	return true
}

// Health - returns current status of the object layer health,
// provides if write access exists across sets, additionally
// can be used to query scenarios if health may be lost
@@ -2397,9 +2365,20 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) HealthResult
		}
	}

	var maximumReadQuorum int
	for _, readQuorum := range poolReadQuorums {
		if maximumReadQuorum == 0 {
			maximumReadQuorum = readQuorum
		}
		if readQuorum > maximumReadQuorum {
			maximumReadQuorum = readQuorum
		}
	}

	result := HealthResult{
		Healthy:       true,
		WriteQuorum:   maximumWriteQuorum,
		ReadQuorum:    maximumReadQuorum,
		UsingDefaults: usingDefaults, // indicates if config was not initialized and we are using defaults on this node.
	}
@@ -2409,6 +2388,7 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) HealthResult
				Maintenance                  bool
				PoolID, SetID                int
				Healthy                      bool
				HealthyRead                  bool
				HealthyDrives, HealingDrives int
				ReadQuorum, WriteQuorum      int
			}{
@@ -2416,6 +2396,7 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) HealthResult
				SetID:         setIdx,
				PoolID:        poolIdx,
				Healthy:       erasureSetUpCount[poolIdx][setIdx].online >= poolWriteQuorums[poolIdx],
				HealthyRead:   erasureSetUpCount[poolIdx][setIdx].online >= poolReadQuorums[poolIdx],
				HealthyDrives: erasureSetUpCount[poolIdx][setIdx].online,
				HealingDrives: erasureSetUpCount[poolIdx][setIdx].healing,
				ReadQuorum:    poolReadQuorums[poolIdx],
@@ -2428,6 +2409,12 @@ func (z *erasureServerPools) Health(ctx context.Context, opts HealthOptions) HealthResult
					fmt.Errorf("Write quorum may be lost on pool: %d, set: %d, expected write quorum: %d",
						poolIdx, setIdx, poolWriteQuorums[poolIdx]))
			}
			result.HealthyRead = erasureSetUpCount[poolIdx][setIdx].online >= poolReadQuorums[poolIdx]
			if !result.HealthyRead {
				logger.LogIf(logger.SetReqInfo(ctx, reqInfo),
					fmt.Errorf("Read quorum may be lost on pool: %d, set: %d, expected read quorum: %d",
						poolIdx, setIdx, poolReadQuorums[poolIdx]))
			}
		}
	}
@@ -81,12 +81,28 @@ func ClusterReadCheckHandler(w http.ResponseWriter, r *http.Request) {
	ctx, cancel := context.WithTimeout(ctx, globalAPIConfig.getClusterDeadline())
	defer cancel()

	result := objLayer.ReadHealth(ctx)
	if !result {
		writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone)
	opts := HealthOptions{
		Maintenance:    r.Form.Get("maintenance") == "true",
		DeploymentType: r.Form.Get("deployment-type"),
	}
	result := objLayer.Health(ctx, opts)
	w.Header().Set(xhttp.MinIOReadQuorum, strconv.Itoa(result.ReadQuorum))
	w.Header().Set(xhttp.MinIOStorageClassDefaults, strconv.FormatBool(result.UsingDefaults))
	// return how many drives are being healed if any
	if result.HealingDrives > 0 {
		w.Header().Set(xhttp.MinIOHealingDrives, strconv.Itoa(result.HealingDrives))
	}
	if !result.HealthyRead {
		// As a maintenance call we are purposefully asked to be taken
		// down, this is for orchestrators to know if we can safely
		// take this server down, return appropriate error.
		if opts.Maintenance {
			writeResponse(w, http.StatusPreconditionFailed, nil, mimeNone)
		} else {
			writeResponse(w, http.StatusServiceUnavailable, nil, mimeNone)
		}
		return
	}

	writeResponse(w, http.StatusOK, nil, mimeNone)
}
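
The handler above lets orchestrators distinguish a lost read quorum (503) from a maintenance pre-check failure (412, returned only when maintenance=true is passed). A minimal external probe sketch follows; it assumes the handler is mounted at /minio/health/cluster/read on the default port, which is not part of this diff.

	package main

	import (
		"fmt"
		"net/http"
	)

	func main() {
		// Append "?maintenance=true" to ask whether this node can be taken
		// down without losing read quorum.
		resp, err := http.Get("http://localhost:9000/minio/health/cluster/read")
		if err != nil {
			fmt.Println("probe failed:", err)
			return
		}
		defer resp.Body.Close()

		switch resp.StatusCode {
		case http.StatusOK:
			fmt.Println("reads OK, expected read quorum:", resp.Header.Get("x-minio-read-quorum"))
		case http.StatusPreconditionFailed:
			fmt.Println("maintenance pre-check failed: taking this node down would lose read quorum")
		case http.StatusServiceUnavailable:
			fmt.Println("read quorum may already be lost")
		default:
			fmt.Println("unexpected status:", resp.Status)
		}
	}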
@@ -1104,24 +1104,6 @@ func (sys *NotificationSys) ServerInfo(metrics bool) []madmin.ServerProperties {
	return reply
}

// GetLocalDiskIDs - return disk ids of the local disks of the peers.
func (sys *NotificationSys) GetLocalDiskIDs(ctx context.Context) (localDiskIDs [][]string) {
	localDiskIDs = make([][]string, len(sys.peerClients))
	var wg sync.WaitGroup
	for idx, client := range sys.peerClients {
		if client == nil {
			continue
		}
		wg.Add(1)
		go func(idx int, client *peerRESTClient) {
			defer wg.Done()
			localDiskIDs[idx] = client.GetLocalDiskIDs(ctx)
		}(idx, client)
	}
	wg.Wait()
	return localDiskIDs
}

// returns all the peers that are currently online.
func (sys *NotificationSys) getOnlinePeers() []*peerRESTClient {
	var peerClients []*peerRESTClient
@@ -287,7 +287,6 @@ type ObjectLayer interface {

	// Returns health of the backend
	Health(ctx context.Context, opts HealthOptions) HealthResult
	ReadHealth(ctx context.Context) bool

	// Metadata operations
	PutObjectMetadata(context.Context, string, string, ObjectOptions) (ObjectInfo, error)
@@ -601,21 +601,6 @@ func (client *peerRESTClient) BackgroundHealStatus() (madmin.BgHealState, error)
	return state, err
}

// GetLocalDiskIDs - get a peer's local disks' IDs.
func (client *peerRESTClient) GetLocalDiskIDs(ctx context.Context) (diskIDs []string) {
	conn := client.gridConn()
	if conn == nil {
		return
	}

	resp, err := getLocalDiskIDsHandler.Call(ctx, conn, grid.NewMSS())
	if err != nil {
		return
	}

	return resp.IDs
}

// GetMetacacheListing - get a new or existing metacache.
func (client *peerRESTClient) GetMetacacheListing(ctx context.Context, o listPathOptions) (*metacache, error) {
	if client == nil {
@@ -715,59 +715,11 @@ func (s *peerRESTServer) PutBucketNotificationHandler(w http.ResponseWriter, r *http.Request) {
	globalEventNotifier.AddRulesMap(bucketName, rulesMap)
}

// Return disk IDs of all the local disks.
func getLocalDiskIDs(z *erasureServerPools) []string {
	var ids []string

	for poolIdx := range z.serverPools {
		for _, set := range z.serverPools[poolIdx].sets {
			disks := set.getDisks()
			for _, disk := range disks {
				if disk == nil {
					continue
				}
				if disk.IsLocal() {
					id, err := disk.GetDiskID()
					if err != nil {
						continue
					}
					if id == "" {
						continue
					}
					ids = append(ids, id)
				}
			}
		}
	}

	return ids
}

// HealthHandler - returns true of health
func (s *peerRESTServer) HealthHandler(w http.ResponseWriter, r *http.Request) {
	s.IsValid(w, r)
}

var getLocalDiskIDsHandler = grid.NewSingleHandler[*grid.MSS, *LocalDiskIDs](grid.HandlerGetLocalDiskIDs, grid.NewMSS, func() *LocalDiskIDs {
	return &LocalDiskIDs{}
})

// GetLocalDiskIDs - Return disk IDs of all the local disks.
func (s *peerRESTServer) GetLocalDiskIDs(mss *grid.MSS) (*LocalDiskIDs, *grid.RemoteErr) {
	objLayer := newObjectLayerFn()
	// Service not initialized yet
	if objLayer == nil {
		return nil, grid.NewRemoteErr(errServerNotInitialized)
	}

	z, ok := objLayer.(*erasureServerPools)
	if !ok {
		return nil, grid.NewRemoteErr(errServerNotInitialized)
	}

	return &LocalDiskIDs{IDs: getLocalDiskIDs(z)}, nil
}

// VerifyBinary - verifies the downloaded binary is in-tact
func (s *peerRESTServer) VerifyBinaryHandler(w http.ResponseWriter, r *http.Request) {
	if !s.IsValid(w, r) {
@@ -1591,7 +1543,6 @@ func registerPeerRESTHandlers(router *mux.Router, gm *grid.Manager) {
	logger.FatalIf(reloadSiteReplicationConfigHandler.Register(gm, server.ReloadSiteReplicationConfigHandler), "unable to register handler")
	logger.FatalIf(loadBucketMetadataHandler.Register(gm, server.LoadBucketMetadataHandler), "unable to register handler")
	logger.FatalIf(deleteBucketMetadataHandler.Register(gm, server.DeleteBucketMetadataHandler), "unable to register handler")
	logger.FatalIf(getLocalDiskIDsHandler.Register(gm, server.GetLocalDiskIDs), "unable to register handler")
	logger.FatalIf(listenHandler.RegisterNoInput(gm, server.ListenHandler), "unable to register handler")
	logger.FatalIf(gm.RegisterStreamingHandler(grid.HandlerTrace, grid.StreamHandler{
		Handle: server.TraceHandler,
@@ -62,7 +62,6 @@ const (
	HandlerServerVerify
	HandlerTrace
	HandlerListen
	HandlerGetLocalDiskIDs
	HandlerDeleteBucketMetadata
	HandlerLoadBucketMetadata
	HandlerReloadSiteReplicationConfig
@@ -119,7 +118,6 @@ var handlerPrefixes = [handlerLast]string{
	HandlerServerVerify:                bootstrapPrefix,
	HandlerTrace:                       peerPrefix,
	HandlerListen:                      peerPrefix,
	HandlerGetLocalDiskIDs:             peerPrefix,
	HandlerDeleteBucketMetadata:        peerPrefix,
	HandlerLoadBucketMetadata:          peerPrefix,
	HandlerReloadSiteReplicationConfig: peerPrefix,
@@ -32,34 +32,33 @@ func _() {
	_ = x[HandlerServerVerify-21]
	_ = x[HandlerTrace-22]
	_ = x[HandlerListen-23]
	_ = x[HandlerGetLocalDiskIDs-24]
	_ = x[HandlerDeleteBucketMetadata-25]
	_ = x[HandlerLoadBucketMetadata-26]
	_ = x[HandlerReloadSiteReplicationConfig-27]
	_ = x[HandlerReloadPoolMeta-28]
	_ = x[HandlerStopRebalance-29]
	_ = x[HandlerLoadRebalanceMeta-30]
	_ = x[HandlerLoadTransitionTierConfig-31]
	_ = x[HandlerDeletePolicy-32]
	_ = x[HandlerLoadPolicy-33]
	_ = x[HandlerLoadPolicyMapping-34]
	_ = x[HandlerDeleteServiceAccount-35]
	_ = x[HandlerLoadServiceAccount-36]
	_ = x[HandlerDeleteUser-37]
	_ = x[HandlerLoadUser-38]
	_ = x[HandlerLoadGroup-39]
	_ = x[HandlerHealBucket-40]
	_ = x[HandlerMakeBucket-41]
	_ = x[HandlerHeadBucket-42]
	_ = x[HandlerDeleteBucket-43]
	_ = x[handlerTest-44]
	_ = x[handlerTest2-45]
	_ = x[handlerLast-46]
	_ = x[HandlerDeleteBucketMetadata-24]
	_ = x[HandlerLoadBucketMetadata-25]
	_ = x[HandlerReloadSiteReplicationConfig-26]
	_ = x[HandlerReloadPoolMeta-27]
	_ = x[HandlerStopRebalance-28]
	_ = x[HandlerLoadRebalanceMeta-29]
	_ = x[HandlerLoadTransitionTierConfig-30]
	_ = x[HandlerDeletePolicy-31]
	_ = x[HandlerLoadPolicy-32]
	_ = x[HandlerLoadPolicyMapping-33]
	_ = x[HandlerDeleteServiceAccount-34]
	_ = x[HandlerLoadServiceAccount-35]
	_ = x[HandlerDeleteUser-36]
	_ = x[HandlerLoadUser-37]
	_ = x[HandlerLoadGroup-38]
	_ = x[HandlerHealBucket-39]
	_ = x[HandlerMakeBucket-40]
	_ = x[HandlerHeadBucket-41]
	_ = x[HandlerDeleteBucket-42]
	_ = x[handlerTest-43]
	_ = x[handlerTest2-44]
	_ = x[handlerLast-45]
}

const _HandlerID_name = "handlerInvalidLockLockLockRLockLockUnlockLockRUnlockLockRefreshLockForceUnlockWalkDirStatVolDiskInfoNSScannerReadXLReadVersionDeleteFileDeleteVersionUpdateMetadataWriteMetadataCheckPartsRenameDataRenameFileReadAllServerVerifyTraceListenGetLocalDiskIDsDeleteBucketMetadataLoadBucketMetadataReloadSiteReplicationConfigReloadPoolMetaStopRebalanceLoadRebalanceMetaLoadTransitionTierConfigDeletePolicyLoadPolicyLoadPolicyMappingDeleteServiceAccountLoadServiceAccountDeleteUserLoadUserLoadGroupHealBucketMakeBucketHeadBucketDeleteBuckethandlerTesthandlerTest2handlerLast"
const _HandlerID_name = "handlerInvalidLockLockLockRLockLockUnlockLockRUnlockLockRefreshLockForceUnlockWalkDirStatVolDiskInfoNSScannerReadXLReadVersionDeleteFileDeleteVersionUpdateMetadataWriteMetadataCheckPartsRenameDataRenameFileReadAllServerVerifyTraceListenDeleteBucketMetadataLoadBucketMetadataReloadSiteReplicationConfigReloadPoolMetaStopRebalanceLoadRebalanceMetaLoadTransitionTierConfigDeletePolicyLoadPolicyLoadPolicyMappingDeleteServiceAccountLoadServiceAccountDeleteUserLoadUserLoadGroupHealBucketMakeBucketHeadBucketDeleteBuckethandlerTesthandlerTest2handlerLast"

var _HandlerID_index = [...]uint16{0, 14, 22, 31, 41, 52, 63, 78, 85, 92, 100, 109, 115, 126, 136, 149, 163, 176, 186, 196, 206, 213, 225, 230, 236, 251, 271, 289, 316, 330, 343, 360, 384, 396, 406, 423, 443, 461, 471, 479, 488, 498, 508, 518, 530, 541, 553, 564}
var _HandlerID_index = [...]uint16{0, 14, 22, 31, 41, 52, 63, 78, 85, 92, 100, 109, 115, 126, 136, 149, 163, 176, 186, 196, 206, 213, 225, 230, 236, 256, 274, 301, 315, 328, 345, 369, 381, 391, 408, 428, 446, 456, 464, 473, 483, 493, 503, 515, 526, 538, 549}

func (i HandlerID) String() string {
	if i >= HandlerID(len(_HandlerID_index)-1) {
@@ -192,6 +192,9 @@ const (
	// Writes expected write quorum
	MinIOWriteQuorum = "x-minio-write-quorum"

	// Reads expected read quorum
	MinIOReadQuorum = "x-minio-read-quorum"

	// Indicates if we are using default storage class and there was problem loading config
	// if this header is set to "true"
	MinIOStorageClassDefaults = "x-minio-storage-class-defaults"
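
These are the header names the updated ClusterReadCheckHandler sets via xhttp.MinIOReadQuorum and xhttp.MinIOStorageClassDefaults. A small client-side sketch of decoding them from a health response is shown below; the helper is purely illustrative and not part of MinIO.

	package main

	import (
		"fmt"
		"net/http"
		"strconv"
	)

	// quorumFromHeaders decodes the quorum-related headers defined above.
	// Parsing errors are ignored here for brevity; a real probe should handle them.
	func quorumFromHeaders(h http.Header) (readQuorum, writeQuorum int, usingDefaults bool) {
		readQuorum, _ = strconv.Atoi(h.Get("x-minio-read-quorum"))
		writeQuorum, _ = strconv.Atoi(h.Get("x-minio-write-quorum"))
		usingDefaults, _ = strconv.ParseBool(h.Get("x-minio-storage-class-defaults"))
		return readQuorum, writeQuorum, usingDefaults
	}

	func main() {
		// Example headers as a health endpoint might return them.
		h := http.Header{}
		h.Set("x-minio-read-quorum", "3")
		h.Set("x-minio-write-quorum", "4")
		h.Set("x-minio-storage-class-defaults", "false")
		r, w, d := quorumFromHeaders(h)
		fmt.Println(r, w, d)
	}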