re-attach offline drive after new drive replacement (#10416)

Drive healing was inconsistent when one of the drives was offline
while a new drive was being replaced; this change ensures
that we can add the offline drive back into the mix by
healing it again.
This commit is contained in:
Harshavardhana 2020-09-04 17:09:02 -07:00 committed by GitHub
parent eb19c8af40
commit b0e1d4ce78
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 166 additions and 146 deletions

View File

@ -89,13 +89,14 @@ type allHealState struct {
// map of heal path to heal sequence // map of heal path to heal sequence
healSeqMap map[string]*healSequence healSeqMap map[string]*healSequence
healLocalDisks []Endpoints healLocalDisks map[Endpoint]struct{}
} }
// newHealState - initialize global heal state management // newHealState - initialize global heal state management
func newHealState() *allHealState { func newHealState() *allHealState {
healState := &allHealState{ healState := &allHealState{
healSeqMap: make(map[string]*healSequence), healSeqMap: make(map[string]*healSequence),
healLocalDisks: map[Endpoint]struct{}{},
} }
go healState.periodicHealSeqsClean(GlobalContext) go healState.periodicHealSeqsClean(GlobalContext)
@ -103,20 +104,43 @@ func newHealState() *allHealState {
return healState return healState
} }
func (ahs *allHealState) getHealLocalDisks() []Endpoints { func (ahs *allHealState) healDriveCount() int {
ahs.Lock() ahs.Lock()
defer ahs.Unlock() defer ahs.Unlock()
healLocalDisks := make([]Endpoints, len(ahs.healLocalDisks)) fmt.Println(ahs.healLocalDisks)
copy(healLocalDisks, ahs.healLocalDisks) return len(ahs.healLocalDisks)
}
func (ahs *allHealState) getHealLocalDisks() Endpoints {
ahs.Lock()
defer ahs.Unlock()
var healLocalDisks Endpoints
for ep := range ahs.healLocalDisks {
healLocalDisks = append(healLocalDisks, ep)
}
return healLocalDisks return healLocalDisks
} }
func (ahs *allHealState) updateHealLocalDisks(healLocalDisks []Endpoints) { func (ahs *allHealState) popHealLocalDisks(healLocalDisks ...Endpoint) {
ahs.Lock() ahs.Lock()
defer ahs.Unlock() defer ahs.Unlock()
ahs.healLocalDisks = healLocalDisks for _, ep := range healLocalDisks {
delete(ahs.healLocalDisks, ep)
}
fmt.Println(ahs.healLocalDisks)
}
func (ahs *allHealState) pushHealLocalDisks(healLocalDisks ...Endpoint) {
ahs.Lock()
defer ahs.Unlock()
for _, ep := range healLocalDisks {
ahs.healLocalDisks[ep] = struct{}{}
}
fmt.Println(ahs.healLocalDisks)
} }
func (ahs *allHealState) periodicHealSeqsClean(ctx context.Context) { func (ahs *allHealState) periodicHealSeqsClean(ctx context.Context) {

View File

@ -26,7 +26,7 @@ import (
"github.com/minio/minio/cmd/logger" "github.com/minio/minio/cmd/logger"
) )
const defaultMonitorNewDiskInterval = time.Minute * 3 const defaultMonitorNewDiskInterval = time.Second * 10
func initAutoHeal(ctx context.Context, objAPI ObjectLayer) { func initAutoHeal(ctx context.Context, objAPI ObjectLayer) {
z, ok := objAPI.(*erasureZones) z, ok := objAPI.(*erasureZones)
@ -36,15 +36,6 @@ func initAutoHeal(ctx context.Context, objAPI ObjectLayer) {
initBackgroundHealing(ctx, objAPI) // start quick background healing initBackgroundHealing(ctx, objAPI) // start quick background healing
localDisksInZoneHeal := getLocalDisksToHeal(objAPI)
globalBackgroundHealState.updateHealLocalDisks(localDisksInZoneHeal)
drivesToHeal := getDrivesToHealCount(localDisksInZoneHeal)
if drivesToHeal != 0 {
logger.Info(fmt.Sprintf("Found drives to heal %d, waiting until %s to heal the content...",
drivesToHeal, defaultMonitorNewDiskInterval))
}
var bgSeq *healSequence var bgSeq *healSequence
var found bool var found bool
@ -56,7 +47,14 @@ func initAutoHeal(ctx context.Context, objAPI ObjectLayer) {
time.Sleep(time.Second) time.Sleep(time.Second)
} }
if drivesToHeal != 0 { for _, ep := range getLocalDisksToHeal() {
globalBackgroundHealState.pushHealLocalDisks(ep)
}
if drivesToHeal := globalBackgroundHealState.healDriveCount(); drivesToHeal > 0 {
logger.Info(fmt.Sprintf("Found drives to heal %d, waiting until %s to heal the content...",
drivesToHeal, defaultMonitorNewDiskInterval))
// Heal any disk format and metadata early, if possible. // Heal any disk format and metadata early, if possible.
if err := bgSeq.healDiskMeta(); err != nil { if err := bgSeq.healDiskMeta(); err != nil {
if newObjectLayerFn() != nil { if newObjectLayerFn() != nil {
@ -67,19 +65,11 @@ func initAutoHeal(ctx context.Context, objAPI ObjectLayer) {
} }
} }
go monitorLocalDisksAndHeal(ctx, z, drivesToHeal, localDisksInZoneHeal, bgSeq) go monitorLocalDisksAndHeal(ctx, z, bgSeq)
} }
func getLocalDisksToHeal(objAPI ObjectLayer) []Endpoints { func getLocalDisksToHeal() (disksToHeal Endpoints) {
z, ok := objAPI.(*erasureZones) for _, ep := range globalEndpoints {
if !ok {
return nil
}
// Attempt a heal as the server starts-up first.
localDisksInZoneHeal := make([]Endpoints, len(z.zones))
for i, ep := range globalEndpoints {
localDisksToHeal := Endpoints{}
for _, endpoint := range ep.Endpoints { for _, endpoint := range ep.Endpoints {
if !endpoint.IsLocal { if !endpoint.IsLocal {
continue continue
@ -88,28 +78,14 @@ func getLocalDisksToHeal(objAPI ObjectLayer) []Endpoints {
// and reformat if the current disk is not formatted // and reformat if the current disk is not formatted
_, _, err := connectEndpoint(endpoint) _, _, err := connectEndpoint(endpoint)
if errors.Is(err, errUnformattedDisk) { if errors.Is(err, errUnformattedDisk) {
localDisksToHeal = append(localDisksToHeal, endpoint) disksToHeal = append(disksToHeal, endpoint)
} }
} }
if len(localDisksToHeal) == 0 {
continue
}
localDisksInZoneHeal[i] = localDisksToHeal
} }
return localDisksInZoneHeal return disksToHeal
} }
func getDrivesToHealCount(localDisksInZoneHeal []Endpoints) int {
var drivesToHeal int
for _, eps := range localDisksInZoneHeal {
for range eps {
drivesToHeal++
}
}
return drivesToHeal
}
func initBackgroundHealing(ctx context.Context, objAPI ObjectLayer) { func initBackgroundHealing(ctx context.Context, objAPI ObjectLayer) {
// Run the background healer // Run the background healer
globalBackgroundHealRoutine = newHealRoutine() globalBackgroundHealRoutine = newHealRoutine()
@ -121,77 +97,65 @@ func initBackgroundHealing(ctx context.Context, objAPI ObjectLayer) {
// monitorLocalDisksAndHeal - ensures that detected new disks are healed // monitorLocalDisksAndHeal - ensures that detected new disks are healed
// 1. Only the concerned erasure set will be listed and healed // 1. Only the concerned erasure set will be listed and healed
// 2. Only the node hosting the disk is responsible to perform the heal // 2. Only the node hosting the disk is responsible to perform the heal
func monitorLocalDisksAndHeal(ctx context.Context, z *erasureZones, drivesToHeal int, localDisksInZoneHeal []Endpoints, bgSeq *healSequence) { func monitorLocalDisksAndHeal(ctx context.Context, z *erasureZones, bgSeq *healSequence) {
// Perform automatic disk healing when a disk is replaced locally. // Perform automatic disk healing when a disk is replaced locally.
for { for {
select { select {
case <-ctx.Done(): case <-ctx.Done():
return return
case <-time.After(defaultMonitorNewDiskInterval): case <-time.After(defaultMonitorNewDiskInterval):
// heal only if new disks found. waitForLowHTTPReq(int32(globalEndpoints.NEndpoints()), time.Second)
if drivesToHeal == 0 {
localDisksInZoneHeal = getLocalDisksToHeal(z)
drivesToHeal = getDrivesToHealCount(localDisksInZoneHeal)
if drivesToHeal == 0 {
// No drives to heal.
globalBackgroundHealState.updateHealLocalDisks(nil)
continue
}
globalBackgroundHealState.updateHealLocalDisks(localDisksInZoneHeal)
var erasureSetInZoneEndpointToHeal = make([]map[int]Endpoint, len(z.zones))
for i := range z.zones {
erasureSetInZoneEndpointToHeal[i] = map[int]Endpoint{}
}
healDisks := globalBackgroundHealState.getHealLocalDisks()
// heal only if new disks found.
for _, endpoint := range healDisks {
logger.Info(fmt.Sprintf("Found drives to heal %d, proceeding to heal content...", logger.Info(fmt.Sprintf("Found drives to heal %d, proceeding to heal content...",
drivesToHeal)) len(healDisks)))
// Reformat disks // Reformat disks
bgSeq.sourceCh <- healSource{bucket: SlashSeparator} bgSeq.sourceCh <- healSource{bucket: SlashSeparator}
// Ensure that reformatting disks is finished // Ensure that reformatting disks is finished
bgSeq.sourceCh <- healSource{bucket: nopHeal} bgSeq.sourceCh <- healSource{bucket: nopHeal}
}
var erasureSetInZoneToHeal = make([][]int, len(localDisksInZoneHeal)) // Load the new format of this passed endpoint
// Compute the list of erasure set to heal _, format, err := connectEndpoint(endpoint)
for i, localDisksToHeal := range localDisksInZoneHeal { if err != nil {
var erasureSetToHeal []int printEndpointError(endpoint, err, true)
for _, endpoint := range localDisksToHeal { continue
// Load the new format of this passed endpoint
_, format, err := connectEndpoint(endpoint)
if err != nil {
printEndpointError(endpoint, err, true)
continue
}
// Calculate the set index where the current endpoint belongs
setIndex, _, err := findDiskIndex(z.zones[i].format, format)
if err != nil {
printEndpointError(endpoint, err, false)
continue
}
erasureSetToHeal = append(erasureSetToHeal, setIndex)
} }
erasureSetInZoneToHeal[i] = erasureSetToHeal
}
logger.Info("New unformatted drives detected attempting to heal the content...") zoneIdx := globalEndpoints.GetLocalZoneIdx(endpoint)
for i, disks := range localDisksInZoneHeal { if zoneIdx < 0 {
for _, disk := range disks { continue
logger.Info("Healing disk '%s' on %s zone", disk, humanize.Ordinal(i+1))
} }
// Calculate the set index where the current endpoint belongs
setIndex, _, err := findDiskIndex(z.zones[zoneIdx].format, format)
if err != nil {
printEndpointError(endpoint, err, false)
continue
}
erasureSetInZoneEndpointToHeal[zoneIdx][setIndex] = endpoint
} }
// Heal all erasure sets that need for i, setMap := range erasureSetInZoneEndpointToHeal {
for i, erasureSetToHeal := range erasureSetInZoneToHeal { for setIndex, endpoint := range setMap {
for _, setIndex := range erasureSetToHeal { logger.Info("Healing disk '%s' on %s zone", endpoint, humanize.Ordinal(i+1))
err := healErasureSet(ctx, setIndex, z.zones[i].sets[setIndex], z.zones[i].setDriveCount)
if err != nil { if err := healErasureSet(ctx, setIndex, z.zones[i].sets[setIndex], z.zones[i].setDriveCount); err != nil {
logger.LogIf(ctx, err) logger.LogIf(ctx, err)
continue
} }
// Only upon success reduce the counter // Only upon success pop the healed disk.
if err == nil { globalBackgroundHealState.popHealLocalDisks(endpoint)
drivesToHeal--
}
} }
} }
} }

View File

@ -24,6 +24,7 @@ import (
"net/url" "net/url"
"path" "path"
"path/filepath" "path/filepath"
"reflect"
"runtime" "runtime"
"strconv" "strconv"
"strings" "strings"
@ -203,6 +204,21 @@ type ZoneEndpoints struct {
// EndpointZones - list of list of endpoints // EndpointZones - list of list of endpoints
type EndpointZones []ZoneEndpoints type EndpointZones []ZoneEndpoints
// GetLocalZoneIdx returns the zone which endpoint belongs to locally.
// if ep is remote this code will return -1 zoneIndex
func (l EndpointZones) GetLocalZoneIdx(ep Endpoint) int {
for i, zep := range l {
for _, cep := range zep.Endpoints {
if cep.IsLocal && ep.IsLocal {
if reflect.DeepEqual(cep, ep) {
return i
}
}
}
}
return -1
}
// Add add zone endpoints // Add add zone endpoints
func (l *EndpointZones) Add(zeps ZoneEndpoints) error { func (l *EndpointZones) Add(zeps ZoneEndpoints) error {
existSet := set.NewStringSet() existSet := set.NewStringSet()

View File

@ -137,13 +137,10 @@ func connectEndpoint(endpoint Endpoint) (StorageAPI, *formatErasureV3, error) {
format, err := loadFormatErasure(disk) format, err := loadFormatErasure(disk)
if err != nil { if err != nil {
// Close the internal connection to avoid connection leaks.
disk.Close()
if errors.Is(err, errUnformattedDisk) { if errors.Is(err, errUnformattedDisk) {
info, derr := disk.DiskInfo(context.TODO()) info, derr := disk.DiskInfo(context.TODO())
if derr != nil && info.RootDisk { if derr != nil && info.RootDisk {
return nil, nil, fmt.Errorf("Disk: %s returned %w but its a root disk refusing to use it", return nil, nil, fmt.Errorf("Disk: %s returned %w", disk, derr) // make sure to '%w' to wrap the error
disk, derr) // make sure to '%w' to wrap the error
} }
} }
return nil, nil, fmt.Errorf("Disk: %s returned %w", disk, err) // make sure to '%w' to wrap the error return nil, nil, fmt.Errorf("Disk: %s returned %w", disk, err) // make sure to '%w' to wrap the error
@ -213,14 +210,22 @@ func (s *erasureSets) connectDisks() {
defer wg.Done() defer wg.Done()
disk, format, err := connectEndpoint(endpoint) disk, format, err := connectEndpoint(endpoint)
if err != nil { if err != nil {
printEndpointError(endpoint, err, true) if endpoint.IsLocal && errors.Is(err, errUnformattedDisk) {
logger.Info(fmt.Sprintf("Found unformatted drive %s, attempting to heal...", endpoint))
globalBackgroundHealState.pushHealLocalDisks(endpoint)
} else {
printEndpointError(endpoint, err, true)
}
return return
} }
setIndex, diskIndex, err := findDiskIndex(s.format, format) setIndex, diskIndex, err := findDiskIndex(s.format, format)
if err != nil { if err != nil {
// Close the internal connection to avoid connection leaks. if endpoint.IsLocal {
disk.Close() globalBackgroundHealState.pushHealLocalDisks(endpoint)
printEndpointError(endpoint, err, false) logger.Info(fmt.Sprintf("Found inconsistent drive %s with format.json, attempting to heal...", endpoint))
} else {
printEndpointError(endpoint, err, false)
}
return return
} }
disk.SetDiskID(format.Erasure.This) disk.SetDiskID(format.Erasure.This)
@ -291,7 +296,9 @@ func (s *erasureSets) GetDisks(setIndex int) func() []StorageAPI {
} }
} }
const defaultMonitorConnectEndpointInterval = time.Second * 10 // Set to 10 secs. // defaultMonitorConnectEndpointInterval is the interval to monitor endpoint connections.
// Must be bigger than defaultMonitorNewDiskInterval.
const defaultMonitorConnectEndpointInterval = defaultMonitorNewDiskInterval + time.Second*5
// Initialize new set of erasure coded sets. // Initialize new set of erasure coded sets.
func newErasureSets(ctx context.Context, endpoints Endpoints, storageDisks []StorageAPI, format *formatErasureV3) (*erasureSets, error) { func newErasureSets(ctx context.Context, endpoints Endpoints, storageDisks []StorageAPI, format *formatErasureV3) (*erasureSets, error) {
@ -342,12 +349,10 @@ func newErasureSets(ctx context.Context, endpoints Endpoints, storageDisks []Sto
} }
diskID, derr := disk.GetDiskID() diskID, derr := disk.GetDiskID()
if derr != nil { if derr != nil {
disk.Close()
continue continue
} }
m, n, err := findDiskIndexByDiskID(format, diskID) m, n, err := findDiskIndexByDiskID(format, diskID)
if err != nil { if err != nil {
disk.Close()
continue continue
} }
s.endpointStrings[m*setDriveCount+n] = disk.String() s.endpointStrings[m*setDriveCount+n] = disk.String()
@ -1218,13 +1223,11 @@ func (s *erasureSets) ReloadFormat(ctx context.Context, dryRun bool) (err error)
diskID, err := disk.GetDiskID() diskID, err := disk.GetDiskID()
if err != nil { if err != nil {
disk.Close()
continue continue
} }
m, n, err := findDiskIndexByDiskID(refFormat, diskID) m, n, err := findDiskIndexByDiskID(refFormat, diskID)
if err != nil { if err != nil {
disk.Close()
continue continue
} }
@ -1248,17 +1251,14 @@ func (s *erasureSets) ReloadFormat(ctx context.Context, dryRun bool) (err error)
func isTestSetup(infos []DiskInfo, errs []error) bool { func isTestSetup(infos []DiskInfo, errs []error) bool {
rootDiskCount := 0 rootDiskCount := 0
for i := range errs { for i := range errs {
if errs[i] != nil && errs[i] != errUnformattedDisk { if errs[i] == nil || errs[i] == errUnformattedDisk {
// On any error which is not unformatted disk if infos[i].RootDisk {
// it is safer to reject healing. rootDiskCount++
return false }
}
if infos[i].RootDisk {
rootDiskCount++
} }
} }
// It is a test setup if all disks are root disks. // It is a test setup if all disks are root disks in quorum.
return rootDiskCount == len(infos) return rootDiskCount >= len(infos)/2+1
} }
func getHealDiskInfos(storageDisks []StorageAPI, errs []error) ([]DiskInfo, []error) { func getHealDiskInfos(storageDisks []StorageAPI, errs []error) ([]DiskInfo, []error) {
@ -1321,6 +1321,19 @@ func (s *erasureSets) HealFormat(ctx context.Context, dryRun bool) (res madmin.H
// Mark all root disks down // Mark all root disks down
markRootDisksAsDown(storageDisks, sErrs) markRootDisksAsDown(storageDisks, sErrs)
refFormat, err := getFormatErasureInQuorum(formats)
if err != nil {
return res, err
}
for i, format := range formats {
if format != nil {
if ferr := formatErasureV3Check(refFormat, format); ferr != nil {
sErrs[i] = errUnformattedDisk
}
}
}
// Prepare heal-result // Prepare heal-result
res = madmin.HealResultItem{ res = madmin.HealResultItem{
Type: madmin.HealItemMetadata, Type: madmin.HealItemMetadata,
@ -1346,11 +1359,6 @@ func (s *erasureSets) HealFormat(ctx context.Context, dryRun bool) (res madmin.H
return res, errNoHealRequired return res, errNoHealRequired
} }
refFormat, err := getFormatErasureInQuorum(formats)
if err != nil {
return res, err
}
// Mark all UUIDs which might be offline, use list // Mark all UUIDs which might be offline, use list
// of formats to mark them appropriately. // of formats to mark them appropriately.
markUUIDsOffline(refFormat, formats) markUUIDsOffline(refFormat, formats)
@ -1424,13 +1432,11 @@ func (s *erasureSets) HealFormat(ctx context.Context, dryRun bool) (res madmin.H
diskID, err := disk.GetDiskID() diskID, err := disk.GetDiskID()
if err != nil { if err != nil {
disk.Close()
continue continue
} }
m, n, err := findDiskIndexByDiskID(refFormat, diskID) m, n, err := findDiskIndexByDiskID(refFormat, diskID)
if err != nil { if err != nil {
disk.Close()
continue continue
} }

View File

@ -2056,6 +2056,25 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes
writeQuorum++ writeQuorum++
} }
var aggHealStateResult madmin.BgHealState
if opts.Maintenance {
// check if local disks are being healed, if they are being healed
// we need to tell healthy status as 'false' so that this server
// is not taken down for maintenance
var err error
aggHealStateResult, err = getAggregatedBackgroundHealState(ctx)
if err != nil {
logger.LogIf(logger.SetReqInfo(ctx, reqInfo), fmt.Errorf("Unable to verify global heal status: %w", err))
return HealthResult{
Healthy: false,
}
}
if len(aggHealStateResult.HealDisks) > 0 {
logger.LogIf(logger.SetReqInfo(ctx, reqInfo), fmt.Errorf("Total drives to be healed %d", len(aggHealStateResult.HealDisks)))
}
}
for zoneIdx := range erasureSetUpCount { for zoneIdx := range erasureSetUpCount {
for setIdx := range erasureSetUpCount[zoneIdx] { for setIdx := range erasureSetUpCount[zoneIdx] {
if erasureSetUpCount[zoneIdx][setIdx] < writeQuorum { if erasureSetUpCount[zoneIdx][setIdx] < writeQuorum {
@ -2063,10 +2082,11 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes
fmt.Errorf("Write quorum may be lost on zone: %d, set: %d, expected write quorum: %d", fmt.Errorf("Write quorum may be lost on zone: %d, set: %d, expected write quorum: %d",
zoneIdx, setIdx, writeQuorum)) zoneIdx, setIdx, writeQuorum))
return HealthResult{ return HealthResult{
Healthy: false, Healthy: false,
ZoneID: zoneIdx, HealingDrives: len(aggHealStateResult.HealDisks),
SetID: setIdx, ZoneID: zoneIdx,
WriteQuorum: writeQuorum, SetID: setIdx,
WriteQuorum: writeQuorum,
} }
} }
} }
@ -2081,21 +2101,6 @@ func (z *erasureZones) Health(ctx context.Context, opts HealthOptions) HealthRes
} }
} }
// check if local disks are being healed, if they are being healed
// we need to tell healthy status as 'false' so that this server
// is not taken down for maintenance
aggHealStateResult, err := getAggregatedBackgroundHealState(ctx)
if err != nil {
logger.LogIf(logger.SetReqInfo(ctx, reqInfo), fmt.Errorf("Unable to verify global heal status: %w", err))
return HealthResult{
Healthy: false,
}
}
if len(aggHealStateResult.HealDisks) > 0 {
logger.LogIf(logger.SetReqInfo(ctx, reqInfo), fmt.Errorf("Total drives to be healed %d", len(aggHealStateResult.HealDisks)))
}
return HealthResult{ return HealthResult{
Healthy: len(aggHealStateResult.HealDisks) == 0, Healthy: len(aggHealStateResult.HealDisks) == 0,
HealingDrives: len(aggHealStateResult.HealDisks), HealingDrives: len(aggHealStateResult.HealDisks),

View File

@ -73,11 +73,14 @@ func getLocalBackgroundHealStatus() (madmin.BgHealState, bool) {
return madmin.BgHealState{}, false return madmin.BgHealState{}, false
} }
objAPI := newObjectLayerWithoutSafeModeFn()
if objAPI == nil {
return madmin.BgHealState{}, false
}
var healDisks []string var healDisks []string
for _, eps := range globalBackgroundHealState.getHealLocalDisks() { for _, ep := range getLocalDisksToHeal() {
for _, ep := range eps { healDisks = append(healDisks, ep.String())
healDisks = append(healDisks, ep.String())
}
} }
return madmin.BgHealState{ return madmin.BgHealState{

View File

@ -43,7 +43,9 @@ func ClusterCheckHandler(w http.ResponseWriter, r *http.Request) {
} }
if !result.Healthy { if !result.Healthy {
// return how many drives are being healed if any // return how many drives are being healed if any
w.Header().Set("X-Minio-Healing-Drives", strconv.Itoa(result.HealingDrives)) if result.HealingDrives > 0 {
w.Header().Set("X-Minio-Healing-Drives", strconv.Itoa(result.HealingDrives))
}
// As a maintenance call we are purposefully asked to be taken // As a maintenance call we are purposefully asked to be taken
// down, this is for orchestrators to know if we can safely // down, this is for orchestrators to know if we can safely
// take this server down, return appropriate error. // take this server down, return appropriate error.