introduce new ServiceV2 API to handle guided restarts (#18826)

The new API now checks for hung disks before restart/stop and
provides a per-node breakdown of the restart/stop results.

It also reports how many blocked syscalls are present on the
drives and what users must do about them.

Adds an option to run pre-flight checks that inform the user
about any hung disks. Provides a 'force' option to forcibly
attempt a restart() even with waiting syscalls on the drives.
This commit is contained in:
Harshavardhana
2024-01-19 14:22:36 -08:00
committed by GitHub
parent 83bf15a703
commit ac81f0248c
7 changed files with 203 additions and 20 deletions

View File

@@ -20,6 +20,7 @@ package cmd
import (
"context"
"encoding/gob"
"encoding/json"
"errors"
"fmt"
"io"
@@ -863,6 +864,21 @@ func (s *peerRESTServer) CommitBinaryHandler(w http.ResponseWriter, r *http.Requ
var errUnsupportedSignal = fmt.Errorf("unsupported signal")
// canWeRestartNode calls DiskInfo on every entry of globalLocalDrives and
// returns the metrics of the drives that look hung, keyed by drive endpoint.
//
// A drive is reported only when both conditions hold: its metrics show at
// least one waiting call (Metrics.TotalWaiting >= 1) and its DiskInfo call
// failed with errFaultyDisk. The returned map is never nil; it is empty when
// no drive matches.
func canWeRestartNode() map[string]DiskMetrics {
	waiting := make(map[string]DiskMetrics)
	for _, drive := range globalLocalDrives {
		info, err := drive.DiskInfo(GlobalContext, false)
		// The info value is read even though err is non-nil: the metrics
		// returned alongside errFaultyDisk are exactly what gets reported.
		if info.Metrics.TotalWaiting < 1 || !errors.Is(err, errFaultyDisk) {
			continue
		}
		waiting[info.Endpoint] = info.Metrics
	}
	return waiting
}
// SignalServiceHandler - signal service handler.
func (s *peerRESTServer) SignalServiceHandler(w http.ResponseWriter, r *http.Request) {
if !s.IsValid(w, r) {
@@ -883,10 +899,26 @@ func (s *peerRESTServer) SignalServiceHandler(w http.ResponseWriter, r *http.Req
}
signal := serviceSignal(si)
switch signal {
case serviceRestart:
globalServiceSignalCh <- signal
case serviceStop:
globalServiceSignalCh <- signal
case serviceRestart, serviceStop:
dryRun := r.Form.Get("dry-run") == "true" // This is only supported for `restart/stop`
force := r.Form.Get("force") == "true"
waitingDisks := canWeRestartNode()
if len(waitingDisks) > 0 {
buf, err := json.Marshal(waitingDisks)
if err != nil {
s.writeErrorResponse(w, err)
return
}
s.writeErrorResponse(w, errors.New(string(buf)))
// if its forced we signal the process anyway.
if !force {
return
}
}
if !dryRun {
globalServiceSignalCh <- signal
}
case serviceFreeze:
freezeServices()
case serviceUnFreeze: