heal: calculate the number of workers based on NRRequests (#17945)

Authored by Anis Eleuch on 2023-09-11 14:48:54 -07:00; committed by GitHub
parent 9878031cfd
commit 41de53996b
10 changed files with 131 additions and 61 deletions
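Before the per-file changes, the sizing rule this commit introduces can be restated on its own. Below is a minimal, runnable sketch of the heuristic as it appears in the `healErasureSet` hunk further down; the function name `healWorkers` and its parameters are illustrative stand-ins for `info.NRRequests` and `globalHealConfig.GetWorkers()`:

```go
package main

import (
	"fmt"
	"runtime"
)

// healWorkers mirrors the new sizing rule: a quarter of
// min(GOMAXPROCS, drive nr_requests), floored at 4 workers,
// unless an explicit drive_workers override is configured.
func healWorkers(nrRequests uint64, configured int) uint64 {
	numCores := uint64(runtime.GOMAXPROCS(0))
	n := nrRequests / 4
	if nrRequests > numCores {
		n = numCores / 4
	}
	if n < 4 {
		n = 4
	}
	if configured > 0 {
		n = uint64(configured) // heal drive_workers / MINIO_HEAL_DRIVE_WORKERS
	}
	return n
}

func main() {
	fmt.Println(healWorkers(256, -1)) // heuristic: GOMAXPROCS/4 on most hosts
	fmt.Println(healWorkers(256, 16)) // explicit override wins
}
```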

View File

@@ -683,13 +683,6 @@ func (h *healSequence) healSequenceStart(objAPI ObjectLayer) {
 	}
 }

-func (h *healSequence) logHeal(healType madmin.HealItemType) {
-	h.mutex.Lock()
-	h.scannedItemsMap[healType]++
-	h.lastHealActivity = UTCNow()
-	h.mutex.Unlock()
-}
-
 func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItemType) error {
 	// Send heal request
 	task := healTask{

View File

@@ -20,6 +20,7 @@ package cmd
 import (
 	"context"
 	"fmt"
+	"runtime"
 	"sort"
 	"time"

@@ -30,6 +31,7 @@ import (
 	"github.com/minio/minio/internal/logger"
 	"github.com/minio/pkg/v2/console"
 	"github.com/minio/pkg/v2/wildcard"
+	"github.com/minio/pkg/v2/workers"
 )

 const (
@@ -132,30 +134,8 @@ func getLocalBackgroundHealStatus(ctx context.Context, o ObjectLayer) (madmin.Bg
 	return status, true
 }

-func mustGetHealSequence(ctx context.Context) *healSequence {
-	// Get background heal sequence to send elements to heal
-	for {
-		globalHealStateLK.RLock()
-		hstate := globalBackgroundHealState
-		globalHealStateLK.RUnlock()
-
-		if hstate == nil {
-			time.Sleep(time.Second)
-			continue
-		}
-
-		bgSeq, ok := hstate.getHealSequenceByToken(bgHealingUUID)
-		if !ok {
-			time.Sleep(time.Second)
-			continue
-		}
-		return bgSeq
-	}
-}
-
 // healErasureSet lists and heals all objects in a specific erasure set
 func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, tracker *healingTracker) error {
-	bgSeq := mustGetHealSequence(ctx)
 	scanMode := madmin.HealNormalScan

 	// Make sure to copy since `buckets slice`
@@ -173,6 +153,30 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
 		}
 	}

+	info, err := tracker.disk.DiskInfo(ctx, false)
+	if err != nil {
+		return fmt.Errorf("unable to get disk information before healing it: %w", err)
+	}
+
+	var numHealers uint64
+	if numCores := uint64(runtime.GOMAXPROCS(0)); info.NRRequests > numCores {
+		numHealers = numCores / 4
+	} else {
+		numHealers = info.NRRequests / 4
+	}
+	if numHealers < 4 {
+		numHealers = 4
+	}
+
+	// allow overriding this value as well..
+	if v := globalHealConfig.GetWorkers(); v > 0 {
+		numHealers = uint64(v)
+	}
+
+	logger.Info(fmt.Sprintf("Healing drive '%s' - use %d parallel workers.", tracker.disk.String(), numHealers))
+
+	jt, _ := workers.New(int(numHealers))
+
 	var retErr error
 	// Heal all buckets with all objects
 	for _, bucket := range healBuckets {
@@ -267,6 +271,8 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
 	// Note: updates from healEntry to tracker must be sent on results channel.
 	healEntry := func(bucket string, entry metaCacheEntry) {
+		defer jt.Give()
+
 		if entry.name == "" && len(entry.metadata) == 0 {
 			// ignore entries that don't have metadata.
 			return
@@ -291,14 +297,17 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
 			}
 		}

+		// erasureObjects layer needs object names to be encoded
+		encodedEntryName := encodeDirObject(entry.name)
+
 		var result healEntryResult
 		fivs, err := entry.fileInfoVersions(bucket)
 		if err != nil {
-			err := bgSeq.queueHealTask(healSource{
-				bucket:    bucket,
-				object:    entry.name,
-				versionID: "",
-			}, madmin.HealItemObject)
+			_, err := er.HealObject(ctx, bucket, encodedEntryName, "",
+				madmin.HealOpts{
+					ScanMode: scanMode,
+					Remove:   healDeleteDangling,
+				})
 			if err != nil {
 				if isErrObjectNotFound(err) {
 					// queueing happens across namespace, ignore
@@ -321,11 +330,11 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
 			if version.ModTime.After(tracker.Started) {
 				continue
 			}
-			if err := bgSeq.queueHealTask(healSource{
-				bucket:    bucket,
-				object:    version.Name,
-				versionID: version.VersionID,
-			}, madmin.HealItemObject); err != nil {
+			if _, err := er.HealObject(ctx, bucket, encodedEntryName,
+				version.VersionID, madmin.HealOpts{
+					ScanMode: scanMode,
+					Remove:   healDeleteDangling,
+				}); err != nil {
 				if isErrObjectNotFound(err) {
 					// queueing happens across namespace, ignore
 					// objects that are not found.
@@ -344,7 +353,6 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
 			} else {
 				result = healEntrySuccess(uint64(version.Size))
 			}
-			bgSeq.logHeal(madmin.HealItemObject)

 			if !send(result) {
 				return
@@ -382,7 +390,8 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
 		minDisks:       1,
 		reportNotFound: false,
 		agreed: func(entry metaCacheEntry) {
-			healEntry(actualBucket, entry)
+			jt.Take()
+			go healEntry(actualBucket, entry)
 		},
 		partial: func(entries metaCacheEntries, _ []error) {
 			entry, ok := entries.resolve(&resolver)
@@ -391,10 +400,12 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
 				// proceed to heal nonetheless.
 				entry, _ = entries.firstFound()
 			}
-			healEntry(actualBucket, *entry)
+			jt.Take()
+			go healEntry(actualBucket, *entry)
 		},
 		finished: nil,
 	})
+	jt.Wait() // synchronize all the concurrent heal jobs
 	close(results)
 	if err != nil {
 		// Set this such that when we return this function
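The pattern in this file (take a slot, spawn a goroutine, give the slot back, wait at the end) is the whole contract of the job tracker imported from `github.com/minio/pkg/v2/workers`. A stripped-down usage sketch, with an illustrative loop standing in for the listing callbacks:

```go
package main

import (
	"fmt"

	"github.com/minio/pkg/v2/workers"
)

func main() {
	jt, err := workers.New(4) // at most 4 jobs in flight, like numHealers above
	if err != nil {
		panic(err)
	}
	for i := 0; i < 16; i++ {
		i := i
		jt.Take() // blocks while all 4 slots are busy
		go func() {
			defer jt.Give() // release the slot, as healEntry does
			fmt.Println("healing entry", i)
		}()
	}
	jt.Wait() // block until every spawned job has returned
}
```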

View File

@@ -46,6 +46,7 @@ type DiskInfo struct {
 	FreeInodes uint64
 	Major      uint32
 	Minor      uint32
+	NRRequests uint64
 	FSType     string
 	RootDisk   bool
 	Healing    bool

View File

@@ -14,8 +14,8 @@ func (z *DiskInfo) DecodeMsg(dc *msgp.Reader) (err error) {
 		err = msgp.WrapError(err)
 		return
 	}
-	if zb0001 != 17 {
-		err = msgp.ArrayError{Wanted: 17, Got: zb0001}
+	if zb0001 != 18 {
+		err = msgp.ArrayError{Wanted: 18, Got: zb0001}
 		return
 	}
 	z.Total, err = dc.ReadUint64()
@@ -53,6 +53,11 @@ func (z *DiskInfo) DecodeMsg(dc *msgp.Reader) (err error) {
 		err = msgp.WrapError(err, "Minor")
 		return
 	}
+	z.NRRequests, err = dc.ReadUint64()
+	if err != nil {
+		err = msgp.WrapError(err, "NRRequests")
+		return
+	}
 	z.FSType, err = dc.ReadString()
 	if err != nil {
 		err = msgp.WrapError(err, "FSType")
@@ -108,8 +113,8 @@ func (z *DiskInfo) DecodeMsg(dc *msgp.Reader) (err error) {

 // EncodeMsg implements msgp.Encodable
 func (z *DiskInfo) EncodeMsg(en *msgp.Writer) (err error) {
-	// array header, size 17
-	err = en.Append(0xdc, 0x0, 0x11)
+	// array header, size 18
+	err = en.Append(0xdc, 0x0, 0x12)
 	if err != nil {
 		return
 	}
@@ -148,6 +153,11 @@ func (z *DiskInfo) EncodeMsg(en *msgp.Writer) (err error) {
 		err = msgp.WrapError(err, "Minor")
 		return
 	}
+	err = en.WriteUint64(z.NRRequests)
+	if err != nil {
+		err = msgp.WrapError(err, "NRRequests")
+		return
+	}
 	err = en.WriteString(z.FSType)
 	if err != nil {
 		err = msgp.WrapError(err, "FSType")
@@ -204,8 +214,8 @@ func (z *DiskInfo) EncodeMsg(en *msgp.Writer) (err error) {
 // MarshalMsg implements msgp.Marshaler
 func (z *DiskInfo) MarshalMsg(b []byte) (o []byte, err error) {
 	o = msgp.Require(b, z.Msgsize())
-	// array header, size 17
-	o = append(o, 0xdc, 0x0, 0x11)
+	// array header, size 18
+	o = append(o, 0xdc, 0x0, 0x12)
 	o = msgp.AppendUint64(o, z.Total)
 	o = msgp.AppendUint64(o, z.Free)
 	o = msgp.AppendUint64(o, z.Used)
@@ -213,6 +223,7 @@ func (z *DiskInfo) MarshalMsg(b []byte) (o []byte, err error) {
 	o = msgp.AppendUint64(o, z.FreeInodes)
 	o = msgp.AppendUint32(o, z.Major)
 	o = msgp.AppendUint32(o, z.Minor)
+	o = msgp.AppendUint64(o, z.NRRequests)
 	o = msgp.AppendString(o, z.FSType)
 	o = msgp.AppendBool(o, z.RootDisk)
 	o = msgp.AppendBool(o, z.Healing)
@@ -238,8 +249,8 @@ func (z *DiskInfo) UnmarshalMsg(bts []byte) (o []byte, err error) {
 		err = msgp.WrapError(err)
 		return
 	}
-	if zb0001 != 17 {
-		err = msgp.ArrayError{Wanted: 17, Got: zb0001}
+	if zb0001 != 18 {
+		err = msgp.ArrayError{Wanted: 18, Got: zb0001}
 		return
 	}
 	z.Total, bts, err = msgp.ReadUint64Bytes(bts)
@@ -277,6 +288,11 @@ func (z *DiskInfo) UnmarshalMsg(bts []byte) (o []byte, err error) {
 		err = msgp.WrapError(err, "Minor")
 		return
 	}
+	z.NRRequests, bts, err = msgp.ReadUint64Bytes(bts)
+	if err != nil {
+		err = msgp.WrapError(err, "NRRequests")
+		return
+	}
 	z.FSType, bts, err = msgp.ReadStringBytes(bts)
 	if err != nil {
 		err = msgp.WrapError(err, "FSType")
@@ -333,7 +349,7 @@ func (z *DiskInfo) UnmarshalMsg(bts []byte) (o []byte, err error) {

 // Msgsize returns an upper bound estimate of the number of bytes occupied by the serialized message
 func (z *DiskInfo) Msgsize() (s int) {
-	s = 3 + msgp.Uint64Size + msgp.Uint64Size + msgp.Uint64Size + msgp.Uint64Size + msgp.Uint64Size + msgp.Uint32Size + msgp.Uint32Size + msgp.StringPrefixSize + len(z.FSType) + msgp.BoolSize + msgp.BoolSize + msgp.BoolSize + msgp.StringPrefixSize + len(z.Endpoint) + msgp.StringPrefixSize + len(z.MountPath) + msgp.StringPrefixSize + len(z.ID) + msgp.BoolSize + z.Metrics.Msgsize() + msgp.StringPrefixSize + len(z.Error)
+	s = 3 + msgp.Uint64Size + msgp.Uint64Size + msgp.Uint64Size + msgp.Uint64Size + msgp.Uint64Size + msgp.Uint32Size + msgp.Uint32Size + msgp.Uint64Size + msgp.StringPrefixSize + len(z.FSType) + msgp.BoolSize + msgp.BoolSize + msgp.BoolSize + msgp.StringPrefixSize + len(z.Endpoint) + msgp.StringPrefixSize + len(z.MountPath) + msgp.StringPrefixSize + len(z.ID) + msgp.BoolSize + z.Metrics.Msgsize() + msgp.StringPrefixSize + len(z.Error)
 	return
 }
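Worth noting for reviewers: msgp serializes `DiskInfo` as a fixed-length array rather than a map, so adding `NRRequests` bumps the wire header from 17 elements (`0xdc, 0x00, 0x11`) to 18 (`0xdc, 0x00, 0x12`), and both sides of a connection must be regenerated together. A small in-package round-trip sketch using the generated methods shown above (field values are arbitrary):

```go
// assumed to live in package cmd, next to the generated code
in := DiskInfo{Total: 1 << 40, NRRequests: 256, FSType: "xfs"}

buf, err := in.MarshalMsg(nil) // writes the 18-element array header first
if err != nil {
	panic(err)
}

var out DiskInfo
if _, err := out.UnmarshalMsg(buf); err != nil {
	// a 17-element payload from an older peer fails here with msgp.ArrayError
	panic(err)
}
```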

View File

@@ -114,6 +114,8 @@ type xlStorage struct {
 	formatData []byte

+	nrRequests uint64
+
 	// mutex to prevent concurrent read operations overloading walks.
 	rotational bool
 	walkMu     *sync.Mutex
@@ -244,6 +246,11 @@ func newXLStorage(ep Endpoint, cleanUp bool) (s *xlStorage, err error) {
 		diskIndex: -1,
 	}

+	// Sanitize before setting it
+	if info.NRRequests > 0 {
+		s.nrRequests = info.NRRequests
+	}
+
 	// We stagger listings only on HDDs.
 	if info.Rotational == nil || *info.Rotational {
 		s.rotational = true
@@ -658,6 +665,7 @@ func (s *xlStorage) DiskInfo(_ context.Context, _ bool) (info DiskInfo, err erro
 	dcinfo.UsedInodes = di.Files - di.Ffree
 	dcinfo.FreeInodes = di.Ffree
 	dcinfo.FSType = di.FSType
+	dcinfo.NRRequests = s.nrRequests
 	dcinfo.Rotational = s.rotational
 	diskID, err := s.GetDiskID()
 	// Healing is 'true' when

View File

@@ -273,10 +273,13 @@ Once set the scanner settings are automatically applied without the need for ser
 ### Healing

-Healing is enabled by default. The following configuration settings allow for more staggered delay in terms of healing. The healing system by default adapts to the system speed and pauses up to '1sec' per object when the system has `max_io` number of concurrent requests. It is possible to adjust the `max_sleep` and `max_io` values thereby increasing the healing speed. The delays between each operation of the healer can be adjusted by the `mc admin config set alias/ heal max_sleep=1s` and maximum concurrent requests allowed before we start slowing things down can be configured with `mc admin config set alias/ heal max_io=30` . By default the wait delay is `1sec` beyond 10 concurrent operations. This means the healer will sleep *1 second* at max for each heal operation if there are more than *10* concurrent client requests.
+Healing is enabled by default. The following configuration settings allow for more staggered delay in terms of healing. The healing system by default adapts to the system speed and pauses up to '250ms' per object when the system has `max_io` number of concurrent requests. It is possible to adjust the `max_sleep` and `max_io` values thereby increasing the healing speed. The delays between each operation of the healer can be adjusted by the `mc admin config set alias/ heal max_sleep=1s` and maximum concurrent requests allowed before we start slowing things down can be configured with `mc admin config set alias/ heal max_io=30` . By default the wait delay is `250ms` beyond 100 concurrent operations. This means the healer will sleep *250 milliseconds* at max for each heal operation if there are more than *100* concurrent client requests.

 In most setups this is sufficient to heal the content after drive replacements. Setting `max_sleep` to a *lower* value and setting `max_io` to a *higher* value would make heal go faster.

+Each node is responsible for healing its local drives; each drive will have multiple heal workers, a quarter of the number of CPU cores of the node or a quarter of the drive's configured nr_requests (https://www.kernel.org/doc/Documentation/block/queue-sysfs.txt), whichever is lower. It is also possible to provide a custom number of workers by using this command: `mc admin config set alias/ heal drive_workers=100` .

 ```
 ~ mc admin config set alias/ heal
 KEY:

@@ -286,6 +289,7 @@ ARGS:
 bitrotscan (on|off)       perform bitrot scan on drives when checking objects during scanner
 max_sleep (duration)      maximum sleep duration between objects to slow down heal operation. eg. 2s
 max_io (int)              maximum IO requests allowed between objects to slow down heal operation. eg. 3
+drive_workers (int)       the number of workers per drive to heal a new disk replacement.
 ```

 Example: The following settings will increase the heal operation speed by allowing healing operation to run without delay up to `100` concurrent requests, and the maximum delay between each heal operation is set to `300ms`.
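The concrete command for this example falls outside the hunk shown above; an invocation consistent with the described settings (assumed from the surrounding text, not part of the diff) would be:

```
~ mc admin config set alias/ heal max_sleep=300ms max_io=100
```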

View File

@@ -34,10 +34,12 @@ const (
 	Bitrot  = "bitrotscan"
 	Sleep   = "max_sleep"
 	IOCount = "max_io"
+	DriveWorkers = "drive_workers"

 	EnvBitrot  = "MINIO_HEAL_BITROTSCAN"
 	EnvSleep   = "MINIO_HEAL_MAX_SLEEP"
 	EnvIOCount = "MINIO_HEAL_MAX_IO"
+	EnvDriveWorkers = "MINIO_HEAL_DRIVE_WORKERS"
 )

 var configMutex sync.RWMutex
@@ -51,6 +53,8 @@ type Config struct {
 	Sleep   time.Duration `json:"sleep"`
 	IOCount int           `json:"iocount"`
+	DriveWorkers int `json:"drive_workers"`

 	// Cached value from Bitrot field
 	cache struct {
 		// -1: bitrot enabled, 0: bitrot disabled, > 0: bitrot cycle
@@ -77,6 +81,13 @@ func (opts Config) Clone() (int, time.Duration, string) {
 	return opts.IOCount, opts.Sleep, opts.Bitrot
 }

+// GetWorkers returns the number of workers, -1 is none configured
+func (opts Config) GetWorkers() int {
+	configMutex.RLock()
+	defer configMutex.RUnlock()
+	return opts.DriveWorkers
+}
+
 // Update updates opts with nopts
 func (opts *Config) Update(nopts Config) {
 	configMutex.Lock()
@@ -85,6 +96,7 @@ func (opts *Config) Update(nopts Config) {
 	opts.Bitrot = nopts.Bitrot
 	opts.IOCount = nopts.IOCount
 	opts.Sleep = nopts.Sleep
+	opts.DriveWorkers = nopts.DriveWorkers

 	opts.cache.bitrotCycle, _ = parseBitrotConfig(nopts.Bitrot)
 }
@@ -103,6 +115,10 @@ var DefaultKVS = config.KVS{
 		Key:   IOCount,
 		Value: "100",
 	},
+	config.KV{
+		Key:   DriveWorkers,
+		Value: "",
+	},
 }

 const minimumBitrotCycleInMonths = 1
@@ -154,5 +170,18 @@ func LookupConfig(kvs config.KVS) (cfg Config, err error) {
 	if err != nil {
 		return cfg, fmt.Errorf("'heal:max_io' value invalid: %w", err)
 	}
+	if ws := env.Get(EnvDriveWorkers, kvs.GetWithDefault(DriveWorkers, DefaultKVS)); ws != "" {
+		w, err := strconv.Atoi(ws)
+		if err != nil {
+			return cfg, fmt.Errorf("'heal:drive_workers' value invalid: %w", err)
+		}
+		if w < 1 {
+			return cfg, fmt.Errorf("'heal:drive_workers' value invalid: zero or negative integer unsupported")
+		}
+		cfg.DriveWorkers = w
+	} else {
+		cfg.DriveWorkers = -1
+	}
 	return cfg, nil
 }

View File

@@ -45,5 +45,11 @@ var (
 			Optional:    true,
 			Type:        "int",
 		},
+		config.HelpKV{
+			Key:         DriveWorkers,
+			Description: `the number of workers per drive to heal a new disk replacement` + defaultHelpPostfix(DriveWorkers),
+			Optional:    true,
+			Type:        "int",
+		},
 	}
 )

View File

@@ -37,6 +37,7 @@ type Info struct {
 	Minor      uint32
 	Name       string
 	Rotational *bool
+	NRRequests uint64
 }

 // DevID is the drive major and minor ids

View File

@@ -98,6 +98,7 @@ func GetInfo(path string, firstTime bool) (info Info, err error) {
 		}
 	}
 	if err == nil {
+		info.NRRequests = qst.NRRequests
 		rot := qst.Rotational == 1 // Rotational is '1' if the device is HDD
 		info.Rotational = &rot
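For context on where `qst.NRRequests` ultimately comes from: on Linux the block layer exposes the request-queue depth at `/sys/block/<dev>/queue/nr_requests` (see the queue-sysfs document linked in the README hunk above). A minimal sketch of reading it directly, assuming the block device name is already known; the real `GetInfo` resolves the device from the path's major:minor numbers and parses the queue attributes into `qst`:

```go
package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
)

// readNRRequests reads a block device's request-queue depth,
// e.g. from /sys/block/sda/queue/nr_requests for dev "sda".
func readNRRequests(dev string) (uint64, error) {
	b, err := os.ReadFile("/sys/block/" + dev + "/queue/nr_requests")
	if err != nil {
		return 0, err
	}
	return strconv.ParseUint(strings.TrimSpace(string(b)), 10, 64)
}

func main() {
	n, err := readNRRequests("sda") // hypothetical device name
	if err != nil {
		fmt.Println("skipping:", err)
		return
	}
	fmt.Println("nr_requests:", n)
}
```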
} }