heal: Add MRF metrics to background heal API response (#12398)

This commit gathers MRF metrics from 
all nodes in a cluster and return it to the caller. This will show information about the 
number of objects in the MRF queues 
waiting to be healed.
This commit is contained in:
Anis Elleuch
2021-07-16 06:32:06 +01:00
committed by GitHub
parent ead8778305
commit b0b4696a64
12 changed files with 458 additions and 209 deletions

View File

@@ -96,8 +96,6 @@ type erasureSets struct {
disksStorageInfoCache timedValue
mrfMU sync.Mutex
mrfOperations map[healSource]int
lastConnectDisksOpTime time.Time
}
@@ -268,20 +266,11 @@ func (s *erasureSets) connectDisks() {
wg.Wait()
go func() {
idler := time.NewTimer(100 * time.Millisecond)
defer idler.Stop()
for setIndex, justConnected := range setsJustConnected {
if !justConnected {
continue
}
// Send a new set connect event with a timeout
idler.Reset(100 * time.Millisecond)
select {
case s.setReconnectEvent <- setIndex:
case <-idler.C:
}
globalMRFState.newSetReconnected(setIndex, s.poolIndex)
}
}()
}
@@ -375,7 +364,6 @@ func newErasureSets(ctx context.Context, endpoints Endpoints, storageDisks []Sto
setReconnectEvent: make(chan int),
distributionAlgo: format.Erasure.DistributionAlgo,
deploymentID: uuid.MustParse(format.ID),
mrfOperations: make(map[healSource]int),
poolIndex: poolIdx,
}
@@ -446,7 +434,6 @@ func newErasureSets(ctx context.Context, endpoints Endpoints, storageDisks []Sto
nsMutex: mutex,
bp: bp,
bpOld: bpOld,
mrfOpCh: make(chan partialOperation, 10000),
}
}
@@ -466,8 +453,6 @@ func newErasureSets(ctx context.Context, endpoints Endpoints, storageDisks []Sto
// Start the disk monitoring and connect routine.
go s.monitorAndConnectEndpoints(ctx, defaultMonitorConnectEndpointInterval)
go s.maintainMRFList()
go s.healMRFRoutine()
return s, nil
}
@@ -1388,71 +1373,6 @@ func (s *erasureSets) GetObjectTags(ctx context.Context, bucket, object string,
return er.GetObjectTags(ctx, bucket, object, opts)
}
// maintainMRFList gathers the list of successful partial uploads
// from all underlying er.sets and puts them in a global map which
// should not have more than 10000 entries.
func (s *erasureSets) maintainMRFList() {
var agg = make(chan partialOperation, 10000)
for i, er := range s.sets {
go func(c <-chan partialOperation, setIndex int) {
for msg := range c {
msg.failedSet = setIndex
select {
case agg <- msg:
default:
}
}
}(er.mrfOpCh, i)
}
for fOp := range agg {
s.mrfMU.Lock()
if len(s.mrfOperations) > 10000 {
s.mrfMU.Unlock()
continue
}
s.mrfOperations[healSource{
bucket: fOp.bucket,
object: fOp.object,
versionID: fOp.versionID,
opts: &madmin.HealOpts{Remove: true},
}] = fOp.failedSet
s.mrfMU.Unlock()
}
}
// healMRFRoutine monitors new disks connection, sweep the MRF list
// to find objects related to the new disk that needs to be healed.
func (s *erasureSets) healMRFRoutine() {
// Wait until background heal state is initialized
bgSeq := mustGetHealSequence(GlobalContext)
for setIndex := range s.setReconnectEvent {
// Get the list of objects related the er.set
// to which the connected disk belongs.
var mrfOperations []healSource
s.mrfMU.Lock()
for k, v := range s.mrfOperations {
if v == setIndex {
mrfOperations = append(mrfOperations, k)
}
}
s.mrfMU.Unlock()
// Heal objects
for _, u := range mrfOperations {
waitForLowHTTPReq(globalHealConfig.IOCount, globalHealConfig.Sleep)
// Send an object to background heal
bgSeq.sourceCh <- u
s.mrfMU.Lock()
delete(s.mrfOperations, u)
s.mrfMU.Unlock()
}
}
}
// TransitionObject - transition object content to target tier.
func (s *erasureSets) TransitionObject(ctx context.Context, bucket, object string, opts ObjectOptions) error {
return s.getHashedSet(object).TransitionObject(ctx, bucket, object, opts)