mirror of
https://github.com/minio/minio.git
synced 2025-01-12 07:23:23 -05:00
count metrics properly for any failures during drive heal (#20193)
or via `mc admin heal --set 1 --pool 1`
This commit is contained in:
parent
01a8c09920
commit
a9dc061d84
@ -761,6 +761,15 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
|
||||
return nil
|
||||
}
|
||||
|
||||
countOKDrives := func(drives []madmin.HealDriveInfo) (count int) {
|
||||
for _, drive := range drives {
|
||||
if drive.State == madmin.DriveStateOk {
|
||||
count++
|
||||
}
|
||||
}
|
||||
return count
|
||||
}
|
||||
|
||||
// task queued, now wait for the response.
|
||||
select {
|
||||
case res := <-task.respCh:
|
||||
@ -781,6 +790,11 @@ func (h *healSequence) queueHealTask(source healSource, healType madmin.HealItem
|
||||
if res.err != nil {
|
||||
res.result.Detail = res.err.Error()
|
||||
}
|
||||
if res.result.ParityBlocks > 0 && res.result.DataBlocks > 0 && res.result.DataBlocks > res.result.ParityBlocks {
|
||||
if got := countOKDrives(res.result.After.Drives); got < res.result.ParityBlocks {
|
||||
res.result.Detail = fmt.Sprintf("quorum loss - expected %d minimum, got drive states in OK %d", res.result.ParityBlocks, got)
|
||||
}
|
||||
}
|
||||
return h.pushHealResultItem(res.result)
|
||||
case <-h.ctx.Done():
|
||||
return nil
|
||||
|
@ -150,6 +150,11 @@ type healEntryResult struct {
|
||||
|
||||
// healErasureSet lists and heals all objects in a specific erasure set
|
||||
func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string, tracker *healingTracker) error {
|
||||
bgSeq, found := globalBackgroundHealState.getHealSequenceByToken(bgHealingUUID)
|
||||
if !found {
|
||||
return errors.New("no local healing sequence initialized, unable to heal the drive")
|
||||
}
|
||||
|
||||
scanMode := madmin.HealNormalScan
|
||||
|
||||
// Make sure to copy since `buckets slice`
|
||||
@ -163,11 +168,7 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
|
||||
}
|
||||
|
||||
for _, bucket := range healBuckets {
|
||||
_, err := objAPI.HealBucket(ctx, bucket, madmin.HealOpts{
|
||||
Recreate: true,
|
||||
ScanMode: scanMode,
|
||||
})
|
||||
if err != nil {
|
||||
if err := bgSeq.healBucket(objAPI, bucket, true); err != nil {
|
||||
// Log bucket healing error if any, we shall retry again.
|
||||
healingLogIf(ctx, err)
|
||||
}
|
||||
@ -264,10 +265,7 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
|
||||
tracker.setBucket(bucket)
|
||||
// Heal current bucket again in case if it is failed
|
||||
// in the beginning of erasure set healing
|
||||
if _, err := objAPI.HealBucket(ctx, bucket, madmin.HealOpts{
|
||||
Recreate: true,
|
||||
ScanMode: scanMode,
|
||||
}); err != nil {
|
||||
if err := bgSeq.healBucket(objAPI, bucket, true); err != nil {
|
||||
// Set this such that when we return this function
|
||||
// we let the caller retry this disk again for the
|
||||
// buckets that failed healing.
|
||||
@ -366,6 +364,7 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
|
||||
}
|
||||
return false
|
||||
case results <- result:
|
||||
bgSeq.countScanned(madmin.HealItemObject)
|
||||
return true
|
||||
}
|
||||
}
|
||||
@ -416,8 +415,10 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
|
||||
return
|
||||
}
|
||||
result = healEntryFailure(0)
|
||||
bgSeq.countFailed(madmin.HealItemObject)
|
||||
healingLogIf(ctx, fmt.Errorf("unable to heal object %s/%s: %w", bucket, entry.name, err))
|
||||
} else {
|
||||
bgSeq.countHealed(madmin.HealItemObject)
|
||||
result = healEntrySuccess(uint64(res.ObjectSize))
|
||||
}
|
||||
|
||||
@ -463,8 +464,10 @@ func (er *erasureObjects) healErasureSet(ctx context.Context, buckets []string,
|
||||
}
|
||||
|
||||
if versionHealed {
|
||||
bgSeq.countHealed(madmin.HealItemObject)
|
||||
result = healEntrySuccess(uint64(version.Size))
|
||||
} else {
|
||||
bgSeq.countFailed(madmin.HealItemObject)
|
||||
result = healEntryFailure(uint64(version.Size))
|
||||
if version.VersionID != "" {
|
||||
healingLogIf(ctx, fmt.Errorf("unable to heal object %s/%s-v(%s): %w", bucket, version.Name, version.VersionID, err))
|
||||
|
Loading…
Reference in New Issue
Block a user