mirror of
https://github.com/minio/minio.git
synced 2025-01-16 09:13:16 -05:00
16f8cf1c52
If one object has many parts where all parts are readable but some parts are missing from some drives, this object can be sometimes un-healable, which is wrong. This commit will avoid reading from drives that have missing, corrupted or outdated xl.meta. It will also check if any part is unreadable to avoid healing in that case.
446 lines
12 KiB
Go
446 lines
12 KiB
Go
// Copyright (c) 2015-2021 MinIO, Inc.
|
|
//
|
|
// This file is part of MinIO Object Storage stack
|
|
//
|
|
// This program is free software: you can redistribute it and/or modify
|
|
// it under the terms of the GNU Affero General Public License as published by
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU Affero General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU Affero General Public License
|
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
package cmd
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"slices"
|
|
"time"
|
|
|
|
"github.com/minio/madmin-go/v3"
|
|
)
|
|
|
|
// commonETags returns the most frequently occurring non-empty etag in etags
// together with the number of times it occurs.
//
// Empty strings are skipped. When nothing is left to count the result is
// ("", 0). If several etags share the highest count, the one whose running
// count reached that value first while scanning etags in order is returned,
// so the answer never depends on map iteration order.
func commonETags(etags []string) (etag string, maxima int) {
	occurrences := make(map[string]int, len(etags))

	for _, candidate := range etags {
		if candidate == "" {
			// No etag recorded for this entry.
			continue
		}
		occurrences[candidate]++

		// Counts only ever grow, so tracking the maximum while counting
		// yields an etag with the highest final count.
		if n := occurrences[candidate]; n > maxima {
			etag, maxima = candidate, n
		}
	}

	return etag, maxima
}
|
|
|
|
// commonTime returns a maximally occurring time from a list of time.
|
|
func commonTimeAndOccurrence(times []time.Time, group time.Duration) (maxTime time.Time, maxima int) {
|
|
timeOccurrenceMap := make(map[int64]int, len(times))
|
|
groupNano := group.Nanoseconds()
|
|
// Ignore the uuid sentinel and count the rest.
|
|
for _, t := range times {
|
|
if t.Equal(timeSentinel) || t.IsZero() {
|
|
continue
|
|
}
|
|
nano := t.UnixNano()
|
|
if group > 0 {
|
|
for k := range timeOccurrenceMap {
|
|
if k == nano {
|
|
// We add to ourself later
|
|
continue
|
|
}
|
|
diff := k - nano
|
|
if diff < 0 {
|
|
diff = -diff
|
|
}
|
|
// We are within the limit
|
|
if diff < groupNano {
|
|
timeOccurrenceMap[k]++
|
|
}
|
|
}
|
|
}
|
|
// Add ourself...
|
|
timeOccurrenceMap[nano]++
|
|
}
|
|
|
|
maxima = 0 // Counter for remembering max occurrence of elements.
|
|
latest := int64(0)
|
|
|
|
// Find the common cardinality from previously collected
|
|
// occurrences of elements.
|
|
for nano, count := range timeOccurrenceMap {
|
|
if count < maxima {
|
|
continue
|
|
}
|
|
|
|
// We are at or above maxima
|
|
if count > maxima || nano > latest {
|
|
maxima = count
|
|
latest = nano
|
|
}
|
|
}
|
|
|
|
// Return the collected common max time, with maxima
|
|
return time.Unix(0, latest).UTC(), maxima
|
|
}
|
|
|
|
// commonTime returns a maximally occurring time from a list of time if it
|
|
// occurs >= quorum, else return timeSentinel
|
|
func commonTime(modTimes []time.Time, quorum int) time.Time {
|
|
if modTime, count := commonTimeAndOccurrence(modTimes, 0); count >= quorum {
|
|
return modTime
|
|
}
|
|
|
|
return timeSentinel
|
|
}
|
|
|
|
func commonETag(etags []string, quorum int) string {
|
|
if etag, count := commonETags(etags); count >= quorum {
|
|
return etag
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// Sentinel timestamps: the start of Unix time is used as a placeholder
// value rather than as a real modification time.
var (
	// timeSentinel is the Unix epoch, in UTC.
	timeSentinel = time.Unix(0, 0).UTC()

	// timeSentinel1970 is one nanosecond past the Unix epoch, in UTC. It is
	// used for special cases when xlmeta.version == 0.
	timeSentinel1970 = time.Unix(0, 1).UTC()
)
|
|
|
|
// Boot modTimes up to disk count, setting the value to time sentinel.
|
|
func bootModtimes(diskCount int) []time.Time {
|
|
modTimes := make([]time.Time, diskCount)
|
|
// Boots up all the modtimes.
|
|
for i := range modTimes {
|
|
modTimes[i] = timeSentinel
|
|
}
|
|
return modTimes
|
|
}
|
|
|
|
func listObjectETags(partsMetadata []FileInfo, errs []error, quorum int) (etags []string) {
|
|
etags = make([]string, len(partsMetadata))
|
|
vidMap := map[string]int{}
|
|
for index, metadata := range partsMetadata {
|
|
if errs[index] != nil {
|
|
continue
|
|
}
|
|
vid := metadata.VersionID
|
|
if metadata.VersionID == "" {
|
|
vid = nullVersionID
|
|
}
|
|
vidMap[vid]++
|
|
etags[index] = metadata.Metadata["etag"]
|
|
}
|
|
|
|
for _, count := range vidMap {
|
|
// do we have enough common versions
|
|
// that have enough quorum to satisfy
|
|
// the etag.
|
|
if count >= quorum {
|
|
return etags
|
|
}
|
|
}
|
|
|
|
return make([]string, len(partsMetadata))
|
|
}
|
|
|
|
// Extracts list of times from FileInfo slice and returns, skips
|
|
// slice elements which have errors.
|
|
func listObjectModtimes(partsMetadata []FileInfo, errs []error) (modTimes []time.Time) {
|
|
modTimes = bootModtimes(len(partsMetadata))
|
|
for index, metadata := range partsMetadata {
|
|
if errs[index] != nil {
|
|
continue
|
|
}
|
|
// Once the file is found, save the uuid saved on disk.
|
|
modTimes[index] = metadata.ModTime
|
|
}
|
|
return modTimes
|
|
}
|
|
|
|
func filterOnlineDisksInplace(fi FileInfo, partsMetadata []FileInfo, onlineDisks []StorageAPI) {
|
|
for i, meta := range partsMetadata {
|
|
if fi.XLV1 == meta.XLV1 {
|
|
continue
|
|
}
|
|
onlineDisks[i] = nil
|
|
}
|
|
}
|
|
|
|
// Notes:
|
|
// There are 5 possible states a disk could be in,
|
|
// 1. __online__ - has the latest copy of xl.meta - returned by listOnlineDisks
|
|
//
|
|
// 2. __offline__ - err == errDiskNotFound
|
|
//
|
|
// 3. __availableWithParts__ - has the latest copy of xl.meta and has all
|
|
// parts with checksums matching; returned by disksWithAllParts
|
|
//
|
|
// 4. __outdated__ - returned by outDatedDisk, provided []StorageAPI
|
|
// returned by disksWithAllParts is passed for latestDisks.
|
|
// - has an old copy of xl.meta
|
|
// - doesn't have xl.meta (errFileNotFound)
|
|
// - has the latest xl.meta but one or more parts are corrupt
|
|
//
|
|
// 5. __missingParts__ - has the latest copy of xl.meta but has some parts
|
|
// missing. This is identified separately since this may need manual
|
|
// inspection to understand the root cause. E.g, this could be due to
|
|
// backend filesystem corruption.
|
|
|
|
// listOnlineDisks - returns
|
|
// - a slice of disks where disk having 'older' xl.meta (or nothing)
|
|
// are set to nil.
|
|
// - latest (in time) of the maximally occurring modTime(s), which has at least quorum occurrences.
|
|
func listOnlineDisks(disks []StorageAPI, partsMetadata []FileInfo, errs []error, quorum int) (onlineDisks []StorageAPI, modTime time.Time, etag string) {
|
|
onlineDisks = make([]StorageAPI, len(disks))
|
|
|
|
// List all the file commit ids from parts metadata.
|
|
modTimes := listObjectModtimes(partsMetadata, errs)
|
|
|
|
// Reduce list of UUIDs to a single common value.
|
|
modTime = commonTime(modTimes, quorum)
|
|
|
|
if modTime.IsZero() || modTime.Equal(timeSentinel) {
|
|
etags := listObjectETags(partsMetadata, errs, quorum)
|
|
|
|
etag = commonETag(etags, quorum)
|
|
|
|
if etag != "" { // allow this fallback only if a non-empty etag is found.
|
|
for index, e := range etags {
|
|
if partsMetadata[index].IsValid() && e == etag {
|
|
onlineDisks[index] = disks[index]
|
|
} else {
|
|
onlineDisks[index] = nil
|
|
}
|
|
}
|
|
return onlineDisks, modTime, etag
|
|
}
|
|
}
|
|
|
|
// Create a new online disks slice, which have common uuid.
|
|
for index, t := range modTimes {
|
|
if partsMetadata[index].IsValid() && t.Equal(modTime) {
|
|
onlineDisks[index] = disks[index]
|
|
} else {
|
|
onlineDisks[index] = nil
|
|
}
|
|
}
|
|
|
|
return onlineDisks, modTime, ""
|
|
}
|
|
|
|
// Convert verify or check parts returned error to integer representation
|
|
func convPartErrToInt(err error) int {
|
|
err = unwrapAll(err)
|
|
switch err {
|
|
case nil:
|
|
return checkPartSuccess
|
|
case errFileNotFound, errFileVersionNotFound:
|
|
return checkPartFileNotFound
|
|
case errFileCorrupt:
|
|
return checkPartFileCorrupt
|
|
case errVolumeNotFound:
|
|
return checkPartVolumeNotFound
|
|
case errDiskNotFound:
|
|
return checkPartDiskNotFound
|
|
default:
|
|
return checkPartUnknown
|
|
}
|
|
}
|
|
|
|
func partNeedsHealing(partErrs []int) bool {
|
|
return slices.IndexFunc(partErrs, func(i int) bool { return i != checkPartSuccess && i != checkPartUnknown }) > -1
|
|
}
|
|
|
|
func countPartNotSuccess(partErrs []int) (c int) {
|
|
for _, pe := range partErrs {
|
|
if pe != checkPartSuccess {
|
|
c++
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
// checkObjectWithAllParts sets partsMetadata and onlineDisks when xl.meta is inexistant/corrupted or outdated
|
|
// it also checks if the status of each part (corrupted, missing, ok) in each drive
|
|
func checkObjectWithAllParts(ctx context.Context, onlineDisks []StorageAPI, partsMetadata []FileInfo,
|
|
errs []error, latestMeta FileInfo, filterByETag bool, bucket, object string,
|
|
scanMode madmin.HealScanMode,
|
|
) (dataErrsByDisk map[int][]int, dataErrsByPart map[int][]int) {
|
|
dataErrsByDisk = make(map[int][]int, len(onlineDisks))
|
|
for i := range onlineDisks {
|
|
dataErrsByDisk[i] = make([]int, len(latestMeta.Parts))
|
|
}
|
|
|
|
dataErrsByPart = make(map[int][]int, len(latestMeta.Parts))
|
|
for i := range latestMeta.Parts {
|
|
dataErrsByPart[i] = make([]int, len(onlineDisks))
|
|
}
|
|
|
|
inconsistent := 0
|
|
for i, meta := range partsMetadata {
|
|
if !meta.IsValid() {
|
|
// Since for majority of the cases erasure.Index matches with erasure.Distribution we can
|
|
// consider the offline disks as consistent.
|
|
continue
|
|
}
|
|
if !meta.Deleted {
|
|
if len(meta.Erasure.Distribution) != len(onlineDisks) {
|
|
// Erasure distribution seems to have lesser
|
|
// number of items than number of online disks.
|
|
inconsistent++
|
|
continue
|
|
}
|
|
if meta.Erasure.Distribution[i] != meta.Erasure.Index {
|
|
// Mismatch indexes with distribution order
|
|
inconsistent++
|
|
}
|
|
}
|
|
}
|
|
|
|
erasureDistributionReliable := true
|
|
if inconsistent > len(partsMetadata)/2 {
|
|
// If there are too many inconsistent files, then we can't trust erasure.Distribution (most likely
|
|
// because of bugs found in CopyObject/PutObjectTags) https://github.com/minio/minio/pull/10772
|
|
erasureDistributionReliable = false
|
|
}
|
|
|
|
metaErrs := make([]error, len(errs))
|
|
|
|
for i := range onlineDisks {
|
|
if errs[i] != nil {
|
|
metaErrs[i] = errs[i]
|
|
continue
|
|
}
|
|
if onlineDisks[i] == OfflineDisk {
|
|
metaErrs[i] = errDiskNotFound
|
|
continue
|
|
}
|
|
|
|
meta := partsMetadata[i]
|
|
corrupted := false
|
|
if filterByETag {
|
|
corrupted = meta.Metadata["etag"] != latestMeta.Metadata["etag"]
|
|
} else {
|
|
corrupted = !meta.ModTime.Equal(latestMeta.ModTime) || meta.DataDir != latestMeta.DataDir
|
|
}
|
|
|
|
if corrupted {
|
|
metaErrs[i] = errFileCorrupt
|
|
partsMetadata[i] = FileInfo{}
|
|
onlineDisks[i] = nil
|
|
continue
|
|
}
|
|
|
|
if erasureDistributionReliable {
|
|
if !meta.IsValid() {
|
|
partsMetadata[i] = FileInfo{}
|
|
metaErrs[i] = errFileCorrupt
|
|
onlineDisks[i] = nil
|
|
continue
|
|
}
|
|
|
|
if !meta.Deleted {
|
|
if len(meta.Erasure.Distribution) != len(onlineDisks) {
|
|
// Erasure distribution is not the same as onlineDisks
|
|
// attempt a fix if possible, assuming other entries
|
|
// might have the right erasure distribution.
|
|
partsMetadata[i] = FileInfo{}
|
|
metaErrs[i] = errFileCorrupt
|
|
onlineDisks[i] = nil
|
|
continue
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Copy meta errors to part errors
|
|
for i, err := range metaErrs {
|
|
if err != nil {
|
|
partErr := convPartErrToInt(err)
|
|
for p := range latestMeta.Parts {
|
|
dataErrsByPart[p][i] = partErr
|
|
}
|
|
}
|
|
}
|
|
|
|
for i, onlineDisk := range onlineDisks {
|
|
if metaErrs[i] != nil {
|
|
continue
|
|
}
|
|
|
|
meta := partsMetadata[i]
|
|
if meta.Deleted || meta.IsRemote() {
|
|
continue
|
|
}
|
|
|
|
// Always check data, if we got it.
|
|
if (len(meta.Data) > 0 || meta.Size == 0) && len(meta.Parts) > 0 {
|
|
checksumInfo := meta.Erasure.GetChecksumInfo(meta.Parts[0].Number)
|
|
verifyErr := bitrotVerify(bytes.NewReader(meta.Data),
|
|
int64(len(meta.Data)),
|
|
meta.Erasure.ShardFileSize(meta.Size),
|
|
checksumInfo.Algorithm,
|
|
checksumInfo.Hash, meta.Erasure.ShardSize())
|
|
dataErrsByPart[0][i] = convPartErrToInt(verifyErr)
|
|
continue
|
|
}
|
|
|
|
var (
|
|
verifyErr error
|
|
verifyResp *CheckPartsResp
|
|
)
|
|
|
|
switch scanMode {
|
|
case madmin.HealDeepScan:
|
|
// disk has a valid xl.meta but may not have all the
|
|
// parts. This is considered an outdated disk, since
|
|
// it needs healing too.
|
|
verifyResp, verifyErr = onlineDisk.VerifyFile(ctx, bucket, object, meta)
|
|
default:
|
|
verifyResp, verifyErr = onlineDisk.CheckParts(ctx, bucket, object, meta)
|
|
}
|
|
|
|
for p := range latestMeta.Parts {
|
|
if verifyErr != nil {
|
|
dataErrsByPart[p][i] = convPartErrToInt(verifyErr)
|
|
} else {
|
|
dataErrsByPart[p][i] = verifyResp.Results[p]
|
|
}
|
|
}
|
|
}
|
|
|
|
// Build dataErrs by disk from dataErrs by part
|
|
for part, disks := range dataErrsByPart {
|
|
for disk := range disks {
|
|
dataErrsByDisk[disk][part] = dataErrsByPart[part][disk]
|
|
}
|
|
}
|
|
return
|
|
}
|