mirror of
https://github.com/minio/minio.git
synced 2025-01-11 23:13:23 -05:00
450 lines
12 KiB
Go
450 lines
12 KiB
Go
// Copyright (c) 2015-2021 MinIO, Inc.
|
|
//
|
|
// This file is part of MinIO Object Storage stack
|
|
//
|
|
// This program is free software: you can redistribute it and/or modify
|
|
// it under the terms of the GNU Affero General Public License as published by
|
|
// the Free Software Foundation, either version 3 of the License, or
|
|
// (at your option) any later version.
|
|
//
|
|
// This program is distributed in the hope that it will be useful
|
|
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
// GNU Affero General Public License for more details.
|
|
//
|
|
// You should have received a copy of the GNU Affero General Public License
|
|
// along with this program. If not, see <http://www.gnu.org/licenses/>.
|
|
|
|
package cmd
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"slices"
|
|
"time"
|
|
|
|
"github.com/minio/madmin-go/v3"
|
|
)
|
|
|
|
// commonETags returns the most frequently occurring non-empty etag in etags
// together with the number of times it occurs.
//
// Empty strings are skipped. When etags holds no non-empty entry the result
// is ("", 0). When several etags share the highest count, the
// lexicographically greatest one is returned, so the outcome never depends on
// Go's randomized map iteration order.
func commonETags(etags []string) (etag string, maxima int) {
	occurrences := make(map[string]int, len(etags))

	// Count every non-empty etag.
	for _, e := range etags {
		if e == "" {
			continue
		}
		occurrences[e]++
	}

	// Pick the etag with the highest count. Every stored count is at least
	// one, so the first entry visited always replaces the ("", 0) start value;
	// equal counts are broken by string order to keep the result stable.
	for candidate, count := range occurrences {
		if count > maxima || (count == maxima && candidate > etag) {
			etag, maxima = candidate, count
		}
	}

	return etag, maxima
}
|
|
|
|
// commonTime returns a maximally occurring time from a list of time.
|
|
func commonTimeAndOccurrence(times []time.Time, group time.Duration) (maxTime time.Time, maxima int) {
|
|
timeOccurrenceMap := make(map[int64]int, len(times))
|
|
groupNano := group.Nanoseconds()
|
|
// Ignore the uuid sentinel and count the rest.
|
|
for _, t := range times {
|
|
if t.Equal(timeSentinel) || t.IsZero() {
|
|
continue
|
|
}
|
|
nano := t.UnixNano()
|
|
if group > 0 {
|
|
for k := range timeOccurrenceMap {
|
|
if k == nano {
|
|
// We add to ourself later
|
|
continue
|
|
}
|
|
diff := k - nano
|
|
if diff < 0 {
|
|
diff = -diff
|
|
}
|
|
// We are within the limit
|
|
if diff < groupNano {
|
|
timeOccurrenceMap[k]++
|
|
}
|
|
}
|
|
}
|
|
// Add ourself...
|
|
timeOccurrenceMap[nano]++
|
|
}
|
|
|
|
maxima = 0 // Counter for remembering max occurrence of elements.
|
|
latest := int64(0)
|
|
|
|
// Find the common cardinality from previously collected
|
|
// occurrences of elements.
|
|
for nano, count := range timeOccurrenceMap {
|
|
if count < maxima {
|
|
continue
|
|
}
|
|
|
|
// We are at or above maxima
|
|
if count > maxima || nano > latest {
|
|
maxima = count
|
|
latest = nano
|
|
}
|
|
}
|
|
|
|
// Return the collected common max time, with maxima
|
|
return time.Unix(0, latest).UTC(), maxima
|
|
}
|
|
|
|
// commonTime returns a maximally occurring time from a list of time if it
|
|
// occurs >= quorum, else return timeSentinel
|
|
func commonTime(modTimes []time.Time, quorum int) time.Time {
|
|
if modTime, count := commonTimeAndOccurrence(modTimes, 0); count >= quorum {
|
|
return modTime
|
|
}
|
|
|
|
return timeSentinel
|
|
}
|
|
|
|
func commonETag(etags []string, quorum int) string {
|
|
if etag, count := commonETags(etags); count >= quorum {
|
|
return etag
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// Sentinel modification times anchored at the beginning of unix time.
var (
	// timeSentinel is the unix epoch itself, in UTC.
	timeSentinel = time.Unix(0, 0).UTC()

	// timeSentinel1970 is one nanosecond past the unix epoch, in UTC.
	// 1970 used for special cases when xlmeta.version == 0
	timeSentinel1970 = time.Unix(0, 1).UTC()
)
|
|
|
|
// Boot modTimes up to disk count, setting the value to time sentinel.
|
|
func bootModtimes(diskCount int) []time.Time {
|
|
modTimes := make([]time.Time, diskCount)
|
|
// Boots up all the modtimes.
|
|
for i := range modTimes {
|
|
modTimes[i] = timeSentinel
|
|
}
|
|
return modTimes
|
|
}
|
|
|
|
func listObjectETags(partsMetadata []FileInfo, errs []error, quorum int) (etags []string) {
|
|
etags = make([]string, len(partsMetadata))
|
|
vidMap := map[string]int{}
|
|
for index, metadata := range partsMetadata {
|
|
if errs[index] != nil {
|
|
continue
|
|
}
|
|
vid := metadata.VersionID
|
|
if metadata.VersionID == "" {
|
|
vid = nullVersionID
|
|
}
|
|
vidMap[vid]++
|
|
etags[index] = metadata.Metadata["etag"]
|
|
}
|
|
|
|
for _, count := range vidMap {
|
|
// do we have enough common versions
|
|
// that have enough quorum to satisfy
|
|
// the etag.
|
|
if count >= quorum {
|
|
return etags
|
|
}
|
|
}
|
|
|
|
return make([]string, len(partsMetadata))
|
|
}
|
|
|
|
// Extracts list of times from FileInfo slice and returns, skips
|
|
// slice elements which have errors.
|
|
func listObjectModtimes(partsMetadata []FileInfo, errs []error) (modTimes []time.Time) {
|
|
modTimes = bootModtimes(len(partsMetadata))
|
|
for index, metadata := range partsMetadata {
|
|
if errs[index] != nil {
|
|
continue
|
|
}
|
|
// Once the file is found, save the uuid saved on disk.
|
|
modTimes[index] = metadata.ModTime
|
|
}
|
|
return modTimes
|
|
}
|
|
|
|
func filterOnlineDisksInplace(fi FileInfo, partsMetadata []FileInfo, onlineDisks []StorageAPI) {
|
|
for i, meta := range partsMetadata {
|
|
if fi.XLV1 == meta.XLV1 {
|
|
continue
|
|
}
|
|
onlineDisks[i] = nil
|
|
}
|
|
}
|
|
|
|
// Notes:
|
|
// There are 5 possible states a disk could be in,
|
|
// 1. __online__ - has the latest copy of xl.meta - returned by listOnlineDisks
|
|
//
|
|
// 2. __offline__ - err == errDiskNotFound
|
|
//
|
|
// 3. __availableWithParts__ - has the latest copy of xl.meta and has all
|
|
// parts with checksums matching; returned by disksWithAllParts
|
|
//
|
|
// 4. __outdated__ - returned by outDatedDisk, provided []StorageAPI
|
|
// returned by disksWithAllParts is passed for latestDisks.
|
|
// - has an old copy of xl.meta
|
|
// - doesn't have xl.meta (errFileNotFound)
|
|
// - has the latest xl.meta but one or more parts are corrupt
|
|
//
|
|
// 5. __missingParts__ - has the latest copy of xl.meta but has some parts
|
|
// missing. This is identified separately since this may need manual
|
|
// inspection to understand the root cause. E.g, this could be due to
|
|
// backend filesystem corruption.
|
|
|
|
// listOnlineDisks - returns
|
|
// - a slice of disks where disk having 'older' xl.meta (or nothing)
|
|
// are set to nil.
|
|
// - latest (in time) of the maximally occurring modTime(s), which has at least quorum occurrences.
|
|
func listOnlineDisks(disks []StorageAPI, partsMetadata []FileInfo, errs []error, quorum int) (onlineDisks []StorageAPI, modTime time.Time, etag string) {
|
|
onlineDisks = make([]StorageAPI, len(disks))
|
|
|
|
// List all the file commit ids from parts metadata.
|
|
modTimes := listObjectModtimes(partsMetadata, errs)
|
|
|
|
// Reduce list of UUIDs to a single common value.
|
|
modTime = commonTime(modTimes, quorum)
|
|
|
|
if modTime.IsZero() || modTime.Equal(timeSentinel) {
|
|
etags := listObjectETags(partsMetadata, errs, quorum)
|
|
|
|
etag = commonETag(etags, quorum)
|
|
|
|
if etag != "" { // allow this fallback only if a non-empty etag is found.
|
|
for index, e := range etags {
|
|
if partsMetadata[index].IsValid() && e == etag {
|
|
onlineDisks[index] = disks[index]
|
|
} else {
|
|
onlineDisks[index] = nil
|
|
}
|
|
}
|
|
return onlineDisks, modTime, etag
|
|
}
|
|
}
|
|
|
|
// Create a new online disks slice, which have common uuid.
|
|
for index, t := range modTimes {
|
|
if partsMetadata[index].IsValid() && t.Equal(modTime) {
|
|
onlineDisks[index] = disks[index]
|
|
} else {
|
|
onlineDisks[index] = nil
|
|
}
|
|
}
|
|
|
|
return onlineDisks, modTime, ""
|
|
}
|
|
|
|
// Convert verify or check parts returned error to integer representation
|
|
func convPartErrToInt(err error) int {
|
|
err = unwrapAll(err)
|
|
switch err {
|
|
case nil:
|
|
return checkPartSuccess
|
|
case errFileNotFound, errFileVersionNotFound:
|
|
return checkPartFileNotFound
|
|
case errFileCorrupt:
|
|
return checkPartFileCorrupt
|
|
case errVolumeNotFound:
|
|
return checkPartVolumeNotFound
|
|
case errDiskNotFound:
|
|
return checkPartDiskNotFound
|
|
default:
|
|
return checkPartUnknown
|
|
}
|
|
}
|
|
|
|
func partNeedsHealing(partErrs []int) bool {
|
|
return slices.IndexFunc(partErrs, func(i int) bool { return i != checkPartSuccess && i != checkPartUnknown }) > -1
|
|
}
|
|
|
|
func hasPartErr(partErrs []int) bool {
|
|
return slices.IndexFunc(partErrs, func(i int) bool { return i != checkPartSuccess }) > -1
|
|
}
|
|
|
|
// disksWithAllParts - This function needs to be called with
|
|
// []StorageAPI returned by listOnlineDisks. Returns,
|
|
//
|
|
// - disks which have all parts specified in the latest xl.meta.
|
|
//
|
|
// - slice of errors about the state of data files on disk - can have
|
|
// a not-found error or a hash-mismatch error.
|
|
func disksWithAllParts(ctx context.Context, onlineDisks []StorageAPI, partsMetadata []FileInfo,
|
|
errs []error, latestMeta FileInfo, bucket, object string,
|
|
scanMode madmin.HealScanMode,
|
|
) (availableDisks []StorageAPI, dataErrsByDisk map[int][]int, dataErrsByPart map[int][]int) {
|
|
availableDisks = make([]StorageAPI, len(onlineDisks))
|
|
|
|
dataErrsByDisk = make(map[int][]int, len(onlineDisks))
|
|
for i := range onlineDisks {
|
|
dataErrsByDisk[i] = make([]int, len(latestMeta.Parts))
|
|
}
|
|
|
|
dataErrsByPart = make(map[int][]int, len(latestMeta.Parts))
|
|
for i := range latestMeta.Parts {
|
|
dataErrsByPart[i] = make([]int, len(onlineDisks))
|
|
}
|
|
|
|
inconsistent := 0
|
|
for i, meta := range partsMetadata {
|
|
if !meta.IsValid() {
|
|
// Since for majority of the cases erasure.Index matches with erasure.Distribution we can
|
|
// consider the offline disks as consistent.
|
|
continue
|
|
}
|
|
if !meta.Deleted {
|
|
if len(meta.Erasure.Distribution) != len(onlineDisks) {
|
|
// Erasure distribution seems to have lesser
|
|
// number of items than number of online disks.
|
|
inconsistent++
|
|
continue
|
|
}
|
|
if meta.Erasure.Distribution[i] != meta.Erasure.Index {
|
|
// Mismatch indexes with distribution order
|
|
inconsistent++
|
|
}
|
|
}
|
|
}
|
|
|
|
erasureDistributionReliable := true
|
|
if inconsistent > len(partsMetadata)/2 {
|
|
// If there are too many inconsistent files, then we can't trust erasure.Distribution (most likely
|
|
// because of bugs found in CopyObject/PutObjectTags) https://github.com/minio/minio/pull/10772
|
|
erasureDistributionReliable = false
|
|
}
|
|
|
|
metaErrs := make([]error, len(errs))
|
|
|
|
for i, onlineDisk := range onlineDisks {
|
|
if errs[i] != nil {
|
|
metaErrs[i] = errs[i]
|
|
continue
|
|
}
|
|
if onlineDisk == OfflineDisk {
|
|
metaErrs[i] = errDiskNotFound
|
|
continue
|
|
}
|
|
|
|
meta := partsMetadata[i]
|
|
if !meta.ModTime.Equal(latestMeta.ModTime) || meta.DataDir != latestMeta.DataDir {
|
|
metaErrs[i] = errFileCorrupt
|
|
partsMetadata[i] = FileInfo{}
|
|
continue
|
|
}
|
|
|
|
if erasureDistributionReliable {
|
|
if !meta.IsValid() {
|
|
partsMetadata[i] = FileInfo{}
|
|
metaErrs[i] = errFileCorrupt
|
|
continue
|
|
}
|
|
|
|
if !meta.Deleted {
|
|
if len(meta.Erasure.Distribution) != len(onlineDisks) {
|
|
// Erasure distribution is not the same as onlineDisks
|
|
// attempt a fix if possible, assuming other entries
|
|
// might have the right erasure distribution.
|
|
partsMetadata[i] = FileInfo{}
|
|
metaErrs[i] = errFileCorrupt
|
|
continue
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// Copy meta errors to part errors
|
|
for i, err := range metaErrs {
|
|
if err != nil {
|
|
partErr := convPartErrToInt(err)
|
|
for p := range latestMeta.Parts {
|
|
dataErrsByPart[p][i] = partErr
|
|
}
|
|
}
|
|
}
|
|
|
|
for i, onlineDisk := range onlineDisks {
|
|
if metaErrs[i] != nil {
|
|
continue
|
|
}
|
|
meta := partsMetadata[i]
|
|
|
|
if meta.Deleted || meta.IsRemote() {
|
|
continue
|
|
}
|
|
|
|
// Always check data, if we got it.
|
|
if (len(meta.Data) > 0 || meta.Size == 0) && len(meta.Parts) > 0 {
|
|
checksumInfo := meta.Erasure.GetChecksumInfo(meta.Parts[0].Number)
|
|
verifyErr := bitrotVerify(bytes.NewReader(meta.Data),
|
|
int64(len(meta.Data)),
|
|
meta.Erasure.ShardFileSize(meta.Size),
|
|
checksumInfo.Algorithm,
|
|
checksumInfo.Hash, meta.Erasure.ShardSize())
|
|
dataErrsByPart[0][i] = convPartErrToInt(verifyErr)
|
|
continue
|
|
}
|
|
|
|
var (
|
|
verifyErr error
|
|
verifyResp *CheckPartsResp
|
|
)
|
|
|
|
meta.DataDir = latestMeta.DataDir
|
|
switch scanMode {
|
|
case madmin.HealDeepScan:
|
|
// disk has a valid xl.meta but may not have all the
|
|
// parts. This is considered an outdated disk, since
|
|
// it needs healing too.
|
|
verifyResp, verifyErr = onlineDisk.VerifyFile(ctx, bucket, object, meta)
|
|
default:
|
|
verifyResp, verifyErr = onlineDisk.CheckParts(ctx, bucket, object, meta)
|
|
}
|
|
|
|
for p := range latestMeta.Parts {
|
|
if verifyErr != nil {
|
|
dataErrsByPart[p][i] = convPartErrToInt(verifyErr)
|
|
} else {
|
|
dataErrsByPart[p][i] = verifyResp.Results[p]
|
|
}
|
|
}
|
|
}
|
|
|
|
// Build dataErrs by disk from dataErrs by part
|
|
for part, disks := range dataErrsByPart {
|
|
for disk := range disks {
|
|
dataErrsByDisk[disk][part] = dataErrsByPart[part][disk]
|
|
}
|
|
}
|
|
|
|
for i, onlineDisk := range onlineDisks {
|
|
if metaErrs[i] == nil && !hasPartErr(dataErrsByDisk[i]) {
|
|
// All parts verified, mark it as all data available.
|
|
availableDisks[i] = onlineDisk
|
|
} else {
|
|
// upon errors just make that disk's fileinfo invalid
|
|
partsMetadata[i] = FileInfo{}
|
|
}
|
|
}
|
|
|
|
return
|
|
}
|