minio/cmd/xl-v1-healing.go

/*
* Minio Cloud Storage, (C) 2016 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cmd
import (
"fmt"
"path"
"sort"
"sync"
"github.com/minio/minio/pkg/errors"
"github.com/minio/minio/pkg/madmin"
)
// healFormatXL - heals a missing `format.json` on fresh or corrupted
// disks (disks that have lost format.json but still hold erasure-coded data).
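// The flow, in brief: load every format.json, run a generic sanity
// check, then pick a strategy based on the error summary - initialize a
// completely fresh set of disks, heal corrupted formats from the healthy
// copies, or heal freshly added (unformatted) disks.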
func healFormatXL(storageDisks []StorageAPI, dryRun bool) (res madmin.HealResultItem,
err error) {
// Attempt to load all `format.json`.
formatConfigs, sErrs := loadAllFormats(storageDisks)
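// sErrs[i] holds the per-disk load error for storageDisks[i]; its
// indices line up with formatConfigs so each drive's state can be
// reported below.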
// Generic format check:
// - return an error if there is no quorum of formats
// - always return an error if any disk is not recognized
if err = genericFormatCheckXL(formatConfigs, sErrs); err != nil {
return res, err
}
// Prepare heal-result
res = madmin.HealResultItem{
Type: madmin.HealItemMetadata,
Detail: "disk-format",
DiskCount: len(storageDisks),
}
res.InitDrives()
// Record the state of drives with existing (i.e. ok) formats in the
// result, and mark the drives that need to be healed.
for i, format := range formatConfigs {
drive := globalEndpoints.GetString(i)
switch {
case format != nil:
res.DriveInfo.Before[drive] = madmin.DriveStateOk
case sErrs[i] == errCorruptedFormat:
res.DriveInfo.Before[drive] = madmin.DriveStateCorrupt
case sErrs[i] == errUnformattedDisk:
res.DriveInfo.Before[drive] = madmin.DriveStateMissing
default:
res.DriveInfo.Before[drive] = madmin.DriveStateOffline
}
}
// Copy "after" drive state too
for k, v := range res.DriveInfo.Before {
res.DriveInfo.After[k] = v
}
numDisks := len(storageDisks)
_, unformattedDiskCount, diskNotFoundCount,
corruptedFormatCount, otherErrCount := formatErrsSummary(sErrs)
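// The summary counts drive the strategy below: every disk unformatted
// means a brand-new setup, any unreachable disk aborts the heal,
// corrupted formats are repaired from the healthy copies, and remaining
// unformatted disks are treated as fresh replacements.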
switch {
case unformattedDiskCount == numDisks:
// all unformatted.
if !dryRun {
err = initFormatXL(storageDisks)
if err != nil {
return res, err
}
for i := 0; i < len(storageDisks); i++ {
drive := globalEndpoints.GetString(i)
res.DriveInfo.After[drive] = madmin.DriveStateOk
}
}
return res, nil
case diskNotFoundCount > 0:
return res, fmt.Errorf("cannot proceed with heal as %s",
errSomeDiskOffline)
case otherErrCount > 0:
return res, fmt.Errorf("cannot proceed with heal as some disks had unhandled errors")
case corruptedFormatCount > 0:
// heal corrupted disks
err = healFormatXLCorruptedDisks(storageDisks, formatConfigs,
dryRun)
if err != nil {
return res, err
}
// success
if !dryRun {
for i := 0; i < len(storageDisks); i++ {
drive := globalEndpoints.GetString(i)
res.DriveInfo.After[drive] = madmin.DriveStateOk
}
}
return res, nil
case unformattedDiskCount > 0:
// heal unformatted disks
err = healFormatXLFreshDisks(storageDisks, formatConfigs,
dryRun)
if err != nil {
return res, err
}
// success
if !dryRun {
for i := 0; i < len(storageDisks); i++ {
drive := globalEndpoints.GetString(i)
res.DriveInfo.After[drive] = madmin.DriveStateOk
}
}
return res, nil
}
return res, nil
}
// HealBucket - heals the bucket if it is missing on any of the disks.
// It also heals missing entries for the bucket metadata files
// `policy.json`, `notification.xml` and `listeners.json`.
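//
// A hypothetical usage sketch (illustration only; the call site is not
// part of this file):
//
//	results, err := xlObj.HealBucket("mybucket", true) // dry-run first
//	if err == nil {
//		for _, r := range results {
//			fmt.Println(r.Type, r.Bucket, r.DriveInfo.Before, r.DriveInfo.After)
//		}
//	}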
func (xl xlObjects) HealBucket(bucket string, dryRun bool) (
results []madmin.HealResultItem, err error) {
if err = checkBucketExist(bucket, xl); err != nil {
return nil, err
}
// Compute the write quorum - a simple majority of the disks.
writeQuorum := len(xl.storageDisks)/2 + 1
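// e.g. with 16 disks the write quorum is 16/2 + 1 = 9.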
bucketLock := xl.nsMutex.NewNSLock(bucket, "")
if err = bucketLock.GetLock(globalHealingTimeout); err != nil {
return nil, err
}
defer bucketLock.Unlock()
// Heal bucket.
result, err := healBucket(xl.storageDisks, bucket, writeQuorum, dryRun)
if err != nil {
return results, err
}
results = append(results, result)
// Proceed to heal bucket metadata.
metaResults, err := healBucketMetadata(xl, bucket, dryRun)
results = append(results, metaResults...)
return results, err
}
// healBucket - creates the bucket on the disks where it does not exist.
func healBucket(storageDisks []StorageAPI, bucket string, writeQuorum int,
dryRun bool) (res madmin.HealResultItem, err error) {
// Initialize sync waitgroup.
var wg = &sync.WaitGroup{}
// Initialize list of errors.
var dErrs = make([]error, len(storageDisks))
// Disk states slices
beforeState := make([]string, len(storageDisks))
afterState := make([]string, len(storageDisks))
// Make a volume entry on all underlying storage disks.
for index, disk := range storageDisks {
if disk == nil {
dErrs[index] = errors.Trace(errDiskNotFound)
beforeState[index] = madmin.DriveStateOffline
afterState[index] = madmin.DriveStateOffline
continue
}
wg.Add(1)
// Make a volume inside a go-routine.
go func(index int, disk StorageAPI) {
defer wg.Done()
if _, err := disk.StatVol(bucket); err != nil {
if errors.Cause(err) != errVolumeNotFound {
beforeState[index] = madmin.DriveStateCorrupt
afterState[index] = madmin.DriveStateCorrupt
dErrs[index] = errors.Trace(err)
return
}
beforeState[index] = madmin.DriveStateMissing
afterState[index] = madmin.DriveStateMissing
// mutate only if not a dry-run
if dryRun {
return
}
makeErr := disk.MakeVol(bucket)
dErrs[index] = errors.Trace(makeErr)
if makeErr == nil {
afterState[index] = madmin.DriveStateOk
}
} else {
beforeState[index] = madmin.DriveStateOk
afterState[index] = madmin.DriveStateOk
}
}(index, disk)
}
// Wait for all make vol to finish.
wg.Wait()
// Initialize heal result info
res = madmin.HealResultItem{
Type: madmin.HealItemBucket,
Bucket: bucket,
DiskCount: len(storageDisks),
}
res.InitDrives()
for i, before := range beforeState {
drive := globalEndpoints.GetString(i)
res.DriveInfo.Before[drive] = before
res.DriveInfo.After[drive] = afterState[i]
}
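// Reduce the per-disk errors into a single result. If we could not
// reach write quorum, the reduction yields errXLWriteQuorum and the
// buckets created so far are rolled back below.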
reducedErr := reduceWriteQuorumErrs(dErrs, bucketOpIgnoredErrs, writeQuorum)
if errors.Cause(reducedErr) == errXLWriteQuorum {
// Purge successfully created buckets if we don't have writeQuorum.
undoMakeBucket(storageDisks, bucket)
}
return res, reducedErr
}
// healBucketMetadata - heals all the metadata associated with a given
// bucket: `policy.json`, `notification.xml` and `listeners.json`.
func healBucketMetadata(xl xlObjects, bucket string, dryRun bool) (
results []madmin.HealResultItem, err error) {
healBucketMetaFn := func(metaPath string) error {
result, healErr := xl.HealObject(minioMetaBucket, metaPath, dryRun)
// If object is not found, no result to add.
if isErrObjectNotFound(healErr) {
return nil
}
if healErr != nil {
return healErr
}
result.Type = madmin.HealItemBucketMetadata
results = append(results, result)
return nil
}
// Heal `policy.json` for missing entries; a missing `policy.json` is
// not an error.
policyPath := pathJoin(bucketConfigPrefix, bucket, bucketPolicyConfig)
err = healBucketMetaFn(policyPath)
if err != nil {
return results, err
}
// Heal `notification.xml` for missing entries; a missing
// `notification.xml` is not an error.
nConfigPath := path.Join(bucketConfigPrefix, bucket,
bucketNotificationConfig)
err = healBucketMetaFn(nConfigPath)
if err != nil {
return results, err
}
// Heal `listeners.json` for missing entries; a missing `listeners.json`
// is not an error.
lConfigPath := path.Join(bucketConfigPrefix, bucket, bucketListenerConfig)
err = healBucketMetaFn(lConfigPath)
return results, err
}
// listAllBuckets lists all buckets found across all disks. It also
// returns, for each bucket, the number of disks on which it was found.
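// For example, a bucket present on 14 of 16 disks gets an occurrence
// count of 14; quickHeal uses such counts to decide whether a bucket
// needs healing.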
func listAllBuckets(storageDisks []StorageAPI) (buckets map[string]VolInfo,
bucketsOcc map[string]int, err error) {
buckets = make(map[string]VolInfo)
bucketsOcc = make(map[string]int)
for _, disk := range storageDisks {
if disk == nil {
continue
}
var volsInfo []VolInfo
volsInfo, err = disk.ListVols()
if err != nil {
if errors.IsErrIgnored(err, bucketMetadataOpIgnoredErrs...) {
continue
}
return nil, nil, err
}
for _, volInfo := range volsInfo {
// StorageAPI can send volume names which are
// incompatible with buckets - these are
// skipped, like the meta-bucket.
if !IsValidBucketName(volInfo.Name) ||
isMinioMetaBucketName(volInfo.Name) {
continue
}
// Increase counter per bucket name
bucketsOcc[volInfo.Name]++
// Save volume info under bucket name
buckets[volInfo.Name] = volInfo
}
}
return buckets, bucketsOcc, nil
}
// ListBucketsHeal - Find all buckets that need to be healed
func (xl xlObjects) ListBucketsHeal() ([]BucketInfo, error) {
listBuckets := []BucketInfo{}
// List all buckets that can be found in all disks
buckets, _, err := listAllBuckets(xl.storageDisks)
if err != nil {
return listBuckets, err
}
// Iterate over all buckets
for _, currBucket := range buckets {
listBuckets = append(listBuckets,
BucketInfo{currBucket.Name, currBucket.Created})
}
// Sort found buckets
sort.Sort(byBucketName(listBuckets))
return listBuckets, nil
}
// quickHeal - covers the healing that needs to be done at startup,
// i.e. healing of buckets and bucket metadata (policy.json,
// notification.xml, listeners.json). Currently this function supports
// quick healing of buckets and bucket metadata only.
func quickHeal(xlObj xlObjects, writeQuorum int, readQuorum int) error {
// List all bucket name occurrence from all disks.
_, bucketOcc, err := listAllBuckets(xlObj.storageDisks)
if err != nil {
return err
}
// Walk all bucket names and heal buckets and bucket metadata where needed.
for bucketName, occCount := range bucketOcc {
// Heal bucket only if healing is needed.
if occCount != len(xlObj.storageDisks) {
bucketLock := xlObj.nsMutex.NewNSLock(bucketName, "")
if perr := bucketLock.GetLock(globalHealingTimeout); perr != nil {
return perr
}
defer bucketLock.Unlock()
// Heal bucket and then proceed to heal bucket metadata if any.
if _, err = healBucket(xlObj.storageDisks, bucketName, writeQuorum, false); err == nil {
if _, err = healBucketMetadata(xlObj, bucketName, false); err == nil {
continue
}
return err
}
return err
}
}
// Success.
return nil
}
// Heals an object by re-writing corrupt/missing erasure blocks.
func healObject(storageDisks []StorageAPI, bucket string, object string,
quorum int, dryRun bool) (result madmin.HealResultItem, err error) {
partsMetadata, errs := readAllXLMetadata(storageDisks, bucket, object)
// readQuorum suffices for xl.json since we use monotonic
// system time to break the tie when a split-brain situation
// arises.
if reducedErr := reduceReadQuorumErrs(errs, nil, quorum); reducedErr != nil {
return result, toObjectErr(reducedErr, bucket, object)
}
// List of disks having latest version of the object xl.json
// (by modtime).
latestDisks, modTime := listOnlineDisks(storageDisks, partsMetadata, errs)
// List of disks having all parts as per latest xl.json.
availableDisks, dataErrs, aErr := disksWithAllParts(latestDisks, partsMetadata, errs, bucket, object)
if aErr != nil {
return result, toObjectErr(aErr, bucket, object)
}
// Initialize heal result object
result = madmin.HealResultItem{
Type: madmin.HealItemObject,
Bucket: bucket,
Object: object,
DiskCount: len(storageDisks),
// Initialize object size to -1, so we can detect if we are
// unable to reliably find the object size.
ObjectSize: -1,
}
result.InitDrives()
// Loop to find number of disks with valid data, per-drive
// data state and a list of outdated disks on which data needs
// to be healed.
outDatedDisks := make([]StorageAPI, len(storageDisks))
numAvailableDisks := 0
disksToHealCount := 0
for i, v := range availableDisks {
driveState := ""
switch {
case v != nil:
driveState = madmin.DriveStateOk
numAvailableDisks++
// If data is sane on any one disk, we can
// extract the correct object size.
result.ObjectSize = partsMetadata[i].Stat.Size
result.ParityBlocks = partsMetadata[i].Erasure.ParityBlocks
result.DataBlocks = partsMetadata[i].Erasure.DataBlocks
case errors.Cause(errs[i]) == errDiskNotFound:
driveState = madmin.DriveStateOffline
case errors.Cause(errs[i]) == errFileNotFound, errors.Cause(errs[i]) == errVolumeNotFound:
fallthrough
case errors.Cause(dataErrs[i]) == errFileNotFound, errors.Cause(dataErrs[i]) == errVolumeNotFound:
driveState = madmin.DriveStateMissing
default:
// all remaining cases imply corrupt data/metadata
driveState = madmin.DriveStateCorrupt
}
drive := globalEndpoints.GetString(i)
result.DriveInfo.Before[drive] = driveState
// copy for 'after' state
result.DriveInfo.After[drive] = driveState
// an online disk without valid data/metadata is
// outdated and can be healed.
if errs[i] != errDiskNotFound && v == nil {
outDatedDisks[i] = storageDisks[i]
disksToHealCount++
}
}
// If less than read quorum number of disks have all the parts
// of the data, we can't reconstruct the erasure-coded data.
if numAvailableDisks < quorum {
return result, toObjectErr(errXLReadQuorum, bucket, object)
}
if disksToHealCount == 0 {
// Nothing to heal!
return result, nil
}
// After this point we only have to repair data on disk, so return
// early if this is a dry-run.
if dryRun {
return result, nil
}
// Latest xlMetaV1 for reference. If no valid metadata is present, it
// is as good as the object not being found.
latestMeta, pErr := pickValidXLMeta(partsMetadata, modTime)
if pErr != nil {
return result, toObjectErr(pErr, bucket, object)
}
// Clear data files of the object on outdated disks
for _, disk := range outDatedDisks {
// Before healing outdated disks, we need to remove
// xl.json and part files from "bucket/object/" so
// that rename(minioMetaBucket, "tmp/tmpuuid/",
// "bucket", "object/") succeeds.
if disk == nil {
// Not an outdated disk.
continue
}
// List and delete the object directory, ignoring
// errors.
files, derr := disk.ListDir(bucket, object)
if derr == nil {
for _, entry := range files {
_ = disk.DeleteFile(bucket,
pathJoin(object, entry))
}
}
}
// Reorder so that we have data disks first and parity disks next.
latestDisks = shuffleDisks(latestDisks, latestMeta.Erasure.Distribution)
outDatedDisks = shuffleDisks(outDatedDisks, latestMeta.Erasure.Distribution)
partsMetadata = shufflePartsMetadata(partsMetadata, latestMeta.Erasure.Distribution)
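// After shuffling, index i of latestDisks, outDatedDisks and
// partsMetadata all refer to the same erasure block (data blocks first,
// then parity); unshuffleIndex() is used later to map a shuffled index
// back to the original endpoint for reporting.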
// We write at temporary location and then rename to final location.
tmpID := mustGetUUID()
// Checksums of the part files. checksumInfos[index] will contain the
// checksums of all the part files written to outDatedDisks[index].
checksumInfos := make([][]ChecksumInfo, len(outDatedDisks))
// Heal each part. HealFile() writes the healed part into the temporary
// bucket (minioMetaTmpBucket) under tmpID/, from where it is renamed to
// the final location later.
storage, err := NewErasureStorage(latestDisks, latestMeta.Erasure.DataBlocks,
latestMeta.Erasure.ParityBlocks, latestMeta.Erasure.BlockSize)
if err != nil {
return result, toObjectErr(err, bucket, object)
}
checksums := make([][]byte, len(latestDisks))
for partIndex := 0; partIndex < len(latestMeta.Parts); partIndex++ {
partName := latestMeta.Parts[partIndex].Name
partSize := latestMeta.Parts[partIndex].Size
erasure := latestMeta.Erasure
var algorithm BitrotAlgorithm
for i, disk := range storage.disks {
if disk != OfflineDisk {
info := partsMetadata[i].Erasure.GetChecksumInfo(partName)
algorithm = info.Algorithm
checksums[i] = info.Hash
}
}
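// algorithm and checksums describe the bitrot protection recorded for
// the existing parts; they are handed to HealFile below, which - as the
// signature suggests - can use them to verify the source parts while
// reconstructing the missing ones.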
// Heal the part file.
file, hErr := storage.HealFile(outDatedDisks, bucket, pathJoin(object, partName),
erasure.BlockSize, minioMetaTmpBucket, pathJoin(tmpID, partName), partSize,
algorithm, checksums)
if hErr != nil {
return result, toObjectErr(hErr, bucket, object)
}
// outDatedDisks that had write errors should not be written to for the
// remaining parts, so we nil them out.
for i, disk := range outDatedDisks {
if disk == nil {
continue
}
// A non-nil stale disk which did not receive
// a healed part checksum had a write error.
if file.Checksums[i] == nil {
outDatedDisks[i] = nil
disksToHealCount--
continue
}
// append part checksums
checksumInfos[i] = append(checksumInfos[i],
ChecksumInfo{partName, file.Algorithm, file.Checksums[i]})
}
// If all outdated disks had write errors, we give up.
if disksToHealCount == 0 {
return result, fmt.Errorf("all disks without up-to-date data had write errors")
}
}
// xl.json should be written to all the healed disks.
for index, disk := range outDatedDisks {
if disk == nil {
continue
}
partsMetadata[index] = latestMeta
partsMetadata[index].Erasure.Checksums = checksumInfos[index]
}
// Write the `xl.json` reconstructed from the other disks.
outDatedDisks, aErr = writeUniqueXLMetadata(outDatedDisks, minioMetaTmpBucket, tmpID,
partsMetadata, diskCount(outDatedDisks))
if aErr != nil {
return result, toObjectErr(aErr, bucket, object)
}
// Rename from tmp location to the actual location.
for diskIndex, disk := range outDatedDisks {
if disk == nil {
continue
}
// Attempt a rename now from healed data to final location.
aErr = disk.RenameFile(minioMetaTmpBucket, retainSlash(tmpID), bucket,
retainSlash(object))
if aErr != nil {
return result, toObjectErr(errors.Trace(aErr), bucket, object)
}
realDiskIdx := unshuffleIndex(diskIndex,
latestMeta.Erasure.Distribution)
drive := globalEndpoints.GetString(realDiskIdx)
result.DriveInfo.After[drive] = madmin.DriveStateOk
}
// Set the size of the object in the heal result
result.ObjectSize = latestMeta.Stat.Size
return result, nil
}
// HealObject - heal the given object.
//
// FIXME: If an object was deleted while one disk was down, and the
// disk later comes back up, healing the object should delete it.
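//
// A hypothetical usage sketch (illustration only; the call site is not
// part of this file):
//
//	res, err := xlObj.HealObject("mybucket", "photos/1.jpg", true) // dry-run
//	if err == nil {
//		fmt.Println(res.ObjectSize, res.DriveInfo.After)
//	}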
func (xl xlObjects) HealObject(bucket, object string, dryRun bool) (
hr madmin.HealResultItem, err error) {
// FIXME: Metadata is read again in the healObject() call below.
// Read metadata files from all the disks
partsMetadata, errs := readAllXLMetadata(xl.storageDisks, bucket, object)
// get read quorum for this object
var readQuorum int
readQuorum, _, err = objectQuorumFromMeta(xl, partsMetadata, errs)
if err != nil {
return hr, err
}
// Lock the object before healing.
objectLock := xl.nsMutex.NewNSLock(bucket, object)
if lerr := objectLock.GetRLock(globalHealingTimeout); lerr != nil {
return hr, lerr
}
defer objectLock.RUnlock()
// Heal the object.
return healObject(xl.storageDisks, bucket, object, readQuorum, dryRun)
}