mirror of https://github.com/minio/minio.git
synced 2024-12-25 22:55:54 -05:00
082f777281
Bucket metadata healing in the current code was executed multiple times, once for each set. Bucket metadata, just like objects, is hashed by its name onto a given set; to let that hashing play its role, the top-level code should decide where to navigate. The current code also hardcoded 3 bucket metadata files, whereas we should make this generic by listing and navigating .minio.sys to heal such objects. We also had another bug where, due to the isObjectDangling changes, objects without pre-existing bucket metadata files were erroneously reported as grey/corrupted. This PR fixes all of the above items.
790 lines
21 KiB
Go
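For context on the "hashed in accordance with its name" remark in the commit message, here is a minimal, hypothetical sketch of CRC32-based name-to-set placement (illustrative only, assuming hash/crc32 is imported; not necessarily the exact function MinIO uses):

	// hashKeyToSet maps a key to one of setCount erasure sets by
	// taking CRC32 of the name modulo the set count (illustrative
	// sketch, not this file's code).
	func hashKeyToSet(key string, setCount int) int {
		return int(crc32.ChecksumIEEE([]byte(key)) % uint32(setCount))
	}

The same name always yields the same set index, so both objects and bucket metadata land on a deterministic set, which is why the top-level code can decide where to navigate.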
/*
 * Minio Cloud Storage, (C) 2017 Minio, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package cmd

import (
	"context"
	"encoding/json"
	"fmt"
	"net/http"
	"runtime"
	"strings"
	"sync"
	"time"

	"github.com/minio/minio/cmd/logger"
	"github.com/minio/minio/pkg/madmin"
	"github.com/minio/minio/pkg/sync/errgroup"
)
// healStatusSummary - overall short summary of a healing sequence
type healStatusSummary string

// healStatusSummary constants
const (
	healNotStartedStatus healStatusSummary = "not started"
	healRunningStatus                      = "running"
	healStoppedStatus                      = "stopped"
	healFinishedStatus                     = "finished"
)

const (
	// a heal sequence with this many un-consumed heal result
	// items blocks until heal-status consumption resumes or is
	// aborted due to timeout.
	maxUnconsumedHealResultItems = 1000

	// if no heal-results are consumed (via the heal-status API)
	// for this timeout duration, the heal sequence is aborted.
	healUnconsumedTimeout = 24 * time.Hour

	// time-duration to keep heal sequence state after it
	// completes.
	keepHealSeqStateDuration = time.Minute * 10
)

var (
	errHealIdleTimeout      = fmt.Errorf("healing results were not consumed for too long")
	errHealPushStopNDiscard = fmt.Errorf("heal push stopped due to heal stop signal")
	errHealStopSignalled    = fmt.Errorf("heal stop signaled")

	errFnHealFromAPIErr = func(ctx context.Context, err error) error {
		errCode := toAPIErrorCode(ctx, err)
		apiErr := getAPIError(errCode)
		return fmt.Errorf("Heal internal error: %s: %s",
			apiErr.Code, apiErr.Description)
	}
)
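// Note on back-pressure: a heal sequence buffers at most
// maxUnconsumedHealResultItems (1000) un-consumed results. Once that
// buffer is full, the producing goroutine blocks, re-checking once per
// second, and aborts with errHealIdleTimeout if the client consumes
// nothing for healUnconsumedTimeout (24 hours). See pushHealResultItem
// below.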
// healSequenceStatus - accumulated status of the heal sequence
type healSequenceStatus struct {
	// lock to update this structure as it is concurrently
	// accessed
	updateLock *sync.RWMutex

	// summary and detail for failures
	Summary       healStatusSummary `json:"Summary"`
	FailureDetail string            `json:"Detail,omitempty"`
	StartTime     time.Time         `json:"StartTime"`

	// disk information
	NumDisks int `json:"NumDisks"`

	// settings for the heal sequence
	HealSettings madmin.HealOpts `json:"Settings"`

	// slice of available heal result records
	Items []madmin.HealResultItem `json:"Items"`
}

// structure to hold state of all heal sequences in server memory
type allHealState struct {
	sync.Mutex

	// map of heal path to heal sequence
	healSeqMap map[string]*healSequence
}

var (
	// global server heal state
	globalAllHealState allHealState
)
// initAllHealState - initialize healing apparatus
func initAllHealState(isErasureMode bool) {
	if !isErasureMode {
		return
	}

	globalAllHealState = allHealState{
		healSeqMap: make(map[string]*healSequence),
	}

	go globalAllHealState.periodicHealSeqsClean()
}

func (ahs *allHealState) periodicHealSeqsClean() {
	// Launch clean-up routine to remove this heal sequence (after
	// it ends) from the global state after timeout has elapsed.
	ticker := time.NewTicker(time.Minute * 5)
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			now := UTCNow()
			ahs.Lock()
			for path, h := range ahs.healSeqMap {
				if h.hasEnded() && h.endTime.Add(keepHealSeqStateDuration).Before(now) {
					delete(ahs.healSeqMap, path)
				}
			}
			ahs.Unlock()
		case <-GlobalServiceDoneCh:
			// server could be restarting - need
			// to exit immediately
			return
		}
	}
}
// getHealSequence - retrieves a heal sequence by path. The second
// return value indicates whether a heal sequence exists for the path.
func (ahs *allHealState) getHealSequence(path string) (h *healSequence, exists bool) {
	ahs.Lock()
	defer ahs.Unlock()
	h, exists = ahs.healSeqMap[path]
	return h, exists
}
func (ahs *allHealState) stopHealSequence(path string) ([]byte, APIErrorCode) {
	var hsp madmin.HealStopSuccess
	he, exists := ahs.getHealSequence(path)
	if !exists {
		hsp = madmin.HealStopSuccess{
			ClientToken: "invalid",
			StartTime:   UTCNow(),
		}
	} else {
		hsp = madmin.HealStopSuccess{
			ClientToken:   he.clientToken,
			ClientAddress: he.clientAddress,
			StartTime:     he.startTime,
		}

		he.stop()
		for !he.hasEnded() {
			time.Sleep(1 * time.Second)
		}
		ahs.Lock()
		defer ahs.Unlock()
		// Heal sequence explicitly stopped, remove it.
		delete(ahs.healSeqMap, path)
	}

	b, err := json.Marshal(&hsp)
	return b, toAdminAPIErrCode(context.Background(), err)
}
// LaunchNewHealSequence - launches a background routine that performs
// healing according to the healSequence argument. For each heal
// sequence, state is stored in `globalAllHealState`, a map from the
// heal path to the `healSequence` holding its state.
//
// Heal results are persisted in server memory for
// `keepHealSeqStateDuration`. This function also launches a
// background routine to clean up heal results after the
// aforementioned duration.
func (ahs *allHealState) LaunchNewHealSequence(h *healSequence) (
	respBytes []byte, errCode APIErrorCode, errMsg string) {

	existsAndLive := false
	he, exists := ahs.getHealSequence(h.path)
	if exists {
		if !he.hasEnded() || len(he.currentStatus.Items) > 0 {
			existsAndLive = true
		}
	}

	if existsAndLive {
		// A heal sequence exists on the given path.
		if h.forceStarted {
			// stop the running heal sequence - wait for it to finish.
			he.stop()
			for !he.hasEnded() {
				time.Sleep(1 * time.Second)
			}
		} else {
			errMsg = "Heal is already running on the given path " +
				"(use force-start option to stop and start afresh). " +
				fmt.Sprintf("The heal was started by IP %s at %s, token is %s",
					h.clientAddress, h.startTime.Format(http.TimeFormat), h.clientToken)

			return nil, ErrHealAlreadyRunning, errMsg
		}
	}

	ahs.Lock()
	defer ahs.Unlock()

	// Check if new heal sequence to be started overlaps with any
	// existing, running sequence
	for k, hSeq := range ahs.healSeqMap {
		if !hSeq.hasEnded() && (strings.HasPrefix(k, h.path) ||
			strings.HasPrefix(h.path, k)) {

			errMsg = "The provided heal sequence path overlaps with an existing " +
				fmt.Sprintf("heal path: %s", k)
			return nil, ErrHealOverlappingPaths, errMsg
		}
	}

	// Add heal state and start sequence
	ahs.healSeqMap[h.path] = h

	// Launch top-level background heal go-routine
	go h.healSequenceStart()

	b, err := json.Marshal(madmin.HealStartSuccess{
		ClientToken:   h.clientToken,
		ClientAddress: h.clientAddress,
		StartTime:     h.startTime,
	})
	if err != nil {
		logger.LogIf(h.ctx, err)
		return nil, ErrInternalError, ""
	}
	return b, ErrNone, ""
}
// PopHealStatusJSON - Called by heal-status API. It fetches the heal
// status results from global state and returns its JSON
// representation. The clientToken helps ensure there aren't
// conflicting clients fetching status.
func (ahs *allHealState) PopHealStatusJSON(path string,
	clientToken string) ([]byte, APIErrorCode) {

	// fetch heal state for given path
	h, exists := ahs.getHealSequence(path)
	if !exists {
		// If there is no such heal sequence, return error.
		return nil, ErrHealNoSuchProcess
	}

	// Check if client-token is valid
	if clientToken != h.clientToken {
		return nil, ErrHealInvalidClientToken
	}

	// Take lock to access and update the heal-sequence
	h.currentStatus.updateLock.Lock()
	defer h.currentStatus.updateLock.Unlock()

	numItems := len(h.currentStatus.Items)

	// calculate index of most recently available heal result
	// record.
	lastResultIndex := h.lastSentResultIndex
	if numItems > 0 {
		lastResultIndex = h.currentStatus.Items[numItems-1].ResultIndex
	}

	// After sending status to client, and before relinquishing
	// the updateLock, reset Items to nil and record the result
	// index sent to the client.
	defer func(i int64) {
		h.lastSentResultIndex = i
		h.currentStatus.Items = nil
	}(lastResultIndex)

	jbytes, err := json.Marshal(h.currentStatus)
	if err != nil {
		logger.LogIf(h.ctx, err)
		return nil, ErrInternalError
	}

	return jbytes, ErrNone
}
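// Illustrative only (not part of this file): a rough sketch of how a
// client drives the two entry points above. The madmin method name
// and signature below are assumptions for illustration, not the
// verified API.
//
//	opts := madmin.HealOpts{Recursive: true}
//	// start a heal sequence on bucket/prefix; keep the client token
//	start, _, err := admClient.Heal("mybucket", "photos/", opts, "", false, false)
//	// then poll heal status with start.ClientToken until the
//	// reported summary is "finished" or "stopped"; each poll pops
//	// (and clears) the buffered result items.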
// healSequence - state for each heal sequence initiated on the
// server.
type healSequence struct {
	// bucket, and prefix on which heal seq. was initiated
	bucket, objPrefix string

	// path is just pathJoin(bucket, objPrefix)
	path string

	// time at which heal sequence was started
	startTime time.Time

	// time at which heal sequence has ended
	endTime time.Time

	// Heal client info
	clientToken, clientAddress string

	// was this heal sequence force started?
	forceStarted bool

	// heal settings applied to this heal sequence
	settings madmin.HealOpts

	// current accumulated status of the heal sequence
	currentStatus healSequenceStatus

	// channel signaled by background routine when traversal has
	// completed
	traverseAndHealDoneCh chan error

	// channel to signal heal sequence to stop (e.g. from the
	// heal-stop API)
	stopSignalCh chan struct{}

	// the last result index sent to client
	lastSentResultIndex int64

	// Holds the request-info for logging
	ctx context.Context
}
// newHealSequence - creates a heal sequence object, assuming bucket
// and objPrefix are already validated.
func newHealSequence(bucket, objPrefix, clientAddr string,
	numDisks int, hs madmin.HealOpts, forceStart bool) *healSequence {

	reqInfo := &logger.ReqInfo{RemoteHost: clientAddr, API: "Heal", BucketName: bucket}
	reqInfo.AppendTags("prefix", objPrefix)
	ctx := logger.SetReqInfo(context.Background(), reqInfo)

	return &healSequence{
		bucket:        bucket,
		objPrefix:     objPrefix,
		path:          pathJoin(bucket, objPrefix),
		startTime:     UTCNow(),
		clientToken:   mustGetUUID(),
		clientAddress: clientAddr,
		forceStarted:  forceStart,
		settings:      hs,
		currentStatus: healSequenceStatus{
			Summary:      healNotStartedStatus,
			HealSettings: hs,
			NumDisks:     numDisks,
			updateLock:   &sync.RWMutex{},
		},
		traverseAndHealDoneCh: make(chan error),
		stopSignalCh:          make(chan struct{}),
		ctx:                   ctx,
	}
}
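// For reference, a typical construction and launch (as done by the
// admin heal handler elsewhere in this package) looks roughly like:
//
//	h := newHealSequence("mybucket", "photos/", r.RemoteAddr,
//		numDisks, healOpts, forceStart)
//	respBytes, apiErr, errMsg := globalAllHealState.LaunchNewHealSequence(h)
//
// Here r.RemoteAddr, numDisks, healOpts and forceStart stand in for
// values the caller already has.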
// isQuitting - determines if the heal sequence is quitting (due to an
// external signal)
func (h *healSequence) isQuitting() bool {
	select {
	case <-h.stopSignalCh:
		return true
	default:
		return false
	}
}

// check if the heal sequence has ended
func (h *healSequence) hasEnded() bool {
	h.currentStatus.updateLock.RLock()
	summary := h.currentStatus.Summary
	h.currentStatus.updateLock.RUnlock()
	return summary == healStoppedStatus || summary == healFinishedStatus
}
// stops the heal sequence - safe to call multiple times.
func (h *healSequence) stop() {
	// The select makes the close idempotent: if stopSignalCh is
	// already closed the first case is taken; otherwise close it.
	select {
	case <-h.stopSignalCh:
	default:
		close(h.stopSignalCh)
	}
}
// pushHealResultItem - pushes a heal result item for consumption in
// the heal-status API. It blocks if there are already
// maxUnconsumedHealResultItems unconsumed items. When it blocks, the
// heal sequence routine is effectively paused - this happens when the
// server has accumulated the maximum number of heal records per heal
// sequence. When the client consumes further records, the heal
// sequence automatically resumes. The return value indicates if the
// operation succeeded.
func (h *healSequence) pushHealResultItem(r madmin.HealResultItem) error {
	// start a timer to keep an upper time limit to find an empty
	// slot to add the given heal result - if no slot is found it
	// means that the server is holding the maximum amount of
	// heal-results in memory and the client has not consumed them
	// for too long.
	unconsumedTimer := time.NewTimer(healUnconsumedTimeout)
	defer func() {
		// stop the timeout timer so it is garbage collected. The
		// non-blocking drain avoids deadlocking here when the
		// timer has already fired and its value was consumed by
		// the select below.
		if !unconsumedTimer.Stop() {
			select {
			case <-unconsumedTimer.C:
			default:
			}
		}
	}()

	var itemsLen int
	for {
		h.currentStatus.updateLock.Lock()
		itemsLen = len(h.currentStatus.Items)
		if itemsLen == maxUnconsumedHealResultItems {
			// unlock and wait to check again if we can push
			h.currentStatus.updateLock.Unlock()

			// wait for a second, or quit if an external
			// stop signal is received or the
			// unconsumedTimer fires.
			select {
			// Check after a second
			case <-time.After(time.Second):
				continue

			case <-h.stopSignalCh:
				// discard result and return.
				return errHealPushStopNDiscard

			// Timeout if no results consumed for too
			// long.
			case <-unconsumedTimer.C:
				return errHealIdleTimeout
			}
		}
		// Exit the loop with the lock still held; it is
		// released after the append below.
		break
	}

	// Set the correct result index for the new result item
	if itemsLen > 0 {
		r.ResultIndex = 1 + h.currentStatus.Items[itemsLen-1].ResultIndex
	} else {
		r.ResultIndex = 1 + h.lastSentResultIndex
	}

	// append to results
	h.currentStatus.Items = append(h.currentStatus.Items, r)

	// release lock
	h.currentStatus.updateLock.Unlock()

	// This is a "safe" point for the heal sequence to quit if
	// signaled externally.
	if h.isQuitting() {
		return errHealStopSignalled
	}

	return nil
}
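// Worked example of the ResultIndex bookkeeping above: suppose the
// client last popped results up to index 42 (lastSentResultIndex ==
// 42) and Items is now empty; the next pushed item gets index 43. If
// Items already holds items 43..45, the next push gets 46. Indices
// thus increase monotonically across PopHealStatusJSON calls, which
// reset Items to nil after each status fetch.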
// healSequenceStart - this is the top-level background heal
// routine. It launches another go-routine that actually traverses
// on-disk data, checks and heals according to the selected
// settings. This go-routine itself, (1) monitors the traversal
// routine for completion, and (2) listens for external stop
// signals. When either event happens, it sets the finish status for
// the heal-sequence.
func (h *healSequence) healSequenceStart() {
	// Set status as running
	h.currentStatus.updateLock.Lock()
	h.currentStatus.Summary = healRunningStatus
	h.currentStatus.StartTime = UTCNow()
	h.currentStatus.updateLock.Unlock()

	go h.traverseAndHeal()

	select {
	case err, ok := <-h.traverseAndHealDoneCh:
		h.endTime = UTCNow()
		h.currentStatus.updateLock.Lock()
		defer h.currentStatus.updateLock.Unlock()
		// Heal traversal is complete.
		if ok {
			// heal traversal had an error.
			h.currentStatus.Summary = healStoppedStatus
			h.currentStatus.FailureDetail = err.Error()
		} else {
			// heal traversal succeeded.
			h.currentStatus.Summary = healFinishedStatus
		}

	case <-h.stopSignalCh:
		h.endTime = UTCNow()
		h.currentStatus.updateLock.Lock()
		h.currentStatus.Summary = healStoppedStatus
		h.currentStatus.FailureDetail = errHealStopSignalled.Error()
		h.currentStatus.updateLock.Unlock()

		// drain traverse channel so the traversal
		// go-routine does not leak.
		go func() {
			// Eventually the traversal go-routine closes
			// the channel and returns, so this go-routine
			// itself will not leak.
			<-h.traverseAndHealDoneCh
		}()
	}
}
// traverseAndHeal - traverses on-disk data and performs healing
// according to settings. At each "safe" point it also checks if an
// external quit signal has been received and quits if so. Since the
// healing traversal may be mutating on-disk data when an external
// quit signal is received, this routine cannot quit immediately and
// has to wait until a safe point is reached, such as between scanning
// two objects.
func (h *healSequence) traverseAndHeal() {
	var err error
	checkErr := func(f func() error) {
		switch {
		case err != nil:
			return
		case h.isQuitting():
			err = errHealStopSignalled
			return
		}
		err = f()
	}

	// Start with format healing
	checkErr(h.healDiskFormat)

	// Start healing the config prefix.
	checkErr(h.healMinioSysMeta(minioConfigPrefix))

	// Start healing the bucket config prefix.
	checkErr(h.healMinioSysMeta(bucketConfigPrefix))

	// Heal buckets and objects
	checkErr(h.healBuckets)

	if err != nil {
		h.traverseAndHealDoneCh <- err
	}

	close(h.traverseAndHealDoneCh)
}
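// The checkErr closure above latches the first failure: once err is
// set (or a stop is signaled), every later step becomes a no-op. An
// equivalent standalone sketch of the same pattern, for illustration
// only:
//
//	func runSteps(quit func() bool, steps ...func() error) error {
//		for _, step := range steps {
//			if quit() {
//				return errHealStopSignalled
//			}
//			if err := step(); err != nil {
//				return err
//			}
//		}
//		return nil
//	}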
// healMinioSysMeta - heals all files under a given meta prefix; returns a
// function which in turn heals the respective meta directory path and any
// files in it.
func (h *healSequence) healMinioSysMeta(metaPrefix string) func() error {
	return func() error {
		// Get current object layer instance.
		objectAPI := newObjectLayerFn()
		if objectAPI == nil {
			return errServerNotInitialized
		}

		// NOTE: Healing on meta is run regardless
		// of any bucket being selected, this is to ensure that
		// meta are always up to date and correct.
		marker := ""
		isTruncated := true
		for isTruncated {
			if globalHTTPServer != nil {
				// Wait at most 1 minute for in-progress requests
				// before proceeding to heal
				waitCount := 60
				// Any requests in progress, delay the heal.
				for globalHTTPServer.GetRequestCount() > 2 && waitCount > 0 {
					waitCount--
					time.Sleep(1 * time.Second)
				}
			}

			// Lists all objects under the given meta prefix.
			objectInfos, err := objectAPI.ListObjectsHeal(h.ctx, minioMetaBucket, metaPrefix,
				marker, "", 1000)
			if err != nil {
				return errFnHealFromAPIErr(h.ctx, err)
			}

			for index := range objectInfos.Objects {
				if h.isQuitting() {
					return errHealStopSignalled
				}
				o := objectInfos.Objects[index]
				res, herr := objectAPI.HealObject(h.ctx, o.Bucket, o.Name, h.settings.DryRun, h.settings.Remove)
				// Object might have been deleted by the time heal
				// was attempted - ignore this file and move on.
				if isErrObjectNotFound(herr) {
					continue
				}
				if herr != nil {
					return herr
				}
				res.Type = madmin.HealItemBucketMetadata
				if err = h.pushHealResultItem(res); err != nil {
					return err
				}
			}

			isTruncated = objectInfos.IsTruncated
			marker = objectInfos.NextMarker
		}
		return nil
	}
}
// healDiskFormat - heals format.json, return value indicates if a
// failure error occurred.
func (h *healSequence) healDiskFormat() error {
	if h.isQuitting() {
		return errHealStopSignalled
	}

	// Get current object layer instance.
	objectAPI := newObjectLayerFn()
	if objectAPI == nil {
		return errServerNotInitialized
	}

	res, err := objectAPI.HealFormat(h.ctx, h.settings.DryRun)
	// return any error, but ignore the error returned when disks
	// are already healed.
	if err != nil && err != errNoHealRequired {
		return errFnHealFromAPIErr(h.ctx, err)
	}

	// If healing succeeded, notify the peers to reload format and
	// re-initialize disks. Peers are not notified when no healing
	// was required (err == errNoHealRequired above).
	if err == nil {
		for _, nerr := range globalNotificationSys.ReloadFormat(h.settings.DryRun) {
			if nerr.Err != nil {
				logger.GetReqInfo(h.ctx).SetTags("peerAddress", nerr.Host.String())
				logger.LogIf(h.ctx, nerr.Err)
			}
		}
	}

	// Push format heal result
	return h.pushHealResultItem(res)
}
// healBuckets - heals all buckets, or just the one specified on the
// heal sequence.
func (h *healSequence) healBuckets() error {
	if h.isQuitting() {
		return errHealStopSignalled
	}

	// 1. If a bucket was specified, heal only that bucket.
	if h.bucket != "" {
		return h.healBucket(h.bucket)
	}

	// Get current object layer instance.
	objectAPI := newObjectLayerFn()
	if objectAPI == nil {
		return errServerNotInitialized
	}

	buckets, err := objectAPI.ListBucketsHeal(h.ctx)
	if err != nil {
		return errFnHealFromAPIErr(h.ctx, err)
	}

	for _, bucket := range buckets {
		if err = h.healBucket(bucket.Name); err != nil {
			return err
		}
	}

	return nil
}
// healBucket - traverses and heals given bucket
func (h *healSequence) healBucket(bucket string) error {
	// Get current object layer instance.
	objectAPI := newObjectLayerFn()
	if objectAPI == nil {
		return errServerNotInitialized
	}

	result, err := objectAPI.HealBucket(h.ctx, bucket, h.settings.DryRun, h.settings.Remove)
	// handle heal-bucket error
	if err != nil {
		return err
	}

	if err = h.pushHealResultItem(result); err != nil {
		return err
	}

	if !h.settings.Recursive {
		if h.objPrefix != "" {
			// Check if an object named as the objPrefix exists,
			// and if so heal it.
			_, err = objectAPI.GetObjectInfo(h.ctx, bucket, h.objPrefix, ObjectOptions{})
			if err == nil {
				if err = h.healObject(bucket, h.objPrefix); err != nil {
					return err
				}
			}
		}

		return nil
	}

	entries := runtime.NumCPU()

	marker := ""
	isTruncated := true
	for isTruncated {
		if globalHTTPServer != nil {
			// Wait at most 1 minute for in-progress requests
			// before proceeding to heal
			waitCount := 60
			// Any requests in progress, delay the heal.
			for globalHTTPServer.GetRequestCount() > 2 && waitCount > 0 {
				waitCount--
				time.Sleep(1 * time.Second)
			}
		}

		// Heal numCPU objects at a time, in parallel.
		objectInfos, err := objectAPI.ListObjectsHeal(h.ctx, bucket,
			h.objPrefix, marker, "", entries)
		if err != nil {
			return errFnHealFromAPIErr(h.ctx, err)
		}

		g := errgroup.WithNErrs(len(objectInfos.Objects))
		for index := range objectInfos.Objects {
			index := index
			g.Go(func() error {
				o := objectInfos.Objects[index]
				return h.healObject(o.Bucket, o.Name)
			}, index)
		}

		for _, err := range g.Wait() {
			if err != nil {
				return err
			}
		}

		isTruncated = objectInfos.IsTruncated
		marker = objectInfos.NextMarker
	}
	return nil
}
// healObject - heal the given object and record result
func (h *healSequence) healObject(bucket, object string) error {
	if h.isQuitting() {
		return errHealStopSignalled
	}

	// Get current object layer instance.
	objectAPI := newObjectLayerFn()
	if objectAPI == nil {
		return errServerNotInitialized
	}

	hri, err := objectAPI.HealObject(h.ctx, bucket, object, h.settings.DryRun, h.settings.Remove)
	if isErrObjectNotFound(err) {
		return nil
	}
	if err != nil {
		hri.Detail = err.Error()
	}
	return h.pushHealResultItem(hri)
}