mirror of
https://github.com/minio/minio.git
synced 2024-12-24 06:05:55 -05:00
Implement an offline mode for a distributed node (#4646)
Implement an offline mode for remote storage to cache the offline status of a node in order to prevent network calls that are bound to fail. After a time interval an attempt will be made to restore the connection and mark the node as online if successful. Fixes #4183
This commit is contained in:
parent
1978b9d8f9
commit
98b62cbec8
@ -341,6 +341,7 @@ func waitForFormatXLDisks(firstDisk bool, endpoints EndpointList, storageDisks [
|
||||
maxRetryAttempts: globalStorageInitRetryThreshold,
|
||||
retryUnit: time.Second,
|
||||
retryCap: time.Second * 30, // 30 seconds.
|
||||
offlineTimestamp: UTCNow(),
|
||||
}
|
||||
}
|
||||
|
||||
@ -361,6 +362,7 @@ func waitForFormatXLDisks(firstDisk bool, endpoints EndpointList, storageDisks [
|
||||
maxRetryAttempts: globalStorageRetryThreshold,
|
||||
retryUnit: time.Millisecond,
|
||||
retryCap: time.Millisecond * 5, // 5 milliseconds.
|
||||
offlineTimestamp: UTCNow(), // Set timestamp to prevent immediate marking as offline
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -30,210 +30,275 @@ const (
|
||||
// Attempt to retry only this many number of times before
|
||||
// giving up on the remote disk entirely after initialization.
|
||||
globalStorageRetryThreshold = 1
|
||||
|
||||
// Interval to check health status of a node whether it has
|
||||
// come back up online
|
||||
globalStorageHealthCheckInterval = 5 * time.Minute
|
||||
)
|
||||
|
||||
// Converts rpc.ServerError to underlying error. This function is
|
||||
// written so that the storageAPI errors are consistent across network
|
||||
// disks as well.
|
||||
func retryToStorageErr(err error) error {
|
||||
if err == errDiskNotFoundFromNetError || err == errDiskNotFoundFromRPCShutdown {
|
||||
return errDiskNotFound
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
// Retry storage is an instance of StorageAPI which
|
||||
// additionally verifies upon network shutdown if the
|
||||
// underlying storage is available and is really
|
||||
// formatted.
|
||||
// formatted. After the initialization phase it will
|
||||
// also cache when the underlying storage is offline
|
||||
// to prevent needless calls and recheck the health of
|
||||
// underlying storage in regular intervals.
|
||||
type retryStorage struct {
|
||||
remoteStorage StorageAPI
|
||||
maxRetryAttempts int
|
||||
retryUnit time.Duration
|
||||
retryCap time.Duration
|
||||
offline bool // Mark whether node is offline
|
||||
offlineTimestamp time.Time // Last timestamp of checking status of node
|
||||
}
|
||||
|
||||
// String representation of remoteStorage.
|
||||
func (f retryStorage) String() string {
|
||||
func (f *retryStorage) String() string {
|
||||
return f.remoteStorage.String()
|
||||
}
|
||||
|
||||
// Reconncts to underlying remote storage.
|
||||
func (f retryStorage) Init() (err error) {
|
||||
return f.remoteStorage.Init()
|
||||
// Reconnects to underlying remote storage.
|
||||
func (f *retryStorage) Init() (err error) {
|
||||
return retryToStorageErr(f.remoteStorage.Init())
|
||||
}
|
||||
|
||||
// Closes the underlying remote storage connection.
|
||||
func (f retryStorage) Close() (err error) {
|
||||
return f.remoteStorage.Close()
|
||||
func (f *retryStorage) Close() (err error) {
|
||||
return retryToStorageErr(f.remoteStorage.Close())
|
||||
}
|
||||
|
||||
// Return whether the underlying remote storage is offline
|
||||
// and, if so, try to reconnect at regular intervals to
|
||||
// restore the connection
|
||||
func (f *retryStorage) IsOffline() bool {
|
||||
// Check if offline and whether enough time has lapsed since most recent check
|
||||
if f.offline && UTCNow().Sub(f.offlineTimestamp) >= globalStorageHealthCheckInterval {
|
||||
f.offlineTimestamp = UTCNow() // reset timestamp
|
||||
|
||||
if e := f.reInit(nil); e == nil {
|
||||
// Connection has been re-established
|
||||
f.offline = false // Mark node as back online
|
||||
}
|
||||
}
|
||||
return f.offline
|
||||
}
|
||||
|
||||
// DiskInfo - a retryable implementation of disk info.
|
||||
func (f retryStorage) DiskInfo() (info disk.Info, err error) {
|
||||
info, err = f.remoteStorage.DiskInfo()
|
||||
if err == errDiskNotFound {
|
||||
err = f.reInit()
|
||||
if err == nil {
|
||||
return f.remoteStorage.DiskInfo()
|
||||
}
|
||||
func (f *retryStorage) DiskInfo() (info disk.Info, err error) {
|
||||
if f.IsOffline() {
|
||||
return info, errDiskNotFound
|
||||
}
|
||||
return info, err
|
||||
info, err = f.remoteStorage.DiskInfo()
|
||||
if f.reInitUponDiskNotFound(err) {
|
||||
info, err = f.remoteStorage.DiskInfo()
|
||||
return info, retryToStorageErr(err)
|
||||
}
|
||||
return info, retryToStorageErr(err)
|
||||
}
|
||||
|
||||
// MakeVol - a retryable implementation of creating a volume.
|
||||
func (f retryStorage) MakeVol(volume string) (err error) {
|
||||
err = f.remoteStorage.MakeVol(volume)
|
||||
if err == errDiskNotFound {
|
||||
err = f.reInit()
|
||||
if err == nil {
|
||||
return f.remoteStorage.MakeVol(volume)
|
||||
}
|
||||
func (f *retryStorage) MakeVol(volume string) (err error) {
|
||||
if f.IsOffline() {
|
||||
return errDiskNotFound
|
||||
}
|
||||
return err
|
||||
err = f.remoteStorage.MakeVol(volume)
|
||||
if f.reInitUponDiskNotFound(err) {
|
||||
return retryToStorageErr(f.remoteStorage.MakeVol(volume))
|
||||
}
|
||||
return retryToStorageErr(err)
|
||||
}
|
||||
|
||||
// ListVols - a retryable implementation of listing all the volumes.
|
||||
func (f retryStorage) ListVols() (vols []VolInfo, err error) {
|
||||
vols, err = f.remoteStorage.ListVols()
|
||||
if err == errDiskNotFound {
|
||||
err = f.reInit()
|
||||
if err == nil {
|
||||
return f.remoteStorage.ListVols()
|
||||
}
|
||||
func (f *retryStorage) ListVols() (vols []VolInfo, err error) {
|
||||
if f.IsOffline() {
|
||||
return vols, errDiskNotFound
|
||||
}
|
||||
return vols, err
|
||||
vols, err = f.remoteStorage.ListVols()
|
||||
if f.reInitUponDiskNotFound(err) {
|
||||
vols, err = f.remoteStorage.ListVols()
|
||||
return vols, retryToStorageErr(err)
|
||||
}
|
||||
return vols, retryToStorageErr(err)
|
||||
}
|
||||
|
||||
// StatVol - a retryable implementation of stating a volume.
|
||||
func (f retryStorage) StatVol(volume string) (vol VolInfo, err error) {
|
||||
vol, err = f.remoteStorage.StatVol(volume)
|
||||
if err == errDiskNotFound {
|
||||
err = f.reInit()
|
||||
if err == nil {
|
||||
return f.remoteStorage.StatVol(volume)
|
||||
}
|
||||
func (f *retryStorage) StatVol(volume string) (vol VolInfo, err error) {
|
||||
if f.IsOffline() {
|
||||
return vol, errDiskNotFound
|
||||
}
|
||||
return vol, err
|
||||
vol, err = f.remoteStorage.StatVol(volume)
|
||||
if f.reInitUponDiskNotFound(err) {
|
||||
vol, err = f.remoteStorage.StatVol(volume)
|
||||
return vol, retryToStorageErr(err)
|
||||
}
|
||||
return vol, retryToStorageErr(err)
|
||||
}
|
||||
|
||||
// DeleteVol - a retryable implementation of deleting a volume.
|
||||
func (f retryStorage) DeleteVol(volume string) (err error) {
|
||||
err = f.remoteStorage.DeleteVol(volume)
|
||||
if err == errDiskNotFound {
|
||||
err = f.reInit()
|
||||
if err == nil {
|
||||
return f.remoteStorage.DeleteVol(volume)
|
||||
}
|
||||
func (f *retryStorage) DeleteVol(volume string) (err error) {
|
||||
if f.IsOffline() {
|
||||
return errDiskNotFound
|
||||
}
|
||||
return err
|
||||
err = f.remoteStorage.DeleteVol(volume)
|
||||
if f.reInitUponDiskNotFound(err) {
|
||||
return retryToStorageErr(f.remoteStorage.DeleteVol(volume))
|
||||
}
|
||||
return retryToStorageErr(err)
|
||||
}
|
||||
|
||||
// PrepareFile - a retryable implementation of preparing a file.
|
||||
func (f retryStorage) PrepareFile(volume, path string, length int64) (err error) {
|
||||
err = f.remoteStorage.PrepareFile(volume, path, length)
|
||||
if err == errDiskNotFound {
|
||||
err = f.reInit()
|
||||
if err == nil {
|
||||
return f.remoteStorage.PrepareFile(volume, path, length)
|
||||
}
|
||||
func (f *retryStorage) PrepareFile(volume, path string, length int64) (err error) {
|
||||
if f.IsOffline() {
|
||||
return errDiskNotFound
|
||||
}
|
||||
return err
|
||||
err = f.remoteStorage.PrepareFile(volume, path, length)
|
||||
if f.reInitUponDiskNotFound(err) {
|
||||
return retryToStorageErr(f.remoteStorage.PrepareFile(volume, path, length))
|
||||
}
|
||||
return retryToStorageErr(err)
|
||||
}
|
||||
|
||||
// AppendFile - a retryable implementation of append to a file.
|
||||
func (f retryStorage) AppendFile(volume, path string, buffer []byte) (err error) {
|
||||
err = f.remoteStorage.AppendFile(volume, path, buffer)
|
||||
if err == errDiskNotFound {
|
||||
err = f.reInit()
|
||||
if err == nil {
|
||||
return f.remoteStorage.AppendFile(volume, path, buffer)
|
||||
}
|
||||
func (f *retryStorage) AppendFile(volume, path string, buffer []byte) (err error) {
|
||||
if f.IsOffline() {
|
||||
return errDiskNotFound
|
||||
}
|
||||
return err
|
||||
err = f.remoteStorage.AppendFile(volume, path, buffer)
|
||||
if f.reInitUponDiskNotFound(err) {
|
||||
return retryToStorageErr(f.remoteStorage.AppendFile(volume, path, buffer))
|
||||
}
|
||||
return retryToStorageErr(err)
|
||||
}
|
||||
|
||||
// StatFile - a retryable implementation of stating a file.
|
||||
func (f retryStorage) StatFile(volume, path string) (fileInfo FileInfo, err error) {
|
||||
fileInfo, err = f.remoteStorage.StatFile(volume, path)
|
||||
if err == errDiskNotFound {
|
||||
err = f.reInit()
|
||||
if err == nil {
|
||||
return f.remoteStorage.StatFile(volume, path)
|
||||
}
|
||||
func (f *retryStorage) StatFile(volume, path string) (fileInfo FileInfo, err error) {
|
||||
if f.IsOffline() {
|
||||
return fileInfo, errDiskNotFound
|
||||
}
|
||||
return fileInfo, err
|
||||
fileInfo, err = f.remoteStorage.StatFile(volume, path)
|
||||
if f.reInitUponDiskNotFound(err) {
|
||||
fileInfo, err = f.remoteStorage.StatFile(volume, path)
|
||||
return fileInfo, retryToStorageErr(err)
|
||||
}
|
||||
return fileInfo, retryToStorageErr(err)
|
||||
}
|
||||
|
||||
// ReadAll - a retryable implementation of reading all the content from a file.
|
||||
func (f retryStorage) ReadAll(volume, path string) (buf []byte, err error) {
|
||||
buf, err = f.remoteStorage.ReadAll(volume, path)
|
||||
if err == errDiskNotFound {
|
||||
err = f.reInit()
|
||||
if err == nil {
|
||||
return f.remoteStorage.ReadAll(volume, path)
|
||||
}
|
||||
func (f *retryStorage) ReadAll(volume, path string) (buf []byte, err error) {
|
||||
if f.IsOffline() {
|
||||
return buf, errDiskNotFound
|
||||
}
|
||||
return buf, err
|
||||
buf, err = f.remoteStorage.ReadAll(volume, path)
|
||||
if f.reInitUponDiskNotFound(err) {
|
||||
buf, err = f.remoteStorage.ReadAll(volume, path)
|
||||
return buf, retryToStorageErr(err)
|
||||
}
|
||||
return buf, retryToStorageErr(err)
|
||||
}
|
||||
|
||||
// ReadFile - a retryable implementation of reading at offset from a file.
|
||||
func (f retryStorage) ReadFile(volume, path string, offset int64, buffer []byte) (m int64, err error) {
|
||||
m, err = f.remoteStorage.ReadFile(volume, path, offset, buffer)
|
||||
if err == errDiskNotFound {
|
||||
err = f.reInit()
|
||||
if err == nil {
|
||||
return f.remoteStorage.ReadFile(volume, path, offset, buffer)
|
||||
}
|
||||
func (f *retryStorage) ReadFile(volume, path string, offset int64, buffer []byte) (m int64, err error) {
|
||||
if f.IsOffline() {
|
||||
return m, errDiskNotFound
|
||||
}
|
||||
return m, err
|
||||
m, err = f.remoteStorage.ReadFile(volume, path, offset, buffer)
|
||||
if f.reInitUponDiskNotFound(err) {
|
||||
m, err = f.remoteStorage.ReadFile(volume, path, offset, buffer)
|
||||
return m, retryToStorageErr(err)
|
||||
}
|
||||
return m, retryToStorageErr(err)
|
||||
}
|
||||
|
||||
// ReadFileWithVerify - a retryable implementation of reading at
|
||||
// offset from a file with verification.
|
||||
func (f retryStorage) ReadFileWithVerify(volume, path string, offset int64, buffer []byte,
|
||||
func (f *retryStorage) ReadFileWithVerify(volume, path string, offset int64, buffer []byte,
|
||||
algo HashAlgo, expectedHash string) (m int64, err error) {
|
||||
if f.IsOffline() {
|
||||
return m, errDiskNotFound
|
||||
}
|
||||
|
||||
m, err = f.remoteStorage.ReadFileWithVerify(volume, path, offset, buffer,
|
||||
algo, expectedHash)
|
||||
if err == errDiskNotFound {
|
||||
err = f.reInit()
|
||||
if err == nil {
|
||||
return f.remoteStorage.ReadFileWithVerify(volume, path,
|
||||
offset, buffer, algo, expectedHash)
|
||||
}
|
||||
if f.reInitUponDiskNotFound(err) {
|
||||
m, err = f.remoteStorage.ReadFileWithVerify(volume, path,
|
||||
offset, buffer, algo, expectedHash)
|
||||
return m, retryToStorageErr(err)
|
||||
}
|
||||
return m, err
|
||||
return m, retryToStorageErr(err)
|
||||
}
|
||||
|
||||
// ListDir - a retryable implementation of listing directory entries.
|
||||
func (f retryStorage) ListDir(volume, path string) (entries []string, err error) {
|
||||
entries, err = f.remoteStorage.ListDir(volume, path)
|
||||
if err == errDiskNotFound {
|
||||
err = f.reInit()
|
||||
if err == nil {
|
||||
return f.remoteStorage.ListDir(volume, path)
|
||||
}
|
||||
func (f *retryStorage) ListDir(volume, path string) (entries []string, err error) {
|
||||
if f.IsOffline() {
|
||||
return entries, errDiskNotFound
|
||||
}
|
||||
return entries, err
|
||||
entries, err = f.remoteStorage.ListDir(volume, path)
|
||||
if f.reInitUponDiskNotFound(err) {
|
||||
entries, err = f.remoteStorage.ListDir(volume, path)
|
||||
return entries, retryToStorageErr(err)
|
||||
}
|
||||
return entries, retryToStorageErr(err)
|
||||
}
|
||||
|
||||
// DeleteFile - a retryable implementation of deleting a file.
|
||||
func (f retryStorage) DeleteFile(volume, path string) (err error) {
|
||||
err = f.remoteStorage.DeleteFile(volume, path)
|
||||
if err == errDiskNotFound {
|
||||
err = f.reInit()
|
||||
if err == nil {
|
||||
return f.remoteStorage.DeleteFile(volume, path)
|
||||
}
|
||||
func (f *retryStorage) DeleteFile(volume, path string) (err error) {
|
||||
if f.IsOffline() {
|
||||
return errDiskNotFound
|
||||
}
|
||||
return err
|
||||
err = f.remoteStorage.DeleteFile(volume, path)
|
||||
if f.reInitUponDiskNotFound(err) {
|
||||
return retryToStorageErr(f.remoteStorage.DeleteFile(volume, path))
|
||||
}
|
||||
return retryToStorageErr(err)
|
||||
}
|
||||
|
||||
// RenameFile - a retryable implementation of renaming a file.
|
||||
func (f retryStorage) RenameFile(srcVolume, srcPath, dstVolume, dstPath string) (err error) {
|
||||
err = f.remoteStorage.RenameFile(srcVolume, srcPath, dstVolume, dstPath)
|
||||
if err == errDiskNotFound {
|
||||
err = f.reInit()
|
||||
if err == nil {
|
||||
return f.remoteStorage.RenameFile(srcVolume, srcPath, dstVolume, dstPath)
|
||||
}
|
||||
func (f *retryStorage) RenameFile(srcVolume, srcPath, dstVolume, dstPath string) (err error) {
|
||||
if f.IsOffline() {
|
||||
return errDiskNotFound
|
||||
}
|
||||
return err
|
||||
err = f.remoteStorage.RenameFile(srcVolume, srcPath, dstVolume, dstPath)
|
||||
if f.reInitUponDiskNotFound(err) {
|
||||
return retryToStorageErr(f.remoteStorage.RenameFile(srcVolume, srcPath, dstVolume, dstPath))
|
||||
}
|
||||
return retryToStorageErr(err)
|
||||
}
|
||||
|
||||
// Try to reinitialize the connection when we have some form of DiskNotFound error
|
||||
func (f *retryStorage) reInitUponDiskNotFound(err error) bool {
|
||||
if err == errDiskNotFound || err == errDiskNotFoundFromNetError || err == errDiskNotFoundFromRPCShutdown {
|
||||
return f.reInit(err) == nil
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// Connect and attempt to load the format from a disconnected node,
|
||||
// attempts three times before giving up.
|
||||
func (f retryStorage) reInit() (err error) {
|
||||
func (f *retryStorage) reInit(e error) (err error) {
|
||||
|
||||
// Only after initialization and minimum of one interval
|
||||
// has passed (to prevent marking a node as offline right
|
||||
// after initialization), check whether node has gone offline
|
||||
if f.maxRetryAttempts == globalStorageRetryThreshold &&
|
||||
UTCNow().Sub(f.offlineTimestamp) >= globalStorageHealthCheckInterval {
|
||||
if e == errDiskNotFoundFromNetError { // Make node offline due to network error
|
||||
f.offline = true // Marking node offline
|
||||
f.offlineTimestamp = UTCNow()
|
||||
return errDiskNotFound
|
||||
}
|
||||
// Continue for other errors like RPC shutdown (and retry connection below)
|
||||
}
|
||||
|
||||
// Close the underlying connection.
|
||||
f.remoteStorage.Close() // Error here is purposefully ignored.
|
||||
|
||||
|
@ -20,6 +20,7 @@ import (
|
||||
"bytes"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"errors"
|
||||
"reflect"
|
||||
"testing"
|
||||
"time"
|
||||
@ -423,3 +424,31 @@ func TestRetryStorage(t *testing.T) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Tests reply storage error transformation.
|
||||
func TestReplyStorageErr(t *testing.T) {
|
||||
unknownErr := errors.New("Unknown error")
|
||||
testCases := []struct {
|
||||
expectedErr error
|
||||
err error
|
||||
}{
|
||||
{
|
||||
expectedErr: errDiskNotFound,
|
||||
err: errDiskNotFoundFromNetError,
|
||||
},
|
||||
{
|
||||
expectedErr: errDiskNotFound,
|
||||
err: errDiskNotFoundFromRPCShutdown,
|
||||
},
|
||||
{
|
||||
expectedErr: unknownErr,
|
||||
err: unknownErr,
|
||||
},
|
||||
}
|
||||
for i, testCase := range testCases {
|
||||
resultErr := retryToStorageErr(testCase.err)
|
||||
if testCase.expectedErr != resultErr {
|
||||
t.Errorf("Test %d: Expected %s, got %s", i+1, testCase.expectedErr, resultErr)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -37,9 +37,15 @@ var errUnformattedDisk = errors.New("unformatted disk found")
|
||||
// errDiskFull - cannot create volume or files when disk is full.
|
||||
var errDiskFull = errors.New("disk path full")
|
||||
|
||||
// errDiskNotFount - cannot find the underlying configured disk anymore.
|
||||
// errDiskNotFound - cannot find the underlying configured disk anymore.
|
||||
var errDiskNotFound = errors.New("disk not found")
|
||||
|
||||
// errDiskNotFoundFromNetError - cannot find the underlying configured disk anymore due to network error.
|
||||
var errDiskNotFoundFromNetError = errors.New("disk not found from net error")
|
||||
|
||||
// errDiskNotFoundFromShutdown - cannot find the underlying configured disk anymore due to rpc shutdown.
|
||||
var errDiskNotFoundFromRPCShutdown = errors.New("disk not found from rpc shutdown")
|
||||
|
||||
// errFaultyRemoteDisk - remote disk is faulty.
|
||||
var errFaultyRemoteDisk = errors.New("remote disk is faulty")
|
||||
|
||||
|
@ -45,11 +45,11 @@ func toStorageErr(err error) error {
|
||||
|
||||
switch err.(type) {
|
||||
case *net.OpError:
|
||||
return errDiskNotFound
|
||||
return errDiskNotFoundFromNetError
|
||||
}
|
||||
|
||||
if err == rpc.ErrShutdown {
|
||||
return errDiskNotFound
|
||||
return errDiskNotFoundFromRPCShutdown
|
||||
}
|
||||
|
||||
switch err.Error() {
|
||||
|
@ -127,11 +127,11 @@ func TestStorageErr(t *testing.T) {
|
||||
err: fmt.Errorf("%s", io.ErrUnexpectedEOF.Error()),
|
||||
},
|
||||
{
|
||||
expectedErr: errDiskNotFound,
|
||||
expectedErr: errDiskNotFoundFromNetError,
|
||||
err: &net.OpError{},
|
||||
},
|
||||
{
|
||||
expectedErr: errDiskNotFound,
|
||||
expectedErr: errDiskNotFoundFromRPCShutdown,
|
||||
err: rpc.ErrShutdown,
|
||||
},
|
||||
{
|
||||
|
Loading…
Reference in New Issue
Block a user