From 56d4d7b8b14c16c8cc0000978c407b0f63556885 Mon Sep 17 00:00:00 2001 From: Anis Elleuch Date: Tue, 11 May 2021 17:19:15 +0100 Subject: [PATCH] MRF: Better detection of non stable disks (#12252) MRF does not detect when a node is disconnected and reconnected quickly this change will ensure that MRF is alerted by comparing the last disk reconnection timestamp with the last MRF check time. Signed-off-by: Anis Elleuch Co-authored-by: Klaus Post --- cmd/erasure-sets.go | 22 +++++++++++++++++----- cmd/naughty-disk_test.go | 5 +++++ cmd/rest/client.go | 9 +++++++++ cmd/storage-interface.go | 5 ++++- cmd/storage-rest-client.go | 5 +++++ cmd/xl-storage-disk-id-check.go | 4 ++++ cmd/xl-storage.go | 4 ++++ 7 files changed, 48 insertions(+), 6 deletions(-) diff --git a/cmd/erasure-sets.go b/cmd/erasure-sets.go index 65619dbc0..5b46ddbd9 100644 --- a/cmd/erasure-sets.go +++ b/cmd/erasure-sets.go @@ -95,16 +95,24 @@ type erasureSets struct { disksStorageInfoCache timedValue - mrfMU sync.Mutex - mrfOperations map[healSource]int + mrfMU sync.Mutex + mrfOperations map[healSource]int + lastConnectDisksOpTime time.Time } -func isEndpointConnected(diskMap map[string]StorageAPI, endpoint string) bool { +// Return false if endpoint is not connected or has been reconnected after last check +func isEndpointConnectionStable(diskMap map[string]StorageAPI, endpoint string, lastCheck time.Time) bool { disk := diskMap[endpoint] if disk == nil { return false } - return disk.IsOnline() + if !disk.IsOnline() { + return false + } + if disk.LastConn().After(lastCheck) { + return false + } + return true } func (s *erasureSets) getDiskMap() map[string]StorageAPI { @@ -196,6 +204,10 @@ func findDiskIndex(refFormat, format *formatErasureV3) (int, int, error) { // connectDisks - attempt to connect all the endpoints, loads format // and re-arranges the disks in proper position. func (s *erasureSets) connectDisks() { + defer func() { + s.lastConnectDisksOpTime = time.Now() + }() + var wg sync.WaitGroup var setsJustConnected = make([]bool, s.setCount) diskMap := s.getDiskMap() @@ -204,7 +216,7 @@ func (s *erasureSets) connectDisks() { if endpoint.IsLocal { diskPath = endpoint.Path } - if isEndpointConnected(diskMap, diskPath) { + if isEndpointConnectionStable(diskMap, diskPath, s.lastConnectDisksOpTime) { continue } wg.Add(1) diff --git a/cmd/naughty-disk_test.go b/cmd/naughty-disk_test.go index 21636f675..1fd83c0d8 100644 --- a/cmd/naughty-disk_test.go +++ b/cmd/naughty-disk_test.go @@ -21,6 +21,7 @@ import ( "context" "io" "sync" + "time" ) // naughtyDisk wraps a POSIX disk and returns programmed errors @@ -55,6 +56,10 @@ func (d *naughtyDisk) IsOnline() bool { return d.disk.IsOnline() } +func (d *naughtyDisk) LastConn() time.Time { + return d.disk.LastConn() +} + func (d *naughtyDisk) IsLocal() bool { return d.disk.IsLocal() } diff --git a/cmd/rest/client.go b/cmd/rest/client.go index 591382121..c82772aa2 100644 --- a/cmd/rest/client.go +++ b/cmd/rest/client.go @@ -75,6 +75,8 @@ func (n *NetworkError) Unwrap() error { // Client - http based RPC client. type Client struct { connected int32 // ref: https://golang.org/pkg/sync/atomic/#pkg-note-BUG + _ int32 // For 64 bits alignment + lastConn int64 // HealthCheckFn is the function set to test for health. // If not set the client will not keep track of health. @@ -196,6 +198,7 @@ func NewClient(url *url.URL, tr http.RoundTripper, newAuthToken func(aud string) url: url, newAuthToken: newAuthToken, connected: online, + lastConn: time.Now().UnixNano(), MaxErrResponseSize: 4096, HealthCheckInterval: 200 * time.Millisecond, HealthCheckTimeout: time.Second, @@ -207,6 +210,11 @@ func (c *Client) IsOnline() bool { return atomic.LoadInt32(&c.connected) == online } +// LastConn returns when the disk was (re-)connected +func (c *Client) LastConn() time.Time { + return time.Unix(0, atomic.LoadInt64(&c.lastConn)) +} + // MarkOffline - will mark a client as being offline and spawns // a goroutine that will attempt to reconnect if HealthCheckFn is set. // returns true if the node changed state from online to offline @@ -223,6 +231,7 @@ func (c *Client) MarkOffline() bool { if c.HealthCheckFn() { if atomic.CompareAndSwapInt32(&c.connected, offline, online) { logger.Info("Client %s online", c.url.String()) + atomic.StoreInt64(&c.lastConn, time.Now().UnixNano()) } return } diff --git a/cmd/storage-interface.go b/cmd/storage-interface.go index c043ec8aa..12875c0c3 100644 --- a/cmd/storage-interface.go +++ b/cmd/storage-interface.go @@ -20,6 +20,7 @@ package cmd import ( "context" "io" + "time" ) // StorageAPI interface. @@ -28,7 +29,9 @@ type StorageAPI interface { String() string // Storage operations. - IsOnline() bool // Returns true if disk is online. + IsOnline() bool // Returns true if disk is online. + LastConn() time.Time // Returns the last time this disk (re)-connected + IsLocal() bool Hostname() string // Returns host name if remote host. diff --git a/cmd/storage-rest-client.go b/cmd/storage-rest-client.go index d2024b968..55258f338 100644 --- a/cmd/storage-rest-client.go +++ b/cmd/storage-rest-client.go @@ -167,6 +167,11 @@ func (client *storageRESTClient) IsOnline() bool { return client.restClient.IsOnline() } +// LastConn - returns when the disk is seen to be connected the last time +func (client *storageRESTClient) LastConn() time.Time { + return client.restClient.LastConn() +} + func (client *storageRESTClient) IsLocal() bool { return false } diff --git a/cmd/xl-storage-disk-id-check.go b/cmd/xl-storage-disk-id-check.go index d30ffa49b..88bed34fa 100644 --- a/cmd/xl-storage-disk-id-check.go +++ b/cmd/xl-storage-disk-id-check.go @@ -138,6 +138,10 @@ func (p *xlStorageDiskIDCheck) IsOnline() bool { return storedDiskID == p.diskID } +func (p *xlStorageDiskIDCheck) LastConn() time.Time { + return p.storage.LastConn() +} + func (p *xlStorageDiskIDCheck) IsLocal() bool { return p.storage.IsLocal() } diff --git a/cmd/xl-storage.go b/cmd/xl-storage.go index cb0c45d8c..a0e6d044b 100644 --- a/cmd/xl-storage.go +++ b/cmd/xl-storage.go @@ -335,6 +335,10 @@ func (s *xlStorage) IsOnline() bool { return true } +func (s *xlStorage) LastConn() time.Time { + return time.Time{} +} + func (s *xlStorage) IsLocal() bool { return true }