fix: lock maintenance should honor quorum (#9138)
The staleness of a lock should be determined by a quorum of entries returning stale; this covers situations where locks are held while nodes are down, so we don't accidentally clear locks that are still valid and correct. Lock maintenance should also be run by all servers, not just one: stale-lock cleanup needs to run outside the requirement of holding a distributed lock. Thanks @klauspost for reproducing this issue.
parent 10fd53d6bb
commit c9212819af
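
Before the diff, a minimal runnable sketch of the quorum rule this commit introduces. The helper shouldPurgeLock and its plain-integer inputs are hypothetical and only illustrative; the real patch keeps per-lock counts in nlripsMap and takes N from globalXLSetDriveCount, as the diff below shows.

package main

import "fmt"

// shouldPurgeLock mirrors the rule from this commit: an endpoint "vouches"
// for a lock when it is offline, when the Expired call errors, or when it
// reports the lock as still active; the lock is purged only when fewer than
// a quorum of endpoints vouch for it.
func shouldPurgeLock(vouches, driveCount int, isWriteLock bool) bool {
	// Read locks assume a quorum of N/2 successes.
	quorum := driveCount / 2
	if isWriteLock {
		// For write locks we need N/2+1 successes.
		quorum = driveCount/2 + 1
	}
	// Fewer vouches than the quorum means the lock is treated as stale.
	return vouches < quorum
}

func main() {
	// 16 drives, write lock, only 3 endpoints vouch for it: purge (true).
	fmt.Println(shouldPurgeLock(3, 16, true))
	// 16 drives, write lock, 9 endpoints vouch for it (>= 16/2+1): keep (false).
	fmt.Println(shouldPurgeLock(9, 16, true))
}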
@@ -225,8 +225,6 @@ func getLongLivedLocks(interval time.Duration) map[Endpoint][]nameLockRequesterI
 	return nlripMap
 }
 
-var lockMaintenanceTimeout = newDynamicTimeout(60*time.Second, time.Second)
-
 // lockMaintenance loops over locks that have been active for some time and checks back
 // with the original server whether it is still alive or not
 //
@@ -235,28 +233,19 @@ var lockMaintenanceTimeout = newDynamicTimeout(60*time.Second, time.Second)
 // - some network error (and server is up normally)
 //
 // We will ignore the error, and we will retry later to get a resolve on this lock
-func lockMaintenance(ctx context.Context, interval time.Duration, objAPI ObjectLayer) error {
-	// Lock to avoid concurrent lock maintenance loops
-	maintenanceLock := objAPI.NewNSLock(ctx, "system", "lock-maintenance-ops")
-	if err := maintenanceLock.GetLock(lockMaintenanceTimeout); err != nil {
-		return err
-	}
-	defer maintenanceLock.Unlock()
-
+func lockMaintenance(ctx context.Context, interval time.Duration) error {
 	// Validate if long lived locks are indeed clean.
 	// Get list of long lived locks to check for staleness.
 	for lendpoint, nlrips := range getLongLivedLocks(interval) {
+		nlripsMap := make(map[string]int, len(nlrips))
 		for _, nlrip := range nlrips {
 			// Locks are only held on first zone, make sure that
 			// we only look for ownership of locks from endpoints
 			// on first zone.
 			for _, endpoint := range globalEndpoints[0].Endpoints {
-				if endpoint.String() == lendpoint.String() {
-					continue
-				}
-
 				c := newLockAPI(endpoint)
 				if !c.IsOnline() {
+					nlripsMap[nlrip.name]++
 					continue
 				}
 
@@ -266,25 +255,37 @@ func lockMaintenance(ctx context.Context, interval time.Duration, objAPI ObjectL
 					UID:       nlrip.lri.UID,
 					Resources: []string{nlrip.name},
 				})
-
 				if err != nil {
+					nlripsMap[nlrip.name]++
 					c.Close()
 					continue
 				}
 
-				// For successful response, verify if lock was indeed active or stale.
-				if expired {
-					// The lock is no longer active at server that originated
-					// the lock, attempt to remove the lock.
-					globalLockServers[lendpoint].mutex.Lock()
-					// Purge the stale entry if it exists.
-					globalLockServers[lendpoint].removeEntryIfExists(nlrip)
-					globalLockServers[lendpoint].mutex.Unlock()
+				if !expired {
+					nlripsMap[nlrip.name]++
 				}
 
 				// Close the connection regardless of the call response.
 				c.Close()
 			}
+
+			// Read locks we assume quorum for be N/2 success
+			quorum := globalXLSetDriveCount / 2
+			if nlrip.lri.Writer {
+				// For write locks we need N/2+1 success
+				quorum = globalXLSetDriveCount/2 + 1
+			}
+
+			// less than the quorum, we have locks expired.
+			if nlripsMap[nlrip.name] < quorum {
+				// The lock is no longer active at server that originated
+				// the lock, attempt to remove the lock.
+				globalLockServers[lendpoint].mutex.Lock()
+				// Purge the stale entry if it exists.
+				globalLockServers[lendpoint].removeEntryIfExists(nlrip)
+				globalLockServers[lendpoint].mutex.Unlock()
+			}
+
 		}
 	}
 
@@ -293,12 +294,13 @@ func lockMaintenance(ctx context.Context, interval time.Duration, objAPI ObjectL
 
 // Start lock maintenance from all lock servers.
 func startLockMaintenance() {
-	var objAPI ObjectLayer
 	var ctx = context.Background()
 
 	// Wait until the object API is ready
+	// no need to start the lock maintenance
+	// if ObjectAPI is not initialized.
 	for {
-		objAPI = newObjectLayerWithoutSafeModeFn()
+		objAPI := newObjectLayerWithoutSafeModeFn()
 		if objAPI == nil {
 			time.Sleep(time.Second)
 			continue
@@ -322,7 +324,7 @@ func startLockMaintenance() {
 		// "synchronous checks" between servers
 		duration := time.Duration(r.Float64() * float64(lockMaintenanceInterval))
 		time.Sleep(duration)
-		if err := lockMaintenance(ctx, lockValidityCheckInterval, objAPI); err != nil {
+		if err := lockMaintenance(ctx, lockValidityCheckInterval); err != nil {
 			// Sleep right after an error.
 			duration := time.Duration(r.Float64() * float64(lockMaintenanceInterval))
 			time.Sleep(duration)
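
On the scheduling side, the commit leaves each server to run its own maintenance pass on a randomized interval rather than serializing passes behind a distributed lock. A small sketch of that jitter, assuming an illustrative lockMaintenanceInterval value (the real constant and the rand source live elsewhere in the server):

package main

import (
	"fmt"
	"math/rand"
	"time"
)

// lockMaintenanceInterval is assumed here for illustration only.
const lockMaintenanceInterval = time.Minute

func main() {
	// Each server sleeps a random fraction of the interval before a pass,
	// so servers avoid "synchronous checks" against each other.
	r := rand.New(rand.NewSource(time.Now().UnixNano()))
	for i := 0; i < 3; i++ {
		duration := time.Duration(r.Float64() * float64(lockMaintenanceInterval))
		fmt.Println("next maintenance pass in", duration)
	}
}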