Mirror of https://github.com/minio/minio.git
Fix a bug in dsync initialization and communication (#5428)

In the current implementation we created as many dsync clients as there are endpoints (host plus path), which is not the intended design. Dsync was meant to operate per endpoint host only: with 4 servers of 4 disks each, there should be only 4 dsync clients and 4 dsync servers. Instead we currently had 8 clients and servers, which is unexpected and should be avoided. This PR brings the implementation back to its original intention. The issue was found in #5160.
committed by kannappanr
parent bb73c84b10
commit f3f09ed14e
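To make the intent above concrete — one dsync client per endpoint host, not one per host-and-path pair — here is a small self-contained Go sketch (not MinIO code; the Endpoint type and all names here are hypothetical) showing how a 4-server, 4-disks-per-server endpoint list collapses to 4 unique hosts:

package main

import "fmt"

// Endpoint is a hypothetical stand-in for MinIO's endpoint (host + disk path).
type Endpoint struct {
	Host string
	Path string
}

// uniqueLockHosts keeps one entry per distinct host, which is how many dsync
// clients (and lock servers) the fix intends to create.
func uniqueLockHosts(endpoints []Endpoint) []string {
	seen := make(map[string]bool)
	var hosts []string
	for _, ep := range endpoints {
		if seen[ep.Host] {
			continue // same host, different disk: reuse the existing client
		}
		seen[ep.Host] = true
		hosts = append(hosts, ep.Host)
	}
	return hosts
}

func main() {
	var endpoints []Endpoint
	// 4 servers with 4 disks each: 16 endpoints, but only 4 distinct hosts.
	for srv := 1; srv <= 4; srv++ {
		for disk := 1; disk <= 4; disk++ {
			endpoints = append(endpoints, Endpoint{
				Host: fmt.Sprintf("server%d:9000", srv),
				Path: fmt.Sprintf("/disk%d", disk),
			})
		}
	}
	fmt.Println(len(uniqueLockHosts(endpoints))) // prints 4
}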
@@ -19,7 +19,6 @@ package cmd
 import (
 	"fmt"
 	"math/rand"
-	"path"
 	"sync"
 	"time"
 
@@ -64,49 +63,45 @@ type lockServer struct {
 }
 
 // Start lock maintenance from all lock servers.
-func startLockMaintenance(lockServers []*lockServer) {
-	for _, locker := range lockServers {
-		// Start loop for stale lock maintenance
-		go func(lk *lockServer) {
-			// Initialize a new ticker with a minute between each ticks.
-			ticker := time.NewTicker(lockMaintenanceInterval)
+func startLockMaintenance(lkSrv *lockServer) {
+	// Start loop for stale lock maintenance
+	go func(lk *lockServer) {
+		// Initialize a new ticker with a minute between each ticks.
+		ticker := time.NewTicker(lockMaintenanceInterval)
 
-			// Start with random sleep time, so as to avoid "synchronous checks" between servers
-			time.Sleep(time.Duration(rand.Float64() * float64(lockMaintenanceInterval)))
-			for {
-				// Verifies every minute for locks held more than 2minutes.
-				select {
-				case <-globalServiceDoneCh:
-					// Stop the timer upon service closure and cleanup the go-routine.
-					ticker.Stop()
-					return
-				case <-ticker.C:
-					lk.lockMaintenance(lockValidityCheckInterval)
-				}
-			}
-		}(locker)
-	}
-}
+		// Start with random sleep time, so as to avoid "synchronous checks" between servers
+		time.Sleep(time.Duration(rand.Float64() * float64(lockMaintenanceInterval)))
+		for {
+			// Verifies every minute for locks held more than 2minutes.
+			select {
+			case <-globalServiceDoneCh:
+				// Stop the timer upon service closure and cleanup the go-routine.
+				ticker.Stop()
+				return
+			case <-ticker.C:
+				lk.lockMaintenance(lockValidityCheckInterval)
+			}
+		}
+	}(lkSrv)
+}
 
 // Register distributed NS lock handlers.
 func registerDistNSLockRouter(mux *router.Router, endpoints EndpointList) error {
 	// Start lock maintenance from all lock servers.
-	startLockMaintenance(globalLockServers)
+	startLockMaintenance(globalLockServer)
 
 	// Register initialized lock servers to their respective rpc endpoints.
-	return registerStorageLockers(mux, globalLockServers)
+	return registerStorageLockers(mux, globalLockServer)
 }
 
 // registerStorageLockers - register locker rpc handlers for net/rpc library clients
-func registerStorageLockers(mux *router.Router, lockServers []*lockServer) error {
-	for _, lockServer := range lockServers {
-		lockRPCServer := newRPCServer()
-		if err := lockRPCServer.RegisterName(lockServiceName, lockServer); err != nil {
-			return errors.Trace(err)
-		}
-		lockRouter := mux.PathPrefix(minioReservedBucketPath).Subrouter()
-		lockRouter.Path(path.Join(lockServicePath, lockServer.ll.serviceEndpoint)).Handler(lockRPCServer)
-	}
+func registerStorageLockers(mux *router.Router, lkSrv *lockServer) error {
+	lockRPCServer := newRPCServer()
+	if err := lockRPCServer.RegisterName(lockServiceName, lkSrv); err != nil {
+		return errors.Trace(err)
+	}
+	lockRouter := mux.PathPrefix(minioReservedBucketPath).Subrouter()
+	lockRouter.Path(lockServicePath).Handler(lockRPCServer)
 	return nil
 }
 
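Note on the registration change above: each node now mounts a single net/rpc lock handler under the reserved lock path, instead of one handler per disk path. A rough, standalone sketch of that routing pattern with gorilla/mux and net/rpc follows; the path constants, service name, and the Locker type are placeholders for illustration, not MinIO's actual values:

package main

import (
	"log"
	"net/http"
	"net/rpc"

	"github.com/gorilla/mux"
)

// Placeholder path constants; MinIO's reserved paths may differ.
const (
	reservedPrefix = "/minio"
	lockPath       = "/lock"
)

// Locker stands in for the lock server type registered over RPC.
type Locker struct{}

// PingArgs and PingReply are placeholder RPC payloads.
type PingArgs struct{}
type PingReply struct{ OK bool }

// Ping is a trivial RPC method so the example has something to serve.
func (l *Locker) Ping(args *PingArgs, reply *PingReply) error {
	reply.OK = true
	return nil
}

func main() {
	rpcServer := rpc.NewServer()
	// One lock server per node, registered once under a fixed service name.
	if err := rpcServer.RegisterName("Dsync", &Locker{}); err != nil {
		log.Fatal(err)
	}

	// Mount the single handler at the reserved lock path; before this change
	// there was one such route per disk path.
	router := mux.NewRouter()
	router.PathPrefix(reservedPrefix).Subrouter().Path(lockPath).Handler(rpcServer)

	log.Fatal(http.ListenAndServe(":9000", router))
}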
@@ -425,8 +425,8 @@ func TestLockRpcServerExpired(t *testing.T) {
 	}
 }
 
-// Test initialization of lock servers.
-func TestLockServers(t *testing.T) {
+// Test initialization of lock server.
+func TestLockServerInit(t *testing.T) {
 	if runtime.GOOS == globalWindowsOSName {
 		return
 	}
@@ -438,8 +438,10 @@ func TestLockServers(t *testing.T) {
 	defer os.RemoveAll(rootPath)
 
 	currentIsDistXL := globalIsDistXL
+	currentLockServer := globalLockServer
 	defer func() {
 		globalIsDistXL = currentIsDistXL
+		globalLockServer = currentLockServer
 	}()
 
 	case1Endpoints := mustGetNewEndpointList(
@@ -468,26 +470,27 @@ func TestLockServers(t *testing.T) {
 
 	globalMinioHost = ""
 	testCases := []struct {
-		isDistXL         bool
-		endpoints        EndpointList
-		totalLockServers int
+		isDistXL  bool
+		endpoints EndpointList
 	}{
 		// Test - 1 one lock server initialized.
-		{true, case1Endpoints, 1},
-		// Test - 2 two servers possible.
-		{true, case2Endpoints, 2},
+		{true, case1Endpoints},
+		// Test - similar endpoint hosts should
+		// converge to single lock server
+		// initialized.
+		{true, case2Endpoints},
 	}
 
 	// Validates lock server initialization.
 	for i, testCase := range testCases {
 		globalIsDistXL = testCase.isDistXL
-		globalLockServers = nil
+		globalLockServer = nil
 		_, _ = newDsyncNodes(testCase.endpoints)
 		if err != nil {
 			t.Fatalf("Got unexpected error initializing lock servers: %v", err)
 		}
-		if len(globalLockServers) != testCase.totalLockServers {
-			t.Fatalf("Test %d: Expected total %d, got %d", i+1, testCase.totalLockServers, len(globalLockServers))
+		if globalLockServer == nil && testCase.isDistXL {
+			t.Errorf("Test %d: Expected initialized lockServer, but got uninitialized", i+1)
 		}
 	}
 }
@@ -28,13 +28,17 @@ import (
 
 	"github.com/minio/dsync"
 	"github.com/minio/lsync"
+	"github.com/minio/minio-go/pkg/set"
 )
 
 // Global name space lock.
 var globalNSMutex *nsLockMap
 
-// Global lock servers
-var globalLockServers []*lockServer
+// Global lock server one per server.
+var globalLockServer *lockServer
+
+// Instance of dsync for distributed clients.
+var globalDsync *dsync.Dsync
 
 // RWLocker - locker interface to introduce GetRLock, RUnlock.
 type RWLocker interface {
@@ -56,39 +60,41 @@ type RWLockerSync interface {
 // Returns lock clients and the node index for the current server.
 func newDsyncNodes(endpoints EndpointList) (clnts []dsync.NetLocker, myNode int) {
 	cred := globalServerConfig.GetCredential()
-	clnts = make([]dsync.NetLocker, len(endpoints))
-	myNode = -1
-	for index, endpoint := range endpoints {
+	seenHosts := set.NewStringSet()
+	for _, endpoint := range endpoints {
+		if seenHosts.Contains(endpoint.Host) {
+			continue
+		}
+		seenHosts.Add(endpoint.Host)
 		if !endpoint.IsLocal {
 			// For a remote endpoints setup a lock RPC client.
-			clnts[index] = newLockRPCClient(authConfig{
+			clnts = append(clnts, newLockRPCClient(authConfig{
 				accessKey:       cred.AccessKey,
 				secretKey:       cred.SecretKey,
 				serverAddr:      endpoint.Host,
 				secureConn:      globalIsSSL,
-				serviceEndpoint: pathutil.Join(minioReservedBucketPath, lockServicePath, endpoint.Path),
+				serviceEndpoint: pathutil.Join(minioReservedBucketPath, lockServicePath),
 				serviceName:     lockServiceName,
-			})
+			}))
 			continue
 		}
 
 		// Local endpoint
-		if myNode == -1 {
-			myNode = index
-		}
+		myNode = len(clnts)
 
 		// For a local endpoint, setup a local lock server to
 		// avoid network requests.
 		localLockServer := lockServer{
 			AuthRPCServer: AuthRPCServer{},
 			ll: localLocker{
 				mutex:           sync.Mutex{},
-				serviceEndpoint: endpoint.Path,
+				serverAddr:      endpoint.Host,
+				serviceEndpoint: pathutil.Join(minioReservedBucketPath, lockServicePath),
 				lockMap:         make(map[string][]lockRequesterInfo),
 			},
 		}
-		globalLockServers = append(globalLockServers, &localLockServer)
-		clnts[index] = &(localLockServer.ll)
+		globalLockServer = &localLockServer
+		clnts = append(clnts, &(localLockServer.ll))
 	}
 
 	return clnts, myNode
@@ -149,7 +155,7 @@ func (n *nsLockMap) lock(volume, path string, lockSource, opsID string, readLock
 		nsLk = &nsLock{
 			RWLockerSync: func() RWLockerSync {
 				if n.isDistXL {
-					return dsync.NewDRWMutex(pathJoin(volume, path))
+					return dsync.NewDRWMutex(pathJoin(volume, path), globalDsync)
 				}
 				return &lsync.LRWMutex{}
 			}(),
@@ -303,7 +309,7 @@ func (n *nsLockMap) ForceUnlock(volume, path string) {
 	// are blocking can now proceed as normal and any new locks will also
 	// participate normally.
 	if n.isDistXL { // For distributed mode, broadcast ForceUnlock message.
-		dsync.NewDRWMutex(pathJoin(volume, path)).ForceUnlock()
+		dsync.NewDRWMutex(pathJoin(volume, path), globalDsync).ForceUnlock()
 	}
 
 	param := nsParam{volume, path}
@@ -186,8 +186,8 @@ func serverMain(ctx *cli.Context) {
 
 	// Set nodes for dsync for distributed setup.
 	if globalIsDistXL {
-		clnts, myNode := newDsyncNodes(globalEndpoints)
-		fatalIf(dsync.Init(clnts, myNode), "Unable to initialize distributed locking clients")
+		globalDsync, err = dsync.New(newDsyncNodes(globalEndpoints))
+		fatalIf(err, "Unable to initialize distributed locking clients")
 	}
 
 	// Initialize name space lock.
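The namespace-lock and server-main hunks work together: dsync is no longer initialized through the package-global dsync.Init call; instead dsync.New returns a *dsync.Dsync instance that is stored in globalDsync and passed explicitly to every dsync.NewDRWMutex call. A minimal sketch of that wiring, written as it would sit inside MinIO's cmd package and using only identifiers that appear in this diff (so it is illustrative rather than a drop-in function; the helper name is hypothetical):

// initDistributedLocking mirrors what serverMain does in this diff.
func initDistributedLocking(endpoints EndpointList) error {
	// One dsync.NetLocker per unique endpoint host, plus the index of the
	// local node's own locker within that slice.
	clnts, myNode := newDsyncNodes(endpoints)

	// dsync.New replaces the old dsync.Init call and hands back an instance.
	ds, err := dsync.New(clnts, myNode)
	if err != nil {
		return err
	}
	globalDsync = ds

	// Distributed locks are then created against that instance,
	// as nsLockMap.lock and ForceUnlock now do.
	_ = dsync.NewDRWMutex("bucket/object", globalDsync)
	return nil
}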