mirror of
https://github.com/juanfont/headscale.git
synced 2025-11-29 05:18:48 -05:00
hscontrol/state: make NodeStore batch configuration tunable (#2886)
This commit is contained in:
@@ -20,7 +20,7 @@ func TestEphemeralNodeDeleteWithConcurrentUpdate(t *testing.T) {
|
||||
node := createTestNode(1, 1, "test-user", "test-node")
|
||||
|
||||
// Create NodeStore
|
||||
store := NewNodeStore(nil, allowAllPeersFunc)
|
||||
store := NewNodeStore(nil, allowAllPeersFunc, TestBatchSize, TestBatchTimeout)
|
||||
store.Start()
|
||||
defer store.Stop()
|
||||
|
||||
@@ -57,8 +57,6 @@ func TestEphemeralNodeDeleteWithConcurrentUpdate(t *testing.T) {
|
||||
|
||||
// Goroutine 2: DeleteNode (simulates handleLogout for ephemeral node)
|
||||
go func() {
|
||||
// Small delay to increase chance of batching together
|
||||
time.Sleep(1 * time.Millisecond)
|
||||
store.DeleteNode(node.ID)
|
||||
done <- true
|
||||
}()
|
||||
@@ -67,15 +65,11 @@ func TestEphemeralNodeDeleteWithConcurrentUpdate(t *testing.T) {
|
||||
<-done
|
||||
<-done
|
||||
|
||||
// Give batching time to complete
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
|
||||
// The key assertion: if UpdateNode and DeleteNode were batched together
|
||||
// with DELETE after UPDATE, then UpdateNode should return an invalid node
|
||||
// OR it should return a valid node but the node should no longer exist in the store
|
||||
|
||||
_, found = store.GetNode(node.ID)
|
||||
assert.False(t, found, "node should be deleted from NodeStore")
|
||||
// Verify node is eventually deleted
|
||||
require.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
_, found = store.GetNode(node.ID)
|
||||
assert.False(c, found, "node should be deleted from NodeStore")
|
||||
}, 1*time.Second, 10*time.Millisecond, "waiting for node to be deleted")
|
||||
|
||||
// If the update happened before delete in the batch, the returned node might be invalid
|
||||
if updateOk {
|
||||
@@ -95,22 +89,21 @@ func TestEphemeralNodeDeleteWithConcurrentUpdate(t *testing.T) {
|
||||
func TestUpdateNodeReturnsInvalidWhenDeletedInSameBatch(t *testing.T) {
|
||||
node := createTestNode(2, 1, "test-user", "test-node-2")
|
||||
|
||||
store := NewNodeStore(nil, allowAllPeersFunc)
|
||||
// Use batch size of 2 to guarantee UpdateNode and DeleteNode batch together
|
||||
store := NewNodeStore(nil, allowAllPeersFunc, 2, TestBatchTimeout)
|
||||
store.Start()
|
||||
defer store.Stop()
|
||||
|
||||
// Put node in store
|
||||
_ = store.PutNode(node)
|
||||
|
||||
// Simulate the exact sequence: UpdateNode gets queued, then DeleteNode gets queued,
|
||||
// they batch together, and we check what UpdateNode returns
|
||||
|
||||
// Queue UpdateNode and DeleteNode - with batch size of 2, they will batch together
|
||||
resultChan := make(chan struct {
|
||||
node types.NodeView
|
||||
ok bool
|
||||
})
|
||||
|
||||
// Start UpdateNode - it will block until batch is applied
|
||||
// Start UpdateNode in goroutine - it will queue and wait for batch
|
||||
go func() {
|
||||
node, ok := store.UpdateNode(node.ID, func(n *types.Node) {
|
||||
n.LastSeen = ptr.To(time.Now())
|
||||
@@ -121,18 +114,15 @@ func TestUpdateNodeReturnsInvalidWhenDeletedInSameBatch(t *testing.T) {
|
||||
}{node, ok}
|
||||
}()
|
||||
|
||||
// Give UpdateNode a moment to queue its work
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
|
||||
// Now queue DeleteNode - should batch with the UPDATE
|
||||
store.DeleteNode(node.ID)
|
||||
// Start DeleteNode in goroutine - it will queue and trigger batch processing
|
||||
// Since batch size is 2, both operations will be processed together
|
||||
go func() {
|
||||
store.DeleteNode(node.ID)
|
||||
}()
|
||||
|
||||
// Get the result from UpdateNode
|
||||
result := <-resultChan
|
||||
|
||||
// Wait for batch to complete
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
|
||||
// Node should be deleted
|
||||
_, found := store.GetNode(node.ID)
|
||||
assert.False(t, found, "node should be deleted")
|
||||
@@ -157,7 +147,7 @@ func TestUpdateNodeReturnsInvalidWhenDeletedInSameBatch(t *testing.T) {
|
||||
func TestPersistNodeToDBPreventsRaceCondition(t *testing.T) {
|
||||
node := createTestNode(3, 1, "test-user", "test-node-3")
|
||||
|
||||
store := NewNodeStore(nil, allowAllPeersFunc)
|
||||
store := NewNodeStore(nil, allowAllPeersFunc, TestBatchSize, TestBatchTimeout)
|
||||
store.Start()
|
||||
defer store.Stop()
|
||||
|
||||
@@ -174,12 +164,11 @@ func TestPersistNodeToDBPreventsRaceCondition(t *testing.T) {
|
||||
// Now delete the node (simulating ephemeral logout happening concurrently)
|
||||
store.DeleteNode(node.ID)
|
||||
|
||||
// Wait for deletion to complete
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
|
||||
// Verify node is deleted
|
||||
_, found := store.GetNode(node.ID)
|
||||
require.False(t, found, "node should be deleted")
|
||||
// Verify node is eventually deleted
|
||||
require.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
_, found := store.GetNode(node.ID)
|
||||
assert.False(c, found, "node should be deleted")
|
||||
}, 1*time.Second, 10*time.Millisecond, "waiting for node to be deleted")
|
||||
|
||||
// Now try to use the updatedNode from before the deletion
|
||||
// In the old code, this would re-insert the node into the database
|
||||
@@ -213,7 +202,8 @@ func TestEphemeralNodeLogoutRaceCondition(t *testing.T) {
|
||||
Ephemeral: true,
|
||||
}
|
||||
|
||||
store := NewNodeStore(nil, allowAllPeersFunc)
|
||||
// Use batch size of 2 to guarantee UpdateNode and DeleteNode batch together
|
||||
store := NewNodeStore(nil, allowAllPeersFunc, 2, TestBatchTimeout)
|
||||
store.Start()
|
||||
defer store.Stop()
|
||||
|
||||
@@ -238,7 +228,6 @@ func TestEphemeralNodeLogoutRaceCondition(t *testing.T) {
|
||||
|
||||
// Goroutine 2: DeleteNode (simulates handleLogout for ephemeral node)
|
||||
go func() {
|
||||
time.Sleep(1 * time.Millisecond) // Slight delay to batch operations
|
||||
store.DeleteNode(ephemeralNode.ID)
|
||||
done <- true
|
||||
}()
|
||||
@@ -247,12 +236,11 @@ func TestEphemeralNodeLogoutRaceCondition(t *testing.T) {
|
||||
<-done
|
||||
<-done
|
||||
|
||||
// Give batching time to complete
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
|
||||
// Node should be deleted from store
|
||||
_, found := store.GetNode(ephemeralNode.ID)
|
||||
assert.False(t, found, "ephemeral node should be deleted from NodeStore")
|
||||
// Verify node is eventually deleted
|
||||
require.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
_, found := store.GetNode(ephemeralNode.ID)
|
||||
assert.False(c, found, "ephemeral node should be deleted from NodeStore")
|
||||
}, 1*time.Second, 10*time.Millisecond, "waiting for ephemeral node to be deleted")
|
||||
|
||||
// Critical assertion: if UpdateNode returned before DeleteNode completed,
|
||||
// the updatedNode might be valid but the node is actually deleted.
|
||||
@@ -288,51 +276,57 @@ func TestUpdateNodeFromMapRequestEphemeralLogoutSequence(t *testing.T) {
|
||||
Ephemeral: true,
|
||||
}
|
||||
|
||||
store := NewNodeStore(nil, allowAllPeersFunc)
|
||||
// Use batch size of 2 to guarantee UpdateNode and DeleteNode batch together
|
||||
// Use batch size of 2 to guarantee UpdateNode and DeleteNode batch together
|
||||
store := NewNodeStore(nil, allowAllPeersFunc, 2, TestBatchTimeout)
|
||||
store.Start()
|
||||
defer store.Stop()
|
||||
|
||||
// Initial state: ephemeral node exists
|
||||
// Put ephemeral node in store
|
||||
_ = store.PutNode(ephemeralNode)
|
||||
|
||||
// Step 1: UpdateNodeFromMapRequest calls UpdateNode
|
||||
// (simulating client sending MapRequest with endpoint updates)
|
||||
updateStarted := make(chan bool)
|
||||
var updatedNode types.NodeView
|
||||
var updateOk bool
|
||||
updateResult := make(chan struct {
|
||||
node types.NodeView
|
||||
ok bool
|
||||
})
|
||||
|
||||
go func() {
|
||||
updateStarted <- true
|
||||
updatedNode, updateOk = store.UpdateNode(ephemeralNode.ID, func(n *types.Node) {
|
||||
node, ok := store.UpdateNode(ephemeralNode.ID, func(n *types.Node) {
|
||||
n.LastSeen = ptr.To(time.Now())
|
||||
endpoint := netip.MustParseAddrPort("10.0.0.1:41641")
|
||||
n.Endpoints = []netip.AddrPort{endpoint}
|
||||
})
|
||||
updateResult <- struct {
|
||||
node types.NodeView
|
||||
ok bool
|
||||
}{node, ok}
|
||||
}()
|
||||
|
||||
<-updateStarted
|
||||
// Small delay to ensure UpdateNode is queued
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
|
||||
// Step 2: Logout happens - handleLogout calls DeleteNode
|
||||
// (simulating client sending logout with past expiry)
|
||||
store.DeleteNode(ephemeralNode.ID)
|
||||
// With batch size of 2, this will trigger batch processing with UpdateNode
|
||||
go func() {
|
||||
store.DeleteNode(ephemeralNode.ID)
|
||||
}()
|
||||
|
||||
// Wait for batching to complete
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
// Step 3: Wait and verify node is eventually deleted
|
||||
require.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
_, nodeExists := store.GetNode(ephemeralNode.ID)
|
||||
assert.False(c, nodeExists, "ephemeral node must be deleted after logout")
|
||||
}, 1*time.Second, 10*time.Millisecond, "waiting for ephemeral node to be deleted")
|
||||
|
||||
// Step 3: Check results
|
||||
_, nodeExists := store.GetNode(ephemeralNode.ID)
|
||||
assert.False(t, nodeExists, "ephemeral node must be deleted after logout")
|
||||
// Step 4: Get the update result
|
||||
result := <-updateResult
|
||||
|
||||
// Step 4: Simulate what happens if we try to persist the updatedNode
|
||||
if updateOk && updatedNode.Valid() {
|
||||
// Simulate what happens if we try to persist the updatedNode
|
||||
if result.ok && result.node.Valid() {
|
||||
// This is the problematic path - UpdateNode returned a valid node
|
||||
// but the node was deleted in the same batch
|
||||
t.Log("UpdateNode returned valid node even though node was deleted")
|
||||
|
||||
// The fix: persistNodeToDB must check NodeStore before persisting
|
||||
_, checkExists := store.GetNode(updatedNode.ID())
|
||||
_, checkExists := store.GetNode(result.node.ID())
|
||||
if checkExists {
|
||||
t.Error("BUG: Node still exists in NodeStore after deletion - should be impossible")
|
||||
} else {
|
||||
@@ -353,14 +347,15 @@ func TestUpdateNodeFromMapRequestEphemeralLogoutSequence(t *testing.T) {
|
||||
func TestUpdateNodeDeletedInSameBatchReturnsInvalid(t *testing.T) {
|
||||
node := createTestNode(6, 1, "test-user", "test-node-6")
|
||||
|
||||
store := NewNodeStore(nil, allowAllPeersFunc)
|
||||
// Use batch size of 2 to guarantee UpdateNode and DeleteNode batch together
|
||||
store := NewNodeStore(nil, allowAllPeersFunc, 2, TestBatchTimeout)
|
||||
store.Start()
|
||||
defer store.Stop()
|
||||
|
||||
// Put node in store
|
||||
_ = store.PutNode(node)
|
||||
|
||||
// Queue UpdateNode
|
||||
// Queue UpdateNode and DeleteNode - with batch size of 2, they will batch together
|
||||
updateDone := make(chan struct {
|
||||
node types.NodeView
|
||||
ok bool
|
||||
@@ -376,18 +371,14 @@ func TestUpdateNodeDeletedInSameBatchReturnsInvalid(t *testing.T) {
|
||||
}{updatedNode, ok}
|
||||
}()
|
||||
|
||||
// Small delay to ensure UpdateNode is queued
|
||||
time.Sleep(5 * time.Millisecond)
|
||||
|
||||
// Queue DeleteNode - should batch with UpdateNode
|
||||
store.DeleteNode(node.ID)
|
||||
// Queue DeleteNode - with batch size of 2, this triggers batch processing
|
||||
go func() {
|
||||
store.DeleteNode(node.ID)
|
||||
}()
|
||||
|
||||
// Get UpdateNode result
|
||||
result := <-updateDone
|
||||
|
||||
// Wait for batch to complete
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
|
||||
// Node should be deleted
|
||||
_, exists := store.GetNode(node.ID)
|
||||
assert.False(t, exists, "node should be deleted from store")
|
||||
@@ -417,30 +408,28 @@ func TestPersistNodeToDBChecksNodeStoreBeforePersist(t *testing.T) {
|
||||
Ephemeral: true,
|
||||
}
|
||||
|
||||
store := NewNodeStore(nil, allowAllPeersFunc)
|
||||
store := NewNodeStore(nil, allowAllPeersFunc, TestBatchSize, TestBatchTimeout)
|
||||
store.Start()
|
||||
defer store.Stop()
|
||||
|
||||
// Put node in store
|
||||
// Put node
|
||||
_ = store.PutNode(ephemeralNode)
|
||||
|
||||
// Simulate the race:
|
||||
// 1. UpdateNode is called (from UpdateNodeFromMapRequest)
|
||||
// UpdateNode returns a node
|
||||
updatedNode, ok := store.UpdateNode(ephemeralNode.ID, func(n *types.Node) {
|
||||
n.LastSeen = ptr.To(time.Now())
|
||||
})
|
||||
require.True(t, ok, "UpdateNode should succeed")
|
||||
require.True(t, updatedNode.Valid(), "UpdateNode should return valid node")
|
||||
require.True(t, updatedNode.Valid(), "updated node should be valid")
|
||||
|
||||
// 2. Node is deleted (from handleLogout for ephemeral node)
|
||||
// Delete the node
|
||||
store.DeleteNode(ephemeralNode.ID)
|
||||
|
||||
// Wait for deletion
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
|
||||
// 3. Verify node is deleted from store
|
||||
_, exists := store.GetNode(ephemeralNode.ID)
|
||||
require.False(t, exists, "node should be deleted from NodeStore")
|
||||
// Verify node is eventually deleted
|
||||
require.EventuallyWithT(t, func(c *assert.CollectT) {
|
||||
_, exists := store.GetNode(ephemeralNode.ID)
|
||||
assert.False(c, exists, "node should be deleted from NodeStore")
|
||||
}, 1*time.Second, 10*time.Millisecond, "waiting for node to be deleted")
|
||||
|
||||
// 4. Simulate what persistNodeToDB does - check if node still exists
|
||||
// The fix in persistNodeToDB checks NodeStore before persisting:
|
||||
|
||||
Reference in New Issue
Block a user