hscontrol/state: make NodeStore batch configuration tunable (#2886)

This commit is contained in:
Kristoffer Dalby
2025-11-28 16:38:29 +01:00
committed by GitHub
parent 9c4c017eac
commit db293e0698
11 changed files with 267 additions and 140 deletions

View File

@@ -20,7 +20,7 @@ func TestEphemeralNodeDeleteWithConcurrentUpdate(t *testing.T) {
node := createTestNode(1, 1, "test-user", "test-node")
// Create NodeStore
store := NewNodeStore(nil, allowAllPeersFunc)
store := NewNodeStore(nil, allowAllPeersFunc, TestBatchSize, TestBatchTimeout)
store.Start()
defer store.Stop()
@@ -57,8 +57,6 @@ func TestEphemeralNodeDeleteWithConcurrentUpdate(t *testing.T) {
// Goroutine 2: DeleteNode (simulates handleLogout for ephemeral node)
go func() {
// Small delay to increase chance of batching together
time.Sleep(1 * time.Millisecond)
store.DeleteNode(node.ID)
done <- true
}()
@@ -67,15 +65,11 @@ func TestEphemeralNodeDeleteWithConcurrentUpdate(t *testing.T) {
<-done
<-done
// Give batching time to complete
time.Sleep(50 * time.Millisecond)
// The key assertion: if UpdateNode and DeleteNode were batched together
// with DELETE after UPDATE, then UpdateNode should return an invalid node
// OR it should return a valid node but the node should no longer exist in the store
_, found = store.GetNode(node.ID)
assert.False(t, found, "node should be deleted from NodeStore")
// Verify node is eventually deleted
require.EventuallyWithT(t, func(c *assert.CollectT) {
_, found = store.GetNode(node.ID)
assert.False(c, found, "node should be deleted from NodeStore")
}, 1*time.Second, 10*time.Millisecond, "waiting for node to be deleted")
// If the update happened before delete in the batch, the returned node might be invalid
if updateOk {
@@ -95,22 +89,21 @@ func TestEphemeralNodeDeleteWithConcurrentUpdate(t *testing.T) {
func TestUpdateNodeReturnsInvalidWhenDeletedInSameBatch(t *testing.T) {
node := createTestNode(2, 1, "test-user", "test-node-2")
store := NewNodeStore(nil, allowAllPeersFunc)
// Use batch size of 2 to guarantee UpdateNode and DeleteNode batch together
store := NewNodeStore(nil, allowAllPeersFunc, 2, TestBatchTimeout)
store.Start()
defer store.Stop()
// Put node in store
_ = store.PutNode(node)
// Simulate the exact sequence: UpdateNode gets queued, then DeleteNode gets queued,
// they batch together, and we check what UpdateNode returns
// Queue UpdateNode and DeleteNode - with batch size of 2, they will batch together
resultChan := make(chan struct {
node types.NodeView
ok bool
})
// Start UpdateNode - it will block until batch is applied
// Start UpdateNode in goroutine - it will queue and wait for batch
go func() {
node, ok := store.UpdateNode(node.ID, func(n *types.Node) {
n.LastSeen = ptr.To(time.Now())
@@ -121,18 +114,15 @@ func TestUpdateNodeReturnsInvalidWhenDeletedInSameBatch(t *testing.T) {
}{node, ok}
}()
// Give UpdateNode a moment to queue its work
time.Sleep(5 * time.Millisecond)
// Now queue DeleteNode - should batch with the UPDATE
store.DeleteNode(node.ID)
// Start DeleteNode in goroutine - it will queue and trigger batch processing
// Since batch size is 2, both operations will be processed together
go func() {
store.DeleteNode(node.ID)
}()
// Get the result from UpdateNode
result := <-resultChan
// Wait for batch to complete
time.Sleep(50 * time.Millisecond)
// Node should be deleted
_, found := store.GetNode(node.ID)
assert.False(t, found, "node should be deleted")
@@ -157,7 +147,7 @@ func TestUpdateNodeReturnsInvalidWhenDeletedInSameBatch(t *testing.T) {
func TestPersistNodeToDBPreventsRaceCondition(t *testing.T) {
node := createTestNode(3, 1, "test-user", "test-node-3")
store := NewNodeStore(nil, allowAllPeersFunc)
store := NewNodeStore(nil, allowAllPeersFunc, TestBatchSize, TestBatchTimeout)
store.Start()
defer store.Stop()
@@ -174,12 +164,11 @@ func TestPersistNodeToDBPreventsRaceCondition(t *testing.T) {
// Now delete the node (simulating ephemeral logout happening concurrently)
store.DeleteNode(node.ID)
// Wait for deletion to complete
time.Sleep(50 * time.Millisecond)
// Verify node is deleted
_, found := store.GetNode(node.ID)
require.False(t, found, "node should be deleted")
// Verify node is eventually deleted
require.EventuallyWithT(t, func(c *assert.CollectT) {
_, found := store.GetNode(node.ID)
assert.False(c, found, "node should be deleted")
}, 1*time.Second, 10*time.Millisecond, "waiting for node to be deleted")
// Now try to use the updatedNode from before the deletion
// In the old code, this would re-insert the node into the database
@@ -213,7 +202,8 @@ func TestEphemeralNodeLogoutRaceCondition(t *testing.T) {
Ephemeral: true,
}
store := NewNodeStore(nil, allowAllPeersFunc)
// Use batch size of 2 to guarantee UpdateNode and DeleteNode batch together
store := NewNodeStore(nil, allowAllPeersFunc, 2, TestBatchTimeout)
store.Start()
defer store.Stop()
@@ -238,7 +228,6 @@ func TestEphemeralNodeLogoutRaceCondition(t *testing.T) {
// Goroutine 2: DeleteNode (simulates handleLogout for ephemeral node)
go func() {
time.Sleep(1 * time.Millisecond) // Slight delay to batch operations
store.DeleteNode(ephemeralNode.ID)
done <- true
}()
@@ -247,12 +236,11 @@ func TestEphemeralNodeLogoutRaceCondition(t *testing.T) {
<-done
<-done
// Give batching time to complete
time.Sleep(50 * time.Millisecond)
// Node should be deleted from store
_, found := store.GetNode(ephemeralNode.ID)
assert.False(t, found, "ephemeral node should be deleted from NodeStore")
// Verify node is eventually deleted
require.EventuallyWithT(t, func(c *assert.CollectT) {
_, found := store.GetNode(ephemeralNode.ID)
assert.False(c, found, "ephemeral node should be deleted from NodeStore")
}, 1*time.Second, 10*time.Millisecond, "waiting for ephemeral node to be deleted")
// Critical assertion: if UpdateNode returned before DeleteNode completed,
// the updatedNode might be valid but the node is actually deleted.
@@ -288,51 +276,57 @@ func TestUpdateNodeFromMapRequestEphemeralLogoutSequence(t *testing.T) {
Ephemeral: true,
}
store := NewNodeStore(nil, allowAllPeersFunc)
// Use batch size of 2 to guarantee UpdateNode and DeleteNode batch together
// Use batch size of 2 to guarantee UpdateNode and DeleteNode batch together
store := NewNodeStore(nil, allowAllPeersFunc, 2, TestBatchTimeout)
store.Start()
defer store.Stop()
// Initial state: ephemeral node exists
// Put ephemeral node in store
_ = store.PutNode(ephemeralNode)
// Step 1: UpdateNodeFromMapRequest calls UpdateNode
// (simulating client sending MapRequest with endpoint updates)
updateStarted := make(chan bool)
var updatedNode types.NodeView
var updateOk bool
updateResult := make(chan struct {
node types.NodeView
ok bool
})
go func() {
updateStarted <- true
updatedNode, updateOk = store.UpdateNode(ephemeralNode.ID, func(n *types.Node) {
node, ok := store.UpdateNode(ephemeralNode.ID, func(n *types.Node) {
n.LastSeen = ptr.To(time.Now())
endpoint := netip.MustParseAddrPort("10.0.0.1:41641")
n.Endpoints = []netip.AddrPort{endpoint}
})
updateResult <- struct {
node types.NodeView
ok bool
}{node, ok}
}()
<-updateStarted
// Small delay to ensure UpdateNode is queued
time.Sleep(5 * time.Millisecond)
// Step 2: Logout happens - handleLogout calls DeleteNode
// (simulating client sending logout with past expiry)
store.DeleteNode(ephemeralNode.ID)
// With batch size of 2, this will trigger batch processing with UpdateNode
go func() {
store.DeleteNode(ephemeralNode.ID)
}()
// Wait for batching to complete
time.Sleep(50 * time.Millisecond)
// Step 3: Wait and verify node is eventually deleted
require.EventuallyWithT(t, func(c *assert.CollectT) {
_, nodeExists := store.GetNode(ephemeralNode.ID)
assert.False(c, nodeExists, "ephemeral node must be deleted after logout")
}, 1*time.Second, 10*time.Millisecond, "waiting for ephemeral node to be deleted")
// Step 3: Check results
_, nodeExists := store.GetNode(ephemeralNode.ID)
assert.False(t, nodeExists, "ephemeral node must be deleted after logout")
// Step 4: Get the update result
result := <-updateResult
// Step 4: Simulate what happens if we try to persist the updatedNode
if updateOk && updatedNode.Valid() {
// Simulate what happens if we try to persist the updatedNode
if result.ok && result.node.Valid() {
// This is the problematic path - UpdateNode returned a valid node
// but the node was deleted in the same batch
t.Log("UpdateNode returned valid node even though node was deleted")
// The fix: persistNodeToDB must check NodeStore before persisting
_, checkExists := store.GetNode(updatedNode.ID())
_, checkExists := store.GetNode(result.node.ID())
if checkExists {
t.Error("BUG: Node still exists in NodeStore after deletion - should be impossible")
} else {
@@ -353,14 +347,15 @@ func TestUpdateNodeFromMapRequestEphemeralLogoutSequence(t *testing.T) {
func TestUpdateNodeDeletedInSameBatchReturnsInvalid(t *testing.T) {
node := createTestNode(6, 1, "test-user", "test-node-6")
store := NewNodeStore(nil, allowAllPeersFunc)
// Use batch size of 2 to guarantee UpdateNode and DeleteNode batch together
store := NewNodeStore(nil, allowAllPeersFunc, 2, TestBatchTimeout)
store.Start()
defer store.Stop()
// Put node in store
_ = store.PutNode(node)
// Queue UpdateNode
// Queue UpdateNode and DeleteNode - with batch size of 2, they will batch together
updateDone := make(chan struct {
node types.NodeView
ok bool
@@ -376,18 +371,14 @@ func TestUpdateNodeDeletedInSameBatchReturnsInvalid(t *testing.T) {
}{updatedNode, ok}
}()
// Small delay to ensure UpdateNode is queued
time.Sleep(5 * time.Millisecond)
// Queue DeleteNode - should batch with UpdateNode
store.DeleteNode(node.ID)
// Queue DeleteNode - with batch size of 2, this triggers batch processing
go func() {
store.DeleteNode(node.ID)
}()
// Get UpdateNode result
result := <-updateDone
// Wait for batch to complete
time.Sleep(50 * time.Millisecond)
// Node should be deleted
_, exists := store.GetNode(node.ID)
assert.False(t, exists, "node should be deleted from store")
@@ -417,30 +408,28 @@ func TestPersistNodeToDBChecksNodeStoreBeforePersist(t *testing.T) {
Ephemeral: true,
}
store := NewNodeStore(nil, allowAllPeersFunc)
store := NewNodeStore(nil, allowAllPeersFunc, TestBatchSize, TestBatchTimeout)
store.Start()
defer store.Stop()
// Put node in store
// Put node
_ = store.PutNode(ephemeralNode)
// Simulate the race:
// 1. UpdateNode is called (from UpdateNodeFromMapRequest)
// UpdateNode returns a node
updatedNode, ok := store.UpdateNode(ephemeralNode.ID, func(n *types.Node) {
n.LastSeen = ptr.To(time.Now())
})
require.True(t, ok, "UpdateNode should succeed")
require.True(t, updatedNode.Valid(), "UpdateNode should return valid node")
require.True(t, updatedNode.Valid(), "updated node should be valid")
// 2. Node is deleted (from handleLogout for ephemeral node)
// Delete the node
store.DeleteNode(ephemeralNode.ID)
// Wait for deletion
time.Sleep(50 * time.Millisecond)
// 3. Verify node is deleted from store
_, exists := store.GetNode(ephemeralNode.ID)
require.False(t, exists, "node should be deleted from NodeStore")
// Verify node is eventually deleted
require.EventuallyWithT(t, func(c *assert.CollectT) {
_, exists := store.GetNode(ephemeralNode.ID)
assert.False(c, exists, "node should be deleted from NodeStore")
}, 1*time.Second, 10*time.Millisecond, "waiting for node to be deleted")
// 4. Simulate what persistNodeToDB does - check if node still exists
// The fix in persistNodeToDB checks NodeStore before persisting: