Files
headscale/hscontrol/state/ephemeral_test.go

450 lines
16 KiB
Go

package state
import (
"net/netip"
"testing"
"time"
"github.com/juanfont/headscale/hscontrol/types"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"tailscale.com/types/ptr"
)
// TestEphemeralNodeDeleteWithConcurrentUpdate tests the race condition where UpdateNode and DeleteNode
// are called concurrently and may be batched together. This reproduces the issue where ephemeral nodes
// are not properly deleted during logout because UpdateNodeFromMapRequest returns a stale node view
// after the node has been deleted from the NodeStore.
func TestEphemeralNodeDeleteWithConcurrentUpdate(t *testing.T) {
// Create a simple test node
node := createTestNode(1, 1, "test-user", "test-node")
// Create NodeStore
store := NewNodeStore(nil, allowAllPeersFunc, TestBatchSize, TestBatchTimeout)
store.Start()
defer store.Stop()
// Put the node in the store
resultNode := store.PutNode(node)
require.True(t, resultNode.Valid(), "initial PutNode should return valid node")
// Verify node exists
retrievedNode, found := store.GetNode(node.ID)
require.True(t, found)
require.Equal(t, node.ID, retrievedNode.ID())
// Test scenario: UpdateNode is called, returns a node view from the batch,
// but in the same batch a DeleteNode removes the node.
// This simulates what happens when:
// 1. UpdateNodeFromMapRequest calls UpdateNode and gets back updatedNode
// 2. At the same time, handleLogout calls DeleteNode
// 3. They get batched together: [UPDATE, DELETE]
// 4. UPDATE modifies the node, DELETE removes it
// 5. UpdateNode returns a node view based on the state AFTER both operations
// 6. If DELETE came after UPDATE, the returned node should be invalid
done := make(chan bool, 2)
var updatedNode types.NodeView
var updateOk bool
// Goroutine 1: UpdateNode (simulates UpdateNodeFromMapRequest)
go func() {
updatedNode, updateOk = store.UpdateNode(node.ID, func(n *types.Node) {
n.LastSeen = ptr.To(time.Now())
})
done <- true
}()
// Goroutine 2: DeleteNode (simulates handleLogout for ephemeral node)
go func() {
store.DeleteNode(node.ID)
done <- true
}()
// Wait for both operations
<-done
<-done
// Verify node is eventually deleted
require.EventuallyWithT(t, func(c *assert.CollectT) {
_, found = store.GetNode(node.ID)
assert.False(c, found, "node should be deleted from NodeStore")
}, 1*time.Second, 10*time.Millisecond, "waiting for node to be deleted")
// If the update happened before delete in the batch, the returned node might be invalid
if updateOk {
t.Logf("UpdateNode returned ok=true, valid=%v", updatedNode.Valid())
// This is the bug scenario - UpdateNode thinks it succeeded but node is gone
if updatedNode.Valid() {
t.Logf("WARNING: UpdateNode returned valid node but node was deleted - this indicates the race condition bug")
}
} else {
t.Logf("UpdateNode correctly returned ok=false (node deleted in same batch)")
}
}
// TestUpdateNodeReturnsInvalidWhenDeletedInSameBatch specifically tests that when
// UpdateNode and DeleteNode are in the same batch with DELETE after UPDATE,
// the UpdateNode should return an invalid node view.
func TestUpdateNodeReturnsInvalidWhenDeletedInSameBatch(t *testing.T) {
node := createTestNode(2, 1, "test-user", "test-node-2")
// Use batch size of 2 to guarantee UpdateNode and DeleteNode batch together
store := NewNodeStore(nil, allowAllPeersFunc, 2, TestBatchTimeout)
store.Start()
defer store.Stop()
// Put node in store
_ = store.PutNode(node)
// Queue UpdateNode and DeleteNode - with batch size of 2, they will batch together
resultChan := make(chan struct {
node types.NodeView
ok bool
})
// Start UpdateNode in goroutine - it will queue and wait for batch
go func() {
node, ok := store.UpdateNode(node.ID, func(n *types.Node) {
n.LastSeen = ptr.To(time.Now())
})
resultChan <- struct {
node types.NodeView
ok bool
}{node, ok}
}()
// Start DeleteNode in goroutine - it will queue and trigger batch processing
// Since batch size is 2, both operations will be processed together
go func() {
store.DeleteNode(node.ID)
}()
// Get the result from UpdateNode
result := <-resultChan
// Node should be deleted
_, found := store.GetNode(node.ID)
assert.False(t, found, "node should be deleted")
// The critical check: what did UpdateNode return?
// After the commit c6b09289988f34398eb3157e31ba092eb8721a9f,
// UpdateNode returns the node state from the batch.
// If DELETE came after UPDATE in the batch, the node doesn't exist anymore,
// so UpdateNode should return (invalid, false)
t.Logf("UpdateNode returned: ok=%v, valid=%v", result.ok, result.node.Valid())
// This is the expected behavior - if node was deleted in same batch,
// UpdateNode should return invalid node
if result.ok && result.node.Valid() {
t.Error("BUG: UpdateNode returned valid node even though it was deleted in same batch")
}
}
// TestPersistNodeToDBPreventsRaceCondition tests that persistNodeToDB correctly handles
// the race condition where a node is deleted after UpdateNode returns but before
// persistNodeToDB is called. This reproduces the ephemeral node deletion bug.
func TestPersistNodeToDBPreventsRaceCondition(t *testing.T) {
node := createTestNode(3, 1, "test-user", "test-node-3")
store := NewNodeStore(nil, allowAllPeersFunc, TestBatchSize, TestBatchTimeout)
store.Start()
defer store.Stop()
// Put node in store
_ = store.PutNode(node)
// Simulate UpdateNode being called
updatedNode, ok := store.UpdateNode(node.ID, func(n *types.Node) {
n.LastSeen = ptr.To(time.Now())
})
require.True(t, ok, "UpdateNode should succeed")
require.True(t, updatedNode.Valid(), "UpdateNode should return valid node")
// Now delete the node (simulating ephemeral logout happening concurrently)
store.DeleteNode(node.ID)
// Verify node is eventually deleted
require.EventuallyWithT(t, func(c *assert.CollectT) {
_, found := store.GetNode(node.ID)
assert.False(c, found, "node should be deleted")
}, 1*time.Second, 10*time.Millisecond, "waiting for node to be deleted")
// Now try to use the updatedNode from before the deletion
// In the old code, this would re-insert the node into the database
// With our fix, GetNode check in persistNodeToDB should prevent this
// Simulate what persistNodeToDB does - check if node still exists
_, exists := store.GetNode(updatedNode.ID())
if !exists {
t.Log("SUCCESS: persistNodeToDB check would prevent re-insertion of deleted node")
} else {
t.Error("BUG: Node still exists in NodeStore after deletion")
}
// The key assertion: after deletion, attempting to persist the old updatedNode
// should fail because the node no longer exists in NodeStore
assert.False(t, exists, "persistNodeToDB should detect node was deleted and refuse to persist")
}
// TestEphemeralNodeLogoutRaceCondition tests the specific race condition that occurs
// when an ephemeral node logs out. This reproduces the bug where:
// 1. UpdateNodeFromMapRequest calls UpdateNode and receives a node view
// 2. Concurrently, handleLogout is called for the ephemeral node and calls DeleteNode
// 3. UpdateNode and DeleteNode get batched together
// 4. If UpdateNode's result is used to call persistNodeToDB after the deletion,
// the node could be re-inserted into the database even though it was deleted
func TestEphemeralNodeLogoutRaceCondition(t *testing.T) {
ephemeralNode := createTestNode(4, 1, "test-user", "ephemeral-node")
ephemeralNode.AuthKey = &types.PreAuthKey{
ID: 1,
Key: "test-key",
Ephemeral: true,
}
// Use batch size of 2 to guarantee UpdateNode and DeleteNode batch together
store := NewNodeStore(nil, allowAllPeersFunc, 2, TestBatchTimeout)
store.Start()
defer store.Stop()
// Put ephemeral node in store
_ = store.PutNode(ephemeralNode)
// Simulate concurrent operations:
// 1. UpdateNode (from UpdateNodeFromMapRequest during polling)
// 2. DeleteNode (from handleLogout when client sends logout request)
var updatedNode types.NodeView
var updateOk bool
done := make(chan bool, 2)
// Goroutine 1: UpdateNode (simulates UpdateNodeFromMapRequest)
go func() {
updatedNode, updateOk = store.UpdateNode(ephemeralNode.ID, func(n *types.Node) {
n.LastSeen = ptr.To(time.Now())
})
done <- true
}()
// Goroutine 2: DeleteNode (simulates handleLogout for ephemeral node)
go func() {
store.DeleteNode(ephemeralNode.ID)
done <- true
}()
// Wait for both operations
<-done
<-done
// Verify node is eventually deleted
require.EventuallyWithT(t, func(c *assert.CollectT) {
_, found := store.GetNode(ephemeralNode.ID)
assert.False(c, found, "ephemeral node should be deleted from NodeStore")
}, 1*time.Second, 10*time.Millisecond, "waiting for ephemeral node to be deleted")
// Critical assertion: if UpdateNode returned before DeleteNode completed,
// the updatedNode might be valid but the node is actually deleted.
// This is the bug - UpdateNodeFromMapRequest would get a valid node,
// then try to persist it, re-inserting the deleted ephemeral node.
if updateOk && updatedNode.Valid() {
t.Log("UpdateNode returned valid node, but node is deleted - this is the race condition")
// In the real code, this would cause persistNodeToDB to be called with updatedNode
// The fix in persistNodeToDB checks if the node still exists:
_, stillExists := store.GetNode(updatedNode.ID())
assert.False(t, stillExists, "persistNodeToDB should check NodeStore and find node deleted")
} else if !updateOk || !updatedNode.Valid() {
t.Log("UpdateNode correctly returned invalid/not-ok result (delete happened in same batch)")
}
}
// TestUpdateNodeFromMapRequestEphemeralLogoutSequence tests the exact sequence
// that causes ephemeral node logout failures:
// 1. Client sends MapRequest with updated endpoint info
// 2. UpdateNodeFromMapRequest starts processing, calls UpdateNode
// 3. Client sends logout request (past expiry)
// 4. handleLogout calls DeleteNode for ephemeral node
// 5. UpdateNode and DeleteNode batch together
// 6. UpdateNode returns a valid node (from before delete in batch)
// 7. persistNodeToDB is called with the stale valid node
// 8. Node gets re-inserted into database instead of staying deleted
func TestUpdateNodeFromMapRequestEphemeralLogoutSequence(t *testing.T) {
ephemeralNode := createTestNode(5, 1, "test-user", "ephemeral-node-5")
ephemeralNode.AuthKey = &types.PreAuthKey{
ID: 2,
Key: "test-key-2",
Ephemeral: true,
}
// Use batch size of 2 to guarantee UpdateNode and DeleteNode batch together
// Use batch size of 2 to guarantee UpdateNode and DeleteNode batch together
store := NewNodeStore(nil, allowAllPeersFunc, 2, TestBatchTimeout)
store.Start()
defer store.Stop()
// Put ephemeral node in store
_ = store.PutNode(ephemeralNode)
// Step 1: UpdateNodeFromMapRequest calls UpdateNode
// (simulating client sending MapRequest with endpoint updates)
updateResult := make(chan struct {
node types.NodeView
ok bool
})
go func() {
node, ok := store.UpdateNode(ephemeralNode.ID, func(n *types.Node) {
n.LastSeen = ptr.To(time.Now())
endpoint := netip.MustParseAddrPort("10.0.0.1:41641")
n.Endpoints = []netip.AddrPort{endpoint}
})
updateResult <- struct {
node types.NodeView
ok bool
}{node, ok}
}()
// Step 2: Logout happens - handleLogout calls DeleteNode
// With batch size of 2, this will trigger batch processing with UpdateNode
go func() {
store.DeleteNode(ephemeralNode.ID)
}()
// Step 3: Wait and verify node is eventually deleted
require.EventuallyWithT(t, func(c *assert.CollectT) {
_, nodeExists := store.GetNode(ephemeralNode.ID)
assert.False(c, nodeExists, "ephemeral node must be deleted after logout")
}, 1*time.Second, 10*time.Millisecond, "waiting for ephemeral node to be deleted")
// Step 4: Get the update result
result := <-updateResult
// Simulate what happens if we try to persist the updatedNode
if result.ok && result.node.Valid() {
// This is the problematic path - UpdateNode returned a valid node
// but the node was deleted in the same batch
t.Log("UpdateNode returned valid node even though node was deleted")
// The fix: persistNodeToDB must check NodeStore before persisting
_, checkExists := store.GetNode(result.node.ID())
if checkExists {
t.Error("BUG: Node still exists in NodeStore after deletion - should be impossible")
} else {
t.Log("SUCCESS: persistNodeToDB would detect node is deleted and refuse to persist")
}
} else {
t.Log("UpdateNode correctly indicated node was deleted (returned invalid or not-ok)")
}
// Final assertion: node must not exist
_, finalExists := store.GetNode(ephemeralNode.ID)
assert.False(t, finalExists, "ephemeral node must remain deleted")
}
// TestUpdateNodeDeletedInSameBatchReturnsInvalid specifically tests that when
// UpdateNode and DeleteNode are batched together with DELETE after UPDATE,
// UpdateNode returns ok=false to indicate the node was deleted.
func TestUpdateNodeDeletedInSameBatchReturnsInvalid(t *testing.T) {
node := createTestNode(6, 1, "test-user", "test-node-6")
// Use batch size of 2 to guarantee UpdateNode and DeleteNode batch together
store := NewNodeStore(nil, allowAllPeersFunc, 2, TestBatchTimeout)
store.Start()
defer store.Stop()
// Put node in store
_ = store.PutNode(node)
// Queue UpdateNode and DeleteNode - with batch size of 2, they will batch together
updateDone := make(chan struct {
node types.NodeView
ok bool
})
go func() {
updatedNode, ok := store.UpdateNode(node.ID, func(n *types.Node) {
n.LastSeen = ptr.To(time.Now())
})
updateDone <- struct {
node types.NodeView
ok bool
}{updatedNode, ok}
}()
// Queue DeleteNode - with batch size of 2, this triggers batch processing
go func() {
store.DeleteNode(node.ID)
}()
// Get UpdateNode result
result := <-updateDone
// Node should be deleted
_, exists := store.GetNode(node.ID)
assert.False(t, exists, "node should be deleted from store")
// UpdateNode should indicate the node was deleted
// After c6b09289988f34398eb3157e31ba092eb8721a9f, when UPDATE and DELETE
// are in the same batch with DELETE after UPDATE, UpdateNode returns
// the state after the batch is applied - which means the node doesn't exist
assert.False(t, result.ok, "UpdateNode should return ok=false when node deleted in same batch")
assert.False(t, result.node.Valid(), "UpdateNode should return invalid node when node deleted in same batch")
}
// TestPersistNodeToDBChecksNodeStoreBeforePersist verifies that persistNodeToDB
// checks if the node still exists in NodeStore before persisting to database.
// This prevents the race condition where:
// 1. UpdateNodeFromMapRequest calls UpdateNode and gets a valid node
// 2. Ephemeral node logout calls DeleteNode
// 3. UpdateNode and DeleteNode batch together
// 4. UpdateNode returns a valid node (from before delete in batch)
// 5. UpdateNodeFromMapRequest calls persistNodeToDB with the stale node
// 6. persistNodeToDB must detect the node is deleted and refuse to persist
func TestPersistNodeToDBChecksNodeStoreBeforePersist(t *testing.T) {
ephemeralNode := createTestNode(7, 1, "test-user", "ephemeral-node-7")
ephemeralNode.AuthKey = &types.PreAuthKey{
ID: 3,
Key: "test-key-3",
Ephemeral: true,
}
store := NewNodeStore(nil, allowAllPeersFunc, TestBatchSize, TestBatchTimeout)
store.Start()
defer store.Stop()
// Put node
_ = store.PutNode(ephemeralNode)
// UpdateNode returns a node
updatedNode, ok := store.UpdateNode(ephemeralNode.ID, func(n *types.Node) {
n.LastSeen = ptr.To(time.Now())
})
require.True(t, ok, "UpdateNode should succeed")
require.True(t, updatedNode.Valid(), "updated node should be valid")
// Delete the node
store.DeleteNode(ephemeralNode.ID)
// Verify node is eventually deleted
require.EventuallyWithT(t, func(c *assert.CollectT) {
_, exists := store.GetNode(ephemeralNode.ID)
assert.False(c, exists, "node should be deleted from NodeStore")
}, 1*time.Second, 10*time.Millisecond, "waiting for node to be deleted")
// 4. Simulate what persistNodeToDB does - check if node still exists
// The fix in persistNodeToDB checks NodeStore before persisting:
// if !exists { return error }
// This prevents re-inserting the deleted node into the database
// Verify the node from UpdateNode is valid but node is gone from store
assert.True(t, updatedNode.Valid(), "UpdateNode returned a valid node view")
_, stillExists := store.GetNode(updatedNode.ID())
assert.False(t, stillExists, "but node should be deleted from NodeStore")
// This is the critical test: persistNodeToDB must check NodeStore
// and refuse to persist if the node doesn't exist anymore
// The actual persistNodeToDB implementation does:
// _, exists := s.nodeStore.GetNode(node.ID())
// if !exists { return error }
}