mapper: send change instead of full update (#2775)

Author: Kristoffer Dalby
Date: 2025-09-17 14:23:21 +02:00
Committed by: GitHub
Parent: 4de56c40d8
Commit: ed3a9c8d6d
8 changed files with 96 additions and 69 deletions


@@ -88,16 +88,9 @@ func generateMapResponse(nodeID types.NodeID, version tailcfg.CapabilityVersion,
// TODO(kradalby): This can potentially be a peer update of the old and new subnet router.
mapResp, err = mapper.fullMapResponse(nodeID, version)
} else {
// CRITICAL FIX: Read actual online status from NodeStore when available,
// fall back to deriving from change type for unit tests or when NodeStore is empty
var onlineStatus bool
if node, found := mapper.state.GetNodeByID(c.NodeID); found && node.IsOnline().Valid() {
// Use actual NodeStore status when available (production case)
onlineStatus = node.IsOnline().Get()
} else {
// Fall back to deriving from change type (unit test case or initial setup)
onlineStatus = c.Change == change.NodeCameOnline
}
// Trust the change type for online/offline status to avoid race conditions
// between NodeStore updates and change processing
onlineStatus := c.Change == change.NodeCameOnline
mapResp, err = mapper.peerChangedPatchResponse(nodeID, []*tailcfg.PeerChange{
{
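The hunk above replaces the NodeStore lookup and its fallback with a flag derived purely from the change type, so the patch can never disagree with the event that produced it. A minimal sketch of that idea, using hypothetical stand-in types rather than the headscale API:

```go
package main

import "fmt"

// Illustrative stand-ins only; not the headscale change package.
type ChangeType int

const (
	NodeCameOnline ChangeType = iota
	NodeWentOffline
)

// PeerChange mirrors the shape of a patch entry: only the fields that
// changed are set, here just the online flag.
type PeerChange struct {
	NodeID uint64
	Online *bool
}

// onlinePatch derives the online flag from the change type alone, so the
// patch cannot race against a store that has not been updated yet.
func onlinePatch(nodeID uint64, c ChangeType) PeerChange {
	online := c == NodeCameOnline
	return PeerChange{NodeID: nodeID, Online: &online}
}

func main() {
	p := onlinePatch(42, NodeWentOffline)
	fmt.Printf("node %d online=%v\n", p.NodeID, *p.Online)
}
```

The trade-off is that the patch reflects the event rather than the store's latest state, which is exactly what sidesteps the race mentioned in the new comment.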
@@ -108,11 +101,33 @@ func generateMapResponse(nodeID types.NodeID, version tailcfg.CapabilityVersion,
}
case change.NodeNewOrUpdate:
mapResp, err = mapper.fullMapResponse(nodeID, version)
// If the node is the one being updated, we send a self update that preserves peer information
// to ensure the node sees changes to its own properties (e.g., hostname/DNS name changes)
// without losing its view of peer status during rapid reconnection cycles
if c.IsSelfUpdate(nodeID) {
mapResp, err = mapper.selfMapResponse(nodeID, version)
} else {
mapResp, err = mapper.peerChangeResponse(nodeID, version, c.NodeID)
}
case change.NodeRemove:
mapResp, err = mapper.peerRemovedResponse(nodeID, c.NodeID)
case change.NodeKeyExpiry:
// If the node is the one whose key is expiring, we send a "full" self update,
// as nodes appear to ignore patch updates about themselves.
if c.IsSelfUpdate(nodeID) {
mapResp, err = mapper.selfMapResponse(nodeID, version)
// mapResp, err = mapper.fullMapResponse(nodeID, version)
} else {
mapResp, err = mapper.peerChangedPatchResponse(nodeID, []*tailcfg.PeerChange{
{
NodeID: c.NodeID.NodeID(),
KeyExpiry: c.NodeExpiry,
},
})
}
default:
// The following change types always fall through to this default:
// change.Full, change.Policy

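Taken together, the hunks above route each change to the smallest response that covers it: the affected node gets a self update, other nodes get a peer change or a key-expiry patch, removals get a peer-removed response, and everything else falls back to a full map. A rough sketch of that dispatch, with hypothetical stand-ins for the change types and the mapper's responses:

```go
package main

import "fmt"

// Hypothetical stand-ins; not the headscale change package or mapper.
type ChangeType int

const (
	NodeNewOrUpdate ChangeType = iota
	NodeKeyExpiry
	NodeRemove
)

type Change struct {
	Type   ChangeType
	NodeID uint64
}

// IsSelfUpdate reports whether the change concerns the node the response
// is being built for.
func (c Change) IsSelfUpdate(recipient uint64) bool { return c.NodeID == recipient }

// describeResponse models the routing decision from the hunks above.
func describeResponse(recipient uint64, c Change) string {
	switch c.Type {
	case NodeNewOrUpdate:
		if c.IsSelfUpdate(recipient) {
			return "self map response"
		}
		return "peer change response"
	case NodeKeyExpiry:
		if c.IsSelfUpdate(recipient) {
			return "self map response"
		}
		return "peer patch carrying the new key expiry"
	case NodeRemove:
		return "peer removed response"
	default:
		// Full, policy and similar changes fall back to a full map.
		return "full map response"
	}
}

func main() {
	fmt.Println(describeResponse(1, Change{Type: NodeNewOrUpdate, NodeID: 1}))
	fmt.Println(describeResponse(1, Change{Type: NodeKeyExpiry, NodeID: 2}))
}
```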

@@ -1028,7 +1028,9 @@ func TestBatcherWorkQueueBatching(t *testing.T) {
// Add multiple changes rapidly to test batching
batcher.AddWork(change.DERPSet)
batcher.AddWork(change.KeyExpiry(testNodes[1].n.ID))
// Use a valid expiry time for testing since test nodes don't have expiry set
testExpiry := time.Now().Add(24 * time.Hour)
batcher.AddWork(change.KeyExpiry(testNodes[1].n.ID, testExpiry))
batcher.AddWork(change.DERPSet)
batcher.AddWork(change.NodeAdded(testNodes[1].n.ID))
batcher.AddWork(change.DERPSet)
@@ -1278,7 +1280,9 @@ func TestBatcherWorkerChannelSafety(t *testing.T) {
// Add node-specific work occasionally
if i%10 == 0 {
batcher.AddWork(change.KeyExpiry(testNode.n.ID))
// Use a valid expiry time for testing since test nodes don't have expiry set
testExpiry := time.Now().Add(24 * time.Hour)
batcher.AddWork(change.KeyExpiry(testNode.n.ID, testExpiry))
}
// Rapid removal creates race between worker and removal
@@ -1493,7 +1497,9 @@ func TestBatcherConcurrentClients(t *testing.T) {
if i%7 == 0 && len(allNodes) > 0 {
// Node-specific changes using real nodes
node := allNodes[i%len(allNodes)]
batcher.AddWork(change.KeyExpiry(node.n.ID))
// Use a valid expiry time for testing since test nodes don't have expiry set
testExpiry := time.Now().Add(24 * time.Hour)
batcher.AddWork(change.KeyExpiry(node.n.ID, testExpiry))
}
// Small delay to allow some batching

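The test updates reflect that change.KeyExpiry now takes the expiry time explicitly instead of reading it off the test node, which has none set. A small sketch of what such a constructor might look like; ChangeSet and keyExpiry here are hypothetical stand-ins, not the headscale change package:

```go
package main

import (
	"fmt"
	"time"
)

// Hypothetical stand-ins; not the headscale change package.
type NodeID uint64

// ChangeSet models the payload handed to the batcher for a key expiry.
type ChangeSet struct {
	NodeID     NodeID
	NodeExpiry *time.Time
}

// keyExpiry mirrors the shape of a constructor that takes the expiry
// explicitly, so tests can supply a valid time even when the fixture
// node has no expiry of its own.
func keyExpiry(id NodeID, expiry time.Time) ChangeSet {
	return ChangeSet{NodeID: id, NodeExpiry: &expiry}
}

func main() {
	testExpiry := time.Now().Add(24 * time.Hour)
	cs := keyExpiry(1, testExpiry)
	fmt.Printf("node %d expires at %s\n", cs.NodeID, cs.NodeExpiry.Format(time.RFC3339))
}
```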

@@ -28,6 +28,7 @@ type debugType string
const (
fullResponseDebug debugType = "full"
selfResponseDebug debugType = "self"
patchResponseDebug debugType = "patch"
removeResponseDebug debugType = "remove"
changeResponseDebug debugType = "change"
@@ -68,24 +69,17 @@ func (b *MapResponseBuilder) WithCapabilityVersion(capVer tailcfg.CapabilityVers
// WithSelfNode adds the requesting node to the response.
func (b *MapResponseBuilder) WithSelfNode() *MapResponseBuilder {
nodeView, ok := b.mapper.state.GetNodeByID(b.nodeID)
nv, ok := b.mapper.state.GetNodeByID(b.nodeID)
if !ok {
b.addError(errors.New("node not found"))
return b
}
// Always use batcher's view of online status for self node
// The batcher respects grace periods for logout scenarios
node := nodeView.AsStruct()
// if b.mapper.batcher != nil {
// node.IsOnline = ptr.To(b.mapper.batcher.IsConnected(b.nodeID))
// }
_, matchers := b.mapper.state.Filter()
tailnode, err := tailNode(
node.View(), b.capVer, b.mapper.state,
nv, b.capVer, b.mapper.state,
func(id types.NodeID) []netip.Prefix {
return policy.ReduceRoutes(node.View(), b.mapper.state.GetNodePrimaryRoutes(id), matchers)
return policy.ReduceRoutes(nv, b.mapper.state.GetNodePrimaryRoutes(id), matchers)
},
b.mapper.cfg)
if err != nil {

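WithSelfNode above now passes the node view (nv) straight into tailNode instead of unpacking it with AsStruct and re-wrapping it, and it keeps the builder's error-accumulation style: addError records the failure and the chain continues until Build. A minimal sketch of that builder pattern, with hypothetical types rather than the real MapResponseBuilder:

```go
package main

import (
	"errors"
	"fmt"
)

// Hypothetical stand-ins; not the real MapResponseBuilder.
type Response struct {
	SelfName string
}

type Builder struct {
	resp Response
	errs []error
}

// addError records a failure without breaking the fluent chain; Build
// surfaces it at the end.
func (b *Builder) addError(err error) { b.errs = append(b.errs, err) }

// WithSelfNode fills in the requesting node, or records an error if it
// cannot be found.
func (b *Builder) WithSelfNode(name string) *Builder {
	if name == "" {
		b.addError(errors.New("node not found"))
		return b
	}
	b.resp.SelfName = name
	return b
}

// Build returns the accumulated response, or the recorded errors joined.
func (b *Builder) Build() (*Response, error) {
	if len(b.errs) > 0 {
		return nil, errors.Join(b.errs...)
	}
	return &b.resp, nil
}

func main() {
	resp, err := new(Builder).WithSelfNode("node-1").Build()
	fmt.Println(resp, err)
}
```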

@@ -158,6 +158,26 @@ func (m *mapper) fullMapResponse(
Build()
}
func (m *mapper) selfMapResponse(
nodeID types.NodeID,
capVer tailcfg.CapabilityVersion,
) (*tailcfg.MapResponse, error) {
ma, err := m.NewMapResponseBuilder(nodeID).
WithDebugType(selfResponseDebug).
WithCapabilityVersion(capVer).
WithSelfNode().
Build()
if err != nil {
return nil, err
}
// Set the peers to nil to ensure the node does not think
// it's getting a new list.
ma.Peers = nil
return ma, err
}
func (m *mapper) derpMapResponse(
nodeID types.NodeID,
) (*tailcfg.MapResponse, error) {
@@ -190,7 +210,6 @@ func (m *mapper) peerChangeResponse(
return m.NewMapResponseBuilder(nodeID).
WithDebugType(changeResponseDebug).
WithCapabilityVersion(capVer).
WithSelfNode().
WithUserProfiles(peers).
WithPeerChanges(peers).
Build()
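selfMapResponse clears Peers before returning so the client does not read the response as a new, complete peer list. A stand-in sketch of that idea; the MapResponse here is a hypothetical model of the wire semantics described in the comment above, not tailcfg itself:

```go
package main

import "fmt"

// Hypothetical model of the wire semantics, not tailcfg itself.
type Node struct{ Name string }

// In the format this models, a non-empty Peers slice is read as the
// complete peer list, while nil means "no peer information here".
type MapResponse struct {
	Node  *Node
	Peers []*Node
}

// selfOnly builds a response that updates the node's own record and
// deliberately carries no peer information.
func selfOnly(self *Node) *MapResponse {
	resp := &MapResponse{Node: self, Peers: []*Node{}}
	// Clearing Peers keeps the client from treating this as a new,
	// empty peer list and dropping everything it already knows.
	resp.Peers = nil
	return resp
}

func main() {
	r := selfOnly(&Node{Name: "node-1"})
	fmt.Printf("self=%s peers=%v\n", r.Node.Name, r.Peers)
}
```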