policy: fix autogroup:self propagation and optimize cache invalidation (#2807)

This commit is contained in:
Kristoffer Dalby
2025-10-23 17:57:41 +02:00
committed by GitHub
parent 66826232ff
commit 2bf1200483
32 changed files with 3318 additions and 1770 deletions

View File

@@ -20,9 +20,10 @@ const (
)
const (
put = 1
del = 2
update = 3
put = 1
del = 2
update = 3
rebuildPeerMaps = 4
)
const prometheusNamespace = "headscale"
@@ -142,6 +143,8 @@ type work struct {
updateFn UpdateNodeFunc
result chan struct{}
nodeResult chan types.NodeView // Channel to return the resulting node after batch application
// For rebuildPeerMaps operation
rebuildResult chan struct{}
}
// PutNode adds or updates a node in the store.
@@ -298,6 +301,9 @@ func (s *NodeStore) applyBatch(batch []work) {
// Track which work items need node results
nodeResultRequests := make(map[types.NodeID][]*work)
// Track rebuildPeerMaps operations
var rebuildOps []*work
for i := range batch {
w := &batch[i]
switch w.op {
@@ -321,6 +327,10 @@ func (s *NodeStore) applyBatch(batch []work) {
if w.nodeResult != nil {
nodeResultRequests[w.nodeID] = append(nodeResultRequests[w.nodeID], w)
}
case rebuildPeerMaps:
// rebuildPeerMaps doesn't modify nodes; it only forces the snapshot rebuild
// below to recalculate peer relationships using the current peersFunc.
rebuildOps = append(rebuildOps, w)
}
}
@@ -347,9 +357,16 @@ func (s *NodeStore) applyBatch(batch []work) {
}
}
// Signal completion for all work items
// Signal completion for rebuildPeerMaps operations
for _, w := range rebuildOps {
close(w.rebuildResult)
}
// Signal completion for all other work items
for _, w := range batch {
close(w.result)
if w.op != rebuildPeerMaps {
close(w.result)
}
}
}
@@ -546,6 +563,22 @@ func (s *NodeStore) ListPeers(id types.NodeID) views.Slice[types.NodeView] {
return views.SliceOf(s.data.Load().peersByNode[id])
}
// RebuildPeerMaps asks the store to recompute the peer relationship map with
// the current peersFunc and waits for that request to be processed.
//
// Call it after a policy change: peersFunc consults the PolicyManager's
// filters to decide which nodes can see each other, so without a rebuild the
// peer map keeps reflecting stale filter data until the next node add/delete.
//
// The call enqueues a rebuildPeerMaps work item on the write queue and blocks
// until the completion channel attached to that item is signalled.
func (s *NodeStore) RebuildPeerMaps() {
	done := make(chan struct{})

	// Hand the request to the writer; only rebuildResult is set because this
	// operation carries no node payload.
	s.writeQueue <- work{
		op:            rebuildPeerMaps,
		rebuildResult: done,
	}

	// Wait for the request to be acknowledged.
	<-done
}
// ListNodesByUser returns a slice of all nodes for a given user ID.
func (s *NodeStore) ListNodesByUser(uid types.UserID) views.Slice[types.NodeView] {
timer := prometheus.NewTimer(nodeStoreOperationDuration.WithLabelValues("list_by_user"))

View File

@@ -132,9 +132,10 @@ func NewState(cfg *types.Config) (*State, error) {
return nil, fmt.Errorf("init policy manager: %w", err)
}
// PolicyManager.BuildPeerMap handles both global and per-node filter complexity.
// This moves the complex peer relationship logic into the policy package where it belongs.
nodeStore := NewNodeStore(nodes, func(nodes []types.NodeView) map[types.NodeID][]types.NodeView {
_, matchers := polMan.Filter()
return policy.BuildPeerMap(views.SliceOf(nodes), matchers)
return polMan.BuildPeerMap(views.SliceOf(nodes))
})
nodeStore.Start()
@@ -225,6 +226,12 @@ func (s *State) ReloadPolicy() ([]change.ChangeSet, error) {
return nil, fmt.Errorf("setting policy: %w", err)
}
// Rebuild peer maps after policy changes because the peersFunc in NodeStore
// uses the PolicyManager's filters. Without this, nodes won't see newly allowed
// peers until a node is added/removed, causing autogroup:self policies to not
// propagate correctly when switching between policy types.
s.nodeStore.RebuildPeerMaps()
cs := []change.ChangeSet{change.PolicyChange()}
// Always call autoApproveNodes during policy reload, regardless of whether
@@ -797,6 +804,11 @@ func (s *State) FilterForNode(node types.NodeView) ([]tailcfg.FilterRule, error)
return s.polMan.FilterForNode(node)
}
// MatchersForNode returns the matchers used for peer relationship
// determination for the given node (unreduced).
//
// It is a thin pass-through: the call is delegated to the State's policy
// manager and both the matcher slice and the error are returned unchanged.
func (s *State) MatchersForNode(node types.NodeView) ([]matcher.Match, error) {
return s.polMan.MatchersForNode(node)
}
// NodeCanHaveTag checks if a node is allowed to have a specific tag.
func (s *State) NodeCanHaveTag(node types.NodeView, tag string) bool {
return s.polMan.NodeCanHaveTag(node, tag)