lint and leftover

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>
Author:    Kristoffer Dalby
Date:      2025-09-05 16:32:46 +02:00
Committer: Kristoffer Dalby
Parent:    39443184d6
Commit:    233dffc186

34 changed files with 1429 additions and 506 deletions


@@ -10,18 +10,21 @@ import (
"testing"
"time"
"github.com/google/go-cmp/cmp"
v1 "github.com/juanfont/headscale/gen/go/headscale/v1"
"github.com/juanfont/headscale/hscontrol/types"
"github.com/juanfont/headscale/hscontrol/util"
"github.com/juanfont/headscale/integration/hsic"
"github.com/juanfont/headscale/integration/integrationutil"
"github.com/juanfont/headscale/integration/tsic"
"github.com/rs/zerolog/log"
"github.com/samber/lo"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/exp/maps"
"golang.org/x/exp/slices"
"golang.org/x/sync/errgroup"
"tailscale.com/client/tailscale/apitype"
"tailscale.com/tailcfg"
"tailscale.com/types/key"
)
@@ -59,13 +62,15 @@ func TestPingAllByIP(t *testing.T) {
hs, err := scenario.Headscale()
require.NoError(t, err)
assert.EventuallyWithT(t, func(ct *assert.CollectT) {
all, err := hs.GetAllMapReponses()
assert.NoError(ct, err)
onlineMap := buildExpectedOnlineMap(all)
assertExpectedOnlineMapAllOnline(ct, len(allClients)-1, onlineMap)
}, 30*time.Second, 2*time.Second)
// Extract node IDs for validation
expectedNodes := make([]types.NodeID, 0, len(allClients))
for _, client := range allClients {
status := client.MustStatus()
nodeID, err := strconv.ParseUint(string(status.Self.ID), 10, 64)
require.NoError(t, err, "failed to parse node ID")
expectedNodes = append(expectedNodes, types.NodeID(nodeID))
}
requireAllClientsOnline(t, hs, expectedNodes, true, "all clients should be online across all systems", 30*time.Second)
// assertClientsState(t, allClients)
@@ -73,6 +78,14 @@ func TestPingAllByIP(t *testing.T) {
return x.String()
})
// Get headscale instance for batcher debug check
headscale, err := scenario.Headscale()
assertNoErr(t, err)
// Test our DebugBatcher functionality
t.Logf("Testing DebugBatcher functionality...")
requireAllClientsOnline(t, headscale, expectedNodes, true, "all clients should be connected to the batcher", 30*time.Second)
success := pingAllHelper(t, allClients, allAddrs)
t.Logf("%d successful pings out of %d", success, len(allClients)*len(allIps))
}
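
The node-ID extraction loop above is repeated verbatim in TestPingAllByIPManyUpDown further down. A small hypothetical helper (not part of this commit, names are illustrative) could factor it out, assuming the integration package's TailscaleClient interface:

func collectExpectedNodeIDs(t *testing.T, clients []TailscaleClient) []types.NodeID {
	// Hypothetical helper, not in this commit: mirrors the MustStatus/ParseUint loop above.
	t.Helper()
	expected := make([]types.NodeID, 0, len(clients))
	for _, client := range clients {
		status := client.MustStatus()
		// status.Self.ID is a string-typed node ID; headscale node IDs are numeric.
		nodeID, err := strconv.ParseUint(string(status.Self.ID), 10, 64)
		require.NoError(t, err, "failed to parse node ID")
		expected = append(expected, types.NodeID(nodeID))
	}
	return expected
}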
@@ -962,9 +975,6 @@ func TestPingAllByIPManyUpDown(t *testing.T) {
)
assertNoErrHeadscaleEnv(t, err)
hs, err := scenario.Headscale()
require.NoError(t, err)
allClients, err := scenario.ListTailscaleClients()
assertNoErrListClients(t, err)
@@ -980,14 +990,31 @@ func TestPingAllByIPManyUpDown(t *testing.T) {
return x.String()
})
// Get headscale instance for batcher debug checks
headscale, err := scenario.Headscale()
assertNoErr(t, err)
// Initial check: all nodes should be connected to batcher
// Extract node IDs for validation
expectedNodes := make([]types.NodeID, 0, len(allClients))
for _, client := range allClients {
status := client.MustStatus()
nodeID, err := strconv.ParseUint(string(status.Self.ID), 10, 64)
assertNoErr(t, err)
expectedNodes = append(expectedNodes, types.NodeID(nodeID))
}
requireAllClientsOnline(t, headscale, expectedNodes, true, "all clients should be connected to batcher", 30*time.Second)
success := pingAllHelper(t, allClients, allAddrs)
t.Logf("%d successful pings out of %d", success, len(allClients)*len(allIps))
wg, _ := errgroup.WithContext(context.Background())
for run := range 3 {
t.Logf("Starting DownUpPing run %d at %s", run+1, time.Now().Format("2006-01-02T15-04-05.999999999"))
// Create fresh errgroup with timeout for each run
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
wg, _ := errgroup.WithContext(ctx)
for _, client := range allClients {
c := client
wg.Go(func() error {
@@ -1001,6 +1028,9 @@ func TestPingAllByIPManyUpDown(t *testing.T) {
}
t.Logf("All nodes taken down at %s", time.Now().Format("2006-01-02T15-04-05.999999999"))
// After taking down all nodes, verify all systems show nodes offline
requireAllClientsOnline(t, headscale, expectedNodes, false, fmt.Sprintf("Run %d: all nodes should be offline after Down()", run+1), 120*time.Second)
for _, client := range allClients {
c := client
wg.Go(func() error {
@@ -1014,22 +1044,22 @@ func TestPingAllByIPManyUpDown(t *testing.T) {
}
t.Logf("All nodes brought up at %s", time.Now().Format("2006-01-02T15-04-05.999999999"))
// After bringing up all nodes, verify batcher shows all reconnected
requireAllClientsOnline(t, headscale, expectedNodes, true, fmt.Sprintf("Run %d: all nodes should be reconnected after Up()", run+1), 120*time.Second)
// Wait for sync and successful pings after nodes come back up
err = scenario.WaitForTailscaleSync()
assert.NoError(t, err)
t.Logf("All nodes synced up %s", time.Now().Format("2006-01-02T15-04-05.999999999"))
assert.EventuallyWithT(t, func(ct *assert.CollectT) {
all, err := hs.GetAllMapReponses()
assert.NoError(ct, err)
onlineMap := buildExpectedOnlineMap(all)
assertExpectedOnlineMapAllOnline(ct, len(allClients)-1, onlineMap)
}, 60*time.Second, 2*time.Second)
requireAllClientsOnline(t, headscale, expectedNodes, true, fmt.Sprintf("Run %d: all systems should show nodes online after reconnection", run+1), 60*time.Second)
success := pingAllHelper(t, allClients, allAddrs)
assert.Equalf(t, len(allClients)*len(allIps), success, "%d successful pings out of %d", success, len(allClients)*len(allIps))
// Clean up context for this run
cancel()
}
}
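
The change above moves errgroup creation inside the loop: each Down/Up run gets a fresh group bound to its own timeout context, and the context is cancelled at the end of the run. A minimal sketch of that per-run pattern, with the client work reduced to a single call (illustrative, not the commit's exact code):

for run := range 3 {
	// Fresh context and errgroup per run, so one run's timeout does not leak into the next.
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	wg, _ := errgroup.WithContext(ctx)

	for _, client := range allClients {
		c := client
		wg.Go(func() error {
			return c.Down() // errors from any client surface via wg.Wait()
		})
	}

	if err := wg.Wait(); err != nil {
		t.Fatalf("run %d: taking nodes down failed: %s", run+1, err)
	}

	cancel() // release this run's context before starting the next
}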
@@ -1141,51 +1171,158 @@ func Test2118DeletingOnlineNodePanics(t *testing.T) {
assert.Equal(t, nodeList[1].GetId(), nodeListAfter[0].GetId())
}
func buildExpectedOnlineMap(all map[types.NodeID][]tailcfg.MapResponse) map[types.NodeID]map[types.NodeID]bool {
res := make(map[types.NodeID]map[types.NodeID]bool)
for nid, mrs := range all {
res[nid] = make(map[types.NodeID]bool)
for _, mr := range mrs {
for _, peer := range mr.Peers {
if peer.Online != nil {
res[nid][types.NodeID(peer.ID)] = *peer.Online
}
}
for _, peer := range mr.PeersChanged {
if peer.Online != nil {
res[nid][types.NodeID(peer.ID)] = *peer.Online
}
}
for _, peer := range mr.PeersChangedPatch {
if peer.Online != nil {
res[nid][types.NodeID(peer.NodeID)] = *peer.Online
}
}
}
}
return res
}
// NodeSystemStatus represents the online status of a node across different systems
type NodeSystemStatus struct {
Batcher bool
BatcherConnCount int
MapResponses bool
NodeStore bool
}
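
requireAllClientsOnline below compares each of these fields against the expected state; a hypothetical convenience method (not part of this commit) would express that check directly:

// matches is a hypothetical helper, not in this commit: a node is in the expected
// state only when batcher, map responses, and nodestore all agree.
func (s NodeSystemStatus) matches(expectedOnline bool) bool {
	return s.Batcher == expectedOnline &&
		s.MapResponses == expectedOnline &&
		s.NodeStore == expectedOnline
}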
func assertExpectedOnlineMapAllOnline(t *assert.CollectT, expectedPeerCount int, onlineMap map[types.NodeID]map[types.NodeID]bool) {
for nid, peers := range onlineMap {
onlineCount := 0
for _, online := range peers {
if online {
onlineCount++
// requireAllClientsOnline checks that nodes are online/offline across batcher, mapresponses, and nodestore
func requireAllClientsOnline(t *testing.T, headscale ControlServer, expectedNodes []types.NodeID, expectedOnline bool, message string, timeout time.Duration) {
t.Helper()
startTime := time.Now()
t.Logf("requireAllSystemsOnline: Starting validation at %s - %s", startTime.Format("2006-01-02T15:04:05.000"), message)
var prevReport string
require.EventuallyWithT(t, func(c *assert.CollectT) {
// Get batcher state
debugInfo, err := headscale.DebugBatcher()
assert.NoError(c, err, "Failed to get batcher debug info")
if err != nil {
return
}
// Get map responses
mapResponses, err := headscale.GetAllMapReponses()
assert.NoError(c, err, "Failed to get map responses")
if err != nil {
return
}
// Get nodestore state
nodeStore, err := headscale.DebugNodeStore()
assert.NoError(c, err, "Failed to get nodestore debug info")
if err != nil {
return
}
// Validate node counts first
expectedCount := len(expectedNodes)
assert.Equal(c, expectedCount, debugInfo.TotalNodes, "Batcher total nodes mismatch")
assert.Equal(c, expectedCount, len(nodeStore), "NodeStore total nodes mismatch")
// Check that we have map responses for expected nodes
mapResponseCount := len(mapResponses)
assert.Equal(c, expectedCount, mapResponseCount, "MapResponses total nodes mismatch")
// Build status map for each node
nodeStatus := make(map[types.NodeID]NodeSystemStatus)
// Initialize all expected nodes
for _, nodeID := range expectedNodes {
nodeStatus[nodeID] = NodeSystemStatus{}
}
// Check batcher state
for nodeIDStr, nodeInfo := range debugInfo.ConnectedNodes {
nodeID := types.MustParseNodeID(nodeIDStr)
if status, exists := nodeStatus[nodeID]; exists {
status.Batcher = nodeInfo.Connected
status.BatcherConnCount = nodeInfo.ActiveConnections
nodeStatus[nodeID] = status
}
}
assert.Equalf(t, expectedPeerCount, len(peers), "node:%d had an unexpected number of peers in online map", nid)
if expectedPeerCount != onlineCount {
var sb strings.Builder
sb.WriteString(fmt.Sprintf("Not all of node:%d peers where online:\n", nid))
for pid, online := range peers {
sb.WriteString(fmt.Sprintf("\tPeer node:%d online: %t\n", pid, online))
// Check map responses using integrationutil.BuildExpectedOnlineMap
onlineFromMaps := make(map[types.NodeID]bool)
onlineMap := integrationutil.BuildExpectedOnlineMap(mapResponses)
for nodeID := range nodeStatus {
NODE_STATUS:
for id, peerMap := range onlineMap {
if id == nodeID {
continue
}
online := peerMap[nodeID]
// If the node is offline in any map response, we consider it offline
if !online {
onlineFromMaps[nodeID] = false
continue NODE_STATUS
}
onlineFromMaps[nodeID] = true
}
sb.WriteString("timestamp: " + time.Now().Format("2006-01-02T15-04-05.999999999") + "\n")
sb.WriteString("expected all peers to be online.")
t.Errorf("%s", sb.String())
}
}
assert.Lenf(c, onlineFromMaps, expectedCount, "MapResponses missing nodes in status check")
// Update status with map response data
for nodeID, online := range onlineFromMaps {
if status, exists := nodeStatus[nodeID]; exists {
status.MapResponses = online
nodeStatus[nodeID] = status
}
}
// Check nodestore state
for nodeID, node := range nodeStore {
if status, exists := nodeStatus[nodeID]; exists {
// Check if node is online in nodestore
status.NodeStore = node.IsOnline != nil && *node.IsOnline
nodeStatus[nodeID] = status
}
}
// Verify all systems show nodes in expected state and report failures
allMatch := true
var failureReport strings.Builder
ids := types.NodeIDs(maps.Keys(nodeStatus))
slices.Sort(ids)
for _, nodeID := range ids {
status := nodeStatus[nodeID]
systemsMatch := (status.Batcher == expectedOnline) &&
(status.MapResponses == expectedOnline) &&
(status.NodeStore == expectedOnline)
if !systemsMatch {
allMatch = false
stateStr := "offline"
if expectedOnline {
stateStr = "online"
}
failureReport.WriteString(fmt.Sprintf("node:%d is not fully %s:\n", nodeID, stateStr))
failureReport.WriteString(fmt.Sprintf(" - batcher: %t\n", status.Batcher))
failureReport.WriteString(fmt.Sprintf(" - conn count: %d\n", status.BatcherConnCount))
failureReport.WriteString(fmt.Sprintf(" - mapresponses: %t (down with at least one peer)\n", status.MapResponses))
failureReport.WriteString(fmt.Sprintf(" - nodestore: %t\n", status.NodeStore))
}
}
if !allMatch {
if diff := cmp.Diff(prevReport, failureReport.String()); diff != "" {
t.Log("Diff between reports:")
t.Logf("Prev report: \n%s\n", prevReport)
t.Logf("New report: \n%s\n", failureReport.String())
t.Log("timestamp: " + time.Now().Format("2006-01-02T15-04-05.999999999") + "\n")
prevReport = failureReport.String()
}
failureReport.WriteString("timestamp: " + time.Now().Format("2006-01-02T15-04-05.999999999") + "\n")
assert.Fail(c, failureReport.String())
}
stateStr := "offline"
if expectedOnline {
stateStr = "online"
}
assert.True(c, allMatch, fmt.Sprintf("Not all nodes are %s across all systems", stateStr))
}, timeout, 2*time.Second, message)
endTime := time.Now()
duration := endTime.Sub(startTime)
t.Logf("requireAllSystemsOnline: Completed validation at %s - Duration: %v - %s", endTime.Format("2006-01-02T15:04:05.000"), duration, message)
}
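
For reference, usage of the new helper follows the calls already shown in the tests above: assert the online state after setup, then the offline state after taking clients down (timeouts mirror the ones used in this commit):

// Sketch of how the tests above drive requireAllClientsOnline.
headscale, err := scenario.Headscale()
require.NoError(t, err)

requireAllClientsOnline(t, headscale, expectedNodes, true,
	"all clients should be online across all systems", 30*time.Second)

// ... take all clients down ...

requireAllClientsOnline(t, headscale, expectedNodes, false,
	"all nodes should be offline after Down()", 120*time.Second)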