From 4728a2ba9ea664205d07a84584a86aef8caf5a1a Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Mon, 3 Nov 2025 15:29:39 +0100 Subject: [PATCH] hscontrol/state: allow expired auth keys for node re-registration Skip auth key validation for existing nodes re-registering with the same NodeKey. Pre-auth keys are only required for initial authentication. NodeKey rotation still requires a valid auth key as it is a security-sensitive operation that changes the node's cryptographic identity. Fixes #2830 --- hscontrol/auth_test.go | 293 +++++++++++++++++++++++++++++++++++ hscontrol/state/state.go | 46 +++++- integration/auth_key_test.go | 2 + integration/tailscale.go | 1 + integration/tsic/tsic.go | 33 ++++ 5 files changed, 369 insertions(+), 6 deletions(-) diff --git a/hscontrol/auth_test.go b/hscontrol/auth_test.go index 1727be1a..bf6da356 100644 --- a/hscontrol/auth_test.go +++ b/hscontrol/auth_test.go @@ -3004,3 +3004,296 @@ func createTestApp(t *testing.T) *Headscale { return app } + +// TestGitHubIssue2830_NodeRestartWithUsedPreAuthKey tests the scenario reported in +// https://github.com/juanfont/headscale/issues/2830 +// +// Scenario: +// 1. Node registers successfully with a single-use pre-auth key +// 2. Node is running fine +// 3. Node restarts (e.g., after headscale upgrade or tailscale container restart) +// 4. Node sends RegisterRequest with the same pre-auth key +// 5. BUG: Headscale rejects the request with "authkey expired" or "authkey already used" +// +// Expected behavior: +// When an existing node (identified by matching NodeKey + MachineKey) re-registers +// with a pre-auth key that it previously used, the registration should succeed. +// The node is not creating a new registration - it's re-authenticating the same device. +func TestGitHubIssue2830_NodeRestartWithUsedPreAuthKey(t *testing.T) { + t.Parallel() + + app := createTestApp(t) + + // Create user and single-use pre-auth key + user := app.state.CreateUserForTest("test-user") + pak, err := app.state.CreatePreAuthKey(types.UserID(user.ID), false, false, nil, nil) // reusable=false + require.NoError(t, err) + require.False(t, pak.Reusable, "key should be single-use for this test") + + machineKey := key.NewMachine() + nodeKey := key.NewNode() + + // STEP 1: Initial registration with pre-auth key (simulates fresh node joining) + initialReq := tailcfg.RegisterRequest{ + Auth: &tailcfg.RegisterResponseAuth{ + AuthKey: pak.Key, + }, + NodeKey: nodeKey.Public(), + Hostinfo: &tailcfg.Hostinfo{ + Hostname: "test-node", + }, + Expiry: time.Now().Add(24 * time.Hour), + } + + t.Log("Step 1: Initial registration with pre-auth key") + initialResp, err := app.handleRegister(context.Background(), initialReq, machineKey.Public()) + require.NoError(t, err, "initial registration should succeed") + require.NotNil(t, initialResp) + assert.True(t, initialResp.MachineAuthorized, "node should be authorized") + assert.False(t, initialResp.NodeKeyExpired, "node key should not be expired") + + // Verify node was created in database + node, found := app.state.GetNodeByNodeKey(nodeKey.Public()) + require.True(t, found, "node should exist after initial registration") + assert.Equal(t, "test-node", node.Hostname()) + assert.Equal(t, nodeKey.Public(), node.NodeKey()) + assert.Equal(t, machineKey.Public(), node.MachineKey()) + + // Verify pre-auth key is now marked as used + usedPak, err := app.state.GetPreAuthKey(pak.Key) + require.NoError(t, err) + assert.True(t, usedPak.Used, "pre-auth key should be marked as used after initial registration") + + 
// STEP 2: Simulate node restart - node sends RegisterRequest again with same pre-auth key
+	// This happens when:
+	// - Tailscale container restarts
+	// - Tailscaled service restarts
+	// - System reboots
+	// The Tailscale client persists the pre-auth key in its state and sends it on every registration
+	t.Log("Step 2: Node restart - re-registration with same (now used) pre-auth key")
+	restartReq := tailcfg.RegisterRequest{
+		Auth: &tailcfg.RegisterResponseAuth{
+			AuthKey: pak.Key, // Same key, now marked as Used=true
+		},
+		NodeKey: nodeKey.Public(), // Same node key
+		Hostinfo: &tailcfg.Hostinfo{
+			Hostname: "test-node",
+		},
+		Expiry: time.Now().Add(24 * time.Hour),
+	}
+
+	// Without the fix, this fails with "authkey already used" or "authkey expired".
+	// With the fix, it succeeds because the same node is simply re-registering.
+	restartResp, err := app.handleRegister(context.Background(), restartReq, machineKey.Public())
+
+	// This assertion fails on v0.27.0, before the fix in this patch.
+	assert.NoError(t, err, "existing node re-registration with its own used pre-auth key should succeed")
+	if err != nil {
+		t.Logf("Error received (pre-fix behavior): %v", err)
+		t.Logf("Expected behavior: Node should be able to re-register with the same pre-auth key it used initially")
+		return // Stop here to keep the failure output focused
+	}
+
+	require.NotNil(t, restartResp)
+	assert.True(t, restartResp.MachineAuthorized, "node should remain authorized after restart")
+	assert.False(t, restartResp.NodeKeyExpired, "node key should not be expired after restart")
+
+	// Verify it's the same node (not a duplicate)
+	nodeAfterRestart, found := app.state.GetNodeByNodeKey(nodeKey.Public())
+	require.True(t, found, "node should still exist after restart")
+	assert.Equal(t, node.ID(), nodeAfterRestart.ID(), "should be the same node, not a new one")
+	assert.Equal(t, "test-node", nodeAfterRestart.Hostname())
+}
+
+// TestNodeReregistrationWithReusablePreAuthKey tests that reusable keys work correctly
+// for node re-registration.
+func TestNodeReregistrationWithReusablePreAuthKey(t *testing.T) {
+	t.Parallel()
+
+	app := createTestApp(t)
+
+	user := app.state.CreateUserForTest("test-user")
+	pak, err := app.state.CreatePreAuthKey(types.UserID(user.ID), true, false, nil, nil) // reusable=true
+	require.NoError(t, err)
+	require.True(t, pak.Reusable)
+
+	machineKey := key.NewMachine()
+	nodeKey := key.NewNode()
+
+	// Initial registration
+	initialReq := tailcfg.RegisterRequest{
+		Auth: &tailcfg.RegisterResponseAuth{
+			AuthKey: pak.Key,
+		},
+		NodeKey: nodeKey.Public(),
+		Hostinfo: &tailcfg.Hostinfo{
+			Hostname: "reusable-test-node",
+		},
+		Expiry: time.Now().Add(24 * time.Hour),
+	}
+
+	initialResp, err := app.handleRegister(context.Background(), initialReq, machineKey.Public())
+	require.NoError(t, err)
+	require.NotNil(t, initialResp)
+	assert.True(t, initialResp.MachineAuthorized)
+
+	// Node restart - re-registration with reusable key
+	restartReq := tailcfg.RegisterRequest{
+		Auth: &tailcfg.RegisterResponseAuth{
+			AuthKey: pak.Key, // Reusable key
+		},
+		NodeKey: nodeKey.Public(),
+		Hostinfo: &tailcfg.Hostinfo{
+			Hostname: "reusable-test-node",
+		},
+		Expiry: time.Now().Add(24 * time.Hour),
+	}
+
+	restartResp, err := app.handleRegister(context.Background(), restartReq, machineKey.Public())
+	require.NoError(t, err, "reusable key should allow re-registration")
+	require.NotNil(t, restartResp)
+	assert.True(t, restartResp.MachineAuthorized)
+	assert.False(t, restartResp.NodeKeyExpired)
+}
+
+// TestNodeReregistrationWithExpiredPreAuthKey tests that truly expired keys
+// are still rejected when the node has no existing registration.
+func TestNodeReregistrationWithExpiredPreAuthKey(t *testing.T) {
+	t.Parallel()
+
+	app := createTestApp(t)
+
+	user := app.state.CreateUserForTest("test-user")
+	expiry := time.Now().Add(-1 * time.Hour) // Already expired
+	pak, err := app.state.CreatePreAuthKey(types.UserID(user.ID), true, false, &expiry, nil)
+	require.NoError(t, err)
+
+	machineKey := key.NewMachine()
+	nodeKey := key.NewNode()
+
+	// Try to register with expired key
+	req := tailcfg.RegisterRequest{
+		Auth: &tailcfg.RegisterResponseAuth{
+			AuthKey: pak.Key,
+		},
+		NodeKey: nodeKey.Public(),
+		Hostinfo: &tailcfg.Hostinfo{
+			Hostname: "expired-key-node",
+		},
+		Expiry: time.Now().Add(24 * time.Hour),
+	}
+
+	_, err = app.handleRegister(context.Background(), req, machineKey.Public())
+	assert.Error(t, err, "expired pre-auth key should be rejected")
+	assert.Contains(t, err.Error(), "authkey expired", "error should mention key expiration")
+}
+// TestGitHubIssue2830_ExistingNodeCanReregisterWithUsedPreAuthKey tests that an existing node
+// can re-register using a pre-auth key that's already marked as Used=true, as long as:
+// 1. The node is re-registering with the same MachineKey it originally used
+// 2. The node is using the same pre-auth key it was originally registered with (AuthKeyID matches)
+//
+// This is the fix for GitHub issue #2830: https://github.com/juanfont/headscale/issues/2830
+//
+// Background: When Docker/Kubernetes containers restart, they keep their persistent state
+// (including the MachineKey), but container entrypoints unconditionally run:
+//	tailscale up --authkey=$TS_AUTHKEY
+//
+// This caused nodes to be rejected after restart because the pre-auth key was already
+// marked as Used=true from the initial registration. The fix allows re-registration of
+// existing nodes with their own used keys.
+func TestGitHubIssue2830_ExistingNodeCanReregisterWithUsedPreAuthKey(t *testing.T) { + app := createTestApp(t) + + // Create a user + user := app.state.CreateUserForTest("testuser") + + // Create a SINGLE-USE pre-auth key (reusable=false) + // This is the type of key that triggers the bug in issue #2830 + preAuthKey, err := app.state.CreatePreAuthKey(types.UserID(user.ID), false, false, nil, nil) + require.NoError(t, err) + require.False(t, preAuthKey.Reusable, "Pre-auth key must be single-use to test issue #2830") + require.False(t, preAuthKey.Used, "Pre-auth key should not be used yet") + + // Generate node keys for the client + machineKey := key.NewMachine() + nodeKey := key.NewNode() + + // Step 1: Initial registration with the pre-auth key + // This simulates the first time the container starts and runs 'tailscale up --authkey=...' + initialReq := tailcfg.RegisterRequest{ + Auth: &tailcfg.RegisterResponseAuth{ + AuthKey: preAuthKey.Key, + }, + NodeKey: nodeKey.Public(), + Hostinfo: &tailcfg.Hostinfo{ + Hostname: "issue-2830-test-node", + }, + Expiry: time.Now().Add(24 * time.Hour), + } + + initialResp, err := app.handleRegisterWithAuthKey(initialReq, machineKey.Public()) + require.NoError(t, err, "Initial registration should succeed") + require.True(t, initialResp.MachineAuthorized, "Node should be authorized after initial registration") + require.NotNil(t, initialResp.User, "User should be set in response") + require.Equal(t, "testuser", initialResp.User.DisplayName, "User should match the pre-auth key's user") + + // Verify the pre-auth key is now marked as Used + updatedKey, err := app.state.GetPreAuthKey(preAuthKey.Key) + require.NoError(t, err) + require.True(t, updatedKey.Used, "Pre-auth key should be marked as Used after initial registration") + + // Step 2: Container restart scenario + // The container keeps its MachineKey (persistent state), but the entrypoint script + // unconditionally runs 'tailscale up --authkey=$TS_AUTHKEY' again + // + // WITHOUT THE FIX: This would fail with "authkey already used" error + // WITH THE FIX: This succeeds because it's the same node re-registering with its own key + + // Simulate sending the same RegisterRequest again (same MachineKey, same AuthKey) + // This is exactly what happens when a container restarts + reregisterReq := tailcfg.RegisterRequest{ + Auth: &tailcfg.RegisterResponseAuth{ + AuthKey: preAuthKey.Key, // Same key, now marked as Used=true + }, + NodeKey: nodeKey.Public(), // Same NodeKey + Hostinfo: &tailcfg.Hostinfo{ + Hostname: "issue-2830-test-node", + }, + Expiry: time.Now().Add(24 * time.Hour), + } + + reregisterResp, err := app.handleRegisterWithAuthKey(reregisterReq, machineKey.Public()) // Same MachineKey + require.NoError(t, err, "Re-registration with same MachineKey and used pre-auth key should succeed (fixes #2830)") + require.True(t, reregisterResp.MachineAuthorized, "Node should remain authorized after re-registration") + require.NotNil(t, reregisterResp.User, "User should be set in re-registration response") + require.Equal(t, "testuser", reregisterResp.User.DisplayName, "User should remain the same") + + // Verify that only ONE node was created (not a duplicate) + nodes := app.state.ListNodesByUser(types.UserID(user.ID)) + require.Equal(t, 1, nodes.Len(), "Should have exactly one node (no duplicates created)") + require.Equal(t, "issue-2830-test-node", nodes.At(0).Hostname(), "Node hostname should match") + + // Step 3: Verify that a DIFFERENT machine cannot use the same used key + // This ensures we didn't 
break the security model - only the original node can re-register + differentMachineKey := key.NewMachine() + differentNodeKey := key.NewNode() + + attackReq := tailcfg.RegisterRequest{ + Auth: &tailcfg.RegisterResponseAuth{ + AuthKey: preAuthKey.Key, // Try to use the same key + }, + NodeKey: differentNodeKey.Public(), + Hostinfo: &tailcfg.Hostinfo{ + Hostname: "attacker-node", + }, + Expiry: time.Now().Add(24 * time.Hour), + } + + _, err = app.handleRegisterWithAuthKey(attackReq, differentMachineKey.Public()) + require.Error(t, err, "Different machine should NOT be able to use the same used pre-auth key") + require.Contains(t, err.Error(), "already used", "Error should indicate key is already used") + + // Verify still only one node (the original one) + nodesAfterAttack := app.state.ListNodesByUser(types.UserID(user.ID)) + require.Equal(t, 1, nodesAfterAttack.Len(), "Should still have exactly one node (attack prevented)") +} diff --git a/hscontrol/state/state.go b/hscontrol/state/state.go index c340adc2..6e1d08e0 100644 --- a/hscontrol/state/state.go +++ b/hscontrol/state/state.go @@ -1294,9 +1294,46 @@ func (s *State) HandleNodeFromPreAuthKey( return types.NodeView{}, change.EmptySet, err } - err = pak.Validate() - if err != nil { - return types.NodeView{}, change.EmptySet, err + // Check if node exists with same machine key before validating the key. + // For #2830: container restarts send the same pre-auth key which may be used/expired. + // Skip validation for existing nodes re-registering with the same NodeKey, as the + // key was only needed for initial authentication. NodeKey rotation requires validation. + existingNodeSameUser, existsSameUser := s.nodeStore.GetNodeByMachineKey(machineKey, types.UserID(pak.User.ID)) + + // Skip validation only if both the AuthKeyID and NodeKey match (not a rotation). + isExistingNodeReregistering := existsSameUser && existingNodeSameUser.Valid() && + existingNodeSameUser.AuthKey().Valid() && + existingNodeSameUser.AuthKeyID().Valid() && + existingNodeSameUser.AuthKeyID().Get() == pak.ID + + // Check if this is a NodeKey rotation (different NodeKey) + isNodeKeyRotation := existsSameUser && existingNodeSameUser.Valid() && + existingNodeSameUser.NodeKey() != regReq.NodeKey + + if isExistingNodeReregistering && !isNodeKeyRotation { + // Existing node re-registering with same NodeKey: skip validation. + // Pre-auth keys are only needed for initial authentication. Critical for + // containers that run "tailscale up --authkey=KEY" on every restart. + log.Debug(). + Caller(). + Uint64("node.id", existingNodeSameUser.ID().Uint64()). + Str("node.name", existingNodeSameUser.Hostname()). + Str("machine.key", machineKey.ShortString()). + Str("node.key.existing", existingNodeSameUser.NodeKey().ShortString()). + Str("node.key.request", regReq.NodeKey.ShortString()). + Uint64("authkey.id", pak.ID). + Bool("authkey.used", pak.Used). + Bool("authkey.expired", pak.Expiration != nil && pak.Expiration.Before(time.Now())). + Bool("authkey.reusable", pak.Reusable). + Bool("nodekey.rotation", isNodeKeyRotation). + Msg("Existing node re-registering with same NodeKey and auth key, skipping validation") + + } else { + // New node or NodeKey rotation: require valid auth key. 
+ err = pak.Validate() + if err != nil { + return types.NodeView{}, change.EmptySet, err + } } // Ensure we have a valid hostname - handle nil/empty cases @@ -1328,9 +1365,6 @@ func (s *State) HandleNodeFromPreAuthKey( var finalNode types.NodeView - // Check if node already exists with same machine key for this user - existingNodeSameUser, existsSameUser := s.nodeStore.GetNodeByMachineKey(machineKey, types.UserID(pak.User.ID)) - // If this node exists for this user, update the node in place. if existsSameUser && existingNodeSameUser.Valid() { log.Trace(). diff --git a/integration/auth_key_test.go b/integration/auth_key_test.go index c6a4f4cf..75106dc5 100644 --- a/integration/auth_key_test.go +++ b/integration/auth_key_test.go @@ -223,6 +223,7 @@ func TestAuthKeyLogoutAndReloginNewUser(t *testing.T) { scenario, err := NewScenario(spec) require.NoError(t, err) + defer scenario.ShutdownAssertNoPanics(t) err = scenario.CreateHeadscaleEnv([]tsic.Option{}, @@ -454,3 +455,4 @@ func TestAuthKeyLogoutAndReloginSameUserExpiredKey(t *testing.T) { }) } } + diff --git a/integration/tailscale.go b/integration/tailscale.go index 414d08bc..f397133e 100644 --- a/integration/tailscale.go +++ b/integration/tailscale.go @@ -29,6 +29,7 @@ type TailscaleClient interface { Login(loginServer, authKey string) error LoginWithURL(loginServer string) (*url.URL, error) Logout() error + Restart() error Up() error Down() error IPs() ([]netip.Addr, error) diff --git a/integration/tsic/tsic.go b/integration/tsic/tsic.go index f6d8baef..462c3ea3 100644 --- a/integration/tsic/tsic.go +++ b/integration/tsic/tsic.go @@ -555,6 +555,39 @@ func (t *TailscaleInContainer) Logout() error { return t.waitForBackendState("NeedsLogin", integrationutil.PeerSyncTimeout()) } +// Restart restarts the Tailscale container using Docker API. +// This simulates a container restart (e.g., docker restart or Kubernetes pod restart). +// The container's entrypoint will re-execute, which typically includes running +// "tailscale up" with any auth keys stored in environment variables. +func (t *TailscaleInContainer) Restart() error { + if t.container == nil { + return fmt.Errorf("container not initialized") + } + + // Use Docker API to restart the container + err := t.pool.Client.RestartContainer(t.container.Container.ID, 30) + if err != nil { + return fmt.Errorf("failed to restart container %s: %w", t.hostname, err) + } + + // Wait for the container to be back up and tailscaled to be ready + // We use exponential backoff to poll until we can successfully execute a command + _, err = backoff.Retry(context.Background(), func() (struct{}, error) { + // Try to execute a simple command to verify the container is responsive + _, _, err := t.Execute([]string{"tailscale", "version"}, dockertestutil.ExecuteCommandTimeout(5*time.Second)) + if err != nil { + return struct{}{}, fmt.Errorf("container not ready: %w", err) + } + return struct{}{}, nil + }, backoff.WithBackOff(backoff.NewExponentialBackOff()), backoff.WithMaxElapsedTime(30*time.Second)) + + if err != nil { + return fmt.Errorf("timeout waiting for container %s to restart and become ready: %w", t.hostname, err) + } + + return nil +} + // Helper that runs `tailscale up` with no arguments. func (t *TailscaleInContainer) Up() error { command := []string{