stability and race conditions in auth and node store (#2781)

This PR addresses some consistency issues that was introduced or discovered with the nodestore.

nodestore:
Now returns the node that is being put or updated when it is finished. This closes a race condition where when we read it back, we do not necessarily get the node with the given change and it ensures we get all the other updates from that batch write.

auth:
Authentication paths have been unified and simplified. It removes a lot of bad branches and ensures we only do the minimal work.
A comprehensive auth test set has been created so we do not have to run integration tests to validate auth and it has allowed us to generate test cases for all the branches we currently know of.

integration:
added a lot more tooling and checks to validate that nodes reach the expected state when they come up and down. Standardised between the different auth models. A lot of this is to support or detect issues in the changes to nodestore (races) and auth (inconsistencies after login and reaching correct state)

This PR was assisted, particularly tests, by claude code.
This commit is contained in:
Kristoffer Dalby
2025-10-16 12:17:43 +02:00
committed by GitHub
parent 881a6b9227
commit fddc7117e4
34 changed files with 7408 additions and 1876 deletions

View File

@@ -11,6 +11,7 @@ import (
"github.com/juanfont/headscale/hscontrol/types"
"github.com/juanfont/headscale/hscontrol/types/change"
"github.com/juanfont/headscale/hscontrol/util"
"github.com/rs/zerolog/log"
"gorm.io/gorm"
"tailscale.com/tailcfg"
@@ -25,26 +26,84 @@ type AuthProvider interface {
func (h *Headscale) handleRegister(
ctx context.Context,
regReq tailcfg.RegisterRequest,
req tailcfg.RegisterRequest,
machineKey key.MachinePublic,
) (*tailcfg.RegisterResponse, error) {
node, ok := h.state.GetNodeByNodeKey(regReq.NodeKey)
// Check for logout/expiry FIRST, before checking auth key.
// Tailscale clients may send logout requests with BOTH a past expiry AND an auth key.
// A past expiry takes precedence - it's a logout regardless of other fields.
if !req.Expiry.IsZero() && req.Expiry.Before(time.Now()) {
log.Debug().
Str("node.key", req.NodeKey.ShortString()).
Time("expiry", req.Expiry).
Bool("has_auth", req.Auth != nil).
Msg("Detected logout attempt with past expiry")
if ok {
resp, err := h.handleExistingNode(node.AsStruct(), regReq, machineKey)
if err != nil {
return nil, fmt.Errorf("handling existing node: %w", err)
// This is a logout attempt (expiry in the past)
if node, ok := h.state.GetNodeByNodeKey(req.NodeKey); ok {
log.Debug().
Uint64("node.id", node.ID().Uint64()).
Str("node.name", node.Hostname()).
Bool("is_ephemeral", node.IsEphemeral()).
Bool("has_authkey", node.AuthKey().Valid()).
Msg("Found existing node for logout, calling handleLogout")
resp, err := h.handleLogout(node, req, machineKey)
if err != nil {
return nil, fmt.Errorf("handling logout: %w", err)
}
if resp != nil {
return resp, nil
}
} else {
log.Warn().
Str("node.key", req.NodeKey.ShortString()).
Msg("Logout attempt but node not found in NodeStore")
}
return resp, nil
}
if regReq.Followup != "" {
return h.waitForFollowup(ctx, regReq, machineKey)
// If the register request does not contain a Auth struct, it means we are logging
// out an existing node (legacy logout path for clients that send Auth=nil).
if req.Auth == nil {
// If the register request present a NodeKey that is currently in use, we will
// check if the node needs to be sent to re-auth, or if the node is logging out.
// We do not look up nodes by [key.MachinePublic] as it might belong to multiple
// nodes, separated by users and this path is handling expiring/logout paths.
if node, ok := h.state.GetNodeByNodeKey(req.NodeKey); ok {
resp, err := h.handleLogout(node, req, machineKey)
if err != nil {
return nil, fmt.Errorf("handling existing node: %w", err)
}
// If resp is not nil, we have a response to return to the node.
// If resp is nil, we should proceed and see if the node is trying to re-auth.
if resp != nil {
return resp, nil
}
} else {
// If the register request is not attempting to register a node, and
// we cannot match it with an existing node, we consider that unexpected
// as only register nodes should attempt to log out.
log.Debug().
Str("node.key", req.NodeKey.ShortString()).
Str("machine.key", machineKey.ShortString()).
Bool("unexpected", true).
Msg("received register request with no auth, and no existing node")
}
}
if regReq.Auth != nil && regReq.Auth.AuthKey != "" {
resp, err := h.handleRegisterWithAuthKey(regReq, machineKey)
// If the [tailcfg.RegisterRequest] has a Followup URL, it means that the
// node has already started the registration process and we should wait for
// it to finish the original registration.
if req.Followup != "" {
return h.waitForFollowup(ctx, req, machineKey)
}
// Pre authenticated keys are handled slightly different than interactive
// logins as they can be done fully sync and we can respond to the node with
// the result as it is waiting.
if isAuthKey(req) {
resp, err := h.handleRegisterWithAuthKey(req, machineKey)
if err != nil {
// Preserve HTTPError types so they can be handled properly by the HTTP layer
var httpErr HTTPError
@@ -58,7 +117,7 @@ func (h *Headscale) handleRegister(
return resp, nil
}
resp, err := h.handleRegisterInteractive(regReq, machineKey)
resp, err := h.handleRegisterInteractive(req, machineKey)
if err != nil {
return nil, fmt.Errorf("handling register interactive: %w", err)
}
@@ -66,20 +125,34 @@ func (h *Headscale) handleRegister(
return resp, nil
}
func (h *Headscale) handleExistingNode(
node *types.Node,
regReq tailcfg.RegisterRequest,
// handleLogout checks if the [tailcfg.RegisterRequest] is a
// logout attempt from a node. If the node is not attempting to
func (h *Headscale) handleLogout(
node types.NodeView,
req tailcfg.RegisterRequest,
machineKey key.MachinePublic,
) (*tailcfg.RegisterResponse, error) {
if node.MachineKey != machineKey {
// Fail closed if it looks like this is an attempt to modify a node where
// the node key and the machine key the noise session was started with does
// not align.
if node.MachineKey() != machineKey {
return nil, NewHTTPError(http.StatusUnauthorized, "node exist with different machine key", nil)
}
expired := node.IsExpired()
// Note: We do NOT return early if req.Auth is set, because Tailscale clients
// may send logout requests with BOTH a past expiry AND an auth key.
// A past expiry indicates logout, regardless of whether Auth is present.
// The expiry check below will handle the logout logic.
// If the node is expired and this is not a re-authentication attempt,
// force the client to re-authenticate
if expired && regReq.Auth == nil {
// force the client to re-authenticate.
// TODO(kradalby): I wonder if this is a path we ever hit?
if node.IsExpired() {
log.Trace().Str("node.name", node.Hostname()).
Uint64("node.id", node.ID().Uint64()).
Interface("reg.req", req).
Bool("unexpected", true).
Msg("Node key expired, forcing re-authentication")
return &tailcfg.RegisterResponse{
NodeKeyExpired: true,
MachineAuthorized: false,
@@ -87,49 +160,76 @@ func (h *Headscale) handleExistingNode(
}, nil
}
if !expired && !regReq.Expiry.IsZero() {
requestExpiry := regReq.Expiry
// If we get here, the node is not currently expired, and not trying to
// do an auth.
// The node is likely logging out, but before we run that logic, we will validate
// that the node is not attempting to tamper/extend their expiry.
// If it is not, we will expire the node or in the case of an ephemeral node, delete it.
// The client is trying to extend their key, this is not allowed.
if requestExpiry.After(time.Now()) {
return nil, NewHTTPError(http.StatusBadRequest, "extending key is not allowed", nil)
}
// If the request expiry is in the past, we consider it a logout.
if requestExpiry.Before(time.Now()) {
if node.IsEphemeral() {
c, err := h.state.DeleteNode(node.View())
if err != nil {
return nil, fmt.Errorf("deleting ephemeral node: %w", err)
}
h.Change(c)
return nil, nil
}
}
updatedNode, c, err := h.state.SetNodeExpiry(node.ID, requestExpiry)
if err != nil {
return nil, fmt.Errorf("setting node expiry: %w", err)
}
h.Change(c)
// CRITICAL: Use the updated node view for the response
// The original node object has stale expiry information
node = updatedNode.AsStruct()
// The client is trying to extend their key, this is not allowed.
if req.Expiry.After(time.Now()) {
return nil, NewHTTPError(http.StatusBadRequest, "extending key is not allowed", nil)
}
return nodeToRegisterResponse(node), nil
// If the request expiry is in the past, we consider it a logout.
if req.Expiry.Before(time.Now()) {
log.Debug().
Uint64("node.id", node.ID().Uint64()).
Str("node.name", node.Hostname()).
Bool("is_ephemeral", node.IsEphemeral()).
Bool("has_authkey", node.AuthKey().Valid()).
Time("req.expiry", req.Expiry).
Msg("Processing logout request with past expiry")
if node.IsEphemeral() {
log.Info().
Uint64("node.id", node.ID().Uint64()).
Str("node.name", node.Hostname()).
Msg("Deleting ephemeral node during logout")
c, err := h.state.DeleteNode(node)
if err != nil {
return nil, fmt.Errorf("deleting ephemeral node: %w", err)
}
h.Change(c)
return &tailcfg.RegisterResponse{
NodeKeyExpired: true,
MachineAuthorized: false,
}, nil
}
log.Debug().
Uint64("node.id", node.ID().Uint64()).
Str("node.name", node.Hostname()).
Msg("Node is not ephemeral, setting expiry instead of deleting")
}
// Update the internal state with the nodes new expiry, meaning it is
// logged out.
updatedNode, c, err := h.state.SetNodeExpiry(node.ID(), req.Expiry)
if err != nil {
return nil, fmt.Errorf("setting node expiry: %w", err)
}
h.Change(c)
return nodeToRegisterResponse(updatedNode), nil
}
func nodeToRegisterResponse(node *types.Node) *tailcfg.RegisterResponse {
// isAuthKey reports if the register request is a registration request
// using an pre auth key.
func isAuthKey(req tailcfg.RegisterRequest) bool {
return req.Auth != nil && req.Auth.AuthKey != ""
}
func nodeToRegisterResponse(node types.NodeView) *tailcfg.RegisterResponse {
return &tailcfg.RegisterResponse{
// TODO(kradalby): Only send for user-owned nodes
// and not tagged nodes when tags is working.
User: *node.User.TailscaleUser(),
Login: *node.User.TailscaleLogin(),
User: node.UserView().TailscaleUser(),
Login: node.UserView().TailscaleLogin(),
NodeKeyExpired: node.IsExpired(),
// Headscale does not implement the concept of machine authorization
@@ -141,10 +241,10 @@ func nodeToRegisterResponse(node *types.Node) *tailcfg.RegisterResponse {
func (h *Headscale) waitForFollowup(
ctx context.Context,
regReq tailcfg.RegisterRequest,
req tailcfg.RegisterRequest,
machineKey key.MachinePublic,
) (*tailcfg.RegisterResponse, error) {
fu, err := url.Parse(regReq.Followup)
fu, err := url.Parse(req.Followup)
if err != nil {
return nil, NewHTTPError(http.StatusUnauthorized, "invalid followup URL", err)
}
@@ -161,21 +261,21 @@ func (h *Headscale) waitForFollowup(
case node := <-reg.Registered:
if node == nil {
// registration is expired in the cache, instruct the client to try a new registration
return h.reqToNewRegisterResponse(regReq, machineKey)
return h.reqToNewRegisterResponse(req, machineKey)
}
return nodeToRegisterResponse(node), nil
return nodeToRegisterResponse(node.View()), nil
}
}
// if the follow-up registration isn't found anymore, instruct the client to try a new registration
return h.reqToNewRegisterResponse(regReq, machineKey)
return h.reqToNewRegisterResponse(req, machineKey)
}
// reqToNewRegisterResponse refreshes the registration flow by creating a new
// registration ID and returning the corresponding AuthURL so the client can
// restart the authentication process.
func (h *Headscale) reqToNewRegisterResponse(
regReq tailcfg.RegisterRequest,
req tailcfg.RegisterRequest,
machineKey key.MachinePublic,
) (*tailcfg.RegisterResponse, error) {
newRegID, err := types.NewRegistrationID()
@@ -183,18 +283,25 @@ func (h *Headscale) reqToNewRegisterResponse(
return nil, NewHTTPError(http.StatusInternalServerError, "failed to generate registration ID", err)
}
// Ensure we have valid hostinfo and hostname
validHostinfo, hostname := util.EnsureValidHostinfo(
req.Hostinfo,
machineKey.String(),
req.NodeKey.String(),
)
nodeToRegister := types.NewRegisterNode(
types.Node{
Hostname: regReq.Hostinfo.Hostname,
Hostname: hostname,
MachineKey: machineKey,
NodeKey: regReq.NodeKey,
Hostinfo: regReq.Hostinfo,
NodeKey: req.NodeKey,
Hostinfo: validHostinfo,
LastSeen: ptr.To(time.Now()),
},
)
if !regReq.Expiry.IsZero() {
nodeToRegister.Node.Expiry = &regReq.Expiry
if !req.Expiry.IsZero() {
nodeToRegister.Node.Expiry = &req.Expiry
}
log.Info().Msgf("New followup node registration using key: %s", newRegID)
@@ -206,11 +313,11 @@ func (h *Headscale) reqToNewRegisterResponse(
}
func (h *Headscale) handleRegisterWithAuthKey(
regReq tailcfg.RegisterRequest,
req tailcfg.RegisterRequest,
machineKey key.MachinePublic,
) (*tailcfg.RegisterResponse, error) {
node, changed, err := h.state.HandleNodeFromPreAuthKey(
regReq,
req,
machineKey,
)
if err != nil {
@@ -262,18 +369,26 @@ func (h *Headscale) handleRegisterWithAuthKey(
// h.Change(policyChange)
// }
user := node.User()
return &tailcfg.RegisterResponse{
resp := &tailcfg.RegisterResponse{
MachineAuthorized: true,
NodeKeyExpired: node.IsExpired(),
User: *user.TailscaleUser(),
Login: *user.TailscaleLogin(),
}, nil
User: node.UserView().TailscaleUser(),
Login: node.UserView().TailscaleLogin(),
}
log.Trace().
Caller().
Interface("reg.resp", resp).
Interface("reg.req", req).
Str("node.name", node.Hostname()).
Uint64("node.id", node.ID().Uint64()).
Msg("RegisterResponse")
return resp, nil
}
func (h *Headscale) handleRegisterInteractive(
regReq tailcfg.RegisterRequest,
req tailcfg.RegisterRequest,
machineKey key.MachinePublic,
) (*tailcfg.RegisterResponse, error) {
registrationId, err := types.NewRegistrationID()
@@ -281,18 +396,39 @@ func (h *Headscale) handleRegisterInteractive(
return nil, fmt.Errorf("generating registration ID: %w", err)
}
// Ensure we have valid hostinfo and hostname
validHostinfo, hostname := util.EnsureValidHostinfo(
req.Hostinfo,
machineKey.String(),
req.NodeKey.String(),
)
if req.Hostinfo == nil {
log.Warn().
Str("machine.key", machineKey.ShortString()).
Str("node.key", req.NodeKey.ShortString()).
Str("generated.hostname", hostname).
Msg("Received registration request with nil hostinfo, generated default hostname")
} else if req.Hostinfo.Hostname == "" {
log.Warn().
Str("machine.key", machineKey.ShortString()).
Str("node.key", req.NodeKey.ShortString()).
Str("generated.hostname", hostname).
Msg("Received registration request with empty hostname, generated default")
}
nodeToRegister := types.NewRegisterNode(
types.Node{
Hostname: regReq.Hostinfo.Hostname,
Hostname: hostname,
MachineKey: machineKey,
NodeKey: regReq.NodeKey,
Hostinfo: regReq.Hostinfo,
NodeKey: req.NodeKey,
Hostinfo: validHostinfo,
LastSeen: ptr.To(time.Now()),
},
)
if !regReq.Expiry.IsZero() {
nodeToRegister.Node.Expiry = &regReq.Expiry
if !req.Expiry.IsZero() {
nodeToRegister.Node.Expiry = &req.Expiry
}
h.state.SetRegistrationCacheEntry(