policy/v2: make default (#2546)

* policy/v2: make default

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>

* integration: do not run v1 tests

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>

* policy/v2: fix potential nil pointers

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>

* mapper: fix test failures in v2

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>

---------

Signed-off-by: Kristoffer Dalby <kristoffer@tailscale.com>
Kristoffer Dalby 2025-04-29 17:27:41 +03:00 committed by GitHub
parent 9a4d0e1a99
commit 2b38f7bef7
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
9 changed files with 35 additions and 202 deletions

View File

@@ -71,5 +71,4 @@ func main() {
 	}
 	updateYAML(quotedTests, "./test-integration.yaml")
-	updateYAML(quotedTests, "./test-integration-policyv2.yaml")
 }

View File

@@ -1,169 +0,0 @@
name: Integration Tests (policy v2)
# To debug locally on a branch, and when needing secrets
# change this to include `push` so the build is ran on
# the main repository.
on: [pull_request]

concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
  cancel-in-progress: true

jobs:
  integration-test:
    runs-on: ubuntu-latest
    strategy:
      fail-fast: false
      matrix:
        test:
          - TestACLHostsInNetMapTable
          - TestACLAllowUser80Dst
          - TestACLDenyAllPort80
          - TestACLAllowUserDst
          - TestACLAllowStarDst
          - TestACLNamedHostsCanReachBySubnet
          - TestACLNamedHostsCanReach
          - TestACLDevice1CanAccessDevice2
          - TestPolicyUpdateWhileRunningWithCLIInDatabase
          - TestAuthKeyLogoutAndReloginSameUser
          - TestAuthKeyLogoutAndReloginNewUser
          - TestAuthKeyLogoutAndReloginSameUserExpiredKey
          - TestOIDCAuthenticationPingAll
          - TestOIDCExpireNodesBasedOnTokenExpiry
          - TestOIDC024UserCreation
          - TestOIDCAuthenticationWithPKCE
          - TestOIDCReloginSameNodeNewUser
          - TestAuthWebFlowAuthenticationPingAll
          - TestAuthWebFlowLogoutAndRelogin
          - TestUserCommand
          - TestPreAuthKeyCommand
          - TestPreAuthKeyCommandWithoutExpiry
          - TestPreAuthKeyCommandReusableEphemeral
          - TestPreAuthKeyCorrectUserLoggedInCommand
          - TestApiKeyCommand
          - TestNodeTagCommand
          - TestNodeAdvertiseTagCommand
          - TestNodeCommand
          - TestNodeExpireCommand
          - TestNodeRenameCommand
          - TestNodeMoveCommand
          - TestPolicyCommand
          - TestPolicyBrokenConfigCommand
          - TestDERPVerifyEndpoint
          - TestResolveMagicDNS
          - TestResolveMagicDNSExtraRecordsPath
          - TestValidateResolvConf
          - TestDERPServerScenario
          - TestDERPServerWebsocketScenario
          - TestPingAllByIP
          - TestPingAllByIPPublicDERP
          - TestEphemeral
          - TestEphemeralInAlternateTimezone
          - TestEphemeral2006DeletedTooQuickly
          - TestPingAllByHostname
          - TestTaildrop
          - TestUpdateHostnameFromClient
          - TestExpireNode
          - TestNodeOnlineStatus
          - TestPingAllByIPManyUpDown
          - Test2118DeletingOnlineNodePanics
          - TestEnablingRoutes
          - TestHASubnetRouterFailover
          - TestSubnetRouteACL
          - TestEnablingExitRoutes
          - TestSubnetRouterMultiNetwork
          - TestSubnetRouterMultiNetworkExitNode
          - TestAutoApproveMultiNetwork
          - TestHeadscale
          - TestTailscaleNodesJoiningHeadcale
          - TestSSHOneUserToAll
          - TestSSHMultipleUsersAllToAll
          - TestSSHNoSSHConfigured
          - TestSSHIsBlockedInACL
          - TestSSHUserOnlyIsolation
        database: [postgres, sqlite]
    env:
      # Github does not allow us to access secrets in pull requests,
      # so this env var is used to check if we have the secret or not.
      # If we have the secrets, meaning we are running on push in a fork,
      # there might be secrets available for more debugging.
      # If TS_OAUTH_CLIENT_ID and TS_OAUTH_SECRET is set, then the job
      # will join a debug tailscale network, set up SSH and a tmux session.
      # The SSH will be configured to use the SSH key of the Github user
      # that triggered the build.
      HAS_TAILSCALE_SECRET: ${{ secrets.TS_OAUTH_CLIENT_ID }}
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 2
      - name: Get changed files
        id: changed-files
        uses: dorny/paths-filter@v3
        with:
          filters: |
            files:
              - '*.nix'
              - 'go.*'
              - '**/*.go'
              - 'integration_test/'
              - 'config-example.yaml'
      - name: Tailscale
        if: ${{ env.HAS_TAILSCALE_SECRET }}
        uses: tailscale/github-action@v2
        with:
          oauth-client-id: ${{ secrets.TS_OAUTH_CLIENT_ID }}
          oauth-secret: ${{ secrets.TS_OAUTH_SECRET }}
          tags: tag:gh
      - name: Setup SSH server for Actor
        if: ${{ env.HAS_TAILSCALE_SECRET }}
        uses: alexellis/setup-sshd-actor@master
      - uses: DeterminateSystems/nix-installer-action@main
        if: steps.changed-files.outputs.files == 'true'
      - uses: DeterminateSystems/magic-nix-cache-action@main
        if: steps.changed-files.outputs.files == 'true'
      - uses: satackey/action-docker-layer-caching@main
        if: steps.changed-files.outputs.files == 'true'
        continue-on-error: true
      - name: Run Integration Test
        uses: Wandalen/wretry.action@master
        if: steps.changed-files.outputs.files == 'true'
        env:
          USE_POSTGRES: ${{ matrix.database == 'postgres' && '1' || '0' }}
        with:
          # Our integration tests are started like a thundering herd, often
          # hitting limits of the various external repositories we depend on
          # like docker hub. This will retry jobs every 5 min, 10 times,
          # hopefully letting us avoid manual intervention and restarting jobs.
          # One could of course argue that we should invest in trying to avoid
          # this, but currently it seems like a larger investment to be cleverer
          # about this.
          # Some of the jobs might still require manual restart as they are really
          # slow and this will cause them to eventually be killed by Github actions.
          attempt_delay: 300000 # 5 min
          attempt_limit: 10
          command: |
            nix develop --command -- docker run \
              --tty --rm \
              --volume ~/.cache/hs-integration-go:/go \
              --name headscale-test-suite \
              --volume $PWD:$PWD -w $PWD/integration \
              --volume /var/run/docker.sock:/var/run/docker.sock \
              --volume $PWD/control_logs:/tmp/control \
              --env HEADSCALE_INTEGRATION_POSTGRES=${{env.USE_POSTGRES}} \
              --env HEADSCALE_EXPERIMENTAL_POLICY_V2=1 \
              golang:1 \
                go run gotest.tools/gotestsum@latest -- ./... \
                -failfast \
                -timeout 120m \
                -parallel 1 \
                -run "^${{ matrix.test }}$"
      - uses: actions/upload-artifact@v4
        if: always() && steps.changed-files.outputs.files == 'true'
        with:
          name: ${{ matrix.test }}-${{matrix.database}}-${{matrix.policy}}-logs
          path: "control_logs/*.log"
      - uses: actions/upload-artifact@v4
        if: always() && steps.changed-files.outputs.files == 'true'
        with:
          name: ${{ matrix.test }}-${{matrix.database}}-${{matrix.policy}}-pprof
          path: "control_logs/*.pprof.tar"
      - name: Setup a blocking tmux session
        if: ${{ env.HAS_TAILSCALE_SECRET }}
        uses: alexellis/block-with-tmux-action@master

View File

@@ -136,7 +136,7 @@ jobs:
           # about this.
           # Some of the jobs might still require manual restart as they are really
           # slow and this will cause them to eventually be killed by Github actions.
           attempt_delay: 300000 # 5 min
           attempt_limit: 10
           command: |
             nix develop --command -- docker run \
@@ -147,7 +147,6 @@ jobs:
               --volume /var/run/docker.sock:/var/run/docker.sock \
               --volume $PWD/control_logs:/tmp/control \
               --env HEADSCALE_INTEGRATION_POSTGRES=${{env.USE_POSTGRES}} \
-              --env HEADSCALE_EXPERIMENTAL_POLICY_V2=0 \
               golang:1 \
                 go run gotest.tools/gotestsum@latest -- ./... \
                 -failfast \
@@ -157,12 +156,12 @@ jobs:
       - uses: actions/upload-artifact@v4
         if: always() && steps.changed-files.outputs.files == 'true'
         with:
-          name: ${{ matrix.test }}-${{matrix.database}}-${{matrix.policy}}-logs
+          name: ${{ matrix.test }}-${{matrix.database}}-logs
           path: "control_logs/*.log"
       - uses: actions/upload-artifact@v4
         if: always() && steps.changed-files.outputs.files == 'true'
        with:
-          name: ${{ matrix.test }}-${{matrix.database}}-${{matrix.policy}}-pprof
+          name: ${{ matrix.test }}-${{matrix.database}}-pprof
           path: "control_logs/*.pprof.tar"
       - name: Setup a blocking tmux session
         if: ${{ env.HAS_TAILSCALE_SECRET }}

View File

@@ -4,6 +4,8 @@
 ### BREAKING

+#### Routes
+
 Route internals have been rewritten, removing the dedicated route table in the
 database. This was done to simplify the codebase, which had grown unnecessarily
 complex after the routes were split into separate tables. The overhead of having
@@ -35,14 +37,15 @@ will be approved.
 - Routes are now managed via the Node API
   [#2422](https://github.com/juanfont/headscale/pull/2422)

-### Experimental Policy v2
+#### Policy v2

-This release introduces a new experimental version of Headscales policy
-implementation. In this context, experimental means that the feature is not yet
-fully tested and may contain bugs or unexpected behavior and that we are still
-experimenting with how the final interface/behavior will be.
+This release introduces a new policy implementation. The new policy is a
+complete rewrite, and it introduces some significant quality and consistency
+improvements. In principle, there are not really any new features, but some long
+standing bugs should have been resolved, or be easier to fix in the future. The
+new policy code passes all of our tests.

-#### Breaking changes
+**Changes**

 - The policy is validated and "resolved" when loading, providing errors for
   invalid rules and conditions.
@@ -59,19 +62,14 @@ experimenting with how the final interface/behavior will be.
   `@` should be appended at the end. For example, if your user is `john`, it
   must be written as `john@` in the policy.

-#### Current state
+**Current state**

 The new policy is passing all tests, both integration and unit tests. This does
 not mean it is perfect, but it is a good start. Corner cases that is currently
 working in v1 and not tested might be broken in v2 (and vice versa).

-**We do need help testing this code**, and we think that most of the user facing
-API will not really change. We are not sure yet when this code will replace v1,
-but we are confident that it will, and all new changes and fixes will be made
-towards this code.
+**We do need help testing this code**

-The new policy can be used by setting the environment variable
-`HEADSCALE_EXPERIMENTAL_POLICY_V2` to `1`.

 #### Other breaking
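
The trailing-`@` user syntax called out in the changelog is the change most existing policies will notice. Below is a minimal sketch of a v2-style policy, wrapped in a small Go program so it stands alone; the user `john` and the surrounding harness are illustrative and not part of this commit. Only the `user@` source form and the `user@:*` destination shape are taken from the changelog and the updated mapper test further down.

```go
package main

import "fmt"

// examplePolicy is an illustrative HuJSON policy using the v2 user syntax:
// user names are written with a trailing `@` (e.g. "john@"), matching the
// "user1@:*" destination in the updated mapper test in this commit.
const examplePolicy = `{
	"acls": [
		{
			"action": "accept",
			"src": ["john@"],
			"dst": ["john@:*"],
		},
	],
}`

func main() {
	// In headscale this text would be handed to the policy manager as raw
	// bytes; printing it keeps the sketch self-contained and runnable.
	fmt.Println(examplePolicy)
}
```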

View File

@@ -346,7 +346,7 @@ func Test_fullMapResponse(t *testing.T) {
 			{
 				"action": "accept",
 				"src": ["100.64.0.2"],
-				"dst": ["user1:*"],
+				"dst": ["user1@:*"],
 			},
 		],
 	}
@@ -382,7 +382,7 @@
 			},
 		},
 	},
-	SSHPolicy: &tailcfg.SSHPolicy{},
+	SSHPolicy: nil,
 	UserProfiles: []tailcfg.UserProfile{
 		{ID: tailcfg.UserID(user1.ID), LoginName: "user1", DisplayName: "user1"},
 		{ID: tailcfg.UserID(user2.ID), LoginName: "user2", DisplayName: "user2"},

View File

@@ -11,7 +11,7 @@ import (
 )

 var (
-	polv2 = envknob.Bool("HEADSCALE_EXPERIMENTAL_POLICY_V2")
+	polv1 = envknob.Bool("HEADSCALE_POLICY_V1")
 )

 type PolicyManager interface {
@@ -35,13 +35,13 @@ type PolicyManager interface {
 func NewPolicyManager(pol []byte, users []types.User, nodes types.Nodes) (PolicyManager, error) {
 	var polMan PolicyManager
 	var err error

-	if polv2 {
-		polMan, err = policyv2.NewPolicyManager(pol, users, nodes)
+	if polv1 {
+		polMan, err = policyv1.NewPolicyManager(pol, users, nodes)
 		if err != nil {
 			return nil, err
 		}
 	} else {
-		polMan, err = policyv1.NewPolicyManager(pol, users, nodes)
+		polMan, err = policyv2.NewPolicyManager(pol, users, nodes)
 		if err != nil {
 			return nil, err
 		}
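
The practical effect of this hunk is that v2 is now the branch taken unless the operator explicitly opts back out. A self-contained sketch of the new opt-out knob, using the same `envknob.Bool` call as the diff; the program around it is illustrative, and the real selection happens inside `NewPolicyManager` as shown above.

```go
package main

import (
	"fmt"

	"tailscale.com/envknob"
)

// polv1 mirrors the variable in the hunk above: the old opt-in knob
// HEADSCALE_EXPERIMENTAL_POLICY_V2 is gone, and the only switch left is an
// opt-out back to the v1 implementation.
var polv1 = envknob.Bool("HEADSCALE_POLICY_V1")

func main() {
	if polv1 {
		fmt.Println("policy v1 selected (HEADSCALE_POLICY_V1=1 set in the environment)")
	} else {
		fmt.Println("policy v2 selected (the new default)")
	}
}
```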

View File

@@ -38,7 +38,7 @@ func (pol *Policy) compileFilterRules(
 			log.Trace().Err(err).Msgf("resolving source ips")
 		}

-		if len(srcIPs.Prefixes()) == 0 {
+		if srcIPs == nil || len(srcIPs.Prefixes()) == 0 {
 			continue
 		}
@@ -56,6 +56,10 @@
 				log.Trace().Err(err).Msgf("resolving destination ips")
 			}

+			if ips == nil {
+				continue
+			}
+
 			for _, pref := range ips.Prefixes() {
 				for _, port := range dest.Ports {
 					pr := tailcfg.NetPortRange{
@@ -162,6 +166,10 @@ func (pol *Policy) compileSSHPolicy(
 func ipSetToPrefixStringList(ips *netipx.IPSet) []string {
 	var out []string

+	if ips == nil {
+		return out
+	}
+
 	for _, pref := range ips.Prefixes() {
 		out = append(out, pref.String())
 	}
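
All three guards follow the same pattern: alias resolution can legitimately leave an `*netipx.IPSet` nil, and calling `Prefixes()` on a nil set panics, so the nil case is treated as an empty set. A standalone sketch of that pattern; the helper name and the sample prefix are illustrative, not from this commit.

```go
package main

import (
	"fmt"
	"net/netip"

	"go4.org/netipx"
)

// prefixStrings mirrors the guarded ipSetToPrefixStringList above: a nil
// *netipx.IPSet is treated as an empty set instead of being dereferenced.
func prefixStrings(ips *netipx.IPSet) []string {
	var out []string
	if ips == nil {
		return out
	}
	for _, pref := range ips.Prefixes() {
		out = append(out, pref.String())
	}
	return out
}

func main() {
	fmt.Println(prefixStrings(nil)) // prints []: no panic on the nil case

	var b netipx.IPSetBuilder
	b.AddPrefix(netip.MustParsePrefix("100.64.0.0/10"))
	set, _ := b.IPSet()
	fmt.Println(prefixStrings(set)) // prints [100.64.0.0/10]
}
```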

View File

@@ -70,7 +70,6 @@ type HeadscaleInContainer struct
 	tlsKey           []byte
 	filesInContainer []fileInContainer
 	postgres         bool
-	policyV2         bool
 	policyMode       types.PolicyMode
 }
@@ -188,11 +187,10 @@ func WithPostgres() Option {
 	}
 }

-// WithPolicyV2 tells the integration test to use the new v2 filter.
-func WithPolicyV2() Option {
+// WithPolicyV1 tells the integration test to use the old v1 filter.
+func WithPolicyV1() Option {
 	return func(hsic *HeadscaleInContainer) {
-		hsic.policyV2 = true
-		hsic.env["HEADSCALE_EXPERIMENTAL_POLICY_V2"] = "1"
+		hsic.env["HEADSCALE_POLICY_V1"] = "1"
 	}
 }

View File

@@ -47,7 +47,7 @@ const (
 )

 var usePostgresForTest = envknob.Bool("HEADSCALE_INTEGRATION_POSTGRES")
-var usePolicyV2ForTest = envknob.Bool("HEADSCALE_EXPERIMENTAL_POLICY_V2")
+var usePolicyV1ForTest = envknob.Bool("HEADSCALE_POLICY_V1")

 var (
 	errNoHeadscaleAvailable = errors.New("no headscale available")
@@ -408,8 +408,8 @@ func (s *Scenario) Headscale(opts ...hsic.Option) (ControlServer, error) {
 		opts = append(opts, hsic.WithPostgres())
 	}

-	if usePolicyV2ForTest {
-		opts = append(opts, hsic.WithPolicyV2())
+	if usePolicyV1ForTest {
+		opts = append(opts, hsic.WithPolicyV1())
 	}

 	headscale, err := hsic.New(s.pool, s.Networks(), opts...)