From 64319f79ff1934865805fc73be2228dddce0ec80 Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Wed, 11 Sep 2024 12:00:32 +0200 Subject: [PATCH] make stream shutdown if self-node has been removed (#2125) * add shutdown that asserts if headscale had panics Signed-off-by: Kristoffer Dalby * add test case producing 2118 panic Signed-off-by: Kristoffer Dalby * make stream shutdown if self-node has been removed Currently we will read the node from database, and since it is deleted, the id might be set to nil. Keep the node around and just shutdown, so it is cleanly removed from notifier. Fixes #2118 Signed-off-by: Kristoffer Dalby --------- Signed-off-by: Kristoffer Dalby --- .github/workflows/test-integration.yaml | 1 + hscontrol/poll.go | 7 ++ integration/control.go | 4 +- integration/dockertestutil/logs.go | 18 +++-- integration/general_test.go | 99 +++++++++++++++++++++++++ integration/hsic/hsic.go | 8 +- integration/scenario.go | 28 +++++-- integration/tsic/tsic.go | 4 +- 8 files changed, 148 insertions(+), 21 deletions(-) diff --git a/.github/workflows/test-integration.yaml b/.github/workflows/test-integration.yaml index ed194da1..d6c7eff2 100644 --- a/.github/workflows/test-integration.yaml +++ b/.github/workflows/test-integration.yaml @@ -52,6 +52,7 @@ jobs: - TestExpireNode - TestNodeOnlineStatus - TestPingAllByIPManyUpDown + - Test2118DeletingOnlineNodePanics - TestEnablingRoutes - TestHASubnetRouterFailover - TestEnableDisableAutoApprovedRoute diff --git a/hscontrol/poll.go b/hscontrol/poll.go index d7ba682e..82a5295f 100644 --- a/hscontrol/poll.go +++ b/hscontrol/poll.go @@ -5,6 +5,7 @@ import ( "fmt" "math/rand/v2" "net/http" + "slices" "sort" "strings" "time" @@ -273,6 +274,12 @@ func (m *mapSession) serveLongPoll() { return } + // If the node has been removed from headscale, close the stream + if slices.Contains(update.Removed, m.node.ID) { + m.tracef("node removed, closing stream") + return + } + m.tracef("received stream update: %s %s", update.Type.String(), update.Message) mapResponseUpdateReceived.WithLabelValues(update.Type.String()).Inc() diff --git a/integration/control.go b/integration/control.go index 8a34bde8..b5699577 100644 --- a/integration/control.go +++ b/integration/control.go @@ -6,8 +6,8 @@ import ( ) type ControlServer interface { - Shutdown() error - SaveLog(string) error + Shutdown() (string, string, error) + SaveLog(string) (string, string, error) SaveProfile(string) error Execute(command []string) (string, error) WriteFile(path string, content []byte) error diff --git a/integration/dockertestutil/logs.go b/integration/dockertestutil/logs.go index 98ba970a..64c3c9ac 100644 --- a/integration/dockertestutil/logs.go +++ b/integration/dockertestutil/logs.go @@ -17,10 +17,10 @@ func SaveLog( pool *dockertest.Pool, resource *dockertest.Resource, basePath string, -) error { +) (string, string, error) { err := os.MkdirAll(basePath, os.ModePerm) if err != nil { - return err + return "", "", err } var stdout bytes.Buffer @@ -41,28 +41,30 @@ func SaveLog( }, ) if err != nil { - return err + return "", "", err } log.Printf("Saving logs for %s to %s\n", resource.Container.Name, basePath) + stdoutPath := path.Join(basePath, resource.Container.Name+".stdout.log") err = os.WriteFile( - path.Join(basePath, resource.Container.Name+".stdout.log"), + stdoutPath, stdout.Bytes(), filePerm, ) if err != nil { - return err + return "", "", err } + stderrPath := path.Join(basePath, resource.Container.Name+".stderr.log") err = os.WriteFile( - path.Join(basePath, resource.Container.Name+".stderr.log"), + stderrPath, stderr.Bytes(), filePerm, ) if err != nil { - return err + return "", "", err } - return nil + return stdoutPath, stderrPath, nil } diff --git a/integration/general_test.go b/integration/general_test.go index 6de00fd2..a8421f47 100644 --- a/integration/general_test.go +++ b/integration/general_test.go @@ -954,3 +954,102 @@ func TestPingAllByIPManyUpDown(t *testing.T) { t.Logf("%d successful pings out of %d", success, len(allClients)*len(allIps)) } } + +func Test2118DeletingOnlineNodePanics(t *testing.T) { + IntegrationSkip(t) + t.Parallel() + + scenario, err := NewScenario(dockertestMaxWait()) + assertNoErr(t, err) + defer scenario.ShutdownAssertNoPanics(t) + + // TODO(kradalby): it does not look like the user thing works, only second + // get created? maybe only when many? + spec := map[string]int{ + "user1": 1, + "user2": 1, + } + + err = scenario.CreateHeadscaleEnv(spec, + []tsic.Option{}, + hsic.WithTestName("deletenocrash"), + hsic.WithEmbeddedDERPServerOnly(), + hsic.WithTLS(), + hsic.WithHostnameAsServerURL(), + ) + assertNoErrHeadscaleEnv(t, err) + + allClients, err := scenario.ListTailscaleClients() + assertNoErrListClients(t, err) + + allIps, err := scenario.ListTailscaleClientsIPs() + assertNoErrListClientIPs(t, err) + + err = scenario.WaitForTailscaleSync() + assertNoErrSync(t, err) + + allAddrs := lo.Map(allIps, func(x netip.Addr, index int) string { + return x.String() + }) + + success := pingAllHelper(t, allClients, allAddrs) + t.Logf("%d successful pings out of %d", success, len(allClients)*len(allIps)) + + headscale, err := scenario.Headscale() + assertNoErr(t, err) + + // Test list all nodes after added otherUser + var nodeList []v1.Node + err = executeAndUnmarshal( + headscale, + []string{ + "headscale", + "nodes", + "list", + "--output", + "json", + }, + &nodeList, + ) + assert.Nil(t, err) + assert.Len(t, nodeList, 2) + assert.True(t, nodeList[0].Online) + assert.True(t, nodeList[1].Online) + + // Delete the first node, which is online + _, err = headscale.Execute( + []string{ + "headscale", + "nodes", + "delete", + "--identifier", + // Delete the last added machine + fmt.Sprintf("%d", nodeList[0].Id), + "--output", + "json", + "--force", + }, + ) + assert.Nil(t, err) + + time.Sleep(2 * time.Second) + + // Ensure that the node has been deleted, this did not occur due to a panic. + var nodeListAfter []v1.Node + err = executeAndUnmarshal( + headscale, + []string{ + "headscale", + "nodes", + "list", + "--output", + "json", + }, + &nodeListAfter, + ) + assert.Nil(t, err) + assert.Len(t, nodeListAfter, 1) + assert.True(t, nodeListAfter[0].Online) + assert.Equal(t, nodeList[1].Id, nodeListAfter[0].Id) + +} diff --git a/integration/hsic/hsic.go b/integration/hsic/hsic.go index b9026225..20a778b8 100644 --- a/integration/hsic/hsic.go +++ b/integration/hsic/hsic.go @@ -398,8 +398,8 @@ func (t *HeadscaleInContainer) hasTLS() bool { } // Shutdown stops and cleans up the Headscale container. -func (t *HeadscaleInContainer) Shutdown() error { - err := t.SaveLog("/tmp/control") +func (t *HeadscaleInContainer) Shutdown() (string, string, error) { + stdoutPath, stderrPath, err := t.SaveLog("/tmp/control") if err != nil { log.Printf( "Failed to save log from control: %s", @@ -458,12 +458,12 @@ func (t *HeadscaleInContainer) Shutdown() error { t.pool.Purge(t.pgContainer) } - return t.pool.Purge(t.container) + return stdoutPath, stderrPath, t.pool.Purge(t.container) } // SaveLog saves the current stdout log of the container to a path // on the host system. -func (t *HeadscaleInContainer) SaveLog(path string) error { +func (t *HeadscaleInContainer) SaveLog(path string) (string, string, error) { return dockertestutil.SaveLog(t.pool, t.container, path) } diff --git a/integration/scenario.go b/integration/scenario.go index 075d1fd5..df978f2a 100644 --- a/integration/scenario.go +++ b/integration/scenario.go @@ -8,6 +8,7 @@ import ( "os" "sort" "sync" + "testing" "time" v1 "github.com/juanfont/headscale/gen/go/headscale/v1" @@ -18,6 +19,7 @@ import ( "github.com/ory/dockertest/v3" "github.com/puzpuzpuz/xsync/v3" "github.com/samber/lo" + "github.com/stretchr/testify/assert" "golang.org/x/sync/errgroup" "tailscale.com/envknob" ) @@ -187,13 +189,9 @@ func NewScenario(maxWait time.Duration) (*Scenario, error) { }, nil } -// Shutdown shuts down and cleans up all the containers (ControlServer, TailscaleClient) -// and networks associated with it. -// In addition, it will save the logs of the ControlServer to `/tmp/control` in the -// environment running the tests. -func (s *Scenario) Shutdown() { +func (s *Scenario) ShutdownAssertNoPanics(t *testing.T) { s.controlServers.Range(func(_ string, control ControlServer) bool { - err := control.Shutdown() + stdoutPath, stderrPath, err := control.Shutdown() if err != nil { log.Printf( "Failed to shut down control: %s", @@ -201,6 +199,16 @@ func (s *Scenario) Shutdown() { ) } + if t != nil { + stdout, err := os.ReadFile(stdoutPath) + assert.NoError(t, err) + assert.NotContains(t, string(stdout), "panic") + + stderr, err := os.ReadFile(stderrPath) + assert.NoError(t, err) + assert.NotContains(t, string(stderr), "panic") + } + return true }) @@ -224,6 +232,14 @@ func (s *Scenario) Shutdown() { // } } +// Shutdown shuts down and cleans up all the containers (ControlServer, TailscaleClient) +// and networks associated with it. +// In addition, it will save the logs of the ControlServer to `/tmp/control` in the +// environment running the tests. +func (s *Scenario) Shutdown() { + s.ShutdownAssertNoPanics(nil) +} + // Users returns the name of all users associated with the Scenario. func (s *Scenario) Users() []string { users := make([]string, 0) diff --git a/integration/tsic/tsic.go b/integration/tsic/tsic.go index e1045ec3..a3fac17c 100644 --- a/integration/tsic/tsic.go +++ b/integration/tsic/tsic.go @@ -998,7 +998,9 @@ func (t *TailscaleInContainer) WriteFile(path string, data []byte) error { // SaveLog saves the current stdout log of the container to a path // on the host system. func (t *TailscaleInContainer) SaveLog(path string) error { - return dockertestutil.SaveLog(t.pool, t.container, path) + // TODO(kradalby): Assert if tailscale logs contains panics. + _, _, err := dockertestutil.SaveLog(t.pool, t.container, path) + return err } // ReadFile reads a file from the Tailscale container.