cmd/hi: improve test cleanup to reduce CI disk usage (#2881)

2025-11-28 21:18:02 -05:00 · 2025-11-28 16:59:54 +01:00
parent db293e0698
commit ed78bf4b98
7 changed files with 230 additions and 17 deletions
--- a/cmd/hi/README.md
+++ b/cmd/hi/README.md
@@ -0,0 +1,6 @@
+# hi
+
+hi (headscale integration runner) is an entirely "vibe coded" wrapper around our
+[integration test suite](../../integration). It essentially runs the docker
+commands for you with some added benefits of extracting resources like logs and
+databases.
--- a/cmd/hi/cleanup.go
+++ b/cmd/hi/cleanup.go
@@ -3,6 +3,9 @@ package main
 import (
 	"context"
 	"fmt"
+	"log"
+	"os"
+	"path/filepath"
 	"strings"
 	"time"

@@ -205,3 +208,110 @@ func cleanCacheVolume(ctx context.Context) error {

 	return nil
 }
+
+// cleanupSuccessfulTestArtifacts deletes bulky debugging artifacts from logsDir; it is meant for successful runs.
+// Only the top level of logsDir is inspected. The following entries are removed:
+// - Directories named "pprof" or "mapresponses" (recursively)
+// - Files prefixed "hs-" or "ts-" that end in ".db", "_metrics.txt" or "_status.json"
+// Every other entry, including log files (.log), is left in place.
+//
+// Removal failures are logged only when verbose is set and never abort the cleanup, so the
+// only error returned is a failure to read logsDir. The freed size in the summary is an estimate:
+// sizes that cannot be determined are skipped, and a size is counted even if its removal then fails.
+func cleanupSuccessfulTestArtifacts(logsDir string, verbose bool) error {
+	entries, err := os.ReadDir(logsDir)
+	if err != nil {
+		return fmt.Errorf("failed to read logs directory: %w", err)
+	}
+
+	var (
+		removedFiles, removedDirs int
+		totalSize                 int64
+	)
+
+	for _, entry := range entries {
+		name := entry.Name()
+		fullPath := filepath.Join(logsDir, name)
+
+		if entry.IsDir() {
+			// Remove pprof and mapresponses directories (typically large); other directories are kept.
+			// These directories contain artifacts from all containers in the test run
+			if name == "pprof" || name == "mapresponses" {
+				size, sizeErr := getDirSize(fullPath)
+				if sizeErr == nil {
+					totalSize += size
+				}
+
+				err := os.RemoveAll(fullPath)
+				if err != nil {
+					if verbose {
+						log.Printf("Warning: failed to remove directory %s: %v", name, err)
+					}
+				} else {
+					removedDirs++
+
+					if verbose {
+						log.Printf("Removed directory: %s/", name)
+					}
+				}
+			}
+		} else {
+			// Only process test-related files: "hs-" (headscale) and "ts-" (tailscale) prefixes
+			if !strings.HasPrefix(name, "hs-") && !strings.HasPrefix(name, "ts-") {
+				continue
+			}
+
+			// Remove database, metrics, and status files; anything else (e.g. logs) is kept
+			shouldRemove := strings.HasSuffix(name, ".db") ||
+				strings.HasSuffix(name, "_metrics.txt") ||
+				strings.HasSuffix(name, "_status.json")
+
+			if shouldRemove {
+				info, infoErr := entry.Info()
+				if infoErr == nil {
+					totalSize += info.Size()
+				}
+
+				err := os.Remove(fullPath)
+				if err != nil {
+					if verbose {
+						log.Printf("Warning: failed to remove file %s: %v", name, err)
+					}
+				} else {
+					removedFiles++
+
+					if verbose {
+						log.Printf("Removed file: %s", name)
+					}
+				}
+			}
+		}
+	}
+
+	if removedFiles > 0 || removedDirs > 0 {
+		const bytesPerMB = 1024 * 1024
+		log.Printf("Cleaned up %d files and %d directories (freed ~%.2f MB)",
+			removedFiles, removedDirs, float64(totalSize)/bytesPerMB)
+	}
+
+	return nil
+}
+
+// getDirSize returns the summed size in bytes of every non-directory entry under path; on a walk error it stops and returns that error with the partial total.
+func getDirSize(path string) (int64, error) {
+	var size int64
+
+	err := filepath.Walk(path, func(_ string, info os.FileInfo, err error) error {
+		if err != nil {
+			return err
+		}
+
+		if !info.IsDir() {
+			size += info.Size()
+		}
+
+		return nil
+	})
+
+	return size, err
+}
--- a/cmd/hi/docker.go
+++ b/cmd/hi/docker.go
@@ -154,6 +154,19 @@ func runTestContainer(ctx context.Context, config *RunConfig) error {
 		if cleanErr := cleanupAfterTest(ctx, cli, resp.ID); cleanErr != nil && config.Verbose {
 			log.Printf("Warning: post-test cleanup failed: %v", cleanErr)
 		}
+
+		// Clean up artifacts from successful tests to save disk space in CI
+		if exitCode == 0 {
+			if config.Verbose {
+				log.Printf("Test succeeded, cleaning up artifacts to save disk space...")
+			}
+
+			cleanErr := cleanupSuccessfulTestArtifacts(logsDir, config.Verbose)
+
+			if cleanErr != nil && config.Verbose {
+				log.Printf("Warning: artifact cleanup failed: %v", cleanErr)
+			}
+		}
 	}

 	if err != nil {
--- a/integration/dockertestutil/build.go
+++ b/integration/dockertestutil/build.go
@@ -1,17 +1,25 @@
 package dockertestutil

 import (
+	"context"
 	"os/exec"
+	"time"
 )

 // RunDockerBuildForDiagnostics runs docker build manually to get detailed error output.
 // This is used when a docker build fails to provide more detailed diagnostic information
 // than what dockertest typically provides.
-func RunDockerBuildForDiagnostics(contextDir, dockerfile string) string {
-	cmd := exec.Command("docker", "build", "-f", dockerfile, contextDir)
+//
+// Returns the build output regardless of success/failure, and an error if the build failed.
+func RunDockerBuildForDiagnostics(contextDir, dockerfile string) (string, error) {
+	// Use a context with timeout to prevent hanging builds
+	const buildTimeout = 10 * time.Minute
+
+	ctx, cancel := context.WithTimeout(context.Background(), buildTimeout)
+	defer cancel()
+
+	cmd := exec.CommandContext(ctx, "docker", "build", "--progress=plain", "--no-cache", "-f", dockerfile, contextDir)
 	output, err := cmd.CombinedOutput()
-	if err != nil {
-		return string(output)
-	}
-	return ""
+
+	return string(output), err
 }
--- a/integration/dockertestutil/network.go
+++ b/integration/dockertestutil/network.go
@@ -108,6 +108,8 @@ func CleanUnreferencedNetworks(pool *dockertest.Pool) error {
 }

 // CleanImagesInCI removes images if running in CI.
+// It only removes dangling (untagged) images to avoid forcing rebuilds.
+// Tagged images (golang:*, tailscale/tailscale:*, etc.) are automatically preserved.
 func CleanImagesInCI(pool *dockertest.Pool) error {
 	if !util.IsCI() {
 		log.Println("Skipping image cleanup outside of CI")
@@ -119,9 +121,26 @@ func CleanImagesInCI(pool *dockertest.Pool) error {
 		return fmt.Errorf("getting images: %w", err)
 	}

+	removedCount := 0
 	for _, image := range images {
-		log.Printf("removing image: %s, %v", image.ID, image.RepoTags)
-		_ = pool.Client.RemoveImage(image.ID)
+		// Only remove dangling (untagged) images to avoid forcing rebuilds
+		// Dangling images have no RepoTags or only have "<none>:<none>"
+		if len(image.RepoTags) == 0 || (len(image.RepoTags) == 1 && image.RepoTags[0] == "<none>:<none>") {
+			log.Printf("Removing dangling image: %s", image.ID[:12])
+
+			err := pool.Client.RemoveImage(image.ID)
+			if err != nil {
+				log.Printf("Warning: failed to remove image %s: %v", image.ID[:12], err)
+			} else {
+				removedCount++
+			}
+		}
+	}
+
+	if removedCount > 0 {
+		log.Printf("Removed %d dangling images in CI", removedCount)
+	} else {
+		log.Println("No dangling images to remove in CI")
 	}

 	return nil
--- a/integration/hsic/hsic.go
+++ b/integration/hsic/hsic.go
@@ -462,11 +462,33 @@ func New(
 	if err != nil {
 		// Try to get more detailed build output
 		log.Printf("Docker build failed, attempting to get detailed output...")
-		buildOutput := dockertestutil.RunDockerBuildForDiagnostics(dockerContextPath, IntegrationTestDockerFileName)
-		if buildOutput != "" {
-			return nil, fmt.Errorf("could not start headscale container: %w\n\nDetailed build output:\n%s", err, buildOutput)
+
+		buildOutput, buildErr := dockertestutil.RunDockerBuildForDiagnostics(dockerContextPath, IntegrationTestDockerFileName)
+
+		// Show the last 100 lines of build output to avoid overwhelming the logs
+		lines := strings.Split(buildOutput, "\n")
+
+		const maxLines = 100
+
+		startLine := 0
+		if len(lines) > maxLines {
+			startLine = len(lines) - maxLines
 		}
-		return nil, fmt.Errorf("could not start headscale container: %w", err)
+
+		relevantOutput := strings.Join(lines[startLine:], "\n")
+
+		if buildErr != nil {
+			// The diagnostic build also failed - this is the real error
+			return nil, fmt.Errorf("could not start headscale container: %w\n\nDocker build failed. Last %d lines of output:\n%s", err, maxLines, relevantOutput)
+		}
+
+		if buildOutput != "" {
+			// Build succeeded on retry but container creation still failed
+			return nil, fmt.Errorf("could not start headscale container: %w\n\nDocker build succeeded on retry, but container creation failed. Last %d lines of build output:\n%s", err, maxLines, relevantOutput)
+		}
+
+		// No output at all - diagnostic build command may have failed
+		return nil, fmt.Errorf("could not start headscale container: %w\n\nUnable to get diagnostic build output (command may have failed silently)", err)
 	}
 	log.Printf("Created %s container\n", hsic.hostname)

--- a/integration/tsic/tsic.go
+++ b/integration/tsic/tsic.go
@@ -327,16 +327,52 @@ func New(
 		if err != nil {
 			// Try to get more detailed build output
 			log.Printf("Docker build failed for %s, attempting to get detailed output...", hostname)
-			buildOutput := dockertestutil.RunDockerBuildForDiagnostics(dockerContextPath, "Dockerfile.tailscale-HEAD")
-			if buildOutput != "" {
+
+			buildOutput, buildErr := dockertestutil.RunDockerBuildForDiagnostics(dockerContextPath, "Dockerfile.tailscale-HEAD")
+
+			// Show the last 100 lines of build output to avoid overwhelming the logs
+			lines := strings.Split(buildOutput, "\n")
+
+			const maxLines = 100
+
+			startLine := 0
+			if len(lines) > maxLines {
+				startLine = len(lines) - maxLines
+			}
+
+			relevantOutput := strings.Join(lines[startLine:], "\n")
+
+			if buildErr != nil {
+				// The diagnostic build also failed - this is the real error
 				return nil, fmt.Errorf(
-					"%s could not start tailscale container (version: %s): %w\n\nDetailed build output:\n%s",
+					"%s could not start tailscale container (version: %s): %w\n\nDocker build failed. Last %d lines of output:\n%s",
 					hostname,
 					version,
 					err,
-					buildOutput,
+					maxLines,
+					relevantOutput,
 				)
 			}
+
+			if buildOutput != "" {
+				// Build succeeded on retry but container creation still failed
+				return nil, fmt.Errorf(
+					"%s could not start tailscale container (version: %s): %w\n\nDocker build succeeded on retry, but container creation failed. Last %d lines of build output:\n%s",
+					hostname,
+					version,
+					err,
+					maxLines,
+					relevantOutput,
+				)
+			}
+
+			// No output at all - diagnostic build command may have failed
+			return nil, fmt.Errorf(
+				"%s could not start tailscale container (version: %s): %w\n\nUnable to get diagnostic build output (command may have failed silently)",
+				hostname,
+				version,
+				err,
+			)
 		}
 	case "unstable":
 		tailscaleOptions.Repository = "tailscale/tailscale"
@@ -580,7 +616,6 @@ func (t *TailscaleInContainer) Restart() error {
 		}
 		return struct{}{}, nil
 	}, backoff.WithBackOff(backoff.NewExponentialBackOff()), backoff.WithMaxElapsedTime(30*time.Second))
-
 	if err != nil {
 		return fmt.Errorf("timeout waiting for container %s to restart and become ready: %w", t.hostname, err)
 	}