cmd/hi: improve test cleanup to reduce CI disk usage (#2881)

2025-11-29 05:18:48 -05:00 · 2025-11-28 16:59:54 +01:00
parent db293e0698
commit ed78bf4b98
7 changed files with 230 additions and 17 deletions
--- a/cmd/hi/README.md
+++ b/cmd/hi/README.md
@@ -0,0 +1,6 @@
 # hi
 hi (headscale integration runner) is an entirely "vibe coded" wrapper around our
 [integration test suite](../integration). It essentially runs the docker
 commands for you with some added benefits of extracting resources like logs and
 databases.
--- a/cmd/hi/cleanup.go
+++ b/cmd/hi/cleanup.go
@@ -3,6 +3,9 @@ package main
 import (
 	"context"
 	"fmt"
 	"log"
 	"os"
 	"path/filepath"
 	"strings"
 	"time"
@@ -205,3 +208,110 @@ func cleanCacheVolume(ctx context.Context) error {
 	return nil
 }
 // cleanupSuccessfulTestArtifacts removes artifacts from successful test runs to save disk space.
 // Only the top level of logsDir is inspected. The following entries are removed because
 // they are mainly useful for debugging failures:
 // - Database dumps (hs-*/ts-* files ending in .db)
 // - Metrics files (hs-*/ts-* files ending in _metrics.txt)
 // - Status snapshots (hs-*/ts-* files ending in _status.json)
 // - Profile data (the pprof directory)
 // - MapResponse data (the mapresponses directory)
 //
 // Everything else is preserved, including log files (.log), which are small and
 // useful for verification.
 //
 // Removal is best-effort: a failure to delete an individual entry is only logged
 // (when verbose is set) and never returned. The only error returned is a failure
 // to read logsDir itself. The summary line counts only entries that were actually
 // removed, and only their bytes are included in the "freed" figure.
 func cleanupSuccessfulTestArtifacts(logsDir string, verbose bool) error {
 	entries, err := os.ReadDir(logsDir)
 	if err != nil {
 		return fmt.Errorf("failed to read logs directory: %w", err)
 	}
 	var (
 		removedFiles, removedDirs int
 		totalSize                 int64
 	)
 	for _, entry := range entries {
 		name := entry.Name()
 		fullPath := filepath.Join(logsDir, name)
 		if entry.IsDir() {
 			// Remove pprof and mapresponses directories (typically large).
 			// These directories contain artifacts from all containers in the test run.
 			if name != "pprof" && name != "mapresponses" {
 				continue
 			}
 			// Measure before deleting, but only credit the bytes once the
 			// removal has succeeded so the summary never over-reports.
 			size, sizeErr := getDirSize(fullPath)
 			if err := os.RemoveAll(fullPath); err != nil {
 				if verbose {
 					log.Printf("Warning: failed to remove directory %s: %v", name, err)
 				}
 				continue
 			}
 			removedDirs++
 			if sizeErr == nil {
 				totalSize += size
 			}
 			if verbose {
 				log.Printf("Removed directory: %s/", name)
 			}
 			continue
 		}
 		// Only process test-related files (headscale and tailscale).
 		if !strings.HasPrefix(name, "hs-") && !strings.HasPrefix(name, "ts-") {
 			continue
 		}
 		// Remove database, metrics, and status files, but keep logs.
 		shouldRemove := strings.HasSuffix(name, ".db") ||
 			strings.HasSuffix(name, "_metrics.txt") ||
 			strings.HasSuffix(name, "_status.json")
 		if !shouldRemove {
 			continue
 		}
 		// The size must be read before the file disappears; it is added to the
 		// total only after os.Remove reports success.
 		info, infoErr := entry.Info()
 		if err := os.Remove(fullPath); err != nil {
 			if verbose {
 				log.Printf("Warning: failed to remove file %s: %v", name, err)
 			}
 			continue
 		}
 		removedFiles++
 		if infoErr == nil {
 			totalSize += info.Size()
 		}
 		if verbose {
 			log.Printf("Removed file: %s", name)
 		}
 	}
 	if removedFiles > 0 || removedDirs > 0 {
 		const bytesPerMB = 1024 * 1024
 		log.Printf("Cleaned up %d files and %d directories (freed ~%.2f MB)",
 			removedFiles, removedDirs, float64(totalSize)/bytesPerMB)
 	}
 	return nil
 }
 // getDirSize walks the tree rooted at path and returns the summed size, in bytes,
 // of every entry that is not a directory. If the walk reports an error, that error
 // is returned together with whatever total had been accumulated up to that point.
 func getDirSize(path string) (int64, error) {
 	var total int64
 	// accumulate is invoked once per visited entry; it aborts the walk on the
 	// first error and otherwise adds the sizes of non-directory entries.
 	accumulate := func(_ string, fi os.FileInfo, visitErr error) error {
 		switch {
 		case visitErr != nil:
 			return visitErr
 		case fi.IsDir():
 			return nil
 		}
 		total += fi.Size()
 		return nil
 	}
 	walkErr := filepath.Walk(path, accumulate)
 	return total, walkErr
 }
--- a/cmd/hi/docker.go
+++ b/cmd/hi/docker.go
@@ -154,6 +154,19 @@ func runTestContainer(ctx context.Context, config *RunConfig) error {
 		if cleanErr := cleanupAfterTest(ctx, cli, resp.ID); cleanErr != nil && config.Verbose {
 			log.Printf("Warning: post-test cleanup failed: %v", cleanErr)
 		}
 		// Clean up artifacts from successful tests to save disk space in CI
 		if exitCode == 0 {
 			if config.Verbose {
 				log.Printf("Test succeeded, cleaning up artifacts to save disk space...")
 			}
 			cleanErr := cleanupSuccessfulTestArtifacts(logsDir, config.Verbose)
 			if cleanErr != nil && config.Verbose {
 				log.Printf("Warning: artifact cleanup failed: %v", cleanErr)
 			}
 		}
 	}
 	if err != nil {
--- a/integration/dockertestutil/build.go
+++ b/integration/dockertestutil/build.go
@@ -1,17 +1,25 @@
 package dockertestutil
 import (
 	"context"
 	"os/exec"
 	"time"
 )
 // RunDockerBuildForDiagnostics runs docker build manually to get detailed error output.
 // This is used when a docker build fails to provide more detailed diagnostic information
 // than what dockertest typically provides.
-func RunDockerBuildForDiagnostics(contextDir, dockerfile string) string {
+//
-	cmd := exec.Command("docker", "build", "-f", dockerfile, contextDir)
+// Returns the build output regardless of success/failure, and an error if the build failed.
 func RunDockerBuildForDiagnostics(contextDir, dockerfile string) (string, error) {
 	// Use a context with timeout to prevent hanging builds
 	const buildTimeout = 10 * time.Minute
 	ctx, cancel := context.WithTimeout(context.Background(), buildTimeout)
 	defer cancel()
 	cmd := exec.CommandContext(ctx, "docker", "build", "--progress=plain", "--no-cache", "-f", dockerfile, contextDir)
 	output, err := cmd.CombinedOutput()
-	if err != nil {
+
-		return string(output)
+	return string(output), err
 	}
 	return ""
 }
--- a/integration/dockertestutil/network.go
+++ b/integration/dockertestutil/network.go
@@ -108,6 +108,8 @@ func CleanUnreferencedNetworks(pool *dockertest.Pool) error {
 }
 // CleanImagesInCI removes images if running in CI.
 // It only removes dangling (untagged) images to avoid forcing rebuilds.
 // Tagged images (golang:*, tailscale/tailscale:*, etc.) are automatically preserved.
 func CleanImagesInCI(pool *dockertest.Pool) error {
 	if !util.IsCI() {
 		log.Println("Skipping image cleanup outside of CI")
@@ -119,9 +121,26 @@ func CleanImagesInCI(pool *dockertest.Pool) error {
 		return fmt.Errorf("getting images: %w", err)
 	}
 	removedCount := 0
 	for _, image := range images {
-		log.Printf("removing image: %s, %v", image.ID, image.RepoTags)
+		// Only remove dangling (untagged) images to avoid forcing rebuilds
-		_ = pool.Client.RemoveImage(image.ID)
+		// Dangling images have no RepoTags or only have "<none>:<none>"
 		if len(image.RepoTags) == 0 || (len(image.RepoTags) == 1 && image.RepoTags[0] == "<none>:<none>") {
 			log.Printf("Removing dangling image: %s", image.ID[:12])
 			err := pool.Client.RemoveImage(image.ID)
 			if err != nil {
 				log.Printf("Warning: failed to remove image %s: %v", image.ID[:12], err)
 			} else {
 				removedCount++
 			}
 		}
 	}
 	if removedCount > 0 {
 		log.Printf("Removed %d dangling images in CI", removedCount)
 	} else {
 		log.Println("No dangling images to remove in CI")
 	}
 	return nil
--- a/integration/hsic/hsic.go
+++ b/integration/hsic/hsic.go
@@ -462,11 +462,33 @@ func New(
 	if err != nil {
 		// Try to get more detailed build output
 		log.Printf("Docker build failed, attempting to get detailed output...")
-		buildOutput := dockertestutil.RunDockerBuildForDiagnostics(dockerContextPath, IntegrationTestDockerFileName)
+
-		if buildOutput != "" {
+		buildOutput, buildErr := dockertestutil.RunDockerBuildForDiagnostics(dockerContextPath, IntegrationTestDockerFileName)
-			return nil, fmt.Errorf("could not start headscale container: %w\n\nDetailed build output:\n%s", err, buildOutput)
+
 		// Show the last 100 lines of build output to avoid overwhelming the logs
 		lines := strings.Split(buildOutput, "\n")
 		const maxLines = 100
 		startLine := 0
 		if len(lines) > maxLines {
 			startLine = len(lines) - maxLines
 		}
-		return nil, fmt.Errorf("could not start headscale container: %w", err)
+
 		relevantOutput := strings.Join(lines[startLine:], "\n")
 		if buildErr != nil {
 			// The diagnostic build also failed - this is the real error
 			return nil, fmt.Errorf("could not start headscale container: %w\n\nDocker build failed. Last %d lines of output:\n%s", err, maxLines, relevantOutput)
 		}
 		if buildOutput != "" {
 			// Build succeeded on retry but container creation still failed
 			return nil, fmt.Errorf("could not start headscale container: %w\n\nDocker build succeeded on retry, but container creation failed. Last %d lines of build output:\n%s", err, maxLines, relevantOutput)
 		}
 		// No output at all - diagnostic build command may have failed
 		return nil, fmt.Errorf("could not start headscale container: %w\n\nUnable to get diagnostic build output (command may have failed silently)", err)
 	}
 	log.Printf("Created %s container\n", hsic.hostname)
--- a/integration/tsic/tsic.go
+++ b/integration/tsic/tsic.go
@@ -327,16 +327,52 @@ func New(
 		if err != nil {
 			// Try to get more detailed build output
 			log.Printf("Docker build failed for %s, attempting to get detailed output...", hostname)
-			buildOutput := dockertestutil.RunDockerBuildForDiagnostics(dockerContextPath, "Dockerfile.tailscale-HEAD")
+
-			if buildOutput != "" {
+			buildOutput, buildErr := dockertestutil.RunDockerBuildForDiagnostics(dockerContextPath, "Dockerfile.tailscale-HEAD")
 			// Show the last 100 lines of build output to avoid overwhelming the logs
 			lines := strings.Split(buildOutput, "\n")
 			const maxLines = 100
 			startLine := 0
 			if len(lines) > maxLines {
 				startLine = len(lines) - maxLines
 			}
 			relevantOutput := strings.Join(lines[startLine:], "\n")
 			if buildErr != nil {
 				// The diagnostic build also failed - this is the real error
 				return nil, fmt.Errorf(
-					"%s could not start tailscale container (version: %s): %w\n\nDetailed build output:\n%s",
+					"%s could not start tailscale container (version: %s): %w\n\nDocker build failed. Last %d lines of output:\n%s",
 					hostname,
 					version,
 					err,
-					buildOutput,
+					maxLines,
 					relevantOutput,
 				)
 			}
 			if buildOutput != "" {
 				// Build succeeded on retry but container creation still failed
 				return nil, fmt.Errorf(
 					"%s could not start tailscale container (version: %s): %w\n\nDocker build succeeded on retry, but container creation failed. Last %d lines of build output:\n%s",
 					hostname,
 					version,
 					err,
 					maxLines,
 					relevantOutput,
 				)
 			}
 			// No output at all - diagnostic build command may have failed
 			return nil, fmt.Errorf(
 				"%s could not start tailscale container (version: %s): %w\n\nUnable to get diagnostic build output (command may have failed silently)",
 				hostname,
 				version,
 				err,
 			)
 		}
 	case "unstable":
 		tailscaleOptions.Repository = "tailscale/tailscale"
@@ -580,7 +616,6 @@ func (t *TailscaleInContainer) Restart() error {
 		}
 		return struct{}{}, nil
 	}, backoff.WithBackOff(backoff.NewExponentialBackOff()), backoff.WithMaxElapsedTime(30*time.Second))
 	if err != nil {
 		return fmt.Errorf("timeout waiting for container %s to restart and become ready: %w", t.hostname, err)
 	}