From 0fbe392499cbdc9adb7668673838bc6d5d7134ef Mon Sep 17 00:00:00 2001 From: Kristoffer Dalby Date: Wed, 16 Apr 2025 12:42:26 +0200 Subject: [PATCH] more wait, more retry (#2532) --- .github/workflows/test-integration-policyv2.yaml | 12 +++++++++++- .github/workflows/test-integration.yaml | 12 +++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-integration-policyv2.yaml b/.github/workflows/test-integration-policyv2.yaml index a05873a4..c334a5a7 100644 --- a/.github/workflows/test-integration-policyv2.yaml +++ b/.github/workflows/test-integration-policyv2.yaml @@ -127,7 +127,17 @@ jobs: env: USE_POSTGRES: ${{ matrix.database == 'postgres' && '1' || '0' }} with: - attempt_limit: 5 + # Our integration tests are started like a thundering herd, often + # hitting limits of the various external repositories we depend on + # like Docker Hub. This will retry jobs every 5 min, 10 times, + # hopefully letting us avoid manual intervention and restarting jobs. + # One could of course argue that we should invest in trying to avoid + # this, but currently it seems like a larger investment to be cleverer + # about this. + # Some of the jobs might still require manual restart as they are really + # slow and this will cause them to eventually be killed by GitHub Actions. + attempt_delay: 300000 # 5 min + attempt_limit: 10 command: | nix develop --command -- docker run \ --tty --rm \ diff --git a/.github/workflows/test-integration.yaml b/.github/workflows/test-integration.yaml index e74fbc23..ba2a4e2e 100644 --- a/.github/workflows/test-integration.yaml +++ b/.github/workflows/test-integration.yaml @@ -127,7 +127,17 @@ jobs: env: USE_POSTGRES: ${{ matrix.database == 'postgres' && '1' || '0' }} with: - attempt_limit: 5 + # Our integration tests are started like a thundering herd, often + # hitting limits of the various external repositories we depend on + # like Docker Hub. 
This will retry jobs every 5 min, 10 times, + # hopefully letting us avoid manual intervention and restarting jobs. + # One could of course argue that we should invest in trying to avoid + # this, but currently it seems like a larger investment to be cleverer + # about this. + # Some of the jobs might still require manual restart as they are really + # slow and this will cause them to eventually be killed by GitHub Actions. + attempt_delay: 300000 # 5 min + attempt_limit: 10 command: | nix develop --command -- docker run \ --tty --rm \