Add sufficient deadlines and countermeasures to handle hung node scenario (#19688)

Signed-off-by: Shubhendu Ram Tripathi <shubhendu@minio.io>
Signed-off-by: Harshavardhana <harsha@minio.io>
This commit is contained in:
Shubhendu
2024-05-23 04:37:14 +05:30
committed by GitHub
parent ca80eced24
commit 7c7650b7c3
34 changed files with 292 additions and 133 deletions

View File

@@ -4,7 +4,6 @@ on:
pull_request:
branches:
- master
- next
# This ensures that previous jobs for the PR are canceled when the PR is
# updated.

View File

@@ -3,8 +3,7 @@ name: FIPS Build Test
on:
pull_request:
branches:
- master
- next
- master
# This ensures that previous jobs for the PR are canceled when the PR is
# updated.

View File

@@ -3,8 +3,7 @@ name: Healing Functional Tests
on:
pull_request:
branches:
- master
- next
- master
# This ensures that previous jobs for the PR are canceled when the PR is
# updated.

View File

@@ -3,12 +3,11 @@ name: Linters and Tests
on:
pull_request:
branches:
- master
- next
- master
# This ensures that previous jobs for the PR are canceled when the PR is
# updated.
concurrency:
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref }}
cancel-in-progress: true
@@ -35,9 +34,10 @@ jobs:
CGO_ENABLED: 0
GO111MODULE: on
run: |
Set-MpPreference -DisableRealtimeMonitoring $true
netsh int ipv4 set dynamicport tcp start=60000 num=61000
go build --ldflags="-s -w" -o %GOPATH%\bin\minio.exe
go test -v --timeout 50m ./...
go test -v --timeout 120m ./...
- name: Build on ${{ matrix.os }}
if: matrix.os == 'ubuntu-latest'
env:

View File

@@ -3,8 +3,7 @@ name: Functional Tests
on:
pull_request:
branches:
- master
- next
- master
# This ensures that previous jobs for the PR are canceled when the PR is
# updated.

View File

@@ -3,8 +3,7 @@ name: Helm Chart linting
on:
pull_request:
branches:
- master
- next
- master
# This ensures that previous jobs for the PR are canceled when the PR is
# updated.

View File

@@ -4,7 +4,6 @@ on:
pull_request:
branches:
- master
- next
# This ensures that previous jobs for the PR are canceled when the PR is
# updated.

View File

@@ -4,7 +4,6 @@ on:
pull_request:
branches:
- master
- next
# This ensures that previous jobs for the PR are canceled when the PR is
# updated.
@@ -56,6 +55,10 @@ jobs:
run: |
${GITHUB_WORKSPACE}/.github/workflows/run-mint.sh "erasure" "minio" "minio123" "${{ steps.vars.outputs.sha_short }}"
- name: resiliency
run: |
${GITHUB_WORKSPACE}/.github/workflows/run-mint.sh "resiliency" "minio" "minio123" "${{ steps.vars.outputs.sha_short }}"
- name: The job must cleanup
if: ${{ always() }}
run: |

View File

@@ -0,0 +1,78 @@
version: '3.7'
# Settings and configurations that are common for all containers
x-minio-common: &minio-common
image: quay.io/minio/minio:${JOB_NAME}
command: server --console-address ":9001" http://minio{1...4}/rdata{1...2}
expose:
- "9000"
- "9001"
environment:
MINIO_CI_CD: "on"
MINIO_ROOT_USER: "minio"
MINIO_ROOT_PASSWORD: "minio123"
MINIO_KMS_SECRET_KEY: "my-minio-key:OSMM+vkKUTCvQs9YL/CVMIMt43HFhkUpqJxTmGl6rYw="
MINIO_DRIVE_MAX_TIMEOUT: "5s"
healthcheck:
test: ["CMD", "mc", "ready", "local"]
interval: 5s
timeout: 5s
retries: 5
# starts 4 docker containers running minio server instances.
# using nginx reverse proxy, load balancing, you can access
# it through port 9000.
services:
minio1:
<<: *minio-common
hostname: minio1
volumes:
- rdata1-1:/rdata1
- rdata1-2:/rdata2
minio2:
<<: *minio-common
hostname: minio2
volumes:
- rdata2-1:/rdata1
- rdata2-2:/rdata2
minio3:
<<: *minio-common
hostname: minio3
volumes:
- rdata3-1:/rdata1
- rdata3-2:/rdata2
minio4:
<<: *minio-common
hostname: minio4
volumes:
- rdata4-1:/rdata1
- rdata4-2:/rdata2
nginx:
image: nginx:1.19.2-alpine
hostname: nginx
volumes:
- ./nginx-4-node.conf:/etc/nginx/nginx.conf:ro
ports:
- "9000:9000"
- "9001:9001"
depends_on:
- minio1
- minio2
- minio3
- minio4
## By default this config uses default local driver,
## For custom volumes replace with volume driver configuration.
volumes:
rdata1-1:
rdata1-2:
rdata2-1:
rdata2-2:
rdata3-1:
rdata3-2:
rdata4-1:
rdata4-2:

View File

@@ -3,8 +3,7 @@ name: MinIO advanced tests
on:
pull_request:
branches:
- master
- next
- master
# This ensures that previous jobs for the PR are canceled when the PR is
# updated.

View File

@@ -3,8 +3,7 @@ name: Root lockdown tests
on:
pull_request:
branches:
- master
- next
- master
# This ensures that previous jobs for the PR are canceled when the PR is
# updated.

View File

@@ -16,7 +16,7 @@ docker volume rm $(docker volume ls -f dangling=true) || true
cd .github/workflows/mint
docker-compose -f minio-${MODE}.yaml up -d
sleep 30s
sleep 1m
docker system prune -f || true
docker volume prune -f || true
@@ -26,6 +26,9 @@ docker volume rm $(docker volume ls -q -f dangling=true) || true
[ "${MODE}" == "pools" ] && docker-compose -f minio-${MODE}.yaml stop minio2
[ "${MODE}" == "pools" ] && docker-compose -f minio-${MODE}.yaml stop minio6
# Pause one node, to check that all S3 calls work while one node goes wrong
[ "${MODE}" == "resiliency" ] && docker-compose -f minio-${MODE}.yaml pause minio4
docker run --rm --net=mint_default \
--name="mint-${MODE}-${JOB_NAME}" \
-e SERVER_ENDPOINT="nginx:9000" \
@@ -35,6 +38,18 @@ docker run --rm --net=mint_default \
-e MINT_MODE="${MINT_MODE}" \
docker.io/minio/mint:edge
# FIXME: enable this after fixing aws-sdk-java-v2 tests
# # unpause the node, to check that all S3 calls work while one node goes wrong
# [ "${MODE}" == "resiliency" ] && docker-compose -f minio-${MODE}.yaml unpause minio4
# [ "${MODE}" == "resiliency" ] && docker run --rm --net=mint_default \
# --name="mint-${MODE}-${JOB_NAME}" \
# -e SERVER_ENDPOINT="nginx:9000" \
# -e ACCESS_KEY="${ACCESS_KEY}" \
# -e SECRET_KEY="${SECRET_KEY}" \
# -e ENABLE_HTTPS=0 \
# -e MINT_MODE="${MINT_MODE}" \
# docker.io/minio/mint:edge
docker-compose -f minio-${MODE}.yaml down || true
sleep 10s

View File

@@ -3,8 +3,7 @@ name: Shell formatting checks
on:
pull_request:
branches:
- master
- next
- master
permissions:
contents: read

View File

@@ -3,8 +3,7 @@ name: Upgrade old version tests
on:
pull_request:
branches:
- master
- next
- master
# This ensures that previous jobs for the PR are canceled when the PR is
# updated.