#!/usr/bin/env bash
# HA Failover Test — Decision D3
#
# Deliberately kills a PostgreSQL pod and asserts that:
#   1. Gitea remains accessible during failover
#   2. pgpool recovers to Running state
#   3. All postgresql-ha pods return to Running
#
# Must be run against a live cluster. Exits 0 on full pass, 1 otherwise.
# Run:  bash tests/test_ha_failover.sh [GITEA_URL]
#
# GITEA_URL defaults to http://localhost:32166 — override for your ingress:
#   bash tests/test_ha_failover.sh https://git.example.com

# No `-e` on purpose: individual checks are expected to fail and are tallied
# via ok()/fail() instead of aborting the run.
set -uo pipefail

GITEA_URL="${1:-http://localhost:32166}"
NAMESPACE="default"
# Array (not a string) so the two-word command survives quoting.
KUBECTL=(k3s kubectl)
FAILOVER_TIMEOUT=60    # seconds to wait for repmgr promotion
RECOVERY_TIMEOUT=120   # seconds to wait for all pods Running again

# Label selectors: every pod of the chart, and only the PostgreSQL nodes
# (component=postgresql excludes pgpool).
readonly HA_SELECTOR="app.kubernetes.io/name=postgresql-ha"
readonly PG_SELECTOR="${HA_SELECTOR},app.kubernetes.io/component=postgresql"
readonly PGPOOL_SELECTOR="app.kubernetes.io/component=pgpool"

PASS=0
FAIL=0

# Result helpers: print a tagged line and bump the matching counter.
# `|| true` because (( x++ )) returns 1 when the pre-increment value is 0.
ok()   { echo "[OK] $*"; ((PASS++)) || true; }
fail() { echo "[FAIL] $*"; ((FAIL++)) || true; }
info() { echo "[INFO] $*"; }

#######################################
# Count pods matching a label selector, optionally only those whose STATUS
# column (3rd column of `kubectl get pods`) equals a given value.
# Globals:   KUBECTL, NAMESPACE (read)
# Arguments: $1 - label selector
#            $2 - (optional) required STATUS, e.g. Running
# Outputs:   the count on stdout (0 when kubectl fails or nothing matches)
#######################################
count_pods() {
  local selector=$1
  local status=${2:-}
  "${KUBECTL[@]}" get pods -n "${NAMESPACE}" -l "${selector}" --no-headers 2>/dev/null \
    | awk -v want="${status}" 'want == "" || $3 == want { n++ } END { print n + 0 }'
}

#######################################
# Fetch the HTTP status code of GITEA_URL.
# Globals:   GITEA_URL (read)
# Arguments: $1 - curl --max-time value in seconds
# Outputs:   the status code on stdout; "000" when curl produced nothing
#######################################
http_status() {
  local timeout=$1
  local code
  # curl itself prints 000 on connection failure AND exits non-zero, so a
  # trailing `|| echo 000` would yield "000000". Capture first, default after.
  code=$(curl -s -o /dev/null -w '%{http_code}' --max-time "${timeout}" \
    "${GITEA_URL}" 2>/dev/null) || true
  printf '%s\n' "${code:-000}"
}

# ── Pre-flight ────────────────────────────────────────────────────────────────
info "Target cluster: $("${KUBECTL[@]}" config current-context 2>/dev/null || echo 'default')"
info "Gitea URL: ${GITEA_URL}"
info "Namespace: ${NAMESPACE}"
echo ""

# Confirm a postgresql node pod exists (component=postgresql excludes pgpool).
# NOTE(review): this takes the first listed pod; nothing here verifies that it
# is actually the current repmgr primary — confirm against the chart's ordering.
PRIMARY_POD=$("${KUBECTL[@]}" get pods -n "${NAMESPACE}" \
  -l "${PG_SELECTOR}" \
  -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)

if [[ -z "${PRIMARY_POD}" ]]; then
  fail "No postgresql-ha postgresql pods found — is Gitea deployed?"
  exit 1
fi
info "PostgreSQL pod to kill: ${PRIMARY_POD}"

# Remember how many pods the release has BEFORE the kill so the recovery check
# cannot pass in the window where the deleted pod has not been recreated yet.
EXPECTED_PODS=$(count_pods "${HA_SELECTOR}")

# ── Baseline: Gitea accessible before failover ────────────────────────────────
info "Checking Gitea baseline..."
HTTP_CODE=$(http_status 10)
if [[ "${HTTP_CODE}" =~ ^[23] ]]; then
  ok "Gitea accessible before failover (HTTP ${HTTP_CODE})"
else
  fail "Gitea not accessible before failover (HTTP ${HTTP_CODE}) — aborting test"
  exit 1
fi

# ── Trigger failover: kill primary pod ───────────────────────────────────────
info "Deleting primary pod ${PRIMARY_POD} to trigger failover..."
# If the delete fails nothing was disturbed, and every later check would pass
# vacuously — treat it as a hard failure.
if ! "${KUBECTL[@]}" delete pod -n "${NAMESPACE}" "${PRIMARY_POD}" --grace-period=0; then
  fail "Could not delete pod ${PRIMARY_POD} — aborting test"
  exit 1
fi
FAILOVER_START=$(date +%s)

# ── Wait for repmgr promotion ─────────────────────────────────────────────────
info "Waiting up to ${FAILOVER_TIMEOUT}s for a replica to be promoted..."
PROMOTED=false
while (( $(date +%s) - FAILOVER_START < FAILOVER_TIMEOUT )); do
  # Only PostgreSQL node pods count here; a Running pgpool pod says nothing
  # about the database nodes.
  # NOTE(review): "Running" is a proxy — repmgr's promotion state is not
  # queried directly.
  RUNNING=$(count_pods "${PG_SELECTOR}" Running)
  if (( RUNNING >= 1 )); then
    PROMOTED=true
    ELAPSED=$(( $(date +%s) - FAILOVER_START ))
    info "Replica promoted in ${ELAPSED}s"
    break
  fi
  sleep 3
done

if [[ "${PROMOTED}" == true ]]; then
  ok "PostgreSQL replica promoted within ${FAILOVER_TIMEOUT}s"
else
  fail "No replica promoted within ${FAILOVER_TIMEOUT}s"
fi

# ── Gitea accessible after failover ──────────────────────────────────────────
info "Checking Gitea accessibility after failover..."
GITEA_OK=false
# Up to 10 attempts, each bounded by a 5s curl timeout plus a 1s pause.
for (( attempt = 1; attempt <= 10; attempt++ )); do
  HTTP_CODE=$(http_status 5)
  if [[ "${HTTP_CODE}" =~ ^[23] ]]; then
    GITEA_OK=true
    break
  fi
  sleep 1
done

if [[ "${GITEA_OK}" == true ]]; then
  ok "Gitea accessible after failover (HTTP ${HTTP_CODE})"
else
  fail "Gitea not accessible within 10s of failover (last HTTP ${HTTP_CODE})"
fi

# ── pgpool Running after failover ─────────────────────────────────────────────
info "Checking pgpool state..."
PGPOOL_OK=false
PGPOOL_STATE=""
for (( attempt = 1; attempt <= 20; attempt++ )); do
  # STATUS column of the first pgpool pod listed.
  PGPOOL_STATE=$("${KUBECTL[@]}" get pods -n "${NAMESPACE}" \
    -l "${PGPOOL_SELECTOR}" --no-headers 2>/dev/null \
    | awk 'NR == 1 { print $3 }')
  if [[ "${PGPOOL_STATE}" == "Running" ]]; then
    PGPOOL_OK=true
    break
  fi
  sleep 3
done

if [[ "${PGPOOL_OK}" == true ]]; then
  ok "pgpool pod Running after failover"
else
  fail "pgpool not Running after failover (state: ${PGPOOL_STATE:-not found}) — missing pgpool-password?"
fi

# ── All postgresql-ha pods recover ───────────────────────────────────────────
info "Waiting up to ${RECOVERY_TIMEOUT}s for all postgresql-ha pods to return to Running..."
ALL_OK=false
RECOVERY_START=$(date +%s)
while (( $(date +%s) - RECOVERY_START < RECOVERY_TIMEOUT )); do
  TOTAL=$(count_pods "${HA_SELECTOR}")
  RUNNING=$(count_pods "${HA_SELECTOR}" Running)
  # Require the pre-kill pod count as well: right after the delete, the
  # survivors alone satisfy TOTAL == RUNNING.
  if (( TOTAL > 0 && TOTAL >= EXPECTED_PODS && TOTAL == RUNNING )); then
    ALL_OK=true
    ELAPSED=$(( $(date +%s) - RECOVERY_START ))
    info "All ${TOTAL} postgresql-ha pods Running after ${ELAPSED}s"
    break
  fi
  sleep 5
done

if [[ "${ALL_OK}" == true ]]; then
  ok "All postgresql-ha pods recovered to Running"
else
  fail "Not all postgresql-ha pods recovered within ${RECOVERY_TIMEOUT}s"
  # Best-effort diagnostic dump; its own failure must not mask the result.
  "${KUBECTL[@]}" get pods -n "${NAMESPACE}" -l "${HA_SELECTOR}" 2>/dev/null || true
fi

# ── Summary ───────────────────────────────────────────────────────────────────
echo ""
echo "Results: ${PASS} passed, ${FAIL} failed"
echo ""

if (( FAIL > 0 )); then
  echo "FAILOVER TEST FAILED — review output above"
  exit 1
else
  echo "FAILOVER TEST PASSED — cluster is HA-verified (D3 satisfied)"
  exit 0
fi