Three bugs: - GITEA_URL defaulted to localhost:3000; Gitea NodePort is 32166 - Pod label app.kubernetes.io/name=postgresql-ha matched pgpool pod too; added component=postgresql to target only postgres nodes - Used bare 'kubectl' which is not on PATH; switched to 'k3s kubectl' Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
154 lines
6.0 KiB
Bash
Executable File
154 lines
6.0 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
# HA Failover Test — Decision D3
#
# Deliberately kills the primary PostgreSQL pod and asserts that:
#   1. Gitea remains accessible during failover
#   2. pgpool recovers to Running state
#   3. All postgresql-ha pods return to Running
#
# Must be run against a live cluster. Exits 0 on full pass.
# Run: bash tests/test_ha_failover.sh [GITEA_URL]
#
# GITEA_URL defaults to http://localhost:32166 — override for your ingress:
#   bash tests/test_ha_failover.sh https://git.example.com

# `-e` is not enabled, so a failing command does not abort the run; checks
# record their outcome through ok()/fail() instead.
set -uo pipefail

# Gitea endpoint probed with curl: first positional argument, otherwise the
# local port 32166.
GITEA_URL="${1:-http://localhost:32166}"

# The settings below may be overridden from the environment; when they are not
# set, the previous hard-coded values apply.
NAMESPACE="${NAMESPACE:-default}"
# Expanded unquoted at call sites, so it may hold a multi-word command.
KUBECTL="${KUBECTL:-k3s kubectl}"
FAILOVER_TIMEOUT="${FAILOVER_TIMEOUT:-60}"   # seconds to wait for repmgr promotion
RECOVERY_TIMEOUT="${RECOVERY_TIMEOUT:-120}"  # seconds to wait for all pods Running again

# Result tallies, incremented by ok() and fail().
PASS=0
FAIL=0

# Report a passed check on stdout and bump the PASS tally.
# Arguments: message words, joined with spaces.
# Always returns 0.
ok() {
  printf '[OK] %s\n' "$*"
  PASS=$((PASS + 1))
}
|
|
# Report a failed check on stdout and bump the FAIL tally.
# Arguments: message words, joined with spaces.
# Always returns 0; the caller decides whether to abort.
fail() {
  printf '[FAIL] %s\n' "$*"
  FAIL=$((FAIL + 1))
}
|
|
# Print an informational line on stdout; touches no tally.
# Arguments: message words, joined with spaces.
info() {
  printf '[INFO] %s\n' "$*"
}
|
|
|
|
# ── Pre-flight ────────────────────────────────────────────────────────────────
# Print the run parameters; the context falls back to 'default' when kubectl
# cannot report one.
info "Target cluster: $($KUBECTL config current-context 2>/dev/null || echo 'default')"
info "Gitea URL: ${GITEA_URL}"
info "Namespace: ${NAMESPACE}"
echo ""

# Confirm a postgresql node pod exists (component=postgresql excludes pgpool)
# and remember the first one listed as the pod to kill.
# NOTE(review): nothing here checks that this pod currently holds the primary
# role — it is simply item 0 of the listing. Confirm this is acceptable if
# earlier failovers may have moved the primary.
PRIMARY_POD=$($KUBECTL get pods -n "${NAMESPACE}" \
  -l app.kubernetes.io/name=postgresql-ha,app.kubernetes.io/component=postgresql \
  -o jsonpath='{.items[0].metadata.name}' 2>/dev/null)
if [[ -z "$PRIMARY_POD" ]]; then
  # Without a target pod there is nothing to test, so stop immediately.
  fail "No postgresql-ha postgresql pods found — is Gitea deployed?"
  exit 1
fi
info "PostgreSQL pod to kill: ${PRIMARY_POD}"

# ── Baseline: Gitea accessible before failover ────────────────────────────────
# A failed baseline aborts the run: the later checks say nothing about failover
# if Gitea was already unreachable.
info "Checking Gitea baseline..."
# curl prints 000 for %{http_code} itself when it received no response, so the
# fallback is applied only when curl produced no output at all. Echoing a
# second "000" on a non-zero exit would concatenate to e.g. "000000" or
# "200000".
HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 10 "${GITEA_URL}" 2>/dev/null || true)
HTTP_CODE="${HTTP_CODE:-000}"
# Exactly three digits starting with 2 or 3 (2xx/3xx) counts as accessible.
if [[ "$HTTP_CODE" =~ ^[23][0-9][0-9]$ ]]; then
  ok "Gitea accessible before failover (HTTP ${HTTP_CODE})"
else
  fail "Gitea not accessible before failover (HTTP ${HTTP_CODE}) — aborting test"
  exit 1
fi

# ── Trigger failover: kill primary pod ───────────────────────────────────────
info "Deleting primary pod ${PRIMARY_POD} to trigger failover..."
# If the delete is rejected, nothing was killed and every later check would
# pass against an untouched cluster — abort instead of reporting a false pass.
if ! $KUBECTL delete pod -n "${NAMESPACE}" "${PRIMARY_POD}" --grace-period=0; then
  fail "Could not delete pod ${PRIMARY_POD} — failover was not triggered"
  exit 1
fi
# Taken once the delete command has returned; the promotion wait is measured
# from this timestamp.
FAILOVER_START=$(date +%s)

# ── Wait for repmgr promotion ─────────────────────────────────────────────────
# Poll every 3s until at least one PostgreSQL node pod reports Running or
# FAILOVER_TIMEOUT elapses. The selector includes component=postgresql so the
# pgpool pod, which shares the postgresql-ha name label, cannot satisfy the
# check by itself.
# NOTE(review): a Running pod is only a proxy for promotion — nothing here asks
# repmgr which node is primary. Confirm whether a stronger check is wanted.
info "Waiting up to ${FAILOVER_TIMEOUT}s for a replica to be promoted..."
PROMOTED=false
while (( $(date +%s) - FAILOVER_START < FAILOVER_TIMEOUT )); do
  # grep -c prints the number of matching lines (0 when nothing matches or
  # kubectl printed nothing).
  RUNNING=$($KUBECTL get pods -n "${NAMESPACE}" \
    -l app.kubernetes.io/name=postgresql-ha,app.kubernetes.io/component=postgresql \
    2>/dev/null | grep -c " Running ")
  if [[ "$RUNNING" -ge 1 ]]; then
    PROMOTED=true
    ELAPSED=$(( $(date +%s) - FAILOVER_START ))
    info "Replica promoted in ${ELAPSED}s"
    break
  fi
  sleep 3
done

if [[ "$PROMOTED" == true ]]; then
  ok "PostgreSQL replica promoted within ${FAILOVER_TIMEOUT}s"
else
  fail "No replica promoted within ${FAILOVER_TIMEOUT}s"
fi

# ── Gitea accessible after failover ──────────────────────────────────────────
# Up to 10 probes, one second apart; each probe may itself take up to 5s
# (--max-time), so the loop is bounded by attempts rather than by wall time.
info "Checking Gitea accessibility after failover..."
GITEA_OK=false
for (( attempt = 1; attempt <= 10; attempt++ )); do
  # curl prints 000 for %{http_code} itself when it received no response, so
  # the fallback is applied only when curl produced no output at all.
  HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" --max-time 5 "${GITEA_URL}" 2>/dev/null || true)
  HTTP_CODE="${HTTP_CODE:-000}"
  # Exactly three digits starting with 2 or 3 (2xx/3xx) counts as accessible.
  if [[ "$HTTP_CODE" =~ ^[23][0-9][0-9]$ ]]; then
    GITEA_OK=true
    break
  fi
  sleep 1
done

if [[ "$GITEA_OK" == true ]]; then
  ok "Gitea accessible after failover (HTTP ${HTTP_CODE})"
else
  fail "Gitea not accessible after 10 attempts following failover (last HTTP ${HTTP_CODE})"
fi

# ── pgpool Running after failover ─────────────────────────────────────────────
# Up to 20 polls, 3s apart, of the STATUS column (third field) of the first
# pod carrying the pgpool component label.
info "Checking pgpool state..."
PGPOOL_OK=false
for (( attempt = 1; attempt <= 20; attempt++ )); do
  # --no-headers drops the NAME/READY/STATUS header row, so line 1 is a pod.
  PGPOOL_STATE=$($KUBECTL get pods -n "${NAMESPACE}" -l app.kubernetes.io/component=pgpool \
    --no-headers 2>/dev/null | awk 'NR == 1 { print $3 }')
  if [[ "$PGPOOL_STATE" == "Running" ]]; then
    PGPOOL_OK=true
    break
  fi
  sleep 3
done

if [[ "$PGPOOL_OK" == true ]]; then
  ok "pgpool pod Running after failover"
else
  # An empty state means no pgpool pod was listed at all.
  fail "pgpool not Running after failover (state: ${PGPOOL_STATE:-not found}) — missing pgpool-password?"
fi

# ── All postgresql-ha pods recover ───────────────────────────────────────────
# Poll every 5s until every pod carrying the postgresql-ha name label shows
# STATUS Running, or RECOVERY_TIMEOUT elapses.
# NOTE(review): TOTAL is whatever the listing shows at that moment; if the
# killed pod's replacement has not been created yet, the counts could match
# early. Confirm whether an expected pod count should be enforced.
info "Waiting up to ${RECOVERY_TIMEOUT}s for all postgresql-ha pods to return to Running..."
ALL_OK=false
RECOVERY_START=$(date +%s)
while (( $(date +%s) - RECOVERY_START < RECOVERY_TIMEOUT )); do
  # One listing per iteration so both counts describe the same snapshot.
  POD_LIST=$($KUBECTL get pods -n "${NAMESPACE}" -l app.kubernetes.io/name=postgresql-ha \
    --no-headers 2>/dev/null)
  # `n + 0` makes awk print 0 instead of an empty string when nothing matched.
  TOTAL=$(awk 'NF { n++ } END { print n + 0 }' <<<"$POD_LIST")
  RUNNING=$(awk '$3 == "Running" { n++ } END { print n + 0 }' <<<"$POD_LIST")
  if [[ "$TOTAL" -gt 0 && "$TOTAL" -eq "$RUNNING" ]]; then
    ALL_OK=true
    ELAPSED=$(( $(date +%s) - RECOVERY_START ))
    info "All ${TOTAL} postgresql-ha pods Running after ${ELAPSED}s"
    break
  fi
  sleep 5
done

if [[ "$ALL_OK" == true ]]; then
  ok "All postgresql-ha pods recovered to Running"
else
  fail "Not all postgresql-ha pods recovered within ${RECOVERY_TIMEOUT}s"
  # Show the current pod states to help diagnose; best effort only.
  $KUBECTL get pods -n "${NAMESPACE}" -l app.kubernetes.io/name=postgresql-ha 2>/dev/null || true
fi

# ── Summary ───────────────────────────────────────────────────────────────────
# Print the tallies and exit 1 if any check failed, 0 otherwise.
echo ""
echo "Results: ${PASS} passed, ${FAIL} failed"
echo ""
if [[ "$FAIL" -gt 0 ]]; then
  echo "FAILOVER TEST FAILED — review output above"
  exit 1
else
  echo "FAILOVER TEST PASSED — cluster is HA-verified (D3 satisfied)"
  exit 0
fi