Files
railiance-cluster/tools/cmd/railiance-admin-sync-smoke
2026-07-02 10:44:06 +02:00

156 lines
6.7 KiB
Bash
Executable File

#!/usr/bin/env bash
# Smoke check: POST /admin/sync must succeed without the activity-core worker
# pod being replaced or restarted.
#
# Optional environment overrides:
#   ACTIVITY_CORE_NAMESPACE                   namespace passed to kubectl -n
#   ACTIVITY_CORE_CLUSTER_HOST                ssh target; "local" selects local kubectl
#   ACTIVITY_CORE_ALLOW_LOCAL_KUBECTL         must be 1 when the host is "local"
#   ACTIVITY_CORE_ADMIN_SYNC_FIXTURE_COMMAND  shell snippet run before the sync call
#   ACTIVITY_CORE_ADMIN_SYNC_REQUIRE_FIXTURE  1 = abort when no fixture command is set
#   STATE_HUB_URL                             base URL that receives the evidence note
#   STATE_HUB_EVIDENCE_WORKSTREAM_ID          workstream id attached to the note
#   STATE_HUB_EVIDENCE_TASK_ID                task id attached to the note
set -euo pipefail

# Settings that the embedded Python reads from the environment. An unset or
# empty override falls back to the default shown here.
export NAMESPACE="${ACTIVITY_CORE_NAMESPACE:-activity-core}"
export CLUSTER_HOST="${ACTIVITY_CORE_CLUSTER_HOST:-railiance01}"
export STATE_HUB_URL="${STATE_HUB_URL:-http://127.0.0.1:8000}"
export EVIDENCE_WORKSTREAM_ID="${STATE_HUB_EVIDENCE_WORKSTREAM_ID:-2c9e8e96-ec6a-433c-9e6d-0efbcd18679e}"
export EVIDENCE_TASK_ID="${STATE_HUB_EVIDENCE_TASK_ID:-60f3387d-3d14-42a9-b8a3-725a86468510}"

# Shell-only switches; they are normalised in place and not exported here.
: "${ACTIVITY_CORE_ALLOW_LOCAL_KUBECTL:=0}"
: "${ACTIVITY_CORE_ADMIN_SYNC_FIXTURE_COMMAND:=}"
: "${ACTIVITY_CORE_ADMIN_SYNC_REQUIRE_FIXTURE:=0}"

# Assigned separately from the export so a failing date call is not masked.
STARTED_AT="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
export STARTED_AT

# Run state. CURRENT_GATE names the step in progress for failure reports.
CURRENT_GATE=startup
export BEFORE_JSON="" AFTER_JSON="" FIXTURE_STATUS=skipped SYNC_RESPONSE_JSON=""
EVIDENCE_NOTE_JSON=""
# Print one progress line to stdout, tagged with the smoke-test name.
# Arguments: any number of words; they are joined into a single message.
log() {
  local message="$*"
  printf '%s %s\n' '[activity-core-admin-sync-smoke]' "$message"
}
# Emit $1 escaped with printf %q so it can be spliced into a shell command
# string as a single word. No trailing newline is written.
quote() {
  local escaped
  printf -v escaped '%q' "$1"
  printf '%s' "$escaped"
}
# Run the shell script text in $1 on the cluster executor.
# Globals:   CLUSTER_HOST (read) - ssh target; empty means run locally.
# Arguments: $1 - script text, fed to `bash -s` on stdin.
# Returns:   the exit status of the local bash or of ssh.
cluster_bash() {
  local script="$1"
  if [[ -z "$CLUSTER_HOST" ]]; then
    bash -s <<<"$script"
    return
  fi
  ssh "$CLUSTER_HOST" "bash -s" <<<"$script"
}
post_evidence() {
local status="$1" failing_gate="${2:-}"
export EVIDENCE_STATUS="$status" FAILING_GATE="$failing_gate"
python3 - <<'PY'
import json, os, sys, urllib.request
def env_json(name):
raw = os.environ.get(name, "")
if not raw:
return None
try:
return json.loads(raw)
except json.JSONDecodeError:
return {"raw": raw}
status = os.environ["EVIDENCE_STATUS"]
failing_gate = os.environ.get("FAILING_GATE") or None
detail = {
"producer": "railiance-cluster",
"verification": "activity-core no-restart admin sync smoke",
"status": status,
"failing_gate": failing_gate,
"cluster_host": os.environ.get("CLUSTER_HOST") or "local-kubectl",
"namespace": os.environ.get("NAMESPACE"),
"worker_before": env_json("BEFORE_JSON"),
"worker_after": env_json("AFTER_JSON"),
"fixture_status": os.environ.get("FIXTURE_STATUS"),
"sync_response": env_json("SYNC_RESPONSE_JSON"),
"started_at": os.environ.get("STARTED_AT"),
}
summary = (
"Railiance activity-core no-restart admin-sync smoke passed: POST /admin/sync returned expected counters and worker pod identity/restart count stayed stable."
if status == "passed"
else "Railiance activity-core no-restart admin-sync smoke failed" + (f" at {failing_gate}" if failing_gate else "") + "; see non-secret evidence detail."
)
payload = {"summary": summary, "event_type": "note", "author": "railiance-cluster", "detail": detail}
if os.environ.get("EVIDENCE_WORKSTREAM_ID"):
payload["workstream_id"] = os.environ["EVIDENCE_WORKSTREAM_ID"]
if os.environ.get("EVIDENCE_TASK_ID"):
payload["task_id"] = os.environ["EVIDENCE_TASK_ID"]
req = urllib.request.Request(os.environ["STATE_HUB_URL"].rstrip("/") + "/progress/", data=json.dumps(payload).encode(), headers={"Content-Type": "application/json"}, method="POST")
with urllib.request.urlopen(req, timeout=20) as resp:
sys.stdout.write(resp.read().decode())
PY
}
#######################################
# ERR handler: report the failing gate to State Hub, then exit with the
# status of the command that failed.
# Globals:   CURRENT_GATE (read) - name of the step that was running.
# Returns:   does not return in the main shell; it exits with the failing
#            command's status. In a subshell it returns without acting.
#######################################
on_error() {
  local code=$?
  # With errtrace the handler also fires inside $(...) and ( ... ). Do nothing
  # there: the failure propagates to the main shell, whose own ERR trap then
  # reports it exactly once.
  if (( BASH_SUBSHELL > 0 )); then
    return 0
  fi
  trap - ERR
  # Best effort: a State Hub outage must not hide the original exit status.
  post_evidence failed "$CURRENT_GATE" >/dev/null || true
  exit "$code"
}
# Without errtrace the ERR trap is not inherited by shell functions, so under
# `set -e` a failure inside a helper called from the main shell would exit the
# script without ever running on_error.
set -o errtrace
trap on_error ERR
# The literal host name "local" selects local kubectl, but only with an
# explicit opt-in. An empty CLUSTER_HOST is what marks local execution.
case "$CLUSTER_HOST" in
  local)
    if [[ "$ACTIVITY_CORE_ALLOW_LOCAL_KUBECTL" != 1 ]]; then
      printf '%s\n' 'ACTIVITY_CORE_CLUSTER_HOST=local requires ACTIVITY_CORE_ALLOW_LOCAL_KUBECTL=1' >&2
      exit 2
    fi
    CLUSTER_HOST=""
    ;;
esac
export CLUSTER_HOST

# Gate: the executor must provide both kubectl and python3.
CURRENT_GATE='cluster executor preflight'
executor_label="${CLUSTER_HOST:-local kubectl}"
log "using cluster executor: ${executor_label}"
preflight_script='set -euo pipefail; command -v kubectl >/dev/null; command -v python3 >/dev/null'
cluster_bash "$preflight_script"
# Python program (run with python3 -c) that reads `kubectl get pod -o json`
# on stdin and prints a one-line JSON snapshot of the first actcore-worker pod
# by name: name, uid, phase, plus restart count, image and image id of its
# first container. It exits with a plain message, not a traceback, when no pod
# matched or when the pod reports no container statuses yet.
# The program must not contain single quotes: it lives in a single-quoted string.
worker_snapshot_script='import json,sys
items=json.load(sys.stdin).get("items",[])
if not items:
    raise SystemExit("no actcore-worker pods found")
pod=sorted(items,key=lambda item:item["metadata"]["name"])[0]
status=pod.get("status") or {}
statuses=status.get("containerStatuses") or []
if not statuses:
    raise SystemExit("actcore-worker pod "+pod["metadata"]["name"]+" has no container statuses yet")
container=statuses[0]
print(json.dumps({"name":pod["metadata"]["name"],"uid":pod["metadata"]["uid"],"phase":status.get("phase"),"restart_count":container.get("restartCount",0),"image":container.get("image"),"image_id":container.get("imageID")}, sort_keys=True))'
# Gate: record the worker pod identity and restart count before the sync call.
CURRENT_GATE='worker baseline capture'
# %q escapes the namespace and the Python program so each one reaches the
# executor shell as a single word.
printf -v baseline_command \
  'kubectl -n %q get pod -l app.kubernetes.io/name=actcore-worker -o json | python3 -c %q' \
  "$NAMESPACE" "$worker_snapshot_script"
BEFORE_JSON="$(cluster_bash "$baseline_command")"
export BEFORE_JSON
# Gate: optionally run an operator-supplied fixture command on the executor
# before the sync call. FIXTURE_STATUS records whether one ran.
CURRENT_GATE='admin sync fixture'
if [[ -z "$ACTIVITY_CORE_ADMIN_SYNC_FIXTURE_COMMAND" ]]; then
  # No command supplied: a usage error when a fixture is required, else skip.
  if [[ "$ACTIVITY_CORE_ADMIN_SYNC_REQUIRE_FIXTURE" == 1 ]]; then
    printf '%s\n' 'ACTIVITY_CORE_ADMIN_SYNC_REQUIRE_FIXTURE=1 but no fixture command was supplied' >&2
    exit 2
  fi
  FIXTURE_STATUS=skipped
else
  log 'running operator-supplied fixture command'
  cluster_bash "$ACTIVITY_CORE_ADMIN_SYNC_FIXTURE_COMMAND"
  FIXTURE_STATUS=ran
fi
export FIXTURE_STATUS
# Gate: call POST /admin/sync from inside the actcore-api deployment and keep
# the validated response. The outer heredoc (EOF) is unquoted so the escaped
# namespace is spliced in locally. The inner heredoc (PY) is quoted and is
# only interpreted by the executor shell, which pipes it to python in the pod.
# Keep dollar signs, backticks and backslashes out of the Python text: the
# outer heredoc would expand them locally.
CURRENT_GATE='POST /admin/sync'
log 'calling POST /admin/sync?definitions=true&schedules=true'
SYNC_RESPONSE_JSON="$(
cluster_bash "$(cat <<EOF
set -euo pipefail
kubectl -n $(quote "$NAMESPACE") exec -i deploy/actcore-api -- python - <<'PY'
import json, urllib.request
req = urllib.request.Request('http://localhost:8010/admin/sync?definitions=true&schedules=true', method='POST')
with urllib.request.urlopen(req, timeout=60) as resp:
    payload = json.loads(resp.read().decode())
# Every expected section and counter must be present in the response.
required = [('definitions','synced'),('schedules','upserted'),('schedules','paused'),('schedules','deleted_orphans'),('errors',None)]
for section, key in required:
    if section not in payload:
        raise SystemExit(f'missing sync response section {section!r}')
    if key is not None and key not in payload[section]:
        raise SystemExit(f'missing sync response key {section}.{key}')
# A non-empty errors section fails the gate.
if payload.get('errors'):
    raise SystemExit('admin sync returned errors: ' + json.dumps(payload['errors']))
print(json.dumps(payload, sort_keys=True))
PY
EOF
)"
)"
export SYNC_RESPONSE_JSON
# Gate: take a second worker snapshot and require that the pod uid and the
# restart count are unchanged from the baseline.
CURRENT_GATE='worker no-restart verification'
AFTER_JSON="$(cluster_bash "kubectl -n $(quote "$NAMESPACE") get pod -l app.kubernetes.io/name=actcore-worker -o json | python3 -c $(quote "$worker_snapshot_script")")"
# Exported before the comparison, which reads both snapshots from the environment.
export AFTER_JSON
python3 - <<'PY'
import json, os
before = json.loads(os.environ['BEFORE_JSON'])
after = json.loads(os.environ['AFTER_JSON'])
# A different uid means the pod was replaced.
if before['uid'] != after['uid']:
    raise SystemExit(f"worker pod changed uid: {before['uid']} -> {after['uid']}")
# Same pod but a different count means its container restarted.
if before['restart_count'] != after['restart_count']:
    raise SystemExit(f"worker restart count changed: {before['restart_count']} -> {after['restart_count']}")
PY
# Gate: post the "passed" evidence note and keep the State Hub response.
CURRENT_GATE='State Hub evidence note'
log 'posting non-secret evidence note to State Hub'
EVIDENCE_NOTE_JSON="$(post_evidence passed '')"
# All gates are done; disarm the failure reporter before the final output.
trap - ERR
log 'verification passed'
# The State Hub response is the last thing written to stdout.
printf '%s\n' "$EVIDENCE_NOTE_JSON"