From dddc7ebd8122c61f38c82a3b5f3e5d29bc403af7 Mon Sep 17 00:00:00 2001 From: tegwick Date: Tue, 16 Jun 2026 03:51:01 +0200 Subject: [PATCH] Add activity-core cluster verifier --- Makefile | 5 +- tools/cmd/railiance-verify-activity-core | 384 ++++++++++++++++++ ...tivity-core-cluster-owned-deploy-verify.md | 110 +++++ 3 files changed, 498 insertions(+), 1 deletion(-) create mode 100755 tools/cmd/railiance-verify-activity-core create mode 100644 workplans/RAILIANCE-WP-0012-activity-core-cluster-owned-deploy-verify.md diff --git a/Makefile b/Makefile index dac8ef5..e95e3ee 100644 --- a/Makefile +++ b/Makefile @@ -24,6 +24,9 @@ smoke: ## Run Kubernetes smoke tests test-ha-failover: ## Run HA failover test (D3) — kills primary PG pod, asserts recovery bash tests/test_ha_failover.sh $(if $(GITEA_URL),$(GITEA_URL),) +verify-activity-core: ## Reconcile activity-core runtime and verify disabled ops inventory probe evidence + tools/cmd/railiance-verify-activity-core + ##@ Help help: ## Show this help @@ -31,4 +34,4 @@ help: ## Show this help /^[a-zA-Z_-]+:.*?##/ { printf " \033[36m%-20s\033[0m %s\n", $$1, $$2 } \ /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) }' $(MAKEFILE_LIST) -.PHONY: backup restore preflight k3s-install smoke test-ha-failover help +.PHONY: backup restore preflight k3s-install smoke test-ha-failover verify-activity-core help diff --git a/tools/cmd/railiance-verify-activity-core b/tools/cmd/railiance-verify-activity-core new file mode 100755 index 0000000..9bc6315 --- /dev/null +++ b/tools/cmd/railiance-verify-activity-core @@ -0,0 +1,384 @@ +#!/usr/bin/env bash +# Cluster-owned activity-core runtime reconcile and ops inventory probe evidence path. +set -euo pipefail # stop on a failed command, an unset variable, or a failed pipeline stage + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)" # absolute path of the directory two levels above this script + +NAMESPACE="${ACTIVITY_CORE_NAMESPACE:-activity-core}" # every setting below: value of the named environment variable, else the default after ':-' +DEFINITION_ID="${ACTIVITY_CORE_OPS_DEFINITION_ID:-40d15a87-7ff6-4d8e-992c-37df15f95110}" +DEFINITION_SLUG="${ACTIVITY_CORE_OPS_DEFINITION_SLUG:-ops-service-inventory-probes}" +DEFINITION_NAME="${ACTIVITY_CORE_OPS_DEFINITION_NAME:-Ops Service Inventory Probes}" + +STATE_HUB_URL="${STATE_HUB_URL:-http://127.0.0.1:8000}" +STATE_HUB_PROGRESS_TIMEOUT_SECONDS="${STATE_HUB_PROGRESS_TIMEOUT_SECONDS:-180}" # total time the State Hub poll may take +STATE_HUB_PROGRESS_POLL_SECONDS="${STATE_HUB_PROGRESS_POLL_SECONDS:-5}" # sleep between two poll requests + +ACTIVITY_CORE_REPO="${ACTIVITY_CORE_REPO:-/home/worsch/activity-core}" # checkout on the machine running this script +ACTIVITY_CORE_REMOTE_REPO="${ACTIVITY_CORE_REMOTE_REPO:-}" # checkout on the executor; derived below when empty +ACTIVITY_CORE_CLUSTER_HOST="${ACTIVITY_CORE_CLUSTER_HOST:-railiance01}" +ACTIVITY_CORE_SYNC_RUNTIME_BUNDLE="${ACTIVITY_CORE_SYNC_RUNTIME_BUNDLE:-auto}" # accepted: 1|true|yes, 0|false|no, auto +ACTIVITY_CORE_RESTART_DEPLOYMENTS="${ACTIVITY_CORE_RESTART_DEPLOYMENTS:-0}" +if [[ "$ACTIVITY_CORE_CLUSTER_HOST" == "local" ]]; then # "local" means no ssh hop; an empty host selects local execution + ACTIVITY_CORE_CLUSTER_HOST="" +fi +if [[ -z "$ACTIVITY_CORE_REMOTE_REPO" ]]; then # no explicit executor-side path given + if [[ -n "$ACTIVITY_CORE_CLUSTER_HOST" ]]; then + ACTIVITY_CORE_REMOTE_REPO="$(ssh "$ACTIVITY_CORE_CLUSTER_HOST" pwd)/activity-core" # directory reported by 'pwd' over ssh, plus /activity-core + else + ACTIVITY_CORE_REMOTE_REPO="$ACTIVITY_CORE_REPO" # local execution: same checkout + fi +fi + +EVIDENCE_WORKSTREAM_ID="${STATE_HUB_EVIDENCE_WORKSTREAM_ID:-c91a0946-92f9-4b41-8a92-005b29952916}" +EVIDENCE_TASK_ID="${STATE_HUB_EVIDENCE_TASK_ID:-d15fc947-3fbe-4269-93c6-d98577352149}" 
+INTER_HUB_SUBMISSION_STATUS="${INTER_HUB_SUBMISSION_STATUS:-deferred}" +INTER_HUB_DEFER_REASON="${INTER_HUB_DEFER_REASON:-ops-hub key custody and Inter-Hub production intake remain operator-gated; State Hub fallback evidence is accepted for this handoff}" + +STARTED_AT="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" # UTC start time; the State Hub poll only accepts events created at or after it +CURRENT_GATE="startup" # name of the gate in progress; passed to post_evidence as the failing gate when the error trap fires +REMOTE_REVISION="" +API_IMAGE="" +SYNC_STATUS_JSON="" +DEFINITION_JSON="" +TRIGGER_JSON="" +PROGRESS_JSON="" +EVIDENCE_NOTE_JSON="" + +export NAMESPACE DEFINITION_ID DEFINITION_SLUG DEFINITION_NAME # exported so the embedded python3 programs can read the values through os.environ +export STATE_HUB_URL 
EVIDENCE_WORKSTREAM_ID EVIDENCE_TASK_ID +export STATE_HUB_PROGRESS_TIMEOUT_SECONDS STATE_HUB_PROGRESS_POLL_SECONDS +export INTER_HUB_SUBMISSION_STATUS INTER_HUB_DEFER_REASON STARTED_AT +export ACTIVITY_CORE_CLUSTER_HOST ACTIVITY_CORE_REMOTE_REPO +export ACTIVITY_CORE_SYNC_RUNTIME_BUNDLE ACTIVITY_CORE_RESTART_DEPLOYMENTS +export REMOTE_REVISION API_IMAGE SYNC_STATUS_JSON DEFINITION_JSON TRIGGER_JSON PROGRESS_JSON + +log() { # write all arguments as one message, prefixed with [activity-core-verify], to stdout + printf '[activity-core-verify] %s\n' "$*" +} + +quote() { # print $1 shell-escaped (printf %q) so it can be embedded in a command string + printf '%q' "$1" +} + +cluster_bash() { # run the script text in $1 with 'bash -s': over ssh when a cluster host is set, locally otherwise + local script="$1" + if [[ -n "$ACTIVITY_CORE_CLUSTER_HOST" ]]; then + ssh "$ACTIVITY_CORE_CLUSTER_HOST" "bash -s" <<<"$script" + else + bash -s <<<"$script" + fi +} + +should_sync_runtime_bundle() { # status 0 when the runtime bundle should be synced, 1 when not + case "$ACTIVITY_CORE_SYNC_RUNTIME_BUNDLE" in + 1|true|yes) return 0 ;; + 0|false|no) return 1 ;; + auto) # sync only when a cluster host is set and the local bundle directory exists + [[ -n "$ACTIVITY_CORE_CLUSTER_HOST" && -d "$ACTIVITY_CORE_REPO/k8s/railiance" ]] + return # bare return: hands back the status of the test above + ;; + *) # any other value: report on stderr and end the whole script with status 2 + printf 'invalid ACTIVITY_CORE_SYNC_RUNTIME_BUNDLE=%s\n' "$ACTIVITY_CORE_SYNC_RUNTIME_BUNDLE" >&2 + exit 2 + ;; + esac +} + +post_evidence() { # POST a "note" event to State Hub /progress/; $1 = status, $2 = failing gate (optional); the response body goes to stdout + local status="$1" + local failing_gate="${2:-}" + export EVIDENCE_STATUS="$status" # handed to the python3 program below through the environment + export FAILING_GATE="$failing_gate" + python3 - <<'PY' +import json +import os +import sys +import urllib.request + +def from_json_env(name): # decode an environment variable as JSON + raw = os.environ.get(name, "") + if not raw: + return None # unset or empty + try: + return json.loads(raw) + except json.JSONDecodeError: + return {"raw": raw} # keep undecodable text instead of dropping it + +status = os.environ["EVIDENCE_STATUS"] +failing_gate = os.environ.get("FAILING_GATE") or None # empty string becomes None +definition = from_json_env("DEFINITION_JSON") +trigger = from_json_env("TRIGGER_JSON") +progress = from_json_env("PROGRESS_JSON") +sync_status = from_json_env("SYNC_STATUS_JSON") + +detail = { + "producer": "railiance-cluster", + "verification": "activity-core cluster-owned deploy/verify", + "status": status, + "failing_gate": failing_gate, + "cluster_host": os.environ.get("ACTIVITY_CORE_CLUSTER_HOST") or "local-kubectl", + "namespace": 
os.environ.get("NAMESPACE"), + "activity_core_repo": os.environ.get("ACTIVITY_CORE_REMOTE_REPO"), + "activity_core_revision": os.environ.get("REMOTE_REVISION") or None, + "api_image": os.environ.get("API_IMAGE") or None, + "runtime_bundle": "k8s/railiance/20-runtime.yaml", + "sync_job": sync_status, + "definition": definition, + "manual_trigger": trigger, + "state_hub_progress": progress, + "inter_hub_submission": { + "status": os.environ.get("INTER_HUB_SUBMISSION_STATUS"), + "reason": os.environ.get("INTER_HUB_DEFER_REASON"), + }, + "started_at": os.environ.get("STARTED_AT"), +} + +if status == "passed": # any other status value produces the failure summary + summary = ( + "Railiance activity-core deploy/verify passed: runtime reconciled, " + "actcore-sync completed, ops-service-inventory-probes remains disabled, " + f"manual trigger {trigger.get('workflow_id') if isinstance(trigger, dict) else 'unknown'} ran, " + f"and State Hub ops_inventory_probe progress {progress.get('id') if isinstance(progress, dict) else 'unknown'} exists." + ) +else: + summary = ( + "Railiance activity-core deploy/verify failed" + + (f" at {failing_gate}" if failing_gate else "") + + "; see non-secret evidence detail for the last completed gate." + ) + +payload = { + "summary": summary, + "event_type": "note", + "author": "railiance-cluster", + "detail": detail, +} +if os.environ.get("EVIDENCE_WORKSTREAM_ID"): # the two ids are attached only when non-empty + payload["workstream_id"] = os.environ["EVIDENCE_WORKSTREAM_ID"] +if os.environ.get("EVIDENCE_TASK_ID"): + payload["task_id"] = os.environ["EVIDENCE_TASK_ID"] + +body = json.dumps(payload).encode("utf-8") +req = urllib.request.Request( + os.environ["STATE_HUB_URL"].rstrip("/") + "/progress/", + data=body, + headers={"Content-Type": "application/json"}, + method="POST", +) +try: + with urllib.request.urlopen(req, timeout=20) as resp: + sys.stdout.write(resp.read().decode("utf-8")) # echo the response body so the caller can capture it +except Exception as exc: # report on stderr, then re-raise so python3 ends with a non-zero status + sys.stderr.write(f"failed to post State Hub evidence note: {exc}\n") + raise +PY +} + +on_error() { # trap handler for a failed gate + local code=$? # must come first: the status that triggered the trap
+ trap - ERR EXIT # disarm both traps so the handler runs at most once + [[ "$code" -eq 0 ]] || post_evidence "failed" "$CURRENT_GATE" >/dev/null || true # best effort: a failed post must not mask the original status + exit "$code" +} +trap on_error ERR EXIT # ERR fires for failing commands; EXIT also covers gates and helpers that end the script with an explicit 'exit N', which never raises ERR + +CURRENT_GATE="cluster executor preflight" +log "using cluster executor: ${ACTIVITY_CORE_CLUSTER_HOST:-local kubectl}" +cluster_bash "$(cat </dev/null +EOF +)" + +CURRENT_GATE="runtime bundle sync" +if should_sync_runtime_bundle; then + if [[ -z "$ACTIVITY_CORE_CLUSTER_HOST" ]]; then + log "runtime bundle already local at ${ACTIVITY_CORE_REPO}/k8s/railiance" + else + log "syncing runtime bundle to ${ACTIVITY_CORE_CLUSTER_HOST}:${ACTIVITY_CORE_REMOTE_REPO}/k8s/railiance" + ssh "$ACTIVITY_CORE_CLUSTER_HOST" "mkdir -p $(quote "$ACTIVITY_CORE_REMOTE_REPO")/k8s/railiance" + rsync -a --delete \ + "$ACTIVITY_CORE_REPO/k8s/railiance/" \ + "${ACTIVITY_CORE_CLUSTER_HOST}:${ACTIVITY_CORE_REMOTE_REPO}/k8s/railiance/" + fi +fi + +CURRENT_GATE="runtime bundle preflight" +cluster_bash "$(cat </dev/null || true +EOF +)" +)" +export REMOTE_REVISION + +CURRENT_GATE="runtime bundle reconcile" +log "reconciling activity-core runtime bundle" +cluster_bash "$(cat <&2 + exit 1 +fi +export DEFINITION_JSON + +CURRENT_GATE="manual disabled trigger" +log "triggering disabled definition manually" +TRIGGER_JSON="$( + cluster_bash "$(cat <&2 + exit 1 +fi +export TRIGGER_JSON + +CURRENT_GATE="State Hub ops_inventory_probe evidence" +log "polling State Hub for ops_inventory_probe progress" +PROGRESS_JSON="$( + python3 - <<'PY' +from datetime import datetime, timezone +import json +import os +import time +import urllib.parse +import urllib.request + +base = os.environ["STATE_HUB_URL"].rstrip("/") +started = datetime.fromisoformat(os.environ["STARTED_AT"].replace("Z", "+00:00")) # timezone-aware: STARTED_AT is produced by 'date -u' with a trailing Z +timeout = int(os.environ["STATE_HUB_PROGRESS_TIMEOUT_SECONDS"]) +interval = int(os.environ["STATE_HUB_PROGRESS_POLL_SECONDS"]) +deadline = time.monotonic() + timeout +url = base + "/progress/?" 
+ urllib.parse.urlencode({"event_type": "ops_inventory_probe"}) + +while time.monotonic() < deadline: # poll until a fresh event shows up or the timeout elapses + with urllib.request.urlopen(url, timeout=20) as resp: + events = json.load(resp) + for event in events: + created_at = datetime.fromisoformat(event["created_at"].replace("Z", "+00:00")) + if (created_at.replace(tzinfo=timezone.utc) if created_at.tzinfo is None else created_at) >= started: # a zone-less timestamp is assumed to be UTC; comparing it unchanged with the aware start time would raise TypeError + detail = event.get("detail") or {} + print(json.dumps({ + "id": event["id"], + "event_type": event.get("event_type"), + "summary": event.get("summary"), + "author": event.get("author"), + "created_at": event.get("created_at"), + "detail_keys": sorted(detail.keys()) if isinstance(detail, dict) else [], # key names only, never the detail values + })) + raise SystemExit(0) # the first event created at or after the start time ends the poll + time.sleep(interval) + +raise SystemExit(f"no ops_inventory_probe progress found after {timeout}s") # message on stderr, exit status 1: fails this gate +PY +)" +export PROGRESS_JSON + +CURRENT_GATE="State Hub evidence note" +log "posting non-secret evidence note to State Hub" +EVIDENCE_NOTE_JSON="$(post_evidence "passed" "")" +export EVIDENCE_NOTE_JSON + +trap - ERR EXIT # success: disarm failure reporting before the normal exit +log "verification passed" +printf '%s\n' "$EVIDENCE_NOTE_JSON" diff --git a/workplans/RAILIANCE-WP-0012-activity-core-cluster-owned-deploy-verify.md b/workplans/RAILIANCE-WP-0012-activity-core-cluster-owned-deploy-verify.md new file mode 100644 index 0000000..b988135 --- /dev/null +++ b/workplans/RAILIANCE-WP-0012-activity-core-cluster-owned-deploy-verify.md @@ -0,0 +1,110 @@ +--- +id: RAILIANCE-WP-0012 +type: workplan +title: "activity-core cluster-owned deploy/verify" +domain: railiance +repo: railiance-cluster +status: finished +owner: codex +topic_slug: railiance +created: "2026-06-15" +updated: "2026-06-16" +state_hub_workstream_id: "6434f7cb-e13c-4c05-839b-197bb239d5cd" +--- + +# activity-core cluster-owned deploy/verify + +## Context + +activity-core `ACTIVITY-WP-0007-T06` needs live Railiance cluster evidence for +the disabled ops inventory probe. That live verification should be owned by the +cluster/operator layer, not by arbitrary activity-core sessions with local +`kubectl` assumptions. 
+ +This workplan creates a cluster-owned path that keeps credentials in +operator-owned locations while returning only non-secret evidence to State Hub. + +## Implement cluster-owned verifier + +```task +id: RAILIANCE-WP-0012-T01 +status: done +priority: high +state_hub_task_id: "3769fdfb-b4f1-431b-a55a-672d93b3ea55" +``` + +Add a repeatable command that: + +- reconciles the activity-core Railiance runtime bundle; +- reruns `actcore-sync`; +- checks the `ops-service-inventory-probes` ActivityDefinition exists and is + still disabled; +- triggers the disabled definition manually through the in-cluster API path; +- verifies a fresh `ops_inventory_probe` progress event exists in State Hub; +- posts a non-secret State Hub evidence note for activity-core to cite. + +Implemented as `tools/cmd/railiance-verify-activity-core` with Makefile target +`verify-activity-core`. The script defaults to the `railiance01` SSH executor; +use `ACTIVITY_CORE_CLUSTER_HOST=local` only for an explicitly selected local +`kubectl` context. + +## Run live verification and publish evidence + +```task +id: RAILIANCE-WP-0012-T02 +status: done +priority: high +state_hub_task_id: "6d7f87c3-a533-4de1-84de-9ca65f2e2779" +``` + +Run `make verify-activity-core` against the Railiance cluster. On success, cite +the State Hub evidence note id in this task and in activity-core +`ACTIVITY-WP-0007-T06`. + +If a gate fails, the verifier must still post a non-secret State Hub note with +the failing gate and last completed evidence fields. + +2026-06-15: Completed against Railiance01 after refreshing the same-tag +`activity-core:railiance01-prod` image from activity-core commit `ab17378`, +importing digest `sha256:cff43c72455b9fc4fc11a0a997b4671a38987bb4583a600245dd961965af0e40` +into k3s containerd, syncing the current runtime bundle to +`/home/tegwick/activity-core/k8s/railiance`, and restarting the activity-core +runtime deployments. 
The verifier reconciled the runtime bundle, completed +`actcore-sync`, confirmed `ops-service-inventory-probes` exists and remains +disabled, triggered it manually, verified State Hub progress +`4c82360d-33e7-455b-8ab4-33facd4a3f8e`, and posted evidence note +`baeeaeac-aa6d-4406-ae64-e54577f21386`. + +An intermediate verifier invocation accidentally targeted the local +CoulombCore `kubectl` context. It created only `actcore-*` runtime resources in +the existing `activity-core` namespace; those resources were removed with the +runtime manifest cleanup, and the pre-existing `llm-connect` deployment remains +running. + +Operational cleanup note: the successful Railiance01 verifier run used +`ACTIVITY_CORE_RESTART_DEPLOYMENTS=1` after importing the same-tag image. The +script was corrected afterward to restart only `actcore-api`, +`actcore-worker`, and `actcore-event-router`, because +`actcore-state-hub-bridge` uses host networking and a rolling restart leaves a +new bridge pod pending behind the host-bound running pod. A 2026-06-16 cleanup +check showed the bridge rollout had settled on Railiance01: the host-bound +bridge pod was running and the replacement ReplicaSet was scaled to zero, so no +manual live cleanup was needed. + +## Handoff closure to activity-core + +```task +id: RAILIANCE-WP-0012-T03 +status: done +priority: medium +state_hub_task_id: "43f652c6-fcc4-49fa-90cc-4122eb6d5321" +``` + +After live evidence exists, update activity-core `ACTIVITY-WP-0007-T06` to cite +the Railiance evidence and close it if Inter-Hub submission is active or +explicitly deferred with the clean State Hub fallback result. + +2026-06-15: Updated activity-core `ACTIVITY-WP-0007-T06` to cite Railiance +evidence note `baeeaeac-aa6d-4406-ae64-e54577f21386` and close the task with +Inter-Hub submission explicitly deferred while the State Hub fallback evidence +path is verified.