From 6f42bf114b4760cac46013a7231586895abe121e Mon Sep 17 00:00:00 2001 From: tegwick Date: Tue, 16 Jun 2026 08:02:27 +0200 Subject: [PATCH] Harden activity-core verifier evidence --- tools/cmd/railiance-verify-activity-core | 102 ++++++++++++--- ...tivity-core-verifier-evidence-hardening.md | 120 ++++++++++++++++++ 2 files changed, 206 insertions(+), 16 deletions(-) create mode 100644 workplans/RAILIANCE-WP-0013-activity-core-verifier-evidence-hardening.md diff --git a/tools/cmd/railiance-verify-activity-core b/tools/cmd/railiance-verify-activity-core index 9bc6315..5d26e13 100755 --- a/tools/cmd/railiance-verify-activity-core +++ b/tools/cmd/railiance-verify-activity-core @@ -16,9 +16,17 @@ STATE_HUB_PROGRESS_POLL_SECONDS="${STATE_HUB_PROGRESS_POLL_SECONDS:-5}" ACTIVITY_CORE_REPO="${ACTIVITY_CORE_REPO:-/home/worsch/activity-core}" ACTIVITY_CORE_REMOTE_REPO="${ACTIVITY_CORE_REMOTE_REPO:-}" ACTIVITY_CORE_CLUSTER_HOST="${ACTIVITY_CORE_CLUSTER_HOST:-railiance01}" +ACTIVITY_CORE_ALLOW_LOCAL_KUBECTL="${ACTIVITY_CORE_ALLOW_LOCAL_KUBECTL:-0}" ACTIVITY_CORE_SYNC_RUNTIME_BUNDLE="${ACTIVITY_CORE_SYNC_RUNTIME_BUNDLE:-auto}" ACTIVITY_CORE_RESTART_DEPLOYMENTS="${ACTIVITY_CORE_RESTART_DEPLOYMENTS:-0}" if [[ "$ACTIVITY_CORE_CLUSTER_HOST" == "local" ]]; then + if [[ "$ACTIVITY_CORE_ALLOW_LOCAL_KUBECTL" != "1" ]]; then + { + echo "ACTIVITY_CORE_CLUSTER_HOST=local requires ACTIVITY_CORE_ALLOW_LOCAL_KUBECTL=1" + echo "Default verifier execution is cluster-owned via railiance01/SSH." 
+ } >&2 + exit 2 + fi ACTIVITY_CORE_CLUSTER_HOST="" fi if [[ -z "$ACTIVITY_CORE_REMOTE_REPO" ]]; then @@ -38,9 +46,12 @@ STARTED_AT="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" CURRENT_GATE="startup" REMOTE_REVISION="" API_IMAGE="" +API_IMAGE_ID="" SYNC_STATUS_JSON="" DEFINITION_JSON="" TRIGGER_JSON="" +TRIGGER_KEY="" +EXPECTED_RUN_ID="" PROGRESS_JSON="" EVIDENCE_NOTE_JSON="" @@ -49,8 +60,8 @@ export STATE_HUB_URL EVIDENCE_WORKSTREAM_ID EVIDENCE_TASK_ID export STATE_HUB_PROGRESS_TIMEOUT_SECONDS STATE_HUB_PROGRESS_POLL_SECONDS export INTER_HUB_SUBMISSION_STATUS INTER_HUB_DEFER_REASON STARTED_AT export ACTIVITY_CORE_CLUSTER_HOST ACTIVITY_CORE_REMOTE_REPO -export ACTIVITY_CORE_SYNC_RUNTIME_BUNDLE ACTIVITY_CORE_RESTART_DEPLOYMENTS -export REMOTE_REVISION API_IMAGE SYNC_STATUS_JSON DEFINITION_JSON TRIGGER_JSON PROGRESS_JSON +export ACTIVITY_CORE_ALLOW_LOCAL_KUBECTL ACTIVITY_CORE_SYNC_RUNTIME_BUNDLE ACTIVITY_CORE_RESTART_DEPLOYMENTS +export REMOTE_REVISION API_IMAGE API_IMAGE_ID SYNC_STATUS_JSON DEFINITION_JSON TRIGGER_JSON TRIGGER_KEY EXPECTED_RUN_ID PROGRESS_JSON log() { printf '[activity-core-verify] %s\n' "$*" @@ -121,10 +132,12 @@ detail = { "activity_core_repo": os.environ.get("ACTIVITY_CORE_REMOTE_REPO"), "activity_core_revision": os.environ.get("REMOTE_REVISION") or None, "api_image": os.environ.get("API_IMAGE") or None, + "api_image_id": os.environ.get("API_IMAGE_ID") or None, "runtime_bundle": "k8s/railiance/20-runtime.yaml", "sync_job": sync_status, "definition": definition, "manual_trigger": trigger, + "expected_activity_core_run_id": os.environ.get("EXPECTED_RUN_ID") or None, "state_hub_progress": progress, "inter_hub_submission": { "status": os.environ.get("INTER_HUB_SUBMISSION_STATUS"), @@ -137,8 +150,9 @@ if status == "passed": summary = ( "Railiance activity-core deploy/verify passed: runtime reconciled, " "actcore-sync completed, ops-service-inventory-probes remains disabled, " - f"manual trigger {trigger.get('workflow_id') if isinstance(trigger, dict) else 
'unknown'} ran, " - f"and State Hub ops_inventory_probe progress {progress.get('id') if isinstance(progress, dict) else 'unknown'} exists." + f"manual trigger {trigger.get('workflow_id') if isinstance(trigger, dict) else 'unknown'} ran as " + f"{os.environ.get('EXPECTED_RUN_ID') or 'unknown run'}, and State Hub ops_inventory_probe progress " + f"{progress.get('id') if isinstance(progress, dict) else 'unknown'} matched that run." ) else: summary = ( @@ -189,6 +203,16 @@ set -euo pipefail command -v kubectl >/dev/null EOF )" +if [[ -z "$ACTIVITY_CORE_CLUSTER_HOST" ]]; then + LOCAL_CONTEXT="$( + cluster_bash "$(cat <<'EOF' +set -euo pipefail +kubectl config current-context 2>/dev/null || true +EOF +)" + )" + log "local kubectl context: ${LOCAL_CONTEXT:-unknown}" +fi CURRENT_GATE="runtime bundle sync" if should_sync_runtime_bundle; then @@ -255,6 +279,17 @@ kubectl -n $(quote "$NAMESPACE") get deploy actcore-api -o jsonpath='{.spec.temp EOF )" )" +API_IMAGE_ID="$( + cluster_bash "$(cat <&2 + exit 1 +fi SYNC_STATUS_JSON="$( cluster_bash "$(cat <&2 exit 1 fi -export TRIGGER_JSON +TRIGGER_KEY="$( + python3 - <<'PY' +import json +import os + +trigger = json.loads(os.environ["TRIGGER_JSON"]) +trigger_key = trigger.get("trigger_key") +if not trigger_key: + raise SystemExit("manual trigger response did not include trigger_key") +print(trigger_key) +PY +)" +export TRIGGER_KEY +EXPECTED_RUN_ID="$( + python3 - <<'PY' +import os +import uuid + +definition_id = os.environ["DEFINITION_ID"] +trigger_key = os.environ["TRIGGER_KEY"] +print(uuid.uuid5(uuid.NAMESPACE_URL, f"{definition_id}:{trigger_key}")) +PY +)" +export TRIGGER_JSON EXPECTED_RUN_ID +log "manual trigger run id: ${EXPECTED_RUN_ID}" CURRENT_GATE="State Hub ops_inventory_probe evidence" log "polling State Hub for ops_inventory_probe progress" @@ -348,6 +407,8 @@ base = os.environ["STATE_HUB_URL"].rstrip("/") started = datetime.fromisoformat(os.environ["STARTED_AT"].replace("Z", "+00:00")) timeout = 
int(os.environ["STATE_HUB_PROGRESS_TIMEOUT_SECONDS"]) interval = int(os.environ["STATE_HUB_PROGRESS_POLL_SECONDS"]) +definition_id = os.environ["DEFINITION_ID"] +expected_run_id = os.environ["EXPECTED_RUN_ID"] deadline = time.monotonic() + timeout url = base + "/progress/?" + urllib.parse.urlencode({"event_type": "ops_inventory_probe"}) @@ -358,18 +419,27 @@ while time.monotonic() < deadline: created_at = datetime.fromisoformat(event["created_at"].replace("Z", "+00:00")) if created_at >= started: detail = event.get("detail") or {} - print(json.dumps({ - "id": event["id"], - "event_type": event.get("event_type"), - "summary": event.get("summary"), - "author": event.get("author"), - "created_at": event.get("created_at"), - "detail_keys": sorted(detail.keys()) if isinstance(detail, dict) else [], - })) - raise SystemExit(0) + if ( + isinstance(detail, dict) + and detail.get("activity_id") == definition_id + and detail.get("activity_core_run_id") == expected_run_id + ): + print(json.dumps({ + "id": event["id"], + "event_type": event.get("event_type"), + "summary": event.get("summary"), + "author": event.get("author"), + "created_at": event.get("created_at"), + "activity_id": detail.get("activity_id"), + "activity_core_run_id": detail.get("activity_core_run_id"), + "expected_activity_core_run_id": expected_run_id, + "idempotency_key": detail.get("idempotency_key"), + "detail_keys": sorted(detail.keys()), + })) + raise SystemExit(0) time.sleep(interval) -raise SystemExit(f"no ops_inventory_probe progress found after {timeout}s") +raise SystemExit(f"no ops_inventory_probe progress for manual run {expected_run_id} found after {timeout}s") PY )" export PROGRESS_JSON diff --git a/workplans/RAILIANCE-WP-0013-activity-core-verifier-evidence-hardening.md b/workplans/RAILIANCE-WP-0013-activity-core-verifier-evidence-hardening.md new file mode 100644 index 0000000..7179d48 --- /dev/null +++ b/workplans/RAILIANCE-WP-0013-activity-core-verifier-evidence-hardening.md @@ -0,0 +1,120 
@@ +--- +id: RAILIANCE-WP-0013 +type: workplan +title: "activity-core verifier evidence hardening" +domain: railiance +repo: railiance-cluster +status: finished +owner: codex +topic_slug: railiance +created: "2026-06-16" +updated: "2026-06-16" +state_hub_workstream_id: "a3abb83a-2d42-40f9-a5f6-1dbc36903436" +--- + +# activity-core verifier evidence hardening + +## Context + +`RAILIANCE-WP-0012` moved activity-core live deploy/verify ownership into +`railiance-cluster` and produced State Hub evidence +`baeeaeac-aa6d-4406-ae64-e54577f21386`, with `ops_inventory_probe` progress +`4c82360d-33e7-455b-8ab4-33facd4a3f8e`. + +A follow-up review found hardening work that matters for routine verifier use: +the verifier should prove the State Hub progress event belongs to the specific +manual trigger it launched, evidence should include an immutable runtime +identity, and local `kubectl` mode should require an explicit double opt-in. + +This is a hardening follow-up only; it does not reopen activity-core +`ACTIVITY-WP-0007-T06`. + +## Correlate State Hub progress to the manual trigger + +```task +id: RAILIANCE-WP-0013-T01 +status: done +priority: high +state_hub_task_id: "d013a4a9-77fc-4cf0-babf-528d71acc0a1" +``` + +Update `tools/cmd/railiance-verify-activity-core` so after +`POST /activity-definitions/{definition_id}/trigger` it parses `trigger_key`, derives the +expected activity-core manual `run_id`, and polls State Hub until it finds +`ops_inventory_probe` where: + +- `detail.activity_id == DEFINITION_ID`; +- `detail.activity_core_run_id == expected_run_id`. + +The verifier must not pass on merely any event created after `STARTED_AT`. +Include the expected run id and matched progress id in the evidence note. + +2026-06-16: Implemented exact correlation. The verifier now derives the +expected UUIDv5 `activity_core_run_id` from `{definition_id}:{trigger_key}` and +requires State Hub `ops_inventory_probe` detail to match both `activity_id` and +`activity_core_run_id`. 
+ +## Record immutable runtime evidence + +```task +id: RAILIANCE-WP-0013-T02 +status: done +priority: medium +state_hub_task_id: "c5780ec1-9a74-401e-b60e-a0fdf2b7e5d2" +``` + +Ensure successful evidence includes either `activity_core_revision` or an +immutable Kubernetes image ID/digest. When the remote repo revision is +unavailable, fall back to the live `actcore-api` pod container `imageID`. + +2026-06-16: Implemented `api_image_id` capture from the live `actcore-api` pod +container status and added a guard so passed evidence must include either the +remote repo revision or the immutable image ID. + +## Guard explicit local kubectl override + +```task +id: RAILIANCE-WP-0013-T03 +status: done +priority: medium +state_hub_task_id: "0d60809f-3f1d-4ea9-a96f-af074911acc0" +``` + +Keep `railiance01`/SSH as the default executor. If +`ACTIVITY_CORE_CLUSTER_HOST=local` is selected, require an additional explicit +opt-in such as `ACTIVITY_CORE_ALLOW_LOCAL_KUBECTL=1` and print the current +`kubectl` context before continuing. + +2026-06-16: Implemented the double opt-in. `ACTIVITY_CORE_CLUSTER_HOST=local` +now exits before cluster access unless `ACTIVITY_CORE_ALLOW_LOCAL_KUBECTL=1` is +also set, and accepted local mode prints the current `kubectl` context. + +## Verify and publish hardening evidence + +```task +id: RAILIANCE-WP-0013-T04 +status: done +priority: medium +state_hub_task_id: "150e4fa3-800c-4997-baaa-da696f5a0fc0" +``` + +Run `bash -n tools/cmd/railiance-verify-activity-core`, run +`make verify-activity-core` against Railiance01, confirm the evidence note +matched the manual trigger run id, and post a non-secret State Hub note citing +the new evidence. + +2026-06-16: Verified with `bash -n tools/cmd/railiance-verify-activity-core` +and a live Railiance01 `make verify-activity-core` run. 
The verifier posted +State Hub evidence note `60256e9a-9d1b-44db-8999-738cf03bca2e`, matched manual +run id `90e3b112-d1e3-51af-8fb2-cb61f26add17`, matched +`ops_inventory_probe` progress `db408146-0310-4ac3-ac77-f73c5a41e070`, and +included `api_image_id` +`sha256:5ff92a8217c450ae06075d00862b6e2a92a83ca09eea18b5a5e96b5d2d728b35`. + +Done when: + +- the verifier rejects unrelated fresh `ops_inventory_probe` events; +- evidence includes a non-null revision or image digest; +- local `kubectl` mode requires explicit double opt-in; +- the Railiance01 verifier run posts a passed evidence note with matched run id; +- `make fix-consistency REPO=railiance-cluster` has synced the workplan.