From 5f8d2831c73ac332e6f8866330f73abecc96a50b Mon Sep 17 00:00:00 2001 From: tegwick Date: Thu, 18 Jun 2026 02:10:18 +0200 Subject: [PATCH] Add activity-core llm-connect reconcile gate --- Makefile | 5 +- ...liance-reconcile-activity-core-llm-connect | 293 ++++++++++++++++++ ...ctivity-core-llm-connect-live-reconcile.md | 96 ++++++ 3 files changed, 393 insertions(+), 1 deletion(-) create mode 100755 tools/cmd/railiance-reconcile-activity-core-llm-connect create mode 100644 workplans/RAILIANCE-WP-0014-activity-core-llm-connect-live-reconcile.md diff --git a/Makefile b/Makefile index e95e3ee..f489220 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,9 @@ test-ha-failover: ## Run HA failover test (D3) — kills primary PG pod, asserts verify-activity-core: ## Reconcile activity-core runtime and verify disabled ops inventory probe evidence tools/cmd/railiance-verify-activity-core +reconcile-activity-core-llm-connect: ## Reconcile activity-core llm-connect URL and run non-secret gate checks + tools/cmd/railiance-reconcile-activity-core-llm-connect + ##@ Help help: ## Show this help @@ -34,4 +37,4 @@ help: ## Show this help /^[a-zA-Z_-]+:.*?##/ { printf " \033[36m%-20s\033[0m %s\n", $$1, $$2 } \ /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) }' $(MAKEFILE_LIST) -.PHONY: backup restore preflight k3s-install smoke test-ha-failover verify-activity-core help +.PHONY: backup restore preflight k3s-install smoke test-ha-failover verify-activity-core reconcile-activity-core-llm-connect help diff --git a/tools/cmd/railiance-reconcile-activity-core-llm-connect b/tools/cmd/railiance-reconcile-activity-core-llm-connect new file mode 100755 index 0000000..7a58207 --- /dev/null +++ b/tools/cmd/railiance-reconcile-activity-core-llm-connect @@ -0,0 +1,293 @@ +#!/usr/bin/env bash +# Cluster-owned activity-core <-> llm-connect reconcile and non-secret evidence. +set -euo pipefail + +ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." 
&& pwd)" + +NAMESPACE="${ACTIVITY_CORE_NAMESPACE:-activity-core}" +CLUSTER_HOST="${ACTIVITY_CORE_CLUSTER_HOST-railiance01}" # no colon: unset -> railiance01, explicitly empty -> run kubectl locally +STATE_HUB_URL="${STATE_HUB_URL:-http://127.0.0.1:8000}" + +EXPECTED_URL="${LLM_CONNECT_URL:-http://llm-connect.activity-core.svc.cluster.local:8080}" +EXPECTED_TIMEOUT="${LLM_CONNECT_TIMEOUT_SECONDS:-300}" +SECRET_NAME="${LLM_CONNECT_PROVIDER_SECRET_NAME:-llm-connect-provider-secrets}" +DEPLOYMENT_NAME="${LLM_CONNECT_DEPLOYMENT_NAME:-llm-connect}" +LLM_CONNECT_REPO="${LLM_CONNECT_REPO:-/home/worsch/llm-connect}" +LLM_CONNECT_REMOTE_REPO="${LLM_CONNECT_REMOTE_REPO:-}" +APPLY_LLM_CONNECT_OVERLAY="${APPLY_LLM_CONNECT_OVERLAY:-1}" +REQUIRE_SMOKE="${REQUIRE_LLM_CONNECT_SMOKE:-0}" + +EVIDENCE_WORKSTREAM_ID="${STATE_HUB_EVIDENCE_WORKSTREAM_ID:-}" +EVIDENCE_TASK_ID="${STATE_HUB_EVIDENCE_TASK_ID:-}" + +PATCH_JSON="$( + EXPECTED_URL="$EXPECTED_URL" EXPECTED_TIMEOUT="$EXPECTED_TIMEOUT" python3 - <<'PY' +import json +import os + +print(json.dumps({ + "data": { + "LLM_CONNECT_URL": os.environ["EXPECTED_URL"], + "LLM_CONNECT_TIMEOUT_SECONDS": os.environ["EXPECTED_TIMEOUT"], + } +})) +PY +)" + +LIVE_URL="" +LIVE_TIMEOUT="" +SECRET_STATUS="unknown" +SECRET_KEY_COUNT="0" +DEPLOYMENT_STATUS="unknown" +SMOKE_STATUS="skipped" +SMOKE_SUMMARY="" +EVIDENCE_STATUS="passed" +FAILING_GATE="" + +export NAMESPACE CLUSTER_HOST STATE_HUB_URL EXPECTED_URL EXPECTED_TIMEOUT +export SECRET_NAME DEPLOYMENT_NAME LLM_CONNECT_REPO LLM_CONNECT_REMOTE_REPO +export APPLY_LLM_CONNECT_OVERLAY REQUIRE_SMOKE EVIDENCE_WORKSTREAM_ID EVIDENCE_TASK_ID +export LIVE_URL LIVE_TIMEOUT SECRET_STATUS SECRET_KEY_COUNT DEPLOYMENT_STATUS +export SMOKE_STATUS SMOKE_SUMMARY EVIDENCE_STATUS FAILING_GATE + +log() { + printf '[activity-core-llm-connect] %s\n' "$*" +} + +quote() { + printf '%q' "$1" +} + +cluster_bash() { + local script="$1" + if [[ -n "$CLUSTER_HOST" ]]; then + ssh "$CLUSTER_HOST" "bash -s" <<<"$script" + else + bash -s <<<"$script" + fi +} + +post_evidence() { + python3 - <<'PY' 
+import json +import os +import sys +import urllib.request + +status = os.environ["EVIDENCE_STATUS"] +detail = { + "producer": "railiance-cluster", + "verification": "activity-core llm-connect live reconcile", + "status": status, + "failing_gate": os.environ.get("FAILING_GATE") or None, + "cluster_host": os.environ.get("CLUSTER_HOST") or "local-kubectl", + "namespace": os.environ["NAMESPACE"], + "expected_url": os.environ["EXPECTED_URL"], + "expected_timeout_seconds": os.environ["EXPECTED_TIMEOUT"], + "live_url": os.environ.get("LIVE_URL") or None, + "live_timeout_seconds": os.environ.get("LIVE_TIMEOUT") or None, + "provider_secret": { + "name": os.environ["SECRET_NAME"], + "status": os.environ.get("SECRET_STATUS"), + "key_count": int(os.environ.get("SECRET_KEY_COUNT") or "0"), + }, + "deployment": { + "name": os.environ["DEPLOYMENT_NAME"], + "status": os.environ.get("DEPLOYMENT_STATUS"), + }, + "smoke": { + "status": os.environ.get("SMOKE_STATUS"), + "summary": os.environ.get("SMOKE_SUMMARY") or None, + }, +} + +if status == "passed": + summary = ( + "Railiance activity-core llm-connect reconcile passed: runtime config, " + "provider Secret, deployment, and smoke gate are all healthy." + ) +elif status == "blocked": + summary = ( + "Railiance activity-core llm-connect reconcile is blocked: " + f"{os.environ.get('FAILING_GATE') or 'operator gate'}." + ) +else: + summary = ( + "Railiance activity-core llm-connect reconcile failed" + + (f" at {os.environ.get('FAILING_GATE')}" if os.environ.get("FAILING_GATE") else "") + + "." 
+ ) + +payload = { + "summary": summary, + "event_type": "note", + "author": "railiance-cluster", + "detail": detail, +} +if os.environ.get("EVIDENCE_WORKSTREAM_ID"): + payload["workstream_id"] = os.environ["EVIDENCE_WORKSTREAM_ID"] +if os.environ.get("EVIDENCE_TASK_ID"): + payload["task_id"] = os.environ["EVIDENCE_TASK_ID"] + +body = json.dumps(payload).encode("utf-8") +req = urllib.request.Request( + os.environ["STATE_HUB_URL"].rstrip("/") + "/progress/", + data=body, + headers={"Content-Type": "application/json"}, + method="POST", +) +with urllib.request.urlopen(req, timeout=20) as resp: + sys.stdout.write(resp.read().decode("utf-8")) +PY +} + +if [[ -z "$LLM_CONNECT_REMOTE_REPO" ]]; then + if [[ -n "$CLUSTER_HOST" ]]; then + LLM_CONNECT_REMOTE_REPO="$(ssh "$CLUSTER_HOST" pwd)/llm-connect" + else + LLM_CONNECT_REMOTE_REPO="$LLM_CONNECT_REPO" + fi +fi +export LLM_CONNECT_REMOTE_REPO + +log "using cluster executor: ${CLUSTER_HOST:-local kubectl}" +cluster_bash 'set -euo pipefail; command -v kubectl >/dev/null' + +log "reconciling non-secret activity-core runtime config" +cluster_bash "$(cat </dev/null 2>&1; then + kubectl -n $(quote "$NAMESPACE") get secret $(quote "$SECRET_NAME") -o go-template='{{ len .data }}' +else + printf missing +fi +EOF +)" +)" +if [[ "$SECRET_KEY_COUNT" == "missing" ]]; then + SECRET_STATUS="missing" + SECRET_KEY_COUNT="0" +elif [[ "${SECRET_KEY_COUNT:-0}" == "0" ]]; then + SECRET_STATUS="empty" +else + SECRET_STATUS="present" +fi +export SECRET_STATUS SECRET_KEY_COUNT + +if [[ "$SECRET_STATUS" != "present" ]]; then + EVIDENCE_STATUS="blocked" + FAILING_GATE="provider Secret ${SECRET_NAME} ${SECRET_STATUS}" + DEPLOYMENT_STATUS="not checked; provider Secret gate not satisfied" + SMOKE_STATUS="blocked" + SMOKE_SUMMARY="provider Secret must be populated outside Git/State Hub before deployment and smoke" + export EVIDENCE_STATUS FAILING_GATE DEPLOYMENT_STATUS SMOKE_STATUS SMOKE_SUMMARY + post_evidence + [[ "$REQUIRE_SMOKE" == "1" ]] && exit 1 
+ exit 0 +fi + +if [[ "$APPLY_LLM_CONNECT_OVERLAY" == "1" ]]; then + if [[ -n "$CLUSTER_HOST" ]]; then + log "syncing llm-connect overlay to ${CLUSTER_HOST}:${LLM_CONNECT_REMOTE_REPO}/deploy/k8s/activity-core-llm-connect" + ssh "$CLUSTER_HOST" "mkdir -p $(quote "$LLM_CONNECT_REMOTE_REPO")/deploy/k8s/activity-core-llm-connect" + rsync -a --delete \ + "$LLM_CONNECT_REPO/deploy/k8s/activity-core-llm-connect/" \ + "${CLUSTER_HOST}:${LLM_CONNECT_REMOTE_REPO}/deploy/k8s/activity-core-llm-connect/" + fi + log "applying llm-connect overlay" + cluster_bash "$(cat </dev/null 2>&1; then + kubectl -n $(quote "$NAMESPACE") get deploy $(quote "$DEPLOYMENT_NAME") -o jsonpath='{.status.readyReplicas}/{.status.replicas}' +else + printf missing +fi +EOF +)" +)" +export DEPLOYMENT_STATUS + +if [[ "$DEPLOYMENT_STATUS" == "missing" || "$DEPLOYMENT_STATUS" != "1/1" ]]; then + EVIDENCE_STATUS="blocked" + FAILING_GATE="llm-connect deployment not ready (${DEPLOYMENT_STATUS})" + SMOKE_STATUS="blocked" + SMOKE_SUMMARY="deployment must be ready before smoke" + export EVIDENCE_STATUS FAILING_GATE SMOKE_STATUS SMOKE_SUMMARY + post_evidence + [[ "$REQUIRE_SMOKE" == "1" ]] && exit 1 + exit 0 +fi + +log "running in-namespace llm-connect fixture smoke" +set +e +SMOKE_OUTPUT="$( + cluster_bash "$(cat <&1 +)" +SMOKE_CODE=$? 
+set -e + +if [[ "$SMOKE_CODE" == "0" ]]; then + SMOKE_STATUS="passed" + SMOKE_SUMMARY="$SMOKE_OUTPUT" + EVIDENCE_STATUS="passed" + FAILING_GATE="" +else + SMOKE_STATUS="failed" + SMOKE_SUMMARY="$(printf '%s' "$SMOKE_OUTPUT" | tail -n 5)" + EVIDENCE_STATUS="failed" + FAILING_GATE="llm-connect fixture smoke failed" +fi +export SMOKE_STATUS SMOKE_SUMMARY EVIDENCE_STATUS FAILING_GATE +post_evidence +exit "$SMOKE_CODE" + diff --git a/workplans/RAILIANCE-WP-0014-activity-core-llm-connect-live-reconcile.md b/workplans/RAILIANCE-WP-0014-activity-core-llm-connect-live-reconcile.md new file mode 100644 index 0000000..d08468b --- /dev/null +++ b/workplans/RAILIANCE-WP-0014-activity-core-llm-connect-live-reconcile.md @@ -0,0 +1,96 @@ +--- +id: RAILIANCE-WP-0014 +type: workplan +title: "activity-core llm-connect live reconcile" +domain: railiance +repo: railiance-cluster +status: blocked +owner: codex +topic_slug: railiance +created: "2026-06-18" +updated: "2026-06-18" +state_hub_workstream_id: "a152ddda-d60a-4a65-9b9c-59e2db9ff2b7" +--- + +# activity-core llm-connect live reconcile + +## Context + +activity-core has updated its Railiance runtime manifest so +`actcore-runtime-config` points at the verified in-cluster llm-connect URL: + +```text +LLM_CONNECT_URL=http://llm-connect.activity-core.svc.cluster.local:8080 +LLM_CONNECT_TIMEOUT_SECONDS=300 +``` + +The remaining live gate belongs at the cluster/operator layer. Provider +credentials must stay outside Git and State Hub, and the fixture smoke should +record only non-secret evidence. 
+ +## Add cluster-owned reconcile/check command + +```task +id: RAILIANCE-WP-0014-T01 +status: done +priority: high +state_hub_task_id: "49288db7-8102-4ad5-af08-1fe6ab3f1d37" +``` + +Add a repeatable Railiance command that: + +- reconciles the non-secret activity-core runtime config keys; +- checks the provider Secret by key count only; +- applies the llm-connect overlay only after the provider Secret exists; +- runs the in-namespace fixture smoke only after deployment readiness; +- posts a non-secret State Hub evidence note. + +2026-06-18: Added `tools/cmd/railiance-reconcile-activity-core-llm-connect` +and Makefile target `reconcile-activity-core-llm-connect`. + +## Reconcile live non-secret runtime config + +```task +id: RAILIANCE-WP-0014-T02 +status: done +priority: high +state_hub_task_id: "61df5bad-535f-4ad1-ac7a-f46ff278c388" +``` + +Patch the live `activity-core/actcore-runtime-config` ConfigMap so it consumes +the verified llm-connect service URL and timeout. Do not touch Secret values. + +2026-06-18: The reconcile command patches only `LLM_CONNECT_URL` and +`LLM_CONNECT_TIMEOUT_SECONDS`, then re-reads the live ConfigMap to verify the +values. Live evidence note `c72c514a-399e-4c54-8d5b-d36405932360` confirms +`LLM_CONNECT_URL=http://llm-connect.activity-core.svc.cluster.local:8080` and +`LLM_CONNECT_TIMEOUT_SECONDS=300`. 
+ +## Complete provider Secret, deployment, and smoke gate + +```task +id: RAILIANCE-WP-0014-T03 +status: blocked +priority: high +state_hub_task_id: "ae8af00a-c14f-4b76-933c-46d06cd360ae" +``` + +After an operator stores provider credentials in +`activity-core/llm-connect-provider-secrets`, rerun: + +```bash +make reconcile-activity-core-llm-connect +``` + +The command will apply the llm-connect overlay, wait for deployment readiness, +run the in-namespace fixture smoke with `imagePullPolicy=Never`, and post +non-secret evidence: provider Secret key count, deployment readiness, +pass/fail, latency/recommendation summary or sanitized failure. + +Current live gate on 2026-06-18: provider Secret +`activity-core/llm-connect-provider-secrets` is missing, so deployment and +smoke are intentionally blocked until operator/OpenBao-to-Kubernetes Secret +custody is complete. Evidence note +`c72c514a-399e-4c54-8d5b-d36405932360` records provider Secret status +`missing`, key count `0`, deployment status `not checked; provider Secret gate +not satisfied`, and smoke status `blocked`.