Add activity-core llm-connect reconcile gate
Some checks failed
railiance-tests / smoke (push) Has been cancelled
Some checks failed
railiance-tests / smoke (push) Has been cancelled
This commit is contained in:
5
Makefile
5
Makefile
@@ -27,6 +27,9 @@ test-ha-failover: ## Run HA failover test (D3) — kills primary PG pod, asserts
|
||||
verify-activity-core: ## Reconcile activity-core runtime and verify disabled ops inventory probe evidence
|
||||
tools/cmd/railiance-verify-activity-core
|
||||
|
||||
reconcile-activity-core-llm-connect: ## Reconcile activity-core llm-connect URL and run non-secret gate checks
|
||||
tools/cmd/railiance-reconcile-activity-core-llm-connect
|
||||
|
||||
##@ Help
|
||||
|
||||
help: ## Show this help
|
||||
@@ -34,4 +37,4 @@ help: ## Show this help
|
||||
/^[a-zA-Z_-]+:.*?##/ { printf " \033[36m%-20s\033[0m %s\n", $$1, $$2 } \
|
||||
/^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) }' $(MAKEFILE_LIST)
|
||||
|
||||
.PHONY: backup restore preflight k3s-install smoke test-ha-failover verify-activity-core help
|
||||
.PHONY: backup restore preflight k3s-install smoke test-ha-failover verify-activity-core reconcile-activity-core-llm-connect help
|
||||
|
||||
293
tools/cmd/railiance-reconcile-activity-core-llm-connect
Executable file
293
tools/cmd/railiance-reconcile-activity-core-llm-connect
Executable file
@@ -0,0 +1,293 @@
|
||||
#!/usr/bin/env bash
# Cluster-owned activity-core <-> llm-connect reconcile and non-secret evidence.
#
# All settings below are read from the environment; the value after the
# default operator is what is used when the caller does not provide one.
set -euo pipefail

# Repository root: two directories above the directory holding this script.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# Where to operate.
NAMESPACE="${ACTIVITY_CORE_NAMESPACE:-activity-core}"
# SSH host that runs kubectl. Uses "-" (not ":-") on purpose: an explicitly
# empty ACTIVITY_CORE_CLUSTER_HOST selects local kubectl, while an unset
# variable still falls back to railiance01.
CLUSTER_HOST="${ACTIVITY_CORE_CLUSTER_HOST-railiance01}"
STATE_HUB_URL="${STATE_HUB_URL:-http://127.0.0.1:8000}"

# Desired non-secret runtime config and the names of the objects being gated.
EXPECTED_URL="${LLM_CONNECT_URL:-http://llm-connect.activity-core.svc.cluster.local:8080}"
EXPECTED_TIMEOUT="${LLM_CONNECT_TIMEOUT_SECONDS:-300}"
SECRET_NAME="${LLM_CONNECT_PROVIDER_SECRET_NAME:-llm-connect-provider-secrets}"
DEPLOYMENT_NAME="${LLM_CONNECT_DEPLOYMENT_NAME:-llm-connect}"

# Local checkout of llm-connect and (optionally) its location on the cluster
# host; an empty remote path is resolved later in the script.
LLM_CONNECT_REPO="${LLM_CONNECT_REPO:-/home/worsch/llm-connect}"
LLM_CONNECT_REMOTE_REPO="${LLM_CONNECT_REMOTE_REPO:-}"

# Feature switches: "1" applies the kustomize overlay / makes a blocked or
# failed smoke gate fatal.
APPLY_LLM_CONNECT_OVERLAY="${APPLY_LLM_CONNECT_OVERLAY:-1}"
REQUIRE_SMOKE="${REQUIRE_LLM_CONNECT_SMOKE:-0}"

# Optional State Hub identifiers attached to the evidence note.
EVIDENCE_WORKSTREAM_ID="${STATE_HUB_EVIDENCE_WORKSTREAM_ID:-}"
EVIDENCE_TASK_ID="${STATE_HUB_EVIDENCE_TASK_ID:-}"
|
||||
|
||||
# Emit the merge-patch document for the runtime ConfigMap.
# Arguments: $1 - llm-connect URL, $2 - timeout in seconds (kept as a string).
# Outputs:   one line of JSON on stdout.
# The values travel through the environment and json.dumps so that no shell
# quoting can corrupt the JSON.
build_patch_json() {
  EXPECTED_URL="$1" EXPECTED_TIMEOUT="$2" python3 - <<'PY'
import json
import os

print(json.dumps({
    "data": {
        "LLM_CONNECT_URL": os.environ["EXPECTED_URL"],
        "LLM_CONNECT_TIMEOUT_SECONDS": os.environ["EXPECTED_TIMEOUT"],
    }
}))
PY
}

PATCH_JSON="$(build_patch_json "$EXPECTED_URL" "$EXPECTED_TIMEOUT")"
|
||||
|
||||
# Evidence fields start at their "nothing observed yet" values; each gate
# further down overwrites and re-exports the ones it learns about.
LIVE_URL=""
LIVE_TIMEOUT=""
SECRET_STATUS="unknown"
SECRET_KEY_COUNT="0"
DEPLOYMENT_STATUS="unknown"
SMOKE_STATUS="skipped"
SMOKE_SUMMARY=""
EVIDENCE_STATUS="passed"
FAILING_GATE=""

# post_evidence runs python3 as a child process and reads everything from
# os.environ, so all configuration and evidence variables must be exported.
export NAMESPACE CLUSTER_HOST STATE_HUB_URL EXPECTED_URL EXPECTED_TIMEOUT
export SECRET_NAME DEPLOYMENT_NAME LLM_CONNECT_REPO LLM_CONNECT_REMOTE_REPO
export APPLY_LLM_CONNECT_OVERLAY REQUIRE_SMOKE EVIDENCE_WORKSTREAM_ID EVIDENCE_TASK_ID
export LIVE_URL LIVE_TIMEOUT SECRET_STATUS SECRET_KEY_COUNT DEPLOYMENT_STATUS
export SMOKE_STATUS SMOKE_SUMMARY EVIDENCE_STATUS FAILING_GATE
|
||||
|
||||
# Print a progress line prefixed with the tool tag.
# Arguments: all arguments are joined with single spaces into the message.
# Outputs:   one line on stdout.
log() {
  printf '[activity-core-llm-connect] %s\n' "$*"
}
|
||||
|
||||
# Shell-escape a single value so it can be embedded in a script that is later
# re-parsed by bash (locally or on the cluster host).
# Arguments: $1 - value to escape.
# Outputs:   the escaped value on stdout, without a trailing newline.
quote() {
  printf '%q' "$1"
}
|
||||
|
||||
# Run a bash script on the cluster executor.
# Globals:   CLUSTER_HOST (read) - SSH host; empty means run locally.
# Arguments: $1 - script text, fed to "bash -s" on stdin.
# Outputs:   whatever the script writes to stdout/stderr.
# Returns:   the exit status of ssh or of the local bash.
cluster_bash() {
  local script="$1"
  if [[ -n "$CLUSTER_HOST" ]]; then
    ssh "$CLUSTER_HOST" "bash -s" <<<"$script"
  else
    bash -s <<<"$script"
  fi
}
|
||||
|
||||
# Post the current gate results to State Hub as a progress note.
# Globals (read, must be exported): EVIDENCE_STATUS, FAILING_GATE,
#   CLUSTER_HOST, NAMESPACE, EXPECTED_URL, EXPECTED_TIMEOUT, LIVE_URL,
#   LIVE_TIMEOUT, SECRET_NAME, SECRET_STATUS, SECRET_KEY_COUNT,
#   DEPLOYMENT_NAME, DEPLOYMENT_STATUS, SMOKE_STATUS, SMOKE_SUMMARY,
#   EVIDENCE_WORKSTREAM_ID, EVIDENCE_TASK_ID, STATE_HUB_URL.
# Outputs:   the HTTP response body on stdout.
# Returns:   python3's exit status; any exception (missing variable, HTTP or
#            connection error) makes it non-zero.
# Only names, statuses and counts are sent; no Secret value is read here.
post_evidence() {
  python3 - <<'PY'
import json
import os
import sys
import urllib.request

status = os.environ["EVIDENCE_STATUS"]
detail = {
    "producer": "railiance-cluster",
    "verification": "activity-core llm-connect live reconcile",
    "status": status,
    "failing_gate": os.environ.get("FAILING_GATE") or None,
    "cluster_host": os.environ.get("CLUSTER_HOST") or "local-kubectl",
    "namespace": os.environ["NAMESPACE"],
    "expected_url": os.environ["EXPECTED_URL"],
    "expected_timeout_seconds": os.environ["EXPECTED_TIMEOUT"],
    "live_url": os.environ.get("LIVE_URL") or None,
    "live_timeout_seconds": os.environ.get("LIVE_TIMEOUT") or None,
    "provider_secret": {
        "name": os.environ["SECRET_NAME"],
        "status": os.environ.get("SECRET_STATUS"),
        "key_count": int(os.environ.get("SECRET_KEY_COUNT") or "0"),
    },
    "deployment": {
        "name": os.environ["DEPLOYMENT_NAME"],
        "status": os.environ.get("DEPLOYMENT_STATUS"),
    },
    "smoke": {
        "status": os.environ.get("SMOKE_STATUS"),
        "summary": os.environ.get("SMOKE_SUMMARY") or None,
    },
}

# One human-readable sentence per outcome; anything that is neither
# "passed" nor "blocked" is reported as a failure.
if status == "passed":
    summary = (
        "Railiance activity-core llm-connect reconcile passed: runtime config, "
        "provider Secret, deployment, and smoke gate are all healthy."
    )
elif status == "blocked":
    summary = (
        "Railiance activity-core llm-connect reconcile is blocked: "
        f"{os.environ.get('FAILING_GATE') or 'operator gate'}."
    )
else:
    summary = (
        "Railiance activity-core llm-connect reconcile failed"
        + (f" at {os.environ.get('FAILING_GATE')}" if os.environ.get("FAILING_GATE") else "")
        + "."
    )

payload = {
    "summary": summary,
    "event_type": "note",
    "author": "railiance-cluster",
    "detail": detail,
}
# Link the note to a workstream/task only when the caller supplied the IDs.
if os.environ.get("EVIDENCE_WORKSTREAM_ID"):
    payload["workstream_id"] = os.environ["EVIDENCE_WORKSTREAM_ID"]
if os.environ.get("EVIDENCE_TASK_ID"):
    payload["task_id"] = os.environ["EVIDENCE_TASK_ID"]

body = json.dumps(payload).encode("utf-8")
req = urllib.request.Request(
    os.environ["STATE_HUB_URL"].rstrip("/") + "/progress/",
    data=body,
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req, timeout=20) as resp:
    sys.stdout.write(resp.read().decode("utf-8"))
PY
}
|
||||
|
||||
# Resolve where the llm-connect checkout lives on the executor when the caller
# did not say: "<remote working directory>/llm-connect" over SSH (the directory
# "ssh HOST pwd" reports), or the local checkout when kubectl runs locally.
if [[ -z "$LLM_CONNECT_REMOTE_REPO" ]]; then
  if [[ -n "$CLUSTER_HOST" ]]; then
    LLM_CONNECT_REMOTE_REPO="$(ssh "$CLUSTER_HOST" pwd)/llm-connect"
  else
    LLM_CONNECT_REMOTE_REPO="$LLM_CONNECT_REPO"
  fi
fi
export LLM_CONNECT_REMOTE_REPO
|
||||
|
||||
# --- Gate 1: non-secret runtime config -------------------------------------
log "using cluster executor: ${CLUSTER_HOST:-local kubectl}"
# Fail early (via set -e) when the executor has no kubectl on its PATH.
cluster_bash 'set -euo pipefail; command -v kubectl >/dev/null'

log "reconciling non-secret activity-core runtime config"
# Merge-patch only the two llm-connect keys; other ConfigMap keys are untouched.
cluster_bash "$(cat <<EOF
set -euo pipefail
kubectl -n $(quote "$NAMESPACE") patch configmap actcore-runtime-config --type merge -p $(quote "$PATCH_JSON")
EOF
)"

# Read one key of the live ConfigMap back from the cluster.
# Arguments: $1 - key under .data.
# Outputs:   the key's live value on stdout (empty when absent).
live_config_value() {
  local key="$1"
  cluster_bash "$(cat <<EOF
set -euo pipefail
kubectl -n $(quote "$NAMESPACE") get cm actcore-runtime-config -o jsonpath='{.data.${key}}'
EOF
)"
}

LIVE_URL="$(live_config_value LLM_CONNECT_URL)"
LIVE_TIMEOUT="$(live_config_value LLM_CONNECT_TIMEOUT_SECONDS)"
export LIVE_URL LIVE_TIMEOUT

# The patch is only trusted once the re-read values match what was requested.
if [[ "$LIVE_URL" != "$EXPECTED_URL" || "$LIVE_TIMEOUT" != "$EXPECTED_TIMEOUT" ]]; then
  EVIDENCE_STATUS="failed"
  FAILING_GATE="runtime config mismatch"
  export EVIDENCE_STATUS FAILING_GATE
  log "runtime config mismatch: expected ${EXPECTED_URL} / ${EXPECTED_TIMEOUT}, live ${LIVE_URL:-<unset>} / ${LIVE_TIMEOUT:-<unset>}"
  post_evidence
  exit 1
fi
|
||||
|
||||
# --- Gate 2: provider Secret (existence and key count only) -----------------
# Prints the number of keys under .data, or the word "missing" when the Secret
# does not exist. The template guards .data because a Secret without any keys
# has no data field at all, and "len" on a missing field makes kubectl fail,
# which would abort the script instead of reporting an empty Secret.
SECRET_KEY_COUNT="$(
  cluster_bash "$(cat <<EOF
set -euo pipefail
if kubectl -n $(quote "$NAMESPACE") get secret $(quote "$SECRET_NAME") >/dev/null 2>&1; then
  kubectl -n $(quote "$NAMESPACE") get secret $(quote "$SECRET_NAME") -o go-template='{{ if .data }}{{ len .data }}{{ else }}0{{ end }}'
else
  printf missing
fi
EOF
)"
)"

# Translate the probe output into a status plus a numeric key count.
case "$SECRET_KEY_COUNT" in
  missing)
    SECRET_STATUS="missing"
    SECRET_KEY_COUNT="0"
    ;;
  "" | 0)
    SECRET_STATUS="empty"
    SECRET_KEY_COUNT="0"
    ;;
  *)
    SECRET_STATUS="present"
    ;;
esac
export SECRET_STATUS SECRET_KEY_COUNT

# Without credentials the deployment and smoke steps cannot succeed, so record
# a "blocked" note and stop. This is only an error when smoke is required.
if [[ "$SECRET_STATUS" != "present" ]]; then
  EVIDENCE_STATUS="blocked"
  FAILING_GATE="provider Secret ${SECRET_NAME} ${SECRET_STATUS}"
  DEPLOYMENT_STATUS="not checked; provider Secret gate not satisfied"
  SMOKE_STATUS="blocked"
  SMOKE_SUMMARY="provider Secret must be populated outside Git/State Hub before deployment and smoke"
  export EVIDENCE_STATUS FAILING_GATE DEPLOYMENT_STATUS SMOKE_STATUS SMOKE_SUMMARY
  log "blocked: ${FAILING_GATE}"
  post_evidence
  if [[ "$REQUIRE_SMOKE" == "1" ]]; then
    exit 1
  fi
  exit 0
fi
|
||||
|
||||
# --- Optional: sync and apply the llm-connect kustomize overlay -------------
if [[ "$APPLY_LLM_CONNECT_OVERLAY" == "1" ]]; then
  if [[ -n "$CLUSTER_HOST" ]]; then
    # kubectl runs on the cluster host, so the overlay directory has to exist
    # there; --delete keeps the remote copy an exact mirror of the local one.
    log "syncing llm-connect overlay to ${CLUSTER_HOST}:${LLM_CONNECT_REMOTE_REPO}/deploy/k8s/activity-core-llm-connect"
    ssh "$CLUSTER_HOST" "mkdir -p $(quote "$LLM_CONNECT_REMOTE_REPO")/deploy/k8s/activity-core-llm-connect"
    rsync -a --delete \
      "$LLM_CONNECT_REPO/deploy/k8s/activity-core-llm-connect/" \
      "${CLUSTER_HOST}:${LLM_CONNECT_REMOTE_REPO}/deploy/k8s/activity-core-llm-connect/"
  fi

  log "applying llm-connect overlay"
  # Apply, then wait up to 180s for the rollout; a timeout fails the script.
  cluster_bash "$(cat <<EOF
set -euo pipefail
kubectl apply -k $(quote "$LLM_CONNECT_REMOTE_REPO")/deploy/k8s/activity-core-llm-connect
kubectl -n $(quote "$NAMESPACE") rollout status deploy/$(quote "$DEPLOYMENT_NAME") --timeout=180s
EOF
)"
fi
|
||||
|
||||
# --- Gate 3: deployment readiness -------------------------------------------
# Prints "<readyReplicas>/<replicas>" from the Deployment status, or the word
# "missing" when the Deployment does not exist. readyReplicas is absent while
# no pod is ready, which yields e.g. "/1".
DEPLOYMENT_STATUS="$(
  cluster_bash "$(cat <<EOF
set -euo pipefail
if kubectl -n $(quote "$NAMESPACE") get deploy $(quote "$DEPLOYMENT_NAME") >/dev/null 2>&1; then
  kubectl -n $(quote "$NAMESPACE") get deploy $(quote "$DEPLOYMENT_NAME") -o jsonpath='{.status.readyReplicas}/{.status.replicas}'
else
  printf missing
fi
EOF
)"
)"
export DEPLOYMENT_STATUS

# Ready means at least one replica and every replica ready ("N/N", N >= 1).
# Comparing against a literal "1/1" would report a healthy multi-replica
# Deployment as blocked forever. "missing" and "/1" fail the numeric check.
ready_replicas="${DEPLOYMENT_STATUS%%/*}"
total_replicas="${DEPLOYMENT_STATUS##*/}"
deployment_ready=0
if [[ "$ready_replicas" =~ ^[1-9][0-9]*$ && "$ready_replicas" == "$total_replicas" ]]; then
  deployment_ready=1
fi

if [[ "$deployment_ready" != "1" ]]; then
  EVIDENCE_STATUS="blocked"
  FAILING_GATE="llm-connect deployment not ready (${DEPLOYMENT_STATUS})"
  SMOKE_STATUS="blocked"
  SMOKE_SUMMARY="deployment must be ready before smoke"
  export EVIDENCE_STATUS FAILING_GATE SMOKE_STATUS SMOKE_SUMMARY
  log "blocked: ${FAILING_GATE}"
  post_evidence
  if [[ "$REQUIRE_SMOKE" == "1" ]]; then
    exit 1
  fi
  exit 0
fi
|
||||
|
||||
# --- Gate 4: in-namespace fixture smoke --------------------------------------
log "running in-namespace llm-connect fixture smoke"

# One-shot pod from the locally present image (pull policy Never), removed
# after it exits. "\$(date +%s)" is escaped so the pod-name suffix is computed
# on the executor, and the doubled backslashes become line continuations in
# the generated script.
smoke_script="$(cat <<EOF
set -euo pipefail
kubectl -n $(quote "$NAMESPACE") run llm-connect-smoke-\$(date +%s) \\
  --rm -i --restart=Never \\
  --image=llm-connect:latest \\
  --image-pull-policy=Never \\
  --env=LLM_CONNECT_URL=$(quote "$EXPECTED_URL") \\
  --env=LLM_CONNECT_TIMEOUT_SECONDS=$(quote "$EXPECTED_TIMEOUT") \\
  -- python scripts/smoke_activity_core_endpoint.py
EOF
)"

# Capture stdout+stderr and the exit status without tripping set -e, so the
# outcome can still be recorded as evidence when the smoke fails.
SMOKE_CODE=0
SMOKE_OUTPUT="$(cluster_bash "$smoke_script" 2>&1)" || SMOKE_CODE=$?

if [[ "$SMOKE_CODE" == "0" ]]; then
  SMOKE_STATUS="passed"
  SMOKE_SUMMARY="$SMOKE_OUTPUT"
  EVIDENCE_STATUS="passed"
  FAILING_GATE=""
else
  SMOKE_STATUS="failed"
  # Keep only the last five lines of a failing run in the evidence note.
  SMOKE_SUMMARY="$(printf '%s' "$SMOKE_OUTPUT" | tail -n 5)"
  EVIDENCE_STATUS="failed"
  FAILING_GATE="llm-connect fixture smoke failed"
fi
export SMOKE_STATUS SMOKE_SUMMARY EVIDENCE_STATUS FAILING_GATE
post_evidence
# The script's exit status mirrors the smoke result.
exit "$SMOKE_CODE"
|
||||
|
||||
@@ -0,0 +1,96 @@
|
||||
---
|
||||
id: RAILIANCE-WP-0014
|
||||
type: workplan
|
||||
title: "activity-core llm-connect live reconcile"
|
||||
domain: railiance
|
||||
repo: railiance-cluster
|
||||
status: blocked
|
||||
owner: codex
|
||||
topic_slug: railiance
|
||||
created: "2026-06-18"
|
||||
updated: "2026-06-18"
|
||||
state_hub_workstream_id: "a152ddda-d60a-4a65-9b9c-59e2db9ff2b7"
|
||||
---
|
||||
|
||||
# activity-core llm-connect live reconcile
|
||||
|
||||
## Context
|
||||
|
||||
activity-core has updated its Railiance runtime manifest so
|
||||
`actcore-runtime-config` points at the verified in-cluster llm-connect URL:
|
||||
|
||||
```text
|
||||
LLM_CONNECT_URL=http://llm-connect.activity-core.svc.cluster.local:8080
|
||||
LLM_CONNECT_TIMEOUT_SECONDS=300
|
||||
```
|
||||
|
||||
The remaining live gate belongs at the cluster/operator layer. Provider
|
||||
credentials must stay outside Git and State Hub, and the fixture smoke should
|
||||
record only non-secret evidence.
|
||||
|
||||
## Add cluster-owned reconcile/check command
|
||||
|
||||
```task
|
||||
id: RAILIANCE-WP-0014-T01
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "49288db7-8102-4ad5-af08-1fe6ab3f1d37"
|
||||
```
|
||||
|
||||
Add a repeatable Railiance command that:
|
||||
|
||||
- reconciles the non-secret activity-core runtime config keys;
|
||||
- checks the provider Secret by key count only;
|
||||
- applies the llm-connect overlay only after the provider Secret exists;
|
||||
- runs the in-namespace fixture smoke only after deployment readiness;
|
||||
- posts a non-secret State Hub evidence note.
|
||||
|
||||
2026-06-18: Added `tools/cmd/railiance-reconcile-activity-core-llm-connect`
|
||||
and Makefile target `reconcile-activity-core-llm-connect`.
|
||||
|
||||
## Reconcile live non-secret runtime config
|
||||
|
||||
```task
|
||||
id: RAILIANCE-WP-0014-T02
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "61df5bad-535f-4ad1-ac7a-f46ff278c388"
|
||||
```
|
||||
|
||||
Patch the live `activity-core/actcore-runtime-config` ConfigMap so it consumes
|
||||
the verified llm-connect service URL and timeout. Do not touch Secret values.
|
||||
|
||||
2026-06-18: The reconcile command patches only `LLM_CONNECT_URL` and
|
||||
`LLM_CONNECT_TIMEOUT_SECONDS`, then re-reads the live ConfigMap to verify the
|
||||
values. Live evidence note `c72c514a-399e-4c54-8d5b-d36405932360` confirms
|
||||
`LLM_CONNECT_URL=http://llm-connect.activity-core.svc.cluster.local:8080` and
|
||||
`LLM_CONNECT_TIMEOUT_SECONDS=300`.
|
||||
|
||||
## Complete provider Secret, deployment, and smoke gate
|
||||
|
||||
```task
|
||||
id: RAILIANCE-WP-0014-T03
|
||||
status: blocked
|
||||
priority: high
|
||||
state_hub_task_id: "ae8af00a-c14f-4b76-933c-46d06cd360ae"
|
||||
```
|
||||
|
||||
After an operator stores provider credentials in
|
||||
`activity-core/llm-connect-provider-secrets`, rerun:
|
||||
|
||||
```bash
|
||||
make reconcile-activity-core-llm-connect
|
||||
```
|
||||
|
||||
The command will apply the llm-connect overlay, wait for deployment readiness,
|
||||
run the in-namespace fixture smoke with `imagePullPolicy=Never`, and post
|
||||
non-secret evidence: provider Secret key count, deployment readiness,
|
||||
pass/fail, latency/recommendation summary or sanitized failure.
|
||||
|
||||
Current live gate on 2026-06-18: provider Secret
|
||||
`activity-core/llm-connect-provider-secrets` is missing, so deployment and
|
||||
smoke are intentionally blocked until operator/OpenBao-to-Kubernetes Secret
|
||||
custody is complete. Evidence note
|
||||
`c72c514a-399e-4c54-8d5b-d36405932360` records provider Secret status
|
||||
`missing`, key count `0`, deployment status `not checked; provider Secret gate
|
||||
not satisfied`, and smoke status `blocked`.
|
||||
Reference in New Issue
Block a user