Add activity-core cluster verifier
Some checks failed
railiance-tests / smoke (push) Has been cancelled
Some checks failed
railiance-tests / smoke (push) Has been cancelled
This commit is contained in:
5
Makefile
5
Makefile
@@ -24,6 +24,9 @@ smoke: ## Run Kubernetes smoke tests
|
|||||||
test-ha-failover: ## Run HA failover test (D3) — kills primary PG pod, asserts recovery
|
test-ha-failover: ## Run HA failover test (D3) — kills primary PG pod, asserts recovery
|
||||||
bash tests/test_ha_failover.sh $(if $(GITEA_URL),$(GITEA_URL),)
|
bash tests/test_ha_failover.sh $(if $(GITEA_URL),$(GITEA_URL),)
|
||||||
|
|
||||||
|
# Runs the cluster-owned verifier script; the "##" text is what the help target prints.
verify-activity-core: ## Reconcile activity-core runtime and verify disabled ops inventory probe evidence
	tools/cmd/railiance-verify-activity-core
|
||||||
|
|
||||||
##@ Help
|
##@ Help
|
||||||
|
|
||||||
help: ## Show this help
|
help: ## Show this help
|
||||||
@@ -31,4 +34,4 @@ help: ## Show this help
|
|||||||
/^[a-zA-Z_-]+:.*?##/ { printf " \033[36m%-20s\033[0m %s\n", $$1, $$2 } \
|
/^[a-zA-Z_-]+:.*?##/ { printf " \033[36m%-20s\033[0m %s\n", $$1, $$2 } \
|
||||||
/^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) }' $(MAKEFILE_LIST)
|
/^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) }' $(MAKEFILE_LIST)
|
||||||
|
|
||||||
.PHONY: backup restore preflight k3s-install smoke test-ha-failover help
|
.PHONY: backup restore preflight k3s-install smoke test-ha-failover verify-activity-core help
|
||||||
|
|||||||
384
tools/cmd/railiance-verify-activity-core
Executable file
384
tools/cmd/railiance-verify-activity-core
Executable file
@@ -0,0 +1,384 @@
|
|||||||
|
#!/usr/bin/env bash
# Cluster-owned activity-core runtime reconcile and ops inventory probe evidence path.
#
# Every setting below can be overridden from the environment; the text after
# ":-" is the default used when the variable is unset or empty.
set -euo pipefail

# Directory two levels above this script.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"

# Kubernetes namespace and the activity definition that is verified.
NAMESPACE="${ACTIVITY_CORE_NAMESPACE:-activity-core}"
DEFINITION_ID="${ACTIVITY_CORE_OPS_DEFINITION_ID:-40d15a87-7ff6-4d8e-992c-37df15f95110}"
DEFINITION_SLUG="${ACTIVITY_CORE_OPS_DEFINITION_SLUG:-ops-service-inventory-probes}"
DEFINITION_NAME="${ACTIVITY_CORE_OPS_DEFINITION_NAME:-Ops Service Inventory Probes}"

# State Hub endpoint and how long / how often to poll it for probe progress.
STATE_HUB_URL="${STATE_HUB_URL:-http://127.0.0.1:8000}"
STATE_HUB_PROGRESS_TIMEOUT_SECONDS="${STATE_HUB_PROGRESS_TIMEOUT_SECONDS:-180}"
STATE_HUB_PROGRESS_POLL_SECONDS="${STATE_HUB_PROGRESS_POLL_SECONDS:-5}"

# Local checkout, checkout path on the executor (derived later when empty),
# SSH executor host ("local" means use the current kubectl context), whether to
# rsync the runtime bundle (1/true/yes, 0/false/no, auto) and whether to
# restart the deployments ("1" enables it).
ACTIVITY_CORE_REPO="${ACTIVITY_CORE_REPO:-/home/worsch/activity-core}"
ACTIVITY_CORE_REMOTE_REPO="${ACTIVITY_CORE_REMOTE_REPO:-}"
ACTIVITY_CORE_CLUSTER_HOST="${ACTIVITY_CORE_CLUSTER_HOST:-railiance01}"
ACTIVITY_CORE_SYNC_RUNTIME_BUNDLE="${ACTIVITY_CORE_SYNC_RUNTIME_BUNDLE:-auto}"
ACTIVITY_CORE_RESTART_DEPLOYMENTS="${ACTIVITY_CORE_RESTART_DEPLOYMENTS:-0}"
|
||||||
|
# "local" selects the current kubectl context; the rest of the script
# recognises that mode by an empty host.
if [[ "${ACTIVITY_CORE_CLUSTER_HOST:-}" == "local" ]]; then
  ACTIVITY_CORE_CLUSTER_HOST=""
fi

# When no executor-side checkout path was given, derive one: for an SSH
# executor it is "activity-core" under the directory an SSH session starts in,
# otherwise it is the local checkout itself.
if [[ -z "${ACTIVITY_CORE_REMOTE_REPO:-}" ]]; then
  if [[ -n "${ACTIVITY_CORE_CLUSTER_HOST:-}" ]]; then
    remote_start_dir="$(ssh "$ACTIVITY_CORE_CLUSTER_HOST" pwd)"
    # An empty answer would turn the path into "/activity-core", which later
    # becomes a mkdir -p and rsync --delete target; refuse to continue instead.
    if [[ -z "$remote_start_dir" ]]; then
      printf 'could not determine remote working directory on %s\n' "$ACTIVITY_CORE_CLUSTER_HOST" >&2
      exit 1
    fi
    ACTIVITY_CORE_REMOTE_REPO="${remote_start_dir}/activity-core"
  else
    ACTIVITY_CORE_REMOTE_REPO="${ACTIVITY_CORE_REPO:-}"
  fi
fi
|
||||||
|
|
||||||
|
# State Hub workstream/task the evidence note is attached to, and the
# Inter-Hub submission status/reason that is recorded in the note.
EVIDENCE_WORKSTREAM_ID="${STATE_HUB_EVIDENCE_WORKSTREAM_ID:-c91a0946-92f9-4b41-8a92-005b29952916}"
EVIDENCE_TASK_ID="${STATE_HUB_EVIDENCE_TASK_ID:-d15fc947-3fbe-4269-93c6-d98577352149}"
INTER_HUB_SUBMISSION_STATUS="${INTER_HUB_SUBMISSION_STATUS:-deferred}"
INTER_HUB_DEFER_REASON="${INTER_HUB_DEFER_REASON:-ops-hub key custody and Inter-Hub production intake remain operator-gated; State Hub fallback evidence is accepted for this handoff}"

# Run state. STARTED_AT (UTC) is the cut-off used when looking for fresh
# progress events; CURRENT_GATE names the step in flight and is reported when a
# step fails; the remaining values are filled in as gates complete.
STARTED_AT="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
CURRENT_GATE="startup"
REMOTE_REVISION=""
API_IMAGE=""
SYNC_STATUS_JSON=""
DEFINITION_JSON=""
TRIGGER_JSON=""
PROGRESS_JSON=""
EVIDENCE_NOTE_JSON=""

# The embedded Python helpers read their inputs from the environment.
export NAMESPACE DEFINITION_ID DEFINITION_SLUG DEFINITION_NAME
export STATE_HUB_URL EVIDENCE_WORKSTREAM_ID EVIDENCE_TASK_ID
export STATE_HUB_PROGRESS_TIMEOUT_SECONDS STATE_HUB_PROGRESS_POLL_SECONDS
export INTER_HUB_SUBMISSION_STATUS INTER_HUB_DEFER_REASON STARTED_AT
export ACTIVITY_CORE_CLUSTER_HOST ACTIVITY_CORE_REMOTE_REPO
export ACTIVITY_CORE_SYNC_RUNTIME_BUNDLE ACTIVITY_CORE_RESTART_DEPLOYMENTS
export REMOTE_REVISION API_IMAGE SYNC_STATUS_JSON DEFINITION_JSON TRIGGER_JSON PROGRESS_JSON
|
||||||
|
|
||||||
|
# Print a progress line on stdout, prefixed with the verifier tag.
# Arguments: message words; they are joined by the first character of IFS.
log() {
  printf '[activity-core-verify] %s\n' "$*"
}
|
||||||
|
|
||||||
|
# Write $1 to stdout escaped with printf %q, so it can be spliced into
# generated shell text and read back as a single word.
quote() {
  printf '%q' "$1"
}
|
||||||
|
|
||||||
|
# Run a shell script on the cluster executor.
# Globals:   ACTIVITY_CORE_CLUSTER_HOST (read) - SSH host; empty means run locally
# Arguments: $1 - script text, fed to "bash -s" on stdin
# Outputs:   whatever the script writes to stdout/stderr
# Returns:   exit status of ssh (remote mode) or of the local bash
cluster_bash() {
  local script="$1"
  if [[ -n "$ACTIVITY_CORE_CLUSTER_HOST" ]]; then
    ssh "$ACTIVITY_CORE_CLUSTER_HOST" "bash -s" <<<"$script"
  else
    bash -s <<<"$script"
  fi
}
|
||||||
|
|
||||||
|
# Decide whether the local runtime bundle should be copied to the executor.
# Globals:   ACTIVITY_CORE_SYNC_RUNTIME_BUNDLE, ACTIVITY_CORE_CLUSTER_HOST,
#            ACTIVITY_CORE_REPO (all read)
# Returns:   0 to sync, 1 to skip. In "auto" mode it syncs only when an SSH
#            executor is configured and the local k8s/railiance directory exists.
# Exits:     with status 2 when the setting has an unrecognised value.
should_sync_runtime_bundle() {
  case "$ACTIVITY_CORE_SYNC_RUNTIME_BUNDLE" in
    1|true|yes) return 0 ;;
    0|false|no) return 1 ;;
    auto)
      [[ -n "$ACTIVITY_CORE_CLUSTER_HOST" && -d "$ACTIVITY_CORE_REPO/k8s/railiance" ]]
      # Bare return hands the status of the test above back to the caller.
      return
      ;;
    *)
      printf 'invalid ACTIVITY_CORE_SYNC_RUNTIME_BUNDLE=%s\n' "$ACTIVITY_CORE_SYNC_RUNTIME_BUNDLE" >&2
      exit 2
      ;;
  esac
}
|
||||||
|
|
||||||
|
# Post a non-secret evidence note for this run to State Hub.
# Globals:   exports EVIDENCE_STATUS and FAILING_GATE; the embedded Python reads
#            STATE_HUB_URL (required) plus the exported run state and
#            configuration variables from the environment.
# Arguments: $1 - run status ("passed" selects the success summary; anything
#                 else produces the failure summary)
#            $2 - name of the failing gate (optional)
# Outputs:   State Hub response body on stdout; an error line on stderr when
#            the request fails
# Returns:   exit status of python3 (non-zero when the POST fails)
post_evidence() {
  local status="$1"
  local failing_gate="${2:-}"
  export EVIDENCE_STATUS="$status"
  export FAILING_GATE="$failing_gate"
  python3 - <<'PY'
import json
import os
import sys
import urllib.request


def from_json_env(name):
    # Decode a JSON document held in an environment variable. Empty or unset
    # yields None; text that is not valid JSON is kept under a "raw" key.
    raw = os.environ.get(name, "")
    if not raw:
        return None
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        return {"raw": raw}


status = os.environ["EVIDENCE_STATUS"]
failing_gate = os.environ.get("FAILING_GATE") or None
definition = from_json_env("DEFINITION_JSON")
trigger = from_json_env("TRIGGER_JSON")
progress = from_json_env("PROGRESS_JSON")
sync_status = from_json_env("SYNC_STATUS_JSON")

detail = {
    "producer": "railiance-cluster",
    "verification": "activity-core cluster-owned deploy/verify",
    "status": status,
    "failing_gate": failing_gate,
    "cluster_host": os.environ.get("ACTIVITY_CORE_CLUSTER_HOST") or "local-kubectl",
    "namespace": os.environ.get("NAMESPACE"),
    "activity_core_repo": os.environ.get("ACTIVITY_CORE_REMOTE_REPO"),
    "activity_core_revision": os.environ.get("REMOTE_REVISION") or None,
    "api_image": os.environ.get("API_IMAGE") or None,
    "runtime_bundle": "k8s/railiance/20-runtime.yaml",
    "sync_job": sync_status,
    "definition": definition,
    "manual_trigger": trigger,
    "state_hub_progress": progress,
    "inter_hub_submission": {
        "status": os.environ.get("INTER_HUB_SUBMISSION_STATUS"),
        "reason": os.environ.get("INTER_HUB_DEFER_REASON"),
    },
    "started_at": os.environ.get("STARTED_AT"),
}

if status == "passed":
    summary = (
        "Railiance activity-core deploy/verify passed: runtime reconciled, "
        "actcore-sync completed, ops-service-inventory-probes remains disabled, "
        f"manual trigger {trigger.get('workflow_id') if isinstance(trigger, dict) else 'unknown'} ran, "
        f"and State Hub ops_inventory_probe progress {progress.get('id') if isinstance(progress, dict) else 'unknown'} exists."
    )
else:
    summary = (
        "Railiance activity-core deploy/verify failed"
        + (f" at {failing_gate}" if failing_gate else "")
        + "; see non-secret evidence detail for the last completed gate."
    )

payload = {
    "summary": summary,
    "event_type": "note",
    "author": "railiance-cluster",
    "detail": detail,
}
# Attach the note to a workstream/task only when ids are configured.
if os.environ.get("EVIDENCE_WORKSTREAM_ID"):
    payload["workstream_id"] = os.environ["EVIDENCE_WORKSTREAM_ID"]
if os.environ.get("EVIDENCE_TASK_ID"):
    payload["task_id"] = os.environ["EVIDENCE_TASK_ID"]

body = json.dumps(payload).encode("utf-8")
req = urllib.request.Request(
    os.environ["STATE_HUB_URL"].rstrip("/") + "/progress/",
    data=body,
    headers={"Content-Type": "application/json"},
    method="POST",
)
try:
    with urllib.request.urlopen(req, timeout=20) as resp:
        sys.stdout.write(resp.read().decode("utf-8"))
except Exception as exc:
    # Report briefly, then re-raise so the interpreter exits non-zero.
    sys.stderr.write(f"failed to post State Hub evidence note: {exc}\n")
    raise
PY
}
|
||||||
|
|
||||||
|
# ERR trap handler: record a "failed" evidence note naming the gate that was
# running, then exit with the status of the command that failed.
# Globals: CURRENT_GATE (read)
on_error() {
  local code=$?
  # With errtrace the trap also fires inside command substitutions and
  # subshells. Do nothing there: the failure propagates to the top-level shell,
  # which posts the note exactly once.
  if (( BASH_SUBSHELL > 0 )); then
    return
  fi
  trap - ERR
  # Best effort: a State Hub outage must not mask the original failure status.
  post_evidence "failed" "$CURRENT_GATE" >/dev/null || true
  exit "$code"
}

# Without errtrace an ERR trap is not inherited by shell functions, so under
# "set -e" a command failing inside a directly called function (for example
# cluster_bash) would end the script without ever running on_error.
set -o errtrace
trap on_error ERR
|
||||||
|
|
||||||
|
# Gate: the executor (SSH host or local shell) must have kubectl on its PATH.
CURRENT_GATE="cluster executor preflight"
log "using cluster executor: ${ACTIVITY_CORE_CLUSTER_HOST:-local kubectl}"
cluster_bash "$(cat <<EOF
set -euo pipefail
command -v kubectl >/dev/null
EOF
)"
|
||||||
|
|
||||||
|
# Gate: copy the local k8s/railiance bundle to the executor when syncing is
# enabled. --delete makes the remote directory mirror the local one.
CURRENT_GATE="runtime bundle sync"
if should_sync_runtime_bundle; then
  if [[ -z "$ACTIVITY_CORE_CLUSTER_HOST" ]]; then
    log "runtime bundle already local at ${ACTIVITY_CORE_REPO}/k8s/railiance"
  else
    log "syncing runtime bundle to ${ACTIVITY_CORE_CLUSTER_HOST}:${ACTIVITY_CORE_REMOTE_REPO}/k8s/railiance"
    ssh "$ACTIVITY_CORE_CLUSTER_HOST" "mkdir -p $(quote "$ACTIVITY_CORE_REMOTE_REPO")/k8s/railiance"
    rsync -a --delete \
      "$ACTIVITY_CORE_REPO/k8s/railiance/" \
      "${ACTIVITY_CORE_CLUSTER_HOST}:${ACTIVITY_CORE_REMOTE_REPO}/k8s/railiance/"
  fi
fi
|
||||||
|
|
||||||
|
# Gate: both manifests must exist on the executor and the runtime manifest
# must mention the definition slug.
CURRENT_GATE="runtime bundle preflight"
cluster_bash "$(cat <<EOF
set -euo pipefail
test -f $(quote "$ACTIVITY_CORE_REMOTE_REPO")/k8s/railiance/00-namespace.yaml
test -f $(quote "$ACTIVITY_CORE_REMOTE_REPO")/k8s/railiance/20-runtime.yaml
grep -q $(quote "$DEFINITION_SLUG") $(quote "$ACTIVITY_CORE_REMOTE_REPO")/k8s/railiance/20-runtime.yaml
EOF
)"
|
||||||
|
|
||||||
|
# Gate: record the short git revision of the executor-side checkout. The
# "|| true" keeps this informational: a path that is not a git checkout simply
# yields an empty revision.
CURRENT_GATE="activity-core revision inspection"
REMOTE_REVISION="$(
  cluster_bash "$(cat <<EOF
set -euo pipefail
git -C $(quote "$ACTIVITY_CORE_REMOTE_REPO") rev-parse --short HEAD 2>/dev/null || true
EOF
)"
)"
export REMOTE_REVISION
|
||||||
|
|
||||||
|
# Gate: apply the namespace and runtime manifests. The actcore-migrate and
# actcore-sync jobs are deleted first and then waited on after the apply, so a
# fresh run of each is what gets verified. Deployments are restarted only when
# ACTIVITY_CORE_RESTART_DEPLOYMENTS is "1".
CURRENT_GATE="runtime bundle reconcile"
log "reconciling activity-core runtime bundle"
cluster_bash "$(cat <<EOF
set -euo pipefail
kubectl apply -f $(quote "$ACTIVITY_CORE_REMOTE_REPO")/k8s/railiance/00-namespace.yaml
kubectl -n $(quote "$NAMESPACE") delete job actcore-migrate actcore-sync --ignore-not-found
kubectl apply -f $(quote "$ACTIVITY_CORE_REMOTE_REPO")/k8s/railiance/20-runtime.yaml
if [[ $(quote "$ACTIVITY_CORE_RESTART_DEPLOYMENTS") == "1" ]]; then
  kubectl -n $(quote "$NAMESPACE") rollout restart deploy/actcore-api deploy/actcore-worker deploy/actcore-event-router
fi
kubectl -n $(quote "$NAMESPACE") wait --for=condition=complete job/actcore-migrate --timeout=180s
kubectl -n $(quote "$NAMESPACE") rollout status deploy/actcore-api --timeout=180s
kubectl -n $(quote "$NAMESPACE") rollout status deploy/actcore-worker --timeout=180s
kubectl -n $(quote "$NAMESPACE") rollout status deploy/actcore-event-router --timeout=180s
kubectl -n $(quote "$NAMESPACE") wait --for=condition=complete job/actcore-sync --timeout=180s
EOF
)"
|
||||||
|
|
||||||
|
# Gate: the running API image must be able to import the ops inventory
# resolver and evidence sink modules.
CURRENT_GATE="live image capability check"
cluster_bash "$(cat <<EOF
set -euo pipefail
kubectl -n $(quote "$NAMESPACE") exec deploy/actcore-api -- python -c 'import activity_core.context_resolvers.ops_inventory; import activity_core.ops_evidence_sinks'
EOF
)"
|
||||||
|
|
||||||
|
# Gate: capture the image of the first actcore-api container and a compact
# summary (name, succeeded, failed, completion time) of the actcore-sync job.
CURRENT_GATE="runtime status capture"
API_IMAGE="$(
  cluster_bash "$(cat <<EOF
set -euo pipefail
kubectl -n $(quote "$NAMESPACE") get deploy actcore-api -o jsonpath='{.spec.template.spec.containers[0].image}'
EOF
)"
)"
SYNC_STATUS_JSON="$(
  cluster_bash "$(cat <<EOF
set -euo pipefail
kubectl -n $(quote "$NAMESPACE") get job actcore-sync -o json
EOF
)" | python3 -c 'import json,sys; j=json.load(sys.stdin); s=j.get("status",{}); print(json.dumps({"name": j["metadata"]["name"], "succeeded": s.get("succeeded", 0), "failed": s.get("failed", 0), "completion_time": s.get("completionTime")}))'
)"
export API_IMAGE SYNC_STATUS_JSON
|
||||||
|
|
||||||
|
# Gate: the definition must exist in the in-cluster API, carry the expected
# name, and have enabled set to exactly false. The Python runs inside the
# actcore-api pod and prints a compact JSON summary. The slug in that summary
# is passed in as an argument so it follows DEFINITION_SLUG instead of being
# fixed to the default value.
CURRENT_GATE="disabled definition check"
log "checking ${DEFINITION_SLUG} is present and disabled"
DEFINITION_JSON="$(
  cluster_bash "$(cat <<EOF
set -euo pipefail
kubectl -n $(quote "$NAMESPACE") exec -i deploy/actcore-api -- python - $(quote "$DEFINITION_ID") $(quote "$DEFINITION_NAME") $(quote "$DEFINITION_SLUG") <<'PY'
import json
import sys
import urllib.request

definition_id = sys.argv[1]
expected_name = sys.argv[2]
expected_slug = sys.argv[3]
with urllib.request.urlopen("http://localhost:8010/activity-definitions/", timeout=30) as resp:
    definitions = json.load(resp)

for definition in definitions:
    if definition.get("id") == definition_id:
        if definition.get("enabled") is not False:
            raise SystemExit(f"definition {definition_id} exists but enabled={definition.get('enabled')!r}")
        if definition.get("name") != expected_name:
            raise SystemExit(f"definition {definition_id} name mismatch: {definition.get('name')!r}")
        print(json.dumps({
            "id": definition["id"],
            "slug": expected_slug,
            "name": definition["name"],
            "enabled": definition["enabled"],
            "trigger_type": definition.get("trigger_type"),
            "version": definition.get("version"),
        }))
        break
else:
    raise SystemExit(f"definition {definition_id} not found")
PY
EOF
)"
)"
if [[ -z "$DEFINITION_JSON" ]]; then
  printf 'definition check produced no output\n' >&2
  exit 1
fi
export DEFINITION_JSON
|
||||||
|
|
||||||
|
# Gate: POST to the trigger endpoint of the disabled definition from inside
# the actcore-api pod and keep the response body as evidence.
CURRENT_GATE="manual disabled trigger"
log "triggering disabled definition manually"
TRIGGER_JSON="$(
  cluster_bash "$(cat <<EOF
set -euo pipefail
kubectl -n $(quote "$NAMESPACE") exec -i deploy/actcore-api -- python - $(quote "$DEFINITION_ID") <<'PY'
import json
import sys
import urllib.request

definition_id = sys.argv[1]
req = urllib.request.Request(
    f"http://localhost:8010/activity-definitions/{definition_id}/trigger",
    method="POST",
)
with urllib.request.urlopen(req, timeout=30) as resp:
    print(resp.read().decode("utf-8"))
PY
EOF
)"
)"
if [[ -z "$TRIGGER_JSON" ]]; then
  printf 'manual trigger produced no output\n' >&2
  exit 1
fi
export TRIGGER_JSON
|
||||||
|
|
||||||
|
# Gate: poll State Hub until an ops_inventory_probe progress event created at
# or after STARTED_AT shows up, or the timeout expires. Only non-secret fields
# of the event (and the key names of its detail) are kept. A request that
# fails at the network/HTTP level is retried on the next poll rather than
# ending the run, so a brief State Hub hiccup does not discard the gate.
CURRENT_GATE="State Hub ops_inventory_probe evidence"
log "polling State Hub for ops_inventory_probe progress"
PROGRESS_JSON="$(
  python3 - <<'PY'
from datetime import datetime, timezone
import json
import os
import time
import urllib.parse
import urllib.request

base = os.environ["STATE_HUB_URL"].rstrip("/")
started = datetime.fromisoformat(os.environ["STARTED_AT"].replace("Z", "+00:00"))
timeout = int(os.environ["STATE_HUB_PROGRESS_TIMEOUT_SECONDS"])
interval = int(os.environ["STATE_HUB_PROGRESS_POLL_SECONDS"])
deadline = time.monotonic() + timeout
url = base + "/progress/?" + urllib.parse.urlencode({"event_type": "ops_inventory_probe"})
last_error = None

while time.monotonic() < deadline:
    try:
        with urllib.request.urlopen(url, timeout=20) as resp:
            events = json.load(resp)
    except OSError as exc:
        # urllib.error.URLError/HTTPError and socket timeouts are OSError
        # subclasses; remember the error and try again on the next poll.
        last_error = exc
        events = []
    for event in events:
        created_at = datetime.fromisoformat(event["created_at"].replace("Z", "+00:00"))
        if created_at >= started:
            detail = event.get("detail") or {}
            print(json.dumps({
                "id": event["id"],
                "event_type": event.get("event_type"),
                "summary": event.get("summary"),
                "author": event.get("author"),
                "created_at": event.get("created_at"),
                "detail_keys": sorted(detail.keys()) if isinstance(detail, dict) else [],
            }))
            raise SystemExit(0)
    time.sleep(interval)

suffix = f" (last request error: {last_error})" if last_error is not None else ""
raise SystemExit(f"no ops_inventory_probe progress found after {timeout}s{suffix}")
PY
)"
export PROGRESS_JSON
|
||||||
|
|
||||||
|
# Gate: publish the "passed" evidence note. The ERR trap is removed only after
# the note has been posted, so a failed POST is still handled as a gate
# failure. The State Hub response is the last thing written to stdout.
CURRENT_GATE="State Hub evidence note"
log "posting non-secret evidence note to State Hub"
EVIDENCE_NOTE_JSON="$(post_evidence "passed" "")"
export EVIDENCE_NOTE_JSON

trap - ERR
log "verification passed"
printf '%s\n' "$EVIDENCE_NOTE_JSON"
|
||||||
@@ -0,0 +1,110 @@
|
|||||||
|
---
|
||||||
|
id: RAILIANCE-WP-0012
|
||||||
|
type: workplan
|
||||||
|
title: "activity-core cluster-owned deploy/verify"
|
||||||
|
domain: railiance
|
||||||
|
repo: railiance-cluster
|
||||||
|
status: finished
|
||||||
|
owner: codex
|
||||||
|
topic_slug: railiance
|
||||||
|
created: "2026-06-15"
|
||||||
|
updated: "2026-06-16"
|
||||||
|
state_hub_workstream_id: "6434f7cb-e13c-4c05-839b-197bb239d5cd"
|
||||||
|
---
|
||||||
|
|
||||||
|
# activity-core cluster-owned deploy/verify
|
||||||
|
|
||||||
|
## Context
|
||||||
|
|
||||||
|
activity-core `ACTIVITY-WP-0007-T06` needs live Railiance cluster evidence for
|
||||||
|
the disabled ops inventory probe. That live verification should be owned by the
|
||||||
|
cluster/operator layer, not by arbitrary activity-core sessions with local
|
||||||
|
`kubectl` assumptions.
|
||||||
|
|
||||||
|
This workplan creates a cluster-owned path that keeps credentials in
|
||||||
|
operator-owned locations while returning only non-secret evidence to State Hub.
|
||||||
|
|
||||||
|
## Implement cluster-owned verifier
|
||||||
|
|
||||||
|
```task
|
||||||
|
id: RAILIANCE-WP-0012-T01
|
||||||
|
status: done
|
||||||
|
priority: high
|
||||||
|
state_hub_task_id: "3769fdfb-b4f1-431b-a55a-672d93b3ea55"
|
||||||
|
```
|
||||||
|
|
||||||
|
Add a repeatable command that:
|
||||||
|
|
||||||
|
- reconciles the activity-core Railiance runtime bundle;
|
||||||
|
- reruns `actcore-sync`;
|
||||||
|
- checks the `ops-service-inventory-probes` ActivityDefinition exists and is
|
||||||
|
still disabled;
|
||||||
|
- triggers the disabled definition manually through the in-cluster API path;
|
||||||
|
- verifies a fresh `ops_inventory_probe` progress event exists in State Hub;
|
||||||
|
- posts a non-secret State Hub evidence note for activity-core to cite.
|
||||||
|
|
||||||
|
Implemented as `tools/cmd/railiance-verify-activity-core` with Makefile target
|
||||||
|
`verify-activity-core`. The script defaults to the `railiance01` SSH executor;
|
||||||
|
use `ACTIVITY_CORE_CLUSTER_HOST=local` only for an explicitly selected local
|
||||||
|
`kubectl` context.
|
||||||
|
|
||||||
|
## Run live verification and publish evidence
|
||||||
|
|
||||||
|
```task
|
||||||
|
id: RAILIANCE-WP-0012-T02
|
||||||
|
status: done
|
||||||
|
priority: high
|
||||||
|
state_hub_task_id: "6d7f87c3-a533-4de1-84de-9ca65f2e2779"
|
||||||
|
```
|
||||||
|
|
||||||
|
Run `make verify-activity-core` against the Railiance cluster. On success, cite
|
||||||
|
the State Hub evidence note id in this task and in activity-core
|
||||||
|
`ACTIVITY-WP-0007-T06`.
|
||||||
|
|
||||||
|
If a gate fails, the verifier must still post a non-secret State Hub note with
|
||||||
|
the failing gate and last completed evidence fields.
|
||||||
|
|
||||||
|
2026-06-15: Completed against Railiance01 after refreshing the same-tag
|
||||||
|
`activity-core:railiance01-prod` image from activity-core commit `ab17378`,
|
||||||
|
importing digest `sha256:cff43c72455b9fc4fc11a0a997b4671a38987bb4583a600245dd961965af0e40`
|
||||||
|
into k3s containerd, syncing the current runtime bundle to
|
||||||
|
`/home/tegwick/activity-core/k8s/railiance`, and restarting the activity-core
|
||||||
|
runtime deployments. The verifier reconciled the runtime bundle, completed
|
||||||
|
`actcore-sync`, confirmed `ops-service-inventory-probes` exists and remains
|
||||||
|
disabled, triggered it manually, verified State Hub progress
|
||||||
|
`4c82360d-33e7-455b-8ab4-33facd4a3f8e`, and posted evidence note
|
||||||
|
`baeeaeac-aa6d-4406-ae64-e54577f21386`.
|
||||||
|
|
||||||
|
An intermediate verifier invocation accidentally targeted the local
|
||||||
|
CoulombCore `kubectl` context. It created only `actcore-*` runtime resources in
|
||||||
|
the existing `activity-core` namespace; those resources were removed with the
|
||||||
|
runtime manifest cleanup, and the pre-existing `llm-connect` deployment remains
|
||||||
|
running.
|
||||||
|
|
||||||
|
Operational cleanup note: the successful Railiance01 verifier run used
|
||||||
|
`ACTIVITY_CORE_RESTART_DEPLOYMENTS=1` after importing the same-tag image. The
|
||||||
|
script was corrected afterward to restart only `actcore-api`,
|
||||||
|
`actcore-worker`, and `actcore-event-router`, because
|
||||||
|
`actcore-state-hub-bridge` uses host networking and a rolling restart leaves a
|
||||||
|
new bridge pod pending behind the host-bound running pod. A 2026-06-16 cleanup
|
||||||
|
check showed the bridge rollout had settled on Railiance01: the host-bound
|
||||||
|
bridge pod was running and the replacement ReplicaSet was scaled to zero, so no
|
||||||
|
manual live cleanup was needed.
|
||||||
|
|
||||||
|
## Handoff closure to activity-core
|
||||||
|
|
||||||
|
```task
|
||||||
|
id: RAILIANCE-WP-0012-T03
|
||||||
|
status: done
|
||||||
|
priority: medium
|
||||||
|
state_hub_task_id: "43f652c6-fcc4-49fa-90cc-4122eb6d5321"
|
||||||
|
```
|
||||||
|
|
||||||
|
After live evidence exists, update activity-core `ACTIVITY-WP-0007-T06` to cite
|
||||||
|
the Railiance evidence and close it if Inter-Hub submission is active or
|
||||||
|
explicitly deferred with the clean State Hub fallback result.
|
||||||
|
|
||||||
|
2026-06-15: Updated activity-core `ACTIVITY-WP-0007-T06` to cite Railiance
|
||||||
|
evidence note `baeeaeac-aa6d-4406-ae64-e54577f21386` and close the task with
|
||||||
|
Inter-Hub submission explicitly deferred while the State Hub fallback evidence
|
||||||
|
path is verified.
|
||||||
Reference in New Issue
Block a user