From b3b0c3e3ffe6ec14d07f985e5292b7b9616ae822 Mon Sep 17 00:00:00 2001 From: tegwick Date: Thu, 2 Jul 2026 00:02:36 +0200 Subject: [PATCH] Repo hygiene + new workplans (RAIL-BS-WP-0008/0009) - Add RAIL-BS-WP-0008 (activity-core WP-0016 deploy) and RAIL-BS-WP-0009 (admin-sync smoke) from inbox asks 87952ff1 / aa8b7986 - Archive finished workplans to workplans/archived/ per ADR-001 convention; normalize frontmatter statuses (completed/done -> finished) - Fill stack-and-commands.md, complete repo-boundary.md, refresh SCOPE Current State, add docs/operator-runbook.md for production-touching targets Co-Authored-By: Claude Fable 5 --- .claude/rules/repo-boundary.md | 19 ++-- .claude/rules/stack-and-commands.md | 27 +++--- SCOPE.md | 4 +- docs/operator-runbook.md | 35 ++++++++ ...tivity-core-wp0016-triage-output-deploy.md | 89 +++++++++++++++++++ ...-WP-0009-activity-core-admin-sync-smoke.md | 46 ++++++++++ ...-RAIL-BS-WP-0001-dependency-management.md} | 2 +- .../260622-RAIL-BS-WP-0002-k3s-baseline.md} | 2 +- ...RAIL-BS-WP-0003-pgpool-ha-failover-fix.md} | 2 +- .../260622-RAIL-BS-WP-0004-safety-net.md} | 2 +- ...22-RAIL-BS-WP-0005-kubeconfig-delivery.md} | 2 +- ...ivity-core-cluster-owned-deploy-verify.md} | 0 ...ivity-core-verifier-evidence-hardening.md} | 0 ...-BS-WP-0006-staged-promotion-lifecycle.md} | 0 ...tivity-core-llm-connect-live-reconcile.md} | 0 15 files changed, 206 insertions(+), 24 deletions(-) create mode 100644 docs/operator-runbook.md create mode 100644 workplans/RAIL-BS-WP-0008-activity-core-wp0016-triage-output-deploy.md create mode 100644 workplans/RAIL-BS-WP-0009-activity-core-admin-sync-smoke.md rename workplans/{RAIL-BS-WP-0001-dependency-management.md => archived/260622-RAIL-BS-WP-0001-dependency-management.md} (99%) rename workplans/{RAIL-BS-WP-0002-k3s-baseline.md => archived/260622-RAIL-BS-WP-0002-k3s-baseline.md} (99%) rename workplans/{RAIL-BS-WP-0003-pgpool-ha-failover-fix.md => archived/260622-RAIL-BS-WP-0003-pgpool-ha-failover-fix.md} 
(99%) rename workplans/{RAIL-BS-WP-0004-safety-net.md => archived/260622-RAIL-BS-WP-0004-safety-net.md} (99%) rename workplans/{RAIL-BS-WP-0005-kubeconfig-delivery.md => archived/260622-RAIL-BS-WP-0005-kubeconfig-delivery.md} (99%) rename workplans/{RAILIANCE-WP-0012-activity-core-cluster-owned-deploy-verify.md => archived/260622-RAILIANCE-WP-0012-activity-core-cluster-owned-deploy-verify.md} (100%) rename workplans/{RAILIANCE-WP-0013-activity-core-verifier-evidence-hardening.md => archived/260622-RAILIANCE-WP-0013-activity-core-verifier-evidence-hardening.md} (100%) rename workplans/{RAIL-BS-WP-0006-staged-promotion-lifecycle.md => archived/260627-RAIL-BS-WP-0006-staged-promotion-lifecycle.md} (100%) rename workplans/{RAILIANCE-WP-0014-activity-core-llm-connect-live-reconcile.md => archived/260701-RAILIANCE-WP-0014-activity-core-llm-connect-live-reconcile.md} (100%) diff --git a/.claude/rules/repo-boundary.md b/.claude/rules/repo-boundary.md index 7eccb6c..2f85f27 100644 --- a/.claude/rules/repo-boundary.md +++ b/.claude/rules/repo-boundary.md @@ -1,8 +1,17 @@ ## Repo boundary -This repo owns **railiance-cluster** only. It does not own: +This repo owns **railiance-cluster** only (OAS S2 — cluster runtime). It does not own: - +- OS hardening, SSH, firewall, Terraform/cloud-init provisioning → `railiance-infra` (S1) +- Platform services: PostgreSQL HA, Valkey, OpenBao, object storage → `railiance-platform` (S3) +- CI/CD templates, developer portal, SDKs → `railiance-enablement` (S4) +- Application Helm releases and workload manifests (incl. 
Gitea values) → `railiance-apps` (S5) +- Forge/registry infrastructure (Gitea/Forgejo operation) → `railiance-forge` +- Ecosystem graph/registry model → `railiance-fabric` +- Identity/SSO/MFA (Keycloak, IAM profiles) → `net-kingdom` +- State Hub code → `state-hub` / `the-custodian` + +S2 *does* deploy cluster-scoped operators and addons (cert-manager, cnpg +operator, ArgoCD, nginx ingress) and owns kubeconfig custody, plus +cluster-owned deploy/verify gates for workloads whose repos have no cluster +access (e.g. the activity-core and llm-connect reconcile commands). diff --git a/.claude/rules/stack-and-commands.md b/.claude/rules/stack-and-commands.md index dc53ac6..ee026b2 100644 --- a/.claude/rules/stack-and-commands.md +++ b/.claude/rules/stack-and-commands.md @@ -1,19 +1,22 @@ ## Stack - -- **Language:** -- **Key deps:** +- **Language:** Bash tooling (`tools/cmd/`) orchestrating kubectl/Helm over SSH +- **Key deps:** k3s on railiance01 (COULOMBCORE), Helm, SOPS/age for secrets, State Hub REST for evidence notes +- **Execution model:** commands run from the workstation; cluster access is `ssh railiance01` (most `tools/cmd/*` accept a `CLUSTER_HOST` override) ## Dev Commands ```bash -# TODO: Fill in the standard commands for this repo - -# Install dependencies - -# Run tests - -# Lint / type check - -# Build / package (if applicable) +make help # list all targets +make preflight # pre-migration safety gate — run before cluster work +make smoke # Kubernetes smoke tests +make test-ha-failover # HA failover test (kills primary PG pod, asserts recovery) +sudo make backup # age-encrypted backup: k3s state + Helm values + kubeconfig +make restore # list backups + restore guide +make verify-activity-core # reconcile activity-core runtime + probe evidence +make reconcile-activity-core-llm-connect # llm-connect reconcile + non-secret gate checks ``` + +Production-touching targets (deploy/reconcile/backup, plus the HA failover test) need operator approval — +see `docs/operator-runbook.md`. 
There is no test suite or linter in this repo; +validation is the preflight + smoke targets against the live cluster. diff --git a/SCOPE.md b/SCOPE.md index 8bceca5..15d7933 100644 --- a/SCOPE.md +++ b/SCOPE.md @@ -60,8 +60,8 @@ Railiance is structured as five independent repos per OAS Stack layer. This repo ## Current State - Status: active / stable -- Implementation: k3s baseline complete (RAIL-BS-WP-0002 done); pgpool HA failover fix complete (RAIL-BS-WP-0003 done); integrated backup complete (RAIL-BS-WP-0004 done — age-encrypted local backup, daily cron under root) -- Stability: high — no active open workplans +- Implementation: k3s baseline, pgpool HA failover fix, age-encrypted backup, kubeconfig delivery, staged promotion lifecycle, and activity-core/llm-connect reconcile gates all finished (RAIL-BS-WP-0002…0006, RAILIANCE-WP-0012…0014) +- Open work: RAIL-BS-WP-0007 ThreePhoenix HA cluster (active, 0/7); RAIL-BS-WP-0008 activity-core WP-0016 deploy (ready); RAIL-BS-WP-0009 admin-sync smoke (ready) - Usage: core Kubernetes runtime for all Railiance deployments; runs on COULOMBCORE (92.205.130.254) - Also deployed at cluster level: cert-manager, ArgoCD, CloudNative PG operator (cnpg), nginx ingress, SSO stack (mfa + sso namespaces via net-kingdom) diff --git a/docs/operator-runbook.md b/docs/operator-runbook.md new file mode 100644 index 0000000..206abb2 --- /dev/null +++ b/docs/operator-runbook.md @@ -0,0 +1,35 @@ +# Operator runbook — production-touching commands + +All targets below change state on the production k3s cluster (railiance01 / +COULOMBCORE, 92.205.130.254) or its backups. Agent sessions running in auto +mode are denied these by the permission classifier — that is intentional. + +## How to run a production-touching target + +- **Interactively in a Claude Code session:** type `! <command>` so the + command runs under the operator's authority and the output lands in the + conversation for the agent to act on. 
+- **Directly:** run from this repo root on the workstation; cluster access is + `ssh railiance01` (key-based, configured in `~/.ssh/config`). + +## Production-touching targets + +| Target | Effect | +|---|---| +| `sudo make backup` | writes age-encrypted backup to `/opt/backup/railiance/cluster/` | +| `make k3s-install` | (re)installs k3s baseline — destructive, preflight first | +| `make test-ha-failover` | kills the primary PG pod to assert recovery | +| `make verify-activity-core` | reconciles activity-core runtime on railiance01 | +| `make reconcile-activity-core-llm-connect` | patches ConfigMap, applies llm-connect overlay, runs smoke pod | + +## Read-only / safe targets + +`make help`, `make preflight`, `make smoke`, `make restore` (prints guide +only). These are safe to allowlist for agent sessions. + +## Evidence convention + +Reconcile/verify targets post non-secret evidence notes to the State Hub +(`STATE_HUB_EVIDENCE_WORKSTREAM_ID` / `STATE_HUB_EVIDENCE_TASK_ID` env vars +attach them to a workstream/task). Never record Secret values — key counts +and readiness states only. diff --git a/workplans/RAIL-BS-WP-0008-activity-core-wp0016-triage-output-deploy.md b/workplans/RAIL-BS-WP-0008-activity-core-wp0016-triage-output-deploy.md new file mode 100644 index 0000000..150455b --- /dev/null +++ b/workplans/RAIL-BS-WP-0008-activity-core-wp0016-triage-output-deploy.md @@ -0,0 +1,89 @@ +--- +id: RAIL-BS-WP-0008 +type: workplan +title: "activity-core WP-0016 triage-output robustness deploy" +domain: financials +repo: railiance-cluster +status: ready +owner: railiance-cluster +topic_slug: railiance +created: "2026-07-01" +updated: "2026-07-01" +--- + +# activity-core WP-0016 triage-output robustness deploy + +## Context + +Inbox message `87952ff1` (activity-core, 2026-06-26): the scheduled daily WSJF +triage run on 2026-06-26 failed schema validation and the whole run was +discarded, resetting the WP-0006-T03 three-clean-run streak. 
ACTIVITY-WP-0016 +hardened the instruction-executor output contract in-repo (commits +`5eb33bd..bf877b7` on activity-core main, 220 tests passed). The remaining +work is operator/cluster-owned on railiance01. + +**Deploy coupling constraint:** `schemas/daily-triage-report.json` is now +strict per-item and is consumed by both the llm-connect hint and the +whole-doc validator. It MUST ship together with the new `executor.py` +(T03 per-item quarantine parser). Never deploy the schema ahead of the code. + +## Deploy activity-core with coupled schema and executor + +```task +id: RAIL-BS-WP-0008-T01 +status: todo +priority: high +``` + +Rebuild/import the activity-core image from main (`bf877b7` or later) into +the railiance01 k3s runtime and reconcile the activity-core deployment so the +new executor and the strict per-item schema ship together. + +## Update daily-statehub-wsjf-triage runtime-bundle Instruction + +```task +id: RAIL-BS-WP-0008-T02 +status: todo +priority: high +``` + +In the runtime projection (not the activity-core repo), update the +`daily-statehub-wsjf-triage` Instruction: + +- raise `max_tokens` (currently ~1200; give clear headroom above the + ~1300–1500-token 16-workstream list); +- prompt: bounded top-N (≤7) ranked recommendations, "if uncertain emit fewer + well-formed items rather than more"; +- prompt: per-item NDJSON framing (leading summary object, then one + recommendation JSON object per line) so the T03 parser recovers items + independently. + +## Pull raw llm-connect response for the 2026-06-26 run + +```task +id: RAIL-BS-WP-0008-T03 +status: todo +priority: medium +``` + +From the llm-connect pod logs / response store on railiance01, capture the +full raw response and `finish_reason` for the 2026-06-26 05:20:57Z run +(activity-core retained only a 4000-char preview; the JSON break is at char +5268). Send to activity-core to close ACTIVITY-WP-0016-T01. Logs only, no +secrets. 
+ +## Acceptance smoke + +```task +id: RAIL-BS-WP-0008-T04 +status: todo +priority: high +``` + +Trigger one daily-triage run against the reconciled runtime and confirm it +either (i) returns a clean schema-valid report, or (ii) degrades gracefully +(valid recommendations with `output_validated=true`, `partial=true`, +`quarantined_count>0`) instead of discarding the run. Confirm the State Hub +shows a matching `daily_triage` progress event. Closes ACTIVITY-WP-0016-T05 +and unblocks the three-clean-run streak for ACTIVITY-WP-0010-T04 / +WP-0006-T03. diff --git a/workplans/RAIL-BS-WP-0009-activity-core-admin-sync-smoke.md b/workplans/RAIL-BS-WP-0009-activity-core-admin-sync-smoke.md new file mode 100644 index 0000000..2416ebe --- /dev/null +++ b/workplans/RAIL-BS-WP-0009-activity-core-admin-sync-smoke.md @@ -0,0 +1,46 @@ +--- +id: RAIL-BS-WP-0009 +type: workplan +title: "activity-core no-restart admin-sync smoke (ACTIVITY-WP-0012-T05)" +domain: financials +repo: railiance-cluster +status: ready +owner: railiance-cluster +topic_slug: railiance +created: "2026-07-01" +updated: "2026-07-01" +--- + +# activity-core no-restart admin-sync smoke (ACTIVITY-WP-0012-T05) + +## Context + +Inbox message `aa8b7986` (activity-core, 2026-06-18): activity-core commit +`3e93567` implements ACTIVITY-WP-0012 T01–T04 (shared sync_service, +`POST /admin/sync`, explicit schedule upsert/pause/orphan-delete counts, +worker startup reuse, runbook docs; 192 tests passed). T05 is the +cluster-owned smoke: prove admin sync works **without** worker +SIGTERM/pod restart. + +The deploy precondition is covered by RAIL-BS-WP-0008-T01 (main at +`bf877b7` already includes `3e93567`), so run this after that reconcile. + +## Run the no-restart admin-sync smoke + +```task +id: RAIL-BS-WP-0009-T01 +status: wait +priority: medium +``` + +After RAIL-BS-WP-0008-T01 is deployed, without restarting the worker: + +1. Change or use a customer ActivityDefinition enabled-flip/rename fixture. +2. 
Call `POST /admin/sync?definitions=true&schedules=true` from the operator + path. +3. Confirm the new Temporal schedule is active and the retired/disabled + schedule is paused or deleted per sync semantics. +4. Confirm event-triggered definitions still fire normally. +5. Record non-secret evidence in the State Hub. Response JSON should include + `definitions.synced`, `schedules.upserted`, `schedules.paused`, + `schedules.deleted_orphans`, and `errors[]`. diff --git a/workplans/RAIL-BS-WP-0001-dependency-management.md b/workplans/archived/260622-RAIL-BS-WP-0001-dependency-management.md similarity index 99% rename from workplans/RAIL-BS-WP-0001-dependency-management.md rename to workplans/archived/260622-RAIL-BS-WP-0001-dependency-management.md index f2f5ccb..73ef47c 100644 --- a/workplans/RAIL-BS-WP-0001-dependency-management.md +++ b/workplans/archived/260622-RAIL-BS-WP-0001-dependency-management.md @@ -4,7 +4,7 @@ type: workplan title: "Dependency Management — Add lockfile for Ansible control-node deps" domain: financials repo: railiance-cluster -status: completed +status: finished owner: railiance topic_slug: railiance state_hub_workstream_id: 59155efb-b461-4caa-ad7b-b3fce348db84 diff --git a/workplans/RAIL-BS-WP-0002-k3s-baseline.md b/workplans/archived/260622-RAIL-BS-WP-0002-k3s-baseline.md similarity index 99% rename from workplans/RAIL-BS-WP-0002-k3s-baseline.md rename to workplans/archived/260622-RAIL-BS-WP-0002-k3s-baseline.md index 955d135..9e32ae9 100644 --- a/workplans/RAIL-BS-WP-0002-k3s-baseline.md +++ b/workplans/archived/260622-RAIL-BS-WP-0002-k3s-baseline.md @@ -4,7 +4,7 @@ type: workplan title: "k3s and Kubernetes Platform Baseline" domain: financials repo: railiance-cluster -status: completed +status: finished owner: railiance topic_slug: railiance repo_goal_id: "70ab2379-fb9d-4fec-a09d-b2a717e4ace8" diff --git a/workplans/RAIL-BS-WP-0003-pgpool-ha-failover-fix.md b/workplans/archived/260622-RAIL-BS-WP-0003-pgpool-ha-failover-fix.md similarity 
index 99% rename from workplans/RAIL-BS-WP-0003-pgpool-ha-failover-fix.md rename to workplans/archived/260622-RAIL-BS-WP-0003-pgpool-ha-failover-fix.md index 2c333ce..f2cdef5 100644 --- a/workplans/RAIL-BS-WP-0003-pgpool-ha-failover-fix.md +++ b/workplans/archived/260622-RAIL-BS-WP-0003-pgpool-ha-failover-fix.md @@ -4,7 +4,7 @@ type: bug-report title: "pgpool CrashLoopBackOff on PostgreSQL HA failover — missing secret key" domain: financials repo: railiance-cluster -status: completed +status: finished owner: tegwick created: "2026-03-10" updated: "2026-03-10" diff --git a/workplans/RAIL-BS-WP-0004-safety-net.md b/workplans/archived/260622-RAIL-BS-WP-0004-safety-net.md similarity index 99% rename from workplans/RAIL-BS-WP-0004-safety-net.md rename to workplans/archived/260622-RAIL-BS-WP-0004-safety-net.md index 23daeb5..331aa6c 100644 --- a/workplans/RAIL-BS-WP-0004-safety-net.md +++ b/workplans/archived/260622-RAIL-BS-WP-0004-safety-net.md @@ -4,7 +4,7 @@ type: workplan title: "Integrated Backup — S2 Kubernetes Runtime Layer" domain: financials repo: railiance-cluster -status: done +status: finished owner: tegwick topic_slug: railiance state_hub_workstream_id: "7e8b0c20-51eb-40c9-9e3b-85dd380d7625" diff --git a/workplans/RAIL-BS-WP-0005-kubeconfig-delivery.md b/workplans/archived/260622-RAIL-BS-WP-0005-kubeconfig-delivery.md similarity index 99% rename from workplans/RAIL-BS-WP-0005-kubeconfig-delivery.md rename to workplans/archived/260622-RAIL-BS-WP-0005-kubeconfig-delivery.md index 75040bf..1a3b957 100644 --- a/workplans/RAIL-BS-WP-0005-kubeconfig-delivery.md +++ b/workplans/archived/260622-RAIL-BS-WP-0005-kubeconfig-delivery.md @@ -4,7 +4,7 @@ type: workplan title: "Kubeconfig delivery for netkingdom SSO/MFA stack apply" domain: financials repo: railiance-cluster -status: done +status: finished owner: railiance-worker topic_slug: railiance capability_request_id: "34b97d89-e80a-42ae-a623-a9185e5b17f5" diff --git 
a/workplans/RAILIANCE-WP-0012-activity-core-cluster-owned-deploy-verify.md b/workplans/archived/260622-RAILIANCE-WP-0012-activity-core-cluster-owned-deploy-verify.md similarity index 100% rename from workplans/RAILIANCE-WP-0012-activity-core-cluster-owned-deploy-verify.md rename to workplans/archived/260622-RAILIANCE-WP-0012-activity-core-cluster-owned-deploy-verify.md diff --git a/workplans/RAILIANCE-WP-0013-activity-core-verifier-evidence-hardening.md b/workplans/archived/260622-RAILIANCE-WP-0013-activity-core-verifier-evidence-hardening.md similarity index 100% rename from workplans/RAILIANCE-WP-0013-activity-core-verifier-evidence-hardening.md rename to workplans/archived/260622-RAILIANCE-WP-0013-activity-core-verifier-evidence-hardening.md diff --git a/workplans/RAIL-BS-WP-0006-staged-promotion-lifecycle.md b/workplans/archived/260627-RAIL-BS-WP-0006-staged-promotion-lifecycle.md similarity index 100% rename from workplans/RAIL-BS-WP-0006-staged-promotion-lifecycle.md rename to workplans/archived/260627-RAIL-BS-WP-0006-staged-promotion-lifecycle.md diff --git a/workplans/RAILIANCE-WP-0014-activity-core-llm-connect-live-reconcile.md b/workplans/archived/260701-RAILIANCE-WP-0014-activity-core-llm-connect-live-reconcile.md similarity index 100% rename from workplans/RAILIANCE-WP-0014-activity-core-llm-connect-live-reconcile.md rename to workplans/archived/260701-RAILIANCE-WP-0014-activity-core-llm-connect-live-reconcile.md