diff --git a/state-hub/Makefile b/state-hub/Makefile
index 914e55c..52db38f 100644
--- a/state-hub/Makefile
+++ b/state-hub/Makefile
@@ -1,4 +1,4 @@
-.PHONY: install install-cli db db-tools migrate seed api dashboard check start clean register-project validate-adr add-domain rename-domain add-repo list-repos cleanup-stale
+.PHONY: install install-cli db db-tools migrate seed api dashboard check start clean register-project validate-adr add-domain rename-domain add-repo list-repos cleanup-stale tunnel tunnel-daemon tunnel-loop tunnel-status tunnel-stop
 
 COMPOSE = docker compose -f infra/docker-compose.yml --env-file .env
 
@@ -34,14 +34,79 @@ dashboard:
 check:
 	curl -sf http://127.0.0.1:8000/state/health | python3 -m json.tool
 
-## Open a reverse SSH tunnel so a remote host can reach the local State Hub.
-## Usage: make tunnel HOST=user@hostname
-## The remote host will then reach the hub at http://127.0.0.1:8000
+## COULOMBCORE host — used by tunnel-daemon; pass it as HOST=... to tunnel / tunnel-loop.
+COULOMBCORE ?= tegwick@92.205.130.254
+## Port forwarded on both ends (remote 127.0.0.1:TUNNEL_PORT → local 127.0.0.1:TUNNEL_PORT).
+TUNNEL_PORT ?= 8000
+
+## Process-match patterns for tunnel-status / tunnel-stop.
+## They are exported and read in recipes as shell variables so the pattern text
+## never appears in the recipe shell's own command line; with the text inlined,
+## `pgrep -f` matches that shell (status always RUNNING) and `pkill -f` kills it
+## (make reports "Terminated" and aborts the recipe).
+export TUNNEL_AUTOSSH_PATTERN := autossh.*-R $(TUNNEL_PORT):
+export TUNNEL_SSH_PATTERN := ssh.*-R $(TUNNEL_PORT):
+
+## Foreground reverse tunnel — good for debugging. Ctrl-C to stop.
+## ExitOnForwardFailure makes ssh exit instead of idling when the remote port cannot be bound.
+## Usage: make tunnel HOST=tegwick@92.205.130.254
 tunnel:
 	@test -n "$(HOST)" || (echo "ERROR: HOST is required. Usage: make tunnel HOST=user@hostname"; exit 1)
-	@echo "Opening reverse tunnel → $(HOST) (remote :8000 → local :8000)"
+	@echo "Opening reverse tunnel → $(HOST) (remote :$(TUNNEL_PORT) → local :$(TUNNEL_PORT))"
 	@echo "Keep this terminal open. Ctrl-C to close the tunnel."
-	ssh -R 8000:127.0.0.1:8000 $(HOST)
+	ssh -N -o "ServerAliveInterval=30" -o "ServerAliveCountMax=3" \
+		-o "ExitOnForwardFailure=yes" \
+		-R $(TUNNEL_PORT):127.0.0.1:$(TUNNEL_PORT) $(HOST)
+
+## Background tunnel to COULOMBCORE with auto-reconnect.
+## Uses autossh if available; prints install hint and exits if not.
+## After running, COULOMBCORE can reach the State Hub at http://127.0.0.1:8000
+tunnel-daemon:
+	@if command -v autossh >/dev/null 2>&1; then \
+		echo "Starting autossh tunnel → $(COULOMBCORE)"; \
+		autossh -f -N -M 0 \
+			-o "ServerAliveInterval=30" \
+			-o "ServerAliveCountMax=3" \
+			-o "ExitOnForwardFailure=yes" \
+			-R $(TUNNEL_PORT):127.0.0.1:$(TUNNEL_PORT) $(COULOMBCORE); \
+		echo "Tunnel running in background. Use 'make tunnel-status' to check."; \
+	else \
+		echo "autossh not found — install it: sudo apt-get install autossh"; \
+		echo "Fallback: run 'make tunnel-loop HOST=$(COULOMBCORE)' in a dedicated terminal."; \
+		exit 1; \
+	fi
+
+## Reconnect loop — works without autossh. Run in a terminal you can leave open.
+## Timestamps use shell command substitution so they are re-evaluated on every
+## iteration; $(shell date ...) is expanded once by make before the loop starts.
+## Usage: make tunnel-loop HOST=tegwick@92.205.130.254
+tunnel-loop:
+	@test -n "$(HOST)" || (echo "ERROR: HOST is required. Usage: make tunnel-loop HOST=user@hostname"; exit 1)
+	@echo "Reconnect loop → $(HOST). Ctrl-C to stop."
+	@while true; do \
+		echo "[$$(date -u +%Y-%m-%dT%H:%M:%SZ)] Connecting..."; \
+		ssh -N -o "ServerAliveInterval=30" -o "ServerAliveCountMax=3" \
+			-o "ExitOnForwardFailure=yes" \
+			-R $(TUNNEL_PORT):127.0.0.1:$(TUNNEL_PORT) $(HOST) || true; \
+		echo "[$$(date -u +%Y-%m-%dT%H:%M:%SZ)] Connection lost — retrying in 5s..."; \
+		sleep 5; \
+	done
+
+## Check whether a tunnel is currently active
+tunnel-status:
+	@if command -v autossh >/dev/null 2>&1 && pgrep -f "$$TUNNEL_AUTOSSH_PATTERN" > /dev/null 2>&1; then \
+		echo "autossh tunnel: RUNNING (PIDs: $$(pgrep -f "$$TUNNEL_AUTOSSH_PATTERN" | tr '\n' ' '))"; \
+	elif pgrep -f "$$TUNNEL_SSH_PATTERN" > /dev/null 2>&1; then \
+		echo "ssh tunnel: RUNNING (PIDs: $$(pgrep -f "$$TUNNEL_SSH_PATTERN" | tr '\n' ' '))"; \
+	else \
+		echo "Tunnel: NOT running"; \
+	fi
+
+## Stop any active tunnel (autossh or plain ssh)
+## autossh is stopped first (it restarts ssh sessions that exit); `|| true` keeps "nothing to stop" from failing make.
+tunnel-stop:
+	@pkill -f "$$TUNNEL_AUTOSSH_PATTERN" 2>/dev/null && echo "autossh stopped" || true
+	@pkill -f "$$TUNNEL_SSH_PATTERN" 2>/dev/null && echo "ssh loop stopped" || true
 
 start: db
 	sleep 3
diff --git a/workplans/CUST-WP-0000-state-hub-v0.1-build-deploy.md b/workplans/CUST-WP-0000-state-hub-v0.1-build-deploy.md
new file mode 100644
index 0000000..aa0a2ef
--- /dev/null
+++ b/workplans/CUST-WP-0000-state-hub-v0.1-build-deploy.md
@@ -0,0 +1,42 @@
+---
+id: CUST-WP-0000
+type: workplan
+title: "State Hub v0.1 — Build & Deploy"
+domain: custodian
+repo: the-custodian
+status: completed
+owner: custodian
+topic_slug: custodian
+created: "2026-02-24"
+updated: "2026-02-24"
+completed: "2026-02-24"
+state_hub_workstream_id: "2b0efa54-0209-4ca9-8ab3-30dfbdb991b0"
+note: >
+  Pre-ADR-001 record. This workstream was created DB-first during the first
+  Custodian session (2026-02-24) before the workplan-as-repository-artefact
+  convention was established. This file is a retroactive record written on
+  2026-03-11 to satisfy the ADR-001 consistency checker (C-08).
+---
+
+# State Hub v0.1 — Build & Deploy
+
+## What was built
+
+The first live implementation layer of the Custodian system, delivered in the
+initial session on 2026-02-24:
+
+- PostgreSQL schema (topics, workstreams, tasks, decisions, progress_events)
+- FastAPI app with routers for all entities + `/state/summary`
+- FastMCP stdio server (11 tools, 5 resources/templates)
+- Observable Framework dashboard (4 pages: index, workstreams, decisions, progress)
+- Docker Compose for local PostgreSQL
+- Alembic migration `0001_initial_schema`
+- Seed script inserting 6 canonical topics
+- `.mcp.json` at repo root for Claude Code discovery
+- `make register-project` automation for onboarding domain repos
+
+## References
+
+- Commit range: initial state-hub implementation (2026-02-24)
+- Superseded by: CUST-WP-0000 (this file) covers only v0.1 baseline;
+  subsequent features tracked in CUST-WP-0001 onward
diff --git a/workplans/CUST-WP-0000b-state-hub-v0.2-decisions-dependencies.md b/workplans/CUST-WP-0000b-state-hub-v0.2-decisions-dependencies.md
new file mode 100644
index 0000000..35c56a1 --- /dev/null +++ b/workplans/CUST-WP-0000b-state-hub-v0.2-decisions-dependencies.md @@ -0,0 +1,42 @@ +--- +id: CUST-WP-0000b +type: workplan +title: "State Hub v0.2 — Decisions, Suggestions & Dependencies" +domain: custodian +repo: the-custodian +status: completed +owner: custodian +topic_slug: custodian +created: "2026-02-25" +updated: "2026-02-25" +completed: "2026-02-25" +state_hub_workstream_id: "6585ee66-aa4e-436e-bbec-d83293c33e8f" +note: > + Pre-ADR-001 record. This workstream was created DB-first before the + workplan-as-repository-artefact convention was established. Retroactive + file written on 2026-03-11 to satisfy the ADR-001 consistency checker (C-08). +--- + +# State Hub v0.2 — Decisions, Suggestions & Dependencies + +## What was built + +Delivered 2026-02-25, evolving the hub from a state tracker to an active +coordination layer: + +- `WorkstreamDependency` model + migration `0b547c153153` — directed + dependency graph between workstreams +- API: `POST/GET /workstreams/{id}/dependencies/`, + `DELETE /workstreams/{id}/dependencies/{dep_id}` +- API: `GET /state/next_steps` — derived next-action suggestions (never persisted) +- `StateSummary` extended with `next_steps` and `depends_on`/`blocks` on workstreams +- Design boundary formalised: hub is a read model with exactly two write use + cases — resolving decisions and suggesting next steps +- MCP: `get_next_steps()` tool added +- `scripts/script.py.mako` added (required for Alembic autogenerate) + +## References + +- Alembic migration: `0b547c153153` +- Design boundary document: `canon/architecture/` (hub as read model) +- CLAUDE.md global + railiance updated with `get_next_steps()` in session start diff --git a/workplans/CUST-WP-0011-state-hub-threephoenix-migration.md b/workplans/CUST-WP-0011-state-hub-threephoenix-migration.md new file mode 100644 index 0000000..b96308c --- /dev/null +++ b/workplans/CUST-WP-0011-state-hub-threephoenix-migration.md @@ -0,0 +1,346 @@ +--- +id: 
CUST-WP-0011 +type: workplan +title: "Migrate Custodian State Hub to ThreePhoenix Cluster" +domain: custodian +repo: the-custodian +status: active +owner: custodian +topic_slug: custodian +created: "2026-03-11" +updated: "2026-03-11" +state_hub_workstream_id: "967baafb-d92d-405a-ba0b-0d00d37c4940" +--- + +# Migrate Custodian State Hub to ThreePhoenix Cluster + +## Goal + +Move the Custodian State Hub (FastAPI + PostgreSQL) from its current home on +the WSL2 operator workstation to the ThreePhoenix Kubernetes cluster +(Railiance01/02/03), making it available to Claude Code sessions running on +any machine with cluster access — without public internet exposure. + +The State Hub is **irreplaceable episodic memory**. This migration must be +executed with zero tolerance for data loss and a tested rollback path at +every stage. + +## Pre-conditions (gate — do not start until all satisfied) + +- [ ] ThreePhoenix cluster has three healthy nodes (Railiance01 confirmed, Railiance02 + Railiance03 joined) +- [ ] Longhorn distributed storage installed and verified (replication factor ≥ 2) +- [ ] HA failover test passes (`tests/test_ha_failover.sh` exits 0 on the cluster) +- [ ] S2 integrated backup operational and tested on the cluster +- [ ] A full WSL2 State Hub backup has been taken and restore-drilled **within 24h of starting this workplan** + +These gates are mandatory. A single-node cluster or unverified storage is not +an acceptable migration target for the Custodian. 
+ +## Architecture after migration + +``` +COULOMBCORE / operator workstation (WSL2) + └─ Claude Code + └─ MCP server subprocess (Python, local clone of the-custodian) + └─ HTTP → ssh -L 8000:state-hub.custodian.svc.cluster.local:8000 tegwick@92.205.62.239 + └─ Railiance01 k3s + └─ state-hub ClusterIP service + ├─ FastAPI pod (1–2 replicas) + └─ PostgreSQL PVC (Longhorn, 2-way replicated) +``` + +Key properties: +- **Not publicly exposed** — ClusterIP only; access via SSH port-forward +- **Replicated storage** — Longhorn replicates the PG data volume across nodes +- **WSL2 instance retained as DR fallback** during the stabilisation period +- **MCP config unchanged** — subprocess still calls `http://127.0.0.1:8000`; + the SSH port-forward provides the binding + +## Backup and disaster recovery contract + +Before and during migration, the following must hold at all times: + +| Asset | Backup mechanism | RPO | Tested? | +|---|---|---|---| +| State Hub PostgreSQL DB | `make backup` (pg_dump → age-encrypted, Nextcloud offsite) | Daily | Must be drilled before T03 | +| State Hub DB on cluster | Longhorn snapshot + age-encrypted copy to `/opt/backup/` | Daily | Must be drilled in T06, before cutover (T07) | +| WSL2 instance | Remains live during stabilisation period | — | Running | + +**Rollback rule:** at any task boundary, if something is wrong, revert to +WSL2. No task should leave the system in a state where both WSL2 and cluster +are broken.
+ +--- + +## Tasks + +### T01 — Drill WSL2 backup restore end-to-end + +```task +id: T01 +status: todo +priority: high +state_hub_task_id: "b0caf112-dc1d-43a8-9f27-d627dd4aa2bf" +``` + +Before touching anything, prove the current backup can actually be restored: + +```bash +# In the-custodian/state-hub/ +make backup # take fresh backup +# Spin up a test postgres container +docker run -d --name pg-restore-test -e POSTGRES_PASSWORD=test \ + -p 5433:5432 postgres:16 +# Decrypt and restore +age -d -i ~/.config/sops/age/keys.txt \ + /opt/backup/custodian/state-hub-latest.sql.gz.age | \ + gunzip | psql -h 127.0.0.1 -p 5433 -U postgres state_hub +# Spot-check: count topics +psql -h 127.0.0.1 -p 5433 -U postgres -c "SELECT COUNT(*) FROM topics;" state_hub +docker rm -f pg-restore-test +``` + +**Done when:** restore completes, topic count matches production, drill logged +in `memory/episodic/`. + +--- + +### T02 — Helm chart for State Hub (new: railiance-platform) + +```task +id: T02 +status: todo +priority: high +state_hub_task_id: "24887dd9-7d50-4cc4-add7-bffa1454b80c" +``` + +Create `helm/state-hub/` in `railiance-platform` (S3 layer owns platform +services). The chart must deploy: + +- **FastAPI deployment** — image built from `the-custodian/state-hub/`, + 1 replica initially (scale to 2 after T06) +- **PostgreSQL StatefulSet** — single instance backed by a Longhorn PVC + (minimum 5 Gi); HA not required here — Longhorn replication IS the HA +- **ClusterIP service** `state-hub` on port 8000 +- **ConfigMap** for non-secret config (DB URL template, log level) +- **Secret** for DB credentials (SOPS-encrypted values file) +- **Liveness/readiness probe** — `GET /state/health` + +Values: +```yaml +image: + repository: gitea.local/custodian/state-hub + tag: latest +postgres: + storageClass: longhorn + size: 5Gi +replicaCount: 1 +``` + +**Done when:** `helm lint` passes; chart committed in railiance-platform. 
+ +--- + +### T03 — Build and push State Hub container image + +```task +id: T03 +status: todo +priority: high +state_hub_task_id: "79908ade-3e38-451b-a403-2361a16a3f3a" +``` + +Add `state-hub/Dockerfile` to the-custodian: + +```dockerfile +FROM python:3.12-slim +WORKDIR /app +COPY pyproject.toml uv.lock ./ +RUN pip install uv && uv sync --frozen --no-dev +COPY api/ ./api/ +COPY mcp_server/ ./mcp_server/ +CMD ["uv", "run", "uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "8000"] +``` + +Build and push to the cluster-local Gitea registry: + +```bash +docker build -t gitea.local/custodian/state-hub:latest . +docker push gitea.local/custodian/state-hub:latest +``` + +**Done when:** image available in Gitea registry; `helm install --dry-run` +resolves the image. + +--- + +### T04 — Deploy to cluster and run Alembic migrations + +```task +id: T04 +status: todo +priority: high +state_hub_task_id: "a7baf2eb-abd7-4aa3-b2cb-a5370ac09844" +``` + +```bash +# From operator workstation via SSH port-forward to k3s API +helm install state-hub ./helm/state-hub/ \ + -n custodian --create-namespace \ + -f helm/state-hub/values-production.yaml + +# Wait for pods +kubectl -n custodian rollout status deployment/state-hub + +# Run migrations inside the pod +kubectl -n custodian exec -it deploy/state-hub -- \ + uv run alembic upgrade head +``` + +**Done when:** pod Running, `/state/health` returns 200, Alembic reports +"head" from inside the pod. + +--- + +### T05 — Migrate data from WSL2 to cluster + +```task +id: T05 +status: todo +priority: high +state_hub_task_id: "a307dd46-a8e2-49df-b016-c187759ebcf1" +``` + +This is the point of no return for the DB — execute with care: + +```bash +# 1. Take final WSL2 backup +make -C ~/the-custodian/state-hub backup + +# 2. Copy dump into the cluster postgres pod +kubectl -n custodian cp /tmp/state-hub-migration.sql \ + $(kubectl -n custodian get pod -l app=state-hub-postgres -o name):/tmp/ + +# 3. 
Restore +kubectl -n custodian exec -it deploy/state-hub-postgres -- \ + psql -U postgres -d state_hub -f /tmp/state-hub-migration.sql + +# 4. Spot-check counts match WSL2 +kubectl -n custodian exec -it deploy/state-hub -- \ + psql -c "SELECT relname, n_live_tup FROM pg_stat_user_tables ORDER BY n_live_tup DESC;" +``` + +**Rollback:** if counts differ, delete cluster DB data, re-run from T04. +WSL2 is still live and unchanged. + +**Done when:** all table row counts match the WSL2 instance. + +--- + +### T06 — Drill cluster backup restore + +```task +id: T06 +status: todo +priority: high +state_hub_task_id: "03753b88-824c-4448-97b2-f7315d145060" +``` + +Before cutting over, prove the cluster backup can be restored: + +```bash +# Trigger a backup via the cluster cron (or manually) +kubectl -n custodian create job --from=cronjob/state-hub-backup backup-drill-01 + +# Verify output in /opt/backup/ on the node holding the PVC +# Decrypt and restore to a test namespace +kubectl create ns restore-test +# ... restore steps similar to T01 but against cluster postgres +``` + +**Done when:** restore drill passes; drill logged. + +--- + +### T07 — Cutover: redirect MCP config to cluster + +```task +id: T07 +status: todo +priority: medium +state_hub_task_id: "ff1de25e-c301-4b86-9420-84dfe72e565e" +``` + +Update the MCP config on every operator workstation (WSL2, COULOMBCORE) to +reach the cluster state hub via SSH port-forward instead of the local process. + +The MCP server subprocess still runs locally (Python, same `server.py`). +Only the API endpoint it calls changes — via a persistent port-forward: + +```bash +# On operator workstation — keep this running (add to tunnel-daemon or tunnel-loop) +ssh -L 8000:state-hub.custodian.svc.cluster.local:8000 tegwick@92.205.62.239 +``` + +No change to `.mcp.json` needed — subprocess still calls `http://127.0.0.1:8000`. + +Alternatively: update the MCP server's `API_BASE` env var to point directly +to the port-forward. 
Either approach is valid; document the chosen one. + +**Done when:** `claude /mcp` shows `state-hub` connected; `get_state_summary()` +returns live cluster data. + +--- + +### T08 — Stabilisation period (2 weeks minimum) + +```task +id: T08 +status: todo +priority: medium +state_hub_task_id: "e06a59a0-5310-4c1c-9ba5-7cfaadda62e2" +``` + +Run the cluster state hub as the primary for two weeks before retiring WSL2: + +- Keep WSL2 state hub running (but frozen — no writes) as DR fallback +- Monitor cluster pod restarts, storage health, backup cron +- Run `get_state_summary()` at the start of each session; confirm data is live +- Test failover: kill the FastAPI pod; verify it restarts and responds within 60s + +**Done when:** two weeks elapsed with no data loss events; all backup drills +passed. + +--- + +### T09 — Retire WSL2 instance + +```task +id: T09 +status: todo +priority: low +state_hub_task_id: "d75a2d49-f3b1-4bdd-b9e1-a1c6a9744681" +``` + +Once T08 stabilisation passes: + +1. Take a final WSL2 backup (archive, keep indefinitely) +2. Stop the WSL2 Docker container: `make -C ~/the-custodian/state-hub clean` +3. Update `CLAUDE.md` global and project to remove WSL2 state hub start instructions +4. Update MEMORY.md — state hub is now cluster-hosted +5. Record a decision in the state hub: "State Hub WSL2 instance retired" + +**Done when:** WSL2 state hub no longer running; documentation updated. + +--- + +## References + +- Constitution constraint: irreversible actions require human approval — T05 + (data migration) and T09 (WSL2 retirement) require explicit sign-off +- OAS layer: S3 Platform Services (railiance-platform) +- DR dependency: Longhorn storage (railiance-cluster WP to be linked) +- Extension point: EP-RAIL-005 (full-stack backup) — state hub must implement + `make backup` / `make restore` standard interface before T06 +- Domain goal: `6f96c712-60e6-4ea9-ab06-168878eafbce` (Three-Phoenix Secure + Kubernetes Infrastructure)