From 1eb8559f27fe73872090b4cc08de110a8b23cd6e Mon Sep 17 00:00:00 2001 From: tegwick Date: Fri, 15 May 2026 23:03:28 +0200 Subject: [PATCH] tools and workplans --- README.md | 8 +- docs/README.md | 5 +- tools/create_railiance_repo.sh | 4 +- tools/furnish_railiance_repo.sh | 2 +- tools/seed_node.sh | 8 +- uv.lock | 2 +- .../RAIL-BS-WP-0001-dependency-management.md | 10 +- ...L-BS-WP-0006-staged-promotion-lifecycle.md | 218 +++++++++++++++++ ...RAIL-BS-WP-0007-threephoenix-ha-cluster.md | 229 ++++++++++++++++++ 9 files changed, 466 insertions(+), 20 deletions(-) create mode 100644 workplans/RAIL-BS-WP-0006-staged-promotion-lifecycle.md create mode 100644 workplans/RAIL-BS-WP-0007-threephoenix-ha-cluster.md diff --git a/README.md b/README.md index d43213a..30ed4c8 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# Railiance Bootstrap +# Railiance Cluster [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](LICENSE) Opinionated Infrastructure-as-Code framework for reproducible, self-reliant systems. @@ -6,7 +6,7 @@ Opinionated Infrastructure-as-Code framework for reproducible, self-reliant syst Railiance is an opinionated **Infrastructure-as-Code framework** — think *Rails for Ops*: convention over configuration, reproducibility first. -This repo (`railiance-bootstrap`) is the **entry point**: +This repo (`railiance-cluster`) is the **cluster runtime entry point**: from two bare Linux servers, a Git repo, and credentials, you can rebuild a fully automated Kubernetes-based environment. @@ -16,8 +16,8 @@ a fully automated Kubernetes-based environment. 1. **Clone this repo** ```bash - git clone /railiance-bootstrap.git - cd railiance-bootstrap + git clone /railiance-cluster.git + cd railiance-cluster ``` 2. **Configure Gitea access** diff --git a/docs/README.md b/docs/README.md index 313c3bf..4f1483d 100644 --- a/docs/README.md +++ b/docs/README.md @@ -54,8 +54,8 @@ From two bare Linux servers, a Git repo, and valid credentials, you can rebuild 1. 
**Clone the repo** ```bash - git clone /railiance-bootstrap.git - cd railiance-bootstrap + git clone /railiance-cluster.git + cd railiance-cluster 2. **Prepare your config** Edit ~/.railiance_gitea.conf with your Gitea URL, user, and token. @@ -81,4 +81,3 @@ Railiance is more than infra scripts: it’s the foundation of self-empowering i where humans and AI agents collaborate to manage systems with trust, clarity, and calmness. From bare metal to resilient clusters, in one repo. - diff --git a/tools/create_railiance_repo.sh b/tools/create_railiance_repo.sh index 5b828cc..83680c8 100644 --- a/tools/create_railiance_repo.sh +++ b/tools/create_railiance_repo.sh @@ -4,7 +4,7 @@ # Railiance — sanitized bootstrap script for creating a Gitea repository. # # PURPOSE: -# Creates a repo (default: railiance-bootstrap) in your Gitea (user or org), +# Creates a repo (default: railiance-cluster) in your Gitea (user or org), # scaffolds a minimal layout, commits, and pushes the initial content. # # SAFETY NOTES: @@ -25,7 +25,7 @@ # ./tools/create_railiance_repo.sh --org coulomb # override org from config # # FLAGS: -# --repo : repository name (default: railiance-bootstrap) +# --repo : repository name (default: railiance-cluster) # --desc : description (default provided) # --org : create under this organization (overrides GITEA_ORG) # --public : create as public repo (default: private) diff --git a/tools/furnish_railiance_repo.sh b/tools/furnish_railiance_repo.sh index 2713c30..df5734a 100644 --- a/tools/furnish_railiance_repo.sh +++ b/tools/furnish_railiance_repo.sh @@ -58,7 +58,7 @@ ensure_file "${repo_root}/README.md" RREAD <<'RREAD' Railiance is an opinionated **Infrastructure-as-Code framework** — think *Rails for Ops*: convention over configuration, reproducibility first. 
-This repo (`railiance-bootstrap`) is the **entry point**: +This repo (`railiance-cluster`) is the **cluster runtime entry point**: from two bare Linux servers, a Git repo, and credentials, you can rebuild a fully automated Kubernetes-based environment. RREAD diff --git a/tools/seed_node.sh b/tools/seed_node.sh index a51b35e..a486165 100644 --- a/tools/seed_node.sh +++ b/tools/seed_node.sh @@ -5,13 +5,13 @@ # Responsibilities: # - Ensure minimal prerequisites (curl, git, jq) # - Discover metadata (panspermia.json or env vars) -# - Clone or update the parent repo (default: railiance-bootstrap) +# - Clone or update the parent repo (default: railiance-cluster) # - Run furnishing (idempotent) to align housekeeping # - (Optional) handoff to further bootstrap steps # # Usage examples: # ./tools/seed_node.sh -# REPO_URL=https://git.example.com/org/railiance-bootstrap.git ./tools/seed_node.sh +# REPO_URL=https://git.example.com/org/railiance-cluster.git ./tools/seed_node.sh # ./tools/seed_node.sh --repo-dir /srv/railiance --branch main # # Notes: @@ -86,7 +86,7 @@ fi if [[ -z "${REPO_URL}" ]]; then echo "ERROR: No REPO_URL provided and no panspermia metadata found." 
>&2 echo "Provide one of:" >&2 - echo " - env REPO_URL=https://git.example.com/org/railiance-bootstrap.git" >&2 + echo " - env REPO_URL=https://git.example.com/org/railiance-cluster.git" >&2 echo " - or a panspermia.json with .parent_body.repo_url" >&2 exit 1 fi @@ -129,7 +129,7 @@ Next steps (manual, for now): 4) Prepare GitOps operator (ArgoCD/Flux) pointing to this repo Hints: - - To use SSH instead of HTTPS, set REPO_URL=git@your-gitea:org/railiance-bootstrap.git + - To use SSH instead of HTTPS, set REPO_URL=git@your-gitea:org/railiance-cluster.git - If using HTTPS, set up 'git config --global credential.helper cache|store' - For air-gapped: copy a Spore bundle, extract, then run this seed script diff --git a/uv.lock b/uv.lock index cb0ba4c..c25ac23 100644 --- a/uv.lock +++ b/uv.lock @@ -362,7 +362,7 @@ wheels = [ ] [[package]] -name = "railiance-bootstrap" +name = "railiance-cluster" version = "0.1.0" source = { virtual = "." } dependencies = [ diff --git a/workplans/RAIL-BS-WP-0001-dependency-management.md b/workplans/RAIL-BS-WP-0001-dependency-management.md index 06158a3..50f143d 100644 --- a/workplans/RAIL-BS-WP-0001-dependency-management.md +++ b/workplans/RAIL-BS-WP-0001-dependency-management.md @@ -27,7 +27,7 @@ installed on the control node at any given time. This means: - Behaviour is not reproducible across machines or over time - The Custodian State Hub SBOM scanner finds nothing to ingest (`last_sbom_at = null`) - Licence and vulnerability auditing of the actual dependencies in use is impossible -- The `railiance-bootstrap` repo appears as a gap in the SBOM coverage map +- The `railiance-cluster` repo appears as a gap in the SBOM coverage map ## Root cause @@ -42,7 +42,7 @@ dependencies. 
No `ansible/requirements.yml` exists for Galaxy collections - `uv.lock` is generated and committed — pins Ansible + full transitive pip tree - If Galaxy collections are used: `ansible/requirements.yml` lists them - SBOM is ingested: `last_sbom_at` is not null in the State Hub -- The SBOM dashboard shows `railiance-bootstrap` in the railiance domain row +- The SBOM dashboard shows `railiance-cluster` in the railiance domain row with a package count ## Tasks @@ -76,7 +76,7 @@ state_hub_task_id: "8aa8a9d3-6560-4176-b933-72a21e6d43d4" 1. Create `pyproject.toml`: ```toml [project] - name = "railiance-bootstrap" + name = "railiance-cluster" version = "0.1.0" requires-python = ">=3.11" dependencies = [ @@ -100,10 +100,10 @@ state_hub_task_id: "4fb477e9-dbac-4e43-84d0-5202c68f4705" From `~/the-custodian/state-hub/`: ```bash -make ingest-sbom REPO=railiance-bootstrap SCAN=1 REPO_PATH=/home/worsch/railiance-bootstrap +make ingest-sbom REPO=railiance-cluster SCAN=1 REPO_PATH=/home/worsch/railiance-cluster ``` -Verify in the SBOM dashboard: railiance domain should show `railiance-bootstrap` +Verify in the SBOM dashboard: railiance domain should show `railiance-cluster` with a package count and no gap warning. 
### T4 — Create ansible/requirements.yml (even if empty) diff --git a/workplans/RAIL-BS-WP-0006-staged-promotion-lifecycle.md b/workplans/RAIL-BS-WP-0006-staged-promotion-lifecycle.md new file mode 100644 index 0000000..46d2d5e --- /dev/null +++ b/workplans/RAIL-BS-WP-0006-staged-promotion-lifecycle.md @@ -0,0 +1,218 @@ +--- +id: RAIL-BS-WP-0006 +type: workplan +title: "Staged Promotion Lifecycle" +domain: railiance +repo: railiance-cluster +status: active +owner: railiance +topic_slug: railiance +repo_goal_id: "6ea441f7-7fe3-4598-922b-38baf20c0580" +state_hub_workstream_id: "cb72d3ba-1863-43c2-a2a5-49ac75fc2603" +created: "2026-02-24" +updated: "2026-05-03" +--- + +# Staged Promotion Lifecycle + +## Goal + +Design and implement the three-stage deployment lifecycle as the core +Railiance application promotion pattern: + +1. Stage 1: local development and validation. +2. Stage 2: canary on production infrastructure. +3. Stage 3: full production promotion with rollback. + +This lifecycle should become the repeatable path for native Railiance apps and +third-party upstream applications wrapped by a Railiance overlay repo. + +## Why This Belongs Before Forgejo + +Forgejo will become critical production infrastructure. Before moving the +source forge itself, Railiance needs a well-defined promotion lifecycle so the +Forgejo deployment, Actions runners, package registry, and future upgrades can +move through the same staged gates as every other important workload. + +## Boundary + +This workplan lives in `railiance-cluster` because it defines cluster runtime +promotion mechanics and the canonical handoff between local validation, +canary deployment, and production routing. + +Expected cross-repo handoffs: + +- `railiance-enablement`: developer-facing CLI templates and CI workflow + conventions. +- `railiance-platform`: shared platform dependencies used by canaries. +- `railiance-apps`: application Helm values and workload-specific promotion + definitions. 
+ +## Tasks + +### T01 - Write deployment lifecycle specification + +```task +id: RAIL-BS-WP-0006-T01 +status: todo +priority: high +state_hub_task_id: "fbfc341f-8ccb-4950-a85d-3e59c4f5b87f" +``` + +Write `docs/deployment-lifecycle.md`. + +The spec should define: + +- Stage 1, Stage 2, and Stage 3 semantics. +- Required checks before each stage. +- Canary acceptance gates. +- Rollback expectations. +- Human approval gates for production-critical workloads. + +**Done when:** the lifecycle is clear enough to apply to Forgejo as a later +production workload. + +--- + +### T02 - Define railiance directory schema and app.toml contract + +```task +id: RAIL-BS-WP-0006-T02 +status: todo +priority: high +state_hub_task_id: "523cf928-bb0e-4109-a172-abf029c62885" +``` + +Define the repository-local `railiance/` directory schema and `app.toml` +contract for native and third-party applications. + +Minimum contract: + +- App identity and ownership. +- Stage definitions. +- Required platform dependencies. +- Health checks and observability endpoints. +- Promotion and rollback commands. +- Secret references without plaintext secret values. + +**Done when:** a repo can declare how it moves through the Railiance promotion +lifecycle without bespoke instructions. + +--- + +### T03 - Overlay repo pattern and creation script + +```task +id: RAIL-BS-WP-0006-T03 +status: todo +priority: medium +state_hub_task_id: "7cd378f2-0319-407a-9ce7-2c6d1a6d6d24" +``` + +Design the overlay repo pattern for third-party upstream applications and add +`create_railiance_overlay_repo.sh` or equivalent tooling. + +The pattern should keep upstream code and Railiance deployment concerns cleanly +separated while still allowing reproducible promotion. + +**Done when:** a third-party app can be wrapped without forking deployment +logic into the upstream repository. 
+ +--- + +### T04 - railiance run command + +```task +id: RAIL-BS-WP-0006-T04 +status: todo +priority: high +state_hub_task_id: "95c3311b-04bb-4c83-bda3-47958217b665" +``` + +Implement the Stage 1 `railiance run` command for local development and +validation. + +Expected behavior: + +- Read `railiance/app.toml`. +- Start or validate the local development target. +- Run defined local health checks. +- Emit a machine-readable result suitable for later promotion gates. + +**Done when:** at least one representative app can complete Stage 1 locally. + +--- + +### T05 - Canary Helm chart template + +```task +id: RAIL-BS-WP-0006-T05 +status: todo +priority: high +state_hub_task_id: "47b8cd47-99c7-4f31-a147-ea16afde7217" +``` + +Create the Stage 2 canary Helm chart template. + +Minimum requirements: + +- Stable and canary release identities. +- Weighted routing or equivalent traffic split through the chosen ingress + path. +- Prometheus-compatible annotations. +- Resource limits appropriate for single-node and future ThreePhoenix use. +- Rollback-safe values layout. + +**Done when:** a canary deployment can be created without hand-editing cluster +resources. + +--- + +### T06 - railiance deploy --stage 2 and observation tooling + +```task +id: RAIL-BS-WP-0006-T06 +status: todo +priority: medium +state_hub_task_id: "6a5c7422-fcb1-49d1-8153-e891bd1c27fa" +``` + +Implement Stage 2 deployment and observation commands. + +Expected behavior: + +- Deploy the canary from declared app metadata. +- Show rollout state, pod health, ingress/routing state, and key metrics. +- Fail closed when prerequisites or health gates are missing. + +**Done when:** Stage 2 can be run and observed from a repeatable command path. 
+ +--- + +### T07 - railiance promote, rollback, and onboarding guide + +```task +id: RAIL-BS-WP-0006-T07 +status: todo +priority: medium +state_hub_task_id: "476198f6-0049-4ac4-9593-6723c86c9602" +``` + +Implement Stage 3 promotion and rollback commands, then write the reference +onboarding guide. + +Expected output: + +- `railiance promote` for controlled production promotion. +- `railiance rollback` for reverting to the previous stable version. +- A guide showing how a representative app adopts the lifecycle. +- Explicit human approval points for critical infrastructure workloads. + +**Done when:** a representative app can move Stage 1 -> Stage 2 -> Stage 3 and +back through rollback using documented commands. + +## Dependencies + +This workplan should be done before the Forgejo production cutover. It can run +in parallel with preparatory ThreePhoenix design, but its Stage 2/3 behavior +should be validated against the intended ThreePhoenix cluster model. diff --git a/workplans/RAIL-BS-WP-0007-threephoenix-ha-cluster.md b/workplans/RAIL-BS-WP-0007-threephoenix-ha-cluster.md new file mode 100644 index 0000000..3c8521d --- /dev/null +++ b/workplans/RAIL-BS-WP-0007-threephoenix-ha-cluster.md @@ -0,0 +1,229 @@ +--- +id: RAIL-BS-WP-0007 +type: workplan +title: "ThreePhoenix - HA Cluster Implementation" +domain: railiance +repo: railiance-cluster +status: active +owner: railiance +topic_slug: railiance +repo_goal_id: "6ea441f7-7fe3-4598-922b-38baf20c0580" +state_hub_workstream_id: "9e208376-23f1-40c7-9813-fac1f7d6ad3b" +created: "2026-02-25" +updated: "2026-05-03" +--- + +# ThreePhoenix - HA Cluster Implementation + +## Goal + +Implement the ThreePhoenix architecture: a self-healing three-node Kubernetes +cluster substrate for Railiance production systems. + +The cluster target includes: + +- k3s HA with embedded etcd. +- Distributed storage. +- High-availability database patterns. +- Ingress and certificate automation. +- Node rotation and recovery drills. 
+- Monitoring and acceptance audits. + +## Why This Belongs Before Forgejo + +Forgejo will be the source forge, package base, and Actions surface for the +Railiance stack. Moving it before the production cluster lifecycle is clear +would make Forgejo both the migration target and the infrastructure experiment. + +ThreePhoenix should come first, or at least its lifecycle gates should be +designed first, so Forgejo is deployed onto a substrate whose failure and +promotion behavior is already understood. + +## Boundary + +This workplan is S2 cluster runtime work. + +In scope for `railiance-cluster`: + +- k3s HA topology and runtime configuration. +- Cluster-level storage/operator installation hooks. +- Ingress and certificate controllers. +- Cluster health, rotation, and acceptance checks. + +Out of scope: + +- Database cluster definitions and credentials: `railiance-platform`. +- Forgejo/Gitea application Helm values: `railiance-apps`. +- Developer workflows and Actions templates: `railiance-enablement`. +- OS provisioning and host hardening: `railiance-infra`. + +## Tasks + +### T01 - K3s HA cluster setup + +```task +id: RAIL-BS-WP-0007-T01 +status: todo +priority: high +state_hub_task_id: "1f8a8668-31eb-4d79-bbcd-50f6430a8d66" +``` + +Implement the three-node k3s HA cluster setup using embedded etcd. + +Minimum scope: + +- Define node roles and join sequence. +- Automate first server and additional server joins. +- Validate etcd quorum. +- Document failure behavior for one missing node. + +**Done when:** three nodes can form a healthy k3s HA cluster from documented +commands. + +--- + +### T02 - Longhorn distributed storage + +```task +id: RAIL-BS-WP-0007-T02 +status: todo +priority: high +state_hub_task_id: "b1d4e0fa-da41-4b13-a7d6-34dd040cb605" +``` + +Install and validate distributed storage for stateful workloads. + +Minimum scope: + +- Storage prerequisites and node labeling. +- Longhorn installation or approved alternative. +- Default storage class decision. 
+- Volume replica and recovery behavior. +- Backup target handoff to `railiance-platform` where appropriate. + +**Done when:** a test PVC survives a node disruption according to the +ThreePhoenix acceptance criteria. + +--- + +### T03 - PostgreSQL HA pattern + +```task +id: RAIL-BS-WP-0007-T03 +status: todo +priority: high +state_hub_task_id: "11283b4c-7e4d-490d-91b3-0d06a593bdf0" +``` + +Define the PostgreSQL HA runtime pattern and handoff to S3. + +The original State Hub task names repmgr and Pgpool-II. Before implementation, +reconcile that with the current Railiance production baseline using +CloudNativePG. + +**Done when:** the chosen HA database pattern is documented, tested, and +owned by the correct layer without conflicting with `railiance-platform`. + +--- + +### T04 - Reference stateful application HA + +```task +id: RAIL-BS-WP-0007-T04 +status: todo +priority: high +state_hub_task_id: "4a20e593-a89d-43da-abcc-5a39a4c8b3c0" +``` + +Validate a representative stateful source-forge workload on the HA cluster. + +The historical task names Gitea. In the current roadmap this should become +Forgejo unless a temporary Gitea reference drill is still useful. + +Minimum checks: + +- Repository storage survives pod reschedule and node disruption. +- Database failover behavior is understood. +- Package registry storage is included in backup/restore thinking. +- Application-level rollback is compatible with the staged promotion lifecycle. + +**Done when:** Railiance has a proven stateful source-forge deployment pattern +that can be reused for the Forgejo migration. 
+- Health checks for ingress and certificate validity. + +**Done when:** representative services can be exposed through the intended +ingress path with valid certificates. + +--- + +### T06 - Phoenix CronJob automation + +```task +id: RAIL-BS-WP-0007-T06 +status: todo +priority: medium +state_hub_task_id: "f658aa6a-1c48-4660-88fa-35eaa0137e12" +``` + +Implement weekly node rotation or equivalent Phoenix recovery automation. + +Minimum scope: + +- Define what "rotation" means for the current host reality. +- Automate safe cordon, drain, rebuild/rejoin, and validation steps where + feasible. +- Include explicit human gates for destructive host actions. +- Record the results of each rotation in State Hub. + +**Done when:** the cluster recovery rhythm is scripted, documented, and tested +without risking production data. + +--- + +### T07 - Monitoring stack and acceptance audit checklist + +```task +id: RAIL-BS-WP-0007-T07 +status: todo +priority: medium +state_hub_task_id: "70f6c8ab-a700-4fb2-893e-cf5a40615044" +``` + +Add the monitoring stack and final acceptance audit checklist. + +Minimum scope: + +- Cluster health signals. +- Storage health. +- Database/operator health handoff. +- Ingress and certificate health. +- Backup/restore freshness. +- Promotion lifecycle readiness. + +**Done when:** ThreePhoenix can be declared ready for critical workloads only +after the checklist passes. + +## Dependencies + +This workplan should precede the Forgejo production cutover. It should also +shape the Stage 2 and Stage 3 gates in `RAIL-BS-WP-0006` so canaries and +promotions operate against the real HA substrate.