diff --git a/Makefile b/Makefile index 5466e1e..408542a 100644 --- a/Makefile +++ b/Makefile @@ -129,9 +129,6 @@ tf-providers-plan: ## Plan after an upgrade (uses HCLOUD_TOKEN if set) ansible-bootstrap: ## Run base bootstrap play (users, ssh, ufw, sops-agent) cd ansible && ansible-playbook playbooks/bootstrap.yaml -u admin -converge: ansible-bootstrap ## Alias for current bootstrap converge - @true - # ---- Orchestration ---- apply: tf-fmt tf-apply ansible-bootstrap ## Provision via Terraform then converge via Ansible @@ -189,6 +186,16 @@ ansible-inventory: ## Print the dynamic inventory Ansible will use ansible-ping: ## Quick connectivity check (SSH + Python availability) cd $(ANS_DIR) && ansible all -u $(SSH_USER) -m ping +status: ## Show live security state of all hosts (UFW, fail2ban, SSH hardening) + @echo "=== Connectivity ===" + cd $(ANS_DIR) && ansible all -u $(SSH_USER) -m ping + @echo "=== UFW ===" + cd $(ANS_DIR) && ansible all -u $(SSH_USER) -m shell -a "ufw status" --become + @echo "=== fail2ban ===" + cd $(ANS_DIR) && ansible all -u $(SSH_USER) -m shell -a "systemctl is-active fail2ban" + @echo "=== SSH hardening ===" + cd $(ANS_DIR) && ansible all -u $(SSH_USER) -m shell -a "grep -iE '^(PermitRootLogin|PasswordAuthentication)' /etc/ssh/sshd_config" --become + converge: ## Converge all hosts to the baseline (idempotent) cd $(ANS_DIR) && ansible-playbook $(PLAY) -u $(SSH_USER) diff --git a/spec/server-baseline.yaml b/spec/server-baseline.yaml new file mode 100644 index 0000000..38b079c --- /dev/null +++ b/spec/server-baseline.yaml @@ -0,0 +1,86 @@ +# Railiance Managed Node — Baseline Server Specification +# This file is the authoritative source of truth for the target state of every +# server managed by railiance-hosts. All convergence roles and test assertions +# MUST be derivable from this document. +# +# When you change something here, update the Ansible roles AND the Goss tests. +# Format: human-readable YAML, kept technology-neutral. 
+ +version: "1.0" +applies_to: all # override per node group if needed + +# --------------------------------------------------------------------------- +# Firewall +# --------------------------------------------------------------------------- +firewall: + engine: ufw + status: active + default_incoming: deny + default_outgoing: allow + rules: + - name: SSH + port: 22 + proto: tcp + action: allow + - name: k3s-api + port: 6443 + proto: tcp + action: allow + - name: flannel-vxlan + port: 8472 + proto: udp + action: allow + +# --------------------------------------------------------------------------- +# SSH daemon +# --------------------------------------------------------------------------- +ssh: + permit_root_login: "no" + password_authentication: "no" + pubkey_authentication: "yes" + challenge_response_authentication: "no" + +# --------------------------------------------------------------------------- +# Services +# --------------------------------------------------------------------------- +services: + - name: ufw + enabled: true + running: true + - name: fail2ban + enabled: true + running: true + - name: ssh + enabled: true + running: true + +# --------------------------------------------------------------------------- +# Packages +# --------------------------------------------------------------------------- +packages: + installed: + - ufw + - fail2ban + - git + - curl + - vim + - htop + - age + - sops + +# --------------------------------------------------------------------------- +# Users +# --------------------------------------------------------------------------- +users: + - name: admin + shell: /bin/bash + sudo: passwordless # NOPASSWD:ALL in /etc/sudoers.d/ + ssh_key_auth: true + +# --------------------------------------------------------------------------- +# Security baseline +# --------------------------------------------------------------------------- +security: + histcontrol: ignorespace # set in /etc/profile.d/ + fail2ban_jails: + - sshd diff --git 
a/workplans/RAIL-HO-WP-0002-server-spec-and-test-suite.md b/workplans/RAIL-HO-WP-0002-server-spec-and-test-suite.md new file mode 100644 index 0000000..1b74808 --- /dev/null +++ b/workplans/RAIL-HO-WP-0002-server-spec-and-test-suite.md @@ -0,0 +1,262 @@ +--- +id: RAIL-HO-WP-0002 +type: workplan +title: "Server Specification and Automated Test Suite" +domain: railiance +repo: railiance-hosts +status: active +owner: railiance +topic_slug: railiance +state_hub_workstream_id: "" # register after creating workstream in hub +created: "2026-03-09" +updated: "2026-03-09" +--- + +# Server Specification and Automated Test Suite + +## Motivation + +`make status` produces raw shell output that requires manual interpretation. +There is no machine-readable specification of what a converged Railiance node +should look like, and therefore no way to assert automatically whether a server +is in the correct state. + +This workplan closes that gap by introducing: + +1. **A declarative server specification** (`spec/server-baseline.yaml`) — the + single source of truth for the target state of every managed node. +2. **A Goss test suite** derived from that spec — YAML assertions that map + one-to-one to spec items and produce a structured pass/fail report. +3. **`make verify`** — runs the test suite against all hosts and exits non-zero + on failure, suitable for CI. +4. **An ADR** that formally defines the boundary between `railiance-hosts` and + `railiance-bootstrap`. 
+ +## Concept + +### Separation of concerns + +| Repo | Responsibility | +|------|----------------| +| `railiance-hosts` | **What** a managed node should look like (spec), **how** to get it there (Ansible roles), **how to verify** it got there (Goss tests), inventory, secrets | +| `railiance-bootstrap` | Upstream Kubernetes/app-layer provisioning that builds on an already-converged base node; does NOT own security baseline | + +The `railiance-bootstrap` ansible work (harden.yml, bootstrap.yml) is +superseded by `roles/base` and `roles/sops_agent` in this repo. Going forward, +any security or OS-level configuration belongs here. `railiance-bootstrap` may +consume a node that has already been converged by this repo, but must not +re-configure items owned here. + +### Test framework: Goss + +[Goss](https://github.com/goss-org/goss) is a Go binary that evaluates YAML +test files against the live node. It was chosen because: + +- Tests and spec map one-to-one (Goss YAML IS the assertion) +- Single binary, no Python/Ruby runtime on target host +- Fast (runs in-process, no SSH per test) +- Output can be TAP, JSON, or human-readable +- Deployable via Ansible in a single task + +### Directory layout + +``` +spec/ + server-baseline.yaml ← authoritative target-state spec (already created) + +goss/ + baseline.yaml ← Goss assertions (derived from spec) + vars/ + baseline-vars.yaml ← parameterised values (ports, users, etc.) 
+ +ansible/ + playbooks/ + verify.yaml ← deploy Goss + run tests + fetch results + roles/ + goss/ ← role: install binary, copy tests, run, report +``` + +--- + +## Tasks + +### T01 — Resolve duplicate `converge` target and fix SSH check + +```task +id: T01 +status: done +completed: "2026-03-09" +priority: high +``` + +- Remove redundant `converge: ansible-bootstrap` alias (caused Makefile warning) +- Fix `sshd -T` command (requires hostkeys) → replaced with + `grep -iE '^(PermitRootLogin|PasswordAuthentication)' /etc/ssh/sshd_config` + +**Done when:** `make status` completes without warnings and SSH section returns +`PermitRootLogin no` / `PasswordAuthentication no`. + +--- + +### T02 — Finalise server baseline spec + +```task +id: T02 +status: done +completed: "2026-03-09" +priority: high +``` + +Created `spec/server-baseline.yaml` covering: +- Firewall rules (UFW, default deny, allowed ports) +- SSH daemon settings +- Required services and packages +- Admin user constraints +- Security settings (fail2ban jails, HISTCONTROL) + +**Done when:** spec reviewed and agreed — it becomes the contract that roles +and tests must satisfy. + +--- + +### T03 — Implement Goss test suite + +```task +id: T03 +status: todo +priority: high +``` + +Create `goss/baseline.yaml` with Goss assertions that implement every item in +`spec/server-baseline.yaml`. 
Each spec section maps to a Goss resource type: + +| spec section | Goss resource | +|---|---| +| `firewall.status` | `command: ufw status` | +| `firewall.rules` | `command: ufw status` stdout contains | +| `ssh.*` | `file: /etc/ssh/sshd_config` contains | +| `services` | `service:` blocks | +| `packages` | `package:` blocks | +| `users` | `user:` + `file: /etc/sudoers.d/admin` | + +Example structure: + +```yaml +# goss/baseline.yaml +package: + ufw: + installed: true + fail2ban: + installed: true + +service: + ufw: + enabled: true + running: true + fail2ban: + enabled: true + running: true + +file: + /etc/ssh/sshd_config: + exists: true + contains: + - /^PermitRootLogin no/ + - /^PasswordAuthentication no/ + +command: + ufw status: + exit-status: 0 + stdout: + - "Status: active" + - "/22/tcp.*ALLOW/" + - "/6443/tcp.*ALLOW/" + - "/8472/udp.*ALLOW/" + +user: + admin: + exists: true + groups: + - sudo + shell: /bin/bash +``` + +**Done when:** `goss validate` passes on a freshly converged node. + +--- + +### T04 — Ansible role and playbook for Goss + +```task +id: T04 +status: todo +priority: high +``` + +Create `ansible/roles/goss/` with tasks that: +1. Download the Goss binary (pinned version) to `/usr/local/bin/goss` +2. Copy `goss/baseline.yaml` to `/etc/goss/baseline.yaml` +3. Run `goss -g /etc/goss/baseline.yaml validate --format tap` +4. Fetch the TAP output back to the control node as `reports/goss-<host>-<date>.tap` +5. Fail the play if any test fails (`rc != 0`) + +Create `ansible/playbooks/verify.yaml`: +```yaml +- hosts: all + become: true + roles: + - role: goss +``` + +**Done when:** `ansible-playbook ansible/playbooks/verify.yaml` exits 0 on a +clean node, non-zero on a deliberately broken one (test with a manual config change). 
+ +--- + +### T05 — Add `make verify` target + +```task +id: T05 +status: todo +priority: medium +``` + +Add to Makefile: + +```makefile +verify: ## Run Goss test suite against all hosts — exits non-zero on failure + cd $(ANS_DIR) && ansible-playbook playbooks/verify.yaml -u $(SSH_USER) +``` + +Also update `make status` to print a summary line ("All assertions passed" / +"N assertions FAILED") rather than raw shell output. + +**Done when:** `make verify` exits 0 on a good node, non-zero on a bad one. + +--- + +### T06 — Write ADR: railiance-hosts vs railiance-bootstrap boundary + +```task +id: T06 +status: todo +priority: medium +``` + +Create `docs/adr/ADR-002-repo-boundary-hosts-vs-bootstrap.md` documenting: + +- What `railiance-hosts` owns (OS baseline, security, spec, tests) +- What `railiance-bootstrap` owns (Kubernetes/app layer, consumes a converged node) +- Decision: any item present in `spec/server-baseline.yaml` must NOT be + managed by `railiance-bootstrap` +- Migration note: superseded bootstrap.yml / harden.yml in that repo + +**Done when:** ADR written and merged. + +--- + +## References + +- Goss documentation: https://github.com/goss-org/goss +- Server spec: `spec/server-baseline.yaml` +- Bootstrap workplan: `workplans/RAIL-HO-WP-0001-hosteurope-bootstrap.md`