def load_host_vars(name):
    """Load per-host variables from ``host_vars/<name>.yml``.

    Two directories are searched, relative to the directory containing this
    script, and the first file found wins:

    1. ``../inventory/host_vars`` (next to the repo-level ``servers.yaml``)
    2. ``inventory/host_vars`` (next to this script)

    Args:
        name: Host name as listed in ``servers.yaml``; used verbatim as the
            file stem.

    Returns:
        The parsed YAML content, or an empty dict when no file exists for the
        host or the file is empty.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    candidate_dirs = (
        os.path.join(script_dir, '..', 'inventory', 'host_vars'),
        os.path.join(script_dir, 'inventory', 'host_vars'),
    )
    for directory in candidate_dirs:
        path = os.path.join(directory, f'{name}.yml')
        try:
            with open(path, encoding='utf-8') as f:
                # An empty file parses to None; normalise that to {}.
                return yaml.safe_load(f) or {}
        except (FileNotFoundError, NotADirectoryError):
            # No vars file at this location; try the next candidate.
            continue
    return {}
main(): server_list = load_servers() tf = load_tf_outputs() @@ -25,10 +34,14 @@ def main(): for s in server_list: name = s['name'] host_names.append(name) - hostvars[name] = { + hvars = { "ansible_host": tf.get(name) or s.get('ip'), - "ansible_user": s.get('ssh_user', 'admin') + "ansible_user": s.get('ssh_user', 'admin'), } + if s.get('ssh_key'): + hvars["ansible_ssh_private_key_file"] = s['ssh_key'] + hvars.update(load_host_vars(name)) + hostvars[name] = hvars inv = { "all": {"hosts": host_names}, "_meta": {"hostvars": hostvars} diff --git a/ansible/playbooks/bootstrap.yaml b/ansible/playbooks/bootstrap.yaml index ed2c57e..627b1ee 100644 --- a/ansible/playbooks/bootstrap.yaml +++ b/ansible/playbooks/bootstrap.yaml @@ -7,4 +7,6 @@ - role: base - role: sops_agent - role: custodian_agent # injects ~/.ssh/id_custodian_agent.pub into authorized_keys + - role: swapfile # provisions swap file (size + swappiness from host_vars) + - role: resource_limits # nproc PAM caps + systemd user slice memory limits # - role: wireguard # enable if you configure WireGuard variables diff --git a/ansible/roles/resource_limits/handlers/main.yml b/ansible/roles/resource_limits/handlers/main.yml new file mode 100644 index 0000000..16b0951 --- /dev/null +++ b/ansible/roles/resource_limits/handlers/main.yml @@ -0,0 +1,4 @@ +--- +- name: Reload systemd daemon + ansible.builtin.systemd: + daemon_reload: true diff --git a/ansible/roles/resource_limits/tasks/main.yml b/ansible/roles/resource_limits/tasks/main.yml new file mode 100644 index 0000000..432f60d --- /dev/null +++ b/ansible/roles/resource_limits/tasks/main.yml @@ -0,0 +1,35 @@ +--- +# resource_limits role — PAM nproc caps + systemd user slice memory limits +# +# Variables (set per-host in host_vars): +# resource_limit_user: username to limit (default: tegwick) +# resource_limit_uid: UID for systemd user slice (default: 1000) +# nproc_soft: soft nproc limit (default: 512) +# nproc_hard: hard nproc limit (default: 1024) +# 
user_memory_max: systemd MemoryMax (default: 1500M) +# user_memory_swap_max: systemd MemorySwapMax (default: 512M) + +- name: Set PAM nproc limits + ansible.builtin.template: + src: nproc-limits.conf.j2 + dest: /etc/security/limits.d/60-nproc-{{ resource_limit_user | default('tegwick') }}.conf + owner: root + group: root + mode: '0644' + +- name: Ensure systemd user slice override directory + ansible.builtin.file: + path: "/etc/systemd/system/user-{{ resource_limit_uid | default(1000) }}.slice.d" + state: directory + owner: root + group: root + mode: '0755' + +- name: Set systemd user slice memory limits + ansible.builtin.template: + src: user-slice-limits.conf.j2 + dest: "/etc/systemd/system/user-{{ resource_limit_uid | default(1000) }}.slice.d/limits.conf" + owner: root + group: root + mode: '0644' + notify: Reload systemd daemon diff --git a/ansible/roles/resource_limits/templates/nproc-limits.conf.j2 b/ansible/roles/resource_limits/templates/nproc-limits.conf.j2 new file mode 100644 index 0000000..48f5a58 --- /dev/null +++ b/ansible/roles/resource_limits/templates/nproc-limits.conf.j2 @@ -0,0 +1,6 @@ +# Managed by Ansible (resource_limits role) +# Caps process count for {{ resource_limit_user | default('tegwick') }} +# to prevent runaway agents from exhausting the kernel PID table. +# See INC-002 (2026-03-26) for root cause context. +{{ resource_limit_user | default('tegwick') }} soft nproc {{ nproc_soft | default(512) }} +{{ resource_limit_user | default('tegwick') }} hard nproc {{ nproc_hard | default(1024) }} diff --git a/ansible/roles/resource_limits/templates/user-slice-limits.conf.j2 b/ansible/roles/resource_limits/templates/user-slice-limits.conf.j2 new file mode 100644 index 0000000..f3ae130 --- /dev/null +++ b/ansible/roles/resource_limits/templates/user-slice-limits.conf.j2 @@ -0,0 +1,7 @@ +# Managed by Ansible (resource_limits role) +# Caps memory for all processes in the user-{{ resource_limit_uid | default(1000) }}.slice. 
---
# swapfile role — provisions a swap file of configurable size
#
# Variables (set per-host in host_vars):
#   swap_size_gb:     size in gigabytes (default: 4)
#   swap_swappiness:  vm.swappiness value (default: 10)
#
# Formatting and activation are gated on whether /swapfile is currently an
# active swap area, not on whether the file existed before this run, so a run
# that failed part-way (file allocated but never formatted/enabled) is
# repaired by the next run.

- name: Check whether swapfile already exists
  ansible.builtin.stat:
    path: /swapfile
  register: swapfile_stat

- name: Warn when an existing swapfile differs from the requested size
  ansible.builtin.debug:
    msg: >-
      /swapfile is {{ swapfile_stat.stat.size }} bytes but swap_size_gb requests
      {{ ((swap_size_gb | default(4)) | int) * (1024 ** 3) }} bytes;
      it is not resized automatically (swapoff and remove it first).
  when:
    - swapfile_stat.stat.exists
    - swapfile_stat.stat.size != ((swap_size_gb | default(4)) | int) * (1024 ** 3)

- name: Allocate swapfile (fallocate)
  ansible.builtin.command:
    cmd: "fallocate -l {{ (swap_size_gb | default(4)) | int }}G /swapfile"
    creates: /swapfile

- name: Set swapfile permissions
  ansible.builtin.file:
    path: /swapfile
    owner: root
    group: root
    mode: '0600'

# Read-only probe, so it also runs in check mode and always reports "ok".
- name: List active swap areas
  ansible.builtin.command:
    cmd: swapon --show=NAME --noheadings --raw
  register: swapfile_active
  changed_when: false
  check_mode: false

- name: Format swapfile
  ansible.builtin.command:
    cmd: mkswap /swapfile
  when: "'/swapfile' not in swapfile_active.stdout_lines"
  changed_when: true

# No ignore_errors: the task is skipped when swap is already active, so any
# failure here is a real one and should stop the play.
- name: Enable swapfile
  ansible.builtin.command:
    cmd: swapon /swapfile
  when: "'/swapfile' not in swapfile_active.stdout_lines"
  changed_when: true

- name: Ensure swapfile in /etc/fstab
  ansible.builtin.lineinfile:
    path: /etc/fstab
    regexp: '^/swapfile'
    line: '/swapfile none swap sw 0 0'
    state: present

- name: Set vm.swappiness at runtime
  ansible.posix.sysctl:
    name: vm.swappiness
    value: "{{ swap_swappiness | default(10) }}"
    state: present
    reload: true

- name: Persist vm.swappiness across reboots
  ansible.builtin.copy:
    dest: /etc/sysctl.d/60-swappiness.conf
    owner: root
    group: root
    mode: '0644'
    content: |
      # Managed by Ansible (swapfile role)
      vm.swappiness = {{ swap_swappiness | default(10) }}
diff --git a/inventory/servers.yaml b/inventory/servers.yaml index 81e0cc1..732e535 100644 --- a/inventory/servers.yaml +++ b/inventory/servers.yaml @@ -1,4 +1,8 @@ servers: + - name: CoulombCore + ip: 92.205.130.254 + ssh_user: tegwick + ssh_key: ~/.ssh/id_ops - name: Railiance01 ip: 92.205.62.239 ssh_user: tegwick diff --git a/workplans/RAIL-HO-WP-0004-production-readiness.md b/workplans/RAIL-HO-WP-0004-production-readiness.md index e8980cf..5aa8488 100644 --- a/workplans/RAIL-HO-WP-0004-production-readiness.md +++ b/workplans/RAIL-HO-WP-0004-production-readiness.md @@ -121,8 +121,15 @@ Role tasks: 3. Ensure `/etc/fstab` entry present 4. Set `vm.swappiness=10` via `sysctl` module (persist in `/etc/sysctl.d/`) -**Done when:** `make converge` is idempotent; `free -h` on CoulombCore shows -4 GB swap; `make verify` passes. +**Convergence pattern:** Ansible is not installed on the workstation. Run convergence +directly on CoulombCore (local Ansible, connection=local): +```bash +ssh -i ~/.ssh/id_ops tegwick@92.205.130.254 \ + 'cd ~/railiance-infra && git pull && ansible-playbook ansible/playbooks/bootstrap.yaml -c local -u tegwick --become -l CoulombCore' +``` + +**Done when:** Convergence runs without errors; `free -h` on CoulombCore shows +4 GB swap; Goss verify passes. --- @@ -211,6 +218,8 @@ id: RAIL-HO-WP-0004-T04 status: todo priority: high state_hub_task_id: "4f4196b5-4d84-4648-b470-e6941444ea46" +needs_human: true +intervention_note: "Live Gitea DB migration — Gitea is the only git host. Requires explicit backup verification and operator approval before execution. Risk: data loss if migration fails mid-flight." ``` **Pre-condition:** T03 done and gitea-db cluster healthy. 
@@ -432,6 +441,8 @@ id: RAIL-HO-WP-0004-T09 status: todo priority: medium state_hub_task_id: "d2afe78a-eb51-4ce9-b332-f181323d2370" +needs_human: true +intervention_note: "Requires decisions: final hostname/domain for state-hub, whether to use Gitea container registry or ghcr.io, and approval before data migration from workstation postgres." ``` **Pre-condition:** T04 done (cnpg Gitea DB working); T08 done (deploy sequence @@ -464,6 +475,8 @@ id: RAIL-HO-WP-0004-T10 status: todo priority: low state_hub_task_id: "34d73215-f016-4750-8da5-69f82d63d619" +needs_human: true +intervention_note: "activity-core architecture needs review before packaging — needs confirmation of runtime (Rails/Go/other), whether it uses postgres, and what the migration strategy is for any existing on-node data." ``` **Pre-condition:** T09 done (state-hub on cluster operational).