feat(ansible): add swapfile + resource_limits roles; add CoulombCore to inventory
T01: roles/swapfile — idempotent 4GB swapfile, vm.swappiness=10, fstab entry
T02: roles/resource_limits — PAM nproc caps (512/1024), systemd user-1000.slice
memory limits (1500M/512M); templated per-host via host_vars
- inventory/host_vars/CoulombCore.yml — host-specific vars for both roles
- inventory/servers.yaml — add CoulombCore with id_ops SSH key
- inventory_from_yaml.py — load host_vars files into Ansible hostvars
- playbooks/bootstrap.yaml — include swapfile + resource_limits roles
- workplans/WP-0004 — flag T04/T09/T10 needs_human, add CoulombCore-local convergence note
Codifies manual INC-002 hardening. See RAIL-HO-WP-0004-T01/T02.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2
SCOPE.md
2
SCOPE.md
@@ -123,4 +123,4 @@ keywords: [sops, age, secrets, encryption, gitops, key-rotation, credential]
|
|||||||
|
|
||||||
## Notes
|
## Notes
|
||||||
|
|
||||||
Designed for remote execution from HostEurope (92.205.130.254). SSH reverse tunnel required for State Hub access: `ssh -R 8000:127.0.0.1:8000 tegwick@92.205.130.254`.
|
Targets two servers: CoulombCore (92.205.130.254) and Railiance01 (92.205.62.239). State Hub access via ops-bridge — `bridge up state-hub-coulombcore` or `bridge up state-hub-railiance01` from the workstation (see ADR-004).
|
||||||
|
|||||||
14
ansible/inventory/host_vars/CoulombCore.yml
Normal file
14
ansible/inventory/host_vars/CoulombCore.yml
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
---
# CoulombCore (92.205.130.254) — per-host overrides.
# HostEurope machine running the single-node k3s cluster.

# --- swapfile role (T01) ---------------------------------------------------
swap_size_gb: 4
swap_swappiness: 10

# --- resource_limits role (T02) --------------------------------------------
# Caps applied to the agent user so a runaway agent cannot take the node
# down (see INC-002).
resource_limit_user: tegwick
resource_limit_uid: 1000
nproc_soft: 512
nproc_hard: 1024
user_memory_max: '1500M'
user_memory_swap_max: '512M'
|
||||||
@@ -1,5 +1,5 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import json, yaml, subprocess, os, sys, pathlib
|
import json, yaml, subprocess, os, sys, pathlib, glob
|
||||||
|
|
||||||
def load_servers():
|
def load_servers():
|
||||||
with open(os.path.join(os.path.dirname(__file__), '..', 'inventory', 'servers.yaml')) as f:
|
with open(os.path.join(os.path.dirname(__file__), '..', 'inventory', 'servers.yaml')) as f:
|
||||||
@@ -17,6 +17,15 @@ def load_tf_outputs():
|
|||||||
except Exception:
|
except Exception:
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
|
def load_host_vars(name):
    """Load per-host variables from ``inventory/host_vars/<name>.yml|.yaml``.

    The directory is resolved relative to this script
    (``<script_dir>/../inventory/host_vars``). ``.yml`` is tried first and
    ``.yaml`` second, so a file using either extension is picked up.

    Args:
        name: Inventory host name; used verbatim as the file stem.

    Returns:
        dict: The parsed mapping, or ``{}`` when no file exists or the file
        is empty.

    Raises:
        ValueError: If the file parses to something other than a mapping.
            Callers merge the result with ``dict.update``, which would
            otherwise fail with an unrelated-looking error (or silently
            accept a list of pairs).
    """
    host_vars_dir = os.path.join(
        os.path.dirname(__file__), '..', 'inventory', 'host_vars'
    )
    for ext in ('yml', 'yaml'):
        path = os.path.join(host_vars_dir, f'{name}.{ext}')
        if not os.path.isfile(path):
            continue
        with open(path) as f:
            # Empty files parse to None; treat that as "no variables".
            data = yaml.safe_load(f) or {}
        if not isinstance(data, dict):
            raise ValueError(
                f'{path}: host_vars must be a YAML mapping, '
                f'got {type(data).__name__}'
            )
        return data
    return {}
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
server_list = load_servers()
|
server_list = load_servers()
|
||||||
tf = load_tf_outputs()
|
tf = load_tf_outputs()
|
||||||
@@ -25,10 +34,14 @@ def main():
|
|||||||
for s in server_list:
|
for s in server_list:
|
||||||
name = s['name']
|
name = s['name']
|
||||||
host_names.append(name)
|
host_names.append(name)
|
||||||
hostvars[name] = {
|
hvars = {
|
||||||
"ansible_host": tf.get(name) or s.get('ip'),
|
"ansible_host": tf.get(name) or s.get('ip'),
|
||||||
"ansible_user": s.get('ssh_user', 'admin')
|
"ansible_user": s.get('ssh_user', 'admin'),
|
||||||
}
|
}
|
||||||
|
if s.get('ssh_key'):
|
||||||
|
hvars["ansible_ssh_private_key_file"] = s['ssh_key']
|
||||||
|
hvars.update(load_host_vars(name))
|
||||||
|
hostvars[name] = hvars
|
||||||
inv = {
|
inv = {
|
||||||
"all": {"hosts": host_names},
|
"all": {"hosts": host_names},
|
||||||
"_meta": {"hostvars": hostvars}
|
"_meta": {"hostvars": hostvars}
|
||||||
|
|||||||
@@ -7,4 +7,6 @@
|
|||||||
- role: base
|
- role: base
|
||||||
- role: sops_agent
|
- role: sops_agent
|
||||||
- role: custodian_agent # injects ~/.ssh/id_custodian_agent.pub into authorized_keys
|
- role: custodian_agent # injects ~/.ssh/id_custodian_agent.pub into authorized_keys
|
||||||
|
- role: swapfile # provisions swap file (size + swappiness from host_vars)
|
||||||
|
- role: resource_limits # nproc PAM caps + systemd user slice memory limits
|
||||||
# - role: wireguard # enable if you configure WireGuard variables
|
# - role: wireguard # enable if you configure WireGuard variables
|
||||||
|
|||||||
4
ansible/roles/resource_limits/handlers/main.yml
Normal file
4
ansible/roles/resource_limits/handlers/main.yml
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
---
# Handlers for the resource_limits role.

# Notified when the user-slice drop-in changes; asks systemd to re-read its
# unit files and drop-ins.
- name: Reload systemd daemon
  ansible.builtin.systemd:
    daemon_reload: true
|
||||||
35
ansible/roles/resource_limits/tasks/main.yml
Normal file
35
ansible/roles/resource_limits/tasks/main.yml
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
---
# resource_limits role
#
# Writes two files:
#   1. a pam_limits drop-in capping nproc for one user, and
#   2. a systemd drop-in capping memory for that user's user-<uid>.slice.
#
# Inputs (normally supplied through host_vars); defaults are applied inline:
#   resource_limit_user    user name to cap            (tegwick)
#   resource_limit_uid     UID naming the user slice   (1000)
#   nproc_soft             soft nproc limit            (512)
#   nproc_hard             hard nproc limit            (1024)
#   user_memory_max        systemd MemoryMax           (1500M)
#   user_memory_swap_max   systemd MemorySwapMax       (512M)

- name: Set PAM nproc limits
  ansible.builtin.template:
    dest: "/etc/security/limits.d/60-nproc-{{ resource_limit_user | default('tegwick') }}.conf"
    src: nproc-limits.conf.j2
    mode: '0644'
    owner: root
    group: root

# Both slice tasks share one drop-in directory, so compute it once.
- name: Manage systemd user slice drop-in
  vars:
    _resource_limits_dropin_dir: "/etc/systemd/system/user-{{ resource_limit_uid | default(1000) }}.slice.d"
  block:
    - name: Ensure systemd user slice override directory
      ansible.builtin.file:
        path: "{{ _resource_limits_dropin_dir }}"
        state: directory
        mode: '0755'
        owner: root
        group: root

    - name: Set systemd user slice memory limits
      ansible.builtin.template:
        dest: "{{ _resource_limits_dropin_dir }}/limits.conf"
        src: user-slice-limits.conf.j2
        mode: '0644'
        owner: root
        group: root
      # Handler name must match handlers/main.yml exactly.
      notify: Reload systemd daemon
|
||||||
@@ -0,0 +1,6 @@
|
|||||||
|
{# pam_limits drop-in rendered by the resource_limits role.
   Inputs: resource_limit_user, nproc_soft, nproc_hard (defaults inline).
   This comment and the set-statement below render no output. -#}
{% set limit_user = resource_limit_user | default('tegwick') %}
# Managed by Ansible (resource_limits role)
# Caps process count for {{ limit_user }}
# to prevent runaway agents from exhausting the kernel PID table.
# See INC-002 (2026-03-26) for root cause context.
{{ limit_user }} soft nproc {{ nproc_soft | default(512) }}
{{ limit_user }} hard nproc {{ nproc_hard | default(1024) }}
|
||||||
@@ -0,0 +1,7 @@
|
|||||||
|
{# systemd drop-in for user-<uid>.slice rendered by the resource_limits role.
   Inputs: resource_limit_uid, user_memory_max, user_memory_swap_max
   (defaults inline). This comment and the set-statement render no output. -#}
{% set slice_uid = resource_limit_uid | default(1000) %}
# Managed by Ansible (resource_limits role)
# Caps memory for all processes in the user-{{ slice_uid }}.slice.
# Prevents a single user's agent workload from OOM-killing the entire node.
# See INC-002 (2026-03-26) for root cause context.
[Slice]
MemoryMax={{ user_memory_max | default('1500M') }}
MemorySwapMax={{ user_memory_swap_max | default('512M') }}
|
||||||
59
ansible/roles/swapfile/tasks/main.yml
Normal file
59
ansible/roles/swapfile/tasks/main.yml
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
---
# swapfile role — provisions and activates a swap file at /swapfile.
#
# Variables (set per-host in host_vars):
#   swap_size_gb:    size in gigabytes (default: 4)
#   swap_swappiness: vm.swappiness value (default: 10)
#
# Formatting and activation are keyed on whether /swapfile is currently an
# ACTIVE swap area, not on whether the file existed when the run started.
# A run that dies between fallocate and swapon is therefore repaired by the
# next run, and a genuine swapon failure is reported instead of ignored.
#
# NOTE(review): swapon(8) warns that fallocate-created files may be rejected
# on some filesystems — confirm the root filesystem on each target host.

- name: Check whether swapfile already exists
  ansible.builtin.stat:
    path: /swapfile
  register: swapfile_stat

# This role never resizes an existing swapfile (that needs swapoff, which is
# unsafe under memory pressure); surface the mismatch instead of hiding it.
- name: Warn when existing swapfile size differs from swap_size_gb
  ansible.builtin.debug:
    msg: >-
      /swapfile is {{ swapfile_stat.stat.size }} bytes but swap_size_gb
      requests {{ (swap_size_gb | default(4)) | int }}G; this role does not
      resize an existing swapfile.
  when:
    - swapfile_stat.stat.exists
    - swapfile_stat.stat.size != ((swap_size_gb | default(4)) | int) * 1024 * 1024 * 1024

- name: Allocate swapfile (fallocate)
  ansible.builtin.command:
    cmd: "fallocate -l {{ (swap_size_gb | default(4)) | int }}G /swapfile"
    # `creates` makes this a no-op once the file exists.
    creates: /swapfile

- name: Set swapfile permissions
  ansible.builtin.file:
    path: /swapfile
    owner: root
    group: root
    mode: '0600'

- name: List active swap areas
  ansible.builtin.command:
    cmd: swapon --show=NAME --noheadings
  register: swapfile_active
  changed_when: false
  # Read-only query; run it in check mode too so the conditions below resolve.
  check_mode: false

- name: Format swapfile
  ansible.builtin.command:
    cmd: mkswap /swapfile
  when: "'/swapfile' not in (swapfile_active.stdout_lines | map('trim') | list)"
  changed_when: true

- name: Enable swapfile
  ansible.builtin.command:
    cmd: swapon /swapfile
  when: "'/swapfile' not in (swapfile_active.stdout_lines | map('trim') | list)"
  changed_when: true

- name: Ensure swapfile in /etc/fstab
  ansible.builtin.lineinfile:
    path: /etc/fstab
    # Anchor on the following whitespace so e.g. /swapfile2 is not rewritten.
    regexp: '^/swapfile\s'
    line: '/swapfile none swap sw 0 0'
    state: present

- name: Set vm.swappiness at runtime
  ansible.posix.sysctl:
    name: vm.swappiness
    value: "{{ swap_swappiness | default(10) }}"
    state: present
    reload: true

- name: Persist vm.swappiness across reboots
  ansible.builtin.copy:
    dest: /etc/sysctl.d/60-swappiness.conf
    owner: root
    group: root
    mode: '0644'
    content: |
      # Managed by Ansible (swapfile role)
      vm.swappiness = {{ swap_swappiness | default(10) }}
|
||||||
@@ -1,4 +1,8 @@
|
|||||||
servers:
|
servers:
|
||||||
|
- name: CoulombCore
|
||||||
|
ip: 92.205.130.254
|
||||||
|
ssh_user: tegwick
|
||||||
|
ssh_key: ~/.ssh/id_ops
|
||||||
- name: Railiance01
|
- name: Railiance01
|
||||||
ip: 92.205.62.239
|
ip: 92.205.62.239
|
||||||
ssh_user: tegwick
|
ssh_user: tegwick
|
||||||
|
|||||||
@@ -121,8 +121,15 @@ Role tasks:
|
|||||||
3. Ensure `/etc/fstab` entry present
|
3. Ensure `/etc/fstab` entry present
|
||||||
4. Set `vm.swappiness=10` via `sysctl` module (persist in `/etc/sysctl.d/`)
|
4. Set `vm.swappiness=10` via `sysctl` module (persist in `/etc/sysctl.d/`)
|
||||||
|
|
||||||
**Done when:** `make converge` is idempotent; `free -h` on CoulombCore shows
|
**Convergence pattern:** Ansible is not installed on the workstation. Run convergence
|
||||||
4 GB swap; `make verify` passes.
|
directly on CoulombCore (local Ansible, connection=local):
|
||||||
|
```bash
|
||||||
|
ssh -i ~/.ssh/id_ops tegwick@92.205.130.254 \
|
||||||
|
'cd ~/railiance-infra && git pull && ansible-playbook ansible/playbooks/bootstrap.yaml -c local -u tegwick --become -l CoulombCore'
|
||||||
|
```
|
||||||
|
|
||||||
|
**Done when:** Convergence runs without errors; `free -h` on CoulombCore shows
|
||||||
|
4 GB swap; Goss verify passes.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -211,6 +218,8 @@ id: RAIL-HO-WP-0004-T04
|
|||||||
status: todo
|
status: todo
|
||||||
priority: high
|
priority: high
|
||||||
state_hub_task_id: "4f4196b5-4d84-4648-b470-e6941444ea46"
|
state_hub_task_id: "4f4196b5-4d84-4648-b470-e6941444ea46"
|
||||||
|
needs_human: true
|
||||||
|
intervention_note: "Live Gitea DB migration — Gitea is the only git host. Requires explicit backup verification and operator approval before execution. Risk: data loss if migration fails mid-flight."
|
||||||
```
|
```
|
||||||
|
|
||||||
**Pre-condition:** T03 done and gitea-db cluster healthy.
|
**Pre-condition:** T03 done and gitea-db cluster healthy.
|
||||||
@@ -432,6 +441,8 @@ id: RAIL-HO-WP-0004-T09
|
|||||||
status: todo
|
status: todo
|
||||||
priority: medium
|
priority: medium
|
||||||
state_hub_task_id: "d2afe78a-eb51-4ce9-b332-f181323d2370"
|
state_hub_task_id: "d2afe78a-eb51-4ce9-b332-f181323d2370"
|
||||||
|
needs_human: true
|
||||||
|
intervention_note: "Requires decisions: final hostname/domain for state-hub, whether to use Gitea container registry or ghcr.io, and approval before data migration from workstation postgres."
|
||||||
```
|
```
|
||||||
|
|
||||||
**Pre-condition:** T04 done (cnpg Gitea DB working); T08 done (deploy sequence
|
**Pre-condition:** T04 done (cnpg Gitea DB working); T08 done (deploy sequence
|
||||||
@@ -464,6 +475,8 @@ id: RAIL-HO-WP-0004-T10
|
|||||||
status: todo
|
status: todo
|
||||||
priority: low
|
priority: low
|
||||||
state_hub_task_id: "34d73215-f016-4750-8da5-69f82d63d619"
|
state_hub_task_id: "34d73215-f016-4750-8da5-69f82d63d619"
|
||||||
|
needs_human: true
|
||||||
|
intervention_note: "activity-core architecture needs review before packaging — needs confirmation of runtime (Rails/Go/other), whether it uses postgres, and what the migration strategy is for any existing on-node data."
|
||||||
```
|
```
|
||||||
|
|
||||||
**Pre-condition:** T09 done (state-hub on cluster operational).
|
**Pre-condition:** T09 done (state-hub on cluster operational).
|
||||||
|
|||||||
Reference in New Issue
Block a user