Files
the-custodian/infra/build-machines/haskell/files/build-agent.py
tegwick 9bc761c2b5 feat(railiance): implement CUST-WP-0032 Haskell build machine infra
Packer build definition, cloud-init autoinstall, GHCup toolchain script,
boot-time registration agent (state-hub + autossh dual tunnel), systemd
unit, key injection, remote-build Makefile, smoke test, and deployment
README. All 15 tasks complete.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-20 12:01:30 +02:00

149 lines
4.9 KiB
Python
Executable File

#!/usr/bin/env python3
"""
build-agent — runs at VM boot.
1. Reads /etc/build-agent.env
2. Detects GHC version
3. Registers (or updates) a capability-catalog entry in the state-hub
4. Opens autossh tunnels to the workstation: a reverse tunnel to this VM's
   SSH port and a forward tunnel to the state-hub
"""
import json
import os
import socket
import subprocess
import sys
import threading
import time
import urllib.error
import urllib.request
def load_env(path="/etc/build-agent.env"):
    """Parse a ``KEY=VALUE`` environment file into a dict.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped. Keys and values are stripped of surrounding whitespace. A value
    wrapped in a matching pair of single quotes has that pair removed;
    otherwise any leading/trailing double quotes are removed.

    Args:
        path: Location of the env file.

    Returns:
        A ``dict`` mapping each key to its string value. Empty when the file
        does not exist.
    """
    env = {}
    try:
        # Decode explicitly as UTF-8 so parsing does not depend on the locale
        # the process happens to be started with.
        with open(path, encoding="utf-8") as f:
            for raw in f:
                line = raw.strip()
                if not line or line.startswith("#"):
                    continue
                key, sep, value = line.partition("=")
                key = key.strip()
                if not sep or not key:
                    # No assignment, or a line such as "=value" with no key.
                    continue
                value = value.strip()
                if len(value) >= 2 and value[0] == value[-1] == "'":
                    value = value[1:-1]
                else:
                    value = value.strip('"')
                env[key] = value
    except FileNotFoundError:
        # A missing file is not an error: callers fall back to defaults.
        pass
    return env
def get_ghc_version(candidates=None):
    """Return the version reported by the first working GHC binary.

    Each candidate is run as ``<path> --version``; the last
    whitespace-separated word of its standard output is taken as the version.

    Args:
        candidates: Iterable of executable paths to try, in order. Defaults to
            the GHCup location under ``/home/build`` followed by
            ``/usr/local/bin/ghc``.

    Returns:
        The version string, or ``"unknown"`` when no candidate could be run
        successfully or none printed anything.
    """
    if candidates is None:
        candidates = (
            "/home/build/.ghcup/bin/ghc",
            "/usr/local/bin/ghc",
        )
    for ghc in candidates:
        try:
            proc = subprocess.run(
                [ghc, "--version"],
                capture_output=True,
                text=True,
                timeout=15,
            )
        except (OSError, subprocess.SubprocessError, ValueError):
            # Binary missing / not executable, timed out, or produced output
            # that could not be decoded: fall through to the next candidate.
            continue
        words = proc.stdout.split()
        if proc.returncode == 0 and words:
            return words[-1]
    return "unknown"
def get_local_ip(probe_addr=("8.8.8.8", 80)):
    """Get the primary LAN IP (not loopback).

    Connects a UDP socket to ``probe_addr`` and reads back the local address
    chosen for that destination. Connecting a datagram socket only fixes its
    default peer, so this is a routing lookup rather than a data exchange.

    Args:
        probe_addr: ``(host, port)`` used as the routing probe. Pass an
            address on the local network when there is no route to the
            default.

    Returns:
        The local IPv4 address as a string, or ``"unknown"`` when the socket
        cannot be created or connected.
    """
    try:
        # The context manager closes the socket even when connect() raises,
        # which the manual close() it replaces did not.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as probe:
            probe.connect(probe_addr)
            return probe.getsockname()[0]
    except OSError:
        return "unknown"
def register(cfg):
    """POST this machine's capability-catalog entry to the state-hub.

    The entry describes the host name, detected GHC version, local IP and the
    workstation-side tunnel port.

    Args:
        cfg: Mapping of settings (as returned by ``load_env``). Reads
            ``STATE_HUB_URL``, ``STATE_HUB_DOMAIN`` and ``REMOTE_PORT``, each
            with a built-in default.

    Returns:
        The JSON-decoded response body from the state-hub.

    Raises:
        urllib.error.HTTPError: The state-hub answered with an error status.
        Exception: Any other failure while sending the request or decoding
            the response (logged, then re-raised unchanged).
    """
    # State-hub is always accessed via the forward tunnel (port 18000), never
    # via direct LAN. This matches the CoulombCore remote worker pattern and
    # works regardless of network topology (LAN, VPN, different subnet).
    # A trailing slash is dropped so the joined URL has no "//" in its path.
    state_hub = cfg.get("STATE_HUB_URL", "http://127.0.0.1:18000").rstrip("/")
    hostname = socket.gethostname()
    domain = cfg.get("STATE_HUB_DOMAIN", "railiance")
    remote_port = cfg.get("REMOTE_PORT", "12222")
    ghc_ver = get_ghc_version()
    local_ip = get_local_ip()

    payload = {
        "domain": domain,
        "capability_type": "haskell-build-agent",
        "title": f"Haskell Build Agent — {hostname}",
        "description": (
            f"GHC {ghc_ver} build sandbox on {hostname} ({local_ip}). "
            f"SSH tunnel port: {remote_port} on workstation."
        ),
        "keywords": [
            "haskell", "ghc", f"ghc-{ghc_ver}",
            "build-agent", "cabal", "stack",
            f"host:{hostname}", f"tunnel-port:{remote_port}",
        ],
    }
    req = urllib.request.Request(
        f"{state_hub}/capability-catalog/",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
        method="POST",
    )

    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            result = json.loads(resp.read())
    except urllib.error.HTTPError as e:
        # errors="replace" keeps a non-UTF-8 error body from raising a
        # second exception that would mask the HTTP error itself.
        body = e.read().decode(errors="replace")
        print(f"[build-agent] Registration HTTP error {e.code}: {body}", flush=True)
        raise
    except Exception as e:
        print(f"[build-agent] Registration failed: {e}", flush=True)
        raise

    # The POST has already succeeded at this point. A response without an
    # "id" must not raise, otherwise the caller's retry loop would POST the
    # same entry again.
    entry_id = result.get("id", "<no id>") if isinstance(result, dict) else "<no id>"
    print(f"[build-agent] Registered capability: {entry_id}", flush=True)
    return result
def open_tunnel(cfg):
    """Run autossh with the reverse and forward tunnels; block while it runs.

    Args:
        cfg: Mapping of settings (as returned by ``load_env``). Reads
            ``SSH_RELAY_HOST``, ``SSH_RELAY_USER``, ``SSH_KEY_PATH`` and
            ``REMOTE_PORT``.

    Returns:
        ``None`` once the autossh process exits. When ``SSH_RELAY_HOST`` is
        empty or unset the function never returns.
    """
    relay_host = cfg.get("SSH_RELAY_HOST", "")
    relay_user = cfg.get("SSH_RELAY_USER", "worsch")
    ssh_key = cfg.get("SSH_KEY_PATH", "/home/build/.ssh/id_build")
    remote_port = cfg.get("REMOTE_PORT", "12222")

    if not relay_host:
        print("[build-agent] SSH_RELAY_HOST not set — tunnel disabled", flush=True)
        # Sleep forever so systemd considers service active
        while True:
            time.sleep(3600)

    cmd = [
        "autossh",
        "-M", "0",                              # disable autossh monitoring port
        "-o", "ServerAliveInterval=30",
        "-o", "ServerAliveCountMax=3",
        "-o", "ExitOnForwardFailure=yes",
        # NOTE(review): these two options turn off host-key verification, so
        # the relay's identity is never checked. Consider pinning its key.
        "-o", "StrictHostKeyChecking=no",
        "-o", "UserKnownHostsFile=/dev/null",
        "-N",
        "-R", f"{remote_port}:localhost:22",    # reverse: workstation → VM SSH
        "-L", "18000:localhost:8000",           # forward: VM → state-hub (port 18000)
        "-i", ssh_key,
        f"{relay_user}@{relay_host}",
    ]
    print(
        f"[build-agent] Opening tunnels: "
        f"-R {remote_port}→local:22, -L 18000→state-hub:8000",
        flush=True,
    )

    # By default autossh gives up, rather than retrying, when the very first
    # ssh attempt fails quickly -- which is likely at boot, before the network
    # is usable. A gate time of 0 makes it retry from the first attempt. An
    # AUTOSSH_GATETIME already present in the environment is left untouched.
    env = dict(os.environ)
    env.setdefault("AUTOSSH_GATETIME", "0")
    subprocess.run(cmd, env=env)  # autossh manages reconnects internally
def _register_with_retry(cfg, attempts=20):
    """Call ``register(cfg)`` until it succeeds or ``attempts`` are used up."""
    for attempt in range(attempts):
        try:
            register(cfg)
            return
        except Exception:
            # register() has already logged the cause; back off 10s, 20s, ...
            # capped at 60s between attempts.
            wait = min(10 * (attempt + 1), 60)
            print(f"[build-agent] Retrying in {wait}s ...", flush=True)
            time.sleep(wait)
    print("[build-agent] Registration permanently failed — tunnel remains open",
          flush=True)


def main():
    """Load the configuration, register in the background, run the tunnel.

    Registration reaches the state-hub through the forward tunnel by default,
    and that tunnel only exists while ``open_tunnel`` is running. Registration
    therefore runs in a background thread that keeps retrying until the tunnel
    is up (or the network is ready), while the main thread blocks in
    ``open_tunnel``.
    """
    cfg = load_env()
    # daemon=True: a still-retrying registration must not keep the process
    # alive once open_tunnel() has returned.
    threading.Thread(
        target=_register_with_retry,
        args=(cfg,),
        name="build-agent-register",
        daemon=True,
    ).start()
    open_tunnel(cfg)
# Run the agent only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()