#!/usr/bin/env python3
"""
build-agent — runs at VM boot.

1. Reads /etc/build-agent.env
2. Detects GHC version
3. Registers (or updates) a capability-catalog entry in the state-hub
4. Opens an autossh reverse tunnel to the workstation
"""

import json
import os
import socket
import subprocess
import sys
import time
import urllib.error
import urllib.request


def load_env(path: str = "/etc/build-agent.env") -> dict:
    """Parse a simple KEY=VALUE environment file.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored. Keys and values are whitespace-stripped, and surrounding
    double quotes are removed from values.

    Args:
        path: Location of the environment file.

    Returns:
        Mapping of keys to string values; empty if the file does not exist.
    """
    env = {}
    try:
        with open(path) as f:
            for line in f:
                line = line.strip()
                if line and not line.startswith('#') and '=' in line:
                    # Split on the first '=' only so values may contain '='.
                    k, _, v = line.partition('=')
                    env[k.strip()] = v.strip().strip('"')
    except FileNotFoundError:
        # A missing config file is not an error: callers fall back to defaults.
        pass
    return env


def get_ghc_version() -> str:
    """Return the version reported by the first working ``ghc`` binary.

    Probes a fixed list of install locations and returns the last
    whitespace-separated token of ``ghc --version`` output.

    Returns:
        The version string, or ``"unknown"`` if no candidate could be run
        successfully or produced no output.
    """
    for path in [
        "/home/build/.ghcup/bin/ghc",
        "/usr/local/bin/ghc",
    ]:
        try:
            r = subprocess.run(
                [path, "--version"],
                capture_output=True,
                text=True,
                timeout=15,
            )
        except (OSError, subprocess.SubprocessError, ValueError):
            # Binary missing / not executable, timed out, or output was not
            # decodable — try the next candidate.
            continue
        tokens = r.stdout.split()
        if r.returncode == 0 and tokens:
            return tokens[-1]
    return "unknown"


def get_local_ip() -> str:
    """Get the primary LAN IP (not loopback).

    Connects a UDP socket towards a public address and reads back the local
    address the OS selected for it.

    Returns:
        The local IPv4 address, or ``"unknown"`` on any socket error.
    """
    try:
        # The context manager guarantees the socket is closed even when
        # connect()/getsockname() raises (e.g. no route at early boot).
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
            s.connect(("8.8.8.8", 80))
            return s.getsockname()[0]
    except OSError:
        return "unknown"


def register(cfg):
    """POST this agent's capability-catalog entry to the state-hub.

    Args:
        cfg: Configuration mapping (as returned by ``load_env``). Reads
            ``STATE_HUB_URL``, ``STATE_HUB_DOMAIN`` and ``REMOTE_PORT``,
            each with a built-in default.

    Returns:
        The decoded JSON response body.

    Raises:
        urllib.error.HTTPError: The state-hub answered with an error status.
        Exception: Any other failure (connection error, invalid JSON, ...)
            is logged and re-raised unchanged.
    """
    # State-hub is always accessed via the forward tunnel (port 18000), never
    # via direct LAN. This matches the CoulombCore remote worker pattern and
    # works regardless of network topology (LAN, VPN, different subnet).
    state_hub = cfg.get("STATE_HUB_URL", "http://127.0.0.1:18000")
    hostname = socket.gethostname()
    domain = cfg.get("STATE_HUB_DOMAIN", "railiance")
    remote_port = cfg.get("REMOTE_PORT", "12222")
    ghc_ver = get_ghc_version()
    local_ip = get_local_ip()

    payload = {
        "domain": domain,
        "capability_type": "haskell-build-agent",
        "title": f"Haskell Build Agent — {hostname}",
        "description": (
            f"GHC {ghc_ver} build sandbox on {hostname} ({local_ip}). "
            f"SSH tunnel port: {remote_port} on workstation."
        ),
        "keywords": [
            "haskell",
            "ghc",
            f"ghc-{ghc_ver}",
            "build-agent",
            "cabal",
            "stack",
            f"host:{hostname}",
            f"tunnel-port:{remote_port}",
        ],
    }

    data = json.dumps(payload).encode()
    req = urllib.request.Request(
        f"{state_hub}/capability-catalog/",
        data=data,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            result = json.loads(resp.read())
    except urllib.error.HTTPError as e:
        # errors="replace" so an undecodable body cannot mask the HTTP error.
        body = e.read().decode(errors="replace")
        print(f"[build-agent] Registration HTTP error {e.code}: {body}", flush=True)
        raise
    except Exception as e:
        print(f"[build-agent] Registration failed: {e}", flush=True)
        raise

    # The POST already succeeded at this point; a response without an "id"
    # must not be reported as a failure (that would make the caller re-POST).
    cap_id = result.get("id", "<unknown>") if isinstance(result, dict) else "<unknown>"
    print(f"[build-agent] Registered capability: {cap_id}", flush=True)
    return result


def open_tunnel(cfg):
    """Run autossh with a reverse SSH tunnel and a forward state-hub tunnel.

    Blocks until autossh exits. If ``SSH_RELAY_HOST`` is unset or empty,
    no tunnel is opened and the function sleeps forever instead.

    Args:
        cfg: Configuration mapping. Reads ``SSH_RELAY_HOST``,
            ``SSH_RELAY_USER``, ``SSH_KEY_PATH`` and ``REMOTE_PORT``.
    """
    relay_host = cfg.get("SSH_RELAY_HOST", "")
    relay_user = cfg.get("SSH_RELAY_USER", "worsch")
    ssh_key = cfg.get("SSH_KEY_PATH", "/home/build/.ssh/id_build")
    remote_port = cfg.get("REMOTE_PORT", "12222")

    if not relay_host:
        print("[build-agent] SSH_RELAY_HOST not set — tunnel disabled", flush=True)
        # Sleep forever so systemd considers service active
        while True:
            time.sleep(3600)

    # NOTE(review): StrictHostKeyChecking=no together with
    # UserKnownHostsFile=/dev/null disables host-key verification, so the
    # relay's identity is never checked. Left as-is; confirm this is intended.
    cmd = [
        "autossh",
        "-M", "0",  # disable autossh monitoring port
        "-o", "ServerAliveInterval=30",
        "-o", "ServerAliveCountMax=3",
        "-o", "ExitOnForwardFailure=yes",
        "-o", "StrictHostKeyChecking=no",
        "-o", "UserKnownHostsFile=/dev/null",
        "-N",
        "-R", f"{remote_port}:localhost:22",  # reverse: workstation → VM SSH
        "-L", "18000:localhost:8000",  # forward: VM → state-hub (port 18000)
        "-i", ssh_key,
        f"{relay_user}@{relay_host}",
    ]
    print(
        f"[build-agent] Opening tunnels: "
        f"-R {remote_port}→local:22, -L 18000→state-hub:8000",
        flush=True,
    )
    subprocess.run(cmd)  # autossh manages reconnects internally


def main():
    """Load configuration, register with the state-hub, then open the tunnel.

    Registration is attempted up to 20 times with a linearly growing delay
    (10s, 20s, ... capped at 60s). The tunnel is opened afterwards whether
    or not registration succeeded.
    """
    cfg = load_env()
    max_attempts = 20

    # Retry registration until state-hub is reachable (network may not be ready)
    for attempt in range(1, max_attempts + 1):
        try:
            register(cfg)
        except Exception:
            # register() has already logged the cause. Only announce and wait
            # for a retry when another attempt will actually follow.
            if attempt < max_attempts:
                wait = min(10 * attempt, 60)
                print(f"[build-agent] Retrying in {wait}s ...", flush=True)
                time.sleep(wait)
        else:
            break
    else:
        print("[build-agent] Registration permanently failed — continuing to tunnel", flush=True)

    open_tunnel(cfg)


if __name__ == "__main__":
    main()