Packer build definition, cloud-init autoinstall, GHCup toolchain script, boot-time registration agent (state-hub + autossh dual tunnel), systemd unit, key injection, remote-build Makefile, smoke test, and deployment README. All 15 tasks complete. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
149 lines
4.9 KiB
Python
Executable File
149 lines
4.9 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
build-agent — runs at VM boot.
|
|
1. Reads /etc/build-agent.env
|
|
2. Detects GHC version
|
|
3. Registers (or updates) a capability-catalog entry in the state-hub
|
|
4. Opens autossh tunnels to the workstation (reverse SSH + forward state-hub)
|
|
"""
|
|
import os, json, socket, subprocess, time, sys
|
|
import urllib.request, urllib.error
|
|
|
|
def load_env(path="/etc/build-agent.env"):
    """Parse a simple KEY=VALUE environment file into a dict.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    ignored.  Keys and values are whitespace-trimmed; only the first ``=``
    separates key from value, so values may themselves contain ``=``.
    Surrounding double quotes are removed from values, and so is a matching
    pair of single quotes (``KEY='value'``).

    Args:
        path: location of the environment file.

    Returns:
        Mapping of key to value; empty if the file does not exist.
    """
    env = {}
    try:
        # Explicit encoding: do not depend on whatever locale is active at boot.
        with open(path, encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#') or '=' not in line:
                    continue
                key, _, raw = line.partition('=')
                value = raw.strip()
                if len(value) >= 2 and value[0] == value[-1] == "'":
                    # 'single quoted' value: drop exactly the wrapping pair.
                    value = value[1:-1]
                else:
                    value = value.strip('"')
                env[key.strip()] = value
    except FileNotFoundError:
        # A missing file is not an error: callers fall back to defaults.
        pass
    return env
|
|
|
|
def get_ghc_version():
    """Return the version reported by the first working GHC binary.

    Each candidate path is run with ``--version``; the last
    whitespace-separated word of its standard output is taken as the
    version.  A candidate that is missing, times out, exits non-zero or
    prints nothing is skipped.

    Returns:
        The version string, or "unknown" if no candidate produced one.
    """
    candidates = (
        "/home/build/.ghcup/bin/ghc",
        "/usr/local/bin/ghc",
    )
    for ghc_bin in candidates:
        try:
            proc = subprocess.run(
                [ghc_bin, "--version"],
                capture_output=True,
                text=True,
                timeout=15,
            )
            if proc.returncode != 0:
                continue
            words = proc.stdout.strip().split()
            return words[-1]
        except Exception:
            # Best effort: any failure just means "try the next candidate".
            pass
    return "unknown"
|
|
|
|
def get_local_ip():
    """Get the primary LAN IP (not loopback).

    A UDP socket is connected to 8.8.8.8:80 and the local address chosen
    for that socket is read back with ``getsockname()``.

    Returns:
        The local IPv4 address as a string, or "unknown" if the socket
        cannot be created or connected (e.g. no route yet at boot).
    """
    try:
        # The context manager closes the socket on every path, including
        # when connect() or getsockname() raises.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
            sock.connect(("8.8.8.8", 80))
            return sock.getsockname()[0]
    except OSError:
        return "unknown"
|
|
|
|
def register(cfg):
    """POST this VM's capability-catalog entry to the state-hub.

    Args:
        cfg: settings mapping (as produced by ``load_env``).  Keys read:
            ``STATE_HUB_URL`` (default ``http://127.0.0.1:18000``),
            ``STATE_HUB_DOMAIN`` (default ``railiance``) and
            ``REMOTE_PORT`` (default ``12222``).

    Returns:
        The decoded JSON body of the state-hub response.

    Raises:
        urllib.error.HTTPError: the state-hub answered with an error status.
        Exception: any other failure while sending the request or decoding
            the response; it is logged and re-raised so the caller can retry.
    """
    # State-hub is always accessed via the forward tunnel (port 18000), never
    # via direct LAN. This matches the CoulombCore remote worker pattern and
    # works regardless of network topology (LAN, VPN, different subnet).
    # A trailing slash in the configured URL is dropped so the request path
    # does not become "//capability-catalog/".
    state_hub = cfg.get("STATE_HUB_URL", "http://127.0.0.1:18000").rstrip("/")
    hostname = socket.gethostname()
    domain = cfg.get("STATE_HUB_DOMAIN", "railiance")
    remote_port = cfg.get("REMOTE_PORT", "12222")
    ghc_ver = get_ghc_version()
    local_ip = get_local_ip()

    payload = {
        "domain": domain,
        "capability_type": "haskell-build-agent",
        "title": f"Haskell Build Agent — {hostname}",
        "description": (
            f"GHC {ghc_ver} build sandbox on {hostname} ({local_ip}). "
            f"SSH tunnel port: {remote_port} on workstation."
        ),
        "keywords": [
            "haskell", "ghc", f"ghc-{ghc_ver}",
            "build-agent", "cabal", "stack",
            f"host:{hostname}", f"tunnel-port:{remote_port}",
        ],
    }

    data = json.dumps(payload).encode()
    req = urllib.request.Request(
        f"{state_hub}/capability-catalog/",
        data=data,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=15) as resp:
            result = json.loads(resp.read())
    except urllib.error.HTTPError as e:
        # errors="replace": a non-UTF-8 error body must not mask the HTTP error.
        body = e.read().decode(errors="replace")
        print(f"[build-agent] Registration HTTP error {e.code}: {body}", flush=True)
        raise
    except Exception as e:
        print(f"[build-agent] Registration failed: {e}", flush=True)
        raise

    # Reported outside the try block: the POST has already succeeded here, so
    # a response without an "id" field must not be turned into a failure (the
    # caller would retry and POST the same entry again).
    if isinstance(result, dict):
        entry_id = result.get("id", "<no id in response>")
    else:
        entry_id = "<no id in response>"
    print(f"[build-agent] Registered capability: {entry_id}", flush=True)
    return result
|
|
|
|
def open_tunnel(cfg):
    """Run autossh in the foreground to hold both SSH tunnels open.

    A single SSH connection to the relay carries two forwards:
      * ``-R REMOTE_PORT:localhost:22`` — relay port to this VM's SSH port
      * ``-L 18000:localhost:8000``     — local port 18000 to port 8000 as
        seen from the relay (the state-hub)

    Args:
        cfg: settings mapping.  Keys read: ``SSH_RELAY_HOST`` (default empty),
            ``SSH_RELAY_USER`` (default ``worsch``), ``SSH_KEY_PATH``
            (default ``/home/build/.ssh/id_build``) and ``REMOTE_PORT``
            (default ``12222``).

    Blocks until autossh exits.  If ``SSH_RELAY_HOST`` is empty the function
    never returns: it sleeps forever so the service stays active.
    """
    relay_host = cfg.get("SSH_RELAY_HOST", "")
    relay_user = cfg.get("SSH_RELAY_USER", "worsch")
    ssh_key = cfg.get("SSH_KEY_PATH", "/home/build/.ssh/id_build")
    remote_port = cfg.get("REMOTE_PORT", "12222")

    if not relay_host:
        print("[build-agent] SSH_RELAY_HOST not set — tunnel disabled", flush=True)
        # Sleep forever so systemd considers service active
        while True:
            time.sleep(3600)

    # NOTE(security): host-key verification is switched off below
    # (StrictHostKeyChecking=no + UserKnownHostsFile=/dev/null), so the relay's
    # identity is never checked.  Left unchanged to keep existing deployments
    # working — review whether a pinned known_hosts file can be provisioned.
    cmd = [
        "autossh",
        "-M", "0",                            # disable autossh monitoring port
        "-o", "ServerAliveInterval=30",
        "-o", "ServerAliveCountMax=3",
        "-o", "ExitOnForwardFailure=yes",
        "-o", "StrictHostKeyChecking=no",
        "-o", "UserKnownHostsFile=/dev/null",
        "-N",
        "-R", f"{remote_port}:localhost:22",  # reverse: workstation → VM SSH
        "-L", "18000:localhost:8000",         # forward: VM → state-hub (port 18000)
        "-i", ssh_key,
        f"{relay_user}@{relay_host}",
    ]

    # With autossh's default gate time, an ssh process that fails on its very
    # first run (likely at boot, before the network is up) makes autossh exit
    # instead of retrying.  Gate time 0 makes it retry from the first attempt.
    # setdefault keeps any value the operator already exported.
    env = dict(os.environ)
    env.setdefault("AUTOSSH_GATETIME", "0")

    print(
        f"[build-agent] Opening tunnels: "
        f"-R {remote_port}→local:22, -L 18000→state-hub:8000",
        flush=True,
    )
    completed = subprocess.run(cmd, env=env)  # autossh manages reconnects internally
    # Reaching this line means the tunnel is gone; record why for the journal.
    print(
        f"[build-agent] autossh exited with status {completed.returncode}",
        flush=True,
    )
|
|
|
|
def main():
    """Boot entry point: open the tunnels, register, then follow the tunnel.

    The tunnel is started first, in a background thread, because the default
    state-hub URL (127.0.0.1:18000) is only reachable through the forward
    tunnel that ``open_tunnel`` sets up, and ``open_tunnel`` blocks for as
    long as the tunnel lives.  Registration is then retried up to 20 times
    with a growing delay (10s steps, capped at 60s).  Afterwards the function
    waits for the tunnel to end; an exception raised by ``open_tunnel`` is
    re-raised here so the process still fails visibly.
    """
    import threading  # only needed here; kept local to the entry point

    cfg = load_env()

    tunnel_failure = []

    def _run_tunnel():
        # Capture the exception so it can be re-raised in the main thread.
        try:
            open_tunnel(cfg)
        except Exception as exc:
            tunnel_failure.append(exc)

    # daemon=True: a crash in the main thread must not leave the process
    # hanging on the tunnel thread.
    tunnel = threading.Thread(
        target=_run_tunnel, name="build-agent-tunnel", daemon=True
    )
    tunnel.start()

    # Retry registration until state-hub is reachable (network may not be ready)
    for attempt in range(20):
        try:
            register(cfg)
            break
        except Exception:
            wait = min(10 * (attempt + 1), 60)
            print(f"[build-agent] Retrying in {wait}s ...", flush=True)
            # Wait on the tunnel thread instead of sleeping, so a tunnel that
            # has exited is noticed immediately.
            tunnel.join(timeout=wait)
            if not tunnel.is_alive():
                print("[build-agent] Tunnel exited — giving up on registration",
                      flush=True)
                break
    else:
        print("[build-agent] Registration permanently failed — continuing to tunnel",
              flush=True)

    tunnel.join()
    if tunnel_failure:
        raise tunnel_failure[0]
|
|
|
|
# Script entry point: run the agent only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|