feat: implement OpsBridge CLI (BRIDGE-WP-0001)

Full TDD implementation of the `bridge` CLI tool covering all phases
from BRIDGE-WP-0001: project scaffolding, config loading, state
management, audit logging, health checks, tunnel lifecycle manager, and
all CLI commands (up/down/restart/status/logs). 77 tests, all green.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-12 01:40:08 +00:00
parent 2c7c440ea7
commit a7eaf59ced
18 changed files with 1803 additions and 0 deletions

0
src/bridge/__init__.py Normal file
View File

65
src/bridge/audit.py Normal file
View File

@@ -0,0 +1,65 @@
"""Audit logging for OpsBridge lifecycle events."""
from __future__ import annotations
import json
from datetime import datetime, timezone
from enum import Enum
from pathlib import Path
from typing import Any, Dict, List, Optional
class AuditEvent(str, Enum):
BRIDGE_STARTED = "bridge_started"
BRIDGE_CONNECTED = "bridge_connected"
BRIDGE_DISCONNECTED = "bridge_disconnected"
BRIDGE_RECONNECTING = "bridge_reconnecting"
HEALTH_CHECK_FAILED = "health_check_failed"
HEALTH_CHECK_RECOVERED = "health_check_recovered"
BRIDGE_STOPPED = "bridge_stopped"
def _default_state_dir() -> Path:
return Path.home() / ".local" / "state" / "bridge"
class AuditLogger:
def __init__(self, state_dir: Optional[Path] = None):
self._dir = Path(state_dir) if state_dir else _default_state_dir()
def _log_path(self, tunnel: str) -> Path:
return self._dir / f"{tunnel}.log"
def log(
self,
tunnel: str,
event: AuditEvent,
actor: str,
actor_class: str,
detail: str = "",
) -> None:
self._dir.mkdir(parents=True, exist_ok=True)
entry: Dict[str, Any] = {
"timestamp": datetime.now(timezone.utc).isoformat(),
"tunnel": tunnel,
"actor": actor,
"actor_class": actor_class,
"event": event.value,
}
if detail:
entry["detail"] = detail
with self._log_path(tunnel).open("a") as f:
f.write(json.dumps(entry) + "\n")
def read_events(self, tunnel: str) -> List[Dict[str, Any]]:
path = self._log_path(tunnel)
if not path.exists():
return []
events = []
for line in path.read_text().splitlines():
line = line.strip()
if line:
try:
events.append(json.loads(line))
except json.JSONDecodeError:
pass
return events

219
src/bridge/cli.py Normal file
View File

@@ -0,0 +1,219 @@
"""CLI for OpsBridge — bridge command."""
from __future__ import annotations
import json
import os
import sys
from pathlib import Path
from typing import Optional
import typer
from bridge.audit import AuditLogger
from bridge.config import ConfigError, load_config
from bridge.manager import TunnelManager
from bridge.models import BridgeState
from bridge.state import StateManager
app = typer.Typer(
name="bridge",
help="OpsBridge — SSH reverse tunnel lifecycle manager.",
no_args_is_help=True,
)
def _state_dir() -> Path:
return Path(os.environ.get("BRIDGE_STATE_DIR", str(Path.home() / ".local" / "state" / "bridge")))
def _load_or_exit():
try:
return load_config()
except ConfigError as e:
typer.echo(f"Error: {e}", err=True)
raise typer.Exit(1)
def _require_tunnel(cfg, name: str):
if name not in cfg.tunnels:
typer.echo(f"Error: tunnel '{name}' not found in config", err=True)
raise typer.Exit(1)
return cfg.tunnels[name]
@app.command()
def up(
tunnel: Optional[str] = typer.Argument(None, help="Tunnel name (omit for all)"),
):
"""Start one or all tunnels."""
cfg = _load_or_exit()
sd = _state_dir()
names = [tunnel] if tunnel else list(cfg.tunnels.keys())
if tunnel:
_require_tunnel(cfg, tunnel)
any_already_running = False
for name in names:
tcfg = cfg.tunnels[name]
mgr = TunnelManager(tcfg, state_dir=sd)
if mgr.is_running():
typer.echo(f"Tunnel '{name}' is already running.")
any_already_running = True
else:
mgr.start()
typer.echo(f"Started tunnel '{name}'.")
if any_already_running and len(names) == 1:
raise typer.Exit(2)
@app.command()
def down(
tunnel: Optional[str] = typer.Argument(None, help="Tunnel name (omit for all)"),
):
"""Stop one or all tunnels."""
cfg = _load_or_exit()
sd = _state_dir()
names = [tunnel] if tunnel else list(cfg.tunnels.keys())
if tunnel:
_require_tunnel(cfg, tunnel)
any_not_running = False
for name in names:
tcfg = cfg.tunnels[name]
mgr = TunnelManager(tcfg, state_dir=sd)
if not mgr.is_running():
typer.echo(f"Tunnel '{name}' is not running.")
any_not_running = True
else:
mgr.stop()
typer.echo(f"Stopped tunnel '{name}'.")
if any_not_running and len(names) == 1:
raise typer.Exit(2)
@app.command()
def restart(
tunnel: Optional[str] = typer.Argument(None, help="Tunnel name (omit for all)"),
):
"""Restart one or all tunnels."""
cfg = _load_or_exit()
sd = _state_dir()
names = [tunnel] if tunnel else list(cfg.tunnels.keys())
if tunnel:
_require_tunnel(cfg, tunnel)
for name in names:
tcfg = cfg.tunnels[name]
mgr = TunnelManager(tcfg, state_dir=sd)
mgr.stop()
mgr.start()
typer.echo(f"Restarted tunnel '{name}'.")
@app.command()
def status(
as_json: bool = typer.Option(False, "--json", help="Output as JSON"),
):
"""Show status of all tunnels."""
cfg = _load_or_exit()
sd = _state_dir()
state_mgr = StateManager(state_dir=sd)
rows = []
for name, tcfg in cfg.tunnels.items():
state = state_mgr.read_state(name)
pid = state_mgr.read_pid(name)
rows.append({
"tunnel": name,
"state": state.value,
"actor": tcfg.actor,
"host": tcfg.host,
"pid": pid,
"uptime": None, # future: track start time
"health": None, # future: last health check result
})
if as_json:
typer.echo(json.dumps(rows, indent=2))
else:
_print_status_table(rows)
def _print_status_table(rows):
headers = ["TUNNEL", "STATE", "ACTOR", "HOST", "PID"]
col_widths = [max(len(h), max((len(str(r.get(h.lower(), "") or "")) for r in rows), default=0)) for h in headers]
def _fmt_row(vals):
return " ".join(str(v).ljust(w) for v, w in zip(vals, col_widths))
typer.echo(_fmt_row(headers))
typer.echo(_fmt_row(["-" * w for w in col_widths]))
for row in rows:
typer.echo(_fmt_row([
row["tunnel"],
row["state"],
row["actor"],
row["host"],
str(row["pid"] or ""),
]))
@app.command()
def logs(
tunnel: str = typer.Argument(..., help="Tunnel name"),
lines: int = typer.Option(50, "--lines", "-n", help="Number of lines to show"),
follow: bool = typer.Option(False, "--follow", "-f", help="Follow the log"),
):
"""Show audit log for a tunnel."""
cfg = _load_or_exit()
_require_tunnel(cfg, tunnel)
sd = _state_dir()
logger = AuditLogger(state_dir=sd)
events = logger.read_events(tunnel)
if not events:
typer.echo(f"No log entries for tunnel '{tunnel}'.")
return
# Show last N lines
for entry in events[-lines:]:
ts = entry.get("timestamp", "")
event = entry.get("event", "")
actor = entry.get("actor", "")
detail = entry.get("detail", "")
parts = [ts, event, f"actor={actor}"]
if detail:
parts.append(detail)
typer.echo(" ".join(parts))
if follow:
import time
log_path = sd / f"{tunnel}.log"
try:
with log_path.open() as f:
f.seek(0, 2) # seek to end
while True:
line = f.readline()
if line:
try:
entry = json.loads(line)
ts = entry.get("timestamp", "")
event = entry.get("event", "")
actor = entry.get("actor", "")
detail = entry.get("detail", "")
parts = [ts, event, f"actor={actor}"]
if detail:
parts.append(detail)
typer.echo(" ".join(parts))
except json.JSONDecodeError:
pass
else:
time.sleep(0.5)
except KeyboardInterrupt:
pass

109
src/bridge/config.py Normal file
View File

@@ -0,0 +1,109 @@
"""Config loading for OpsBridge."""
from __future__ import annotations
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Dict
import yaml
from bridge.models import ActorInfo, HealthCheckConfig, ReconnectPolicy, TunnelConfig
class ConfigError(Exception):
"""Raised when config is invalid or missing."""
@dataclass
class BridgeConfig:
tunnels: Dict[str, TunnelConfig]
actors: Dict[str, ActorInfo]
def _default_config_path() -> Path:
return Path.home() / ".config" / "bridge" / "tunnels.yaml"
def load_config() -> BridgeConfig:
"""Load and validate tunnels.yaml. Respects BRIDGE_CONFIG env var."""
path = Path(os.environ.get("BRIDGE_CONFIG", str(_default_config_path())))
if not path.exists():
raise ConfigError(f"Config file not found: {path}")
try:
with path.open() as f:
raw = yaml.safe_load(f)
except yaml.YAMLError as e:
raise ConfigError(f"Invalid YAML in {path}: {e}") from e
if not isinstance(raw, dict):
raise ConfigError(f"Config must be a YAML mapping, got: {type(raw)}")
tunnels = _parse_tunnels(raw.get("tunnels") or {})
actors = _parse_actors(raw.get("actors") or {})
return BridgeConfig(tunnels=tunnels, actors=actors)
def _parse_tunnels(raw: dict) -> Dict[str, TunnelConfig]:
tunnels = {}
for name, data in raw.items():
if not isinstance(data, dict):
raise ConfigError(f"Tunnel '{name}' must be a mapping")
tunnels[name] = _parse_tunnel(name, data)
return tunnels
def _parse_tunnel(name: str, data: dict) -> TunnelConfig:
required = ["host", "remote_port", "local_port", "ssh_user", "ssh_key", "actor"]
for field in required:
if field not in data:
raise ConfigError(f"Tunnel '{name}' missing required field: {field}")
reconnect = ReconnectPolicy()
if "reconnect" in data and data["reconnect"]:
r = data["reconnect"]
reconnect = ReconnectPolicy(
max_attempts=r.get("max_attempts", 0),
backoff_initial=r.get("backoff_initial", 5),
backoff_max=r.get("backoff_max", 60),
)
health_check = None
if "health_check" in data and data["health_check"]:
hc = data["health_check"]
if "url" not in hc:
raise ConfigError(f"Tunnel '{name}' health_check missing required field: url")
health_check = HealthCheckConfig(
url=hc["url"],
interval_seconds=hc.get("interval_seconds", 30),
timeout_seconds=hc.get("timeout_seconds", 5),
)
return TunnelConfig(
name=name,
host=str(data["host"]),
remote_port=int(data["remote_port"]),
local_port=int(data["local_port"]),
ssh_user=str(data["ssh_user"]),
ssh_key=str(data["ssh_key"]),
actor=str(data["actor"]),
reconnect=reconnect,
health_check=health_check,
)
def _parse_actors(raw: dict) -> Dict[str, ActorInfo]:
actors = {}
for name, data in raw.items():
if not isinstance(data, dict):
raise ConfigError(f"Actor '{name}' must be a mapping")
if "class" not in data:
raise ConfigError(f"Actor '{name}' missing required field: class")
actors[name] = ActorInfo(
name=name,
actor_class=str(data["class"]),
description=str(data.get("description", "")),
)
return actors

31
src/bridge/health.py Normal file
View File

@@ -0,0 +1,31 @@
"""HTTP health checker for OpsBridge."""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Optional
import httpx
@dataclass
class HealthResult:
ok: bool
status_code: Optional[int] = None
error: Optional[str] = None
class HealthChecker:
def __init__(self, url: str, timeout_seconds: int = 5):
self._url = url
self._timeout = timeout_seconds
async def check(self) -> HealthResult:
try:
async with httpx.AsyncClient(timeout=self._timeout) as client:
response = await client.get(self._url)
response.raise_for_status()
return HealthResult(ok=True, status_code=response.status_code)
except httpx.HTTPStatusError as e:
return HealthResult(ok=False, status_code=e.response.status_code, error=str(e))
except Exception as e:
return HealthResult(ok=False, error=str(e))

252
src/bridge/manager.py Normal file
View File

@@ -0,0 +1,252 @@
"""Tunnel lifecycle manager for OpsBridge."""
from __future__ import annotations
import logging
import os
import signal
import subprocess
import time
from pathlib import Path
from typing import List, Optional
from bridge.audit import AuditEvent, AuditLogger
from bridge.config import BridgeConfig
from bridge.health import HealthChecker
from bridge.models import BridgeState, TunnelConfig
from bridge.state import StateManager
log = logging.getLogger(__name__)
def build_ssh_command(cfg: TunnelConfig) -> List[str]:
"""Build the SSH reverse tunnel command."""
key = os.path.expanduser(cfg.ssh_key)
return [
"ssh",
"-N",
"-R", f"{cfg.remote_port}:127.0.0.1:{cfg.local_port}",
"-i", key,
"-o", "ServerAliveInterval=10",
"-o", "ServerAliveCountMax=3",
"-o", "ExitOnForwardFailure=yes",
"-o", "StrictHostKeyChecking=accept-new",
f"{cfg.ssh_user}@{cfg.host}",
]
class TunnelManager:
"""Manages a single named SSH reverse tunnel.
start() daemonises: forks a child that runs the reconnect loop, then the
parent returns immediately after writing the manager PID.
"""
def __init__(self, cfg: TunnelConfig, state_dir: Optional[Path] = None):
self._cfg = cfg
self._state = StateManager(state_dir=state_dir)
self._audit = AuditLogger(state_dir=state_dir)
def get_state(self) -> BridgeState:
return self._state.read_state(self._cfg.name)
def is_running(self) -> bool:
return self._state.is_running(self._cfg.name)
def _actor_info(self):
return self._cfg.actor, "unknown"
def _next_backoff(self, attempt: int) -> int:
initial = self._cfg.reconnect.backoff_initial
max_b = self._cfg.reconnect.backoff_max
value = initial * (2 ** attempt)
return min(value, max_b)
def start(self) -> None:
"""Start the tunnel manager as a daemonised subprocess."""
if self.is_running():
log.info("Tunnel %s already running", self._cfg.name)
return
self._state.write_state(self._cfg.name, BridgeState.STARTING)
actor, actor_class = self._actor_info()
self._audit.log(
tunnel=self._cfg.name,
event=AuditEvent.BRIDGE_STARTED,
actor=actor,
actor_class=actor_class,
)
pid = os.fork()
if pid > 0:
# Parent: record manager PID and return
self._state.write_pid(self._cfg.name, pid)
return
# Child: become a daemon
os.setsid()
try:
self._run_loop()
except Exception as e:
log.exception("Tunnel manager loop crashed: %s", e)
finally:
self._state.write_state(self._cfg.name, BridgeState.STOPPED)
self._state.clear_pid(self._cfg.name)
self._audit.log(
tunnel=self._cfg.name,
event=AuditEvent.BRIDGE_STOPPED,
actor=actor,
actor_class=actor_class,
)
os._exit(0)
def stop(self) -> None:
"""Stop the running tunnel manager."""
pid = self._state.read_pid(self._cfg.name)
if pid is None:
self._state.write_state(self._cfg.name, BridgeState.STOPPED)
return
try:
os.kill(pid, signal.SIGTERM)
# Give up to 5 seconds for graceful shutdown
for _ in range(50):
try:
os.kill(pid, 0)
time.sleep(0.1)
except ProcessLookupError:
break
else:
# Force kill if still running
try:
os.kill(pid, signal.SIGKILL)
except ProcessLookupError:
pass
except ProcessLookupError:
pass
self._state.clear_pid(self._cfg.name)
self._state.write_state(self._cfg.name, BridgeState.STOPPED)
actor, actor_class = self._actor_info()
self._audit.log(
tunnel=self._cfg.name,
event=AuditEvent.BRIDGE_STOPPED,
actor=actor,
actor_class=actor_class,
)
def _run_loop(self) -> None:
"""Reconnect loop running in daemon child."""
import asyncio
cfg = self._cfg
actor, actor_class = self._actor_info()
attempt = 0
max_attempts = cfg.reconnect.max_attempts # 0 = infinite
# Setup signal handler for graceful shutdown
_stop = [False]
def _on_term(signum, frame):
_stop[0] = True
signal.signal(signal.SIGTERM, _on_term)
signal.signal(signal.SIGINT, _on_term)
while not _stop[0]:
if max_attempts > 0 and attempt >= max_attempts:
self._state.write_state(cfg.name, BridgeState.FAILED)
break
cmd = build_ssh_command(cfg)
log.info("Starting SSH: %s", " ".join(cmd))
self._state.write_state(cfg.name, BridgeState.STARTING)
try:
proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
except FileNotFoundError:
self._state.write_state(cfg.name, BridgeState.FAILED)
self._audit.log(
tunnel=cfg.name,
event=AuditEvent.BRIDGE_DISCONNECTED,
actor=actor,
actor_class=actor_class,
detail="ssh binary not found",
)
break
# Wait briefly then assume connected if still running
time.sleep(2)
if proc.poll() is None:
self._state.write_state(cfg.name, BridgeState.CONNECTED)
self._audit.log(
tunnel=cfg.name,
event=AuditEvent.BRIDGE_CONNECTED,
actor=actor,
actor_class=actor_class,
)
attempt = 0
# Health check loop
if cfg.health_check:
checker = HealthChecker(
url=cfg.health_check.url,
timeout_seconds=cfg.health_check.timeout_seconds,
)
health_failing = False
while not _stop[0] and proc.poll() is None:
result = asyncio.run(checker.check())
if result.ok:
if health_failing:
health_failing = False
self._state.write_state(cfg.name, BridgeState.CONNECTED)
self._audit.log(
tunnel=cfg.name,
event=AuditEvent.HEALTH_CHECK_RECOVERED,
actor=actor,
actor_class=actor_class,
)
else:
if not health_failing:
health_failing = True
self._state.write_state(cfg.name, BridgeState.DEGRADED)
self._audit.log(
tunnel=cfg.name,
event=AuditEvent.HEALTH_CHECK_FAILED,
actor=actor,
actor_class=actor_class,
detail=result.error or f"HTTP {result.status_code}",
)
time.sleep(cfg.health_check.interval_seconds)
else:
while not _stop[0] and proc.poll() is None:
time.sleep(1)
# SSH exited
if proc.poll() is not None:
self._audit.log(
tunnel=cfg.name,
event=AuditEvent.BRIDGE_DISCONNECTED,
actor=actor,
actor_class=actor_class,
detail=f"exit code {proc.returncode}",
)
if _stop[0]:
if proc.poll() is None:
proc.terminate()
break
attempt += 1
backoff = self._next_backoff(attempt - 1)
self._state.write_state(cfg.name, BridgeState.RECONNECTING)
self._audit.log(
tunnel=cfg.name,
event=AuditEvent.BRIDGE_RECONNECTING,
actor=actor,
actor_class=actor_class,
detail=f"retry {attempt}, backoff {backoff}s",
)
log.info("Reconnecting in %ds (attempt %d)", backoff, attempt)
time.sleep(backoff)

49
src/bridge/models.py Normal file
View File

@@ -0,0 +1,49 @@
"""Domain models for OpsBridge."""
from __future__ import annotations
from dataclasses import dataclass, field
from enum import Enum
from typing import Optional
class BridgeState(str, Enum):
STOPPED = "stopped"
STARTING = "starting"
CONNECTED = "connected"
DEGRADED = "degraded"
RECONNECTING = "reconnecting"
FAILED = "failed"
@dataclass
class ReconnectPolicy:
max_attempts: int = 0 # 0 = infinite
backoff_initial: int = 5
backoff_max: int = 60
@dataclass
class HealthCheckConfig:
url: str
interval_seconds: int = 30
timeout_seconds: int = 5
@dataclass
class TunnelConfig:
name: str
host: str
remote_port: int
local_port: int
ssh_user: str
ssh_key: str
actor: str
reconnect: ReconnectPolicy = field(default_factory=ReconnectPolicy)
health_check: Optional[HealthCheckConfig] = None
@dataclass
class ActorInfo:
name: str
actor_class: str # "human" or "automation"
description: str = ""

73
src/bridge/state.py Normal file
View File

@@ -0,0 +1,73 @@
"""State file management for OpsBridge."""
from __future__ import annotations
import os
from pathlib import Path
from typing import Optional
from bridge.models import BridgeState
def _default_state_dir() -> Path:
return Path.home() / ".local" / "state" / "bridge"
class StateManager:
def __init__(self, state_dir: Optional[Path] = None):
self._dir = Path(state_dir) if state_dir else _default_state_dir()
def _ensure_dir(self) -> None:
self._dir.mkdir(parents=True, exist_ok=True)
def _state_path(self, name: str) -> Path:
return self._dir / f"{name}.state"
def _pid_path(self, name: str) -> Path:
return self._dir / f"{name}.pid"
def read_state(self, name: str) -> BridgeState:
path = self._state_path(name)
if not path.exists():
return BridgeState.STOPPED
text = path.read_text().strip()
try:
return BridgeState(text)
except ValueError:
return BridgeState.STOPPED
def write_state(self, name: str, state: BridgeState) -> None:
self._ensure_dir()
self._state_path(name).write_text(state.value)
def read_pid(self, name: str) -> Optional[int]:
path = self._pid_path(name)
if not path.exists():
return None
try:
pid = int(path.read_text().strip())
except (ValueError, OSError):
return None
if _pid_alive(pid):
return pid
return None
def write_pid(self, name: str, pid: int) -> None:
self._ensure_dir()
self._pid_path(name).write_text(str(pid))
def clear_pid(self, name: str) -> None:
path = self._pid_path(name)
if path.exists():
path.unlink()
def is_running(self, name: str) -> bool:
return self.read_pid(name) is not None
def _pid_alive(pid: int) -> bool:
"""Return True if the process with given PID exists."""
try:
os.kill(pid, 0)
return True
except (ProcessLookupError, PermissionError):
return False