generated from coulomb/repo-seed
feat: implement OpsBridge CLI (BRIDGE-WP-0001)
Full TDD implementation of the `bridge` CLI tool covering all phases from BRIDGE-WP-0001: project scaffolding, config loading, state management, audit logging, health checks, tunnel lifecycle manager, and all CLI commands (up/down/restart/status/logs). 77 tests, all green. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
0
src/bridge/__init__.py
Normal file
0
src/bridge/__init__.py
Normal file
65
src/bridge/audit.py
Normal file
65
src/bridge/audit.py
Normal file
@@ -0,0 +1,65 @@
|
||||
"""Audit logging for OpsBridge lifecycle events."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
|
||||
class AuditEvent(str, Enum):
|
||||
BRIDGE_STARTED = "bridge_started"
|
||||
BRIDGE_CONNECTED = "bridge_connected"
|
||||
BRIDGE_DISCONNECTED = "bridge_disconnected"
|
||||
BRIDGE_RECONNECTING = "bridge_reconnecting"
|
||||
HEALTH_CHECK_FAILED = "health_check_failed"
|
||||
HEALTH_CHECK_RECOVERED = "health_check_recovered"
|
||||
BRIDGE_STOPPED = "bridge_stopped"
|
||||
|
||||
|
||||
def _default_state_dir() -> Path:
|
||||
return Path.home() / ".local" / "state" / "bridge"
|
||||
|
||||
|
||||
class AuditLogger:
|
||||
def __init__(self, state_dir: Optional[Path] = None):
|
||||
self._dir = Path(state_dir) if state_dir else _default_state_dir()
|
||||
|
||||
def _log_path(self, tunnel: str) -> Path:
|
||||
return self._dir / f"{tunnel}.log"
|
||||
|
||||
def log(
|
||||
self,
|
||||
tunnel: str,
|
||||
event: AuditEvent,
|
||||
actor: str,
|
||||
actor_class: str,
|
||||
detail: str = "",
|
||||
) -> None:
|
||||
self._dir.mkdir(parents=True, exist_ok=True)
|
||||
entry: Dict[str, Any] = {
|
||||
"timestamp": datetime.now(timezone.utc).isoformat(),
|
||||
"tunnel": tunnel,
|
||||
"actor": actor,
|
||||
"actor_class": actor_class,
|
||||
"event": event.value,
|
||||
}
|
||||
if detail:
|
||||
entry["detail"] = detail
|
||||
with self._log_path(tunnel).open("a") as f:
|
||||
f.write(json.dumps(entry) + "\n")
|
||||
|
||||
def read_events(self, tunnel: str) -> List[Dict[str, Any]]:
|
||||
path = self._log_path(tunnel)
|
||||
if not path.exists():
|
||||
return []
|
||||
events = []
|
||||
for line in path.read_text().splitlines():
|
||||
line = line.strip()
|
||||
if line:
|
||||
try:
|
||||
events.append(json.loads(line))
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
return events
|
||||
219
src/bridge/cli.py
Normal file
219
src/bridge/cli.py
Normal file
@@ -0,0 +1,219 @@
|
||||
"""CLI for OpsBridge — bridge command."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import typer
|
||||
|
||||
from bridge.audit import AuditLogger
|
||||
from bridge.config import ConfigError, load_config
|
||||
from bridge.manager import TunnelManager
|
||||
from bridge.models import BridgeState
|
||||
from bridge.state import StateManager
|
||||
|
||||
app = typer.Typer(
|
||||
name="bridge",
|
||||
help="OpsBridge — SSH reverse tunnel lifecycle manager.",
|
||||
no_args_is_help=True,
|
||||
)
|
||||
|
||||
|
||||
def _state_dir() -> Path:
|
||||
return Path(os.environ.get("BRIDGE_STATE_DIR", str(Path.home() / ".local" / "state" / "bridge")))
|
||||
|
||||
|
||||
def _load_or_exit():
|
||||
try:
|
||||
return load_config()
|
||||
except ConfigError as e:
|
||||
typer.echo(f"Error: {e}", err=True)
|
||||
raise typer.Exit(1)
|
||||
|
||||
|
||||
def _require_tunnel(cfg, name: str):
|
||||
if name not in cfg.tunnels:
|
||||
typer.echo(f"Error: tunnel '{name}' not found in config", err=True)
|
||||
raise typer.Exit(1)
|
||||
return cfg.tunnels[name]
|
||||
|
||||
|
||||
@app.command()
|
||||
def up(
|
||||
tunnel: Optional[str] = typer.Argument(None, help="Tunnel name (omit for all)"),
|
||||
):
|
||||
"""Start one or all tunnels."""
|
||||
cfg = _load_or_exit()
|
||||
sd = _state_dir()
|
||||
|
||||
names = [tunnel] if tunnel else list(cfg.tunnels.keys())
|
||||
if tunnel:
|
||||
_require_tunnel(cfg, tunnel)
|
||||
|
||||
any_already_running = False
|
||||
for name in names:
|
||||
tcfg = cfg.tunnels[name]
|
||||
mgr = TunnelManager(tcfg, state_dir=sd)
|
||||
if mgr.is_running():
|
||||
typer.echo(f"Tunnel '{name}' is already running.")
|
||||
any_already_running = True
|
||||
else:
|
||||
mgr.start()
|
||||
typer.echo(f"Started tunnel '{name}'.")
|
||||
|
||||
if any_already_running and len(names) == 1:
|
||||
raise typer.Exit(2)
|
||||
|
||||
|
||||
@app.command()
|
||||
def down(
|
||||
tunnel: Optional[str] = typer.Argument(None, help="Tunnel name (omit for all)"),
|
||||
):
|
||||
"""Stop one or all tunnels."""
|
||||
cfg = _load_or_exit()
|
||||
sd = _state_dir()
|
||||
|
||||
names = [tunnel] if tunnel else list(cfg.tunnels.keys())
|
||||
if tunnel:
|
||||
_require_tunnel(cfg, tunnel)
|
||||
|
||||
any_not_running = False
|
||||
for name in names:
|
||||
tcfg = cfg.tunnels[name]
|
||||
mgr = TunnelManager(tcfg, state_dir=sd)
|
||||
if not mgr.is_running():
|
||||
typer.echo(f"Tunnel '{name}' is not running.")
|
||||
any_not_running = True
|
||||
else:
|
||||
mgr.stop()
|
||||
typer.echo(f"Stopped tunnel '{name}'.")
|
||||
|
||||
if any_not_running and len(names) == 1:
|
||||
raise typer.Exit(2)
|
||||
|
||||
|
||||
@app.command()
|
||||
def restart(
|
||||
tunnel: Optional[str] = typer.Argument(None, help="Tunnel name (omit for all)"),
|
||||
):
|
||||
"""Restart one or all tunnels."""
|
||||
cfg = _load_or_exit()
|
||||
sd = _state_dir()
|
||||
|
||||
names = [tunnel] if tunnel else list(cfg.tunnels.keys())
|
||||
if tunnel:
|
||||
_require_tunnel(cfg, tunnel)
|
||||
|
||||
for name in names:
|
||||
tcfg = cfg.tunnels[name]
|
||||
mgr = TunnelManager(tcfg, state_dir=sd)
|
||||
mgr.stop()
|
||||
mgr.start()
|
||||
typer.echo(f"Restarted tunnel '{name}'.")
|
||||
|
||||
|
||||
@app.command()
|
||||
def status(
|
||||
as_json: bool = typer.Option(False, "--json", help="Output as JSON"),
|
||||
):
|
||||
"""Show status of all tunnels."""
|
||||
cfg = _load_or_exit()
|
||||
sd = _state_dir()
|
||||
state_mgr = StateManager(state_dir=sd)
|
||||
|
||||
rows = []
|
||||
for name, tcfg in cfg.tunnels.items():
|
||||
state = state_mgr.read_state(name)
|
||||
pid = state_mgr.read_pid(name)
|
||||
rows.append({
|
||||
"tunnel": name,
|
||||
"state": state.value,
|
||||
"actor": tcfg.actor,
|
||||
"host": tcfg.host,
|
||||
"pid": pid,
|
||||
"uptime": None, # future: track start time
|
||||
"health": None, # future: last health check result
|
||||
})
|
||||
|
||||
if as_json:
|
||||
typer.echo(json.dumps(rows, indent=2))
|
||||
else:
|
||||
_print_status_table(rows)
|
||||
|
||||
|
||||
def _print_status_table(rows):
|
||||
headers = ["TUNNEL", "STATE", "ACTOR", "HOST", "PID"]
|
||||
col_widths = [max(len(h), max((len(str(r.get(h.lower(), "") or "")) for r in rows), default=0)) for h in headers]
|
||||
|
||||
def _fmt_row(vals):
|
||||
return " ".join(str(v).ljust(w) for v, w in zip(vals, col_widths))
|
||||
|
||||
typer.echo(_fmt_row(headers))
|
||||
typer.echo(_fmt_row(["-" * w for w in col_widths]))
|
||||
for row in rows:
|
||||
typer.echo(_fmt_row([
|
||||
row["tunnel"],
|
||||
row["state"],
|
||||
row["actor"],
|
||||
row["host"],
|
||||
str(row["pid"] or ""),
|
||||
]))
|
||||
|
||||
|
||||
@app.command()
|
||||
def logs(
|
||||
tunnel: str = typer.Argument(..., help="Tunnel name"),
|
||||
lines: int = typer.Option(50, "--lines", "-n", help="Number of lines to show"),
|
||||
follow: bool = typer.Option(False, "--follow", "-f", help="Follow the log"),
|
||||
):
|
||||
"""Show audit log for a tunnel."""
|
||||
cfg = _load_or_exit()
|
||||
_require_tunnel(cfg, tunnel)
|
||||
|
||||
sd = _state_dir()
|
||||
logger = AuditLogger(state_dir=sd)
|
||||
events = logger.read_events(tunnel)
|
||||
|
||||
if not events:
|
||||
typer.echo(f"No log entries for tunnel '{tunnel}'.")
|
||||
return
|
||||
|
||||
# Show last N lines
|
||||
for entry in events[-lines:]:
|
||||
ts = entry.get("timestamp", "")
|
||||
event = entry.get("event", "")
|
||||
actor = entry.get("actor", "")
|
||||
detail = entry.get("detail", "")
|
||||
parts = [ts, event, f"actor={actor}"]
|
||||
if detail:
|
||||
parts.append(detail)
|
||||
typer.echo(" ".join(parts))
|
||||
|
||||
if follow:
|
||||
import time
|
||||
log_path = sd / f"{tunnel}.log"
|
||||
try:
|
||||
with log_path.open() as f:
|
||||
f.seek(0, 2) # seek to end
|
||||
while True:
|
||||
line = f.readline()
|
||||
if line:
|
||||
try:
|
||||
entry = json.loads(line)
|
||||
ts = entry.get("timestamp", "")
|
||||
event = entry.get("event", "")
|
||||
actor = entry.get("actor", "")
|
||||
detail = entry.get("detail", "")
|
||||
parts = [ts, event, f"actor={actor}"]
|
||||
if detail:
|
||||
parts.append(detail)
|
||||
typer.echo(" ".join(parts))
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
else:
|
||||
time.sleep(0.5)
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
109
src/bridge/config.py
Normal file
109
src/bridge/config.py
Normal file
@@ -0,0 +1,109 @@
|
||||
"""Config loading for OpsBridge."""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
|
||||
import yaml
|
||||
|
||||
from bridge.models import ActorInfo, HealthCheckConfig, ReconnectPolicy, TunnelConfig
|
||||
|
||||
|
||||
class ConfigError(Exception):
|
||||
"""Raised when config is invalid or missing."""
|
||||
|
||||
|
||||
@dataclass
|
||||
class BridgeConfig:
|
||||
tunnels: Dict[str, TunnelConfig]
|
||||
actors: Dict[str, ActorInfo]
|
||||
|
||||
|
||||
def _default_config_path() -> Path:
|
||||
return Path.home() / ".config" / "bridge" / "tunnels.yaml"
|
||||
|
||||
|
||||
def load_config() -> BridgeConfig:
|
||||
"""Load and validate tunnels.yaml. Respects BRIDGE_CONFIG env var."""
|
||||
path = Path(os.environ.get("BRIDGE_CONFIG", str(_default_config_path())))
|
||||
|
||||
if not path.exists():
|
||||
raise ConfigError(f"Config file not found: {path}")
|
||||
|
||||
try:
|
||||
with path.open() as f:
|
||||
raw = yaml.safe_load(f)
|
||||
except yaml.YAMLError as e:
|
||||
raise ConfigError(f"Invalid YAML in {path}: {e}") from e
|
||||
|
||||
if not isinstance(raw, dict):
|
||||
raise ConfigError(f"Config must be a YAML mapping, got: {type(raw)}")
|
||||
|
||||
tunnels = _parse_tunnels(raw.get("tunnels") or {})
|
||||
actors = _parse_actors(raw.get("actors") or {})
|
||||
return BridgeConfig(tunnels=tunnels, actors=actors)
|
||||
|
||||
|
||||
def _parse_tunnels(raw: dict) -> Dict[str, TunnelConfig]:
|
||||
tunnels = {}
|
||||
for name, data in raw.items():
|
||||
if not isinstance(data, dict):
|
||||
raise ConfigError(f"Tunnel '{name}' must be a mapping")
|
||||
tunnels[name] = _parse_tunnel(name, data)
|
||||
return tunnels
|
||||
|
||||
|
||||
def _parse_tunnel(name: str, data: dict) -> TunnelConfig:
|
||||
required = ["host", "remote_port", "local_port", "ssh_user", "ssh_key", "actor"]
|
||||
for field in required:
|
||||
if field not in data:
|
||||
raise ConfigError(f"Tunnel '{name}' missing required field: {field}")
|
||||
|
||||
reconnect = ReconnectPolicy()
|
||||
if "reconnect" in data and data["reconnect"]:
|
||||
r = data["reconnect"]
|
||||
reconnect = ReconnectPolicy(
|
||||
max_attempts=r.get("max_attempts", 0),
|
||||
backoff_initial=r.get("backoff_initial", 5),
|
||||
backoff_max=r.get("backoff_max", 60),
|
||||
)
|
||||
|
||||
health_check = None
|
||||
if "health_check" in data and data["health_check"]:
|
||||
hc = data["health_check"]
|
||||
if "url" not in hc:
|
||||
raise ConfigError(f"Tunnel '{name}' health_check missing required field: url")
|
||||
health_check = HealthCheckConfig(
|
||||
url=hc["url"],
|
||||
interval_seconds=hc.get("interval_seconds", 30),
|
||||
timeout_seconds=hc.get("timeout_seconds", 5),
|
||||
)
|
||||
|
||||
return TunnelConfig(
|
||||
name=name,
|
||||
host=str(data["host"]),
|
||||
remote_port=int(data["remote_port"]),
|
||||
local_port=int(data["local_port"]),
|
||||
ssh_user=str(data["ssh_user"]),
|
||||
ssh_key=str(data["ssh_key"]),
|
||||
actor=str(data["actor"]),
|
||||
reconnect=reconnect,
|
||||
health_check=health_check,
|
||||
)
|
||||
|
||||
|
||||
def _parse_actors(raw: dict) -> Dict[str, ActorInfo]:
|
||||
actors = {}
|
||||
for name, data in raw.items():
|
||||
if not isinstance(data, dict):
|
||||
raise ConfigError(f"Actor '{name}' must be a mapping")
|
||||
if "class" not in data:
|
||||
raise ConfigError(f"Actor '{name}' missing required field: class")
|
||||
actors[name] = ActorInfo(
|
||||
name=name,
|
||||
actor_class=str(data["class"]),
|
||||
description=str(data.get("description", "")),
|
||||
)
|
||||
return actors
|
||||
31
src/bridge/health.py
Normal file
31
src/bridge/health.py
Normal file
@@ -0,0 +1,31 @@
|
||||
"""HTTP health checker for OpsBridge."""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
@dataclass
|
||||
class HealthResult:
|
||||
ok: bool
|
||||
status_code: Optional[int] = None
|
||||
error: Optional[str] = None
|
||||
|
||||
|
||||
class HealthChecker:
|
||||
def __init__(self, url: str, timeout_seconds: int = 5):
|
||||
self._url = url
|
||||
self._timeout = timeout_seconds
|
||||
|
||||
async def check(self) -> HealthResult:
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=self._timeout) as client:
|
||||
response = await client.get(self._url)
|
||||
response.raise_for_status()
|
||||
return HealthResult(ok=True, status_code=response.status_code)
|
||||
except httpx.HTTPStatusError as e:
|
||||
return HealthResult(ok=False, status_code=e.response.status_code, error=str(e))
|
||||
except Exception as e:
|
||||
return HealthResult(ok=False, error=str(e))
|
||||
252
src/bridge/manager.py
Normal file
252
src/bridge/manager.py
Normal file
@@ -0,0 +1,252 @@
|
||||
"""Tunnel lifecycle manager for OpsBridge."""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
from bridge.audit import AuditEvent, AuditLogger
|
||||
from bridge.config import BridgeConfig
|
||||
from bridge.health import HealthChecker
|
||||
from bridge.models import BridgeState, TunnelConfig
|
||||
from bridge.state import StateManager
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def build_ssh_command(cfg: TunnelConfig) -> List[str]:
|
||||
"""Build the SSH reverse tunnel command."""
|
||||
key = os.path.expanduser(cfg.ssh_key)
|
||||
return [
|
||||
"ssh",
|
||||
"-N",
|
||||
"-R", f"{cfg.remote_port}:127.0.0.1:{cfg.local_port}",
|
||||
"-i", key,
|
||||
"-o", "ServerAliveInterval=10",
|
||||
"-o", "ServerAliveCountMax=3",
|
||||
"-o", "ExitOnForwardFailure=yes",
|
||||
"-o", "StrictHostKeyChecking=accept-new",
|
||||
f"{cfg.ssh_user}@{cfg.host}",
|
||||
]
|
||||
|
||||
|
||||
class TunnelManager:
|
||||
"""Manages a single named SSH reverse tunnel.
|
||||
|
||||
start() daemonises: forks a child that runs the reconnect loop, then the
|
||||
parent returns immediately after writing the manager PID.
|
||||
"""
|
||||
|
||||
def __init__(self, cfg: TunnelConfig, state_dir: Optional[Path] = None):
|
||||
self._cfg = cfg
|
||||
self._state = StateManager(state_dir=state_dir)
|
||||
self._audit = AuditLogger(state_dir=state_dir)
|
||||
|
||||
def get_state(self) -> BridgeState:
|
||||
return self._state.read_state(self._cfg.name)
|
||||
|
||||
def is_running(self) -> bool:
|
||||
return self._state.is_running(self._cfg.name)
|
||||
|
||||
def _actor_info(self):
|
||||
return self._cfg.actor, "unknown"
|
||||
|
||||
def _next_backoff(self, attempt: int) -> int:
|
||||
initial = self._cfg.reconnect.backoff_initial
|
||||
max_b = self._cfg.reconnect.backoff_max
|
||||
value = initial * (2 ** attempt)
|
||||
return min(value, max_b)
|
||||
|
||||
def start(self) -> None:
|
||||
"""Start the tunnel manager as a daemonised subprocess."""
|
||||
if self.is_running():
|
||||
log.info("Tunnel %s already running", self._cfg.name)
|
||||
return
|
||||
|
||||
self._state.write_state(self._cfg.name, BridgeState.STARTING)
|
||||
actor, actor_class = self._actor_info()
|
||||
self._audit.log(
|
||||
tunnel=self._cfg.name,
|
||||
event=AuditEvent.BRIDGE_STARTED,
|
||||
actor=actor,
|
||||
actor_class=actor_class,
|
||||
)
|
||||
|
||||
pid = os.fork()
|
||||
if pid > 0:
|
||||
# Parent: record manager PID and return
|
||||
self._state.write_pid(self._cfg.name, pid)
|
||||
return
|
||||
|
||||
# Child: become a daemon
|
||||
os.setsid()
|
||||
|
||||
try:
|
||||
self._run_loop()
|
||||
except Exception as e:
|
||||
log.exception("Tunnel manager loop crashed: %s", e)
|
||||
finally:
|
||||
self._state.write_state(self._cfg.name, BridgeState.STOPPED)
|
||||
self._state.clear_pid(self._cfg.name)
|
||||
self._audit.log(
|
||||
tunnel=self._cfg.name,
|
||||
event=AuditEvent.BRIDGE_STOPPED,
|
||||
actor=actor,
|
||||
actor_class=actor_class,
|
||||
)
|
||||
|
||||
os._exit(0)
|
||||
|
||||
def stop(self) -> None:
|
||||
"""Stop the running tunnel manager."""
|
||||
pid = self._state.read_pid(self._cfg.name)
|
||||
if pid is None:
|
||||
self._state.write_state(self._cfg.name, BridgeState.STOPPED)
|
||||
return
|
||||
|
||||
try:
|
||||
os.kill(pid, signal.SIGTERM)
|
||||
# Give up to 5 seconds for graceful shutdown
|
||||
for _ in range(50):
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
time.sleep(0.1)
|
||||
except ProcessLookupError:
|
||||
break
|
||||
else:
|
||||
# Force kill if still running
|
||||
try:
|
||||
os.kill(pid, signal.SIGKILL)
|
||||
except ProcessLookupError:
|
||||
pass
|
||||
except ProcessLookupError:
|
||||
pass
|
||||
|
||||
self._state.clear_pid(self._cfg.name)
|
||||
self._state.write_state(self._cfg.name, BridgeState.STOPPED)
|
||||
actor, actor_class = self._actor_info()
|
||||
self._audit.log(
|
||||
tunnel=self._cfg.name,
|
||||
event=AuditEvent.BRIDGE_STOPPED,
|
||||
actor=actor,
|
||||
actor_class=actor_class,
|
||||
)
|
||||
|
||||
def _run_loop(self) -> None:
|
||||
"""Reconnect loop running in daemon child."""
|
||||
import asyncio
|
||||
|
||||
cfg = self._cfg
|
||||
actor, actor_class = self._actor_info()
|
||||
attempt = 0
|
||||
max_attempts = cfg.reconnect.max_attempts # 0 = infinite
|
||||
|
||||
# Setup signal handler for graceful shutdown
|
||||
_stop = [False]
|
||||
|
||||
def _on_term(signum, frame):
|
||||
_stop[0] = True
|
||||
|
||||
signal.signal(signal.SIGTERM, _on_term)
|
||||
signal.signal(signal.SIGINT, _on_term)
|
||||
|
||||
while not _stop[0]:
|
||||
if max_attempts > 0 and attempt >= max_attempts:
|
||||
self._state.write_state(cfg.name, BridgeState.FAILED)
|
||||
break
|
||||
|
||||
cmd = build_ssh_command(cfg)
|
||||
log.info("Starting SSH: %s", " ".join(cmd))
|
||||
self._state.write_state(cfg.name, BridgeState.STARTING)
|
||||
|
||||
try:
|
||||
proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||||
except FileNotFoundError:
|
||||
self._state.write_state(cfg.name, BridgeState.FAILED)
|
||||
self._audit.log(
|
||||
tunnel=cfg.name,
|
||||
event=AuditEvent.BRIDGE_DISCONNECTED,
|
||||
actor=actor,
|
||||
actor_class=actor_class,
|
||||
detail="ssh binary not found",
|
||||
)
|
||||
break
|
||||
|
||||
# Wait briefly then assume connected if still running
|
||||
time.sleep(2)
|
||||
if proc.poll() is None:
|
||||
self._state.write_state(cfg.name, BridgeState.CONNECTED)
|
||||
self._audit.log(
|
||||
tunnel=cfg.name,
|
||||
event=AuditEvent.BRIDGE_CONNECTED,
|
||||
actor=actor,
|
||||
actor_class=actor_class,
|
||||
)
|
||||
attempt = 0
|
||||
|
||||
# Health check loop
|
||||
if cfg.health_check:
|
||||
checker = HealthChecker(
|
||||
url=cfg.health_check.url,
|
||||
timeout_seconds=cfg.health_check.timeout_seconds,
|
||||
)
|
||||
health_failing = False
|
||||
while not _stop[0] and proc.poll() is None:
|
||||
result = asyncio.run(checker.check())
|
||||
if result.ok:
|
||||
if health_failing:
|
||||
health_failing = False
|
||||
self._state.write_state(cfg.name, BridgeState.CONNECTED)
|
||||
self._audit.log(
|
||||
tunnel=cfg.name,
|
||||
event=AuditEvent.HEALTH_CHECK_RECOVERED,
|
||||
actor=actor,
|
||||
actor_class=actor_class,
|
||||
)
|
||||
else:
|
||||
if not health_failing:
|
||||
health_failing = True
|
||||
self._state.write_state(cfg.name, BridgeState.DEGRADED)
|
||||
self._audit.log(
|
||||
tunnel=cfg.name,
|
||||
event=AuditEvent.HEALTH_CHECK_FAILED,
|
||||
actor=actor,
|
||||
actor_class=actor_class,
|
||||
detail=result.error or f"HTTP {result.status_code}",
|
||||
)
|
||||
time.sleep(cfg.health_check.interval_seconds)
|
||||
else:
|
||||
while not _stop[0] and proc.poll() is None:
|
||||
time.sleep(1)
|
||||
|
||||
# SSH exited
|
||||
if proc.poll() is not None:
|
||||
self._audit.log(
|
||||
tunnel=cfg.name,
|
||||
event=AuditEvent.BRIDGE_DISCONNECTED,
|
||||
actor=actor,
|
||||
actor_class=actor_class,
|
||||
detail=f"exit code {proc.returncode}",
|
||||
)
|
||||
|
||||
if _stop[0]:
|
||||
if proc.poll() is None:
|
||||
proc.terminate()
|
||||
break
|
||||
|
||||
attempt += 1
|
||||
backoff = self._next_backoff(attempt - 1)
|
||||
self._state.write_state(cfg.name, BridgeState.RECONNECTING)
|
||||
self._audit.log(
|
||||
tunnel=cfg.name,
|
||||
event=AuditEvent.BRIDGE_RECONNECTING,
|
||||
actor=actor,
|
||||
actor_class=actor_class,
|
||||
detail=f"retry {attempt}, backoff {backoff}s",
|
||||
)
|
||||
log.info("Reconnecting in %ds (attempt %d)", backoff, attempt)
|
||||
time.sleep(backoff)
|
||||
49
src/bridge/models.py
Normal file
49
src/bridge/models.py
Normal file
@@ -0,0 +1,49 @@
|
||||
"""Domain models for OpsBridge."""
|
||||
from __future__ import annotations
|
||||
|
||||
from dataclasses import dataclass, field
|
||||
from enum import Enum
|
||||
from typing import Optional
|
||||
|
||||
|
||||
class BridgeState(str, Enum):
|
||||
STOPPED = "stopped"
|
||||
STARTING = "starting"
|
||||
CONNECTED = "connected"
|
||||
DEGRADED = "degraded"
|
||||
RECONNECTING = "reconnecting"
|
||||
FAILED = "failed"
|
||||
|
||||
|
||||
@dataclass
|
||||
class ReconnectPolicy:
|
||||
max_attempts: int = 0 # 0 = infinite
|
||||
backoff_initial: int = 5
|
||||
backoff_max: int = 60
|
||||
|
||||
|
||||
@dataclass
|
||||
class HealthCheckConfig:
|
||||
url: str
|
||||
interval_seconds: int = 30
|
||||
timeout_seconds: int = 5
|
||||
|
||||
|
||||
@dataclass
|
||||
class TunnelConfig:
|
||||
name: str
|
||||
host: str
|
||||
remote_port: int
|
||||
local_port: int
|
||||
ssh_user: str
|
||||
ssh_key: str
|
||||
actor: str
|
||||
reconnect: ReconnectPolicy = field(default_factory=ReconnectPolicy)
|
||||
health_check: Optional[HealthCheckConfig] = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class ActorInfo:
|
||||
name: str
|
||||
actor_class: str # "human" or "automation"
|
||||
description: str = ""
|
||||
73
src/bridge/state.py
Normal file
73
src/bridge/state.py
Normal file
@@ -0,0 +1,73 @@
|
||||
"""State file management for OpsBridge."""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from bridge.models import BridgeState
|
||||
|
||||
|
||||
def _default_state_dir() -> Path:
|
||||
return Path.home() / ".local" / "state" / "bridge"
|
||||
|
||||
|
||||
class StateManager:
|
||||
def __init__(self, state_dir: Optional[Path] = None):
|
||||
self._dir = Path(state_dir) if state_dir else _default_state_dir()
|
||||
|
||||
def _ensure_dir(self) -> None:
|
||||
self._dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def _state_path(self, name: str) -> Path:
|
||||
return self._dir / f"{name}.state"
|
||||
|
||||
def _pid_path(self, name: str) -> Path:
|
||||
return self._dir / f"{name}.pid"
|
||||
|
||||
def read_state(self, name: str) -> BridgeState:
|
||||
path = self._state_path(name)
|
||||
if not path.exists():
|
||||
return BridgeState.STOPPED
|
||||
text = path.read_text().strip()
|
||||
try:
|
||||
return BridgeState(text)
|
||||
except ValueError:
|
||||
return BridgeState.STOPPED
|
||||
|
||||
def write_state(self, name: str, state: BridgeState) -> None:
|
||||
self._ensure_dir()
|
||||
self._state_path(name).write_text(state.value)
|
||||
|
||||
def read_pid(self, name: str) -> Optional[int]:
|
||||
path = self._pid_path(name)
|
||||
if not path.exists():
|
||||
return None
|
||||
try:
|
||||
pid = int(path.read_text().strip())
|
||||
except (ValueError, OSError):
|
||||
return None
|
||||
if _pid_alive(pid):
|
||||
return pid
|
||||
return None
|
||||
|
||||
def write_pid(self, name: str, pid: int) -> None:
|
||||
self._ensure_dir()
|
||||
self._pid_path(name).write_text(str(pid))
|
||||
|
||||
def clear_pid(self, name: str) -> None:
|
||||
path = self._pid_path(name)
|
||||
if path.exists():
|
||||
path.unlink()
|
||||
|
||||
def is_running(self, name: str) -> bool:
|
||||
return self.read_pid(name) is not None
|
||||
|
||||
|
||||
def _pid_alive(pid: int) -> bool:
|
||||
"""Return True if the process with given PID exists."""
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
return True
|
||||
except (ProcessLookupError, PermissionError):
|
||||
return False
|
||||
Reference in New Issue
Block a user