feat(diagnostics): end-to-end tunnel check, stale state detection, MCP extensions

- diagnostics.py: TunnelCheckResult with SSH process liveness, port
  probe, and optional API health check; check_tunnel / check_all_tunnels
- cli.py: bridge status shows LIVE column and [STALE] marker when state
  says connected but PID is dead; bridge check wired to diagnostics
- state.py: read_raw_pid helper; _pid_alive exported for reuse
- capabilities.py: capabilities registry stubs
- mcp_server/server.py: expose check_tunnel and tunnel capabilities
  over MCP
- SCOPE.md: rapid orientation document
- workplans/OPS-WP-0001-diagnostics.md: workplan backing this feature
- tests: 207 passing (test_cli, test_mcp, test_diagnostics)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-21 15:07:47 +01:00
parent bebd542a2e
commit a55c685f89
10 changed files with 773 additions and 8 deletions

110
src/bridge/diagnostics.py Normal file
View File

@@ -0,0 +1,110 @@
"""End-to-end tunnel diagnostics for OpsBridge."""
from __future__ import annotations
import subprocess
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Optional
import httpx
from bridge.models import BridgeState, TunnelConfig
from bridge.state import StateManager, _pid_alive
@dataclass
class TunnelCheckResult:
    """Findings of one end-to-end diagnostic run against a single tunnel.

    Status fields are plain strings drawn from the small vocabularies noted
    beside each field; probe failures are reported as ``"error:<msg>"``.
    """

    # Name of the tunnel that was checked.
    tunnel: str
    # SSH process liveness: "ok" | "dead" | "no_pid"
    ssh_process: str
    # PID recorded for the tunnel, or None when no PID was found.
    pid: Optional[int]
    # Remote port probe: "listening" | "closed" | "error:<msg>"
    remote_port: str
    # Local API health check: "ok" | "error:<msg>" | None
    local_api: Optional[str]
    # Duration of the health-check request in milliseconds, when measured.
    latency_ms: Optional[float]
    # True when the state file says connected but the process is dead.
    stale_state: bool

    @property
    def ok(self) -> bool:
        """Return True when the SSH process is alive and the remote port listens.

        Only ``ssh_process`` and ``remote_port`` are considered; the
        ``local_api`` and ``stale_state`` fields do not influence the verdict.
        """
        if self.ssh_process != "ok":
            return False
        return self.remote_port == "listening"
def _probe_remote_port(cfg: TunnelConfig) -> str:
    """Ask the remote host over SSH whether ``cfg.remote_port`` is listening.

    Returns ``"listening"``, ``"closed"`` or ``"error:<msg>"``. Never raises:
    every failure is folded into the ``"error:..."`` form so a diagnostic run
    always produces a result.
    """
    key_path = str(Path(cfg.ssh_key).expanduser())
    cmd = [
        "ssh",
        "-i", key_path,
        "-o", "BatchMode=yes",
        "-o", "ConnectTimeout=5",
        "-o", "StrictHostKeyChecking=accept-new",
        f"{cfg.ssh_user}@{cfg.host}",
        f"ss -tnlp 2>/dev/null | grep -q ':{cfg.remote_port} ' && echo ok || echo closed",
    ]
    try:
        proc = subprocess.run(
            cmd,
            # Without this ssh inherits our stdin and may read from it,
            # swallowing input that belongs to the calling process (for
            # example when the caller is driven over stdin).
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
            timeout=10,
        )
        output = proc.stdout.strip()
        if output == "ok":
            return "listening"
        if output == "closed":
            return "closed"
        # Anything else means the remote command did not run to completion
        # (auth failure, unreachable host, ...); surface ssh's own message.
        return f"error:{proc.stderr.strip() or 'unknown'}"
    except subprocess.TimeoutExpired:
        return "error:timeout"
    except Exception as e:  # best-effort diagnostic: report, never crash
        return f"error:{e}"


def _probe_local_api(health_check) -> tuple[Optional[str], Optional[float]]:
    """GET the health-check URL and return ``(status, latency_ms)``.

    ``status`` is ``"ok"`` for a successful response, ``"error:http_<code>"``
    for any other response, or ``"error:<msg>"`` when the request itself
    fails. ``latency_ms`` is None when no response was received.
    """
    latency_ms: Optional[float] = None
    try:
        t0 = time.monotonic()
        resp = httpx.get(health_check.url, timeout=health_check.timeout_seconds)
        latency_ms = (time.monotonic() - t0) * 1000
        status = "ok" if resp.is_success else f"error:http_{resp.status_code}"
    except Exception as e:  # best-effort diagnostic: report, never crash
        status = f"error:{e}"
    return status, latency_ms


def check_tunnel(cfg: TunnelConfig, state_mgr: StateManager) -> TunnelCheckResult:
    """Run end-to-end diagnostics for a single tunnel.

    Performs, in order:

    1. SSH PID liveness, using the PID returned by ``state_mgr.read_raw_pid``.
    2. Stale-state detection: the recorded state is CONNECTED or DEGRADED
       while the SSH process is not confirmed alive (dead or no PID).
    3. A remote port probe executed on the remote host over SSH.
    4. An optional local API health check, only when ``cfg.health_check``
       is set.

    Args:
        cfg: Configuration of the tunnel to check.
        state_mgr: State manager used to read the tunnel's PID and state.

    Returns:
        A TunnelCheckResult with all findings. Probe failures are reported
        in the result's ``"error:<msg>"`` fields rather than raised.
    """
    name = cfg.name

    # 1. PID liveness
    pid = state_mgr.read_raw_pid(name)
    if pid is None:
        ssh_process = "no_pid"
    elif _pid_alive(pid):
        ssh_process = "ok"
    else:
        ssh_process = "dead"

    # 2. Stale state: state file claims a live tunnel but the process is not ok
    state = state_mgr.read_state(name)
    stale_state = (
        state in (BridgeState.CONNECTED, BridgeState.DEGRADED)
        and ssh_process != "ok"
    )

    # 3. SSH probe for remote port
    remote_port = _probe_remote_port(cfg)

    # 4. Local API health check (optional)
    local_api: Optional[str] = None
    latency_ms: Optional[float] = None
    if cfg.health_check is not None:
        local_api, latency_ms = _probe_local_api(cfg.health_check)

    return TunnelCheckResult(
        tunnel=name,
        ssh_process=ssh_process,
        pid=pid,
        remote_port=remote_port,
        local_api=local_api,
        latency_ms=latency_ms,
        stale_state=stale_state,
    )
def check_all_tunnels(cfg, state_mgr: StateManager) -> list[TunnelCheckResult]:
    """Diagnose every tunnel in ``cfg.tunnels`` and collect the results.

    Each value of ``cfg.tunnels`` is passed to ``check_tunnel`` together with
    ``state_mgr``; results are returned in the mapping's iteration order.
    """
    results: list[TunnelCheckResult] = []
    for tunnel_cfg in cfg.tunnels.values():
        results.append(check_tunnel(tunnel_cfg, state_mgr))
    return results