From a55c685f89409a9a5c98242180d40c2aa27765c3 Mon Sep 17 00:00:00 2001 From: tegwick Date: Sat, 21 Mar 2026 15:07:47 +0100 Subject: [PATCH] feat(diagnostics): end-to-end tunnel check, stale state detection, MCP extensions - diagnostics.py: TunnelCheckResult with SSH process liveness, port probe, and optional API health check; check_tunnel / check_all_tunnels - cli.py: bridge status shows LIVE column and [STALE] marker when state says connected but PID is dead; bridge check wired to diagnostics - state.py: read_raw_pid helper; _pid_alive exported for reuse - capabilities.py: capabilities registry stubs - mcp_server/server.py: expose check_tunnel and tunnel capabilities over MCP - SCOPE.md: rapid orientation document - workplans/OPS-WP-0001-diagnostics.md: workplan backing this feature - tests: 207 passing (test_cli, test_mcp, test_diagnostics) Co-Authored-By: Claude Sonnet 4.6 --- SCOPE.md | 11 ++ src/bridge/capabilities.py | 5 + src/bridge/cli.py | 99 +++++++++++++-- src/bridge/diagnostics.py | 110 +++++++++++++++++ src/bridge/mcp_server/server.py | 53 +++++++- src/bridge/state.py | 10 ++ tests/test_cli.py | 78 ++++++++++++ tests/test_diagnostics.py | 177 +++++++++++++++++++++++++++ tests/test_mcp.py | 74 +++++++++++ workplans/OPS-WP-0001-diagnostics.md | 164 +++++++++++++++++++++++++ 10 files changed, 773 insertions(+), 8 deletions(-) create mode 100644 src/bridge/diagnostics.py create mode 100644 tests/test_diagnostics.py create mode 100644 workplans/OPS-WP-0001-diagnostics.md diff --git a/SCOPE.md b/SCOPE.md index 05fee18..d9bdb37 100644 --- a/SCOPE.md +++ b/SCOPE.md @@ -91,6 +91,17 @@ Claude Code sessions run locally; the Custodian State Hub API runs locally. Remo --- +## Provided Capabilities + +```capability +type: infrastructure +title: SSH reverse tunnel connectivity +description: Named, auto-reconnecting SSH reverse tunnels with health checks and audit logging — keeps remote execution environments continuously connected to the local Custodian State Hub. +keywords: [ssh, tunnel, reverse-tunnel, connectivity, remote, bridge, ops-bridge] +``` + +--- + ## Getting Oriented - Start with: `README.txt` (architecture, config format, CLI commands, MCP integration) diff --git a/src/bridge/capabilities.py b/src/bridge/capabilities.py index d2d759b..c5bf2dc 100644 --- a/src/bridge/capabilities.py +++ b/src/bridge/capabilities.py @@ -68,6 +68,11 @@ CAPABILITIES: list[Capability] = [ description="Show bridge metadata", required_access_modes=frozenset({"cli", "mcp"}), ), + Capability( + name="bridge_check", + description="End-to-end tunnel diagnostics via SSH: SSH PID alive + remote port listening", + required_access_modes=frozenset({"cli", "mcp"}), + ), ] CAPABILITIES_BY_NAME: dict[str, Capability] = {c.name: c for c in CAPABILITIES} diff --git a/src/bridge/cli.py b/src/bridge/cli.py index 374aa7c..35ca422 100644 --- a/src/bridge/cli.py +++ b/src/bridge/cli.py @@ -1,6 +1,7 @@ """CLI for OpsBridge — bridge command.""" from __future__ import annotations +import dataclasses import json import os from pathlib import Path @@ -10,8 +11,9 @@ import typer from bridge.audit import AuditLogger from bridge.config import ConfigError, load_config +from bridge.diagnostics import check_all_tunnels, check_tunnel from bridge.manager import TunnelManager -from bridge.state import StateManager +from bridge.state import StateManager, _pid_alive app = typer.Typer( name="bridge", @@ -175,13 +177,20 @@ def status( rows = [] for name, tcfg in cfg.tunnels.items(): state = state_mgr.read_state(name) - pid = state_mgr.read_pid(name) + raw_pid = state_mgr.read_raw_pid(name) + pid_alive_val = _pid_alive(raw_pid) if raw_pid is not None else None + stale = ( + state.value in ("connected", "degraded") + and pid_alive_val is not True + ) rows.append({ "tunnel": name, "state": state.value, "actor": tcfg.actor, "host": tcfg.host, - "pid": pid, + "pid": raw_pid, + "pid_alive": pid_alive_val, + "stale": stale, "uptime": None, "health": None, }) @@ -196,10 +205,29 @@ def _print_status_table(rows): if not rows: typer.echo("No tunnels configured.") return - headers = ["TUNNEL", "STATE", "ACTOR", "HOST", "PID"] + + def _state_display(row): + s = row["state"] + if row.get("stale"): + s += " [STALE]" + return s + + def _live_display(row): + alive = row.get("pid_alive") + if alive is True: + return "yes" + elif alive is False: + return "no" + return "\u2014" + + headers = ["TUNNEL", "STATE", "ACTOR", "HOST", "PID", "LIVE"] col_widths = [ - max(len(h), max((len(str(r.get(h.lower(), "") or "")) for r in rows), default=0)) - for h in headers + max(len("TUNNEL"), max((len(row["tunnel"]) for row in rows), default=0)), + max(len("STATE"), max((len(_state_display(row)) for row in rows), default=0)), + max(len("ACTOR"), max((len(str(row.get("actor", "") or "")) for row in rows), default=0)), + max(len("HOST"), max((len(str(row.get("host", "") or "")) for row in rows), default=0)), + max(len("PID"), max((len(str(row["pid"] or "")) for row in rows), default=0)), + max(len("LIVE"), max((len(_live_display(row)) for row in rows), default=0)), ] def _fmt_row(vals): @@ -210,10 +238,11 @@ def _print_status_table(rows): for row in rows: typer.echo(_fmt_row([ row["tunnel"], - row["state"], + _state_display(row), row["actor"], row["host"], str(row["pid"] or ""), + _live_display(row), ])) @@ -272,6 +301,62 @@ def logs( pass +@app.command() +def check( + tunnel: Optional[str] = typer.Argument(None, help="Tunnel name (omit for all inline)"), + as_json: bool = typer.Option(False, "--json", help="Output as JSON"), +): + """End-to-end diagnostics: verify SSH PID alive and remote port listening.""" + cfg = _load_or_exit() + sd = _state_dir() + state_mgr = StateManager(state_dir=sd) + + if tunnel: + results = [check_tunnel(_resolve_tunnel(cfg, tunnel), state_mgr)] + else: + results = check_all_tunnels(cfg, state_mgr) + + if as_json: + typer.echo(json.dumps( + [{**dataclasses.asdict(r), "ok": r.ok} for r in results], + indent=2, + )) + else: + _print_check_table(results) + + if any(not r.ok for r in results): + raise typer.Exit(1) + + +def _print_check_table(results): + if not results: + typer.echo("No tunnels configured.") + return + headers = ["TUNNEL", "SSH", "PID", "PORT", "API", "OK"] + rows_data = [] + for r in results: + rows_data.append([ + r.tunnel, + r.ssh_process, + str(r.pid or ""), + r.remote_port, + r.local_api or "\u2014", + "yes" if r.ok else "no", + ]) + col_widths = [ + max(len(h), max((len(row[i]) for row in rows_data), default=0)) + for i, h in enumerate(headers) + ] + + def _fmt(vals): + return " ".join(str(v).ljust(w) for v, w in zip(vals, col_widths)) + + typer.echo(_fmt(headers)) + typer.echo(_fmt(["-" * w for w in col_widths])) + for row in rows_data: + typer.echo(_fmt(row)) + + # ─── targets commands ───────────────────────────────────────────────────────── @targets_app.callback(invoke_without_command=True) diff --git a/src/bridge/diagnostics.py b/src/bridge/diagnostics.py new file mode 100644 index 0000000..b486a93 --- /dev/null +++ b/src/bridge/diagnostics.py @@ -0,0 +1,110 @@ +"""End-to-end tunnel diagnostics for OpsBridge.""" +from __future__ import annotations + +import subprocess +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Optional + +import httpx + +from bridge.models import BridgeState, TunnelConfig +from bridge.state import StateManager, _pid_alive + + +@dataclass +class TunnelCheckResult: + tunnel: str + ssh_process: str # "ok" | "dead" | "no_pid" + pid: Optional[int] + remote_port: str # "listening" | "closed" | "error:" + local_api: Optional[str] # "ok" | "error:" | None + latency_ms: Optional[float] + stale_state: bool # state file says connected but process is dead + + @property + def ok(self) -> bool: + return self.ssh_process == "ok" and self.remote_port == "listening" + + +def check_tunnel(cfg: TunnelConfig, state_mgr: StateManager) -> TunnelCheckResult: + """Run end-to-end diagnostics for a single tunnel. + + Checks SSH PID liveness, remote port listening via SSH probe, and optional + local API health check. Returns a TunnelCheckResult with all findings. + """ + name = cfg.name + + # 1. PID liveness + pid = state_mgr.read_raw_pid(name) + if pid is None: + ssh_process = "no_pid" + elif _pid_alive(pid): + ssh_process = "ok" + else: + ssh_process = "dead" + + # 2. Stale state: state file says connected/degraded but process is dead + state = state_mgr.read_state(name) + stale_state = ( + state in (BridgeState.CONNECTED, BridgeState.DEGRADED) + and ssh_process != "ok" + ) + + # 3. SSH probe for remote port + key_path = str(Path(cfg.ssh_key).expanduser()) + cmd = [ + "ssh", + "-i", key_path, + "-o", "BatchMode=yes", + "-o", "ConnectTimeout=5", + "-o", "StrictHostKeyChecking=accept-new", + f"{cfg.ssh_user}@{cfg.host}", + f"ss -tnlp 2>/dev/null | grep -q ':{cfg.remote_port} ' && echo ok || echo closed", + ] + try: + proc = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=10, + ) + output = proc.stdout.strip() + if output == "ok": + remote_port = "listening" + elif output == "closed": + remote_port = "closed" + else: + remote_port = f"error:{proc.stderr.strip() or 'unknown'}" + except subprocess.TimeoutExpired: + remote_port = "error:timeout" + except Exception as e: + remote_port = f"error:{e}" + + # 4. Local API health check (optional) + local_api: Optional[str] = None + latency_ms: Optional[float] = None + if cfg.health_check is not None: + try: + t0 = time.monotonic() + resp = httpx.get(cfg.health_check.url, timeout=cfg.health_check.timeout_seconds) + latency_ms = (time.monotonic() - t0) * 1000 + local_api = "ok" if resp.is_success else f"error:http_{resp.status_code}" + except Exception as e: + local_api = f"error:{e}" + + return TunnelCheckResult( + tunnel=name, + ssh_process=ssh_process, + pid=pid, + remote_port=remote_port, + local_api=local_api, + latency_ms=latency_ms, + stale_state=stale_state, + ) + + +def check_all_tunnels(cfg, state_mgr: StateManager) -> list[TunnelCheckResult]: + """Run diagnostics for all configured inline tunnels.""" + return [check_tunnel(tcfg, state_mgr) for tcfg in cfg.tunnels.values()] diff --git a/src/bridge/mcp_server/server.py b/src/bridge/mcp_server/server.py index f171be4..23dd4ef 100644 --- a/src/bridge/mcp_server/server.py +++ b/src/bridge/mcp_server/server.py @@ -8,6 +8,7 @@ All tool functions return JSON-serialisable dicts/lists. """ from __future__ import annotations +import dataclasses import json import os from pathlib import Path @@ -15,6 +16,9 @@ from typing import Optional from fastmcp import FastMCP +from bridge.diagnostics import check_all_tunnels, check_tunnel +from bridge.state import StateManager + mcp = FastMCP( name="ops-bridge", instructions=( @@ -218,7 +222,6 @@ def bridge_status() -> list[dict]: if err: return [err] - from bridge.state import StateManager sd = _state_dir() state_mgr = StateManager(state_dir=sd) @@ -432,6 +435,48 @@ def catalog_show_bridge(bridge_id: str) -> dict: return result +# --------------------------------------------------------------------------- +# Diagnostics tool +# --------------------------------------------------------------------------- + +@mcp.tool() +def bridge_check(tunnel: Optional[str] = None) -> list[dict]: + """End-to-end diagnostics: SSH process alive + remote port listening. + + Args: + tunnel: Specific tunnel name, or None for all inline tunnels. + + Returns: + List of dicts with keys: tunnel, ssh_process, pid, remote_port, + local_api, latency_ms, stale_state, ok. + Returns [{"error": "..."}] on config load failure. + """ + cfg, err = _load_cfg_or_error() + if err: + return [err] + sd = _state_dir() + state_mgr = StateManager(state_dir=sd) + + if tunnel: + from bridge.catalog.loader import load_catalog + from bridge.catalog.resolver import BridgeNotFound, resolve + catalog = None + if cfg.catalog_path is not None: + try: + catalog = load_catalog(cfg.catalog_path) + except Exception: + pass + try: + tcfg = resolve(tunnel, catalog=catalog, inline_tunnels=cfg.tunnels) + except BridgeNotFound: + return [{"error": f"Tunnel '{tunnel}' not found in config or catalog"}] + results = [check_tunnel(tcfg, state_mgr)] + else: + results = check_all_tunnels(cfg, state_mgr) + + return [{**dataclasses.asdict(r), "ok": r.ok} for r in results] + + # --------------------------------------------------------------------------- # MCP resources # --------------------------------------------------------------------------- @@ -443,6 +488,12 @@ def resource_bridge_status() -> str: return json.dumps(rows, indent=2) +@mcp.resource("bridge://check") +def resource_bridge_check() -> str: + """Live end-to-end diagnostic snapshot for all tunnels.""" + return json.dumps(bridge_check(), indent=2) + + @mcp.resource("catalog://domains") def resource_catalog_domains() -> str: """List of all catalog domains as JSON.""" diff --git a/src/bridge/state.py b/src/bridge/state.py index e9e2550..9747296 100644 --- a/src/bridge/state.py +++ b/src/bridge/state.py @@ -51,6 +51,16 @@ class StateManager: return pid return None + def read_raw_pid(self, name: str) -> Optional[int]: + """Read PID from file without liveness check. Returns None if file absent/invalid.""" + path = self._pid_path(name) + if not path.exists(): + return None + try: + return int(path.read_text().strip()) + except (ValueError, OSError): + return None + def write_pid(self, name: str, pid: int) -> None: self._ensure_dir() self._pid_path(name).write_text(str(pid)) diff --git a/tests/test_cli.py b/tests/test_cli.py index 0fe5ad1..0cef90d 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -188,6 +188,84 @@ class TestLogsCommand: assert "bridge_started" in result.output +class TestCheckCommand: + def test_check_help(self): + result = runner.invoke(app, ["check", "--help"]) + assert result.exit_code == 0 + + @pytest.mark.capability("bridge_check") + @pytest.mark.access_mode("cli") + def test_check_all_pass(self, env): + from bridge.diagnostics import TunnelCheckResult + ok_result = TunnelCheckResult( + tunnel="test-tunnel", + ssh_process="ok", + pid=12345, + remote_port="listening", + local_api=None, + latency_ms=None, + stale_state=False, + ) + with patch("bridge.cli.check_all_tunnels", return_value=[ok_result]): + result = runner.invoke(app, ["check"], env=env) + assert result.exit_code == 0 + + def test_check_any_fail(self, env): + from bridge.diagnostics import TunnelCheckResult + fail_result = TunnelCheckResult( + tunnel="test-tunnel", + ssh_process="dead", + pid=None, + remote_port="closed", + local_api=None, + latency_ms=None, + stale_state=True, + ) + with patch("bridge.cli.check_all_tunnels", return_value=[fail_result]): + result = runner.invoke(app, ["check"], env=env) + assert result.exit_code == 1 + + def test_check_json_flag(self, env): + from bridge.diagnostics import TunnelCheckResult + ok_result = TunnelCheckResult( + tunnel="test-tunnel", + ssh_process="ok", + pid=12345, + remote_port="listening", + local_api=None, + latency_ms=None, + stale_state=False, + ) + with patch("bridge.cli.check_all_tunnels", return_value=[ok_result]): + result = runner.invoke(app, ["check", "--json"], env=env) + assert result.exit_code == 0 + data = json.loads(result.output) + assert isinstance(data, list) + assert len(data) == 1 + assert data[0]["ok"] is True + assert data[0]["tunnel"] == "test-tunnel" + assert data[0]["ssh_process"] == "ok" + + def test_check_specific_tunnel(self, env): + from bridge.diagnostics import TunnelCheckResult + ok_result = TunnelCheckResult( + tunnel="test-tunnel", + ssh_process="ok", + pid=12345, + remote_port="listening", + local_api=None, + latency_ms=None, + stale_state=False, + ) + with patch("bridge.cli.check_tunnel", return_value=ok_result): + result = runner.invoke(app, ["check", "test-tunnel"], env=env) + assert result.exit_code == 0 + + def test_check_unknown_tunnel_exit_1(self, env): + result = runner.invoke(app, ["check", "nonexistent"], env=env) + assert result.exit_code == 1 + + class TestRestartCommand: def test_restart_unknown_tunnel_exit_1(self, env): result = runner.invoke(app, ["restart", "nonexistent"], env=env) diff --git a/tests/test_diagnostics.py b/tests/test_diagnostics.py new file mode 100644 index 0000000..f71d36a --- /dev/null +++ b/tests/test_diagnostics.py @@ -0,0 +1,177 @@ +"""Tests for bridge.diagnostics — check_tunnel() logic.""" +from __future__ import annotations + +import subprocess +from unittest.mock import MagicMock, patch + +import pytest + +from bridge.diagnostics import TunnelCheckResult, check_all_tunnels, check_tunnel +from bridge.models import BridgeState, TunnelConfig +from bridge.state import StateManager + + +@pytest.fixture +def tcfg(): + return TunnelConfig( + name="test-tunnel", + host="coulombcore.local", + remote_port=18000, + local_port=8000, + ssh_user="ubuntu", + ssh_key="~/.ssh/id_ops", + actor="operator.bernd", + ) + + +@pytest.fixture +def state_mgr(tmp_path): + d = tmp_path / "state" + d.mkdir() + return StateManager(state_dir=d) + + +class TestCheckTunnel: + def test_no_pid(self, tcfg, state_mgr): + """No PID file → ssh_process='no_pid', ok=False.""" + with patch("bridge.diagnostics.subprocess.run") as mock_run: + mock_run.return_value = MagicMock(stdout="closed\n", stderr="", returncode=1) + result = check_tunnel(tcfg, state_mgr) + assert result.ssh_process == "no_pid" + assert result.pid is None + assert result.stale_state is False + assert result.ok is False + + def test_pid_dead(self, tcfg, state_mgr): + """Dead PID + connected state → ssh_process='dead', stale_state=True.""" + state_mgr.write_pid("test-tunnel", 99999) + state_mgr.write_state("test-tunnel", BridgeState.CONNECTED) + with ( + patch("bridge.diagnostics._pid_alive", return_value=False), + patch("bridge.diagnostics.subprocess.run") as mock_run, + ): + mock_run.return_value = MagicMock(stdout="closed\n", stderr="", returncode=1) + result = check_tunnel(tcfg, state_mgr) + assert result.ssh_process == "dead" + assert result.stale_state is True + assert result.ok is False + + def test_pid_alive_port_listening(self, tcfg, state_mgr): + """Alive PID + SSH reports port listening → remote_port='listening', ok=True.""" + state_mgr.write_pid("test-tunnel", 12345) + with ( + patch("bridge.diagnostics._pid_alive", return_value=True), + patch("bridge.diagnostics.subprocess.run") as mock_run, + ): + mock_run.return_value = MagicMock(stdout="ok\n", stderr="", returncode=0) + result = check_tunnel(tcfg, state_mgr) + assert result.ssh_process == "ok" + assert result.pid == 12345 + assert result.remote_port == "listening" + assert result.ok is True + + def test_pid_alive_port_closed(self, tcfg, state_mgr): + """Alive PID + SSH reports port closed → remote_port='closed', ok=False.""" + state_mgr.write_pid("test-tunnel", 12345) + with ( + patch("bridge.diagnostics._pid_alive", return_value=True), + patch("bridge.diagnostics.subprocess.run") as mock_run, + ): + mock_run.return_value = MagicMock(stdout="closed\n", stderr="", returncode=1) + result = check_tunnel(tcfg, state_mgr) + assert result.ssh_process == "ok" + assert result.remote_port == "closed" + assert result.ok is False + + def test_ssh_timeout(self, tcfg, state_mgr): + """SSH probe timeout → remote_port='error:timeout'.""" + state_mgr.write_pid("test-tunnel", 12345) + with ( + patch("bridge.diagnostics._pid_alive", return_value=True), + patch( + "bridge.diagnostics.subprocess.run", + side_effect=subprocess.TimeoutExpired(cmd=["ssh"], timeout=10), + ), + ): + result = check_tunnel(tcfg, state_mgr) + assert result.remote_port == "error:timeout" + assert result.ok is False + + def test_stale_state_not_flagged_when_stopped(self, tcfg, state_mgr): + """State=stopped + no PID → stale_state is False (not connected/degraded).""" + with patch("bridge.diagnostics.subprocess.run") as mock_run: + mock_run.return_value = MagicMock(stdout="closed\n", stderr="", returncode=1) + result = check_tunnel(tcfg, state_mgr) + assert result.stale_state is False + + def test_local_api_ok(self, tcfg, state_mgr, tmp_path): + """With health_check configured, ok response sets local_api='ok'.""" + from bridge.models import HealthCheckConfig + tcfg_with_health = TunnelConfig( + name="test-tunnel", + host="coulombcore.local", + remote_port=18000, + local_port=8000, + ssh_user="ubuntu", + ssh_key="~/.ssh/id_ops", + actor="operator.bernd", + health_check=HealthCheckConfig(url="http://127.0.0.1:8000/health"), + ) + state_mgr.write_pid("test-tunnel", 12345) + mock_resp = MagicMock() + mock_resp.is_success = True + with ( + patch("bridge.diagnostics._pid_alive", return_value=True), + patch("bridge.diagnostics.subprocess.run") as mock_run, + patch("bridge.diagnostics.httpx.get", return_value=mock_resp), + ): + mock_run.return_value = MagicMock(stdout="ok\n", stderr="", returncode=0) + result = check_tunnel(tcfg_with_health, state_mgr) + assert result.local_api == "ok" + assert result.latency_ms is not None + + +class TestCheckAllTunnels: + def test_check_all_iterates_tunnels(self, tmp_path): + """check_all_tunnels returns one result per tunnel in cfg.""" + from bridge.config import load_config + import textwrap, os + + cfg_file = tmp_path / "tunnels.yaml" + cfg_file.write_text(textwrap.dedent("""\ + tunnels: + t1: + host: h1.local + remote_port: 18001 + local_port: 8001 + ssh_user: ubuntu + ssh_key: ~/.ssh/id_ops + actor: operator.bernd + t2: + host: h2.local + remote_port: 18002 + local_port: 8002 + ssh_user: ubuntu + ssh_key: ~/.ssh/id_ops + actor: operator.bernd + actors: + operator.bernd: + class: human + description: Bernd + """)) + os.environ["BRIDGE_CONFIG"] = str(cfg_file) + try: + cfg = load_config() + finally: + del os.environ["BRIDGE_CONFIG"] + + state_dir = tmp_path / "state" + state_dir.mkdir() + state_mgr = StateManager(state_dir=state_dir) + + with patch("bridge.diagnostics.subprocess.run") as mock_run: + mock_run.return_value = MagicMock(stdout="closed\n", stderr="", returncode=1) + results = check_all_tunnels(cfg, state_mgr) + + assert len(results) == 2 + assert {r.tunnel for r in results} == {"t1", "t2"} diff --git a/tests/test_mcp.py b/tests/test_mcp.py index a5e859e..a811f48 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -469,6 +469,80 @@ class TestMcpCatalogShowBridge: assert "error" in data +# --------------------------------------------------------------------------- +# bridge_check +# --------------------------------------------------------------------------- + +class TestMcpBridgeCheck: + @pytest.mark.capability("bridge_check") + @pytest.mark.access_mode("mcp") + async def test_bridge_check_tool(self, env_simple): + """bridge_check returns a list of dicts with 'ok' key.""" + from bridge.diagnostics import TunnelCheckResult + mock_result = TunnelCheckResult( + tunnel="test-tunnel", + ssh_process="ok", + pid=12345, + remote_port="listening", + local_api=None, + latency_ms=None, + stale_state=False, + ) + with patch("bridge.mcp_server.server.check_all_tunnels", return_value=[mock_result]): + from fastmcp import Client + async with Client(mcp) as c: + result = await c.call_tool("bridge_check", {}) + data = _data(result) + assert isinstance(data, list) + assert len(data) == 1 + row = data[0] + assert "ok" in row + assert row["ok"] is True + assert row["tunnel"] == "test-tunnel" + assert row["ssh_process"] == "ok" + assert row["remote_port"] == "listening" + + async def test_bridge_check_specific_tunnel(self, env_simple): + """bridge_check with tunnel arg calls check_tunnel for that tunnel.""" + from bridge.diagnostics import TunnelCheckResult + mock_result = TunnelCheckResult( + tunnel="test-tunnel", + ssh_process="dead", + pid=None, + remote_port="closed", + local_api=None, + latency_ms=None, + stale_state=True, + ) + with patch("bridge.mcp_server.server.check_tunnel", return_value=mock_result): + from fastmcp import Client + async with Client(mcp) as c: + result = await c.call_tool("bridge_check", {"tunnel": "test-tunnel"}) + data = _data(result) + assert isinstance(data, list) + assert data[0]["ok"] is False + assert data[0]["stale_state"] is True + + async def test_bridge_check_unknown_tunnel(self, env_simple): + """bridge_check with unknown tunnel returns error dict.""" + from fastmcp import Client + async with Client(mcp) as c: + result = await c.call_tool("bridge_check", {"tunnel": "nonexistent"}) + data = _data(result) + assert isinstance(data, list) + assert "error" in data[0] + + async def test_bridge_check_bad_config(self, tmp_path, monkeypatch): + """bridge_check with bad config returns error dict.""" + monkeypatch.setenv("BRIDGE_CONFIG", str(tmp_path / "nonexistent.yaml")) + from fastmcp import Client + async with Client(mcp) as c: + result = await c.call_tool("bridge_check", {}) + data = _data(result) + assert isinstance(data, list) + assert "error" in data[0] + + # --------------------------------------------------------------------------- # Resources # --------------------------------------------------------------------------- diff --git a/workplans/OPS-WP-0001-diagnostics.md b/workplans/OPS-WP-0001-diagnostics.md new file mode 100644 index 0000000..f098bf9 --- /dev/null +++ b/workplans/OPS-WP-0001-diagnostics.md @@ -0,0 +1,164 @@ +--- +id: OPS-WP-0001 +type: workplan +title: "ops-bridge diagnostics and flow improvements" +domain: custodian +repo: ops-bridge +status: done +owner: claude +topic_slug: custodian +created: "2026-03-20" +updated: "2026-03-20" +state_hub_workstream_id: "6726cea2-447a-40b2-b0a0-edf495f07942" +--- + +# OPS-WP-0001 — ops-bridge diagnostics and flow improvements + +**Scope:** Add `bridge check` end-to-end diagnostics command, fix `bridge status` to +surface live PID liveness and flag stale state, add a `bridge_check` MCP tool, and +wire Makefile convenience targets in state-hub. + +**Context:** During a session, `bridge status` reported "connected" but the reverse +port forwarding was not active — stale `.state` files written by the daemon. The +status command does not verify the SSH process is alive or that the remote port is +actually listening. + +--- + +## Task: Add `read_raw_pid()` to StateManager + +```task +id: OPS-WP-0001-T01 +status: done +priority: high +state_hub_task_id: "05e98e85-699a-4982-bb3e-8f2538cde2c7" +``` + +Add `read_raw_pid(name)` to `src/bridge/state.py` — reads PID from file without +liveness check. Existing `read_pid()` (which also checks liveness) stays unchanged. + +--- + +## Task: Create `src/bridge/diagnostics.py` + +```task +id: OPS-WP-0001-T02 +status: done +priority: high +state_hub_task_id: "b68d7b1e-850b-469a-9de2-8b5d3d1f1c05" +``` + +New module with `TunnelCheckResult` dataclass (ssh_process, pid, remote_port, +local_api, latency_ms, stale_state, ok property) and `check_tunnel()` / +`check_all_tunnels()` functions. SSH probe via subprocess; optional httpx health check. + +--- + +## Task: Fix `bridge status` and add `bridge check` to CLI + +```task +id: OPS-WP-0001-T03 +status: done +priority: high +state_hub_task_id: "e87c6c5d-170c-4af3-905c-a48fae2edbe5" +``` + +Fix `status` to show live PID liveness (LIVE column) and flag stale state. +Add `check` command with `--json` flag; exit 1 if any tunnel not ok. +Add `_print_check_table` helper. + +--- + +## Task: Add `bridge_check` MCP tool and `bridge://check` resource + +```task +id: OPS-WP-0001-T04 +status: done +priority: high +state_hub_task_id: "7e97c112-20e2-4e2e-b853-53b10998392b" +``` + +Add `bridge_check(tunnel?)` tool and `bridge://check` resource to +`src/bridge/mcp_server/server.py`. + +--- + +## Task: Register `bridge_check` capability + +```task +id: OPS-WP-0001-T05 +status: done +priority: high +state_hub_task_id: "c69fc748-a706-46db-a4d5-30d60222452b" +``` + +Add `bridge_check` entry to `src/bridge/capabilities.py` with +`required_access_modes=frozenset({"cli", "mcp"})`. + +--- + +## Task: Write `tests/test_diagnostics.py` + +```task +id: OPS-WP-0001-T06 +status: done +priority: high +state_hub_task_id: "070ed088-74a6-48d3-81cf-739c2a2fd21b" +``` + +Unit tests: test_no_pid, test_pid_dead, test_pid_alive_port_listening, +test_pid_alive_port_closed, test_ssh_timeout. + +--- + +## Task: Add `TestCheckCommand` to `tests/test_cli.py` + +```task +id: OPS-WP-0001-T07 +status: done +priority: high +state_hub_task_id: "aae5ddc5-f823-4647-a536-8604ddb97946" +``` + +Tests: test_check_help, test_check_all_pass (marked capability+mode), +test_check_any_fail, test_check_json_flag, test_check_specific_tunnel. + +--- + +## Task: Add `TestMcpBridgeCheck` to `tests/test_mcp.py` + +```task +id: OPS-WP-0001-T08 +status: done +priority: high +state_hub_task_id: "ed492a3d-7a5f-465e-8cc3-d2f992f5462c" +``` + +Test: test_bridge_check_tool marked capability("bridge_check") + access_mode("mcp"). + +--- + +## Task: Add tunnels targets to state-hub Makefile + +```task +id: OPS-WP-0001-T09 +status: done +priority: medium +state_hub_task_id: "a3c77062-cff5-40e3-936c-b210b05f8839" +``` + +Add `tunnels-up`, `tunnels-status`, `tunnels-check` targets delegating to `bridge`. +Add to `.PHONY` line. + +--- + +## Task: Run test suite and verify + +```task +id: OPS-WP-0001-T10 +status: done +priority: high +state_hub_task_id: "e42de76c-fab7-4924-8929-38fa9eaca478" +``` + +`cd /home/worsch/ops-bridge && uv run pytest tests/ -v` — all tests green.