generated from coulomb/repo-seed
feat(diagnostics): end-to-end tunnel check, stale state detection, MCP extensions
- diagnostics.py: TunnelCheckResult with SSH process liveness, port probe, and optional API health check; check_tunnel / check_all_tunnels - cli.py: bridge status shows LIVE column and [STALE] marker when state says connected but PID is dead; bridge check wired to diagnostics - state.py: read_raw_pid helper; _pid_alive exported for reuse - capabilities.py: capabilities registry stubs - mcp_server/server.py: expose check_tunnel and tunnel capabilities over MCP - SCOPE.md: rapid orientation document - workplans/OPS-WP-0001-diagnostics.md: workplan backing this feature - tests: 207 passing (test_cli, test_mcp, test_diagnostics) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
11
SCOPE.md
11
SCOPE.md
@@ -91,6 +91,17 @@ Claude Code sessions run locally; the Custodian State Hub API runs locally. Remo
|
||||
|
||||
---
|
||||
|
||||
## Provided Capabilities
|
||||
|
||||
```capability
|
||||
type: infrastructure
|
||||
title: SSH reverse tunnel connectivity
|
||||
description: Named, auto-reconnecting SSH reverse tunnels with health checks and audit logging — keeps remote execution environments continuously connected to the local Custodian State Hub.
|
||||
keywords: [ssh, tunnel, reverse-tunnel, connectivity, remote, bridge, ops-bridge]
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Getting Oriented
|
||||
|
||||
- Start with: `README.txt` (architecture, config format, CLI commands, MCP integration)
|
||||
|
||||
@@ -68,6 +68,11 @@ CAPABILITIES: list[Capability] = [
|
||||
description="Show bridge metadata",
|
||||
required_access_modes=frozenset({"cli", "mcp"}),
|
||||
),
|
||||
Capability(
|
||||
name="bridge_check",
|
||||
description="End-to-end tunnel diagnostics via SSH: SSH PID alive + remote port listening",
|
||||
required_access_modes=frozenset({"cli", "mcp"}),
|
||||
),
|
||||
]
|
||||
|
||||
CAPABILITIES_BY_NAME: dict[str, Capability] = {c.name: c for c in CAPABILITIES}
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
"""CLI for OpsBridge — bridge command."""
|
||||
from __future__ import annotations
|
||||
|
||||
import dataclasses
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
@@ -10,8 +11,9 @@ import typer
|
||||
|
||||
from bridge.audit import AuditLogger
|
||||
from bridge.config import ConfigError, load_config
|
||||
from bridge.diagnostics import check_all_tunnels, check_tunnel
|
||||
from bridge.manager import TunnelManager
|
||||
from bridge.state import StateManager
|
||||
from bridge.state import StateManager, _pid_alive
|
||||
|
||||
app = typer.Typer(
|
||||
name="bridge",
|
||||
@@ -175,13 +177,20 @@ def status(
|
||||
rows = []
|
||||
for name, tcfg in cfg.tunnels.items():
|
||||
state = state_mgr.read_state(name)
|
||||
pid = state_mgr.read_pid(name)
|
||||
raw_pid = state_mgr.read_raw_pid(name)
|
||||
pid_alive_val = _pid_alive(raw_pid) if raw_pid is not None else None
|
||||
stale = (
|
||||
state.value in ("connected", "degraded")
|
||||
and pid_alive_val is not True
|
||||
)
|
||||
rows.append({
|
||||
"tunnel": name,
|
||||
"state": state.value,
|
||||
"actor": tcfg.actor,
|
||||
"host": tcfg.host,
|
||||
"pid": pid,
|
||||
"pid": raw_pid,
|
||||
"pid_alive": pid_alive_val,
|
||||
"stale": stale,
|
||||
"uptime": None,
|
||||
"health": None,
|
||||
})
|
||||
@@ -196,10 +205,29 @@ def _print_status_table(rows):
|
||||
if not rows:
|
||||
typer.echo("No tunnels configured.")
|
||||
return
|
||||
headers = ["TUNNEL", "STATE", "ACTOR", "HOST", "PID"]
|
||||
|
||||
def _state_display(row):
|
||||
s = row["state"]
|
||||
if row.get("stale"):
|
||||
s += " [STALE]"
|
||||
return s
|
||||
|
||||
def _live_display(row):
|
||||
alive = row.get("pid_alive")
|
||||
if alive is True:
|
||||
return "yes"
|
||||
elif alive is False:
|
||||
return "no"
|
||||
return "\u2014"
|
||||
|
||||
headers = ["TUNNEL", "STATE", "ACTOR", "HOST", "PID", "LIVE"]
|
||||
col_widths = [
|
||||
max(len(h), max((len(str(r.get(h.lower(), "") or "")) for r in rows), default=0))
|
||||
for h in headers
|
||||
max(len("TUNNEL"), max((len(row["tunnel"]) for row in rows), default=0)),
|
||||
max(len("STATE"), max((len(_state_display(row)) for row in rows), default=0)),
|
||||
max(len("ACTOR"), max((len(str(row.get("actor", "") or "")) for row in rows), default=0)),
|
||||
max(len("HOST"), max((len(str(row.get("host", "") or "")) for row in rows), default=0)),
|
||||
max(len("PID"), max((len(str(row["pid"] or "")) for row in rows), default=0)),
|
||||
max(len("LIVE"), max((len(_live_display(row)) for row in rows), default=0)),
|
||||
]
|
||||
|
||||
def _fmt_row(vals):
|
||||
@@ -210,10 +238,11 @@ def _print_status_table(rows):
|
||||
for row in rows:
|
||||
typer.echo(_fmt_row([
|
||||
row["tunnel"],
|
||||
row["state"],
|
||||
_state_display(row),
|
||||
row["actor"],
|
||||
row["host"],
|
||||
str(row["pid"] or ""),
|
||||
_live_display(row),
|
||||
]))
|
||||
|
||||
|
||||
@@ -272,6 +301,62 @@ def logs(
|
||||
pass
|
||||
|
||||
|
||||
@app.command()
|
||||
def check(
|
||||
tunnel: Optional[str] = typer.Argument(None, help="Tunnel name (omit for all inline)"),
|
||||
as_json: bool = typer.Option(False, "--json", help="Output as JSON"),
|
||||
):
|
||||
"""End-to-end diagnostics: verify SSH PID alive and remote port listening."""
|
||||
cfg = _load_or_exit()
|
||||
sd = _state_dir()
|
||||
state_mgr = StateManager(state_dir=sd)
|
||||
|
||||
if tunnel:
|
||||
results = [check_tunnel(_resolve_tunnel(cfg, tunnel), state_mgr)]
|
||||
else:
|
||||
results = check_all_tunnels(cfg, state_mgr)
|
||||
|
||||
if as_json:
|
||||
typer.echo(json.dumps(
|
||||
[{**dataclasses.asdict(r), "ok": r.ok} for r in results],
|
||||
indent=2,
|
||||
))
|
||||
else:
|
||||
_print_check_table(results)
|
||||
|
||||
if any(not r.ok for r in results):
|
||||
raise typer.Exit(1)
|
||||
|
||||
|
||||
def _print_check_table(results):
|
||||
if not results:
|
||||
typer.echo("No tunnels configured.")
|
||||
return
|
||||
headers = ["TUNNEL", "SSH", "PID", "PORT", "API", "OK"]
|
||||
rows_data = []
|
||||
for r in results:
|
||||
rows_data.append([
|
||||
r.tunnel,
|
||||
r.ssh_process,
|
||||
str(r.pid or ""),
|
||||
r.remote_port,
|
||||
r.local_api or "\u2014",
|
||||
"yes" if r.ok else "no",
|
||||
])
|
||||
col_widths = [
|
||||
max(len(h), max((len(row[i]) for row in rows_data), default=0))
|
||||
for i, h in enumerate(headers)
|
||||
]
|
||||
|
||||
def _fmt(vals):
|
||||
return " ".join(str(v).ljust(w) for v, w in zip(vals, col_widths))
|
||||
|
||||
typer.echo(_fmt(headers))
|
||||
typer.echo(_fmt(["-" * w for w in col_widths]))
|
||||
for row in rows_data:
|
||||
typer.echo(_fmt(row))
|
||||
|
||||
|
||||
# ─── targets commands ─────────────────────────────────────────────────────────
|
||||
|
||||
@targets_app.callback(invoke_without_command=True)
|
||||
|
||||
110
src/bridge/diagnostics.py
Normal file
110
src/bridge/diagnostics.py
Normal file
@@ -0,0 +1,110 @@
|
||||
"""End-to-end tunnel diagnostics for OpsBridge."""
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
|
||||
from bridge.models import BridgeState, TunnelConfig
|
||||
from bridge.state import StateManager, _pid_alive
|
||||
|
||||
|
||||
@dataclass
|
||||
class TunnelCheckResult:
|
||||
tunnel: str
|
||||
ssh_process: str # "ok" | "dead" | "no_pid"
|
||||
pid: Optional[int]
|
||||
remote_port: str # "listening" | "closed" | "error:<msg>"
|
||||
local_api: Optional[str] # "ok" | "error:<msg>" | None
|
||||
latency_ms: Optional[float]
|
||||
stale_state: bool # state file says connected but process is dead
|
||||
|
||||
@property
|
||||
def ok(self) -> bool:
|
||||
return self.ssh_process == "ok" and self.remote_port == "listening"
|
||||
|
||||
|
||||
def check_tunnel(cfg: TunnelConfig, state_mgr: StateManager) -> TunnelCheckResult:
|
||||
"""Run end-to-end diagnostics for a single tunnel.
|
||||
|
||||
Checks SSH PID liveness, remote port listening via SSH probe, and optional
|
||||
local API health check. Returns a TunnelCheckResult with all findings.
|
||||
"""
|
||||
name = cfg.name
|
||||
|
||||
# 1. PID liveness
|
||||
pid = state_mgr.read_raw_pid(name)
|
||||
if pid is None:
|
||||
ssh_process = "no_pid"
|
||||
elif _pid_alive(pid):
|
||||
ssh_process = "ok"
|
||||
else:
|
||||
ssh_process = "dead"
|
||||
|
||||
# 2. Stale state: state file says connected/degraded but process is dead
|
||||
state = state_mgr.read_state(name)
|
||||
stale_state = (
|
||||
state in (BridgeState.CONNECTED, BridgeState.DEGRADED)
|
||||
and ssh_process != "ok"
|
||||
)
|
||||
|
||||
# 3. SSH probe for remote port
|
||||
key_path = str(Path(cfg.ssh_key).expanduser())
|
||||
cmd = [
|
||||
"ssh",
|
||||
"-i", key_path,
|
||||
"-o", "BatchMode=yes",
|
||||
"-o", "ConnectTimeout=5",
|
||||
"-o", "StrictHostKeyChecking=accept-new",
|
||||
f"{cfg.ssh_user}@{cfg.host}",
|
||||
f"ss -tnlp 2>/dev/null | grep -q ':{cfg.remote_port} ' && echo ok || echo closed",
|
||||
]
|
||||
try:
|
||||
proc = subprocess.run(
|
||||
cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=10,
|
||||
)
|
||||
output = proc.stdout.strip()
|
||||
if output == "ok":
|
||||
remote_port = "listening"
|
||||
elif output == "closed":
|
||||
remote_port = "closed"
|
||||
else:
|
||||
remote_port = f"error:{proc.stderr.strip() or 'unknown'}"
|
||||
except subprocess.TimeoutExpired:
|
||||
remote_port = "error:timeout"
|
||||
except Exception as e:
|
||||
remote_port = f"error:{e}"
|
||||
|
||||
# 4. Local API health check (optional)
|
||||
local_api: Optional[str] = None
|
||||
latency_ms: Optional[float] = None
|
||||
if cfg.health_check is not None:
|
||||
try:
|
||||
t0 = time.monotonic()
|
||||
resp = httpx.get(cfg.health_check.url, timeout=cfg.health_check.timeout_seconds)
|
||||
latency_ms = (time.monotonic() - t0) * 1000
|
||||
local_api = "ok" if resp.is_success else f"error:http_{resp.status_code}"
|
||||
except Exception as e:
|
||||
local_api = f"error:{e}"
|
||||
|
||||
return TunnelCheckResult(
|
||||
tunnel=name,
|
||||
ssh_process=ssh_process,
|
||||
pid=pid,
|
||||
remote_port=remote_port,
|
||||
local_api=local_api,
|
||||
latency_ms=latency_ms,
|
||||
stale_state=stale_state,
|
||||
)
|
||||
|
||||
|
||||
def check_all_tunnels(cfg, state_mgr: StateManager) -> list[TunnelCheckResult]:
|
||||
"""Run diagnostics for all configured inline tunnels."""
|
||||
return [check_tunnel(tcfg, state_mgr) for tcfg in cfg.tunnels.values()]
|
||||
@@ -8,6 +8,7 @@ All tool functions return JSON-serialisable dicts/lists.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import dataclasses
|
||||
import json
|
||||
import os
|
||||
from pathlib import Path
|
||||
@@ -15,6 +16,9 @@ from typing import Optional
|
||||
|
||||
from fastmcp import FastMCP
|
||||
|
||||
from bridge.diagnostics import check_all_tunnels, check_tunnel
|
||||
from bridge.state import StateManager
|
||||
|
||||
mcp = FastMCP(
|
||||
name="ops-bridge",
|
||||
instructions=(
|
||||
@@ -218,7 +222,6 @@ def bridge_status() -> list[dict]:
|
||||
if err:
|
||||
return [err]
|
||||
|
||||
from bridge.state import StateManager
|
||||
sd = _state_dir()
|
||||
state_mgr = StateManager(state_dir=sd)
|
||||
|
||||
@@ -432,6 +435,48 @@ def catalog_show_bridge(bridge_id: str) -> dict:
|
||||
return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Diagnostics tool
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@mcp.tool()
|
||||
def bridge_check(tunnel: Optional[str] = None) -> list[dict]:
|
||||
"""End-to-end diagnostics: SSH process alive + remote port listening.
|
||||
|
||||
Args:
|
||||
tunnel: Specific tunnel name, or None for all inline tunnels.
|
||||
|
||||
Returns:
|
||||
List of dicts with keys: tunnel, ssh_process, pid, remote_port,
|
||||
local_api, latency_ms, stale_state, ok.
|
||||
Returns [{"error": "..."}] on config load failure.
|
||||
"""
|
||||
cfg, err = _load_cfg_or_error()
|
||||
if err:
|
||||
return [err]
|
||||
sd = _state_dir()
|
||||
state_mgr = StateManager(state_dir=sd)
|
||||
|
||||
if tunnel:
|
||||
from bridge.catalog.loader import load_catalog
|
||||
from bridge.catalog.resolver import BridgeNotFound, resolve
|
||||
catalog = None
|
||||
if cfg.catalog_path is not None:
|
||||
try:
|
||||
catalog = load_catalog(cfg.catalog_path)
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
tcfg = resolve(tunnel, catalog=catalog, inline_tunnels=cfg.tunnels)
|
||||
except BridgeNotFound:
|
||||
return [{"error": f"Tunnel '{tunnel}' not found in config or catalog"}]
|
||||
results = [check_tunnel(tcfg, state_mgr)]
|
||||
else:
|
||||
results = check_all_tunnels(cfg, state_mgr)
|
||||
|
||||
return [{**dataclasses.asdict(r), "ok": r.ok} for r in results]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# MCP resources
|
||||
# ---------------------------------------------------------------------------
|
||||
@@ -443,6 +488,12 @@ def resource_bridge_status() -> str:
|
||||
return json.dumps(rows, indent=2)
|
||||
|
||||
|
||||
@mcp.resource("bridge://check")
|
||||
def resource_bridge_check() -> str:
|
||||
"""Live end-to-end diagnostic snapshot for all tunnels."""
|
||||
return json.dumps(bridge_check(), indent=2)
|
||||
|
||||
|
||||
@mcp.resource("catalog://domains")
|
||||
def resource_catalog_domains() -> str:
|
||||
"""List of all catalog domains as JSON."""
|
||||
|
||||
@@ -51,6 +51,16 @@ class StateManager:
|
||||
return pid
|
||||
return None
|
||||
|
||||
def read_raw_pid(self, name: str) -> Optional[int]:
|
||||
"""Read PID from file without liveness check. Returns None if file absent/invalid."""
|
||||
path = self._pid_path(name)
|
||||
if not path.exists():
|
||||
return None
|
||||
try:
|
||||
return int(path.read_text().strip())
|
||||
except (ValueError, OSError):
|
||||
return None
|
||||
|
||||
def write_pid(self, name: str, pid: int) -> None:
|
||||
self._ensure_dir()
|
||||
self._pid_path(name).write_text(str(pid))
|
||||
|
||||
@@ -188,6 +188,84 @@ class TestLogsCommand:
|
||||
assert "bridge_started" in result.output
|
||||
|
||||
|
||||
class TestCheckCommand:
|
||||
def test_check_help(self):
|
||||
result = runner.invoke(app, ["check", "--help"])
|
||||
assert result.exit_code == 0
|
||||
|
||||
@pytest.mark.capability("bridge_check")
|
||||
@pytest.mark.access_mode("cli")
|
||||
def test_check_all_pass(self, env):
|
||||
from bridge.diagnostics import TunnelCheckResult
|
||||
ok_result = TunnelCheckResult(
|
||||
tunnel="test-tunnel",
|
||||
ssh_process="ok",
|
||||
pid=12345,
|
||||
remote_port="listening",
|
||||
local_api=None,
|
||||
latency_ms=None,
|
||||
stale_state=False,
|
||||
)
|
||||
with patch("bridge.cli.check_all_tunnels", return_value=[ok_result]):
|
||||
result = runner.invoke(app, ["check"], env=env)
|
||||
assert result.exit_code == 0
|
||||
|
||||
def test_check_any_fail(self, env):
|
||||
from bridge.diagnostics import TunnelCheckResult
|
||||
fail_result = TunnelCheckResult(
|
||||
tunnel="test-tunnel",
|
||||
ssh_process="dead",
|
||||
pid=None,
|
||||
remote_port="closed",
|
||||
local_api=None,
|
||||
latency_ms=None,
|
||||
stale_state=True,
|
||||
)
|
||||
with patch("bridge.cli.check_all_tunnels", return_value=[fail_result]):
|
||||
result = runner.invoke(app, ["check"], env=env)
|
||||
assert result.exit_code == 1
|
||||
|
||||
def test_check_json_flag(self, env):
|
||||
from bridge.diagnostics import TunnelCheckResult
|
||||
ok_result = TunnelCheckResult(
|
||||
tunnel="test-tunnel",
|
||||
ssh_process="ok",
|
||||
pid=12345,
|
||||
remote_port="listening",
|
||||
local_api=None,
|
||||
latency_ms=None,
|
||||
stale_state=False,
|
||||
)
|
||||
with patch("bridge.cli.check_all_tunnels", return_value=[ok_result]):
|
||||
result = runner.invoke(app, ["check", "--json"], env=env)
|
||||
assert result.exit_code == 0
|
||||
data = json.loads(result.output)
|
||||
assert isinstance(data, list)
|
||||
assert len(data) == 1
|
||||
assert data[0]["ok"] is True
|
||||
assert data[0]["tunnel"] == "test-tunnel"
|
||||
assert data[0]["ssh_process"] == "ok"
|
||||
|
||||
def test_check_specific_tunnel(self, env):
|
||||
from bridge.diagnostics import TunnelCheckResult
|
||||
ok_result = TunnelCheckResult(
|
||||
tunnel="test-tunnel",
|
||||
ssh_process="ok",
|
||||
pid=12345,
|
||||
remote_port="listening",
|
||||
local_api=None,
|
||||
latency_ms=None,
|
||||
stale_state=False,
|
||||
)
|
||||
with patch("bridge.cli.check_tunnel", return_value=ok_result):
|
||||
result = runner.invoke(app, ["check", "test-tunnel"], env=env)
|
||||
assert result.exit_code == 0
|
||||
|
||||
def test_check_unknown_tunnel_exit_1(self, env):
|
||||
result = runner.invoke(app, ["check", "nonexistent"], env=env)
|
||||
assert result.exit_code == 1
|
||||
|
||||
|
||||
class TestRestartCommand:
|
||||
def test_restart_unknown_tunnel_exit_1(self, env):
|
||||
result = runner.invoke(app, ["restart", "nonexistent"], env=env)
|
||||
|
||||
177
tests/test_diagnostics.py
Normal file
177
tests/test_diagnostics.py
Normal file
@@ -0,0 +1,177 @@
|
||||
"""Tests for bridge.diagnostics — check_tunnel() logic."""
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
from bridge.diagnostics import TunnelCheckResult, check_all_tunnels, check_tunnel
|
||||
from bridge.models import BridgeState, TunnelConfig
|
||||
from bridge.state import StateManager
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def tcfg():
|
||||
return TunnelConfig(
|
||||
name="test-tunnel",
|
||||
host="coulombcore.local",
|
||||
remote_port=18000,
|
||||
local_port=8000,
|
||||
ssh_user="ubuntu",
|
||||
ssh_key="~/.ssh/id_ops",
|
||||
actor="operator.bernd",
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def state_mgr(tmp_path):
|
||||
d = tmp_path / "state"
|
||||
d.mkdir()
|
||||
return StateManager(state_dir=d)
|
||||
|
||||
|
||||
class TestCheckTunnel:
|
||||
def test_no_pid(self, tcfg, state_mgr):
|
||||
"""No PID file → ssh_process='no_pid', ok=False."""
|
||||
with patch("bridge.diagnostics.subprocess.run") as mock_run:
|
||||
mock_run.return_value = MagicMock(stdout="closed\n", stderr="", returncode=1)
|
||||
result = check_tunnel(tcfg, state_mgr)
|
||||
assert result.ssh_process == "no_pid"
|
||||
assert result.pid is None
|
||||
assert result.stale_state is False
|
||||
assert result.ok is False
|
||||
|
||||
def test_pid_dead(self, tcfg, state_mgr):
|
||||
"""Dead PID + connected state → ssh_process='dead', stale_state=True."""
|
||||
state_mgr.write_pid("test-tunnel", 99999)
|
||||
state_mgr.write_state("test-tunnel", BridgeState.CONNECTED)
|
||||
with (
|
||||
patch("bridge.diagnostics._pid_alive", return_value=False),
|
||||
patch("bridge.diagnostics.subprocess.run") as mock_run,
|
||||
):
|
||||
mock_run.return_value = MagicMock(stdout="closed\n", stderr="", returncode=1)
|
||||
result = check_tunnel(tcfg, state_mgr)
|
||||
assert result.ssh_process == "dead"
|
||||
assert result.stale_state is True
|
||||
assert result.ok is False
|
||||
|
||||
def test_pid_alive_port_listening(self, tcfg, state_mgr):
|
||||
"""Alive PID + SSH reports port listening → remote_port='listening', ok=True."""
|
||||
state_mgr.write_pid("test-tunnel", 12345)
|
||||
with (
|
||||
patch("bridge.diagnostics._pid_alive", return_value=True),
|
||||
patch("bridge.diagnostics.subprocess.run") as mock_run,
|
||||
):
|
||||
mock_run.return_value = MagicMock(stdout="ok\n", stderr="", returncode=0)
|
||||
result = check_tunnel(tcfg, state_mgr)
|
||||
assert result.ssh_process == "ok"
|
||||
assert result.pid == 12345
|
||||
assert result.remote_port == "listening"
|
||||
assert result.ok is True
|
||||
|
||||
def test_pid_alive_port_closed(self, tcfg, state_mgr):
|
||||
"""Alive PID + SSH reports port closed → remote_port='closed', ok=False."""
|
||||
state_mgr.write_pid("test-tunnel", 12345)
|
||||
with (
|
||||
patch("bridge.diagnostics._pid_alive", return_value=True),
|
||||
patch("bridge.diagnostics.subprocess.run") as mock_run,
|
||||
):
|
||||
mock_run.return_value = MagicMock(stdout="closed\n", stderr="", returncode=1)
|
||||
result = check_tunnel(tcfg, state_mgr)
|
||||
assert result.ssh_process == "ok"
|
||||
assert result.remote_port == "closed"
|
||||
assert result.ok is False
|
||||
|
||||
def test_ssh_timeout(self, tcfg, state_mgr):
|
||||
"""SSH probe timeout → remote_port='error:timeout'."""
|
||||
state_mgr.write_pid("test-tunnel", 12345)
|
||||
with (
|
||||
patch("bridge.diagnostics._pid_alive", return_value=True),
|
||||
patch(
|
||||
"bridge.diagnostics.subprocess.run",
|
||||
side_effect=subprocess.TimeoutExpired(cmd=["ssh"], timeout=10),
|
||||
),
|
||||
):
|
||||
result = check_tunnel(tcfg, state_mgr)
|
||||
assert result.remote_port == "error:timeout"
|
||||
assert result.ok is False
|
||||
|
||||
def test_stale_state_not_flagged_when_stopped(self, tcfg, state_mgr):
|
||||
"""State=stopped + no PID → stale_state is False (not connected/degraded)."""
|
||||
with patch("bridge.diagnostics.subprocess.run") as mock_run:
|
||||
mock_run.return_value = MagicMock(stdout="closed\n", stderr="", returncode=1)
|
||||
result = check_tunnel(tcfg, state_mgr)
|
||||
assert result.stale_state is False
|
||||
|
||||
def test_local_api_ok(self, tcfg, state_mgr, tmp_path):
|
||||
"""With health_check configured, ok response sets local_api='ok'."""
|
||||
from bridge.models import HealthCheckConfig
|
||||
tcfg_with_health = TunnelConfig(
|
||||
name="test-tunnel",
|
||||
host="coulombcore.local",
|
||||
remote_port=18000,
|
||||
local_port=8000,
|
||||
ssh_user="ubuntu",
|
||||
ssh_key="~/.ssh/id_ops",
|
||||
actor="operator.bernd",
|
||||
health_check=HealthCheckConfig(url="http://127.0.0.1:8000/health"),
|
||||
)
|
||||
state_mgr.write_pid("test-tunnel", 12345)
|
||||
mock_resp = MagicMock()
|
||||
mock_resp.is_success = True
|
||||
with (
|
||||
patch("bridge.diagnostics._pid_alive", return_value=True),
|
||||
patch("bridge.diagnostics.subprocess.run") as mock_run,
|
||||
patch("bridge.diagnostics.httpx.get", return_value=mock_resp),
|
||||
):
|
||||
mock_run.return_value = MagicMock(stdout="ok\n", stderr="", returncode=0)
|
||||
result = check_tunnel(tcfg_with_health, state_mgr)
|
||||
assert result.local_api == "ok"
|
||||
assert result.latency_ms is not None
|
||||
|
||||
|
||||
class TestCheckAllTunnels:
|
||||
def test_check_all_iterates_tunnels(self, tmp_path):
|
||||
"""check_all_tunnels returns one result per tunnel in cfg."""
|
||||
from bridge.config import load_config
|
||||
import textwrap, os
|
||||
|
||||
cfg_file = tmp_path / "tunnels.yaml"
|
||||
cfg_file.write_text(textwrap.dedent("""\
|
||||
tunnels:
|
||||
t1:
|
||||
host: h1.local
|
||||
remote_port: 18001
|
||||
local_port: 8001
|
||||
ssh_user: ubuntu
|
||||
ssh_key: ~/.ssh/id_ops
|
||||
actor: operator.bernd
|
||||
t2:
|
||||
host: h2.local
|
||||
remote_port: 18002
|
||||
local_port: 8002
|
||||
ssh_user: ubuntu
|
||||
ssh_key: ~/.ssh/id_ops
|
||||
actor: operator.bernd
|
||||
actors:
|
||||
operator.bernd:
|
||||
class: human
|
||||
description: Bernd
|
||||
"""))
|
||||
os.environ["BRIDGE_CONFIG"] = str(cfg_file)
|
||||
try:
|
||||
cfg = load_config()
|
||||
finally:
|
||||
del os.environ["BRIDGE_CONFIG"]
|
||||
|
||||
state_dir = tmp_path / "state"
|
||||
state_dir.mkdir()
|
||||
state_mgr = StateManager(state_dir=state_dir)
|
||||
|
||||
with patch("bridge.diagnostics.subprocess.run") as mock_run:
|
||||
mock_run.return_value = MagicMock(stdout="closed\n", stderr="", returncode=1)
|
||||
results = check_all_tunnels(cfg, state_mgr)
|
||||
|
||||
assert len(results) == 2
|
||||
assert {r.tunnel for r in results} == {"t1", "t2"}
|
||||
@@ -469,6 +469,80 @@ class TestMcpCatalogShowBridge:
|
||||
assert "error" in data
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# bridge_check
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class TestMcpBridgeCheck:
|
||||
@pytest.mark.capability("bridge_check")
|
||||
@pytest.mark.access_mode("mcp")
|
||||
async def test_bridge_check_tool(self, env_simple):
|
||||
"""bridge_check returns a list of dicts with 'ok' key."""
|
||||
from bridge.diagnostics import TunnelCheckResult
|
||||
mock_result = TunnelCheckResult(
|
||||
tunnel="test-tunnel",
|
||||
ssh_process="ok",
|
||||
pid=12345,
|
||||
remote_port="listening",
|
||||
local_api=None,
|
||||
latency_ms=None,
|
||||
stale_state=False,
|
||||
)
|
||||
with patch("bridge.mcp_server.server.check_all_tunnels", return_value=[mock_result]):
|
||||
from fastmcp import Client
|
||||
async with Client(mcp) as c:
|
||||
result = await c.call_tool("bridge_check", {})
|
||||
data = _data(result)
|
||||
assert isinstance(data, list)
|
||||
assert len(data) == 1
|
||||
row = data[0]
|
||||
assert "ok" in row
|
||||
assert row["ok"] is True
|
||||
assert row["tunnel"] == "test-tunnel"
|
||||
assert row["ssh_process"] == "ok"
|
||||
assert row["remote_port"] == "listening"
|
||||
|
||||
async def test_bridge_check_specific_tunnel(self, env_simple):
|
||||
"""bridge_check with tunnel arg calls check_tunnel for that tunnel."""
|
||||
from bridge.diagnostics import TunnelCheckResult
|
||||
mock_result = TunnelCheckResult(
|
||||
tunnel="test-tunnel",
|
||||
ssh_process="dead",
|
||||
pid=None,
|
||||
remote_port="closed",
|
||||
local_api=None,
|
||||
latency_ms=None,
|
||||
stale_state=True,
|
||||
)
|
||||
with patch("bridge.mcp_server.server.check_tunnel", return_value=mock_result):
|
||||
from fastmcp import Client
|
||||
async with Client(mcp) as c:
|
||||
result = await c.call_tool("bridge_check", {"tunnel": "test-tunnel"})
|
||||
data = _data(result)
|
||||
assert isinstance(data, list)
|
||||
assert data[0]["ok"] is False
|
||||
assert data[0]["stale_state"] is True
|
||||
|
||||
async def test_bridge_check_unknown_tunnel(self, env_simple):
|
||||
"""bridge_check with unknown tunnel returns error dict."""
|
||||
from fastmcp import Client
|
||||
async with Client(mcp) as c:
|
||||
result = await c.call_tool("bridge_check", {"tunnel": "nonexistent"})
|
||||
data = _data(result)
|
||||
assert isinstance(data, list)
|
||||
assert "error" in data[0]
|
||||
|
||||
async def test_bridge_check_bad_config(self, tmp_path, monkeypatch):
|
||||
"""bridge_check with bad config returns error dict."""
|
||||
monkeypatch.setenv("BRIDGE_CONFIG", str(tmp_path / "nonexistent.yaml"))
|
||||
from fastmcp import Client
|
||||
async with Client(mcp) as c:
|
||||
result = await c.call_tool("bridge_check", {})
|
||||
data = _data(result)
|
||||
assert isinstance(data, list)
|
||||
assert "error" in data[0]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Resources
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
164
workplans/OPS-WP-0001-diagnostics.md
Normal file
164
workplans/OPS-WP-0001-diagnostics.md
Normal file
@@ -0,0 +1,164 @@
|
||||
---
|
||||
id: OPS-WP-0001
|
||||
type: workplan
|
||||
title: "ops-bridge diagnostics and flow improvements"
|
||||
domain: custodian
|
||||
repo: ops-bridge
|
||||
status: done
|
||||
owner: claude
|
||||
topic_slug: custodian
|
||||
created: "2026-03-20"
|
||||
updated: "2026-03-20"
|
||||
state_hub_workstream_id: "6726cea2-447a-40b2-b0a0-edf495f07942"
|
||||
---
|
||||
|
||||
# OPS-WP-0001 — ops-bridge diagnostics and flow improvements
|
||||
|
||||
**Scope:** Add `bridge check` end-to-end diagnostics command, fix `bridge status` to
|
||||
surface live PID liveness and flag stale state, add a `bridge_check` MCP tool, and
|
||||
wire Makefile convenience targets in state-hub.
|
||||
|
||||
**Context:** During a session, `bridge status` reported "connected" but the reverse
|
||||
port forwarding was not active — stale `.state` files written by the daemon. The
|
||||
status command does not verify the SSH process is alive or that the remote port is
|
||||
actually listening.
|
||||
|
||||
---
|
||||
|
||||
## Task: Add `read_raw_pid()` to StateManager
|
||||
|
||||
```task
|
||||
id: OPS-WP-0001-T01
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "05e98e85-699a-4982-bb3e-8f2538cde2c7"
|
||||
```
|
||||
|
||||
Add `read_raw_pid(name)` to `src/bridge/state.py` — reads PID from file without
|
||||
liveness check. Existing `read_pid()` (which also checks liveness) stays unchanged.
|
||||
|
||||
---
|
||||
|
||||
## Task: Create `src/bridge/diagnostics.py`
|
||||
|
||||
```task
|
||||
id: OPS-WP-0001-T02
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "b68d7b1e-850b-469a-9de2-8b5d3d1f1c05"
|
||||
```
|
||||
|
||||
New module with `TunnelCheckResult` dataclass (ssh_process, pid, remote_port,
|
||||
local_api, latency_ms, stale_state, ok property) and `check_tunnel()` /
|
||||
`check_all_tunnels()` functions. SSH probe via subprocess; optional httpx health check.
|
||||
|
||||
---
|
||||
|
||||
## Task: Fix `bridge status` and add `bridge check` to CLI
|
||||
|
||||
```task
|
||||
id: OPS-WP-0001-T03
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "e87c6c5d-170c-4af3-905c-a48fae2edbe5"
|
||||
```
|
||||
|
||||
Fix `status` to show live PID liveness (LIVE column) and flag stale state.
|
||||
Add `check` command with `--json` flag; exit 1 if any tunnel not ok.
|
||||
Add `_print_check_table` helper.
|
||||
|
||||
---
|
||||
|
||||
## Task: Add `bridge_check` MCP tool and `bridge://check` resource
|
||||
|
||||
```task
|
||||
id: OPS-WP-0001-T04
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "7e97c112-20e2-4e2e-b853-53b10998392b"
|
||||
```
|
||||
|
||||
Add `bridge_check(tunnel?)` tool and `bridge://check` resource to
|
||||
`src/bridge/mcp_server/server.py`.
|
||||
|
||||
---
|
||||
|
||||
## Task: Register `bridge_check` capability
|
||||
|
||||
```task
|
||||
id: OPS-WP-0001-T05
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "c69fc748-a706-46db-a4d5-30d60222452b"
|
||||
```
|
||||
|
||||
Add `bridge_check` entry to `src/bridge/capabilities.py` with
|
||||
`required_access_modes=frozenset({"cli", "mcp"})`.
|
||||
|
||||
---
|
||||
|
||||
## Task: Write `tests/test_diagnostics.py`
|
||||
|
||||
```task
|
||||
id: OPS-WP-0001-T06
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "070ed088-74a6-48d3-81cf-739c2a2fd21b"
|
||||
```
|
||||
|
||||
Unit tests: test_no_pid, test_pid_dead, test_pid_alive_port_listening,
|
||||
test_pid_alive_port_closed, test_ssh_timeout.
|
||||
|
||||
---
|
||||
|
||||
## Task: Add `TestCheckCommand` to `tests/test_cli.py`
|
||||
|
||||
```task
|
||||
id: OPS-WP-0001-T07
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "aae5ddc5-f823-4647-a536-8604ddb97946"
|
||||
```
|
||||
|
||||
Tests: test_check_help, test_check_all_pass (marked capability+mode),
|
||||
test_check_any_fail, test_check_json_flag, test_check_specific_tunnel.
|
||||
|
||||
---
|
||||
|
||||
## Task: Add `TestMcpBridgeCheck` to `tests/test_mcp.py`
|
||||
|
||||
```task
|
||||
id: OPS-WP-0001-T08
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "ed492a3d-7a5f-465e-8cc3-d2f992f5462c"
|
||||
```
|
||||
|
||||
Test: test_bridge_check_tool marked capability("bridge_check") + access_mode("mcp").
|
||||
|
||||
---
|
||||
|
||||
## Task: Add tunnels targets to state-hub Makefile
|
||||
|
||||
```task
|
||||
id: OPS-WP-0001-T09
|
||||
status: done
|
||||
priority: medium
|
||||
state_hub_task_id: "a3c77062-cff5-40e3-936c-b210b05f8839"
|
||||
```
|
||||
|
||||
Add `tunnels-up`, `tunnels-status`, `tunnels-check` targets delegating to `bridge`.
|
||||
Add to `.PHONY` line.
|
||||
|
||||
---
|
||||
|
||||
## Task: Run test suite and verify
|
||||
|
||||
```task
|
||||
id: OPS-WP-0001-T10
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "e42de76c-fab7-4924-8929-38fa9eaca478"
|
||||
```
|
||||
|
||||
`cd /home/worsch/ops-bridge && uv run pytest tests/ -v` — all tests green.
|
||||
Reference in New Issue
Block a user