feat(diagnostics): end-to-end tunnel check, stale state detection, MCP extensions

- diagnostics.py: TunnelCheckResult with SSH process liveness, port
  probe, and optional API health check; check_tunnel / check_all_tunnels
- cli.py: bridge status shows LIVE column and [STALE] marker when state
  says connected but PID is dead; bridge check wired to diagnostics
- state.py: read_raw_pid helper; _pid_alive exported for reuse
- capabilities.py: capabilities registry stubs
- mcp_server/server.py: expose check_tunnel and tunnel capabilities
  over MCP
- SCOPE.md: rapid orientation document
- workplans/OPS-WP-0001-diagnostics.md: workplan backing this feature
- tests: 207 passing (test_cli, test_mcp, test_diagnostics)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-21 15:07:47 +01:00
parent bebd542a2e
commit a55c685f89
10 changed files with 773 additions and 8 deletions

View File

@@ -8,6 +8,7 @@ All tool functions return JSON-serialisable dicts/lists.
"""
from __future__ import annotations
import dataclasses
import json
import os
from pathlib import Path
@@ -15,6 +16,9 @@ from typing import Optional
from fastmcp import FastMCP
from bridge.diagnostics import check_all_tunnels, check_tunnel
from bridge.state import StateManager
mcp = FastMCP(
name="ops-bridge",
instructions=(
@@ -218,7 +222,6 @@ def bridge_status() -> list[dict]:
if err:
return [err]
from bridge.state import StateManager
sd = _state_dir()
state_mgr = StateManager(state_dir=sd)
@@ -432,6 +435,48 @@ def catalog_show_bridge(bridge_id: str) -> dict:
return result
# ---------------------------------------------------------------------------
# Diagnostics tool
# ---------------------------------------------------------------------------
@mcp.tool()
def bridge_check(tunnel: Optional[str] = None) -> list[dict]:
    """End-to-end diagnostics: SSH process alive + remote port listening.

    Args:
        tunnel: Specific tunnel name, or None for all inline tunnels.
            Any falsy value (e.g. an empty string) is treated like None.

    Returns:
        List of dicts with keys: tunnel, ssh_process, pid, remote_port,
        local_api, latency_ms, stale_state, ok.
        Returns [{"error": "..."}] on config load failure, or when the
        requested tunnel cannot be resolved from the config or catalog.
    """
    cfg, err = _load_cfg_or_error()
    if err:
        return [err]

    sd = _state_dir()
    state_mgr = StateManager(state_dir=sd)

    if tunnel:
        # Imported lazily: only the single-tunnel path needs the catalog.
        import logging

        from bridge.catalog.loader import load_catalog
        from bridge.catalog.resolver import BridgeNotFound, resolve

        catalog = None
        if cfg.catalog_path is not None:
            try:
                catalog = load_catalog(cfg.catalog_path)
            except Exception:
                # Best effort: resolution still proceeds against the inline
                # tunnels with catalog=None. Record the failure so that a
                # subsequent "not found" answer can be traced back to a
                # broken catalog instead of vanishing silently.
                logging.getLogger(__name__).warning(
                    "Could not load catalog from %s; resolving '%s' against "
                    "inline tunnels only",
                    cfg.catalog_path,
                    tunnel,
                    exc_info=True,
                )

        try:
            tcfg = resolve(tunnel, catalog=catalog, inline_tunnels=cfg.tunnels)
        except BridgeNotFound:
            return [{"error": f"Tunnel '{tunnel}' not found in config or catalog"}]

        results = [check_tunnel(tcfg, state_mgr)]
    else:
        results = check_all_tunnels(cfg, state_mgr)

    # "ok" is merged in explicitly from the result object's attribute, on top
    # of whatever dataclasses.asdict() produces for the dataclass fields.
    return [{**dataclasses.asdict(r), "ok": r.ok} for r in results]
# ---------------------------------------------------------------------------
# MCP resources
# ---------------------------------------------------------------------------
@@ -443,6 +488,12 @@ def resource_bridge_status() -> str:
return json.dumps(rows, indent=2)
@mcp.resource("bridge://check")
def resource_bridge_check() -> str:
    """Live end-to-end diagnostic snapshot for all tunnels.

    Returns:
        The list produced by ``bridge_check()`` (called with no tunnel
        argument), serialised as JSON indented by two spaces.
    """
    # NOTE(review): this calls the @mcp.tool()-decorated bridge_check as a
    # plain function — confirm the pinned fastmcp version keeps decorated
    # tools directly callable.
    snapshot = bridge_check()
    return json.dumps(snapshot, indent=2)
@mcp.resource("catalog://domains")
def resource_catalog_domains() -> str:
"""List of all catalog domains as JSON."""