From 10c6fdaec9d0e4083141416f130e5ec933b87a5c Mon Sep 17 00:00:00 2001 From: tegwick Date: Sun, 21 Jun 2026 20:12:13 +0200 Subject: [PATCH] feat(restart): route reverse tunnels through stale-forward cleanup bridge restart now means blank-slate recovery: reverse tunnels run should_cleanup_tunnel and clear orphan remote listeners before reconnecting; healthy forwards are left running. Local-direction tunnels keep stop/start only. CLI and MCP report per-tunnel actions (healthy, cleaned_and_restarted, restarted, error) and exit non-zero on cleanup failure. Closes BRIDGE-WP-0005. --- src/bridge/cleanup.py | 22 +++++- src/bridge/cli.py | 35 ++++++--- src/bridge/mcp_server/server.py | 30 ++++---- tests/test_cleanup.py | 2 - tests/test_cli.py | 77 ++++++++++++++++++- tests/test_mcp.py | 20 ++--- wiki/OpsBridge.md | 59 +++++++++++++- ...WP-0005-restart-includes-remote-cleanup.md | 35 +++++---- 8 files changed, 220 insertions(+), 60 deletions(-) diff --git a/src/bridge/cleanup.py b/src/bridge/cleanup.py index 56150c1..4ec638f 100644 --- a/src/bridge/cleanup.py +++ b/src/bridge/cleanup.py @@ -2,7 +2,6 @@ from __future__ import annotations import subprocess -import time from dataclasses import dataclass from typing import Optional from urllib.parse import urlparse, urlunparse @@ -215,6 +214,27 @@ def cleanup_tunnel( return CleanupAction(name, "error", str(exc)) +def restart_tunnel( + cfg: TunnelConfig, + state_mgr: StateManager, +) -> CleanupAction: + """Restart one tunnel with blank-slate recovery for reverse tunnels.""" + if cfg.direction == "local": + mgr = TunnelManager(cfg, state_dir=state_mgr._dir) + mgr.stop() + mgr.start() + return CleanupAction(cfg.name, "restarted", "local tunnel stop/start") + return cleanup_tunnel(cfg, state_mgr, restart=True) + + +def restart_all_tunnels( + cfg, + state_mgr: StateManager, +) -> list[CleanupAction]: + """Restart every inline tunnel (reverse via cleanup path, local via stop/start).""" + return [restart_tunnel(tcfg, state_mgr) for tcfg in cfg.tunnels.values()] + + def cleanup_all_tunnels( cfg, state_mgr: StateManager, diff --git a/src/bridge/cli.py b/src/bridge/cli.py index 2e89154..206fa31 100644 --- a/src/bridge/cli.py +++ b/src/bridge/cli.py @@ -13,10 +13,13 @@ import typer from bridge.audit import AuditLogger from bridge.cleanup import ( + CleanupAction, build_cron_line, cleanup_all_tunnels, install_cleanup_cron, read_installed_cron, + restart_all_tunnels, + restart_tunnel, uninstall_cleanup_cron, ) from bridge.config import ConfigError, load_config @@ -153,27 +156,37 @@ def down( raise typer.Exit(2) +def _emit_restart_actions(actions: list[CleanupAction]) -> None: + any_error = False + for action in actions: + typer.echo(f"{action.tunnel}: {action.action} — {action.detail}") + if action.action == "error": + any_error = True + if any_error: + raise typer.Exit(1) + + @app.command() def restart( tunnel: Optional[str] = typer.Argument(None, help="Tunnel name (omit for all inline)"), ): - """Restart one or all tunnels.""" + """Restart one or all tunnels. + + Reverse tunnels run conditional remote stale-forward cleanup before + reconnecting; healthy forwards are left running. Local-direction tunnels + use local stop/start only. + """ cfg = _load_or_exit() sd = _state_dir() + state_mgr = StateManager(state_dir=sd) if tunnel: tcfg = _resolve_tunnel(cfg, tunnel) - mgr = TunnelManager(tcfg, state_dir=sd) - mgr.stop() - mgr.start() - typer.echo(f"Restarted tunnel '{tunnel}'.") + actions = [restart_tunnel(tcfg, state_mgr)] else: - for name in _all_tunnel_names(cfg): - tcfg = cfg.tunnels[name] - mgr = TunnelManager(tcfg, state_dir=sd) - mgr.stop() - mgr.start() - typer.echo(f"Restarted tunnel '{name}'.") + actions = restart_all_tunnels(cfg, state_mgr) + + _emit_restart_actions(actions) @app.command() diff --git a/src/bridge/mcp_server/server.py b/src/bridge/mcp_server/server.py index 0780488..adc28aa 100644 --- a/src/bridge/mcp_server/server.py +++ b/src/bridge/mcp_server/server.py @@ -169,19 +169,22 @@ def bridge_down(tunnel: Optional[str] = None) -> dict: def bridge_restart(tunnel: Optional[str] = None) -> dict: """Restart one or all configured tunnels. + Reverse tunnels run conditional remote stale-forward cleanup before + reconnecting; healthy forwards are left running. + Args: tunnel: Tunnel name to restart. If omitted, restarts all inline tunnels. Returns: - {"restarted": [...]} or {"error": "..."} + {"actions": [{"tunnel", "action", "detail"}, ...]} or {"error": "..."} """ cfg, err = _load_cfg_or_error() if err: return err - from bridge.manager import TunnelManager + from bridge.cleanup import restart_all_tunnels, restart_tunnel sd = _state_dir() - restarted = [] + state_mgr = StateManager(state_dir=sd) if tunnel: from bridge.catalog.loader import load_catalog @@ -196,18 +199,19 @@ def bridge_restart(tunnel: Optional[str] = None) -> dict: tcfg = resolve(tunnel, catalog=catalog, inline_tunnels=cfg.tunnels) except BridgeNotFound: return {"error": f"Tunnel '{tunnel}' not found in config or catalog"} - mgr = TunnelManager(tcfg, state_dir=sd) - mgr.stop() - mgr.start() - restarted.append(tunnel) + actions = [restart_tunnel(tcfg, state_mgr)] else: - for name, tcfg in cfg.tunnels.items(): - mgr = TunnelManager(tcfg, state_dir=sd) - mgr.stop() - mgr.start() - restarted.append(name) + actions = restart_all_tunnels(cfg, state_mgr) - return {"restarted": restarted} + payload = { + "actions": [ + {"tunnel": a.tunnel, "action": a.action, "detail": a.detail} + for a in actions + ], + } + if any(a.action == "error" for a in actions): + payload["error"] = "one or more tunnels failed to restart" + return payload @mcp.tool() diff --git a/tests/test_cleanup.py b/tests/test_cleanup.py index 4ad4198..c274fb2 100644 --- a/tests/test_cleanup.py +++ b/tests/test_cleanup.py @@ -4,12 +4,10 @@ from __future__ import annotations import textwrap from unittest.mock import MagicMock, patch -import pytest from typer.testing import CliRunner from bridge.cleanup import ( CleanupAction, - CleanupReport, build_cron_line, cleanup_all_tunnels, remote_forward_health_url, diff --git a/tests/test_cli.py b/tests/test_cli.py index b822ac8..4ae7993 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -266,25 +266,96 @@ class TestCheckCommand: assert result.exit_code == 1 +REVERSE_CONFIG = VALID_CONFIG + +LOCAL_TUNNEL_CONFIG = textwrap.dedent("""\ + tunnels: + k3s-api: + host: host.local + remote_port: 6443 + local_port: 6443 + ssh_user: ubuntu + ssh_key: ~/.ssh/id_ops + actor: adm-bernd + direction: local + actors: + adm-bernd: + class: adm + description: Bernd +""") + + class TestRestartCommand: def test_restart_unknown_tunnel_exit_1(self, env): result = runner.invoke(app, ["restart", "nonexistent"], env=env) assert result.exit_code == 1 + def test_restart_help_mentions_remote_cleanup(self): + result = runner.invoke(app, ["restart", "--help"]) + assert result.exit_code == 0 + assert "stale-forward" in result.output.lower() or "remote" in result.output.lower() + @pytest.mark.capability("bridge_restart") @pytest.mark.access_mode("cli") - def test_restart_calls_stop_then_start(self, env): - with patch("bridge.cli.TunnelManager") as mock_mgr_cls: + def test_restart_reverse_tunnel_delegates_to_cleanup(self, env): + from bridge.cleanup import CleanupAction + + with patch("bridge.cli.restart_tunnel") as mock_restart: + mock_restart.return_value = CleanupAction( + "test-tunnel", "healthy", "remote forward healthy" + ) + result = runner.invoke(app, ["restart", "test-tunnel"], env=env) + + assert result.exit_code == 0 + mock_restart.assert_called_once() + assert "test-tunnel: healthy" in result.output + + def test_restart_reverse_tunnel_reports_cleaned_and_restarted(self, env): + from bridge.cleanup import CleanupAction + + with patch("bridge.cli.restart_tunnel") as mock_restart: + mock_restart.return_value = CleanupAction( + "test-tunnel", + "cleaned_and_restarted", + "stale forward; restarted tunnel; cleared", + ) + result = runner.invoke(app, ["restart", "test-tunnel"], env=env) + + assert result.exit_code == 0 + assert "cleaned_and_restarted" in result.output + + def test_restart_reverse_tunnel_error_exit_1(self, env): + from bridge.cleanup import CleanupAction + + with patch("bridge.cli.restart_tunnel") as mock_restart: + mock_restart.return_value = CleanupAction( + "test-tunnel", "error", "cleanup failed: still_listening" + ) + result = runner.invoke(app, ["restart", "test-tunnel"], env=env) + + assert result.exit_code == 1 + assert "error" in result.output + + def test_restart_local_tunnel_uses_stop_start(self, tmp_path, state_dir): + config_file = tmp_path / "tunnels.yaml" + config_file.write_text(LOCAL_TUNNEL_CONFIG) + env = { + "BRIDGE_CONFIG": str(config_file), + "BRIDGE_STATE_DIR": str(state_dir), + } + + with patch("bridge.cleanup.TunnelManager") as mock_mgr_cls: mock_mgr = MagicMock() mock_mgr_cls.return_value = mock_mgr call_order = [] mock_mgr.stop.side_effect = lambda: call_order.append("stop") mock_mgr.start.side_effect = lambda: call_order.append("start") - result = runner.invoke(app, ["restart", "test-tunnel"], env=env) + result = runner.invoke(app, ["restart", "k3s-api"], env=env) assert result.exit_code == 0 assert call_order == ["stop", "start"] + assert "k3s-api: restarted" in result.output class TestCertStatusCommand: diff --git a/tests/test_mcp.py b/tests/test_mcp.py index e6ceaac..a6f2144 100644 --- a/tests/test_mcp.py +++ b/tests/test_mcp.py @@ -237,22 +237,22 @@ class TestMcpBridgeDown: class TestMcpBridgeRestart: @pytest.mark.capability("bridge_restart") @pytest.mark.access_mode("mcp") - async def test_bridge_restart_calls_stop_then_start(self, env_simple): - with patch("bridge.manager.TunnelManager") as mock_cls: - mock_mgr = MagicMock() - call_order = [] - mock_mgr.stop.side_effect = lambda: call_order.append("stop") - mock_mgr.start.side_effect = lambda: call_order.append("start") - mock_cls.return_value = mock_mgr + async def test_bridge_restart_delegates_to_cleanup(self, env_simple): + from bridge.cleanup import CleanupAction + + with patch("bridge.cleanup.restart_tunnel") as mock_restart: + mock_restart.return_value = CleanupAction( + "test-tunnel", "healthy", "remote forward healthy" + ) from fastmcp import Client async with Client(mcp) as c: result = await c.call_tool("bridge_restart", {"tunnel": "test-tunnel"}) data = _data(result) - assert "restarted" in data - assert "test-tunnel" in data["restarted"] - assert call_order == ["stop", "start"] + assert data["actions"][0]["tunnel"] == "test-tunnel" + assert data["actions"][0]["action"] == "healthy" + mock_restart.assert_called_once() async def test_bridge_restart_unknown_tunnel(self, env_simple): from fastmcp import Client diff --git a/wiki/OpsBridge.md b/wiki/OpsBridge.md index 8acbef2..2aff2c3 100644 --- a/wiki/OpsBridge.md +++ b/wiki/OpsBridge.md @@ -157,31 +157,82 @@ Just controlled operational access when you need it. Start a bridge: ``` -ob up hostA=hostB +bridge up state-hub-railiance01 ``` Check active bridges: ``` -ob status +bridge status ``` Investigate infrastructure targets: ``` -ob targets +bridge targets ``` Stop the bridge when finished: ``` -ob down hostA=hostB +bridge down state-hub-railiance01 ``` OpsBridge handles the lifecycle so operators can focus on solving the problem. --- +# Tunnel lifecycle commands + +| Command | Purpose | +|---------|---------| +| `bridge up` | Start tunnel(s) that are not already running | +| `bridge down` | Stop tunnel(s) that are running | +| `bridge restart` | Blank-slate recovery — get tunnel(s) operational again | +| `bridge maintenance cleanup` | Proactive hygiene sweep without implying restart | + +## `bridge restart` — blank-slate recovery + +`bridge restart` means *operational again*, not merely cycling the local manager +PID while a broken remote listener still holds the port. + +For **reverse** tunnels (State Hub exposure on remote hosts), restart: + +1. Runs `should_cleanup_tunnel` to detect stale SSH remote forwards +2. Clears orphan listeners on the remote host when needed +3. Reconnects the tunnel (stop + start) only when cleanup was required + +When the remote forward is already healthy, restart reports `healthy` and leaves +the working tunnel running — no unnecessary disruption. + +For **local-direction** tunnels (`direction: local` in `tunnels.yaml`, e.g. +`k3s-api-coulombcore`), restart uses local stop/start only; no remote cleanup. + +Use `bridge maintenance cleanup` for scheduled or manual hygiene without the +restart contract. The nightly cron (`bridge maintenance install-cron`) runs +`maintenance cleanup --restart` at 03:00. + +**Incident context:** stale orphan `sshd` remote forwards after laptop sleep +blocked `bridge restart` until operators discovered the maintenance subcommand. +See `state-hub/history/20260621-weekend-automation-assessment.md` and +`BRIDGE-WP-0005` in this repo. + +## Host roles + +Tunnels in `~/.config/bridge/tunnels.yaml` serve three host roles: + +| Role | Hosts | Behaviour | +|------|-------|-----------| +| **Workstation origin** | WSL laptop | Shutdown, sleep, and network changes kill local bridge processes without graceful remote SSH teardown. Orphan forwards on all remotes are common after wake. | +| **VPS remotes** | coulombcore, railiance01 | Normally always-on. Maintenance reboots clear kernel state, but laptop return can leave orphan forwards from the previous session if the VPS did not reboot. | +| **LAN builder** | haskelseed | Intermittently offline; same orphan-forward pattern when the workstation-side tunnel dies uncleanly. | + +Conditional remote cleanup before restart benefits all reverse tunnels. +`should_cleanup_tunnel` skips healthy forwards — VPS tunnels with live working +forwards are untouched. + +--- + # The Philosophy Behind OpsBridge Infrastructure teams succeed or fail based on how effectively they bridge the gaps between: diff --git a/workplans/BRIDGE-WP-0005-restart-includes-remote-cleanup.md b/workplans/BRIDGE-WP-0005-restart-includes-remote-cleanup.md index 1d422fb..cec7b12 100644 --- a/workplans/BRIDGE-WP-0005-restart-includes-remote-cleanup.md +++ b/workplans/BRIDGE-WP-0005-restart-includes-remote-cleanup.md @@ -4,7 +4,7 @@ type: workplan title: "Restart includes remote cleanup (blank-slate recovery)" domain: custodian repo: ops-bridge -status: ready +status: finished owner: codex topic_slug: custodian created: "2026-06-21" @@ -97,7 +97,7 @@ Emit the same action summary strings cleanup already uses (`healthy`, ```task id: BRIDGE-WP-0005-T01 -status: todo +status: done priority: high state_hub_task_id: "b61c5d45-1198-416d-aa15-f2063fc5eb14" ``` @@ -119,7 +119,7 @@ Requirements: ```task id: BRIDGE-WP-0005-T02 -status: todo +status: done priority: high state_hub_task_id: "b4ad0525-6936-4799-bead-3603d05c49af" ``` @@ -138,7 +138,7 @@ Update `tests/test_cli.py`: ```task id: BRIDGE-WP-0005-T03 -status: todo +status: done priority: medium state_hub_task_id: "60586375-b0b4-4d4c-ba87-0699e76bf30c" ``` @@ -156,7 +156,7 @@ Document the blank-slate restart contract: ```task id: BRIDGE-WP-0005-T04 -status: todo +status: cancelled priority: low state_hub_task_id: "518f1b5e-3098-42aa-9662-bdab1d7d269b" ``` @@ -166,26 +166,29 @@ once after repeated exit-255 bind failures (laptop wake without operator running `bridge restart`). Defer unless T1–T3 are done; mark `cancel` if heuristic risk outweighs benefit. -Done when documented decision: implement, defer, or cancel with reason. +**Decision (2026-06-21): cancelled for now.** Auto-cleanup inside the reconnect +loop risks killing a legitimately healthy orphan forward owned by another session +or operator. `bridge restart` now covers the operator-facing blank-slate path; +nightly `maintenance cleanup --restart` covers unattended hygiene. Revisit only if +wake-from-sleep reconnect failures remain frequent after a month of observation. ## T5 — Live verification on workstation + VPS ```task id: BRIDGE-WP-0005-T05 -status: todo +status: done priority: medium state_hub_task_id: "b5d305ef-5b5d-4afe-a992-e0960d07af79" ``` After T1–T2 ship, verify on real config: -1. **railiance01** — reproduce stale-forward scenario (or simulate); confirm - `bridge restart state-hub-railiance01` clears and connects without needing - the maintenance subcommand. -2. **haskelseed** — `bridge restart state-hub-haskelseed` after a manual - `bridge down` while remote port still listens (Alpine `netstat` path from - ADHOC-2026-06-14). -3. **coulombcore** — confirm healthy tunnel restart is a no-op remote cleanup - (`healthy` action) and does not disrupt a working forward. +1. **railiance01** — `state-hub-mcp-railiance01` was `reconnecting` with stale + forward; `bridge restart` reported `cleaned_and_restarted` and tunnel reached + `connected`. +2. **haskelseed** — not exercised (all tunnels already healthy); Alpine netstat + path unchanged from ADHOC-2026-06-14 and covered by existing cleanup tests. +3. **coulombcore** — `bridge restart state-hub-coulombcore` reported `healthy`, + PID unchanged (4116), forward undisturbed. -Log a State Hub progress note on workstream close. Mark workplan `finished`. \ No newline at end of file +State Hub progress logged (2026-06-21). Workplan marked `finished`. \ No newline at end of file