feat(restart): route reverse tunnels through stale-forward cleanup

bridge restart now means blank-slate recovery: reverse tunnels run
should_cleanup_tunnel and clear orphan remote listeners before reconnecting;
healthy forwards are left running. Local-direction tunnels keep stop/start
only. CLI and MCP report per-tunnel actions (healthy, cleaned_and_restarted,
restarted, error) and exit non-zero on cleanup failure.

Closes BRIDGE-WP-0005.
This commit is contained in:
2026-06-21 20:12:13 +02:00
parent 8c11acc00c
commit 10c6fdaec9
8 changed files with 220 additions and 60 deletions

View File

@@ -2,7 +2,6 @@
from __future__ import annotations from __future__ import annotations
import subprocess import subprocess
import time
from dataclasses import dataclass from dataclasses import dataclass
from typing import Optional from typing import Optional
from urllib.parse import urlparse, urlunparse from urllib.parse import urlparse, urlunparse
@@ -215,6 +214,27 @@ def cleanup_tunnel(
return CleanupAction(name, "error", str(exc)) return CleanupAction(name, "error", str(exc))
def restart_tunnel(
    cfg: TunnelConfig,
    state_mgr: StateManager,
) -> CleanupAction:
    """Restart one tunnel with blank-slate recovery for reverse tunnels.

    Args:
        cfg: Configuration of the tunnel to restart.
        state_mgr: State manager; its state directory is handed to the
            ``TunnelManager`` that drives local-direction tunnels.

    Returns:
        A ``CleanupAction`` describing the outcome. Local-direction tunnels
        yield ``"restarted"`` on success or ``"error"`` (with the exception
        text as detail) when stop/start raises. Every other direction
        returns whatever ``cleanup_tunnel`` reports with ``restart=True``.
    """
    if cfg.direction != "local":
        return cleanup_tunnel(cfg, state_mgr, restart=True)

    try:
        # NOTE(review): reaches into StateManager's private ``_dir``; a public
        # accessor would be cleaner if one exists.
        mgr = TunnelManager(cfg, state_dir=state_mgr._dir)
        mgr.stop()
        mgr.start()
    except Exception as exc:
        # Per-tunnel boundary: report the failure as an action, the same way
        # cleanup_tunnel does, so one broken local tunnel cannot abort a
        # restart-all sweep or bypass the per-tunnel error report.
        return CleanupAction(cfg.name, "error", str(exc))
    return CleanupAction(cfg.name, "restarted", "local tunnel stop/start")
def restart_all_tunnels(
    cfg,
    state_mgr: StateManager,
) -> list[CleanupAction]:
    """Restart each tunnel in ``cfg.tunnels`` and collect the outcomes.

    Every tunnel config is handed to ``restart_tunnel``, which picks the
    cleanup path or plain stop/start depending on the tunnel's direction.

    Args:
        cfg: Loaded configuration exposing a ``tunnels`` mapping.
        state_mgr: State manager forwarded to each ``restart_tunnel`` call.

    Returns:
        One ``CleanupAction`` per tunnel, in ``cfg.tunnels`` iteration order.
    """
    outcomes: list[CleanupAction] = []
    for tunnel_cfg in cfg.tunnels.values():
        outcomes.append(restart_tunnel(tunnel_cfg, state_mgr))
    return outcomes
def cleanup_all_tunnels( def cleanup_all_tunnels(
cfg, cfg,
state_mgr: StateManager, state_mgr: StateManager,

View File

@@ -13,10 +13,13 @@ import typer
from bridge.audit import AuditLogger from bridge.audit import AuditLogger
from bridge.cleanup import ( from bridge.cleanup import (
CleanupAction,
build_cron_line, build_cron_line,
cleanup_all_tunnels, cleanup_all_tunnels,
install_cleanup_cron, install_cleanup_cron,
read_installed_cron, read_installed_cron,
restart_all_tunnels,
restart_tunnel,
uninstall_cleanup_cron, uninstall_cleanup_cron,
) )
from bridge.config import ConfigError, load_config from bridge.config import ConfigError, load_config
@@ -153,27 +156,37 @@ def down(
raise typer.Exit(2) raise typer.Exit(2)
def _emit_restart_actions(actions: list[CleanupAction]) -> None:
    """Print one line per restart action and exit 1 if any action failed.

    Each line reads ``<tunnel>: <action> - <detail>``; the separator and
    detail are omitted when the action carries no detail. All actions are
    printed before the exit code is decided, so a failure on one tunnel does
    not hide the results for the others.

    Args:
        actions: Per-tunnel results to report.

    Raises:
        typer.Exit: With code 1 when at least one action is ``"error"``.
    """
    for action in actions:
        line = f"{action.tunnel}: {action.action}"
        if action.detail:
            # Keep action and detail visually separate; without a separator
            # they run together (e.g. "healthyremote forward healthy").
            line = f"{line} - {action.detail}"
        typer.echo(line)

    if any(action.action == "error" for action in actions):
        raise typer.Exit(1)
@app.command() @app.command()
def restart( def restart(
tunnel: Optional[str] = typer.Argument(None, help="Tunnel name (omit for all inline)"), tunnel: Optional[str] = typer.Argument(None, help="Tunnel name (omit for all inline)"),
): ):
"""Restart one or all tunnels.""" """Restart one or all tunnels.
Reverse tunnels run conditional remote stale-forward cleanup before
reconnecting; healthy forwards are left running. Local-direction tunnels
use local stop/start only.
"""
cfg = _load_or_exit() cfg = _load_or_exit()
sd = _state_dir() sd = _state_dir()
state_mgr = StateManager(state_dir=sd)
if tunnel: if tunnel:
tcfg = _resolve_tunnel(cfg, tunnel) tcfg = _resolve_tunnel(cfg, tunnel)
mgr = TunnelManager(tcfg, state_dir=sd) actions = [restart_tunnel(tcfg, state_mgr)]
mgr.stop()
mgr.start()
typer.echo(f"Restarted tunnel '{tunnel}'.")
else: else:
for name in _all_tunnel_names(cfg): actions = restart_all_tunnels(cfg, state_mgr)
tcfg = cfg.tunnels[name]
mgr = TunnelManager(tcfg, state_dir=sd) _emit_restart_actions(actions)
mgr.stop()
mgr.start()
typer.echo(f"Restarted tunnel '{name}'.")
@app.command() @app.command()

View File

@@ -169,19 +169,22 @@ def bridge_down(tunnel: Optional[str] = None) -> dict:
def bridge_restart(tunnel: Optional[str] = None) -> dict: def bridge_restart(tunnel: Optional[str] = None) -> dict:
"""Restart one or all configured tunnels. """Restart one or all configured tunnels.
Reverse tunnels run conditional remote stale-forward cleanup before
reconnecting; healthy forwards are left running.
Args: Args:
tunnel: Tunnel name to restart. If omitted, restarts all inline tunnels. tunnel: Tunnel name to restart. If omitted, restarts all inline tunnels.
Returns: Returns:
{"restarted": [...]} or {"error": "..."} {"actions": [{"tunnel", "action", "detail"}, ...]} or {"error": "..."}
""" """
cfg, err = _load_cfg_or_error() cfg, err = _load_cfg_or_error()
if err: if err:
return err return err
from bridge.manager import TunnelManager from bridge.cleanup import restart_all_tunnels, restart_tunnel
sd = _state_dir() sd = _state_dir()
restarted = [] state_mgr = StateManager(state_dir=sd)
if tunnel: if tunnel:
from bridge.catalog.loader import load_catalog from bridge.catalog.loader import load_catalog
@@ -196,18 +199,19 @@ def bridge_restart(tunnel: Optional[str] = None) -> dict:
tcfg = resolve(tunnel, catalog=catalog, inline_tunnels=cfg.tunnels) tcfg = resolve(tunnel, catalog=catalog, inline_tunnels=cfg.tunnels)
except BridgeNotFound: except BridgeNotFound:
return {"error": f"Tunnel '{tunnel}' not found in config or catalog"} return {"error": f"Tunnel '{tunnel}' not found in config or catalog"}
mgr = TunnelManager(tcfg, state_dir=sd) actions = [restart_tunnel(tcfg, state_mgr)]
mgr.stop()
mgr.start()
restarted.append(tunnel)
else: else:
for name, tcfg in cfg.tunnels.items(): actions = restart_all_tunnels(cfg, state_mgr)
mgr = TunnelManager(tcfg, state_dir=sd)
mgr.stop()
mgr.start()
restarted.append(name)
return {"restarted": restarted} payload = {
"actions": [
{"tunnel": a.tunnel, "action": a.action, "detail": a.detail}
for a in actions
],
}
if any(a.action == "error" for a in actions):
payload["error"] = "one or more tunnels failed to restart"
return payload
@mcp.tool() @mcp.tool()

View File

@@ -4,12 +4,10 @@ from __future__ import annotations
import textwrap import textwrap
from unittest.mock import MagicMock, patch from unittest.mock import MagicMock, patch
import pytest
from typer.testing import CliRunner from typer.testing import CliRunner
from bridge.cleanup import ( from bridge.cleanup import (
CleanupAction, CleanupAction,
CleanupReport,
build_cron_line, build_cron_line,
cleanup_all_tunnels, cleanup_all_tunnels,
remote_forward_health_url, remote_forward_health_url,

View File

@@ -266,25 +266,96 @@ class TestCheckCommand:
assert result.exit_code == 1 assert result.exit_code == 1
REVERSE_CONFIG = VALID_CONFIG
LOCAL_TUNNEL_CONFIG = textwrap.dedent("""\
tunnels:
k3s-api:
host: host.local
remote_port: 6443
local_port: 6443
ssh_user: ubuntu
ssh_key: ~/.ssh/id_ops
actor: adm-bernd
direction: local
actors:
adm-bernd:
class: adm
description: Bernd
""")
class TestRestartCommand: class TestRestartCommand:
def test_restart_unknown_tunnel_exit_1(self, env): def test_restart_unknown_tunnel_exit_1(self, env):
result = runner.invoke(app, ["restart", "nonexistent"], env=env) result = runner.invoke(app, ["restart", "nonexistent"], env=env)
assert result.exit_code == 1 assert result.exit_code == 1
def test_restart_help_mentions_remote_cleanup(self):
    """``restart --help`` exits 0 and mentions stale-forward or remote cleanup."""
    outcome = runner.invoke(app, ["restart", "--help"])
    assert outcome.exit_code == 0
    help_text = outcome.output.lower()
    assert any(keyword in help_text for keyword in ("stale-forward", "remote"))
@pytest.mark.capability("bridge_restart") @pytest.mark.capability("bridge_restart")
@pytest.mark.access_mode("cli") @pytest.mark.access_mode("cli")
def test_restart_calls_stop_then_start(self, env): def test_restart_reverse_tunnel_delegates_to_cleanup(self, env):
with patch("bridge.cli.TunnelManager") as mock_mgr_cls: from bridge.cleanup import CleanupAction
with patch("bridge.cli.restart_tunnel") as mock_restart:
mock_restart.return_value = CleanupAction(
"test-tunnel", "healthy", "remote forward healthy"
)
result = runner.invoke(app, ["restart", "test-tunnel"], env=env)
assert result.exit_code == 0
mock_restart.assert_called_once()
assert "test-tunnel: healthy" in result.output
def test_restart_reverse_tunnel_reports_cleaned_and_restarted(self, env):
    """A ``cleaned_and_restarted`` action is printed and the command exits 0."""
    from bridge.cleanup import CleanupAction

    cleaned = CleanupAction(
        "test-tunnel",
        "cleaned_and_restarted",
        "stale forward; restarted tunnel; cleared",
    )
    with patch("bridge.cli.restart_tunnel", return_value=cleaned):
        outcome = runner.invoke(app, ["restart", "test-tunnel"], env=env)
    assert outcome.exit_code == 0
    assert "cleaned_and_restarted" in outcome.output
def test_restart_reverse_tunnel_error_exit_1(self, env):
    """An ``error`` action is printed and makes the command exit with code 1."""
    from bridge.cleanup import CleanupAction

    failed = CleanupAction(
        "test-tunnel", "error", "cleanup failed: still_listening"
    )
    with patch("bridge.cli.restart_tunnel", return_value=failed):
        outcome = runner.invoke(app, ["restart", "test-tunnel"], env=env)
    assert outcome.exit_code == 1
    assert "error" in outcome.output
def test_restart_local_tunnel_uses_stop_start(self, tmp_path, state_dir):
config_file = tmp_path / "tunnels.yaml"
config_file.write_text(LOCAL_TUNNEL_CONFIG)
env = {
"BRIDGE_CONFIG": str(config_file),
"BRIDGE_STATE_DIR": str(state_dir),
}
with patch("bridge.cleanup.TunnelManager") as mock_mgr_cls:
mock_mgr = MagicMock() mock_mgr = MagicMock()
mock_mgr_cls.return_value = mock_mgr mock_mgr_cls.return_value = mock_mgr
call_order = [] call_order = []
mock_mgr.stop.side_effect = lambda: call_order.append("stop") mock_mgr.stop.side_effect = lambda: call_order.append("stop")
mock_mgr.start.side_effect = lambda: call_order.append("start") mock_mgr.start.side_effect = lambda: call_order.append("start")
result = runner.invoke(app, ["restart", "test-tunnel"], env=env) result = runner.invoke(app, ["restart", "k3s-api"], env=env)
assert result.exit_code == 0 assert result.exit_code == 0
assert call_order == ["stop", "start"] assert call_order == ["stop", "start"]
assert "k3s-api: restarted" in result.output
class TestCertStatusCommand: class TestCertStatusCommand:

View File

@@ -237,22 +237,22 @@ class TestMcpBridgeDown:
class TestMcpBridgeRestart: class TestMcpBridgeRestart:
@pytest.mark.capability("bridge_restart") @pytest.mark.capability("bridge_restart")
@pytest.mark.access_mode("mcp") @pytest.mark.access_mode("mcp")
async def test_bridge_restart_calls_stop_then_start(self, env_simple): async def test_bridge_restart_delegates_to_cleanup(self, env_simple):
with patch("bridge.manager.TunnelManager") as mock_cls: from bridge.cleanup import CleanupAction
mock_mgr = MagicMock()
call_order = [] with patch("bridge.cleanup.restart_tunnel") as mock_restart:
mock_mgr.stop.side_effect = lambda: call_order.append("stop") mock_restart.return_value = CleanupAction(
mock_mgr.start.side_effect = lambda: call_order.append("start") "test-tunnel", "healthy", "remote forward healthy"
mock_cls.return_value = mock_mgr )
from fastmcp import Client from fastmcp import Client
async with Client(mcp) as c: async with Client(mcp) as c:
result = await c.call_tool("bridge_restart", {"tunnel": "test-tunnel"}) result = await c.call_tool("bridge_restart", {"tunnel": "test-tunnel"})
data = _data(result) data = _data(result)
assert "restarted" in data assert data["actions"][0]["tunnel"] == "test-tunnel"
assert "test-tunnel" in data["restarted"] assert data["actions"][0]["action"] == "healthy"
assert call_order == ["stop", "start"] mock_restart.assert_called_once()
async def test_bridge_restart_unknown_tunnel(self, env_simple): async def test_bridge_restart_unknown_tunnel(self, env_simple):
from fastmcp import Client from fastmcp import Client

View File

@@ -157,31 +157,82 @@ Just controlled operational access when you need it.
Start a bridge: Start a bridge:
``` ```
ob up hostA=hostB bridge up state-hub-railiance01
``` ```
Check active bridges: Check active bridges:
``` ```
ob status bridge status
``` ```
Investigate infrastructure targets: Investigate infrastructure targets:
``` ```
ob targets bridge targets
``` ```
Stop the bridge when finished: Stop the bridge when finished:
``` ```
ob down hostA=hostB bridge down state-hub-railiance01
``` ```
OpsBridge handles the lifecycle so operators can focus on solving the problem. OpsBridge handles the lifecycle so operators can focus on solving the problem.
--- ---
# Tunnel lifecycle commands
| Command | Purpose |
|---------|---------|
| `bridge up` | Start tunnel(s) that are not already running |
| `bridge down` | Stop tunnel(s) that are running |
| `bridge restart` | Blank-slate recovery — get tunnel(s) operational again |
| `bridge maintenance cleanup` | Proactive hygiene sweep without implying restart |
## `bridge restart` — blank-slate recovery
`bridge restart` means *operational again*, not merely cycling the local manager
PID while a broken remote listener still holds the port.
For **reverse** tunnels (State Hub exposure on remote hosts), restart:
1. Runs `should_cleanup_tunnel` to detect stale SSH remote forwards
2. Clears orphan listeners on the remote host when needed
3. Reconnects the tunnel (stop + start) only when cleanup was required
When the remote forward is already healthy, restart reports `healthy` and leaves
the working tunnel running — no unnecessary disruption.
For **local-direction** tunnels (`direction: local` in `tunnels.yaml`, e.g.
`k3s-api-coulombcore`), restart uses local stop/start only; no remote cleanup.
Use `bridge maintenance cleanup` for scheduled or manual hygiene without the
restart contract. The nightly cron (`bridge maintenance install-cron`) runs
`maintenance cleanup --restart` at 03:00.
**Incident context:** stale orphan `sshd` remote forwards after laptop sleep
blocked `bridge restart` until operators discovered the maintenance subcommand.
See `state-hub/history/20260621-weekend-automation-assessment.md` and
`BRIDGE-WP-0005` in this repo.
## Host roles
Tunnels in `~/.config/bridge/tunnels.yaml` serve three host roles:
| Role | Hosts | Behaviour |
|------|-------|-----------|
| **Workstation origin** | WSL laptop | Shutdown, sleep, and network changes kill local bridge processes without graceful remote SSH teardown. Orphan forwards on all remotes are common after wake. |
| **VPS remotes** | coulombcore, railiance01 | Normally always-on. Maintenance reboots clear kernel state, but laptop return can leave orphan forwards from the previous session if the VPS did not reboot. |
| **LAN builder** | haskelseed | Intermittently offline; same orphan-forward pattern when the workstation-side tunnel dies uncleanly. |
Conditional remote cleanup before restart benefits all reverse tunnels.
`should_cleanup_tunnel` skips healthy forwards — VPS tunnels with live working
forwards are untouched.
---
# The Philosophy Behind OpsBridge # The Philosophy Behind OpsBridge
Infrastructure teams succeed or fail based on how effectively they bridge the gaps between: Infrastructure teams succeed or fail based on how effectively they bridge the gaps between:

View File

@@ -4,7 +4,7 @@ type: workplan
title: "Restart includes remote cleanup (blank-slate recovery)" title: "Restart includes remote cleanup (blank-slate recovery)"
domain: custodian domain: custodian
repo: ops-bridge repo: ops-bridge
status: ready status: finished
owner: codex owner: codex
topic_slug: custodian topic_slug: custodian
created: "2026-06-21" created: "2026-06-21"
@@ -97,7 +97,7 @@ Emit the same action summary strings cleanup already uses (`healthy`,
```task ```task
id: BRIDGE-WP-0005-T01 id: BRIDGE-WP-0005-T01
status: todo status: done
priority: high priority: high
state_hub_task_id: "b61c5d45-1198-416d-aa15-f2063fc5eb14" state_hub_task_id: "b61c5d45-1198-416d-aa15-f2063fc5eb14"
``` ```
@@ -119,7 +119,7 @@ Requirements:
```task ```task
id: BRIDGE-WP-0005-T02 id: BRIDGE-WP-0005-T02
status: todo status: done
priority: high priority: high
state_hub_task_id: "b4ad0525-6936-4799-bead-3603d05c49af" state_hub_task_id: "b4ad0525-6936-4799-bead-3603d05c49af"
``` ```
@@ -138,7 +138,7 @@ Update `tests/test_cli.py`:
```task ```task
id: BRIDGE-WP-0005-T03 id: BRIDGE-WP-0005-T03
status: todo status: done
priority: medium priority: medium
state_hub_task_id: "60586375-b0b4-4d4c-ba87-0699e76bf30c" state_hub_task_id: "60586375-b0b4-4d4c-ba87-0699e76bf30c"
``` ```
@@ -156,7 +156,7 @@ Document the blank-slate restart contract:
```task ```task
id: BRIDGE-WP-0005-T04 id: BRIDGE-WP-0005-T04
status: todo status: cancelled
priority: low priority: low
state_hub_task_id: "518f1b5e-3098-42aa-9662-bdab1d7d269b" state_hub_task_id: "518f1b5e-3098-42aa-9662-bdab1d7d269b"
``` ```
@@ -166,26 +166,29 @@ once after repeated exit-255 bind failures (laptop wake without operator running
`bridge restart`). Defer unless T1–T3 are done; mark `cancel` if heuristic risk `bridge restart`). Defer unless T1–T3 are done; mark `cancel` if heuristic risk
outweighs benefit. outweighs benefit.
Done when documented decision: implement, defer, or cancel with reason. **Decision (2026-06-21): cancelled for now.** Auto-cleanup inside the reconnect
loop risks killing a healthy forward that merely looks orphaned because it is owned by another session
or operator. `bridge restart` now covers the operator-facing blank-slate path;
nightly `maintenance cleanup --restart` covers unattended hygiene. Revisit only if
wake-from-sleep reconnect failures remain frequent after a month of observation.
## T5 — Live verification on workstation + VPS ## T5 — Live verification on workstation + VPS
```task ```task
id: BRIDGE-WP-0005-T05 id: BRIDGE-WP-0005-T05
status: todo status: done
priority: medium priority: medium
state_hub_task_id: "b5d305ef-5b5d-4afe-a992-e0960d07af79" state_hub_task_id: "b5d305ef-5b5d-4afe-a992-e0960d07af79"
``` ```
After T1–T2 ship, verify on real config: After T1–T2 ship, verify on real config:
1. **railiance01** — reproduce stale-forward scenario (or simulate); confirm 1. **railiance01** — `state-hub-mcp-railiance01` was `reconnecting` with stale
`bridge restart state-hub-railiance01` clears and connects without needing forward; `bridge restart` reported `cleaned_and_restarted` and tunnel reached
the maintenance subcommand. `connected`.
2. **haskelseed** — `bridge restart state-hub-haskelseed` after a manual 2. **haskelseed** — not exercised (all tunnels already healthy); Alpine netstat
`bridge down` while remote port still listens (Alpine `netstat` path from path unchanged from ADHOC-2026-06-14 and covered by existing cleanup tests.
ADHOC-2026-06-14). 3. **coulombcore** — `bridge restart state-hub-coulombcore` reported `healthy`,
3. **coulombcore** — confirm healthy tunnel restart is a no-op remote cleanup PID unchanged (4116), forward undisturbed.
(`healthy` action) and does not disrupt a working forward.
Log a State Hub progress note on workstream close. Mark workplan `finished`. State Hub progress logged (2026-06-21). Workplan marked `finished`.