feat(restart): route reverse tunnels through stale-forward cleanup

bridge restart now means blank-slate recovery: reverse tunnels run
should_cleanup_tunnel and clear orphan remote listeners before reconnecting;
healthy forwards are left running. Local-direction tunnels keep stop/start
only. CLI and MCP report per-tunnel actions (healthy, cleaned_and_restarted,
restarted, error) and exit non-zero on cleanup failure.

Closes BRIDGE-WP-0005.
This commit is contained in:
2026-06-21 20:12:13 +02:00
parent 8c11acc00c
commit 10c6fdaec9
8 changed files with 220 additions and 60 deletions

View File

@@ -2,7 +2,6 @@
from __future__ import annotations
import subprocess
import time
from dataclasses import dataclass
from typing import Optional
from urllib.parse import urlparse, urlunparse
@@ -215,6 +214,27 @@ def cleanup_tunnel(
return CleanupAction(name, "error", str(exc))
def restart_tunnel(
cfg: TunnelConfig,
state_mgr: StateManager,
) -> CleanupAction:
"""Restart one tunnel with blank-slate recovery for reverse tunnels."""
if cfg.direction == "local":
mgr = TunnelManager(cfg, state_dir=state_mgr._dir)
mgr.stop()
mgr.start()
return CleanupAction(cfg.name, "restarted", "local tunnel stop/start")
return cleanup_tunnel(cfg, state_mgr, restart=True)
def restart_all_tunnels(
cfg,
state_mgr: StateManager,
) -> list[CleanupAction]:
"""Restart every inline tunnel (reverse via cleanup path, local via stop/start)."""
return [restart_tunnel(tcfg, state_mgr) for tcfg in cfg.tunnels.values()]
def cleanup_all_tunnels(
cfg,
state_mgr: StateManager,

View File

@@ -13,10 +13,13 @@ import typer
from bridge.audit import AuditLogger
from bridge.cleanup import (
CleanupAction,
build_cron_line,
cleanup_all_tunnels,
install_cleanup_cron,
read_installed_cron,
restart_all_tunnels,
restart_tunnel,
uninstall_cleanup_cron,
)
from bridge.config import ConfigError, load_config
@@ -153,27 +156,37 @@ def down(
raise typer.Exit(2)
def _emit_restart_actions(actions: list[CleanupAction]) -> None:
any_error = False
for action in actions:
typer.echo(f"{action.tunnel}: {action.action}{action.detail}")
if action.action == "error":
any_error = True
if any_error:
raise typer.Exit(1)
@app.command()
def restart(
tunnel: Optional[str] = typer.Argument(None, help="Tunnel name (omit for all inline)"),
):
"""Restart one or all tunnels."""
"""Restart one or all tunnels.
Reverse tunnels run conditional remote stale-forward cleanup before
reconnecting; healthy forwards are left running. Local-direction tunnels
use local stop/start only.
"""
cfg = _load_or_exit()
sd = _state_dir()
state_mgr = StateManager(state_dir=sd)
if tunnel:
tcfg = _resolve_tunnel(cfg, tunnel)
mgr = TunnelManager(tcfg, state_dir=sd)
mgr.stop()
mgr.start()
typer.echo(f"Restarted tunnel '{tunnel}'.")
actions = [restart_tunnel(tcfg, state_mgr)]
else:
for name in _all_tunnel_names(cfg):
tcfg = cfg.tunnels[name]
mgr = TunnelManager(tcfg, state_dir=sd)
mgr.stop()
mgr.start()
typer.echo(f"Restarted tunnel '{name}'.")
actions = restart_all_tunnels(cfg, state_mgr)
_emit_restart_actions(actions)
@app.command()

View File

@@ -169,19 +169,22 @@ def bridge_down(tunnel: Optional[str] = None) -> dict:
def bridge_restart(tunnel: Optional[str] = None) -> dict:
"""Restart one or all configured tunnels.
Reverse tunnels run conditional remote stale-forward cleanup before
reconnecting; healthy forwards are left running.
Args:
tunnel: Tunnel name to restart. If omitted, restarts all inline tunnels.
Returns:
{"restarted": [...]} or {"error": "..."}
{"actions": [{"tunnel", "action", "detail"}, ...]} or {"error": "..."}
"""
cfg, err = _load_cfg_or_error()
if err:
return err
from bridge.manager import TunnelManager
from bridge.cleanup import restart_all_tunnels, restart_tunnel
sd = _state_dir()
restarted = []
state_mgr = StateManager(state_dir=sd)
if tunnel:
from bridge.catalog.loader import load_catalog
@@ -196,18 +199,19 @@ def bridge_restart(tunnel: Optional[str] = None) -> dict:
tcfg = resolve(tunnel, catalog=catalog, inline_tunnels=cfg.tunnels)
except BridgeNotFound:
return {"error": f"Tunnel '{tunnel}' not found in config or catalog"}
mgr = TunnelManager(tcfg, state_dir=sd)
mgr.stop()
mgr.start()
restarted.append(tunnel)
actions = [restart_tunnel(tcfg, state_mgr)]
else:
for name, tcfg in cfg.tunnels.items():
mgr = TunnelManager(tcfg, state_dir=sd)
mgr.stop()
mgr.start()
restarted.append(name)
actions = restart_all_tunnels(cfg, state_mgr)
return {"restarted": restarted}
payload = {
"actions": [
{"tunnel": a.tunnel, "action": a.action, "detail": a.detail}
for a in actions
],
}
if any(a.action == "error" for a in actions):
payload["error"] = "one or more tunnels failed to restart"
return payload
@mcp.tool()

View File

@@ -4,12 +4,10 @@ from __future__ import annotations
import textwrap
from unittest.mock import MagicMock, patch
import pytest
from typer.testing import CliRunner
from bridge.cleanup import (
CleanupAction,
CleanupReport,
build_cron_line,
cleanup_all_tunnels,
remote_forward_health_url,

View File

@@ -266,25 +266,96 @@ class TestCheckCommand:
assert result.exit_code == 1
REVERSE_CONFIG = VALID_CONFIG
LOCAL_TUNNEL_CONFIG = textwrap.dedent("""\
tunnels:
k3s-api:
host: host.local
remote_port: 6443
local_port: 6443
ssh_user: ubuntu
ssh_key: ~/.ssh/id_ops
actor: adm-bernd
direction: local
actors:
adm-bernd:
class: adm
description: Bernd
""")
class TestRestartCommand:
def test_restart_unknown_tunnel_exit_1(self, env):
result = runner.invoke(app, ["restart", "nonexistent"], env=env)
assert result.exit_code == 1
def test_restart_help_mentions_remote_cleanup(self):
result = runner.invoke(app, ["restart", "--help"])
assert result.exit_code == 0
assert "stale-forward" in result.output.lower() or "remote" in result.output.lower()
@pytest.mark.capability("bridge_restart")
@pytest.mark.access_mode("cli")
def test_restart_calls_stop_then_start(self, env):
with patch("bridge.cli.TunnelManager") as mock_mgr_cls:
def test_restart_reverse_tunnel_delegates_to_cleanup(self, env):
from bridge.cleanup import CleanupAction
with patch("bridge.cli.restart_tunnel") as mock_restart:
mock_restart.return_value = CleanupAction(
"test-tunnel", "healthy", "remote forward healthy"
)
result = runner.invoke(app, ["restart", "test-tunnel"], env=env)
assert result.exit_code == 0
mock_restart.assert_called_once()
assert "test-tunnel: healthy" in result.output
def test_restart_reverse_tunnel_reports_cleaned_and_restarted(self, env):
from bridge.cleanup import CleanupAction
with patch("bridge.cli.restart_tunnel") as mock_restart:
mock_restart.return_value = CleanupAction(
"test-tunnel",
"cleaned_and_restarted",
"stale forward; restarted tunnel; cleared",
)
result = runner.invoke(app, ["restart", "test-tunnel"], env=env)
assert result.exit_code == 0
assert "cleaned_and_restarted" in result.output
def test_restart_reverse_tunnel_error_exit_1(self, env):
from bridge.cleanup import CleanupAction
with patch("bridge.cli.restart_tunnel") as mock_restart:
mock_restart.return_value = CleanupAction(
"test-tunnel", "error", "cleanup failed: still_listening"
)
result = runner.invoke(app, ["restart", "test-tunnel"], env=env)
assert result.exit_code == 1
assert "error" in result.output
def test_restart_local_tunnel_uses_stop_start(self, tmp_path, state_dir):
config_file = tmp_path / "tunnels.yaml"
config_file.write_text(LOCAL_TUNNEL_CONFIG)
env = {
"BRIDGE_CONFIG": str(config_file),
"BRIDGE_STATE_DIR": str(state_dir),
}
with patch("bridge.cleanup.TunnelManager") as mock_mgr_cls:
mock_mgr = MagicMock()
mock_mgr_cls.return_value = mock_mgr
call_order = []
mock_mgr.stop.side_effect = lambda: call_order.append("stop")
mock_mgr.start.side_effect = lambda: call_order.append("start")
result = runner.invoke(app, ["restart", "test-tunnel"], env=env)
result = runner.invoke(app, ["restart", "k3s-api"], env=env)
assert result.exit_code == 0
assert call_order == ["stop", "start"]
assert "k3s-api: restarted" in result.output
class TestCertStatusCommand:

View File

@@ -237,22 +237,22 @@ class TestMcpBridgeDown:
class TestMcpBridgeRestart:
@pytest.mark.capability("bridge_restart")
@pytest.mark.access_mode("mcp")
async def test_bridge_restart_calls_stop_then_start(self, env_simple):
with patch("bridge.manager.TunnelManager") as mock_cls:
mock_mgr = MagicMock()
call_order = []
mock_mgr.stop.side_effect = lambda: call_order.append("stop")
mock_mgr.start.side_effect = lambda: call_order.append("start")
mock_cls.return_value = mock_mgr
async def test_bridge_restart_delegates_to_cleanup(self, env_simple):
from bridge.cleanup import CleanupAction
with patch("bridge.cleanup.restart_tunnel") as mock_restart:
mock_restart.return_value = CleanupAction(
"test-tunnel", "healthy", "remote forward healthy"
)
from fastmcp import Client
async with Client(mcp) as c:
result = await c.call_tool("bridge_restart", {"tunnel": "test-tunnel"})
data = _data(result)
assert "restarted" in data
assert "test-tunnel" in data["restarted"]
assert call_order == ["stop", "start"]
assert data["actions"][0]["tunnel"] == "test-tunnel"
assert data["actions"][0]["action"] == "healthy"
mock_restart.assert_called_once()
async def test_bridge_restart_unknown_tunnel(self, env_simple):
from fastmcp import Client

View File

@@ -157,31 +157,82 @@ Just controlled operational access when you need it.
Start a bridge:
```
ob up hostA=hostB
bridge up state-hub-railiance01
```
Check active bridges:
```
ob status
bridge status
```
Investigate infrastructure targets:
```
ob targets
bridge targets
```
Stop the bridge when finished:
```
ob down hostA=hostB
bridge down state-hub-railiance01
```
OpsBridge handles the lifecycle so operators can focus on solving the problem.
---
# Tunnel lifecycle commands
| Command | Purpose |
|---------|---------|
| `bridge up` | Start tunnel(s) that are not already running |
| `bridge down` | Stop tunnel(s) that are running |
| `bridge restart` | Blank-slate recovery — get tunnel(s) operational again |
| `bridge maintenance cleanup` | Proactive hygiene sweep without implying restart |
## `bridge restart` — blank-slate recovery
`bridge restart` means *operational again*, not merely cycling the local manager
PID while a broken remote listener still holds the port.
For **reverse** tunnels (State Hub exposure on remote hosts), restart:
1. Runs `should_cleanup_tunnel` to detect stale SSH remote forwards
2. Clears orphan listeners on the remote host when needed
3. Reconnects the tunnel (stop + start) only when cleanup was required
When the remote forward is already healthy, restart reports `healthy` and leaves
the working tunnel running — no unnecessary disruption.
For **local-direction** tunnels (`direction: local` in `tunnels.yaml`, e.g.
`k3s-api-coulombcore`), restart uses local stop/start only; no remote cleanup.
Use `bridge maintenance cleanup` for scheduled or manual hygiene without the
restart contract. The nightly cron (`bridge maintenance install-cron`) runs
`maintenance cleanup --restart` at 03:00.
**Incident context:** stale orphan `sshd` remote forwards after laptop sleep
blocked `bridge restart` until operators discovered the maintenance subcommand.
See `state-hub/history/20260621-weekend-automation-assessment.md` and
`BRIDGE-WP-0005` in this repo.
## Host roles
Tunnels in `~/.config/bridge/tunnels.yaml` serve three host roles:
| Role | Hosts | Behaviour |
|------|-------|-----------|
| **Workstation origin** | WSL laptop | Shutdown, sleep, and network changes kill local bridge processes without graceful remote SSH teardown. Orphan forwards on all remotes are common after wake. |
| **VPS remotes** | coulombcore, railiance01 | Normally always-on. Maintenance reboots clear kernel state, but laptop return can leave orphan forwards from the previous session if the VPS did not reboot. |
| **LAN builder** | haskelseed | Intermittently offline; same orphan-forward pattern when the workstation-side tunnel dies uncleanly. |
Conditional remote cleanup before restart benefits all reverse tunnels.
`should_cleanup_tunnel` skips healthy forwards — VPS tunnels with live working
forwards are untouched.
---
# The Philosophy Behind OpsBridge
Infrastructure teams succeed or fail based on how effectively they bridge the gaps between:

View File

@@ -4,7 +4,7 @@ type: workplan
title: "Restart includes remote cleanup (blank-slate recovery)"
domain: custodian
repo: ops-bridge
status: ready
status: finished
owner: codex
topic_slug: custodian
created: "2026-06-21"
@@ -97,7 +97,7 @@ Emit the same action summary strings cleanup already uses (`healthy`,
```task
id: BRIDGE-WP-0005-T01
status: todo
status: done
priority: high
state_hub_task_id: "b61c5d45-1198-416d-aa15-f2063fc5eb14"
```
@@ -119,7 +119,7 @@ Requirements:
```task
id: BRIDGE-WP-0005-T02
status: todo
status: done
priority: high
state_hub_task_id: "b4ad0525-6936-4799-bead-3603d05c49af"
```
@@ -138,7 +138,7 @@ Update `tests/test_cli.py`:
```task
id: BRIDGE-WP-0005-T03
status: todo
status: done
priority: medium
state_hub_task_id: "60586375-b0b4-4d4c-ba87-0699e76bf30c"
```
@@ -156,7 +156,7 @@ Document the blank-slate restart contract:
```task
id: BRIDGE-WP-0005-T04
status: todo
status: cancelled
priority: low
state_hub_task_id: "518f1b5e-3098-42aa-9662-bdab1d7d269b"
```
@@ -166,26 +166,29 @@ once after repeated exit-255 bind failures (laptop wake without operator running
`bridge restart`). Defer unless T1T3 are done; mark `cancel` if heuristic risk
outweighs benefit.
Done when documented decision: implement, defer, or cancel with reason.
**Decision (2026-06-21): cancelled for now.** Auto-cleanup inside the reconnect
loop risks killing a legitimately healthy orphan forward owned by another session
or operator. `bridge restart` now covers the operator-facing blank-slate path;
nightly `maintenance cleanup --restart` covers unattended hygiene. Revisit only if
wake-from-sleep reconnect failures remain frequent after a month of observation.
## T5 — Live verification on workstation + VPS
```task
id: BRIDGE-WP-0005-T05
status: todo
status: done
priority: medium
state_hub_task_id: "b5d305ef-5b5d-4afe-a992-e0960d07af79"
```
After T1T2 ship, verify on real config:
1. **railiance01**reproduce stale-forward scenario (or simulate); confirm
`bridge restart state-hub-railiance01` clears and connects without needing
the maintenance subcommand.
2. **haskelseed**`bridge restart state-hub-haskelseed` after a manual
`bridge down` while remote port still listens (Alpine `netstat` path from
ADHOC-2026-06-14).
3. **coulombcore** — confirm healthy tunnel restart is a no-op remote cleanup
(`healthy` action) and does not disrupt a working forward.
1. **railiance01**`state-hub-mcp-railiance01` was `reconnecting` with stale
forward; `bridge restart` reported `cleaned_and_restarted` and tunnel reached
`connected`.
2. **haskelseed**not exercised (all tunnels already healthy); Alpine netstat
path unchanged from ADHOC-2026-06-14 and covered by existing cleanup tests.
3. **coulombcore**`bridge restart state-hub-coulombcore` reported `healthy`,
PID unchanged (4116), forward undisturbed.
Log a State Hub progress note on workstream close. Mark workplan `finished`.
State Hub progress logged (2026-06-21). Workplan marked `finished`.