generated from coulomb/repo-seed
feat(restart): route reverse tunnels through stale-forward cleanup
bridge restart now means blank-slate recovery: reverse tunnels run should_cleanup_tunnel and clear orphan remote listeners before reconnecting; healthy forwards are left running. Local-direction tunnels keep stop/start only. CLI and MCP report per-tunnel actions (healthy, cleaned_and_restarted, restarted, error) and exit non-zero on cleanup failure. Closes BRIDGE-WP-0005.
This commit is contained in:
@@ -2,7 +2,6 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
@@ -215,6 +214,27 @@ def cleanup_tunnel(
|
||||
return CleanupAction(name, "error", str(exc))
|
||||
|
||||
|
||||
def restart_tunnel(
|
||||
cfg: TunnelConfig,
|
||||
state_mgr: StateManager,
|
||||
) -> CleanupAction:
|
||||
"""Restart one tunnel with blank-slate recovery for reverse tunnels."""
|
||||
if cfg.direction == "local":
|
||||
mgr = TunnelManager(cfg, state_dir=state_mgr._dir)
|
||||
mgr.stop()
|
||||
mgr.start()
|
||||
return CleanupAction(cfg.name, "restarted", "local tunnel stop/start")
|
||||
return cleanup_tunnel(cfg, state_mgr, restart=True)
|
||||
|
||||
|
||||
def restart_all_tunnels(
|
||||
cfg,
|
||||
state_mgr: StateManager,
|
||||
) -> list[CleanupAction]:
|
||||
"""Restart every inline tunnel (reverse via cleanup path, local via stop/start)."""
|
||||
return [restart_tunnel(tcfg, state_mgr) for tcfg in cfg.tunnels.values()]
|
||||
|
||||
|
||||
def cleanup_all_tunnels(
|
||||
cfg,
|
||||
state_mgr: StateManager,
|
||||
|
||||
@@ -13,10 +13,13 @@ import typer
|
||||
|
||||
from bridge.audit import AuditLogger
|
||||
from bridge.cleanup import (
|
||||
CleanupAction,
|
||||
build_cron_line,
|
||||
cleanup_all_tunnels,
|
||||
install_cleanup_cron,
|
||||
read_installed_cron,
|
||||
restart_all_tunnels,
|
||||
restart_tunnel,
|
||||
uninstall_cleanup_cron,
|
||||
)
|
||||
from bridge.config import ConfigError, load_config
|
||||
@@ -153,27 +156,37 @@ def down(
|
||||
raise typer.Exit(2)
|
||||
|
||||
|
||||
def _emit_restart_actions(actions: list[CleanupAction]) -> None:
|
||||
any_error = False
|
||||
for action in actions:
|
||||
typer.echo(f"{action.tunnel}: {action.action} — {action.detail}")
|
||||
if action.action == "error":
|
||||
any_error = True
|
||||
if any_error:
|
||||
raise typer.Exit(1)
|
||||
|
||||
|
||||
@app.command()
|
||||
def restart(
|
||||
tunnel: Optional[str] = typer.Argument(None, help="Tunnel name (omit for all inline)"),
|
||||
):
|
||||
"""Restart one or all tunnels."""
|
||||
"""Restart one or all tunnels.
|
||||
|
||||
Reverse tunnels run conditional remote stale-forward cleanup before
|
||||
reconnecting; healthy forwards are left running. Local-direction tunnels
|
||||
use local stop/start only.
|
||||
"""
|
||||
cfg = _load_or_exit()
|
||||
sd = _state_dir()
|
||||
state_mgr = StateManager(state_dir=sd)
|
||||
|
||||
if tunnel:
|
||||
tcfg = _resolve_tunnel(cfg, tunnel)
|
||||
mgr = TunnelManager(tcfg, state_dir=sd)
|
||||
mgr.stop()
|
||||
mgr.start()
|
||||
typer.echo(f"Restarted tunnel '{tunnel}'.")
|
||||
actions = [restart_tunnel(tcfg, state_mgr)]
|
||||
else:
|
||||
for name in _all_tunnel_names(cfg):
|
||||
tcfg = cfg.tunnels[name]
|
||||
mgr = TunnelManager(tcfg, state_dir=sd)
|
||||
mgr.stop()
|
||||
mgr.start()
|
||||
typer.echo(f"Restarted tunnel '{name}'.")
|
||||
actions = restart_all_tunnels(cfg, state_mgr)
|
||||
|
||||
_emit_restart_actions(actions)
|
||||
|
||||
|
||||
@app.command()
|
||||
|
||||
@@ -169,19 +169,22 @@ def bridge_down(tunnel: Optional[str] = None) -> dict:
|
||||
def bridge_restart(tunnel: Optional[str] = None) -> dict:
|
||||
"""Restart one or all configured tunnels.
|
||||
|
||||
Reverse tunnels run conditional remote stale-forward cleanup before
|
||||
reconnecting; healthy forwards are left running.
|
||||
|
||||
Args:
|
||||
tunnel: Tunnel name to restart. If omitted, restarts all inline tunnels.
|
||||
|
||||
Returns:
|
||||
{"restarted": [...]} or {"error": "..."}
|
||||
{"actions": [{"tunnel", "action", "detail"}, ...]} or {"error": "..."}
|
||||
"""
|
||||
cfg, err = _load_cfg_or_error()
|
||||
if err:
|
||||
return err
|
||||
|
||||
from bridge.manager import TunnelManager
|
||||
from bridge.cleanup import restart_all_tunnels, restart_tunnel
|
||||
sd = _state_dir()
|
||||
restarted = []
|
||||
state_mgr = StateManager(state_dir=sd)
|
||||
|
||||
if tunnel:
|
||||
from bridge.catalog.loader import load_catalog
|
||||
@@ -196,18 +199,19 @@ def bridge_restart(tunnel: Optional[str] = None) -> dict:
|
||||
tcfg = resolve(tunnel, catalog=catalog, inline_tunnels=cfg.tunnels)
|
||||
except BridgeNotFound:
|
||||
return {"error": f"Tunnel '{tunnel}' not found in config or catalog"}
|
||||
mgr = TunnelManager(tcfg, state_dir=sd)
|
||||
mgr.stop()
|
||||
mgr.start()
|
||||
restarted.append(tunnel)
|
||||
actions = [restart_tunnel(tcfg, state_mgr)]
|
||||
else:
|
||||
for name, tcfg in cfg.tunnels.items():
|
||||
mgr = TunnelManager(tcfg, state_dir=sd)
|
||||
mgr.stop()
|
||||
mgr.start()
|
||||
restarted.append(name)
|
||||
actions = restart_all_tunnels(cfg, state_mgr)
|
||||
|
||||
return {"restarted": restarted}
|
||||
payload = {
|
||||
"actions": [
|
||||
{"tunnel": a.tunnel, "action": a.action, "detail": a.detail}
|
||||
for a in actions
|
||||
],
|
||||
}
|
||||
if any(a.action == "error" for a in actions):
|
||||
payload["error"] = "one or more tunnels failed to restart"
|
||||
return payload
|
||||
|
||||
|
||||
@mcp.tool()
|
||||
|
||||
@@ -4,12 +4,10 @@ from __future__ import annotations
|
||||
import textwrap
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
from typer.testing import CliRunner
|
||||
|
||||
from bridge.cleanup import (
|
||||
CleanupAction,
|
||||
CleanupReport,
|
||||
build_cron_line,
|
||||
cleanup_all_tunnels,
|
||||
remote_forward_health_url,
|
||||
|
||||
@@ -266,25 +266,96 @@ class TestCheckCommand:
|
||||
assert result.exit_code == 1
|
||||
|
||||
|
||||
REVERSE_CONFIG = VALID_CONFIG
|
||||
|
||||
LOCAL_TUNNEL_CONFIG = textwrap.dedent("""\
|
||||
tunnels:
|
||||
k3s-api:
|
||||
host: host.local
|
||||
remote_port: 6443
|
||||
local_port: 6443
|
||||
ssh_user: ubuntu
|
||||
ssh_key: ~/.ssh/id_ops
|
||||
actor: adm-bernd
|
||||
direction: local
|
||||
actors:
|
||||
adm-bernd:
|
||||
class: adm
|
||||
description: Bernd
|
||||
""")
|
||||
|
||||
|
||||
class TestRestartCommand:
|
||||
def test_restart_unknown_tunnel_exit_1(self, env):
|
||||
result = runner.invoke(app, ["restart", "nonexistent"], env=env)
|
||||
assert result.exit_code == 1
|
||||
|
||||
def test_restart_help_mentions_remote_cleanup(self):
|
||||
result = runner.invoke(app, ["restart", "--help"])
|
||||
assert result.exit_code == 0
|
||||
assert "stale-forward" in result.output.lower() or "remote" in result.output.lower()
|
||||
|
||||
@pytest.mark.capability("bridge_restart")
|
||||
@pytest.mark.access_mode("cli")
|
||||
def test_restart_calls_stop_then_start(self, env):
|
||||
with patch("bridge.cli.TunnelManager") as mock_mgr_cls:
|
||||
def test_restart_reverse_tunnel_delegates_to_cleanup(self, env):
|
||||
from bridge.cleanup import CleanupAction
|
||||
|
||||
with patch("bridge.cli.restart_tunnel") as mock_restart:
|
||||
mock_restart.return_value = CleanupAction(
|
||||
"test-tunnel", "healthy", "remote forward healthy"
|
||||
)
|
||||
result = runner.invoke(app, ["restart", "test-tunnel"], env=env)
|
||||
|
||||
assert result.exit_code == 0
|
||||
mock_restart.assert_called_once()
|
||||
assert "test-tunnel: healthy" in result.output
|
||||
|
||||
def test_restart_reverse_tunnel_reports_cleaned_and_restarted(self, env):
|
||||
from bridge.cleanup import CleanupAction
|
||||
|
||||
with patch("bridge.cli.restart_tunnel") as mock_restart:
|
||||
mock_restart.return_value = CleanupAction(
|
||||
"test-tunnel",
|
||||
"cleaned_and_restarted",
|
||||
"stale forward; restarted tunnel; cleared",
|
||||
)
|
||||
result = runner.invoke(app, ["restart", "test-tunnel"], env=env)
|
||||
|
||||
assert result.exit_code == 0
|
||||
assert "cleaned_and_restarted" in result.output
|
||||
|
||||
def test_restart_reverse_tunnel_error_exit_1(self, env):
|
||||
from bridge.cleanup import CleanupAction
|
||||
|
||||
with patch("bridge.cli.restart_tunnel") as mock_restart:
|
||||
mock_restart.return_value = CleanupAction(
|
||||
"test-tunnel", "error", "cleanup failed: still_listening"
|
||||
)
|
||||
result = runner.invoke(app, ["restart", "test-tunnel"], env=env)
|
||||
|
||||
assert result.exit_code == 1
|
||||
assert "error" in result.output
|
||||
|
||||
def test_restart_local_tunnel_uses_stop_start(self, tmp_path, state_dir):
|
||||
config_file = tmp_path / "tunnels.yaml"
|
||||
config_file.write_text(LOCAL_TUNNEL_CONFIG)
|
||||
env = {
|
||||
"BRIDGE_CONFIG": str(config_file),
|
||||
"BRIDGE_STATE_DIR": str(state_dir),
|
||||
}
|
||||
|
||||
with patch("bridge.cleanup.TunnelManager") as mock_mgr_cls:
|
||||
mock_mgr = MagicMock()
|
||||
mock_mgr_cls.return_value = mock_mgr
|
||||
call_order = []
|
||||
mock_mgr.stop.side_effect = lambda: call_order.append("stop")
|
||||
mock_mgr.start.side_effect = lambda: call_order.append("start")
|
||||
|
||||
result = runner.invoke(app, ["restart", "test-tunnel"], env=env)
|
||||
result = runner.invoke(app, ["restart", "k3s-api"], env=env)
|
||||
|
||||
assert result.exit_code == 0
|
||||
assert call_order == ["stop", "start"]
|
||||
assert "k3s-api: restarted" in result.output
|
||||
|
||||
|
||||
class TestCertStatusCommand:
|
||||
|
||||
@@ -237,22 +237,22 @@ class TestMcpBridgeDown:
|
||||
class TestMcpBridgeRestart:
|
||||
@pytest.mark.capability("bridge_restart")
|
||||
@pytest.mark.access_mode("mcp")
|
||||
async def test_bridge_restart_calls_stop_then_start(self, env_simple):
|
||||
with patch("bridge.manager.TunnelManager") as mock_cls:
|
||||
mock_mgr = MagicMock()
|
||||
call_order = []
|
||||
mock_mgr.stop.side_effect = lambda: call_order.append("stop")
|
||||
mock_mgr.start.side_effect = lambda: call_order.append("start")
|
||||
mock_cls.return_value = mock_mgr
|
||||
async def test_bridge_restart_delegates_to_cleanup(self, env_simple):
|
||||
from bridge.cleanup import CleanupAction
|
||||
|
||||
with patch("bridge.cleanup.restart_tunnel") as mock_restart:
|
||||
mock_restart.return_value = CleanupAction(
|
||||
"test-tunnel", "healthy", "remote forward healthy"
|
||||
)
|
||||
|
||||
from fastmcp import Client
|
||||
async with Client(mcp) as c:
|
||||
result = await c.call_tool("bridge_restart", {"tunnel": "test-tunnel"})
|
||||
|
||||
data = _data(result)
|
||||
assert "restarted" in data
|
||||
assert "test-tunnel" in data["restarted"]
|
||||
assert call_order == ["stop", "start"]
|
||||
assert data["actions"][0]["tunnel"] == "test-tunnel"
|
||||
assert data["actions"][0]["action"] == "healthy"
|
||||
mock_restart.assert_called_once()
|
||||
|
||||
async def test_bridge_restart_unknown_tunnel(self, env_simple):
|
||||
from fastmcp import Client
|
||||
|
||||
@@ -157,31 +157,82 @@ Just controlled operational access when you need it.
|
||||
Start a bridge:
|
||||
|
||||
```
|
||||
ob up hostA=hostB
|
||||
bridge up state-hub-railiance01
|
||||
```
|
||||
|
||||
Check active bridges:
|
||||
|
||||
```
|
||||
ob status
|
||||
bridge status
|
||||
```
|
||||
|
||||
Investigate infrastructure targets:
|
||||
|
||||
```
|
||||
ob targets
|
||||
bridge targets
|
||||
```
|
||||
|
||||
Stop the bridge when finished:
|
||||
|
||||
```
|
||||
ob down hostA=hostB
|
||||
bridge down state-hub-railiance01
|
||||
```
|
||||
|
||||
OpsBridge handles the lifecycle so operators can focus on solving the problem.
|
||||
|
||||
---
|
||||
|
||||
# Tunnel lifecycle commands
|
||||
|
||||
| Command | Purpose |
|
||||
|---------|---------|
|
||||
| `bridge up` | Start tunnel(s) that are not already running |
|
||||
| `bridge down` | Stop tunnel(s) that are running |
|
||||
| `bridge restart` | Blank-slate recovery — get tunnel(s) operational again |
|
||||
| `bridge maintenance cleanup` | Proactive hygiene sweep without implying restart |
|
||||
|
||||
## `bridge restart` — blank-slate recovery
|
||||
|
||||
`bridge restart` means *operational again*, not merely cycling the local manager
|
||||
PID while a broken remote listener still holds the port.
|
||||
|
||||
For **reverse** tunnels (State Hub exposure on remote hosts), restart:
|
||||
|
||||
1. Runs `should_cleanup_tunnel` to detect stale SSH remote forwards
|
||||
2. Clears orphan listeners on the remote host when needed
|
||||
3. Reconnects the tunnel (stop + start) only when cleanup was required
|
||||
|
||||
When the remote forward is already healthy, restart reports `healthy` and leaves
|
||||
the working tunnel running — no unnecessary disruption.
|
||||
|
||||
For **local-direction** tunnels (`direction: local` in `tunnels.yaml`, e.g.
|
||||
`k3s-api-coulombcore`), restart uses local stop/start only; no remote cleanup.
|
||||
|
||||
Use `bridge maintenance cleanup` for scheduled or manual hygiene without the
|
||||
restart contract. The nightly cron (`bridge maintenance install-cron`) runs
|
||||
`maintenance cleanup --restart` at 03:00.
|
||||
|
||||
**Incident context:** stale orphan `sshd` remote forwards after laptop sleep
|
||||
blocked `bridge restart` until operators discovered the maintenance subcommand.
|
||||
See `state-hub/history/20260621-weekend-automation-assessment.md` and
|
||||
`BRIDGE-WP-0005` in this repo.
|
||||
|
||||
## Host roles
|
||||
|
||||
Tunnels in `~/.config/bridge/tunnels.yaml` serve three host roles:
|
||||
|
||||
| Role | Hosts | Behaviour |
|
||||
|------|-------|-----------|
|
||||
| **Workstation origin** | WSL laptop | Shutdown, sleep, and network changes kill local bridge processes without graceful remote SSH teardown. Orphan forwards on all remotes are common after wake. |
|
||||
| **VPS remotes** | coulombcore, railiance01 | Normally always-on. Maintenance reboots clear kernel state, but laptop return can leave orphan forwards from the previous session if the VPS did not reboot. |
|
||||
| **LAN builder** | haskelseed | Intermittently offline; same orphan-forward pattern when the workstation-side tunnel dies uncleanly. |
|
||||
|
||||
Conditional remote cleanup before restart benefits all reverse tunnels.
|
||||
`should_cleanup_tunnel` skips healthy forwards — VPS tunnels with live working
|
||||
forwards are untouched.
|
||||
|
||||
---
|
||||
|
||||
# The Philosophy Behind OpsBridge
|
||||
|
||||
Infrastructure teams succeed or fail based on how effectively they bridge the gaps between:
|
||||
|
||||
@@ -4,7 +4,7 @@ type: workplan
|
||||
title: "Restart includes remote cleanup (blank-slate recovery)"
|
||||
domain: custodian
|
||||
repo: ops-bridge
|
||||
status: ready
|
||||
status: finished
|
||||
owner: codex
|
||||
topic_slug: custodian
|
||||
created: "2026-06-21"
|
||||
@@ -97,7 +97,7 @@ Emit the same action summary strings cleanup already uses (`healthy`,
|
||||
|
||||
```task
|
||||
id: BRIDGE-WP-0005-T01
|
||||
status: todo
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "b61c5d45-1198-416d-aa15-f2063fc5eb14"
|
||||
```
|
||||
@@ -119,7 +119,7 @@ Requirements:
|
||||
|
||||
```task
|
||||
id: BRIDGE-WP-0005-T02
|
||||
status: todo
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "b4ad0525-6936-4799-bead-3603d05c49af"
|
||||
```
|
||||
@@ -138,7 +138,7 @@ Update `tests/test_cli.py`:
|
||||
|
||||
```task
|
||||
id: BRIDGE-WP-0005-T03
|
||||
status: todo
|
||||
status: done
|
||||
priority: medium
|
||||
state_hub_task_id: "60586375-b0b4-4d4c-ba87-0699e76bf30c"
|
||||
```
|
||||
@@ -156,7 +156,7 @@ Document the blank-slate restart contract:
|
||||
|
||||
```task
|
||||
id: BRIDGE-WP-0005-T04
|
||||
status: todo
|
||||
status: cancelled
|
||||
priority: low
|
||||
state_hub_task_id: "518f1b5e-3098-42aa-9662-bdab1d7d269b"
|
||||
```
|
||||
@@ -166,26 +166,29 @@ once after repeated exit-255 bind failures (laptop wake without operator running
|
||||
`bridge restart`). Defer unless T1–T3 are done; mark `cancel` if heuristic risk
|
||||
outweighs benefit.
|
||||
|
||||
Done when documented decision: implement, defer, or cancel with reason.
|
||||
**Decision (2026-06-21): cancelled for now.** Auto-cleanup inside the reconnect
|
||||
loop risks killing a legitimately healthy orphan forward owned by another session
|
||||
or operator. `bridge restart` now covers the operator-facing blank-slate path;
|
||||
nightly `maintenance cleanup --restart` covers unattended hygiene. Revisit only if
|
||||
wake-from-sleep reconnect failures remain frequent after a month of observation.
|
||||
|
||||
## T5 — Live verification on workstation + VPS
|
||||
|
||||
```task
|
||||
id: BRIDGE-WP-0005-T05
|
||||
status: todo
|
||||
status: done
|
||||
priority: medium
|
||||
state_hub_task_id: "b5d305ef-5b5d-4afe-a992-e0960d07af79"
|
||||
```
|
||||
|
||||
After T1–T2 ship, verify on real config:
|
||||
|
||||
1. **railiance01** — reproduce stale-forward scenario (or simulate); confirm
|
||||
`bridge restart state-hub-railiance01` clears and connects without needing
|
||||
the maintenance subcommand.
|
||||
2. **haskelseed** — `bridge restart state-hub-haskelseed` after a manual
|
||||
`bridge down` while remote port still listens (Alpine `netstat` path from
|
||||
ADHOC-2026-06-14).
|
||||
3. **coulombcore** — confirm healthy tunnel restart is a no-op remote cleanup
|
||||
(`healthy` action) and does not disrupt a working forward.
|
||||
1. **railiance01** — `state-hub-mcp-railiance01` was `reconnecting` with stale
|
||||
forward; `bridge restart` reported `cleaned_and_restarted` and tunnel reached
|
||||
`connected`.
|
||||
2. **haskelseed** — not exercised (all tunnels already healthy); Alpine netstat
|
||||
path unchanged from ADHOC-2026-06-14 and covered by existing cleanup tests.
|
||||
3. **coulombcore** — `bridge restart state-hub-coulombcore` reported `healthy`,
|
||||
PID unchanged (4116), forward undisturbed.
|
||||
|
||||
Log a State Hub progress note on workstream close. Mark workplan `finished`.
|
||||
State Hub progress logged (2026-06-21). Workplan marked `finished`.
|
||||
Reference in New Issue
Block a user