generated from coulomb/repo-seed
feat(restart): route reverse tunnels through stale-forward cleanup
bridge restart now means blank-slate recovery: reverse tunnels run should_cleanup_tunnel and clear orphan remote listeners before reconnecting; healthy forwards are left running. Local-direction tunnels keep stop/start only. CLI and MCP report per-tunnel actions (healthy, cleaned_and_restarted, restarted, error) and exit non-zero on cleanup failure. Closes BRIDGE-WP-0005.
This commit is contained in:
@@ -2,7 +2,6 @@
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import subprocess
|
import subprocess
|
||||||
import time
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from urllib.parse import urlparse, urlunparse
|
from urllib.parse import urlparse, urlunparse
|
||||||
@@ -215,6 +214,27 @@ def cleanup_tunnel(
|
|||||||
return CleanupAction(name, "error", str(exc))
|
return CleanupAction(name, "error", str(exc))
|
||||||
|
|
||||||
|
|
||||||
|
def restart_tunnel(
|
||||||
|
cfg: TunnelConfig,
|
||||||
|
state_mgr: StateManager,
|
||||||
|
) -> CleanupAction:
|
||||||
|
"""Restart one tunnel with blank-slate recovery for reverse tunnels."""
|
||||||
|
if cfg.direction == "local":
|
||||||
|
mgr = TunnelManager(cfg, state_dir=state_mgr._dir)
|
||||||
|
mgr.stop()
|
||||||
|
mgr.start()
|
||||||
|
return CleanupAction(cfg.name, "restarted", "local tunnel stop/start")
|
||||||
|
return cleanup_tunnel(cfg, state_mgr, restart=True)
|
||||||
|
|
||||||
|
|
||||||
|
def restart_all_tunnels(
|
||||||
|
cfg,
|
||||||
|
state_mgr: StateManager,
|
||||||
|
) -> list[CleanupAction]:
|
||||||
|
"""Restart every inline tunnel (reverse via cleanup path, local via stop/start)."""
|
||||||
|
return [restart_tunnel(tcfg, state_mgr) for tcfg in cfg.tunnels.values()]
|
||||||
|
|
||||||
|
|
||||||
def cleanup_all_tunnels(
|
def cleanup_all_tunnels(
|
||||||
cfg,
|
cfg,
|
||||||
state_mgr: StateManager,
|
state_mgr: StateManager,
|
||||||
|
|||||||
@@ -13,10 +13,13 @@ import typer
|
|||||||
|
|
||||||
from bridge.audit import AuditLogger
|
from bridge.audit import AuditLogger
|
||||||
from bridge.cleanup import (
|
from bridge.cleanup import (
|
||||||
|
CleanupAction,
|
||||||
build_cron_line,
|
build_cron_line,
|
||||||
cleanup_all_tunnels,
|
cleanup_all_tunnels,
|
||||||
install_cleanup_cron,
|
install_cleanup_cron,
|
||||||
read_installed_cron,
|
read_installed_cron,
|
||||||
|
restart_all_tunnels,
|
||||||
|
restart_tunnel,
|
||||||
uninstall_cleanup_cron,
|
uninstall_cleanup_cron,
|
||||||
)
|
)
|
||||||
from bridge.config import ConfigError, load_config
|
from bridge.config import ConfigError, load_config
|
||||||
@@ -153,27 +156,37 @@ def down(
|
|||||||
raise typer.Exit(2)
|
raise typer.Exit(2)
|
||||||
|
|
||||||
|
|
||||||
|
def _emit_restart_actions(actions: list[CleanupAction]) -> None:
|
||||||
|
any_error = False
|
||||||
|
for action in actions:
|
||||||
|
typer.echo(f"{action.tunnel}: {action.action} — {action.detail}")
|
||||||
|
if action.action == "error":
|
||||||
|
any_error = True
|
||||||
|
if any_error:
|
||||||
|
raise typer.Exit(1)
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
@app.command()
|
||||||
def restart(
|
def restart(
|
||||||
tunnel: Optional[str] = typer.Argument(None, help="Tunnel name (omit for all inline)"),
|
tunnel: Optional[str] = typer.Argument(None, help="Tunnel name (omit for all inline)"),
|
||||||
):
|
):
|
||||||
"""Restart one or all tunnels."""
|
"""Restart one or all tunnels.
|
||||||
|
|
||||||
|
Reverse tunnels run conditional remote stale-forward cleanup before
|
||||||
|
reconnecting; healthy forwards are left running. Local-direction tunnels
|
||||||
|
use local stop/start only.
|
||||||
|
"""
|
||||||
cfg = _load_or_exit()
|
cfg = _load_or_exit()
|
||||||
sd = _state_dir()
|
sd = _state_dir()
|
||||||
|
state_mgr = StateManager(state_dir=sd)
|
||||||
|
|
||||||
if tunnel:
|
if tunnel:
|
||||||
tcfg = _resolve_tunnel(cfg, tunnel)
|
tcfg = _resolve_tunnel(cfg, tunnel)
|
||||||
mgr = TunnelManager(tcfg, state_dir=sd)
|
actions = [restart_tunnel(tcfg, state_mgr)]
|
||||||
mgr.stop()
|
|
||||||
mgr.start()
|
|
||||||
typer.echo(f"Restarted tunnel '{tunnel}'.")
|
|
||||||
else:
|
else:
|
||||||
for name in _all_tunnel_names(cfg):
|
actions = restart_all_tunnels(cfg, state_mgr)
|
||||||
tcfg = cfg.tunnels[name]
|
|
||||||
mgr = TunnelManager(tcfg, state_dir=sd)
|
_emit_restart_actions(actions)
|
||||||
mgr.stop()
|
|
||||||
mgr.start()
|
|
||||||
typer.echo(f"Restarted tunnel '{name}'.")
|
|
||||||
|
|
||||||
|
|
||||||
@app.command()
|
@app.command()
|
||||||
|
|||||||
@@ -169,19 +169,22 @@ def bridge_down(tunnel: Optional[str] = None) -> dict:
|
|||||||
def bridge_restart(tunnel: Optional[str] = None) -> dict:
|
def bridge_restart(tunnel: Optional[str] = None) -> dict:
|
||||||
"""Restart one or all configured tunnels.
|
"""Restart one or all configured tunnels.
|
||||||
|
|
||||||
|
Reverse tunnels run conditional remote stale-forward cleanup before
|
||||||
|
reconnecting; healthy forwards are left running.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
tunnel: Tunnel name to restart. If omitted, restarts all inline tunnels.
|
tunnel: Tunnel name to restart. If omitted, restarts all inline tunnels.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
{"restarted": [...]} or {"error": "..."}
|
{"actions": [{"tunnel", "action", "detail"}, ...]} or {"error": "..."}
|
||||||
"""
|
"""
|
||||||
cfg, err = _load_cfg_or_error()
|
cfg, err = _load_cfg_or_error()
|
||||||
if err:
|
if err:
|
||||||
return err
|
return err
|
||||||
|
|
||||||
from bridge.manager import TunnelManager
|
from bridge.cleanup import restart_all_tunnels, restart_tunnel
|
||||||
sd = _state_dir()
|
sd = _state_dir()
|
||||||
restarted = []
|
state_mgr = StateManager(state_dir=sd)
|
||||||
|
|
||||||
if tunnel:
|
if tunnel:
|
||||||
from bridge.catalog.loader import load_catalog
|
from bridge.catalog.loader import load_catalog
|
||||||
@@ -196,18 +199,19 @@ def bridge_restart(tunnel: Optional[str] = None) -> dict:
|
|||||||
tcfg = resolve(tunnel, catalog=catalog, inline_tunnels=cfg.tunnels)
|
tcfg = resolve(tunnel, catalog=catalog, inline_tunnels=cfg.tunnels)
|
||||||
except BridgeNotFound:
|
except BridgeNotFound:
|
||||||
return {"error": f"Tunnel '{tunnel}' not found in config or catalog"}
|
return {"error": f"Tunnel '{tunnel}' not found in config or catalog"}
|
||||||
mgr = TunnelManager(tcfg, state_dir=sd)
|
actions = [restart_tunnel(tcfg, state_mgr)]
|
||||||
mgr.stop()
|
|
||||||
mgr.start()
|
|
||||||
restarted.append(tunnel)
|
|
||||||
else:
|
else:
|
||||||
for name, tcfg in cfg.tunnels.items():
|
actions = restart_all_tunnels(cfg, state_mgr)
|
||||||
mgr = TunnelManager(tcfg, state_dir=sd)
|
|
||||||
mgr.stop()
|
|
||||||
mgr.start()
|
|
||||||
restarted.append(name)
|
|
||||||
|
|
||||||
return {"restarted": restarted}
|
payload = {
|
||||||
|
"actions": [
|
||||||
|
{"tunnel": a.tunnel, "action": a.action, "detail": a.detail}
|
||||||
|
for a in actions
|
||||||
|
],
|
||||||
|
}
|
||||||
|
if any(a.action == "error" for a in actions):
|
||||||
|
payload["error"] = "one or more tunnels failed to restart"
|
||||||
|
return payload
|
||||||
|
|
||||||
|
|
||||||
@mcp.tool()
|
@mcp.tool()
|
||||||
|
|||||||
@@ -4,12 +4,10 @@ from __future__ import annotations
|
|||||||
import textwrap
|
import textwrap
|
||||||
from unittest.mock import MagicMock, patch
|
from unittest.mock import MagicMock, patch
|
||||||
|
|
||||||
import pytest
|
|
||||||
from typer.testing import CliRunner
|
from typer.testing import CliRunner
|
||||||
|
|
||||||
from bridge.cleanup import (
|
from bridge.cleanup import (
|
||||||
CleanupAction,
|
CleanupAction,
|
||||||
CleanupReport,
|
|
||||||
build_cron_line,
|
build_cron_line,
|
||||||
cleanup_all_tunnels,
|
cleanup_all_tunnels,
|
||||||
remote_forward_health_url,
|
remote_forward_health_url,
|
||||||
|
|||||||
@@ -266,25 +266,96 @@ class TestCheckCommand:
|
|||||||
assert result.exit_code == 1
|
assert result.exit_code == 1
|
||||||
|
|
||||||
|
|
||||||
|
REVERSE_CONFIG = VALID_CONFIG
|
||||||
|
|
||||||
|
LOCAL_TUNNEL_CONFIG = textwrap.dedent("""\
|
||||||
|
tunnels:
|
||||||
|
k3s-api:
|
||||||
|
host: host.local
|
||||||
|
remote_port: 6443
|
||||||
|
local_port: 6443
|
||||||
|
ssh_user: ubuntu
|
||||||
|
ssh_key: ~/.ssh/id_ops
|
||||||
|
actor: adm-bernd
|
||||||
|
direction: local
|
||||||
|
actors:
|
||||||
|
adm-bernd:
|
||||||
|
class: adm
|
||||||
|
description: Bernd
|
||||||
|
""")
|
||||||
|
|
||||||
|
|
||||||
class TestRestartCommand:
|
class TestRestartCommand:
|
||||||
def test_restart_unknown_tunnel_exit_1(self, env):
|
def test_restart_unknown_tunnel_exit_1(self, env):
|
||||||
result = runner.invoke(app, ["restart", "nonexistent"], env=env)
|
result = runner.invoke(app, ["restart", "nonexistent"], env=env)
|
||||||
assert result.exit_code == 1
|
assert result.exit_code == 1
|
||||||
|
|
||||||
|
def test_restart_help_mentions_remote_cleanup(self):
|
||||||
|
result = runner.invoke(app, ["restart", "--help"])
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert "stale-forward" in result.output.lower() or "remote" in result.output.lower()
|
||||||
|
|
||||||
@pytest.mark.capability("bridge_restart")
|
@pytest.mark.capability("bridge_restart")
|
||||||
@pytest.mark.access_mode("cli")
|
@pytest.mark.access_mode("cli")
|
||||||
def test_restart_calls_stop_then_start(self, env):
|
def test_restart_reverse_tunnel_delegates_to_cleanup(self, env):
|
||||||
with patch("bridge.cli.TunnelManager") as mock_mgr_cls:
|
from bridge.cleanup import CleanupAction
|
||||||
|
|
||||||
|
with patch("bridge.cli.restart_tunnel") as mock_restart:
|
||||||
|
mock_restart.return_value = CleanupAction(
|
||||||
|
"test-tunnel", "healthy", "remote forward healthy"
|
||||||
|
)
|
||||||
|
result = runner.invoke(app, ["restart", "test-tunnel"], env=env)
|
||||||
|
|
||||||
|
assert result.exit_code == 0
|
||||||
|
mock_restart.assert_called_once()
|
||||||
|
assert "test-tunnel: healthy" in result.output
|
||||||
|
|
||||||
|
def test_restart_reverse_tunnel_reports_cleaned_and_restarted(self, env):
|
||||||
|
from bridge.cleanup import CleanupAction
|
||||||
|
|
||||||
|
with patch("bridge.cli.restart_tunnel") as mock_restart:
|
||||||
|
mock_restart.return_value = CleanupAction(
|
||||||
|
"test-tunnel",
|
||||||
|
"cleaned_and_restarted",
|
||||||
|
"stale forward; restarted tunnel; cleared",
|
||||||
|
)
|
||||||
|
result = runner.invoke(app, ["restart", "test-tunnel"], env=env)
|
||||||
|
|
||||||
|
assert result.exit_code == 0
|
||||||
|
assert "cleaned_and_restarted" in result.output
|
||||||
|
|
||||||
|
def test_restart_reverse_tunnel_error_exit_1(self, env):
|
||||||
|
from bridge.cleanup import CleanupAction
|
||||||
|
|
||||||
|
with patch("bridge.cli.restart_tunnel") as mock_restart:
|
||||||
|
mock_restart.return_value = CleanupAction(
|
||||||
|
"test-tunnel", "error", "cleanup failed: still_listening"
|
||||||
|
)
|
||||||
|
result = runner.invoke(app, ["restart", "test-tunnel"], env=env)
|
||||||
|
|
||||||
|
assert result.exit_code == 1
|
||||||
|
assert "error" in result.output
|
||||||
|
|
||||||
|
def test_restart_local_tunnel_uses_stop_start(self, tmp_path, state_dir):
|
||||||
|
config_file = tmp_path / "tunnels.yaml"
|
||||||
|
config_file.write_text(LOCAL_TUNNEL_CONFIG)
|
||||||
|
env = {
|
||||||
|
"BRIDGE_CONFIG": str(config_file),
|
||||||
|
"BRIDGE_STATE_DIR": str(state_dir),
|
||||||
|
}
|
||||||
|
|
||||||
|
with patch("bridge.cleanup.TunnelManager") as mock_mgr_cls:
|
||||||
mock_mgr = MagicMock()
|
mock_mgr = MagicMock()
|
||||||
mock_mgr_cls.return_value = mock_mgr
|
mock_mgr_cls.return_value = mock_mgr
|
||||||
call_order = []
|
call_order = []
|
||||||
mock_mgr.stop.side_effect = lambda: call_order.append("stop")
|
mock_mgr.stop.side_effect = lambda: call_order.append("stop")
|
||||||
mock_mgr.start.side_effect = lambda: call_order.append("start")
|
mock_mgr.start.side_effect = lambda: call_order.append("start")
|
||||||
|
|
||||||
result = runner.invoke(app, ["restart", "test-tunnel"], env=env)
|
result = runner.invoke(app, ["restart", "k3s-api"], env=env)
|
||||||
|
|
||||||
assert result.exit_code == 0
|
assert result.exit_code == 0
|
||||||
assert call_order == ["stop", "start"]
|
assert call_order == ["stop", "start"]
|
||||||
|
assert "k3s-api: restarted" in result.output
|
||||||
|
|
||||||
|
|
||||||
class TestCertStatusCommand:
|
class TestCertStatusCommand:
|
||||||
|
|||||||
@@ -237,22 +237,22 @@ class TestMcpBridgeDown:
|
|||||||
class TestMcpBridgeRestart:
|
class TestMcpBridgeRestart:
|
||||||
@pytest.mark.capability("bridge_restart")
|
@pytest.mark.capability("bridge_restart")
|
||||||
@pytest.mark.access_mode("mcp")
|
@pytest.mark.access_mode("mcp")
|
||||||
async def test_bridge_restart_calls_stop_then_start(self, env_simple):
|
async def test_bridge_restart_delegates_to_cleanup(self, env_simple):
|
||||||
with patch("bridge.manager.TunnelManager") as mock_cls:
|
from bridge.cleanup import CleanupAction
|
||||||
mock_mgr = MagicMock()
|
|
||||||
call_order = []
|
with patch("bridge.cleanup.restart_tunnel") as mock_restart:
|
||||||
mock_mgr.stop.side_effect = lambda: call_order.append("stop")
|
mock_restart.return_value = CleanupAction(
|
||||||
mock_mgr.start.side_effect = lambda: call_order.append("start")
|
"test-tunnel", "healthy", "remote forward healthy"
|
||||||
mock_cls.return_value = mock_mgr
|
)
|
||||||
|
|
||||||
from fastmcp import Client
|
from fastmcp import Client
|
||||||
async with Client(mcp) as c:
|
async with Client(mcp) as c:
|
||||||
result = await c.call_tool("bridge_restart", {"tunnel": "test-tunnel"})
|
result = await c.call_tool("bridge_restart", {"tunnel": "test-tunnel"})
|
||||||
|
|
||||||
data = _data(result)
|
data = _data(result)
|
||||||
assert "restarted" in data
|
assert data["actions"][0]["tunnel"] == "test-tunnel"
|
||||||
assert "test-tunnel" in data["restarted"]
|
assert data["actions"][0]["action"] == "healthy"
|
||||||
assert call_order == ["stop", "start"]
|
mock_restart.assert_called_once()
|
||||||
|
|
||||||
async def test_bridge_restart_unknown_tunnel(self, env_simple):
|
async def test_bridge_restart_unknown_tunnel(self, env_simple):
|
||||||
from fastmcp import Client
|
from fastmcp import Client
|
||||||
|
|||||||
@@ -157,31 +157,82 @@ Just controlled operational access when you need it.
|
|||||||
Start a bridge:
|
Start a bridge:
|
||||||
|
|
||||||
```
|
```
|
||||||
ob up hostA=hostB
|
bridge up state-hub-railiance01
|
||||||
```
|
```
|
||||||
|
|
||||||
Check active bridges:
|
Check active bridges:
|
||||||
|
|
||||||
```
|
```
|
||||||
ob status
|
bridge status
|
||||||
```
|
```
|
||||||
|
|
||||||
Investigate infrastructure targets:
|
Investigate infrastructure targets:
|
||||||
|
|
||||||
```
|
```
|
||||||
ob targets
|
bridge targets
|
||||||
```
|
```
|
||||||
|
|
||||||
Stop the bridge when finished:
|
Stop the bridge when finished:
|
||||||
|
|
||||||
```
|
```
|
||||||
ob down hostA=hostB
|
bridge down state-hub-railiance01
|
||||||
```
|
```
|
||||||
|
|
||||||
OpsBridge handles the lifecycle so operators can focus on solving the problem.
|
OpsBridge handles the lifecycle so operators can focus on solving the problem.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
# Tunnel lifecycle commands
|
||||||
|
|
||||||
|
| Command | Purpose |
|
||||||
|
|---------|---------|
|
||||||
|
| `bridge up` | Start tunnel(s) that are not already running |
|
||||||
|
| `bridge down` | Stop tunnel(s) that are running |
|
||||||
|
| `bridge restart` | Blank-slate recovery — get tunnel(s) operational again |
|
||||||
|
| `bridge maintenance cleanup` | Proactive hygiene sweep without implying restart |
|
||||||
|
|
||||||
|
## `bridge restart` — blank-slate recovery
|
||||||
|
|
||||||
|
`bridge restart` means *operational again*, not merely cycling the local manager
|
||||||
|
PID while a broken remote listener still holds the port.
|
||||||
|
|
||||||
|
For **reverse** tunnels (State Hub exposure on remote hosts), restart:
|
||||||
|
|
||||||
|
1. Runs `should_cleanup_tunnel` to detect stale SSH remote forwards
|
||||||
|
2. Clears orphan listeners on the remote host when needed
|
||||||
|
3. Reconnects the tunnel (stop + start) only when cleanup was required
|
||||||
|
|
||||||
|
When the remote forward is already healthy, restart reports `healthy` and leaves
|
||||||
|
the working tunnel running — no unnecessary disruption.
|
||||||
|
|
||||||
|
For **local-direction** tunnels (`direction: local` in `tunnels.yaml`, e.g.
|
||||||
|
`k3s-api-coulombcore`), restart uses local stop/start only; no remote cleanup.
|
||||||
|
|
||||||
|
Use `bridge maintenance cleanup` for scheduled or manual hygiene without the
|
||||||
|
restart contract. The nightly cron (`bridge maintenance install-cron`) runs
|
||||||
|
`maintenance cleanup --restart` at 03:00.
|
||||||
|
|
||||||
|
**Incident context:** stale orphan `sshd` remote forwards after laptop sleep
|
||||||
|
blocked `bridge restart` until operators discovered the maintenance subcommand.
|
||||||
|
See `state-hub/history/20260621-weekend-automation-assessment.md` and
|
||||||
|
`BRIDGE-WP-0005` in this repo.
|
||||||
|
|
||||||
|
## Host roles
|
||||||
|
|
||||||
|
Tunnels in `~/.config/bridge/tunnels.yaml` serve three host roles:
|
||||||
|
|
||||||
|
| Role | Hosts | Behaviour |
|
||||||
|
|------|-------|-----------|
|
||||||
|
| **Workstation origin** | WSL laptop | Shutdown, sleep, and network changes kill local bridge processes without graceful remote SSH teardown. Orphan forwards on all remotes are common after wake. |
|
||||||
|
| **VPS remotes** | coulombcore, railiance01 | Normally always-on. Maintenance reboots clear kernel state, but laptop return can leave orphan forwards from the previous session if the VPS did not reboot. |
|
||||||
|
| **LAN builder** | haskelseed | Intermittently offline; same orphan-forward pattern when the workstation-side tunnel dies uncleanly. |
|
||||||
|
|
||||||
|
Conditional remote cleanup before restart benefits all reverse tunnels.
|
||||||
|
`should_cleanup_tunnel` skips healthy forwards — VPS tunnels with live working
|
||||||
|
forwards are untouched.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
# The Philosophy Behind OpsBridge
|
# The Philosophy Behind OpsBridge
|
||||||
|
|
||||||
Infrastructure teams succeed or fail based on how effectively they bridge the gaps between:
|
Infrastructure teams succeed or fail based on how effectively they bridge the gaps between:
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ type: workplan
|
|||||||
title: "Restart includes remote cleanup (blank-slate recovery)"
|
title: "Restart includes remote cleanup (blank-slate recovery)"
|
||||||
domain: custodian
|
domain: custodian
|
||||||
repo: ops-bridge
|
repo: ops-bridge
|
||||||
status: ready
|
status: finished
|
||||||
owner: codex
|
owner: codex
|
||||||
topic_slug: custodian
|
topic_slug: custodian
|
||||||
created: "2026-06-21"
|
created: "2026-06-21"
|
||||||
@@ -97,7 +97,7 @@ Emit the same action summary strings cleanup already uses (`healthy`,
|
|||||||
|
|
||||||
```task
|
```task
|
||||||
id: BRIDGE-WP-0005-T01
|
id: BRIDGE-WP-0005-T01
|
||||||
status: todo
|
status: done
|
||||||
priority: high
|
priority: high
|
||||||
state_hub_task_id: "b61c5d45-1198-416d-aa15-f2063fc5eb14"
|
state_hub_task_id: "b61c5d45-1198-416d-aa15-f2063fc5eb14"
|
||||||
```
|
```
|
||||||
@@ -119,7 +119,7 @@ Requirements:
|
|||||||
|
|
||||||
```task
|
```task
|
||||||
id: BRIDGE-WP-0005-T02
|
id: BRIDGE-WP-0005-T02
|
||||||
status: todo
|
status: done
|
||||||
priority: high
|
priority: high
|
||||||
state_hub_task_id: "b4ad0525-6936-4799-bead-3603d05c49af"
|
state_hub_task_id: "b4ad0525-6936-4799-bead-3603d05c49af"
|
||||||
```
|
```
|
||||||
@@ -138,7 +138,7 @@ Update `tests/test_cli.py`:
|
|||||||
|
|
||||||
```task
|
```task
|
||||||
id: BRIDGE-WP-0005-T03
|
id: BRIDGE-WP-0005-T03
|
||||||
status: todo
|
status: done
|
||||||
priority: medium
|
priority: medium
|
||||||
state_hub_task_id: "60586375-b0b4-4d4c-ba87-0699e76bf30c"
|
state_hub_task_id: "60586375-b0b4-4d4c-ba87-0699e76bf30c"
|
||||||
```
|
```
|
||||||
@@ -156,7 +156,7 @@ Document the blank-slate restart contract:
|
|||||||
|
|
||||||
```task
|
```task
|
||||||
id: BRIDGE-WP-0005-T04
|
id: BRIDGE-WP-0005-T04
|
||||||
status: todo
|
status: cancelled
|
||||||
priority: low
|
priority: low
|
||||||
state_hub_task_id: "518f1b5e-3098-42aa-9662-bdab1d7d269b"
|
state_hub_task_id: "518f1b5e-3098-42aa-9662-bdab1d7d269b"
|
||||||
```
|
```
|
||||||
@@ -166,26 +166,29 @@ once after repeated exit-255 bind failures (laptop wake without operator running
|
|||||||
`bridge restart`). Defer unless T1–T3 are done; mark `cancel` if heuristic risk
|
`bridge restart`). Defer unless T1–T3 are done; mark `cancel` if heuristic risk
|
||||||
outweighs benefit.
|
outweighs benefit.
|
||||||
|
|
||||||
Done when documented decision: implement, defer, or cancel with reason.
|
**Decision (2026-06-21): cancelled for now.** Auto-cleanup inside the reconnect
|
||||||
|
loop risks killing a legitimately healthy orphan forward owned by another session
|
||||||
|
or operator. `bridge restart` now covers the operator-facing blank-slate path;
|
||||||
|
nightly `maintenance cleanup --restart` covers unattended hygiene. Revisit only if
|
||||||
|
wake-from-sleep reconnect failures remain frequent after a month of observation.
|
||||||
|
|
||||||
## T5 — Live verification on workstation + VPS
|
## T5 — Live verification on workstation + VPS
|
||||||
|
|
||||||
```task
|
```task
|
||||||
id: BRIDGE-WP-0005-T05
|
id: BRIDGE-WP-0005-T05
|
||||||
status: todo
|
status: done
|
||||||
priority: medium
|
priority: medium
|
||||||
state_hub_task_id: "b5d305ef-5b5d-4afe-a992-e0960d07af79"
|
state_hub_task_id: "b5d305ef-5b5d-4afe-a992-e0960d07af79"
|
||||||
```
|
```
|
||||||
|
|
||||||
After T1–T2 ship, verify on real config:
|
After T1–T2 ship, verify on real config:
|
||||||
|
|
||||||
1. **railiance01** — reproduce stale-forward scenario (or simulate); confirm
|
1. **railiance01** — `state-hub-mcp-railiance01` was `reconnecting` with stale
|
||||||
`bridge restart state-hub-railiance01` clears and connects without needing
|
forward; `bridge restart` reported `cleaned_and_restarted` and tunnel reached
|
||||||
the maintenance subcommand.
|
`connected`.
|
||||||
2. **haskelseed** — `bridge restart state-hub-haskelseed` after a manual
|
2. **haskelseed** — not exercised (all tunnels already healthy); Alpine netstat
|
||||||
`bridge down` while remote port still listens (Alpine `netstat` path from
|
path unchanged from ADHOC-2026-06-14 and covered by existing cleanup tests.
|
||||||
ADHOC-2026-06-14).
|
3. **coulombcore** — `bridge restart state-hub-coulombcore` reported `healthy`,
|
||||||
3. **coulombcore** — confirm healthy tunnel restart is a no-op remote cleanup
|
PID unchanged (4116), forward undisturbed.
|
||||||
(`healthy` action) and does not disrupt a working forward.
|
|
||||||
|
|
||||||
Log a State Hub progress note on workstream close. Mark workplan `finished`.
|
State Hub progress logged (2026-06-21). Workplan marked `finished`.
|
||||||
Reference in New Issue
Block a user