Files
ops-bridge/src/bridge/cleanup.py
tegwick 4e9882909f feat(maintenance): nightly stale SSH forward cleanup at 03:00
Add bridge maintenance cleanup to detect reverse tunnels whose remote
port is bound but no longer forwards (zombie sshd sessions), kill the
stale listeners on the remote host, and optionally restart the tunnel.

Includes install-cron/uninstall-cron/show-cron helpers and README notes
for the actcore-state-hub-bridge failure mode we hit on railiance01.
2026-06-19 15:59:27 +02:00

308 lines
8.9 KiB
Python

"""Nightly maintenance: detect and clear stale SSH remote port forwards."""
from __future__ import annotations
import subprocess
import time
from dataclasses import dataclass
from typing import Optional
from urllib.parse import urlparse, urlunparse
import httpx
from bridge.diagnostics import _remote_port_probe_command, check_tunnel
from bridge.manager import TunnelManager
from bridge.models import TunnelConfig
from bridge.state import StateManager
@dataclass
class CleanupAction:
tunnel: str
action: str # skipped | healthy | cleaned | cleaned_and_restarted | error
detail: str = ""
@dataclass
class CleanupReport:
actions: list[CleanupAction]
@property
def cleaned_count(self) -> int:
return sum(1 for a in self.actions if a.action.startswith("cleaned"))
def remote_forward_health_url(cfg: TunnelConfig) -> Optional[str]:
"""Map the local health_check URL to the remote forwarded port."""
if cfg.health_check is None or cfg.direction == "local":
return None
parsed = urlparse(cfg.health_check.url)
if not parsed.hostname:
return None
netloc = f"{parsed.hostname}:{cfg.remote_port}"
return urlunparse(parsed._replace(netloc=netloc))
def _ssh_base_cmd(cfg: TunnelConfig) -> list[str]:
from pathlib import Path
return [
"ssh",
"-i",
str(Path(cfg.ssh_key).expanduser()),
"-o",
"BatchMode=yes",
"-o",
"ConnectTimeout=10",
"-o",
"StrictHostKeyChecking=accept-new",
f"{cfg.ssh_user}@{cfg.host}",
]
def _run_ssh(cfg: TunnelConfig, remote_command: str, *, timeout: float = 30) -> subprocess.CompletedProcess[str]:
return subprocess.run(
[*_ssh_base_cmd(cfg), remote_command],
capture_output=True,
text=True,
timeout=timeout,
)
def remote_port_listening(cfg: TunnelConfig) -> bool:
proc = _run_ssh(cfg, _remote_port_probe_command(cfg.remote_port), timeout=15)
return proc.stdout.strip() == "ok"
def probe_remote_forward(cfg: TunnelConfig) -> tuple[bool, str]:
"""Return (healthy, detail) for the remote forwarded service."""
url = remote_forward_health_url(cfg)
if url is None:
return True, "no remote health url configured"
timeout = cfg.health_check.timeout_seconds if cfg.health_check else 5
remote_cmd = (
f"curl -sf --max-time {timeout} {url!r} >/dev/null "
"&& echo ok || echo fail"
)
try:
proc = _run_ssh(cfg, remote_cmd, timeout=timeout + 15)
except subprocess.TimeoutExpired:
return False, "remote health probe timed out"
output = proc.stdout.strip()
if output == "ok":
return True, "remote forward healthy"
if proc.returncode != 0 and proc.stderr.strip():
return False, proc.stderr.strip()
return False, "remote forward unhealthy"
def local_service_healthy(cfg: TunnelConfig) -> Optional[bool]:
if cfg.health_check is None:
return None
try:
resp = httpx.get(
cfg.health_check.url,
timeout=cfg.health_check.timeout_seconds,
)
return resp.is_success
except Exception:
return False
def _remote_cleanup_script(port: int) -> str:
return f"""set -eu
port={port}
pids=""
if command -v lsof >/dev/null 2>&1; then
pids=$(sudo -n lsof -t -iTCP:$port -sTCP:LISTEN 2>/dev/null || true)
if [ -z "$pids" ]; then
pids=$(lsof -t -iTCP:$port -sTCP:LISTEN 2>/dev/null || true)
fi
fi
if [ -z "$pids" ] && command -v fuser >/dev/null 2>&1; then
pids=$(fuser -n tcp $port 2>/dev/null | tr -s ' ' '\\n' | grep -E '^[0-9]+$' || true)
fi
if [ -z "$pids" ]; then
echo "no_listeners"
exit 0
fi
echo "killing:$pids"
for pid in $pids; do
kill "$pid" 2>/dev/null || sudo -n kill "$pid" 2>/dev/null || true
done
sleep 1
if ss -tln 2>/dev/null | grep -q ":$port "; then
echo "still_listening"
else
echo "cleared"
fi
"""
def clear_stale_remote_binding(cfg: TunnelConfig) -> tuple[bool, str]:
try:
proc = _run_ssh(cfg, _remote_cleanup_script(cfg.remote_port), timeout=30)
except subprocess.TimeoutExpired:
return False, "remote cleanup timed out"
output = proc.stdout.strip()
if "cleared" in output:
return True, output
if "no_listeners" in output:
return True, "no listeners found"
if "still_listening" in output:
return False, output
detail = output or proc.stderr.strip() or f"exit {proc.returncode}"
return False, detail
def should_cleanup_tunnel(
cfg: TunnelConfig,
state_mgr: StateManager,
) -> tuple[bool, str]:
"""Decide whether a reverse tunnel's remote binding looks stale."""
if cfg.direction == "local":
return False, "local tunnel"
if not remote_port_listening(cfg):
return False, "remote port closed"
remote_ok, remote_detail = probe_remote_forward(cfg)
if remote_ok:
return False, remote_detail
check = check_tunnel(cfg, state_mgr)
local_ok = local_service_healthy(cfg)
if local_ok is True and not remote_ok:
return True, f"stale forward: {remote_detail}"
if check.ssh_process != "ok" and check.remote_port == "listening":
return True, f"orphan forward while ssh {check.ssh_process}: {remote_detail}"
if check.ssh_process == "ok" and not remote_ok:
return True, f"broken forward with live client: {remote_detail}"
return False, remote_detail
def cleanup_tunnel(
cfg: TunnelConfig,
state_mgr: StateManager,
*,
restart: bool,
) -> CleanupAction:
name = cfg.name
try:
needed, reason = should_cleanup_tunnel(cfg, state_mgr)
if not needed:
return CleanupAction(name, "healthy", reason)
ok, detail = clear_stale_remote_binding(cfg)
if not ok:
return CleanupAction(name, "error", f"cleanup failed: {detail}")
if not restart:
return CleanupAction(name, "cleaned", f"{reason}; {detail}")
mgr = TunnelManager(cfg, state_dir=state_mgr._dir)
was_running = mgr.is_running()
if was_running:
mgr.stop()
mgr.start()
action = "cleaned_and_restarted"
verb = "restarted" if was_running else "started"
return CleanupAction(name, action, f"{reason}; {verb} tunnel; {detail}")
except Exception as exc:
return CleanupAction(name, "error", str(exc))
def cleanup_all_tunnels(
cfg,
state_mgr: StateManager,
*,
restart: bool,
tunnel_name: Optional[str] = None,
) -> CleanupReport:
tunnels = cfg.tunnels.values()
if tunnel_name is not None:
if tunnel_name not in cfg.tunnels:
raise KeyError(tunnel_name)
tunnels = [cfg.tunnels[tunnel_name]]
actions = [
cleanup_tunnel(tcfg, state_mgr, restart=restart)
for tcfg in tunnels
if tcfg.direction != "local"
]
return CleanupReport(actions=actions)
CRON_MARKER = "# ops-bridge: maintenance cleanup"
CRON_SCHEDULE = "0 3 * * *"
CRON_LOG = "~/.local/state/bridge/cleanup.log"
def build_cron_line() -> str:
bridge_bin = "~/.local/bin/bridge"
return (
f"{CRON_SCHEDULE} BRIDGE_CONFIG=~/.config/bridge/tunnels.yaml "
f"{bridge_bin} maintenance cleanup --restart "
f">> {CRON_LOG} 2>&1 {CRON_MARKER}"
)
def read_installed_cron() -> Optional[str]:
proc = subprocess.run(["crontab", "-l"], capture_output=True, text=True)
if proc.returncode != 0:
return None
for line in proc.stdout.splitlines():
if CRON_MARKER in line:
return line.strip()
return None
def install_cleanup_cron() -> tuple[bool, str]:
existing = read_installed_cron()
if existing:
return False, f"cron already installed: {existing}"
proc = subprocess.run(["crontab", "-l"], capture_output=True, text=True)
current = proc.stdout if proc.returncode == 0 else ""
new_line = build_cron_line()
body = current.rstrip("\n")
if body:
body += "\n"
body += new_line + "\n"
write = subprocess.run(
["crontab", "-"],
input=body,
capture_output=True,
text=True,
)
if write.returncode != 0:
return False, write.stderr.strip() or "crontab write failed"
return True, new_line
def uninstall_cleanup_cron() -> tuple[bool, str]:
proc = subprocess.run(["crontab", "-l"], capture_output=True, text=True)
if proc.returncode != 0:
return False, "no crontab installed"
kept = [
line
for line in proc.stdout.splitlines()
if CRON_MARKER not in line
]
if len(kept) == len(proc.stdout.splitlines()):
return False, "cleanup cron not found"
body = "\n".join(kept).rstrip("\n")
if body:
body += "\n"
write = subprocess.run(
["crontab", "-"],
input=body,
capture_output=True,
text=True,
)
if write.returncode != 0:
return False, write.stderr.strip() or "crontab write failed"
return True, "removed cleanup cron entry"