feat(directive): implement BRIDGE-WP-0004 AccessManagementDirective alignment

- ActorType enum (adm/agt/atm) replaces actor_class string; config validates
  naming convention (adm-*/agt-*/atm-*) with hard ConfigError on mismatch;
  legacy 'human'/'automation' values accepted with DeprecationWarning
- cert_command: pluggable shell string run before each SSH launch; cert written
  to state dir; -i cert appended to SSH command alongside -i key
- TTL-aware cert refresh: parses Valid-to via ssh-keygen -L; pre-emptive restart
  5 min before expiry (no backoff, no attempt increment); CERT_EXPIRING logged
- CertAcquisitionError: cert failures trigger normal backoff/retry loop
- cert_identity: Key ID parsed from cert and recorded in BRIDGE_CONNECTED event
- bridge cert-status: new CLI command; exit 1 on expired cert; --json flag
- 233 tests passing, ruff clean

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-15 09:38:29 +02:00
parent 22601ef3e6
commit bd169a07e2
17 changed files with 730 additions and 145 deletions

View File

@@ -16,6 +16,7 @@ class AuditEvent(str, Enum):
HEALTH_CHECK_FAILED = "health_check_failed"
HEALTH_CHECK_RECOVERED = "health_check_recovered"
BRIDGE_STOPPED = "bridge_stopped"
CERT_EXPIRING = "cert_expiring"
def _default_state_dir() -> Path:
@@ -34,19 +35,22 @@ class AuditLogger:
tunnel: str,
event: AuditEvent,
actor: str,
actor_class: str,
actor_type: str,
detail: str = "",
cert_identity: Optional[str] = None,
) -> None:
self._dir.mkdir(parents=True, exist_ok=True)
entry: Dict[str, Any] = {
"timestamp": datetime.now(timezone.utc).isoformat(),
"tunnel": tunnel,
"actor": actor,
"actor_class": actor_class,
"actor_type": actor_type,
"event": event.value,
}
if detail:
entry["detail"] = detail
if cert_identity:
entry["cert_identity"] = cert_identity
with self._log_path(tunnel).open("a") as f:
f.write(json.dumps(entry) + "\n")

View File

@@ -73,6 +73,11 @@ CAPABILITIES: list[Capability] = [
description="End-to-end tunnel diagnostics via SSH: SSH PID alive + remote port listening",
required_access_modes=frozenset({"cli", "mcp"}),
),
Capability(
name="bridge_cert_status",
description="Show certificate status for tunnels using cert_command mode",
required_access_modes=frozenset({"cli"}),
),
]
CAPABILITIES_BY_NAME: dict[str, Capability] = {c.name: c for c in CAPABILITIES}

View File

@@ -4,6 +4,8 @@ from __future__ import annotations
import dataclasses
import json
import os
import subprocess
from datetime import datetime
from pathlib import Path
from typing import Optional
@@ -357,6 +359,84 @@ def _print_check_table(results):
typer.echo(_fmt(row))
@app.command("cert-status")
def cert_status(
tunnel: Optional[str] = typer.Argument(None, help="Tunnel name (omit for all inline)"),
as_json: bool = typer.Option(False, "--json", help="Output as JSON"),
):
"""Show certificate status for tunnels using cert_command mode."""
cfg = _load_or_exit()
sd = _state_dir()
names = [tunnel] if tunnel else list(cfg.tunnels.keys())
rows = []
any_expired = False
for name in names:
cert_file = sd / f"{name}-cert.pub"
if not cert_file.exists():
rows.append({"tunnel": name, "mode": "static-key", "cert_file": None})
continue
try:
result = subprocess.run(
["ssh-keygen", "-L", "-f", str(cert_file)],
capture_output=True, text=True, check=False,
)
info = {"tunnel": name, "mode": "cert", "cert_file": str(cert_file)}
for line in result.stdout.splitlines():
line = line.strip()
if line.startswith("Key ID:"):
info["key_id"] = line.split(":", 1)[1].strip().strip('"')
elif line.startswith("Valid:"):
parts = line.split()
if len(parts) >= 5 and parts[1] == "from" and parts[3] == "to":
info["valid_from"] = parts[2]
info["valid_until"] = parts[4]
try:
expires = datetime.fromisoformat(parts[4])
now = datetime.now()
remaining = expires - now
if remaining.total_seconds() <= 0:
info["expired"] = True
any_expired = True
else:
info["expired"] = False
mins = int(remaining.total_seconds() // 60)
info["ttl_remaining"] = f"{mins}m"
except ValueError:
pass
rows.append(info)
except FileNotFoundError:
rows.append({"tunnel": name, "mode": "cert", "error": "ssh-keygen not found"})
if as_json:
typer.echo(json.dumps(rows, indent=2))
else:
for row in rows:
mode = row.get("mode", "unknown")
if mode == "static-key":
typer.echo(f"{row['tunnel']} static-key / no cert")
elif "error" in row:
typer.echo(f"{row['tunnel']} ERROR: {row['error']}")
else:
parts = [row["tunnel"]]
if "key_id" in row:
parts.append(f"id={row['key_id']}")
if "valid_from" in row:
parts.append(f"from={row['valid_from']}")
if "valid_until" in row:
parts.append(f"until={row['valid_until']}")
if row.get("expired"):
parts.append("EXPIRED")
elif "ttl_remaining" in row:
parts.append(f"ttl={row['ttl_remaining']}")
typer.echo(" ".join(parts))
if any_expired:
raise typer.Exit(1)
# ─── targets commands ─────────────────────────────────────────────────────────
@targets_app.callback(invoke_without_command=True)

View File

@@ -2,13 +2,14 @@
from __future__ import annotations
import os
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Optional
import yaml
from bridge.models import ActorInfo, HealthCheckConfig, ReconnectPolicy, TunnelConfig
from bridge.models import ActorInfo, ActorType, HealthCheckConfig, ReconnectPolicy, TunnelConfig
class ConfigError(Exception):
@@ -91,6 +92,10 @@ def _parse_tunnel(name: str, data: dict) -> TunnelConfig:
if direction not in ("reverse", "local"):
raise ConfigError(f"Tunnel '{name}' direction must be 'reverse' or 'local', got: {direction!r}")
cert_command = data.get("cert_command") or None
if cert_command is not None:
cert_command = str(cert_command)
return TunnelConfig(
name=name,
host=str(data["host"]),
@@ -102,9 +107,40 @@ def _parse_tunnel(name: str, data: dict) -> TunnelConfig:
reconnect=reconnect,
health_check=health_check,
direction=direction,
cert_command=cert_command,
)
_LEGACY_CLASS_MAP = {
"human": ActorType.ADM,
"automation": ActorType.ATM,
}
_ACTOR_TYPE_PREFIXES = {
ActorType.ADM: "adm-",
ActorType.AGT: "agt-",
ActorType.ATM: "atm-",
}
def _parse_actor_type(name: str, raw_class: str) -> ActorType:
if raw_class in _LEGACY_CLASS_MAP:
warnings.warn(
f"Actor '{name}': class '{raw_class}' is deprecated; "
f"use '{_LEGACY_CLASS_MAP[raw_class].value}' instead.",
DeprecationWarning,
stacklevel=4,
)
return _LEGACY_CLASS_MAP[raw_class]
try:
return ActorType(raw_class)
except ValueError:
raise ConfigError(
f"Actor '{name}' has unknown class '{raw_class}'; "
f"must be one of: adm, agt, atm (or legacy: human, automation)"
)
def _parse_actors(raw: dict) -> Dict[str, ActorInfo]:
actors = {}
for name, data in raw.items():
@@ -112,9 +148,16 @@ def _parse_actors(raw: dict) -> Dict[str, ActorInfo]:
raise ConfigError(f"Actor '{name}' must be a mapping")
if "class" not in data:
raise ConfigError(f"Actor '{name}' missing required field: class")
actor_type = _parse_actor_type(name, str(data["class"]))
required_prefix = _ACTOR_TYPE_PREFIXES[actor_type]
if not name.startswith(required_prefix):
raise ConfigError(
f"Actor '{name}' has type '{actor_type.value}' but name must start "
f"with '{required_prefix}' (got '{name}')"
)
actors[name] = ActorInfo(
name=name,
actor_class=str(data["class"]),
actor_type=actor_type,
description=str(data.get("description", "")),
)
return actors

View File

@@ -6,35 +6,102 @@ import os
import signal
import subprocess
import time
from datetime import datetime, timedelta
from pathlib import Path
from typing import List, Optional
from bridge.audit import AuditEvent, AuditLogger
from bridge.health import HealthChecker
from bridge.models import BridgeState, TunnelConfig
from bridge.models import BridgeState, CertAcquisitionError, TunnelConfig
from bridge.state import StateManager
log = logging.getLogger(__name__)
def build_ssh_command(cfg: TunnelConfig) -> List[str]:
def _actor_type_from_name(name: str) -> str:
for prefix in ("adm", "agt", "atm"):
if name.startswith(f"{prefix}-"):
return prefix
return "unknown"
def build_ssh_command(cfg: TunnelConfig, cert_path: Optional[Path] = None) -> List[str]:
"""Build the SSH tunnel command (reverse -R or local -L)."""
key = os.path.expanduser(cfg.ssh_key)
if cfg.direction == "local":
forward_flag = ["-L", f"{cfg.local_port}:127.0.0.1:{cfg.remote_port}"]
else:
forward_flag = ["-R", f"{cfg.remote_port}:127.0.0.1:{cfg.local_port}"]
return [
cmd = [
"ssh",
"-N",
*forward_flag,
"-i", key,
]
if cert_path is not None:
cmd += ["-i", str(cert_path)]
cmd += [
"-o", "ServerAliveInterval=10",
"-o", "ServerAliveCountMax=3",
"-o", "ExitOnForwardFailure=yes",
"-o", "StrictHostKeyChecking=accept-new",
f"{cfg.ssh_user}@{cfg.host}",
]
return cmd
def _run_cert_command(cfg: TunnelConfig, state_dir: Path) -> Optional[Path]:
"""Run cert_command and write cert to state dir. Returns cert path or None."""
if cfg.cert_command is None:
return None
result = subprocess.run(
cfg.cert_command,
shell=True,
capture_output=True,
text=True,
)
if result.returncode != 0:
raise CertAcquisitionError(result.stderr.strip())
cert_path = state_dir / f"{cfg.name}-cert.pub"
cert_path.write_text(result.stdout)
return cert_path
def _parse_cert_identity(cert_path: Path) -> Optional[str]:
"""Parse Key ID from ssh-keygen -L output."""
try:
result = subprocess.run(
["ssh-keygen", "-L", "-f", str(cert_path)],
capture_output=True,
text=True,
)
for line in result.stdout.splitlines():
line = line.strip()
if line.startswith("Key ID:"):
return line.split(":", 1)[1].strip().strip('"')
except Exception:
pass
return None
def _parse_cert_expiry(cert_path: Path) -> Optional[datetime]:
"""Parse Valid-before datetime from ssh-keygen -L output."""
try:
result = subprocess.run(
["ssh-keygen", "-L", "-f", str(cert_path)],
capture_output=True,
text=True,
)
for line in result.stdout.splitlines():
line = line.strip()
if line.startswith("Valid:"):
# "Valid: from 2026-05-15T10:00:00 to 2026-05-15T22:00:00"
parts = line.split()
if len(parts) >= 5 and parts[3] == "to":
return datetime.fromisoformat(parts[4])
except Exception:
pass
return None
class TunnelManager:
@@ -56,7 +123,8 @@ class TunnelManager:
return self._state.is_running(self._cfg.name)
def _actor_info(self):
return self._cfg.actor, "unknown"
actor = self._cfg.actor
return actor, _actor_type_from_name(actor)
def _next_backoff(self, attempt: int) -> int:
initial = self._cfg.reconnect.backoff_initial
@@ -71,12 +139,12 @@ class TunnelManager:
return
self._state.write_state(self._cfg.name, BridgeState.STARTING)
actor, actor_class = self._actor_info()
actor, actor_type = self._actor_info()
self._audit.log(
tunnel=self._cfg.name,
event=AuditEvent.BRIDGE_STARTED,
actor=actor,
actor_class=actor_class,
actor_type=actor_type,
)
pid = os.fork()
@@ -99,7 +167,7 @@ class TunnelManager:
tunnel=self._cfg.name,
event=AuditEvent.BRIDGE_STOPPED,
actor=actor,
actor_class=actor_class,
actor_type=actor_type,
)
os._exit(0)
@@ -131,12 +199,12 @@ class TunnelManager:
self._state.clear_pid(self._cfg.name)
self._state.write_state(self._cfg.name, BridgeState.STOPPED)
actor, actor_class = self._actor_info()
actor, actor_type = self._actor_info()
self._audit.log(
tunnel=self._cfg.name,
event=AuditEvent.BRIDGE_STOPPED,
actor=actor,
actor_class=actor_class,
actor_type=actor_type,
)
def _run_loop(self) -> None:
@@ -144,11 +212,11 @@ class TunnelManager:
import asyncio
cfg = self._cfg
actor, actor_class = self._actor_info()
actor, actor_type = self._actor_info()
attempt = 0
max_attempts = cfg.reconnect.max_attempts # 0 = infinite
state_dir = self._state._dir
# Setup signal handler for graceful shutdown
_stop = [False]
def _on_term(signum, frame):
@@ -162,7 +230,31 @@ class TunnelManager:
self._state.write_state(cfg.name, BridgeState.FAILED)
break
cmd = build_ssh_command(cfg)
# Acquire cert before each SSH launch (T3, T7)
try:
cert_path = _run_cert_command(cfg, state_dir)
except CertAcquisitionError as e:
self._audit.log(
tunnel=cfg.name,
event=AuditEvent.BRIDGE_DISCONNECTED,
actor=actor,
actor_type=actor_type,
detail=f"cert acquisition failed: {e}",
)
attempt += 1
if max_attempts > 0 and attempt >= max_attempts:
self._state.write_state(cfg.name, BridgeState.FAILED)
break
backoff = self._next_backoff(attempt - 1)
self._state.write_state(cfg.name, BridgeState.RECONNECTING)
log.info("Cert acquisition failed, retrying in %ds", backoff)
time.sleep(backoff)
continue
cert_identity = _parse_cert_identity(cert_path) if cert_path else None
cert_expires_at = _parse_cert_expiry(cert_path) if cert_path else None
cmd = build_ssh_command(cfg, cert_path=cert_path)
log.info("Starting SSH: %s", " ".join(cmd))
self._state.write_state(cfg.name, BridgeState.STARTING)
@@ -174,24 +266,30 @@ class TunnelManager:
tunnel=cfg.name,
event=AuditEvent.BRIDGE_DISCONNECTED,
actor=actor,
actor_class=actor_class,
actor_type=actor_type,
detail="ssh binary not found",
)
break
# Wait briefly then assume connected if still running
time.sleep(2)
_ttl_refresh = False
if proc.poll() is None:
self._state.write_state(cfg.name, BridgeState.CONNECTED)
self._audit.log(
tunnel=cfg.name,
event=AuditEvent.BRIDGE_CONNECTED,
actor=actor,
actor_class=actor_class,
actor_type=actor_type,
cert_identity=cert_identity,
)
attempt = 0
# Health check loop
def _check_ttl() -> bool:
"""Return True if cert is within 5 min of expiry and SSH should restart."""
if cert_expires_at is None:
return False
return datetime.now() >= cert_expires_at - timedelta(minutes=5)
if cfg.health_check:
checker = HealthChecker(
url=cfg.health_check.url,
@@ -199,6 +297,18 @@ class TunnelManager:
)
health_failing = False
while not _stop[0] and proc.poll() is None:
if _check_ttl():
self._audit.log(
tunnel=cfg.name,
event=AuditEvent.CERT_EXPIRING,
actor=actor,
actor_type=actor_type,
cert_identity=cert_identity,
detail=str(cert_expires_at),
)
proc.terminate()
_ttl_refresh = True
break
result = asyncio.run(checker.check())
if result.ok:
if health_failing:
@@ -208,7 +318,7 @@ class TunnelManager:
tunnel=cfg.name,
event=AuditEvent.HEALTH_CHECK_RECOVERED,
actor=actor,
actor_class=actor_class,
actor_type=actor_type,
)
else:
if not health_failing:
@@ -218,21 +328,36 @@ class TunnelManager:
tunnel=cfg.name,
event=AuditEvent.HEALTH_CHECK_FAILED,
actor=actor,
actor_class=actor_class,
actor_type=actor_type,
detail=result.error or f"HTTP {result.status_code}",
)
time.sleep(cfg.health_check.interval_seconds)
else:
while not _stop[0] and proc.poll() is None:
if _check_ttl():
self._audit.log(
tunnel=cfg.name,
event=AuditEvent.CERT_EXPIRING,
actor=actor,
actor_type=actor_type,
cert_identity=cert_identity,
detail=str(cert_expires_at),
)
proc.terminate()
_ttl_refresh = True
break
time.sleep(1)
# SSH exited
if _ttl_refresh:
# Planned cert refresh — don't count as failure, no backoff
continue
if proc.poll() is not None:
self._audit.log(
tunnel=cfg.name,
event=AuditEvent.BRIDGE_DISCONNECTED,
actor=actor,
actor_class=actor_class,
actor_type=actor_type,
detail=f"exit code {proc.returncode}",
)
@@ -248,7 +373,7 @@ class TunnelManager:
tunnel=cfg.name,
event=AuditEvent.BRIDGE_RECONNECTING,
actor=actor,
actor_class=actor_class,
actor_type=actor_type,
detail=f"retry {attempt}, backoff {backoff}s",
)
log.info("Reconnecting in %ds (attempt %d)", backoff, attempt)

View File

@@ -15,6 +15,16 @@ class BridgeState(str, Enum):
FAILED = "failed"
class ActorType(str, Enum):
ADM = "adm" # human operator
AGT = "agt" # LLM-powered autonomous agent
ATM = "atm" # deterministic script / pipeline
class CertAcquisitionError(Exception):
"""Raised when cert_command fails to produce a certificate."""
@dataclass
class ReconnectPolicy:
max_attempts: int = 0 # 0 = infinite
@@ -41,10 +51,11 @@ class TunnelConfig:
reconnect: ReconnectPolicy = field(default_factory=ReconnectPolicy)
health_check: Optional[HealthCheckConfig] = None
direction: str = "reverse" # "reverse" (-R) or "local" (-L)
cert_command: Optional[str] = None
@dataclass
class ActorInfo:
name: str
actor_class: str # "human" or "automation"
actor_type: ActorType
description: str = ""