"""Load and validate the routing pointer catalog. The catalog lives at ``registry/routing/catalog.yaml`` in the repo root. Resolution order: 1. ``WARDEN_ROUTING_CATALOG`` env var, if set (used by tests / overrides). 2. Walk upward from this module looking for ``registry/routing/catalog.yaml``. Validation enforces the **no-double-source rule**: only ``warden_executes: true`` entries may carry an authored ``steps`` block or a ``cert_command``. Any non-SSH entry that does so is a validation error — ops-warden points at the owner's doc, it never restates another subsystem's procedure. """ from __future__ import annotations import os import re from dataclasses import dataclass from datetime import date from pathlib import Path from typing import List, Optional import yaml from warden.routing.models import RouteEntry # Structured handoff string fields (WP-0014) — templates and pointers only. # Every one is scanned for accidental secret material; see _assert_no_secret_material. _HANDOFF_STR_FIELDS = ("auth_method", "path_template", "fetch_command", "policy_ref") # Known secret-bearing token prefixes — a literal here means a value leaked into # the catalog (which is git-tracked and agent-visible). Templates use `<...>`. _SECRET_PREFIXES = ( "ghp_", "gho_", "ghs_", "github_pat_", # GitHub "sk-", "sk_live_", "sk_test_", # OpenAI / Stripe "xoxb-", "xoxp-", # Slack "AKIA", "ASIA", # AWS access key ids "hvs.", "hvb.", "s.", # Vault/OpenBao service tokens "AIza", # Google "eyJ", # JWT ) # A long unbroken high-entropy run that is not a placeholder — likely a raw value. 
_HIGH_ENTROPY_RUN = re.compile(r"[A-Za-z0-9_\-]{32,}")

_REQUIRED_FIELDS = (
    "id",
    "title",
    "need_keywords",
    "owner_repo",
    "subsystem",
    "warden_executes",
    "wiki_ref",
    "canon_ref",
    "reviewed",
    "status",
)

_VALID_STATUS = ("active", "draft")

# Default review cadence — see wiki/AccessRouting.md#drift-review-cadence
DEFAULT_STALE_DAYS = 90


def days_since_review(reviewed: str, *, today: Optional[date] = None) -> int:
    """Calendar days between reviewed date (YYYY-MM-DD) and today.

    Args:
        reviewed: ISO date string, parsed with ``date.fromisoformat``.
        today: Reference date; defaults to ``date.today()``.

    Returns:
        ``(today - reviewed).days`` — negative when ``reviewed`` lies in the future.

    Raises:
        ValueError: if ``reviewed`` is not a valid ISO date string.
    """
    reviewed_date = date.fromisoformat(reviewed)
    ref = today or date.today()
    return (ref - reviewed_date).days


def is_review_stale(
    reviewed: str,
    *,
    threshold_days: int = DEFAULT_STALE_DAYS,
    today: Optional[date] = None,
) -> bool:
    """True when reviewed date is older than the cadence threshold.

    The comparison is strict: an entry reviewed exactly ``threshold_days`` ago
    is not yet stale.
    """
    return days_since_review(reviewed, today=today) > threshold_days


class CatalogError(Exception):
    """Raised when the routing catalog is missing or invalid."""


def find_catalog_path(start: Optional[Path] = None) -> Path:
    """Locate registry/routing/catalog.yaml.

    Honors WARDEN_ROUTING_CATALOG first; otherwise walks up from `start`
    (default: this module) until a repo root containing the catalog is found.

    The override is returned as-is (after ``~`` expansion) without checking
    that it exists.

    Raises:
        CatalogError: if no ancestor of `start` contains the catalog.
    """
    override = os.environ.get("WARDEN_ROUTING_CATALOG")
    if override:
        return Path(os.path.expanduser(override))

    rel = Path("registry") / "routing" / "catalog.yaml"
    here = (start or Path(__file__)).resolve()
    # Probe the starting path itself, then every ancestor, nearest first.
    for parent in [here, *here.parents]:
        candidate = parent / rel
        if candidate.exists():
            return candidate
    raise CatalogError(
        f"Routing catalog not found ({rel}). Set WARDEN_ROUTING_CATALOG to override."
    )


@dataclass
class Catalog:
    """A parsed routing catalog: the file it came from plus its entries."""

    # Location of the YAML file the entries were loaded from.
    path: Path
    # Entries in file order.
    entries: List[RouteEntry]

    # --- lookup helpers ---------------------------------------------------

    def get(self, entry_id: str) -> Optional[RouteEntry]:
        """Return the entry whose ``id`` equals `entry_id`, or None."""
        for e in self.entries:
            if e.id == entry_id:
                return e
        return None

    def listed(self, include_draft: bool = False) -> List[RouteEntry]:
        """Return entries in file order; only ``is_active`` ones unless `include_draft`."""
        if include_draft:
            return list(self.entries)
        return [e for e in self.entries if e.is_active]

    def find(self, query: str, include_draft: bool = False, limit: int = 5) -> List[RouteEntry]:
        """Rank entries by keyword overlap with the query. Highest first.

        The query is lower-cased and split on whitespace and hyphens; each
        entry's ``match_score(tokens)`` decides its rank. Entries scoring 0 are
        dropped, ties are broken by ``id``, and at most `limit` are returned.
        """
        tokens = [t for t in query.lower().replace("-", " ").split() if t]
        pool = self.listed(include_draft=include_draft)
        scored = [(e.match_score(tokens), e) for e in pool]
        scored = [(s, e) for s, e in scored if s > 0]
        scored.sort(key=lambda pair: (-pair[0], pair[1].id))
        return [e for _, e in scored[:limit]]

    def stale(
        self,
        include_draft: bool = False,
        threshold_days: int = DEFAULT_STALE_DAYS,
        *,
        today: Optional[date] = None,
    ) -> List[RouteEntry]:
        """Entries whose reviewed date is past the cadence threshold."""
        return [
            e
            for e in self.listed(include_draft=include_draft)
            if is_review_stale(e.reviewed, threshold_days=threshold_days, today=today)
        ]


def _is_bracketed_placeholder(value: str, start: int, end: int) -> bool:
    """True when ``value[start:end]`` is directly wrapped as ``<...>``."""
    return (
        start > 0
        and end < len(value)
        and value[start - 1] == "<"
        and value[end] == ">"
    )


def _assert_no_secret_material(entry_id: str, field_name: str, value: str) -> None:
    """Reject a handoff field that appears to embed a literal secret value.

    The structured handoff fields are command/path *templates*: concrete values
    must be placeholders (`<...>`) or field names, never a real credential. The
    catalog is git-tracked and agent-visible, so a leaked value here is the exact
    custody failure WP-0014 forbids. We screen for known token prefixes and for a
    long high-entropy run that is not a placeholder.

    Args:
        entry_id: Catalog entry id, used only in the error message.
        field_name: Name of the handoff field being screened.
        value: The field's string value.

    Raises:
        CatalogError: if a known secret prefix occurs anywhere in `value`, or if
            `value` holds a 32+ character run containing a digit that is not
            wrapped as a ``<...>`` placeholder.
    """
    lowered = value.lower()
    # NOTE(review): this is a case-insensitive *substring* match, so short
    # prefixes such as "s." or "sk-" also fire on ordinary text ("docs.md",
    # "task-"). Kept as-is because it errs on the strict side — confirm intended.
    for prefix in _SECRET_PREFIXES:
        if prefix.lower() in lowered:
            raise CatalogError(
                f"entry {entry_id!r} field {field_name!r} appears to contain a literal "
                f"secret (matched {prefix!r}). Handoff fields are templates — use "
                "placeholders like <...>, never a real value."
            )

    for match in _HIGH_ENTROPY_RUN.finditer(value):
        run = match.group()
        # Allow long placeholder tokens. The pattern itself can never match
        # "<" or ">", so the brackets must be looked up around the match.
        if _is_bracketed_placeholder(value, match.start(), match.end()):
            continue
        # Runs without any digit (long words, snake_case identifiers, separator
        # lines made of "-" or "_") are not treated as credentials.
        if not any(ch.isdigit() for ch in run):
            continue
        raise CatalogError(
            f"entry {entry_id!r} field {field_name!r} contains a high-entropy token "
            f"({run[:8]}…) that is not a placeholder — suspected leaked secret value."
        )


def _parse_entry(raw: dict, index: int) -> RouteEntry:
    """Validate one raw catalog mapping and convert it to a ``RouteEntry``.

    Args:
        raw: The entry as loaded from YAML.
        index: Position of the entry in the ``entries`` list, used to identify
            it in error messages when it has no ``id``.

    Raises:
        CatalogError: if the entry is not a mapping, lacks a required field, has
            an unknown status, breaks the no-double-source rule, has a non-list
            ``need_keywords``, carries secret-looking handoff values, or sets
            ``exec_capable`` without a ``fetch_command``.
    """
    if not isinstance(raw, dict):
        raise CatalogError(f"entry #{index} is not a mapping")

    missing = [f for f in _REQUIRED_FIELDS if f not in raw]
    if missing:
        ident = raw.get("id", f"#{index}")
        raise CatalogError(f"entry {ident!r} missing required field(s): {', '.join(missing)}")

    # NOTE(review): plain truthiness — a quoted YAML string such as "false"
    # would count as True here; confirm the catalog always uses real booleans.
    warden_executes = bool(raw["warden_executes"])
    steps = raw.get("steps") or []
    cert_command = raw.get("cert_command")

    status = str(raw["status"])
    if status not in _VALID_STATUS:
        raise CatalogError(
            f"entry {raw['id']!r} has invalid status {status!r} (expected one of {_VALID_STATUS})"
        )

    # No-double-source rule: authored procedure only on the SSH lane.
    if not warden_executes and steps:
        raise CatalogError(
            f"entry {raw['id']!r} is not warden_executes but carries a `steps` block "
            "— routed needs point at the owner's doc; they must not restate procedure "
            "(no-double-source rule)."
        )
    if not warden_executes and cert_command:
        raise CatalogError(
            f"entry {raw['id']!r} is not warden_executes but carries a `cert_command`."
        )

    if not isinstance(raw["need_keywords"], list):
        raise CatalogError(f"entry {raw['id']!r} need_keywords must be a list")

    # Structured handoff fields (WP-0014) — optional, screened for secret material.
    entry_id = str(raw["id"])
    handoff: dict[str, Optional[str]] = {}
    for fname in _HANDOFF_STR_FIELDS:
        val = raw.get(fname)
        # Absent and empty values are both normalised to None.
        if val is None or val == "":
            handoff[fname] = None
            continue
        sval = str(val)
        _assert_no_secret_material(entry_id, fname, sval)
        handoff[fname] = sval

    exec_capable = bool(raw.get("exec_capable", False))
    # A lane cannot be proxy-executable without a fetch_command to run.
    if exec_capable and not handoff["fetch_command"]:
        raise CatalogError(
            f"entry {entry_id!r} sets exec_capable: true but has no fetch_command — "
            "a proxyable lane must declare the command warden runs as the caller."
        )

    # NOTE(review): `reviewed` is stored as a string without being parsed here;
    # a non-ISO value only surfaces later as ValueError in days_since_review.
    return RouteEntry(
        id=entry_id,
        title=str(raw["title"]),
        need_keywords=[str(k) for k in raw["need_keywords"]],
        owner_repo=str(raw["owner_repo"]),
        subsystem=str(raw["subsystem"]),
        warden_executes=warden_executes,
        wiki_ref=str(raw["wiki_ref"]),
        canon_ref=str(raw["canon_ref"]),
        reviewed=str(raw["reviewed"]),
        status=status,
        steps=[str(s) for s in steps],
        cert_command=str(cert_command) if cert_command else None,
        auth_method=handoff["auth_method"],
        path_template=handoff["path_template"],
        fetch_command=handoff["fetch_command"],
        exec_capable=exec_capable,
        policy_ref=handoff["policy_ref"],
    )


def load_catalog(path: Optional[Path] = None) -> Catalog:
    """Load, parse, and validate the routing catalog.

    Args:
        path: Catalog file to read; defaults to ``find_catalog_path()``.

    Returns:
        A ``Catalog`` holding every entry in file order.

    Raises:
        CatalogError: if the file is missing, is not valid UTF-8 or YAML, is not
            a mapping with a non-empty ``entries`` list, contains an invalid
            entry, or repeats an entry id.
    """
    catalog_path = path or find_catalog_path()
    if not catalog_path.exists():
        raise CatalogError(f"Routing catalog not found: {catalog_path}")

    try:
        # Read as UTF-8 explicitly: the catalog is a repo file and must not be
        # decoded with whatever the host locale happens to default to.
        with catalog_path.open(encoding="utf-8") as f:
            raw = yaml.safe_load(f)
    except yaml.YAMLError as e:
        raise CatalogError(f"Invalid YAML in {catalog_path}: {e}") from e
    except UnicodeDecodeError as e:
        raise CatalogError(f"Routing catalog is not valid UTF-8: {catalog_path}: {e}") from e

    if not isinstance(raw, dict):
        raise CatalogError("Catalog must be a YAML mapping")

    raw_entries = raw.get("entries")
    if not isinstance(raw_entries, list) or not raw_entries:
        raise CatalogError("Catalog has no `entries` list")

    entries: List[RouteEntry] = []
    seen: set[str] = set()
    for i, raw_entry in enumerate(raw_entries):
        entry = _parse_entry(raw_entry, i)
        if entry.id in seen:
            raise CatalogError(f"duplicate entry id: {entry.id!r}")
        seen.add(entry.id)
        entries.append(entry)

    return Catalog(path=catalog_path, entries=entries)