the-custodian/tools/batch_author_repo_classifications.py

#!/usr/bin/env python3
"""First-pass .repo-classification.yaml author for CUST-WP-0050 T11.

Writes agent-classified files for local git checkouts, validates against
canon/standards/repo-classification.allowed.yaml, and optionally commits.

Skips repos that already carry human-reviewed classifications and slugs on the
exclusion list.
"""
from __future__ import annotations

import argparse
import json
import subprocess
import sys
import urllib.request
from datetime import date
from pathlib import Path

import yaml

# This script lives one directory below the repository root, so the root is
# the grandparent of the resolved script path.
REPO_ROOT = Path(__file__).resolve().parent.parent

# Standards files read by this script, plus the validator it shells out to.
ALLOWED_PATH = REPO_ROOT.joinpath("canon", "standards", "repo-classification.allowed.yaml")
EXCLUSIONS_PATH = REPO_ROOT.joinpath("canon", "standards", "repo-classification.exclusions.yaml")
VALIDATOR = REPO_ROOT.joinpath("tools", "validate_repo_classification.py")

# Directory whose immediate children are scanned for git checkouts.
HOME = Path.home()

# Base URL of the hub API queried for existing repo rows.
API_BASE = "http://127.0.0.1:8000"

# ISO date stamped into every generated file; computed once at import time.
CLASSIFIED_AT = date.today().isoformat()

# Curated overrides — standard §13 examples and T11 first-pass judgments.
#
# Keyed by repo slug (the checkout's directory name). When a slug is present
# here, build_classification() uses the entry as the classification fields
# instead of deriving them from the hub row and slug hints. Each entry carries
# the same field names that _enrich_from_hub() produces: category, domain,
# secondary_domains, capability_tags, business_stake, business_mechanics and a
# free-text notes string.
OVERRIDES: dict[str, dict] = {
    "helix-forge": {
        "category": "product",
        "domain": "infotech",
        "secondary_domains": ["agents"],
        "capability_tags": [
            "platform",
            "capability-registry",
            "coordination",
            "knowledge",
            "product-development",
        ],
        "business_stake": ["product", "technology", "execution", "automation", "intelligence"],
        "business_mechanics": ["intention", "coordination", "operation", "adaptation"],
        "notes": "Capability development platform; standard §13.1 example.",
    },
    "identity-canon": {
        "category": "research",
        "domain": "infotech",
        "secondary_domains": ["government"],
        "capability_tags": ["identity", "access-control", "terminology", "canon", "governance"],
        "business_stake": ["technology", "legal", "operations", "intelligence"],
        "business_mechanics": ["intention", "control", "adaptation"],
        "notes": "Identity terminology and canon; standard §13.3 example.",
    },
    "net-kingdom": {
        "category": "product",
        "domain": "infotech",
        "secondary_domains": [],
        "capability_tags": ["security", "identity", "platform", "operations", "access-control"],
        "business_stake": ["technology", "operations", "legal", "automation"],
        "business_mechanics": ["control", "operation", "adaptation"],
        "notes": "NetKingdom security/identity platform; standard §13.4 example.",
    },
    "citation-evidence": {
        "category": "product",
        "domain": "infotech",
        "secondary_domains": ["communication", "government"],
        "capability_tags": ["citations", "evidence", "knowledge", "traceability", "source-management"],
        "business_stake": ["intelligence", "legal", "product", "technology"],
        "business_mechanics": ["control", "coordination", "adaptation"],
        "notes": "Citation and evidence product; standard §13.5 example.",
    },
    "adaptive-pricing": {
        "category": "product",
        "domain": "financials",
        "secondary_domains": ["infotech", "agents"],
        "capability_tags": ["pricing", "monetization", "lifecycle", "decision-support", "product-development"],
        "business_stake": ["finance", "product", "sales", "intelligence", "automation"],
        "business_mechanics": ["intention", "control", "adaptation"],
        "notes": "Adaptive pricing product; standard §13.6 example.",
    },
    "reuse-surface": {
        "category": "product",
        "domain": "infotech",
        "secondary_domains": ["agents"],
        "capability_tags": ["capability-registry", "discovery", "reuse", "maturity", "evidence"],
        "business_stake": ["technology", "product", "intelligence", "automation"],
        "business_mechanics": ["intention", "control", "adaptation"],
        "notes": "Reuse discovery surface; standard §13.7 example.",
    },
    "audit-core": {
        "category": "tooling",
        "domain": "infotech",
        "secondary_domains": [],
        "capability_tags": ["audit", "traceability", "security", "governance", "operations"],
        "business_stake": ["technology", "operations", "legal", "automation"],
        "business_mechanics": ["control", "operation"],
        "notes": "Multi-tenant audit emit capability for platform bootstrap wiring.",
    },
    "whynot-design": {
        "category": "product",
        "domain": "consumer",
        "secondary_domains": ["communication"],
        "capability_tags": ["design-system", "documentation", "product-development", "experience"],
        "business_stake": ["product", "experience", "technology"],
        "business_mechanics": ["intention", "coordination", "adaptation"],
        "notes": "whynot visual language — tokens, CSS, and web components for prototype artefacts.",
    },
    "coordination-engine": {
        "category": "product",
        "domain": "communication",
        "secondary_domains": ["infotech", "agents"],
        "capability_tags": ["coordination", "workflow", "orchestration", "evidence", "platform"],
        "business_stake": ["product", "technology", "operations", "automation"],
        "business_mechanics": ["coordination", "operation", "adaptation"],
        "notes": "Goal-driven digital coordination framework and adapter runtime.",
    },
    "human-resources": {
        "category": "research",
        "domain": "consumer",
        "secondary_domains": ["health"],
        "capability_tags": ["knowledge", "documentation", "product-development"],
        "business_stake": ["people", "product", "experience"],
        "business_mechanics": ["intention", "adaptation"],
        "notes": "Research toward optimal human performance.",
    },
    "repo-seed": {
        "category": "tooling",
        "domain": "infotech",
        "secondary_domains": [],
        "capability_tags": ["platform", "configuration", "documentation"],
        "business_stake": ["technology", "execution"],
        "business_mechanics": ["operation"],
        "notes": "Git template for bootstrapping coulomb projects.",
    },
    "tegwick-control": {
        "category": "research",
        "domain": "consumer",
        "secondary_domains": ["infotech"],
        "capability_tags": ["coordination", "governance", "documentation", "knowledge"],
        "business_stake": ["people", "operations", "intelligence"],
        "business_mechanics": ["intention", "coordination", "adaptation"],
        "notes": "Personal control repository for life/projects landscape.",
    },
    "whynot-control": {
        "category": "research",
        "domain": "consumer",
        "secondary_domains": ["communication"],
        "capability_tags": ["product-development", "knowledge", "coordination", "documentation"],
        "business_stake": ["product", "experience", "intelligence"],
        "business_mechanics": ["intention", "coordination", "adaptation"],
        "notes": "whynot prototype and market-signal control repository.",
    },
    "markitect-main": {
        "category": "product",
        "domain": "communication",
        "secondary_domains": ["infotech", "agents"],
        "capability_tags": ["knowledge", "documentation", "product-development", "platform"],
        "business_stake": ["product", "technology", "execution"],
        "business_mechanics": ["intention", "coordination", "operation", "adaptation"],
        "notes": "Markitect main product repo; successor to archived markitect-project.",
    },
    "vantage-point": {
        "category": "research",
        "domain": "infotech",
        "secondary_domains": [],
        "capability_tags": ["knowledge", "analytics", "platform", "documentation"],
        "business_stake": ["technology", "intelligence", "product"],
        "business_mechanics": ["intention", "adaptation"],
        "notes": "Network-based graph model exploration and dependency reasoning framework.",
    },
}

# Fallback business_stake values per primary domain, used when neither an
# override nor the hub row supplies any stake for a repo.
DOMAIN_DEFAULT_STAKE: dict[str, list[str]] = dict(
    infotech=["technology", "product", "operations"],
    financials=["finance", "technology", "operations"],
    communication=["product", "experience", "technology"],
    consumer=["product", "experience"],
    agents=["technology", "automation", "product"],
    government=["legal", "operations", "technology"],
    health=["product", "experience", "operations"],
    realestate=["finance", "operations"],
)

# Substring -> capability tags. A hint applies to every slug that contains the
# substring anywhere; insertion order here fixes the order in which hinted
# tags are appended.
SLUG_TAG_HINTS: dict[str, list[str]] = dict(
    (
        ("citation", ["citations", "evidence", "knowledge"]),
        ("evidence", ["evidence", "traceability", "source-management"]),
        ("railiance", ["platform", "operations"]),
        ("markitect", ["knowledge", "documentation"]),
        ("marki", ["knowledge", "documentation"]),
        ("ops-", ["operations", "platform"]),
        ("canon", ["canon", "knowledge", "governance"]),
        ("flex-auth", ["identity", "access-control", "policy"]),
        ("key-cape", ["identity", "access-control", "security"]),
        ("shard-wiki", ["knowledge", "documentation"]),
        ("vergabe", ["procurement", "governance"]),
        ("agentic", ["automation", "orchestration"]),
        ("repo-scoping", ["governance", "policy", "coordination"]),
    )
)


def load_excluded_slugs() -> set[str]:
    """Return the slugs listed in the exclusions standard file.

    Reads ``EXCLUSIONS_PATH`` and collects the ``slug`` of every entry under
    the top-level ``exclusions`` key. Only the text after the last ``/`` of
    each slug is kept, so the result can be compared with bare directory
    names.

    An empty file, a missing ``exclusions`` key, or a null ``exclusions``
    value all yield an empty set instead of raising.

    Raises:
        OSError: if the exclusions file cannot be opened.
        KeyError: if an entry has no ``slug`` key.
    """
    # YAML is UTF-8 by specification; do not depend on the locale encoding.
    with EXCLUSIONS_PATH.open(encoding="utf-8") as fh:
        # safe_load returns None for an empty document.
        doc = yaml.safe_load(fh) or {}
    # ``exclusions:`` with no value parses as None, so fall back explicitly.
    entries = doc.get("exclusions") or []
    return {entry["slug"].split("/")[-1] for entry in entries}


def fetch_hub_repos() -> dict[str, dict]:
    """Fetch the hub's repo rows and index them by slug.

    Issues a request to ``{API_BASE}/repos/`` with a 30-second timeout and
    parses the response body as JSON. Each row is stored under its ``slug``
    value; if two rows share a slug, the later one wins.
    """
    endpoint = f"{API_BASE}/repos/"
    with urllib.request.urlopen(endpoint, timeout=30) as response:
        rows = json.load(response)

    by_slug: dict[str, dict] = {}
    for row in rows:
        by_slug[row["slug"]] = row
    return by_slug


def local_git_repos() -> dict[str, Path]:
    """Map directory name to path for each git checkout directly under HOME.

    Only immediate children of ``HOME`` are inspected. Entries whose name
    starts with ``.`` and entries that are not directories are ignored; the
    rest qualify when they contain a ``.git`` directory.
    """
    return {
        entry.name: entry
        for entry in HOME.iterdir()
        if not entry.name.startswith(".")
        and entry.is_dir()
        and (entry / ".git").is_dir()
    }


def _tag_hints(slug: str) -> list[str]:
    """Return the capability tags hinted at by substrings of *slug*.

    Every ``SLUG_TAG_HINTS`` key contained in *slug* contributes its tags.
    The result keeps first-seen order and holds each tag at most once.
    """
    hinted = (
        tag
        for needle, hint_tags in SLUG_TAG_HINTS.items()
        if needle in slug
        for tag in hint_tags
    )
    # dict.fromkeys keeps the first occurrence of each tag, in order.
    return list(dict.fromkeys(hinted))


def _enrich_from_hub(slug: str, hub: dict | None) -> dict:
    """Derive first-pass classification fields for a repo with no override.

    Args:
        slug: Repo slug; used to look up substring-based tag hints.
        hub: The hub's row for this repo, or ``None`` (or an empty dict) when
            the hub has no row for it.

    Returns:
        A dict with ``category``, ``domain``, ``secondary_domains``,
        ``capability_tags``, ``business_stake``, ``business_mechanics`` and
        ``notes``. Missing or empty hub values fall back to defaults:
        category ``project``, domain ``infotech``, stake taken from
        ``DOMAIN_DEFAULT_STAKE`` for the domain, and mechanics
        ``coordination``/``operation``. All list values are fresh copies, so
        callers may mutate them without touching the hub row.
    """
    row = hub or {}
    category = row.get("category") or "project"
    domain = row.get("domain_slug") or "infotech"
    # Copy every list so the result never aliases the hub row's own lists.
    secondary = list(row.get("secondary_domains") or [])
    tags = list(row.get("capability_tags") or [])
    stake = list(row.get("business_stake") or [])
    mechanics = list(row.get("business_mechanics") or [])

    # Slug hints only add tags the hub row did not already supply.
    for tag in _tag_hints(slug):
        if tag not in tags:
            tags.append(tag)
    if not stake:
        stake = list(DOMAIN_DEFAULT_STAKE.get(domain, ["technology", "product"]))
    if not mechanics:
        mechanics = ["coordination", "operation"]

    # State the real provenance: do not claim a hub row that was not there.
    if hub:
        provenance = "derived from hub migration row."
    else:
        provenance = "no hub row found, defaults applied."

    return {
        "category": category,
        "domain": domain,
        "secondary_domains": secondary,
        "capability_tags": tags,
        "business_stake": stake,
        "business_mechanics": mechanics,
        "notes": "First-pass agent classification (CUST-WP-0050 T11); " + provenance,
    }


def build_classification(slug: str, hub: dict | None) -> dict:
    """Build the ``repo_classification`` document for one repo.

    Args:
        slug: Repo slug. A matching ``OVERRIDES`` entry takes precedence over
            anything derived from the hub.
        hub: The hub's row for this repo, or ``None``; only consulted when
            there is no override.

    Returns:
        ``{"repo_classification": {...}}`` with the standard header fields
        (``standard``, ``version``, ``classified_at``, ``classified_by``),
        then the classification fields, then ``notes`` last when a non-empty
        note is available.
    """
    override = OVERRIDES.get(slug)
    source = override if override is not None else _enrich_from_hub(slug, hub)
    # Copy list values so the returned block never shares lists with OVERRIDES.
    data = {
        key: list(value) if isinstance(value, list) else value
        for key, value in source.items()
    }
    # Take the note out before spreading the fields so it can be placed last.
    # It must be captured here: discarding it would drop the note for repos
    # that have no override entry.
    note = data.pop("notes", None)

    classification = {
        "standard": "Repo Classification Standard",
        "version": "1.0",
        "classified_at": CLASSIFIED_AT,
        "classified_by": "agent",
        **data,
    }
    if note:
        classification["notes"] = note
    return {"repo_classification": classification}


def render_yaml(block: dict) -> str:
    """Serialize *block* to YAML text.

    Keys keep their insertion order, non-ASCII characters are written as-is
    rather than escaped, and collections use block style.
    """
    dump_options = {
        "sort_keys": False,
        "allow_unicode": True,
        "default_flow_style": False,
    }
    return yaml.dump(block, **dump_options)


def validate_file(path: Path) -> bool:
    """Run the validator script on *path* and report whether it passed.

    The validator is started with the current Python interpreter. When it
    exits non-zero, its captured stdout is printed to stdout and its captured
    stderr to stderr.

    Returns:
        True when the validator exits with status 0, False otherwise.
    """
    command = [sys.executable, str(VALIDATOR), str(path)]
    result = subprocess.run(command, capture_output=True, text=True)
    passed = result.returncode == 0
    if not passed:
        print(result.stdout)
        print(result.stderr, file=sys.stderr)
    return passed


def maybe_commit(repo_path: Path, dry_run: bool) -> None:
    """Stage and commit ``.repo-classification.yaml`` in *repo_path*.

    Does nothing when *dry_run* is true. Otherwise the classification file is
    staged and, if that leaves a staged change for it, committed on its own.
    Every git command is limited to the classification file's pathspec, so
    changes that were already staged in the target repository are neither
    committed nor used to decide whether a commit is needed.

    Args:
        repo_path: Working tree of the repository to commit in.
        dry_run: When true, return without running any git command.

    Raises:
        subprocess.CalledProcessError: if ``git add`` or ``git commit`` fails.
    """
    if dry_run:
        return

    filename = ".repo-classification.yaml"
    subprocess.run(["git", "add", "--", filename], cwd=repo_path, check=True)

    # --quiet makes git exit non-zero when the staged file differs from HEAD.
    staged = subprocess.run(
        ["git", "diff", "--cached", "--quiet", "--", filename],
        cwd=repo_path,
    )
    if staged.returncode == 0:
        return  # nothing new to commit for this file

    # A trailing pathspec makes git commit only that path, leaving any other
    # staged content in the index untouched.
    subprocess.run(
        [
            "git",
            "commit",
            "-m",
            "Add .repo-classification.yaml (CUST-WP-0050 T11 agent first-pass)",
            "--",
            filename,
        ],
        cwd=repo_path,
        check=True,
    )


def _is_human_reviewed(target: Path) -> bool:
    """Return True when *target* exists and records ``classified_by: human``."""
    if not target.exists():
        return False
    with target.open(encoding="utf-8") as fh:
        existing = yaml.safe_load(fh)
    # An empty file, a non-mapping document, or a null/non-mapping
    # ``repo_classification`` value cannot carry a human review marker.
    if not isinstance(existing, dict):
        return False
    classification = existing.get("repo_classification")
    if not isinstance(classification, dict):
        return False
    return classification.get("classified_by") == "human"


def main() -> int:
    """Author classification files for all eligible local checkouts.

    Targets are the git checkouts found by ``local_git_repos()`` minus the
    excluded slugs. Repos whose existing file is marked as human-classified
    are skipped. For every other target the file is generated, written,
    validated and, unless ``--no-commit`` is given, committed. With
    ``--dry-run`` nothing is written, validated or committed; the intended
    write is only printed.

    Returns:
        1 if any written file failed validation, otherwise 0. A file that
        fails validation is left in place and is not committed.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--dry-run", action="store_true", help="Print actions only")
    parser.add_argument("--no-commit", action="store_true", help="Write files but skip git commit")
    args = parser.parse_args()

    excluded = load_excluded_slugs()
    hub_repos = fetch_hub_repos()
    local_repos = local_git_repos()

    targets = sorted(set(local_repos) - excluded)
    written: list[str] = []
    skipped_human: list[str] = []
    failed: list[str] = []

    for slug in targets:
        repo_path = local_repos[slug]
        target = repo_path / ".repo-classification.yaml"
        if _is_human_reviewed(target):
            skipped_human.append(slug)
            continue

        block = build_classification(slug, hub_repos.get(slug))
        content = render_yaml(block)
        if args.dry_run:
            print(f"[dry-run] would write {target}")
            continue

        # render_yaml emits non-ASCII characters unescaped, so the encoding
        # must be explicit rather than whatever the locale defaults to.
        target.write_text(content, encoding="utf-8")
        if not validate_file(target):
            failed.append(slug)
            continue
        if not args.no_commit:
            maybe_commit(repo_path, dry_run=False)
        written.append(slug)

    print(f"Written: {len(written)}")
    for slug in written:
        print(f"  + {slug}")
    print(f"Skipped (human-reviewed): {len(skipped_human)}")
    for slug in skipped_human:
        print(f"  = {slug}")
    if failed:
        print(f"Failed validation: {len(failed)}", file=sys.stderr)
        for slug in failed:
            print(f"  ! {slug}", file=sys.stderr)
        return 1
    return 0


# Script entry point: run main() and use its return value as the exit status.
if __name__ == "__main__":
    sys.exit(main())