the-custodian/tools/validate_repo_classification.py

#!/usr/bin/env python3
"""Validate a .repo-classification.yaml against the Repo Classification Standard.

Single small linter shared by repo authors and (later) the State Hub registration
validator. It checks a repo's classification file against the controlled
vocabularies in canon/standards/repo-classification.allowed.yaml.

Usage:
    validate_repo_classification.py <path-to-.repo-classification.yaml> ...
    validate_repo_classification.py --self-test   (also the default when no arguments are given)

Exit code 0 = all files valid (warnings allowed); 1 = at least one invalid.

CUST-WP-0050 T01. Depends on PyYAML (stdlib + pyyaml only).
"""
from __future__ import annotations

import re
import sys
from pathlib import Path

import yaml

# Controlled-vocabulary file, located relative to this script:
# two directory levels up from this file, then canon/standards/.
ALLOWED_PATH = Path(__file__).resolve().parent.parent.joinpath(
    "canon", "standards", "repo-classification.allowed.yaml"
)


def load_allowed(path: Path = ALLOWED_PATH) -> dict:
    """Load the controlled vocabularies that classification files are checked against.

    Args:
        path: YAML file to read; defaults to ``ALLOWED_PATH``.

    Returns:
        Whatever ``yaml.safe_load`` parses from the file (callers index it
        like a mapping).

    Raises:
        OSError: if the file cannot be opened.
        yaml.YAMLError: if the content is not valid YAML.
    """
    # Open in binary mode so PyYAML decodes the stream itself (UTF-8 unless a
    # BOM says otherwise); a text-mode open would silently depend on the
    # platform's locale encoding.
    with path.open("rb") as fh:
        return yaml.safe_load(fh)


def _known_capability_tags(allowed: dict) -> set[str]:
    tags: set[str] = set()
    for fam in (allowed.get("capability_families") or {}).values():
        tags.update(fam or [])
    return tags


def _in_vocabulary(value: object, vocabulary: set) -> bool:
    """Return True if *value* is in *vocabulary*; unhashable values never are."""
    try:
        return value in vocabulary
    except TypeError:
        # YAML sequences/mappings parse to lists/dicts, which cannot be looked
        # up in a set; report them as "not allowed" instead of crashing.
        return False


def _has_duplicates(items: list) -> bool:
    """Return True if *items* holds two equal entries, tolerating unhashable ones."""
    try:
        return len(items) != len(set(items))
    except TypeError:
        # Fall back to pairwise equality when an entry is unhashable.
        seen: list = []
        for item in items:
            if item in seen:
                return True
            seen.append(item)
        return False


def validate(doc: dict, allowed: dict) -> tuple[list[str], list[str]]:
    """Return (errors, warnings) for a parsed classification document.

    Args:
        doc: parsed YAML document. Anything that is not a mapping with a
            top-level ``repo_classification`` mapping yields a single error.
        allowed: controlled vocabularies. ``categories``, ``domains``,
            ``business_stake`` and ``business_mechanics`` are required;
            ``guidance`` and ``capability_families`` are optional.

    Returns:
        A pair of message lists. Errors make the document invalid; warnings
        are advisory only.

    Raises:
        KeyError: if ``allowed`` lacks one of the required vocabularies.
    """
    errors: list[str] = []
    warnings: list[str] = []

    block = doc.get("repo_classification") if isinstance(doc, dict) else None
    if not isinstance(block, dict):
        return (["missing top-level `repo_classification:` mapping"], [])

    categories = set(allowed["categories"])
    domains = set(allowed["domains"])
    stakes = set(allowed["business_stake"])
    mechanics = set(allowed["business_mechanics"])
    # `or {}` also covers an explicit `guidance: null` in the vocabulary file.
    guidance = allowed.get("guidance") or {}
    pattern = re.compile(guidance.get("capability_tag_pattern", r"^[a-z0-9]+(-[a-z0-9]+)*$"))

    # category — required, exactly one allowed value
    category = block.get("category")
    if category is None:
        errors.append("`category` is required")
    elif not _in_vocabulary(category, categories):
        errors.append(f"`category` '{category}' not in {sorted(categories)}")

    # domain — required, exactly one allowed value
    domain = block.get("domain")
    if domain is None:
        errors.append("`domain` is required")
    elif not _in_vocabulary(domain, domains):
        errors.append(f"`domain` '{domain}' not in allowed domains")

    # secondary_domains — 0..n allowed domains, excluding primary, no dups
    secondary = block.get("secondary_domains") or []
    if not isinstance(secondary, list):
        errors.append("`secondary_domains` must be a list")
        secondary = []
    for d in secondary:
        if not _in_vocabulary(d, domains):
            errors.append(f"secondary domain '{d}' not in allowed domains")
        if d == domain:
            errors.append(f"secondary domain '{d}' repeats the primary domain")
    if _has_duplicates(secondary):
        errors.append("`secondary_domains` contains duplicates")
    smax = guidance.get("secondary_domains_max", 3)
    if len(secondary) > smax:
        warnings.append(f"{len(secondary)} secondary_domains exceeds recommended max {smax}")

    # capability_tags — open-ended, kebab-case; warn on unknown/synonym
    tags = block.get("capability_tags") or []
    if not isinstance(tags, list):
        errors.append("`capability_tags` must be a list")
        tags = []
    # The recommended-tag set is only consulted inside the loop below.
    known = _known_capability_tags(allowed) if tags else set()
    for t in tags:
        if not isinstance(t, str) or not pattern.match(t):
            errors.append(f"capability_tag '{t}' is not lowercase kebab-case")
        elif t not in known:
            warnings.append(f"capability_tag '{t}' is not a recommended family tag (allowed, check for synonym)")

    # business_stake — 0..n allowed; recommend 2..6
    stake = block.get("business_stake") or []
    if not isinstance(stake, list):
        errors.append("`business_stake` must be a list")
        stake = []
    for s in stake:
        if not _in_vocabulary(s, stakes):
            errors.append(f"business_stake '{s}' not in {sorted(stakes)}")
    if stake:
        lo = guidance.get("business_stake_recommended_min", 2)
        hi = guidance.get("business_stake_recommended_max", 6)
        if not (lo <= len(stake) <= hi):
            warnings.append(f"{len(stake)} business_stake values; {lo}-{hi} recommended")

    # business_mechanics — 0..n allowed
    mech = block.get("business_mechanics") or []
    if not isinstance(mech, list):
        errors.append("`business_mechanics` must be a list")
        mech = []
    for m in mech:
        if not _in_vocabulary(m, mechanics):
            errors.append(f"business_mechanics '{m}' not in {sorted(mechanics)}")

    return errors, warnings


def validate_file(path: Path, allowed: dict) -> bool:
    """Validate one classification file and print the outcome.

    Prints one ``warn`` line per warning, then either one ``FAIL`` line per
    error or a single ``ok`` line.

    Args:
        path: the classification YAML file to check.
        allowed: controlled vocabularies, passed through to ``validate``.

    Returns:
        True if the file was read, parsed and produced no errors; False
        otherwise. Warnings do not affect the result.
    """
    try:
        # Hand PyYAML the raw bytes: it decodes them itself (UTF-8 unless a BOM
        # says otherwise) and reports undecodable input as a YAMLError, so a
        # badly encoded file yields a FAIL line rather than an uncaught
        # UnicodeDecodeError from a locale-dependent text read.
        doc = yaml.safe_load(path.read_bytes())
    except (OSError, yaml.YAMLError) as exc:
        print(f"FAIL  {path}: cannot read/parse ({exc})")
        return False
    errors, warnings = validate(doc, allowed)
    for w in warnings:
        print(f"warn  {path}: {w}")
    if errors:
        for e in errors:
            print(f"FAIL  {path}: {e}")
        return False
    print(f"ok    {path}")
    return True


def self_test(allowed: dict) -> bool:
    """Run ``validate`` on one well-formed and one malformed sample document.

    Passes when the well-formed sample yields no errors and the malformed one
    yields at least five. Prints a one-line summary and returns the verdict.
    """
    valid_doc = {
        "repo_classification": {
            "category": "research",
            "domain": "infotech",
            "secondary_domains": ["agents"],
            "capability_tags": ["governance", "knowledge", "coordination"],
            "business_stake": ["technology", "operations", "intelligence"],
            "business_mechanics": ["intention", "control", "coordination"],
        }
    }
    invalid_doc = {
        "repo_classification": {
            # not a category (it's a tag)
            "category": "platform",
            # not a market domain
            "domain": "knowledge",
            "secondary_domains": ["infotech", "infotech"],
            "capability_tags": ["Stuff", "access-control"],
            "business_stake": ["technology", "wizardry"],
            "business_mechanics": ["teleportation"],
        }
    }

    # Only the error lists matter here; warnings are ignored.
    good_errors = validate(valid_doc, allowed)[0]
    bad_errors = validate(invalid_doc, allowed)[0]

    passed = not good_errors and len(bad_errors) >= 5
    verdict = "PASS" if passed else "FAIL"
    print(f"self-test: good_errors={len(good_errors)} bad_errors={len(bad_errors)} -> {verdict}")
    return passed


def main(argv: list[str]) -> int:
    """Command-line entry point; returns the process exit code.

    With no arguments, or with exactly ``--self-test``, runs the self-test.
    Otherwise every argument is treated as a file path and validated; the
    result is 0 only if all of them are valid.
    """
    allowed = load_allowed()
    targets = argv[1:]

    if not targets or targets == ["--self-test"]:
        return 0 if self_test(allowed) else 1

    # Build the full list first so every file is checked and reported,
    # even after an earlier one has failed.
    outcomes = [validate_file(Path(target), allowed) for target in targets]
    return 0 if all(outcomes) else 1


# Script entry point: exit with the status code computed by main().
if __name__ == "__main__":
    sys.exit(main(sys.argv))