#!/usr/bin/env python3 """Validate a .repo-classification.yaml against the Repo Classification Standard. Single small linter shared by repo authors and (later) the State Hub registration validator. It checks a repo's classification file against the controlled vocabularies in canon/standards/repo-classification.allowed.yaml. Usage: validate_repo_classification.py ... validate_repo_classification.py --self-test Exit code 0 = all files valid (warnings allowed); 1 = at least one invalid. CUST-WP-0050 T01. Depends on PyYAML (stdlib + pyyaml only). """ from __future__ import annotations import re import sys from pathlib import Path import yaml ALLOWED_PATH = ( Path(__file__).resolve().parent.parent / "canon" / "standards" / "repo-classification.allowed.yaml" ) def load_allowed(path: Path = ALLOWED_PATH) -> dict: with path.open() as fh: return yaml.safe_load(fh) def _known_capability_tags(allowed: dict) -> set[str]: tags: set[str] = set() for fam in (allowed.get("capability_families") or {}).values(): tags.update(fam or []) return tags def validate(doc: dict, allowed: dict) -> tuple[list[str], list[str]]: """Return (errors, warnings) for a parsed classification document.""" errors: list[str] = [] warnings: list[str] = [] block = doc.get("repo_classification") if isinstance(doc, dict) else None if not isinstance(block, dict): return (["missing top-level `repo_classification:` mapping"], []) categories = set(allowed["categories"]) domains = set(allowed["domains"]) stakes = set(allowed["business_stake"]) mechanics = set(allowed["business_mechanics"]) guidance = allowed.get("guidance", {}) pattern = re.compile(guidance.get("capability_tag_pattern", r"^[a-z0-9]+(-[a-z0-9]+)*$")) # category — required, exactly one allowed value category = block.get("category") if category is None: errors.append("`category` is required") elif category not in categories: errors.append(f"`category` '{category}' not in {sorted(categories)}") # domain — required, exactly one allowed value domain = block.get("domain") if domain is None: errors.append("`domain` is required") elif domain not in domains: errors.append(f"`domain` '{domain}' not in allowed domains") # secondary_domains — 0..n allowed domains, excluding primary, no dups secondary = block.get("secondary_domains") or [] if not isinstance(secondary, list): errors.append("`secondary_domains` must be a list") secondary = [] for d in secondary: if d not in domains: errors.append(f"secondary domain '{d}' not in allowed domains") if d == domain: errors.append(f"secondary domain '{d}' repeats the primary domain") if len(secondary) != len(set(secondary)): errors.append("`secondary_domains` contains duplicates") smax = guidance.get("secondary_domains_max", 3) if len(secondary) > smax: warnings.append(f"{len(secondary)} secondary_domains exceeds recommended max {smax}") # capability_tags — open-ended, kebab-case; warn on unknown/synonym tags = block.get("capability_tags") or [] if not isinstance(tags, list): errors.append("`capability_tags` must be a list") tags = [] known = _known_capability_tags(allowed) for t in tags: if not isinstance(t, str) or not pattern.match(t): errors.append(f"capability_tag '{t}' is not lowercase kebab-case") elif t not in known: warnings.append(f"capability_tag '{t}' is not a recommended family tag (allowed, check for synonym)") # business_stake — 0..n allowed; recommend 2..6 stake = block.get("business_stake") or [] if not isinstance(stake, list): errors.append("`business_stake` must be a list") stake = [] for s in stake: if s not in stakes: errors.append(f"business_stake '{s}' not in {sorted(stakes)}") if stake: lo = guidance.get("business_stake_recommended_min", 2) hi = guidance.get("business_stake_recommended_max", 6) if not (lo <= len(stake) <= hi): warnings.append(f"{len(stake)} business_stake values; {lo}-{hi} recommended") # business_mechanics — 0..n allowed mech = block.get("business_mechanics") or [] if not isinstance(mech, list): errors.append("`business_mechanics` must be a list") mech = [] for m in mech: if m not in mechanics: errors.append(f"business_mechanics '{m}' not in {sorted(mechanics)}") return errors, warnings def validate_file(path: Path, allowed: dict) -> bool: try: doc = yaml.safe_load(path.read_text()) except (OSError, yaml.YAMLError) as exc: print(f"FAIL {path}: cannot read/parse ({exc})") return False errors, warnings = validate(doc, allowed) for w in warnings: print(f"warn {path}: {w}") if errors: for e in errors: print(f"FAIL {path}: {e}") return False print(f"ok {path}") return True def self_test(allowed: dict) -> bool: good = { "repo_classification": { "category": "research", "domain": "infotech", "secondary_domains": ["agents"], "capability_tags": ["governance", "knowledge", "coordination"], "business_stake": ["technology", "operations", "intelligence"], "business_mechanics": ["intention", "control", "coordination"], } } bad = { "repo_classification": { "category": "platform", # not a category (it's a tag) "domain": "knowledge", # not a market domain "secondary_domains": ["infotech", "infotech"], "capability_tags": ["Stuff", "access-control"], "business_stake": ["technology", "wizardry"], "business_mechanics": ["teleportation"], } } ge, _ = validate(good, allowed) be, _ = validate(bad, allowed) ok = (ge == []) and (len(be) >= 5) print(f"self-test: good_errors={len(ge)} bad_errors={len(be)} -> {'PASS' if ok else 'FAIL'}") return ok def main(argv: list[str]) -> int: allowed = load_allowed() args = argv[1:] if not args or args == ["--self-test"]: return 0 if self_test(allowed) else 1 all_ok = True for a in args: if not validate_file(Path(a), allowed): all_ok = False return 0 if all_ok else 1 if __name__ == "__main__": raise SystemExit(main(sys.argv))