Start CUST-WP-0050: T01 allowed-values + validator; classify the-custodian
Activate the workplan and complete T01: add the machine-readable controlled vocabulary canon/standards/repo-classification.allowed.yaml (categories, domains, business_stake, business_mechanics, capability families, guidance), reference it from the standard §12, and add tools/validate_repo_classification.py (stdlib + PyYAML, --self-test PASS). Begin T02: author the-custodian/.repo-classification.yaml (research · infotech · agents), which validates clean. classified_by: agent, pending human review. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
186
tools/validate_repo_classification.py
Normal file
186
tools/validate_repo_classification.py
Normal file
@@ -0,0 +1,186 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Validate a .repo-classification.yaml against the Repo Classification Standard.
|
||||
|
||||
Single small linter shared by repo authors and (later) the State Hub registration
|
||||
validator. It checks a repo's classification file against the controlled
|
||||
vocabularies in canon/standards/repo-classification.allowed.yaml.
|
||||
|
||||
Usage:
|
||||
validate_repo_classification.py <path-to-.repo-classification.yaml> ...
|
||||
validate_repo_classification.py --self-test
|
||||
|
||||
Exit code 0 = all files valid (warnings allowed); 1 = at least one invalid.
|
||||
|
||||
CUST-WP-0050 T01. Depends on PyYAML (stdlib + pyyaml only).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
# Location of the controlled-vocabulary file, resolved relative to this script:
# two directory levels above the script file, then canon/standards/.
ALLOWED_PATH = Path(__file__).resolve().parent.parent.joinpath(
    "canon",
    "standards",
    "repo-classification.allowed.yaml",
)
|
||||
|
||||
|
||||
def load_allowed(path: Path = ALLOWED_PATH) -> dict:
    """Load the controlled-vocabulary YAML file and return its parsed content.

    Args:
        path: File to read; defaults to ``ALLOWED_PATH``.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
        yaml.YAMLError: if the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with path.open(encoding="utf-8") as fh:
        return yaml.safe_load(fh)
|
||||
|
||||
|
||||
def _known_capability_tags(allowed: dict) -> set[str]:
|
||||
tags: set[str] = set()
|
||||
for fam in (allowed.get("capability_families") or {}).values():
|
||||
tags.update(fam or [])
|
||||
return tags
|
||||
|
||||
|
||||
def _in_vocabulary(value: object, vocabulary: set) -> bool:
    """Return True when *value* is a member of *vocabulary*.

    Unhashable YAML values (lists, mappings) make ``value in some_set`` raise
    TypeError; they can never be vocabulary members, so report False instead.
    """
    try:
        return value in vocabulary
    except TypeError:
        return False


def _has_duplicates(items: list) -> bool:
    """Return True when *items* holds two equal elements, hashable or not."""
    seen: list = []
    for item in items:
        if item in seen:
            return True
        seen.append(item)
    return False


def validate(doc: dict, allowed: dict) -> tuple[list[str], list[str]]:
    """Return (errors, warnings) for a parsed classification document.

    Args:
        doc: Parsed YAML document; expected to be a mapping that contains a
            ``repo_classification`` mapping.
        allowed: Controlled vocabularies. Must contain ``categories``,
            ``domains``, ``business_stake`` and ``business_mechanics``;
            ``capability_families`` and ``guidance`` are optional.

    Returns:
        Two lists of human-readable messages. A document is valid when the
        first list (errors) is empty; warnings never make it invalid.

    Raises:
        KeyError: if *allowed* lacks one of the required vocabulary keys.
    """
    errors: list[str] = []
    warnings: list[str] = []

    block = doc.get("repo_classification") if isinstance(doc, dict) else None
    if not isinstance(block, dict):
        return (["missing top-level `repo_classification:` mapping"], [])

    categories = set(allowed["categories"])
    domains = set(allowed["domains"])
    stakes = set(allowed["business_stake"])
    mechanics = set(allowed["business_mechanics"])
    # `or {}` also covers a `guidance:` key that is present but empty (None).
    guidance = allowed.get("guidance") or {}
    pattern = re.compile(guidance.get("capability_tag_pattern", r"^[a-z0-9]+(-[a-z0-9]+)*$"))

    # category — required, exactly one allowed value
    category = block.get("category")
    if category is None:
        errors.append("`category` is required")
    elif not _in_vocabulary(category, categories):
        errors.append(f"`category` '{category}' not in {sorted(categories)}")

    # domain — required, exactly one allowed value
    domain = block.get("domain")
    if domain is None:
        errors.append("`domain` is required")
    elif not _in_vocabulary(domain, domains):
        errors.append(f"`domain` '{domain}' not in allowed domains")

    # secondary_domains — 0..n allowed domains, excluding primary, no dups
    secondary = block.get("secondary_domains") or []
    if not isinstance(secondary, list):
        errors.append("`secondary_domains` must be a list")
        secondary = []
    for d in secondary:
        if not _in_vocabulary(d, domains):
            errors.append(f"secondary domain '{d}' not in allowed domains")
        if d == domain:
            errors.append(f"secondary domain '{d}' repeats the primary domain")
    # Equality-based check so unhashable (already-invalid) items cannot crash it.
    if _has_duplicates(secondary):
        errors.append("`secondary_domains` contains duplicates")
    smax = guidance.get("secondary_domains_max", 3)
    if len(secondary) > smax:
        warnings.append(f"{len(secondary)} secondary_domains exceeds recommended max {smax}")

    # capability_tags — open-ended, kebab-case; warn on unknown/synonym
    tags = block.get("capability_tags") or []
    if not isinstance(tags, list):
        errors.append("`capability_tags` must be a list")
        tags = []
    known = _known_capability_tags(allowed)
    for t in tags:
        if not isinstance(t, str) or not pattern.match(t):
            errors.append(f"capability_tag '{t}' is not lowercase kebab-case")
        elif t not in known:
            warnings.append(f"capability_tag '{t}' is not a recommended family tag (allowed, check for synonym)")

    # business_stake — 0..n allowed; recommend 2..6
    stake = block.get("business_stake") or []
    if not isinstance(stake, list):
        errors.append("`business_stake` must be a list")
        stake = []
    for s in stake:
        if not _in_vocabulary(s, stakes):
            errors.append(f"business_stake '{s}' not in {sorted(stakes)}")
    if stake:
        lo = guidance.get("business_stake_recommended_min", 2)
        hi = guidance.get("business_stake_recommended_max", 6)
        if not (lo <= len(stake) <= hi):
            warnings.append(f"{len(stake)} business_stake values; {lo}-{hi} recommended")

    # business_mechanics — 0..n allowed
    mech = block.get("business_mechanics") or []
    if not isinstance(mech, list):
        errors.append("`business_mechanics` must be a list")
        mech = []
    for m in mech:
        if not _in_vocabulary(m, mechanics):
            errors.append(f"business_mechanics '{m}' not in {sorted(mechanics)}")

    return errors, warnings
|
||||
|
||||
|
||||
def validate_file(path: Path, allowed: dict) -> bool:
    """Validate one classification file and print its result lines.

    Prints ``warn`` lines for warnings, then either ``FAIL`` lines (one per
    error) or a single ``ok`` line.

    Args:
        path: The classification YAML file to check.
        allowed: Controlled vocabularies, as passed on to ``validate``.

    Returns:
        True when the file was read, parsed and produced no errors;
        False otherwise.
    """
    try:
        # Decode explicitly as UTF-8 rather than the locale default.
        doc = yaml.safe_load(path.read_text(encoding="utf-8"))
    except (OSError, UnicodeDecodeError, yaml.YAMLError) as exc:
        # UnicodeDecodeError is a ValueError, not an OSError, so it must be
        # listed explicitly for a badly encoded file to be reported as FAIL.
        print(f"FAIL {path}: cannot read/parse ({exc})")
        return False
    errors, warnings = validate(doc, allowed)
    for w in warnings:
        print(f"warn {path}: {w}")
    if errors:
        for e in errors:
            print(f"FAIL {path}: {e}")
        return False
    print(f"ok {path}")
    return True
|
||||
|
||||
|
||||
def self_test(allowed: dict) -> bool:
    """Run ``validate`` on one known-good and one known-bad fixture.

    Passes when the good fixture yields no errors and the bad fixture yields
    at least five. Prints a one-line summary and returns the verdict.
    """
    good_fixture = {
        "repo_classification": {
            "category": "research",
            "domain": "infotech",
            "secondary_domains": ["agents"],
            "capability_tags": ["governance", "knowledge", "coordination"],
            "business_stake": ["technology", "operations", "intelligence"],
            "business_mechanics": ["intention", "control", "coordination"],
        }
    }
    bad_fixture = {
        "repo_classification": {
            "category": "platform",  # not a category (it's a tag)
            "domain": "knowledge",  # not a market domain
            "secondary_domains": ["infotech", "infotech"],
            "capability_tags": ["Stuff", "access-control"],
            "business_stake": ["technology", "wizardry"],
            "business_mechanics": ["teleportation"],
        }
    }

    # Only the error lists matter here; warnings are discarded.
    good_errors = validate(good_fixture, allowed)[0]
    bad_errors = validate(bad_fixture, allowed)[0]

    passed = not good_errors and len(bad_errors) >= 5
    verdict = "PASS" if passed else "FAIL"
    print(f"self-test: good_errors={len(good_errors)} bad_errors={len(bad_errors)} -> {verdict}")
    return passed
|
||||
|
||||
|
||||
def main(argv: list[str]) -> int:
    """Command-line entry point; returns the process exit code.

    With no arguments, or with exactly ``--self-test``, runs the self-test.
    Otherwise every argument is treated as a file path and validated.
    Returns 0 when everything passed, 1 otherwise.
    """
    allowed = load_allowed()
    targets = argv[1:]

    run_self_test = not targets or targets == ["--self-test"]
    if run_self_test:
        return 0 if self_test(allowed) else 1

    # Build the full result list first so every file is checked and reported,
    # even after an earlier one has failed.
    results = [validate_file(Path(target), allowed) for target in targets]
    return 0 if all(results) else 1
|
||||
|
||||
|
||||
# Script entry point: hand main()'s return value to the interpreter as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main(sys.argv))
|
||||
Reference in New Issue
Block a user