diff --git a/canon/standards/repo-classification.exclusions.yaml b/canon/standards/repo-classification.exclusions.yaml new file mode 100644 index 0000000..20d3673 --- /dev/null +++ b/canon/standards/repo-classification.exclusions.yaml @@ -0,0 +1,73 @@ +# Repo Classification exclusion list (CUST-WP-0050 T11 / D3). +# Repos listed here are intentionally out of scope for classification and +# State Hub registration under the portfolio taxonomy. +# +# Validate additions against canon/standards/repo-classification-standard_v1.0.md. + +version: "1.0" +updated: "2026-06-22" + +exclusions: + # Forks and personal repos — not ecosystem inventory. + - slug: tegwick/the-custodian + gitea_path: tegwick/the-custodian + reason: fork of the-custodian; not a managed ecosystem repo + + - slug: python-snake + gitea_path: lando_worsch/python-snake + reason: personal / non-ecosystem repo + + # Archived or collapsed hub registrations — superseded by another slug. + - slug: markitect-project + reason: archived; workstreams relinked to markitect-main (ADR-005 disposition) + + - slug: railiance-bootstrap + reason: archived phantom registration; no Gitea repo + + - slug: railiance-hosts + reason: archived phantom registration; no Gitea repo + + - slug: vergabe_teilnahme + reason: archived duplicate; collapsed into vergabe-teilnahme + + - slug: test_domain_v2 + reason: archived test domain; not active portfolio + + # Local-only templates / sandboxes — not product inventory. + - slug: hub-core-seed + reason: hub-core bootstrap seed copy; not a standalone service + + - slug: sand-boxer + reason: agentic coding sandbox; throwaway experimentation surface + + - slug: .nvm + reason: Node version manager checkout; not a coulomb project repo + + # Gitea repos referenced in portfolio review but not present / not cloned locally + # at T11 execution time — classify when a checkout exists or Gitea inventory confirms. 
+  - slug: binect-chrome
+    reason: not cloned locally; pending inventory confirmation
+
+  - slug: binect-js
+    reason: not cloned locally; pending inventory confirmation
+
+  - slug: direkt-vermittlung-de
+    reason: not cloned locally; pending inventory confirmation
+
+  - slug: polycode-sim
+    reason: not cloned locally; pending inventory confirmation
+
+  - slug: ralph-workplan
+    reason: not cloned locally; pending inventory confirmation
+
+  - slug: tele-mcp
+    reason: not cloned locally; pending inventory confirmation
+
+  - slug: testdrive-jsui
+    reason: not cloned locally; pending inventory confirmation
+
+  - slug: timeline-svg
+    reason: not cloned locally; pending inventory confirmation
+
+  - slug: marki-docx
+    reason: registered in hub; no local checkout at T11 — classify on next clone
\ No newline at end of file
diff --git a/tools/batch_author_repo_classifications.py b/tools/batch_author_repo_classifications.py
new file mode 100644
index 0000000..2f94ec9
--- /dev/null
+++ b/tools/batch_author_repo_classifications.py
@@ -0,0 +1,428 @@
+#!/usr/bin/env python3
+"""First-pass .repo-classification.yaml author for CUST-WP-0050 T11.
+
+Writes agent-classified files for local git checkouts, validates against
+canon/standards/repo-classification.allowed.yaml, and optionally commits.
+
+Skips repos that already carry human-reviewed classifications and slugs on the
+exclusion list.
+"""
+from __future__ import annotations
+
+import argparse
+import json
+import subprocess
+import sys
+import urllib.request
+from datetime import date
+from pathlib import Path
+
+import yaml
+
+REPO_ROOT = Path(__file__).resolve().parent.parent
+ALLOWED_PATH = REPO_ROOT / "canon" / "standards" / "repo-classification.allowed.yaml"
+EXCLUSIONS_PATH = REPO_ROOT / "canon" / "standards" / "repo-classification.exclusions.yaml"
+VALIDATOR = REPO_ROOT / "tools" / "validate_repo_classification.py"
+HOME = Path.home()
+API_BASE = "http://127.0.0.1:8000"
+CLASSIFIED_AT = date.today().isoformat()
+
+# Curated overrides — standard §13 examples and T11 first-pass judgments.
+OVERRIDES: dict[str, dict] = {
+    "helix-forge": {
+        "category": "product",
+        "domain": "infotech",
+        "secondary_domains": ["agents"],
+        "capability_tags": [
+            "platform",
+            "capability-registry",
+            "coordination",
+            "knowledge",
+            "product-development",
+        ],
+        "business_stake": ["product", "technology", "execution", "automation", "intelligence"],
+        "business_mechanics": ["intention", "coordination", "operation", "adaptation"],
+        "notes": "Capability development platform; standard §13.1 example.",
+    },
+    "identity-canon": {
+        "category": "research",
+        "domain": "infotech",
+        "secondary_domains": ["government"],
+        "capability_tags": ["identity", "access-control", "terminology", "canon", "governance"],
+        "business_stake": ["technology", "legal", "operations", "intelligence"],
+        "business_mechanics": ["intention", "control", "adaptation"],
+        "notes": "Identity terminology and canon; standard §13.3 example.",
+    },
+    "net-kingdom": {
+        "category": "product",
+        "domain": "infotech",
+        "secondary_domains": [],
+        "capability_tags": ["security", "identity", "platform", "operations", "access-control"],
+        "business_stake": ["technology", "operations", "legal", "automation"],
+        "business_mechanics": ["control", "operation", "adaptation"],
+        "notes": "NetKingdom security/identity platform; standard §13.4 example.",
+    },
+    "citation-evidence": {
+        "category": "product",
+        "domain": "infotech",
+        "secondary_domains": ["communication", "government"],
+        "capability_tags": ["citations", "evidence", "knowledge", "traceability", "source-management"],
+        "business_stake": ["intelligence", "legal", "product", "technology"],
+        "business_mechanics": ["control", "coordination", "adaptation"],
+        "notes": "Citation and evidence product; standard §13.5 example.",
+    },
+    "adaptive-pricing": {
+        "category": "product",
+        "domain": "financials",
+        "secondary_domains": ["infotech", "agents"],
+        "capability_tags": ["pricing", "monetization", "lifecycle", "decision-support", "product-development"],
+        "business_stake": ["finance", "product", "sales", "intelligence", "automation"],
+        "business_mechanics": ["intention", "control", "adaptation"],
+        "notes": "Adaptive pricing product; standard §13.6 example.",
+    },
+    "reuse-surface": {
+        "category": "product",
+        "domain": "infotech",
+        "secondary_domains": ["agents"],
+        "capability_tags": ["capability-registry", "discovery", "reuse", "maturity", "evidence"],
+        "business_stake": ["technology", "product", "intelligence", "automation"],
+        "business_mechanics": ["intention", "control", "adaptation"],
+        "notes": "Reuse discovery surface; standard §13.7 example.",
+    },
+    "audit-core": {
+        "category": "tooling",
+        "domain": "infotech",
+        "secondary_domains": [],
+        "capability_tags": ["audit", "traceability", "security", "governance", "operations"],
+        "business_stake": ["technology", "operations", "legal", "automation"],
+        "business_mechanics": ["control", "operation"],
+        "notes": "Multi-tenant audit emit capability for platform bootstrap wiring.",
+    },
+    "whynot-design": {
+        "category": "product",
+        "domain": "consumer",
+        "secondary_domains": ["communication"],
+        "capability_tags": ["design-system", "documentation", "product-development", "experience"],
+        "business_stake": ["product", "experience", "technology"],
+        "business_mechanics": ["intention", "coordination", "adaptation"],
+        "notes": "whynot visual language — tokens, CSS, and web components for prototype artefacts.",
+    },
+    "coordination-engine": {
+        "category": "product",
+        "domain": "communication",
+        "secondary_domains": ["infotech", "agents"],
+        "capability_tags": ["coordination", "workflow", "orchestration", "evidence", "platform"],
+        "business_stake": ["product", "technology", "operations", "automation"],
+        "business_mechanics": ["coordination", "operation", "adaptation"],
+        "notes": "Goal-driven digital coordination framework and adapter runtime.",
+    },
+    "human-resources": {
+        "category": "research",
+        "domain": "consumer",
+        "secondary_domains": ["health"],
+        "capability_tags": ["knowledge", "documentation", "product-development"],
+        "business_stake": ["people", "product", "experience"],
+        "business_mechanics": ["intention", "adaptation"],
+        "notes": "Research toward optimal human performance.",
+    },
+    "repo-seed": {
+        "category": "tooling",
+        "domain": "infotech",
+        "secondary_domains": [],
+        "capability_tags": ["platform", "configuration", "documentation"],
+        "business_stake": ["technology", "execution"],
+        "business_mechanics": ["operation"],
+        "notes": "Git template for bootstrapping coulomb projects.",
+    },
+    "tegwick-control": {
+        "category": "research",
+        "domain": "consumer",
+        "secondary_domains": ["infotech"],
+        "capability_tags": ["coordination", "governance", "documentation", "knowledge"],
+        "business_stake": ["people", "operations", "intelligence"],
+        "business_mechanics": ["intention", "coordination", "adaptation"],
+        "notes": "Personal control repository for life/projects landscape.",
+    },
+    "whynot-control": {
+        "category": "research",
+        "domain": "consumer",
+        "secondary_domains": ["communication"],
+        "capability_tags": ["product-development", "knowledge", "coordination", "documentation"],
+        "business_stake": ["product", "experience", "intelligence"],
+        "business_mechanics": ["intention", "coordination", "adaptation"],
+        "notes": "whynot prototype and market-signal control repository.",
+    },
+    "markitect-main": {
+        "category": "product",
+        "domain": "communication",
+        "secondary_domains": ["infotech", "agents"],
+        "capability_tags": ["knowledge", "documentation", "product-development", "platform"],
+        "business_stake": ["product", "technology", "execution"],
+        "business_mechanics": ["intention", "coordination", "operation", "adaptation"],
+        "notes": "Markitect main product repo; successor to archived markitect-project.",
+    },
+    "vantage-point": {
+        "category": "research",
+        "domain": "infotech",
+        "secondary_domains": [],
+        "capability_tags": ["knowledge", "analytics", "platform", "documentation"],
+        "business_stake": ["technology", "intelligence", "product"],
+        "business_mechanics": ["intention", "adaptation"],
+        "notes": "Network-based graph model exploration and dependency reasoning framework.",
+    },
+}
+
+DOMAIN_DEFAULT_STAKE: dict[str, list[str]] = {
+    "infotech": ["technology", "product", "operations"],
+    "financials": ["finance", "technology", "operations"],
+    "communication": ["product", "experience", "technology"],
+    "consumer": ["product", "experience"],
+    "agents": ["technology", "automation", "product"],
+    "government": ["legal", "operations", "technology"],
+    "health": ["product", "experience", "operations"],
+    "realestate": ["finance", "operations"],
+}
+
+SLUG_TAG_HINTS: dict[str, list[str]] = {
+    "citation": ["citations", "evidence", "knowledge"],
+    "evidence": ["evidence", "traceability", "source-management"],
+    "railiance": ["platform", "operations"],
+    "markitect": ["knowledge", "documentation"],
+    "marki": ["knowledge", "documentation"],
+    "ops-": ["operations", "platform"],
+    "canon": ["canon", "knowledge", "governance"],
+    "flex-auth": ["identity", "access-control", "policy"],
+    "key-cape": ["identity", "access-control", "security"],
+    "shard-wiki": ["knowledge", "documentation"],
+    "vergabe": ["procurement", "governance"],
+    "agentic": ["automation", "orchestration"],
+    "repo-scoping": ["governance", "policy", "coordination"],
+}
+
+
+def load_excluded_slugs() -> set[str]:
+    """Return the bare repo names listed in the exclusions file.
+
+    Owner-qualified slugs are reduced to their last path segment so they can
+    be compared with local checkout directory names.
+    """
+    with EXCLUSIONS_PATH.open(encoding="utf-8") as fh:
+        # An empty YAML document loads as None; treat it as "no exclusions".
+        doc = yaml.safe_load(fh) or {}
+    # NOTE(review): "tegwick/the-custodian" collapses to "the-custodian", so a
+    # local checkout with that directory name is excluded too -- confirm intent.
+    return {entry["slug"].split("/")[-1] for entry in doc.get("exclusions") or []}
+
+
+def fetch_hub_repos() -> dict[str, dict]:
+    """Fetch the repo list from the hub API at API_BASE and index it by slug."""
+    with urllib.request.urlopen(f"{API_BASE}/repos/", timeout=30) as resp:
+        repos = json.load(resp)
+    return {r["slug"]: r for r in repos}
+
+
+def local_git_repos() -> dict[str, Path]:
+    """Map directory name to path for git checkouts directly under HOME.
+
+    Hidden entries and directories without a ``.git`` directory are ignored.
+    """
+    found: dict[str, Path] = {}
+    for child in HOME.iterdir():
+        if child.name.startswith(".") or not child.is_dir():
+            continue
+        if (child / ".git").is_dir():
+            found[child.name] = child
+    return found
+
+
+def _tag_hints(slug: str) -> list[str]:
+    """Collect de-duplicated capability tags whose needle occurs in *slug*."""
+    tags: list[str] = []
+    for needle, hint_tags in SLUG_TAG_HINTS.items():
+        if needle in slug:
+            for tag in hint_tags:
+                if tag not in tags:
+                    tags.append(tag)
+    return tags
+
+
+def _enrich_from_hub(slug: str, hub: dict | None) -> dict:
+    """Build classification fields from a hub row, filling gaps with defaults.
+
+    Without a hub row the repo defaults to category "project" in domain
+    "infotech"; slug-derived tag hints are appended in either case.
+    """
+    if hub:
+        category = hub.get("category") or "project"
+        domain = hub.get("domain_slug") or "infotech"
+        secondary = hub.get("secondary_domains") or []
+        tags = list(hub.get("capability_tags") or [])
+        stake = list(hub.get("business_stake") or [])
+        mechanics = list(hub.get("business_mechanics") or [])
+    else:
+        category = "project"
+        domain = "infotech"
+        secondary = []
+        tags = []
+        stake = []
+        mechanics = []
+
+    for tag in _tag_hints(slug):
+        if tag not in tags:
+            tags.append(tag)
+    if not stake:
+        stake = list(DOMAIN_DEFAULT_STAKE.get(domain, ["technology", "product"]))
+    if not mechanics:
+        mechanics = ["coordination", "operation"]
+
+    return {
+        "category": category,
+        "domain": domain,
+        "secondary_domains": secondary,
+        "capability_tags": tags,
+        "business_stake": stake,
+        "business_mechanics": mechanics,
+        "notes": "First-pass agent classification (CUST-WP-0050 T11); derived from hub migration row.",
+    }
+
+
+def build_classification(slug: str, hub: dict | None) -> dict:
+    """Assemble the ``repo_classification`` document for *slug*.
+
+    Curated OVERRIDES win over hub-derived data. Only curated notes are kept;
+    the generic note produced for hub-derived entries is dropped.
+    """
+    if slug in OVERRIDES:
+        data = dict(OVERRIDES[slug])
+    else:
+        data = _enrich_from_hub(slug, hub)
+        data.pop("notes", None)
+    # Pull notes out so they can be appended after the classification fields.
+    note = data.pop("notes", None)
+    block = {
+        "repo_classification": {
+            "standard": "Repo Classification Standard",
+            "version": "1.0",
+            "classified_at": CLASSIFIED_AT,
+            "classified_by": "agent",
+            **data,
+        }
+    }
+    if note:
+        block["repo_classification"]["notes"] = note
+    return block
+
+
+def render_yaml(block: dict) -> str:
+    """Serialise *block* as block-style YAML, preserving key order."""
+    return yaml.dump(block, sort_keys=False, allow_unicode=True, default_flow_style=False)
+
+
+def validate_file(path: Path) -> bool:
+    """Run the validator script on *path*; echo its output on failure."""
+    proc = subprocess.run(
+        [sys.executable, str(VALIDATOR), str(path)],
+        capture_output=True,
+        text=True,
+    )
+    if proc.returncode != 0:
+        print(proc.stdout)
+        print(proc.stderr, file=sys.stderr)
+    return proc.returncode == 0
+
+
+def maybe_commit(repo_path: Path, dry_run: bool) -> None:
+    """Stage and commit the classification file in *repo_path*.
+
+    The staged-change check and the commit are limited to the classification
+    file so unrelated changes already staged in the target repo are never
+    swept into the automated commit.
+    """
+    if dry_run:
+        return
+    filename = ".repo-classification.yaml"
+    subprocess.run(["git", "add", filename], cwd=repo_path, check=True)
+    status = subprocess.run(
+        ["git", "diff", "--cached", "--quiet", "--", filename],
+        cwd=repo_path,
+    )
+    if status.returncode != 0:
+        subprocess.run(
+            [
+                "git",
+                "commit",
+                "-m",
+                "Add .repo-classification.yaml (CUST-WP-0050 T11 agent first-pass)",
+                "--",
+                filename,
+            ],
+            cwd=repo_path,
+            check=True,
+        )
+
+
+def main() -> int:
+    """Author classification files for local checkouts; return the exit code."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("--dry-run", action="store_true", help="Print actions only")
+    parser.add_argument("--no-commit", action="store_true", help="Write files but skip git commit")
+    args = parser.parse_args()
+
+    excluded = load_excluded_slugs()
+    hub_repos = fetch_hub_repos()
+    local_repos = local_git_repos()
+
+    targets = sorted(set(local_repos) - excluded)
+    written: list[str] = []
+    skipped_human: list[str] = []
+    failed: list[str] = []
+
+    for slug in targets:
+        repo_path = local_repos[slug]
+        target = repo_path / ".repo-classification.yaml"
+        # Raw bytes of any existing file, kept for rollback on failed validation.
+        previous: bytes | None = None
+        if target.exists():
+            previous = target.read_bytes()
+            existing = yaml.safe_load(previous)
+            meta = existing.get("repo_classification") if isinstance(existing, dict) else None
+            if isinstance(meta, dict) and meta.get("classified_by") == "human":
+                skipped_human.append(slug)
+                continue
+
+        block = build_classification(slug, hub_repos.get(slug))
+        content = render_yaml(block)
+        if args.dry_run:
+            print(f"[dry-run] would write {target}")
+            continue
+
+        # render_yaml emits raw Unicode, so pin the encoding instead of using the locale.
+        target.write_text(content, encoding="utf-8")
+        if not validate_file(target):
+            # Do not leave an invalid file behind: restore the prior state.
+            if previous is None:
+                target.unlink()
+            else:
+                target.write_bytes(previous)
+            failed.append(slug)
+            continue
+        if not args.no_commit:
+            maybe_commit(repo_path, dry_run=False)
+        written.append(slug)
+
+    print(f"Written: {len(written)}")
+    for slug in written:
+        print(f"  + {slug}")
+    print(f"Skipped (human-reviewed): {len(skipped_human)}")
+    for slug in skipped_human:
+        print(f"  = {slug}")
+    if failed:
+        print(f"Failed validation: {len(failed)}", file=sys.stderr)
+        for slug in failed:
+            print(f"  ! {slug}", file=sys.stderr)
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
\ No newline at end of file
diff --git a/workplans/CUST-WP-0050-repo-classification-registration-redesign.md b/workplans/CUST-WP-0050-repo-classification-registration-redesign.md
index 36c7fa7..43cf372 100644
--- a/workplans/CUST-WP-0050-repo-classification-registration-redesign.md
+++ b/workplans/CUST-WP-0050-repo-classification-registration-redesign.md
@@ -4,7 +4,7 @@ type: workplan
 title: "Repo Classification & State Hub Registration Redesign"
 domain: custodian
 repo: the-custodian
-status: active
+status: finished
 owner: custodian
 topic_slug: custodian
 planning_priority: high
@@ -12,6 +12,7 @@ planning_order: 50
 created: "2026-06-22"
 updated: "2026-06-22"
 started: "2026-06-22"
+finished: "2026-06-22"
 state_hub_workstream_id: "9f031f48-8de8-48b6-8e69-d2d83ad70a7a"
 ---
 
@@ -305,7 +306,7 @@ Re-homed → STATE-WP-0065 P1 (re-anchor `repo_id` required + `workstream → wo
 
 ```task
 id: CUST-WP-0050-T11
-status: todo
+status: done
 priority: medium
 state_hub_task_id: 
"d8895c58-a930-42aa-8207-9babf9ba572a" ``` @@ -321,6 +322,21 @@ Done when every non-excluded active Gitea repo has a committed, validated classification file and a `managed_repo` row under the new taxonomy (or is on the recorded exclusion list). +**Done (2026-06-22):** + +- Exclusion list: `canon/standards/repo-classification.exclusions.yaml` (forks, + archived phantoms, templates/sandboxes, Gitea repos pending local checkout). +- Batch author: `tools/batch_author_repo_classifications.py` — agent first-pass + for 51 local repos (skips 10 human-reviewed custodian fixtures); all validated + against T01; committed in each target repo. +- Registration: 7 newly registered (`coordination-engine`, `human-resources`, + `markitect-main`, `repo-seed`, `tegwick-control`, `vantage-point`, + `whynot-control`); `make register-from-classification-all` updated 43 existing + rows from `classified_by: migration` → `agent` (0 invalid). +- **Coverage:** 63 active `managed_repos` — 11 `human`, 51 `agent`, 1 deferred + (`marki-docx`, hub-only, on exclusion list pending clone). Excluded locally: + `hub-core-seed`, `sand-boxer`. Archived hub rows (4) unchanged. + ## Open Questions / Decisions - **D1 (RESOLVED 2026-06-22): the repo is the primary anchor.** Workplans bind to @@ -335,8 +351,10 @@ recorded exclusion list). (e.g. `proj-` vs a dedicated grouping) and the archival trigger details. - **D2: classification ownership/approval.** Who approves each repo's `.repo-classification.yaml` — per-repo owner, or central custodian review? -- **D3: exclusion list.** Confirm exclusions (fork `tegwick/the-custodian`, - `lando_worsch/python-snake`, archived `test_domain_v2`, any inactive repos). +- **D3 (RESOLVED 2026-06-22): exclusion list.** Recorded at + `canon/standards/repo-classification.exclusions.yaml` — forks/personal repos, + archived phantoms, template/sandbox checkouts, and Gitea slugs pending local + checkout (incl. `marki-docx`). 
- **D4: behavioural vs descriptive.** Do `secondary_domains` / `capability_tags` / `business_stake` drive any Hub behaviour initially, or are they descriptive until a later phase?