From 0b16167769f222b76a2cdc07670f58e80024a8b9 Mon Sep 17 00:00:00 2001 From: tegwick Date: Fri, 15 May 2026 12:48:41 +0200 Subject: [PATCH] Add self-scoping assessment comparison --- docs/self-scoping/README.md | 12 + src/repo_registry/cli.py | 42 ++++ src/repo_registry/self_scoping/__init__.py | 3 +- src/repo_registry/self_scoping/comparison.py | 238 ++++++++++++++++++ tests/test_cli.py | 23 ++ tests/test_self_scoping_comparison.py | 62 +++++ ...P-0013-self-scoping-baseline-evaluation.md | 10 +- 7 files changed, 388 insertions(+), 2 deletions(-) create mode 100644 src/repo_registry/self_scoping/comparison.py create mode 100644 tests/test_self_scoping_comparison.py diff --git a/docs/self-scoping/README.md b/docs/self-scoping/README.md index 1eca3de..9274060 100644 --- a/docs/self-scoping/README.md +++ b/docs/self-scoping/README.md @@ -48,3 +48,15 @@ The command reads an existing registry database and does not clone or scan the target repository. It records the target analysis metadata, candidate graph, approved map at export time, review decisions, fact and content summaries, known regression patterns, and current repo-scoping engine identity. + +Compare an assessment against the curated golden profile: + +```bash +repo-scoping compare-assessment \ + --golden docs/self-scoping/golden/repo-scoping-golden-profile.v1.json \ + --assessment docs/self-scoping/assessments/repo-scoping-known-bad-2026-05-15-run-39.json \ + --format markdown +``` + +The first comparison report highlights missing expected capabilities, forbidden +native capabilities, known regression patterns, and misplaced API/CLI features. 
diff --git a/src/repo_registry/cli.py b/src/repo_registry/cli.py index 83a0f21..2e0ce73 100644 --- a/src/repo_registry/cli.py +++ b/src/repo_registry/cli.py @@ -9,6 +9,12 @@ from repo_registry.core.service import RegistryService from repo_registry.llm_extraction import LLMCandidateExtractor, create_llm_connect_adapter from repo_registry.repo_ingestion.git import GitIngestionService from repo_registry.self_scoping.assessment import artifact_json, export_assessment_artifact +from repo_registry.self_scoping.comparison import ( + compare_assessment_to_golden, + comparison_json, + comparison_markdown, + load_json, +) from repo_registry.storage.sqlite import NotFoundError, RegistryStore from repo_registry.web_api.app import Settings @@ -76,6 +82,23 @@ def build_parser() -> argparse.ArgumentParser: export.add_argument("--summary", help="Assessment summary override.") export.add_argument("--database-path", help="Override REPO_REGISTRY_DATABASE_PATH.") export.add_argument("--checkout-root", help="Override REPO_REGISTRY_CHECKOUT_ROOT.") + compare = subparsers.add_parser( + "compare-assessment", + help="Compare a self-scoping assessment artifact against a golden profile.", + ) + compare.add_argument("--golden", required=True, help="Golden profile JSON path.") + compare.add_argument( + "--assessment", + required=True, + help="Assessment artifact JSON path.", + ) + compare.add_argument("--output", help="Write comparison report to this path instead of stdout.") + compare.add_argument( + "--format", + choices=["json", "markdown"], + default="markdown", + help="Comparison report format.", + ) return parser @@ -86,6 +109,8 @@ def main(argv: Sequence[str] | None = None) -> int: return rebuild_characteristics_command(args, parser) if args.command == "export-assessment": return export_assessment_command(args, parser) + if args.command == "compare-assessment": + return compare_assessment_command(args) parser.error(f"unknown command: {args.command}") return 2 @@ -122,6 +147,23 @@ def 
from __future__ import annotations

import argparse
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Any


# --- src/repo_registry/self_scoping/comparison.py ---------------------------
# (compare_assessment_to_golden is also re-exported from
# repo_registry.self_scoping.__init__ next to export_assessment_artifact.)

COMPARISON_SCHEMA_VERSION = "self-scoping-comparison/v1"

# Capability classes under which an API/CLI feature is reported as misplaced.
_ROUTING_CLASSES = frozenset({"llm-integration", "provider-routing"})
# Feature types that are checked for misplacement.
_SURFACE_FEATURE_TYPES = frozenset({"API", "CLI"})
# Regression severities that force the overall status to "regression".
_BLOCKING_SEVERITIES = frozenset({"high", "critical"})


def load_json(path: str | Path) -> dict[str, Any]:
    """Read *path* as UTF-8 JSON and return the top-level object.

    Raises:
        OSError: the file cannot be read.
        ValueError: the content is not valid JSON (``json.JSONDecodeError``
            is a ``ValueError``) or its top level is not a JSON object.
    """
    payload = json.loads(Path(path).read_text(encoding="utf-8"))
    if not isinstance(payload, dict):
        # Fail here with the offending path instead of a later, unrelated
        # AttributeError on ``.get``.
        raise ValueError(
            f"{path}: expected a JSON object at the top level, "
            f"got {type(payload).__name__}"
        )
    return payload


def compare_assessment_to_golden(
    golden_profile: dict[str, Any],
    assessment: dict[str, Any],
) -> dict[str, Any]:
    """Compare an assessment artifact against a golden profile.

    Capabilities are matched by exact ``name``. The returned dict carries the
    schema version, identifiers, a UTC ``created_at`` timestamp, the derived
    ``status``/``summary``, the matched / missing / unexpected / forbidden
    capability name lists (sorted), the assessment's known regression
    patterns, the misplaced API/CLI features and the review hints.

    Keys that are absent or explicitly ``null`` in either input are treated
    as empty.
    """
    expected = _expected_capabilities(golden_profile)
    forbidden = _forbidden_capabilities(golden_profile)
    generated = _generated_capabilities(assessment)
    generated_names = set(generated)

    missing_expected = sorted(expected - generated_names)
    matched_expected = sorted(expected & generated_names)
    forbidden_present = sorted(forbidden & generated_names)
    # ``or`` also covers an explicit JSON null, which ``.get(key, [])`` does not.
    known_regressions = assessment.get("known_regression_patterns") or []
    target_repository = assessment.get("target_repository") or {}
    misplaced_features = _misplaced_features(generated)
    status = _status(
        missing_expected=missing_expected,
        forbidden_present=forbidden_present,
        known_regressions=known_regressions,
        misplaced_features=misplaced_features,
    )
    # Second-resolution UTC timestamp with a "Z" suffix.
    created_at = (
        datetime.now(timezone.utc)
        .replace(microsecond=0)
        .isoformat()
        .replace("+00:00", "Z")
    )

    return {
        "schema_version": COMPARISON_SCHEMA_VERSION,
        "comparison_id": _comparison_id(golden_profile, assessment),
        "created_at": created_at,
        "golden_profile_id": golden_profile.get("profile_id", ""),
        "assessment_artifact_id": assessment.get("artifact_id", ""),
        "target_repo_slug": target_repository.get("repo_slug", ""),
        "status": status,
        "summary": _summary(
            status,
            missing_expected,
            forbidden_present,
            known_regressions,
        ),
        "matched_expected_capabilities": matched_expected,
        "missing_expected_capabilities": missing_expected,
        "unexpected_native_capabilities": _unexpected_capabilities(
            generated_names,
            expected,
            forbidden,
        ),
        "forbidden_native_capabilities_present": forbidden_present,
        "known_regression_patterns": known_regressions,
        "misplaced_features": misplaced_features,
        "comparison_hints": _comparison_hints(status),
    }


def comparison_json(comparison: dict[str, Any]) -> str:
    """Serialize *comparison* as indented, key-sorted JSON ending in a newline."""
    return json.dumps(comparison, indent=2, sort_keys=True) + "\n"


def comparison_markdown(comparison: dict[str, Any]) -> str:
    """Render *comparison* as a Markdown report.

    The trailing empty entry makes the joined text end with a newline.
    ``unexpected_native_capabilities`` is part of the comparison dict but is
    not rendered here.
    """
    lines = [
        f"# Self-Scoping Comparison: {comparison['assessment_artifact_id']}",
        "",
        f"- Status: `{comparison['status']}`",
        f"- Golden profile: `{comparison['golden_profile_id']}`",
        f"- Target repo: `{comparison['target_repo_slug']}`",
        f"- Summary: {comparison['summary']}",
        "",
        "## Missing Expected Capabilities",
        *_bullets(comparison["missing_expected_capabilities"]),
        "",
        "## Forbidden Native Capabilities Present",
        *_bullets(comparison["forbidden_native_capabilities_present"]),
        "",
        "## Known Regression Patterns",
        *_regression_bullets(comparison["known_regression_patterns"]),
        "",
        "## Misplaced Features",
        *_misplaced_feature_bullets(comparison["misplaced_features"]),
        "",
        "## Matched Expected Capabilities",
        *_bullets(comparison["matched_expected_capabilities"]),
        "",
        "## Review Hints",
        *_bullets(comparison["comparison_hints"]),
        "",
    ]
    return "\n".join(lines)


def _capability_names(entries: list[dict[str, Any]] | None) -> set[str]:
    """Collect the non-empty ``name`` values of *entries* (``None`` -> empty)."""
    return {entry["name"] for entry in entries or [] if entry.get("name")}


def _expected_capabilities(golden_profile: dict[str, Any]) -> set[str]:
    """Names listed under ``ability.expected_capabilities`` of the profile."""
    ability = golden_profile.get("ability") or {}
    return _capability_names(ability.get("expected_capabilities"))


def _forbidden_capabilities(golden_profile: dict[str, Any]) -> set[str]:
    """Names listed under ``forbidden_native_capabilities`` of the profile."""
    return _capability_names(golden_profile.get("forbidden_native_capabilities"))


def _generated_capabilities(assessment: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Index every named capability in ``generated_tree.abilities`` by name.

    When the same name occurs more than once, the last occurrence wins.
    """
    result: dict[str, dict[str, Any]] = {}
    generated_tree = assessment.get("generated_tree") or {}
    for ability in generated_tree.get("abilities") or []:
        for capability in ability.get("capabilities") or []:
            name = capability.get("name")
            if name:
                result[name] = capability
    return result


def _unexpected_capabilities(
    generated_names: set[str],
    expected: set[str],
    forbidden: set[str],
) -> list[str]:
    """Sorted generated names that are neither expected nor forbidden."""
    return sorted(generated_names - expected - forbidden)


def _misplaced_features(
    generated: dict[str, dict[str, Any]],
) -> list[dict[str, str]]:
    """Report API/CLI features nested below a routing-class capability.

    A capability qualifies when its ``primary_class`` is ``llm-integration``
    or ``provider-routing``; a feature qualifies when its ``type`` is ``API``
    or ``CLI``. The ``reason`` names the capability's actual class.
    """
    misplaced: list[dict[str, str]] = []
    for capability_name, capability in generated.items():
        primary_class = capability.get("primary_class", "")
        if primary_class not in _ROUTING_CLASSES:
            continue
        for feature in capability.get("features") or []:
            feature_type = feature.get("type")
            if feature_type not in _SURFACE_FEATURE_TYPES:
                continue
            misplaced.append(
                {
                    "capability": capability_name,
                    "feature": feature.get("name", ""),
                    "feature_type": feature_type,
                    "reason": (
                        "API/CLI surface is nested below "
                        f"{primary_class} capability."
                    ),
                }
            )
    return misplaced


def _status(
    *,
    missing_expected: list[str],
    forbidden_present: list[str],
    known_regressions: list[dict[str, Any]],
    misplaced_features: list[dict[str, str]],
) -> str:
    """Classify the comparison.

    ``regression`` when a forbidden capability or misplaced feature exists,
    or any regression pattern has severity ``high``/``critical``;
    ``needs_review`` when anything expected is missing or any regression
    pattern is reported; otherwise ``candidate_improvement``.
    """
    has_blocking_regression = any(
        item.get("severity") in _BLOCKING_SEVERITIES for item in known_regressions
    )
    if forbidden_present or misplaced_features or has_blocking_regression:
        return "regression"
    if missing_expected or known_regressions:
        return "needs_review"
    return "candidate_improvement"


def _summary(
    status: str,
    missing_expected: list[str],
    forbidden_present: list[str],
    known_regressions: list[dict[str, Any]],
) -> str:
    """One-sentence summary for *status*.

    ``forbidden_present`` is accepted but does not appear in any of the
    generated sentences.
    """
    if status == "regression":
        return (
            "Assessment repeats known or forbidden self-scoping patterns; prefer "
            "the golden profile until the engine is corrected."
        )
    if status == "needs_review":
        return (
            f"Assessment needs review: {len(missing_expected)} expected "
            f"capability(s) missing and {len(known_regressions)} regression "
            "pattern(s) reported."
        )
    return "Assessment covers the golden profile without known regression patterns."


# --- src/repo_registry/cli.py ------------------------------------------------

def compare_assessment_command(args: argparse.Namespace) -> int:
    """Run ``compare-assessment``: load both JSON files and emit the report.

    Reads ``args.golden`` and ``args.assessment``, renders JSON when
    ``args.format == "json"`` and Markdown otherwise, and writes to
    ``args.output`` when set or to stdout when not.

    Returns:
        0 on success; 1 when an input cannot be read or parsed or the output
        cannot be written (the reason is printed to stderr).
    """
    import sys  # local import: whether cli.py imports sys is not visible here

    try:
        golden_profile = load_json(args.golden)
        assessment = load_json(args.assessment)
    except (OSError, ValueError) as exc:
        # Covers missing files, undecodable bytes and malformed JSON.
        print(f"compare-assessment: {exc}", file=sys.stderr)
        return 1

    comparison = compare_assessment_to_golden(golden_profile, assessment)
    content = (
        comparison_json(comparison)
        if args.format == "json"
        else comparison_markdown(comparison)
    )
    if args.output:
        try:
            Path(args.output).write_text(content, encoding="utf-8")
        except OSError as exc:
            print(f"compare-assessment: {exc}", file=sys.stderr)
            return 1
    else:
        # Avoid a doubled newline when the report already ends with one.
        print(content, end="" if content.endswith("\n") else "\n")
    return 0
# Review hints per comparison status. Stored as tuples so the shared data
# cannot be mutated through a returned list.
_HINTS_BY_STATUS = {
    "regression": (
        "Do not promote this assessment as a preferred baseline.",
        "Inspect forbidden capabilities and misplaced features first.",
        "Use the findings as signal for scanner, generator, or acceptance-policy changes.",
    ),
    "needs_review": (
        "Review missing expected capabilities before choosing old or new output.",
        "Check whether the golden profile needs a curator-approved update.",
    ),
}

# Used for every status that has no dedicated entry above.
_FALLBACK_HINTS = (
    "Candidate appears better than the known golden checks.",
    "Human or agentic review should still confirm source evidence quality.",
)

_EMPTY_BULLETS = "- None"


def _comparison_hints(status: str) -> list[str]:
    """Return a fresh list of review hints for *status*.

    ``regression`` and ``needs_review`` have their own hints; any other
    status gets the candidate-improvement hints.
    """
    return list(_HINTS_BY_STATUS.get(status, _FALLBACK_HINTS))


def _comparison_id(
    golden_profile: dict[str, Any],
    assessment: dict[str, Any],
) -> str:
    """Build ``<profile_id>__<artifact_id>``.

    A missing ``profile_id`` falls back to ``golden`` and a missing
    ``artifact_id`` to ``assessment``.
    """
    profile_part = golden_profile.get("profile_id", "golden")
    artifact_part = assessment.get("artifact_id", "assessment")
    return f"{profile_part}__{artifact_part}"


def _bullets(items: list[str]) -> list[str]:
    """Format *items* as Markdown bullets, or a single ``- None`` when empty."""
    rendered = [f"- {entry}" for entry in items]
    return rendered if items else [_EMPTY_BULLETS]


def _regression_bullets(items: list[dict[str, Any]]) -> list[str]:
    """Format regression patterns as ``- `id` title: description`` bullets.

    Missing fields render as empty strings; an empty input yields ``- None``.
    """
    if not items:
        return [_EMPTY_BULLETS]
    rendered: list[str] = []
    for pattern in items:
        pattern_id = pattern.get("id", "")
        title = pattern.get("title", "")
        description = pattern.get("description", "")
        rendered.append(f"- `{pattern_id}` {title}: {description}")
    return rendered


def _misplaced_feature_bullets(items: list[dict[str, str]]) -> list[str]:
    """Format misplaced-feature records as Markdown bullets.

    Each record must carry ``feature``, ``capability``, ``feature_type`` and
    ``reason``; an empty input yields ``- None``.
    """
    if not items:
        return [_EMPTY_BULLETS]
    rendered: list[str] = []
    for record in items:
        location = f"- `{record['feature']}` under `{record['capability']}` "
        detail = f"({record['feature_type']}): {record['reason']}"
        rendered.append(location + detail)
    return rendered
"comparison.md" + + exit_code = main( + [ + "compare-assessment", + "--golden", + "docs/self-scoping/golden/repo-scoping-golden-profile.v1.json", + "--assessment", + "docs/self-scoping/assessments/repo-scoping-known-bad-2026-05-15-run-39.json", + "--output", + str(output_path), + "--format", + "markdown", + ] + ) + + report = output_path.read_text(encoding="utf-8") + assert exit_code == 0 + assert "Status: `regression`" in report + assert "Route LLM Requests Across Providers" in report diff --git a/tests/test_self_scoping_comparison.py b/tests/test_self_scoping_comparison.py new file mode 100644 index 0000000..da5f1b9 --- /dev/null +++ b/tests/test_self_scoping_comparison.py @@ -0,0 +1,62 @@ +from pathlib import Path + +from repo_registry.self_scoping.comparison import ( + compare_assessment_to_golden, + comparison_markdown, + load_json, +) + + +ROOT = Path(__file__).resolve().parents[1] +GOLDEN_PROFILE = ( + ROOT + / "docs" + / "self-scoping" + / "golden" + / "repo-scoping-golden-profile.v1.json" +) +KNOWN_BAD = ( + ROOT + / "docs" + / "self-scoping" + / "assessments" + / "repo-scoping-known-bad-2026-05-15-run-39.json" +) + + +def test_compare_known_bad_assessment_to_golden_flags_regression(): + comparison = compare_assessment_to_golden( + load_json(GOLDEN_PROFILE), + load_json(KNOWN_BAD), + ) + + assert comparison["schema_version"] == "self-scoping-comparison/v1" + assert comparison["status"] == "regression" + assert "Route LLM Requests Across Providers" in comparison[ + "forbidden_native_capabilities_present" + ] + assert "Scan Repositories Into Observed Facts" in comparison[ + "missing_expected_capabilities" + ] + assert {item["id"] for item in comparison["known_regression_patterns"]} >= { + "RREG-SELF-REG-001", + "RREG-SELF-REG-002", + "RREG-SELF-REG-003", + } + assert any( + item["feature_type"] == "API" for item in comparison["misplaced_features"] + ) + + +def test_comparison_markdown_summarizes_actionable_sections(): + comparison = 
compare_assessment_to_golden( + load_json(GOLDEN_PROFILE), + load_json(KNOWN_BAD), + ) + + markdown = comparison_markdown(comparison) + + assert "# Self-Scoping Comparison" in markdown + assert "## Missing Expected Capabilities" in markdown + assert "## Forbidden Native Capabilities Present" in markdown + assert "Route LLM Requests Across Providers" in markdown diff --git a/workplans/RREG-WP-0013-self-scoping-baseline-evaluation.md b/workplans/RREG-WP-0013-self-scoping-baseline-evaluation.md index 694ca35..20cf7b9 100644 --- a/workplans/RREG-WP-0013-self-scoping-baseline-evaluation.md +++ b/workplans/RREG-WP-0013-self-scoping-baseline-evaluation.md @@ -168,7 +168,7 @@ the exporter and CLI path. ```task id: RREG-WP-0013-T05 -status: todo +status: done priority: high state_hub_task_id: "2b71069b-6150-45f4-84a2-59f5ec1e04c0" ``` @@ -196,6 +196,14 @@ Acceptance criteria: - It can compare deterministic-only and agent-reviewed runs without losing provenance. +Implementation note 2026-05-15: added +`src/repo_registry/self_scoping/comparison.py` and the +`repo-scoping compare-assessment` CLI command. The first comparison report +checks assessment artifacts against the repo-scoping golden profile, reports +missing expected capabilities, forbidden native capability matches, known +regression patterns, and misplaced API/CLI features under provider-routing +capabilities. Reports can be emitted as JSON or Markdown. + ## T06: Add Side-By-Side Review UI ```task