generated from coulomb/repo-seed
Add self-scoping assessment comparison
This commit is contained in:
@@ -48,3 +48,15 @@ The command reads an existing registry database and does not clone or scan the
|
||||
target repository. It records the target analysis metadata, candidate graph,
|
||||
approved map at export time, review decisions, fact and content summaries, known
|
||||
regression patterns, and current repo-scoping engine identity.
|
||||
|
||||
Compare an assessment against the curated golden profile:
|
||||
|
||||
```bash
|
||||
repo-scoping compare-assessment \
|
||||
--golden docs/self-scoping/golden/repo-scoping-golden-profile.v1.json \
|
||||
--assessment docs/self-scoping/assessments/repo-scoping-known-bad-2026-05-15-run-39.json \
|
||||
--format markdown
|
||||
```
|
||||
|
||||
The first comparison report highlights missing expected capabilities, forbidden
|
||||
native capabilities, known regression patterns, and misplaced API/CLI features.
|
||||
|
||||
@@ -9,6 +9,12 @@ from repo_registry.core.service import RegistryService
|
||||
from repo_registry.llm_extraction import LLMCandidateExtractor, create_llm_connect_adapter
|
||||
from repo_registry.repo_ingestion.git import GitIngestionService
|
||||
from repo_registry.self_scoping.assessment import artifact_json, export_assessment_artifact
|
||||
from repo_registry.self_scoping.comparison import (
|
||||
compare_assessment_to_golden,
|
||||
comparison_json,
|
||||
comparison_markdown,
|
||||
load_json,
|
||||
)
|
||||
from repo_registry.storage.sqlite import NotFoundError, RegistryStore
|
||||
from repo_registry.web_api.app import Settings
|
||||
|
||||
@@ -76,6 +82,23 @@ def build_parser() -> argparse.ArgumentParser:
|
||||
export.add_argument("--summary", help="Assessment summary override.")
|
||||
export.add_argument("--database-path", help="Override REPO_REGISTRY_DATABASE_PATH.")
|
||||
export.add_argument("--checkout-root", help="Override REPO_REGISTRY_CHECKOUT_ROOT.")
|
||||
compare = subparsers.add_parser(
|
||||
"compare-assessment",
|
||||
help="Compare a self-scoping assessment artifact against a golden profile.",
|
||||
)
|
||||
compare.add_argument("--golden", required=True, help="Golden profile JSON path.")
|
||||
compare.add_argument(
|
||||
"--assessment",
|
||||
required=True,
|
||||
help="Assessment artifact JSON path.",
|
||||
)
|
||||
compare.add_argument("--output", help="Write comparison report to this path instead of stdout.")
|
||||
compare.add_argument(
|
||||
"--format",
|
||||
choices=["json", "markdown"],
|
||||
default="markdown",
|
||||
help="Comparison report format.",
|
||||
)
|
||||
return parser
|
||||
|
||||
|
||||
@@ -86,6 +109,8 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
return rebuild_characteristics_command(args, parser)
|
||||
if args.command == "export-assessment":
|
||||
return export_assessment_command(args, parser)
|
||||
if args.command == "compare-assessment":
|
||||
return compare_assessment_command(args)
|
||||
parser.error(f"unknown command: {args.command}")
|
||||
return 2
|
||||
|
||||
@@ -122,6 +147,23 @@ def rebuild_characteristics_command(
|
||||
return 0
|
||||
|
||||
|
||||
def compare_assessment_command(args: argparse.Namespace) -> int:
    """Run the ``compare-assessment`` subcommand.

    Loads the golden profile and the assessment artifact named by
    ``args.golden`` and ``args.assessment``, compares them, and renders the
    result as JSON when ``args.format`` is ``"json"`` and as Markdown
    otherwise. The report goes to ``args.output`` when that is set and to
    stdout when it is not.

    Returns:
        Always ``0``.
    """
    golden_profile = load_json(args.golden)
    assessment = load_json(args.assessment)
    comparison = compare_assessment_to_golden(golden_profile, assessment)

    if args.format == "json":
        content = comparison_json(comparison)
    else:
        content = comparison_markdown(comparison)

    if not args.output:
        # Finish stdout output with exactly one newline, whatever the renderer
        # produced.
        terminator = "" if content.endswith("\n") else "\n"
        print(content, end=terminator)
        return 0

    Path(args.output).write_text(content, encoding="utf-8")
    return 0
|
||||
|
||||
|
||||
def export_assessment_command(
|
||||
args: argparse.Namespace,
|
||||
parser: argparse.ArgumentParser,
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
from repo_registry.self_scoping.assessment import export_assessment_artifact
|
||||
from repo_registry.self_scoping.comparison import compare_assessment_to_golden
|
||||
|
||||
__all__ = ["export_assessment_artifact"]
|
||||
__all__ = ["compare_assessment_to_golden", "export_assessment_artifact"]
|
||||
|
||||
238
src/repo_registry/self_scoping/comparison.py
Normal file
238
src/repo_registry/self_scoping/comparison.py
Normal file
@@ -0,0 +1,238 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
|
||||
COMPARISON_SCHEMA_VERSION = "self-scoping-comparison/v1"
|
||||
|
||||
|
||||
def load_json(path: str | Path) -> dict[str, Any]:
|
||||
return json.loads(Path(path).read_text(encoding="utf-8"))
|
||||
|
||||
|
||||
def compare_assessment_to_golden(
    golden_profile: dict[str, Any],
    assessment: dict[str, Any],
) -> dict[str, Any]:
    """Compare an assessment artifact against a golden profile.

    Args:
        golden_profile: Decoded golden profile document.
        assessment: Decoded assessment artifact document.

    Returns:
        A comparison report dict carrying the schema version, identifiers, a
        creation timestamp, the overall status and summary, the capability
        name lists (matched, missing, unexpected, forbidden-present), the
        assessment's known regression patterns, misplaced features, and
        review hints.
    """
    expected = _expected_capabilities(golden_profile)
    forbidden = _forbidden_capabilities(golden_profile)
    generated = _generated_capabilities(assessment)
    generated_names = set(generated)

    missing_expected = sorted(expected - generated_names)
    matched_expected = sorted(expected & generated_names)
    forbidden_present = sorted(forbidden & generated_names)

    # A JSON ``null`` decodes to None, and ``dict.get(key, default)`` returns
    # that None instead of the default. Normalize so iteration and the nested
    # lookup below cannot fail on an explicitly-null field.
    known_regressions = assessment.get("known_regression_patterns") or []
    target_repository = assessment.get("target_repository") or {}

    misplaced_features = _misplaced_features(generated)
    status = _status(
        missing_expected=missing_expected,
        forbidden_present=forbidden_present,
        known_regressions=known_regressions,
        misplaced_features=misplaced_features,
    )

    # Second-precision UTC timestamp with a "Z" suffix instead of "+00:00".
    created_at = (
        datetime.now(UTC).replace(microsecond=0).isoformat().replace("+00:00", "Z")
    )

    return {
        "schema_version": COMPARISON_SCHEMA_VERSION,
        "comparison_id": _comparison_id(golden_profile, assessment),
        "created_at": created_at,
        "golden_profile_id": golden_profile.get("profile_id", ""),
        "assessment_artifact_id": assessment.get("artifact_id", ""),
        "target_repo_slug": target_repository.get("repo_slug", ""),
        "status": status,
        "summary": _summary(status, missing_expected, forbidden_present, known_regressions),
        "matched_expected_capabilities": matched_expected,
        "missing_expected_capabilities": missing_expected,
        "unexpected_native_capabilities": _unexpected_capabilities(
            generated_names,
            expected,
            forbidden,
        ),
        "forbidden_native_capabilities_present": forbidden_present,
        "known_regression_patterns": known_regressions,
        "misplaced_features": misplaced_features,
        "comparison_hints": _comparison_hints(status),
    }
|
||||
|
||||
|
||||
def comparison_json(comparison: dict[str, Any]) -> str:
    """Serialize a comparison report as indented, key-sorted JSON.

    The returned text ends with a single newline.
    """
    serialized = json.dumps(comparison, sort_keys=True, indent=2)
    return f"{serialized}\n"
|
||||
|
||||
|
||||
def comparison_markdown(comparison: dict[str, Any]) -> str:
    """Render a comparison report as a Markdown document.

    The document starts with a title and a status/profile/repo/summary list,
    followed by one bulleted section per finding category. Each section is
    followed by a blank line, so the text ends with a newline.
    """
    header = [
        f"# Self-Scoping Comparison: {comparison['assessment_artifact_id']}",
        "",
        f"- Status: `{comparison['status']}`",
        f"- Golden profile: `{comparison['golden_profile_id']}`",
        f"- Target repo: `{comparison['target_repo_slug']}`",
        f"- Summary: {comparison['summary']}",
        "",
    ]
    sections = (
        (
            "## Missing Expected Capabilities",
            _bullets(comparison["missing_expected_capabilities"]),
        ),
        (
            "## Forbidden Native Capabilities Present",
            _bullets(comparison["forbidden_native_capabilities_present"]),
        ),
        (
            "## Known Regression Patterns",
            _regression_bullets(comparison["known_regression_patterns"]),
        ),
        (
            "## Misplaced Features",
            _misplaced_feature_bullets(comparison["misplaced_features"]),
        ),
        (
            "## Matched Expected Capabilities",
            _bullets(comparison["matched_expected_capabilities"]),
        ),
        (
            "## Review Hints",
            _bullets(comparison["comparison_hints"]),
        ),
    )

    document = list(header)
    for heading, body in sections:
        document.append(heading)
        document.extend(body)
        document.append("")
    return "\n".join(document)
|
||||
|
||||
|
||||
def _expected_capabilities(golden_profile: dict[str, Any]) -> set[str]:
    """Collect the non-empty names under ``ability.expected_capabilities``."""
    entries = golden_profile.get("ability", {}).get("expected_capabilities", [])
    names: set[str] = set()
    for entry in entries:
        # Entries with a missing or empty name are ignored.
        if entry.get("name"):
            names.add(entry["name"])
    return names
|
||||
|
||||
|
||||
def _forbidden_capabilities(golden_profile: dict[str, Any]) -> set[str]:
    """Collect the non-empty names under ``forbidden_native_capabilities``."""
    names: set[str] = set()
    for entry in golden_profile.get("forbidden_native_capabilities", []):
        # Entries with a missing or empty name are ignored.
        if entry.get("name"):
            names.add(entry["name"])
    return names
|
||||
|
||||
|
||||
def _generated_capabilities(assessment: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Index the assessment's generated capabilities by name.

    Walks every ability under ``generated_tree.abilities`` and keeps each
    capability that has a non-empty name. When a name occurs more than once,
    the last occurrence wins.
    """
    tree = assessment.get("generated_tree", {})
    return {
        capability.get("name"): capability
        for ability in tree.get("abilities", [])
        for capability in ability.get("capabilities", [])
        if capability.get("name")
    }
|
||||
|
||||
|
||||
def _unexpected_capabilities(
    generated_names: set[str],
    expected: set[str],
    forbidden: set[str],
) -> list[str]:
    """Return, sorted, the generated names that are neither expected nor forbidden."""
    leftovers = generated_names - (expected | forbidden)
    return sorted(leftovers)
|
||||
|
||||
|
||||
def _misplaced_features(
    generated: dict[str, dict[str, Any]],
) -> list[dict[str, str]]:
    """Find API/CLI features nested under LLM-integration style capabilities.

    Args:
        generated: Generated capabilities keyed by capability name.

    Returns:
        One entry per feature of type ``"API"`` or ``"CLI"`` that sits under a
        capability whose ``primary_class`` is ``"llm-integration"`` or
        ``"provider-routing"``. Each entry carries the capability name, the
        feature name, the feature type, and a reason.
    """
    flagged_classes = {"llm-integration", "provider-routing"}
    surface_types = {"API", "CLI"}

    misplaced: list[dict[str, str]] = []
    for capability_name, capability in generated.items():
        primary_class = capability.get("primary_class", "")
        if primary_class not in flagged_classes:
            continue
        # ``features`` may be present but null in the artifact; treat that the
        # same as an absent list.
        for feature in capability.get("features") or []:
            feature_type = feature.get("type")
            if feature_type not in surface_types:
                continue
            misplaced.append(
                {
                    "capability": capability_name,
                    "feature": feature.get("name", ""),
                    "feature_type": feature_type,
                    # Name the class that actually matched instead of always
                    # claiming provider-routing.
                    "reason": f"API/CLI surface is nested below {primary_class} capability.",
                }
            )
    return misplaced
|
||||
|
||||
|
||||
def _status(
    *,
    missing_expected: list[str],
    forbidden_present: list[str],
    known_regressions: list[dict[str, Any]],
    misplaced_features: list[dict[str, str]],
) -> str:
    """Classify a comparison as regression, needs_review, or candidate_improvement.

    ``"regression"`` is returned when any forbidden capability or misplaced
    feature is present, or when any regression pattern has severity ``"high"``
    or ``"critical"``. Otherwise ``"needs_review"`` is returned when anything
    is missing or any regression pattern is reported, and
    ``"candidate_improvement"`` when nothing was found.
    """
    if forbidden_present or misplaced_features:
        return "regression"
    for pattern in known_regressions:
        if pattern.get("severity") in {"high", "critical"}:
            return "regression"
    if missing_expected or known_regressions:
        return "needs_review"
    return "candidate_improvement"
|
||||
|
||||
|
||||
def _summary(
    status: str,
    missing_expected: list[str],
    forbidden_present: list[str],
    known_regressions: list[dict[str, Any]],
) -> str:
    """Return the one-sentence summary for a comparison status.

    Only the ``"needs_review"`` sentence uses the counts of missing
    capabilities and regression patterns; ``forbidden_present`` is accepted
    but not used in any sentence.
    """
    if status not in ("regression", "needs_review"):
        return "Assessment covers the golden profile without known regression patterns."

    if status == "needs_review":
        missing_count = len(missing_expected)
        regression_count = len(known_regressions)
        return (
            f"Assessment needs review: {missing_count} expected "
            f"capability(s) missing and {regression_count} regression "
            "pattern(s) reported."
        )

    return (
        "Assessment repeats known or forbidden self-scoping patterns; prefer "
        "the golden profile until the engine is corrected."
    )
|
||||
|
||||
|
||||
def _comparison_hints(status: str) -> list[str]:
    """Return the reviewer hints for a comparison status.

    Any status other than ``"regression"`` or ``"needs_review"`` gets the
    candidate-improvement hints.
    """
    hints_by_status = {
        "regression": [
            "Do not promote this assessment as a preferred baseline.",
            "Inspect forbidden capabilities and misplaced features first.",
            "Use the findings as signal for scanner, generator, or acceptance-policy changes.",
        ],
        "needs_review": [
            "Review missing expected capabilities before choosing old or new output.",
            "Check whether the golden profile needs a curator-approved update.",
        ],
    }
    fallback = [
        "Candidate appears better than the known golden checks.",
        "Human or agentic review should still confirm source evidence quality.",
    ]
    return hints_by_status.get(status, fallback)
|
||||
|
||||
|
||||
def _comparison_id(
    golden_profile: dict[str, Any],
    assessment: dict[str, Any],
) -> str:
    """Build the comparison id as ``<profile_id>__<artifact_id>``.

    A missing ``profile_id`` falls back to ``"golden"`` and a missing
    ``artifact_id`` to ``"assessment"``.
    """
    profile_part = golden_profile.get("profile_id", "golden")
    artifact_part = assessment.get("artifact_id", "assessment")
    return f"{profile_part}__{artifact_part}"
|
||||
|
||||
|
||||
def _bullets(items: list[str]) -> list[str]:
    """Render each item as a Markdown bullet, or a single ``- None`` when empty."""
    rendered = [f"- {entry}" for entry in items] if items else []
    return rendered or ["- None"]
|
||||
|
||||
|
||||
def _regression_bullets(items: list[dict[str, Any]]) -> list[str]:
    """Render regression patterns as Markdown bullets.

    Each bullet shows the pattern id in backticks, then its title and
    description; missing fields render as empty text. An empty input yields a
    single ``- None`` bullet.
    """
    if not items:
        return ["- None"]

    rendered: list[str] = []
    for pattern in items:
        pattern_id = pattern.get("id", "")
        title = pattern.get("title", "")
        description = pattern.get("description", "")
        rendered.append(f"- `{pattern_id}` {title}: {description}")
    return rendered
|
||||
|
||||
|
||||
def _misplaced_feature_bullets(items: list[dict[str, str]]) -> list[str]:
    """Render misplaced-feature entries as Markdown bullets.

    Every entry must carry ``feature``, ``capability``, ``feature_type`` and
    ``reason``. An empty input yields a single ``- None`` bullet.
    """
    if not items:
        return ["- None"]

    rendered: list[str] = []
    for entry in items:
        feature = entry["feature"]
        capability = entry["capability"]
        feature_type = entry["feature_type"]
        reason = entry["reason"]
        rendered.append(f"- `{feature}` under `{capability}` ({feature_type}): {reason}")
    return rendered
|
||||
@@ -131,3 +131,26 @@ def test_export_assessment_cli_writes_completed_run_artifact(tmp_path):
|
||||
assert artifact["execution"]["analysis_run_id"] == summary.analysis_run.id
|
||||
assert artifact["assessment"]["role"] == "challenger"
|
||||
assert artifact["generated_tree"]["abilities"]
|
||||
|
||||
|
||||
def test_compare_assessment_cli_writes_markdown_report(tmp_path):
    """The CLI writes a Markdown report that flags the known-bad run."""
    output_path = tmp_path / "comparison.md"
    # NOTE(review): both fixture paths are relative, so this presumably relies
    # on the test run starting from the repository root — confirm.
    argv = [
        "compare-assessment",
        "--golden",
        "docs/self-scoping/golden/repo-scoping-golden-profile.v1.json",
        "--assessment",
        "docs/self-scoping/assessments/repo-scoping-known-bad-2026-05-15-run-39.json",
        "--output",
        str(output_path),
        "--format",
        "markdown",
    ]

    exit_code = main(argv)
    report = output_path.read_text(encoding="utf-8")

    assert exit_code == 0
    assert "Status: `regression`" in report
    assert "Route LLM Requests Across Providers" in report
|
||||
|
||||
62
tests/test_self_scoping_comparison.py
Normal file
62
tests/test_self_scoping_comparison.py
Normal file
@@ -0,0 +1,62 @@
|
||||
from pathlib import Path
|
||||
|
||||
from repo_registry.self_scoping.comparison import (
|
||||
compare_assessment_to_golden,
|
||||
comparison_markdown,
|
||||
load_json,
|
||||
)
|
||||
|
||||
|
||||
# Repository root: the directory one level above the one holding this file.
ROOT = Path(__file__).resolve().parents[1]
# Fixture documents compared by the tests in this module.
GOLDEN_PROFILE = ROOT.joinpath(
    "docs",
    "self-scoping",
    "golden",
    "repo-scoping-golden-profile.v1.json",
)
KNOWN_BAD = ROOT.joinpath(
    "docs",
    "self-scoping",
    "assessments",
    "repo-scoping-known-bad-2026-05-15-run-39.json",
)
|
||||
|
||||
|
||||
def test_compare_known_bad_assessment_to_golden_flags_regression():
    """Comparing the known-bad run with the golden profile yields a regression."""
    golden = load_json(GOLDEN_PROFILE)
    known_bad = load_json(KNOWN_BAD)
    comparison = compare_assessment_to_golden(golden, known_bad)

    assert comparison["schema_version"] == "self-scoping-comparison/v1"
    assert comparison["status"] == "regression"

    forbidden_present = comparison["forbidden_native_capabilities_present"]
    assert "Route LLM Requests Across Providers" in forbidden_present

    missing = comparison["missing_expected_capabilities"]
    assert "Scan Repositories Into Observed Facts" in missing

    reported_ids = {item["id"] for item in comparison["known_regression_patterns"]}
    assert reported_ids >= {
        "RREG-SELF-REG-001",
        "RREG-SELF-REG-002",
        "RREG-SELF-REG-003",
    }

    feature_types = [item["feature_type"] for item in comparison["misplaced_features"]]
    assert "API" in feature_types
|
||||
|
||||
|
||||
def test_comparison_markdown_summarizes_actionable_sections():
    """The Markdown report contains the title, key headings, and the forbidden capability."""
    golden = load_json(GOLDEN_PROFILE)
    known_bad = load_json(KNOWN_BAD)
    markdown = comparison_markdown(compare_assessment_to_golden(golden, known_bad))

    expected_fragments = (
        "# Self-Scoping Comparison",
        "## Missing Expected Capabilities",
        "## Forbidden Native Capabilities Present",
        "Route LLM Requests Across Providers",
    )
    for fragment in expected_fragments:
        assert fragment in markdown
|
||||
@@ -168,7 +168,7 @@ the exporter and CLI path.
|
||||
|
||||
```task
|
||||
id: RREG-WP-0013-T05
|
||||
status: todo
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "2b71069b-6150-45f4-84a2-59f5ec1e04c0"
|
||||
```
|
||||
@@ -196,6 +196,14 @@ Acceptance criteria:
|
||||
- It can compare deterministic-only and agent-reviewed runs without losing
|
||||
provenance.
|
||||
|
||||
Implementation note 2026-05-15: added
|
||||
`src/repo_registry/self_scoping/comparison.py` and the
|
||||
`repo-scoping compare-assessment` CLI command. The first comparison report
|
||||
checks assessment artifacts against the repo-scoping golden profile, reports
|
||||
missing expected capabilities, forbidden native capability matches, known
|
||||
regression patterns, and misplaced API/CLI features under provider-routing
|
||||
capabilities. Reports can be emitted as JSON or Markdown.
|
||||
|
||||
## T06: Add Side-By-Side Review UI
|
||||
|
||||
```task
|
||||
|
||||
Reference in New Issue
Block a user