Add self-scoping assessment comparison

This commit is contained in:
2026-05-15 12:48:41 +02:00
parent d14cb316c7
commit 0b16167769
7 changed files with 388 additions and 2 deletions

View File

@@ -48,3 +48,15 @@ The command reads an existing registry database and does not clone or scan the
target repository. It records the target analysis metadata, candidate graph,
approved map at export time, review decisions, fact and content summaries, known
regression patterns, and current repo-scoping engine identity.
Compare an assessment against the curated golden profile:
```bash
repo-scoping compare-assessment \
--golden docs/self-scoping/golden/repo-scoping-golden-profile.v1.json \
--assessment docs/self-scoping/assessments/repo-scoping-known-bad-2026-05-15-run-39.json \
--format markdown
```
The first comparison report highlights missing expected capabilities, forbidden
native capabilities, known regression patterns, and misplaced API/CLI features.

View File

@@ -9,6 +9,12 @@ from repo_registry.core.service import RegistryService
from repo_registry.llm_extraction import LLMCandidateExtractor, create_llm_connect_adapter
from repo_registry.repo_ingestion.git import GitIngestionService
from repo_registry.self_scoping.assessment import artifact_json, export_assessment_artifact
from repo_registry.self_scoping.comparison import (
compare_assessment_to_golden,
comparison_json,
comparison_markdown,
load_json,
)
from repo_registry.storage.sqlite import NotFoundError, RegistryStore
from repo_registry.web_api.app import Settings
@@ -76,6 +82,23 @@ def build_parser() -> argparse.ArgumentParser:
export.add_argument("--summary", help="Assessment summary override.")
export.add_argument("--database-path", help="Override REPO_REGISTRY_DATABASE_PATH.")
export.add_argument("--checkout-root", help="Override REPO_REGISTRY_CHECKOUT_ROOT.")
compare = subparsers.add_parser(
"compare-assessment",
help="Compare a self-scoping assessment artifact against a golden profile.",
)
compare.add_argument("--golden", required=True, help="Golden profile JSON path.")
compare.add_argument(
"--assessment",
required=True,
help="Assessment artifact JSON path.",
)
compare.add_argument("--output", help="Write comparison report to this path instead of stdout.")
compare.add_argument(
"--format",
choices=["json", "markdown"],
default="markdown",
help="Comparison report format.",
)
return parser
@@ -86,6 +109,8 @@ def main(argv: Sequence[str] | None = None) -> int:
return rebuild_characteristics_command(args, parser)
if args.command == "export-assessment":
return export_assessment_command(args, parser)
if args.command == "compare-assessment":
return compare_assessment_command(args)
parser.error(f"unknown command: {args.command}")
return 2
@@ -122,6 +147,23 @@ def rebuild_characteristics_command(
return 0
def compare_assessment_command(args: argparse.Namespace) -> int:
    """Run the ``compare-assessment`` subcommand.

    Loads the JSON documents named by ``args.golden`` and ``args.assessment``,
    compares them, and renders the comparison as JSON when ``args.format`` is
    ``"json"`` and as Markdown otherwise.  The rendered report is written to
    ``args.output`` when that is set; otherwise it is printed to stdout.

    Returns:
        ``0`` once the report has been emitted.  Errors raised while loading,
        comparing, or writing are not caught here.
    """
    golden_profile = load_json(args.golden)
    assessment = load_json(args.assessment)
    comparison = compare_assessment_to_golden(golden_profile, assessment)

    if args.format == "json":
        report = comparison_json(comparison)
    else:
        report = comparison_markdown(comparison)

    if not args.output:
        # Only add a newline when the rendered report does not already end in one.
        terminator = "" if report.endswith("\n") else "\n"
        print(report, end=terminator)
        return 0

    Path(args.output).write_text(report, encoding="utf-8")
    return 0
def export_assessment_command(
args: argparse.Namespace,
parser: argparse.ArgumentParser,

View File

@@ -1,3 +1,4 @@
from repo_registry.self_scoping.assessment import export_assessment_artifact
from repo_registry.self_scoping.comparison import compare_assessment_to_golden
__all__ = ["export_assessment_artifact"]
__all__ = ["compare_assessment_to_golden", "export_assessment_artifact"]

View File

@@ -0,0 +1,238 @@
from __future__ import annotations
import json
from datetime import UTC, datetime
from pathlib import Path
from typing import Any
# Schema identifier written to the "schema_version" field of comparison reports.
COMPARISON_SCHEMA_VERSION = "self-scoping-comparison/v1"
def load_json(path: str | Path) -> dict[str, Any]:
return json.loads(Path(path).read_text(encoding="utf-8"))
def compare_assessment_to_golden(
    golden_profile: dict[str, Any],
    assessment: dict[str, Any],
) -> dict[str, Any]:
    """Build a comparison report for an assessment against a golden profile.

    Capability names generated by the assessment are compared with the golden
    profile's expected and forbidden capability names.  The assessment's
    ``known_regression_patterns`` are carried through unchanged, misplaced
    API/CLI features are collected, and an overall status, summary, and review
    hints are derived from those findings.

    Args:
        golden_profile: Decoded golden profile document.
        assessment: Decoded assessment artifact document.

    Returns:
        The comparison report as a plain dictionary, stamped with the schema
        version and a UTC ``created_at`` timestamp at second precision.
    """
    expected_names = _expected_capabilities(golden_profile)
    forbidden_names = _forbidden_capabilities(golden_profile)
    generated_by_name = _generated_capabilities(assessment)
    generated_names = set(generated_by_name)

    missing = sorted(expected_names.difference(generated_names))
    matched = sorted(expected_names.intersection(generated_names))
    forbidden_hits = sorted(forbidden_names.intersection(generated_names))
    regressions = assessment.get("known_regression_patterns", [])
    misplaced = _misplaced_features(generated_by_name)

    status = _status(
        missing_expected=missing,
        forbidden_present=forbidden_hits,
        known_regressions=regressions,
        misplaced_features=misplaced,
    )

    # Second-precision UTC timestamp rendered with a trailing "Z".
    utc_now = datetime.now(UTC).replace(microsecond=0)
    created_at = utc_now.isoformat().replace("+00:00", "Z")
    target_repository = assessment.get("target_repository", {})

    report: dict[str, Any] = {}
    report["schema_version"] = COMPARISON_SCHEMA_VERSION
    report["comparison_id"] = _comparison_id(golden_profile, assessment)
    report["created_at"] = created_at
    report["golden_profile_id"] = golden_profile.get("profile_id", "")
    report["assessment_artifact_id"] = assessment.get("artifact_id", "")
    report["target_repo_slug"] = target_repository.get("repo_slug", "")
    report["status"] = status
    report["summary"] = _summary(status, missing, forbidden_hits, regressions)
    report["matched_expected_capabilities"] = matched
    report["missing_expected_capabilities"] = missing
    report["unexpected_native_capabilities"] = _unexpected_capabilities(
        generated_names,
        expected_names,
        forbidden_names,
    )
    report["forbidden_native_capabilities_present"] = forbidden_hits
    report["known_regression_patterns"] = regressions
    report["misplaced_features"] = misplaced
    report["comparison_hints"] = _comparison_hints(status)
    return report
def comparison_json(comparison: dict[str, Any]) -> str:
    """Serialize a comparison report as JSON text.

    The output is indented by two spaces, has its keys sorted, and ends with a
    single newline.
    """
    encoded = json.dumps(comparison, sort_keys=True, indent=2)
    return f"{encoded}\n"
def comparison_markdown(comparison: dict[str, Any]) -> str:
    """Render a comparison report as a Markdown document.

    The document opens with a title and a short status block, followed by one
    bulleted section per finding category.  Sections are separated by blank
    lines and the text ends with a trailing newline.

    Args:
        comparison: A report as returned by ``compare_assessment_to_golden``.

    Returns:
        The Markdown text.
    """
    rendered = [
        f"# Self-Scoping Comparison: {comparison['assessment_artifact_id']}",
        "",
        f"- Status: `{comparison['status']}`",
        f"- Golden profile: `{comparison['golden_profile_id']}`",
        f"- Target repo: `{comparison['target_repo_slug']}`",
        f"- Summary: {comparison['summary']}",
    ]

    # (heading, report key, bullet renderer), in display order.
    section_specs = (
        ("## Missing Expected Capabilities", "missing_expected_capabilities", _bullets),
        (
            "## Forbidden Native Capabilities Present",
            "forbidden_native_capabilities_present",
            _bullets,
        ),
        ("## Known Regression Patterns", "known_regression_patterns", _regression_bullets),
        ("## Misplaced Features", "misplaced_features", _misplaced_feature_bullets),
        ("## Matched Expected Capabilities", "matched_expected_capabilities", _bullets),
        ("## Review Hints", "comparison_hints", _bullets),
    )
    for heading, key, render in section_specs:
        rendered.append("")
        rendered.append(heading)
        rendered.extend(render(comparison[key]))

    # The final empty entry makes the joined text end with a newline.
    rendered.append("")
    return "\n".join(rendered)
def _expected_capabilities(golden_profile: dict[str, Any]) -> set[str]:
    """Collect the names of the golden profile's expected capabilities.

    Names are read from ``ability.expected_capabilities``; entries with a
    missing or empty ``name`` are ignored.
    """
    ability = golden_profile.get("ability", {})
    names: set[str] = set()
    for entry in ability.get("expected_capabilities", []):
        if entry.get("name"):
            names.add(entry["name"])
    return names
def _forbidden_capabilities(golden_profile: dict[str, Any]) -> set[str]:
    """Collect the names listed under ``forbidden_native_capabilities``.

    Entries with a missing or empty ``name`` are ignored.
    """
    entries = golden_profile.get("forbidden_native_capabilities", [])
    named_entries = (entry for entry in entries if entry.get("name"))
    return {entry["name"] for entry in named_entries}
def _generated_capabilities(assessment: dict[str, Any]) -> dict[str, dict[str, Any]]:
    """Index the assessment's generated capabilities by name.

    Capabilities are gathered from every ability under
    ``generated_tree.abilities``.  Entries without a non-empty ``name`` are
    skipped, and when a name occurs more than once the last occurrence wins.
    """
    abilities = assessment.get("generated_tree", {}).get("abilities", [])
    return {
        capability["name"]: capability
        for ability in abilities
        for capability in ability.get("capabilities", [])
        if capability.get("name")
    }
def _unexpected_capabilities(
    generated_names: set[str],
    expected: set[str],
    forbidden: set[str],
) -> list[str]:
    """Return, sorted, the generated names that are neither expected nor forbidden."""
    leftovers = [
        name
        for name in generated_names
        if name not in expected and name not in forbidden
    ]
    leftovers.sort()
    return leftovers
def _misplaced_features(
    generated: dict[str, dict[str, Any]],
) -> list[dict[str, str]]:
    """Find API/CLI features attached to LLM-integration or provider-routing capabilities.

    Args:
        generated: Generated capabilities keyed by capability name.

    Returns:
        One finding per feature whose ``type`` is ``"API"`` or ``"CLI"`` and
        whose owning capability has ``primary_class`` ``"llm-integration"`` or
        ``"provider-routing"``, in iteration order.
    """
    flagged_classes = {"llm-integration", "provider-routing"}
    surface_types = {"API", "CLI"}

    findings: list[dict[str, str]] = []
    for capability_name, capability in generated.items():
        if capability.get("primary_class", "") in flagged_classes:
            findings.extend(
                {
                    "capability": capability_name,
                    "feature": feature.get("name", ""),
                    "feature_type": feature.get("type", ""),
                    "reason": "API/CLI surface is nested below provider-routing capability.",
                }
                for feature in capability.get("features", [])
                if feature.get("type") in surface_types
            )
    return findings
def _status(
    *,
    missing_expected: list[str],
    forbidden_present: list[str],
    known_regressions: list[dict[str, Any]],
    misplaced_features: list[dict[str, str]],
) -> str:
    """Classify a comparison from its findings.

    Returns:
        ``"regression"`` when any forbidden capability or misplaced feature is
        present, or when a known regression has severity ``"high"`` or
        ``"critical"``; ``"needs_review"`` when expected capabilities are
        missing or any regression pattern is reported; otherwise
        ``"candidate_improvement"``.
    """
    if forbidden_present:
        return "regression"
    if misplaced_features:
        return "regression"
    for pattern in known_regressions:
        if pattern.get("severity") in {"high", "critical"}:
            return "regression"

    if missing_expected or known_regressions:
        return "needs_review"
    return "candidate_improvement"
def _summary(
    status: str,
    missing_expected: list[str],
    forbidden_present: list[str],
    known_regressions: list[dict[str, Any]],
) -> str:
    """Return a one-sentence summary for the given comparison status.

    Only the ``"needs_review"`` summary includes counts (missing expected
    capabilities and reported regression patterns).  ``forbidden_present`` is
    accepted but not used in any of the messages.
    """
    if status == "needs_review":
        missing_count = len(missing_expected)
        regression_count = len(known_regressions)
        return (
            f"Assessment needs review: {missing_count} expected "
            f"capability(s) missing and {regression_count} regression "
            "pattern(s) reported."
        )
    if status == "regression":
        return (
            "Assessment repeats known or forbidden self-scoping patterns; prefer "
            "the golden profile until the engine is corrected."
        )
    return "Assessment covers the golden profile without known regression patterns."
def _comparison_hints(status: str) -> list[str]:
if status == "regression":
return [
"Do not promote this assessment as a preferred baseline.",
"Inspect forbidden capabilities and misplaced features first.",
"Use the findings as signal for scanner, generator, or acceptance-policy changes.",
]
if status == "needs_review":
return [
"Review missing expected capabilities before choosing old or new output.",
"Check whether the golden profile needs a curator-approved update.",
]
return [
"Candidate appears better than the known golden checks.",
"Human or agentic review should still confirm source evidence quality.",
]
def _comparison_id(
    golden_profile: dict[str, Any],
    assessment: dict[str, Any],
) -> str:
    """Compose the comparison identifier ``<profile_id>__<artifact_id>``.

    A missing ``profile_id`` falls back to ``"golden"`` and a missing
    ``artifact_id`` falls back to ``"assessment"``.
    """
    profile_part = golden_profile.get("profile_id", "golden")
    artifact_part = assessment.get("artifact_id", "assessment")
    return f"{profile_part}__{artifact_part}"
def _bullets(items: list[str]) -> list[str]:
    """Render each item as a Markdown bullet, or a single ``- None`` bullet when empty."""
    return [f"- {entry}" for entry in items] if items else ["- None"]
def _regression_bullets(items: list[dict[str, Any]]) -> list[str]:
    """Render regression patterns as Markdown bullets.

    Each bullet shows the pattern's ``id`` (in backticks), ``title``, and
    ``description``; absent fields render as empty strings.  An empty input
    yields a single ``- None`` bullet.
    """
    if items:
        rendered: list[str] = []
        for pattern in items:
            pattern_id = pattern.get("id", "")
            title = pattern.get("title", "")
            description = pattern.get("description", "")
            rendered.append(f"- `{pattern_id}` {title}: {description}")
        return rendered
    return ["- None"]
def _misplaced_feature_bullets(items: list[dict[str, str]]) -> list[str]:
    """Render misplaced-feature findings as Markdown bullets.

    Every finding must provide ``feature``, ``capability``, ``feature_type``,
    and ``reason``.  An empty input yields a single ``- None`` bullet.
    """
    if items:
        rendered: list[str] = []
        for finding in items:
            feature = finding["feature"]
            capability = finding["capability"]
            feature_type = finding["feature_type"]
            reason = finding["reason"]
            rendered.append(
                f"- `{feature}` under `{capability}` ({feature_type}): {reason}"
            )
        return rendered
    return ["- None"]

View File

@@ -131,3 +131,26 @@ def test_export_assessment_cli_writes_completed_run_artifact(tmp_path):
assert artifact["execution"]["analysis_run_id"] == summary.analysis_run.id
assert artifact["assessment"]["role"] == "challenger"
assert artifact["generated_tree"]["abilities"]
def test_compare_assessment_cli_writes_markdown_report(tmp_path):
    """The ``compare-assessment`` CLI writes a Markdown report for the known-bad run.

    The golden profile and assessment are given as relative paths, so they are
    resolved against the current working directory.
    """
    report_path = tmp_path / "comparison.md"
    cli_args = [
        "compare-assessment",
        "--golden",
        "docs/self-scoping/golden/repo-scoping-golden-profile.v1.json",
        "--assessment",
        "docs/self-scoping/assessments/repo-scoping-known-bad-2026-05-15-run-39.json",
        "--output",
        str(report_path),
        "--format",
        "markdown",
    ]

    exit_code = main(cli_args)
    report = report_path.read_text(encoding="utf-8")

    assert exit_code == 0
    assert "Status: `regression`" in report
    assert "Route LLM Requests Across Providers" in report

View File

@@ -0,0 +1,62 @@
from pathlib import Path
from repo_registry.self_scoping.comparison import (
compare_assessment_to_golden,
comparison_markdown,
load_json,
)
# Directory two levels above this module (``parents[1]`` of its resolved path).
ROOT = Path(__file__).resolve().parents[1]

# Golden profile document the tests compare against.
GOLDEN_PROFILE = ROOT.joinpath(
    "docs",
    "self-scoping",
    "golden",
    "repo-scoping-golden-profile.v1.json",
)

# Assessment artifact used as the comparison input.
KNOWN_BAD = ROOT.joinpath(
    "docs",
    "self-scoping",
    "assessments",
    "repo-scoping-known-bad-2026-05-15-run-39.json",
)
def test_compare_known_bad_assessment_to_golden_flags_regression():
    """Comparing the known-bad assessment with the golden profile reports a regression."""
    golden_profile = load_json(GOLDEN_PROFILE)
    known_bad = load_json(KNOWN_BAD)
    comparison = compare_assessment_to_golden(golden_profile, known_bad)

    assert comparison["schema_version"] == "self-scoping-comparison/v1"
    assert comparison["status"] == "regression"

    forbidden_hits = comparison["forbidden_native_capabilities_present"]
    assert "Route LLM Requests Across Providers" in forbidden_hits

    missing = comparison["missing_expected_capabilities"]
    assert "Scan Repositories Into Observed Facts" in missing

    reported_ids = {item["id"] for item in comparison["known_regression_patterns"]}
    required_ids = {
        "RREG-SELF-REG-001",
        "RREG-SELF-REG-002",
        "RREG-SELF-REG-003",
    }
    assert reported_ids >= required_ids

    misplaced = comparison["misplaced_features"]
    assert any(item["feature_type"] == "API" for item in misplaced)
def test_comparison_markdown_summarizes_actionable_sections():
    """The Markdown report contains its title, key section headings, and the forbidden capability."""
    golden_profile = load_json(GOLDEN_PROFILE)
    known_bad = load_json(KNOWN_BAD)
    markdown = comparison_markdown(
        compare_assessment_to_golden(golden_profile, known_bad)
    )

    assert "# Self-Scoping Comparison" in markdown
    assert "## Missing Expected Capabilities" in markdown
    assert "## Forbidden Native Capabilities Present" in markdown
    assert "Route LLM Requests Across Providers" in markdown

View File

@@ -168,7 +168,7 @@ the exporter and CLI path.
```task
id: RREG-WP-0013-T05
status: todo
status: done
priority: high
state_hub_task_id: "2b71069b-6150-45f4-84a2-59f5ec1e04c0"
```
@@ -196,6 +196,14 @@ Acceptance criteria:
- It can compare deterministic-only and agent-reviewed runs without losing
provenance.
Implementation note 2026-05-15: added
`src/repo_registry/self_scoping/comparison.py` and the
`repo-scoping compare-assessment` CLI command. The first comparison report
checks assessment artifacts against the repo-scoping golden profile, reports
missing expected capabilities, forbidden native capability matches, known
regression patterns, and misplaced API/CLI features under provider-routing
capabilities. Reports can be emitted as JSON or Markdown.
## T06: Add Side-By-Side Review UI
```task