diff --git a/docs/self-scoping/README.md b/docs/self-scoping/README.md index 9c8d50d..5767984 100644 --- a/docs/self-scoping/README.md +++ b/docs/self-scoping/README.md @@ -13,6 +13,8 @@ instead of relying on memory or screenshots. repo-scoping capability truth. - `workflow.md` explains how to run challenger assessments, interpret outcomes, and decide whether to update the golden profile or fix the engine. +- `outcomes/` stores append-only reviewer decisions created from side-by-side + comparisons. - `../schemas/self-scoping-assessment.schema.json` defines the immutable assessment-run artifact shape. @@ -35,6 +37,12 @@ assessment. Compare the challenger to the golden profile and to the negative seed. Reviewers should be able to choose whether the old result, new result, or neither is better, then store that judgement as a new assessment outcome. +The curator UI exposes this loop at `/ui/self-scoping`. It reads the golden and +assessment JSON files from this directory, highlights missing, forbidden, and +misplaced hierarchy entries, and records reviewer preference without mutating +the compared artifacts. The same page can compare two assessment runs directly +so reviewers can choose whether the old baseline or new challenger is better. + ## Export Command Export a completed analysis run as a challenger artifact: diff --git a/docs/self-scoping/outcomes/README.md b/docs/self-scoping/outcomes/README.md new file mode 100644 index 0000000..b3a6513 --- /dev/null +++ b/docs/self-scoping/outcomes/README.md @@ -0,0 +1,9 @@ +# Self-Scoping Review Outcomes + +This directory stores append-only review decisions recorded from the +self-scoping comparison UI. Outcome files bind a reviewer choice to a golden +profile, an assessment artifact, and the repo-scoping engine identity captured +in that assessment. + +Do not edit historical assessment artifacts to record a preference. Add a new +outcome record instead. diff --git a/docs/self-scoping/workflow.md b/docs/self-scoping/workflow.md index e7ad494..4cd3ede 100644 --- a/docs/self-scoping/workflow.md +++ b/docs/self-scoping/workflow.md @@ -56,17 +56,25 @@ are committed. 2. Read the comparison report. -3. If the report says `regression`, inspect forbidden capabilities, misplaced +3. Open the curator UI at `/ui/self-scoping` to compare the golden profile and + assessment artifact side by side. + +4. When an earlier baseline assessment exists, use the same page's two-run + comparison to judge old output against the new challenger. + +5. If the report says `regression`, inspect forbidden capabilities, misplaced features, and known regression patterns first. -4. If the report says `needs_review`, inspect missing expected capabilities and +6. If the report says `needs_review`, inspect missing expected capabilities and source evidence before choosing old or new output. -5. If the report says `candidate_improvement`, still confirm that the +7. If the report says `candidate_improvement`, still confirm that the hierarchy, source refs, and native-utility boundaries make sense. -6. Record the decision as an assessment outcome before changing the active - baseline. +8. Record the decision as an assessment outcome before changing the active + baseline. The UI writes append-only outcome records under + `docs/self-scoping/outcomes/`; it does not rewrite historical assessment or + golden-profile artifacts. ## CI Use diff --git a/src/repo_registry/self_scoping/__init__.py b/src/repo_registry/self_scoping/__init__.py index 3ba3de1..8e0cb5a 100644 --- a/src/repo_registry/self_scoping/__init__.py +++ b/src/repo_registry/self_scoping/__init__.py @@ -1,4 +1,13 @@ from repo_registry.self_scoping.assessment import export_assessment_artifact from repo_registry.self_scoping.comparison import compare_assessment_to_golden +from repo_registry.self_scoping.review_store import ( + record_assessment_outcome, + record_assessment_pair_outcome, +) -__all__ = ["compare_assessment_to_golden", "export_assessment_artifact"] +__all__ = [ + "compare_assessment_to_golden", + "export_assessment_artifact", + "record_assessment_outcome", + "record_assessment_pair_outcome", +] diff --git a/src/repo_registry/self_scoping/review_store.py b/src/repo_registry/self_scoping/review_store.py new file mode 100644 index 0000000..7e2290d --- /dev/null +++ b/src/repo_registry/self_scoping/review_store.py @@ -0,0 +1,217 @@ +from __future__ import annotations + +import json +import os +from dataclasses import dataclass +from datetime import UTC, datetime +from pathlib import Path +from typing import Any +from uuid import uuid4 + + +SELF_SCOPING_ROOT_ENV = "REPO_REGISTRY_SELF_SCOPING_ROOT" +OUTCOME_SCHEMA_VERSION = "self-scoping-review-outcome/v1" +ALLOWED_OUTCOMES = { + "prefer_golden", + "prefer_assessment", + "prefer_baseline", + "prefer_challenger", + "tie", + "needs_human", + "reject_assessment", + "reject_challenger", +} + + +@dataclass(frozen=True) +class ReviewArtifact: + path: str + artifact_id: str + title: str + updated_at: str + + +def self_scoping_root(root: str | Path | None = None) -> Path: + configured = root or os.environ.get(SELF_SCOPING_ROOT_ENV) or "docs/self-scoping" + return Path(configured).resolve() + + +def list_golden_profiles(root: str | Path | None = None) -> list[ReviewArtifact]: + return _list_artifacts("golden", root=root) + + +def list_assessment_artifacts(root: str | Path | None = None) -> list[ReviewArtifact]: + return _list_artifacts("assessments", root=root) + + +def load_json_artifact( + relative_path: str, + root: str | Path | None = None, +) -> dict[str, Any]: + artifact_path = _safe_artifact_path(relative_path, root=root) + return json.loads(artifact_path.read_text(encoding="utf-8")) + + +def list_outcome_records(root: str | Path | None = None) -> list[dict[str, Any]]: + outcomes_dir = self_scoping_root(root) / "outcomes" + if not outcomes_dir.exists(): + return [] + records: list[dict[str, Any]] = [] + for path in sorted(outcomes_dir.glob("*.json"), reverse=True): + try: + records.append(json.loads(path.read_text(encoding="utf-8"))) + except json.JSONDecodeError: + continue + return records + + +def record_assessment_outcome( + *, + golden_path: str, + assessment_path: str, + outcome: str, + reviewer: str, + notes: str, + comparison_status: str, + root: str | Path | None = None, +) -> dict[str, Any]: + if outcome not in ALLOWED_OUTCOMES: + raise ValueError(f"unsupported review outcome: {outcome}") + + base = self_scoping_root(root) + golden = load_json_artifact(golden_path, root=base) + assessment = load_json_artifact(assessment_path, root=base) + created_at = _created_at() + outcome_id = _outcome_id(created_at, assessment_path, outcome) + record = { + "schema_version": OUTCOME_SCHEMA_VERSION, + "outcome_id": outcome_id, + "created_at": created_at, + "reviewer": reviewer.strip() or "codex", + "outcome": outcome, + "notes": notes.strip(), + "comparison_status": comparison_status, + "golden_profile_path": golden_path, + "golden_profile_id": golden.get("profile_id", ""), + "assessment_artifact_path": assessment_path, + "assessment_artifact_id": assessment.get("artifact_id", ""), + "engine_identity": assessment.get("engine_identity", {}), + "decision_scope": "baseline-comparison", + } + + _write_outcome(record, base) + return record + + +def record_assessment_pair_outcome( + *, + baseline_path: str, + challenger_path: str, + outcome: str, + reviewer: str, + notes: str, + comparison_status: str, + root: str | Path | None = None, +) -> dict[str, Any]: + if outcome not in ALLOWED_OUTCOMES: + raise ValueError(f"unsupported review outcome: {outcome}") + + base = self_scoping_root(root) + baseline = load_json_artifact(baseline_path, root=base) + challenger = load_json_artifact(challenger_path, root=base) + created_at = _created_at() + outcome_id = _outcome_id( + created_at, + f"{Path(baseline_path).stem}__{Path(challenger_path).stem}", + outcome, + ) + record = { + "schema_version": OUTCOME_SCHEMA_VERSION, + "outcome_id": outcome_id, + "created_at": created_at, + "reviewer": reviewer.strip() or "codex", + "outcome": outcome, + "notes": notes.strip(), + "comparison_status": comparison_status, + "baseline_assessment_path": baseline_path, + "baseline_assessment_artifact_id": baseline.get("artifact_id", ""), + "baseline_engine_identity": baseline.get("engine_identity", {}), + "challenger_assessment_path": challenger_path, + "challenger_assessment_artifact_id": challenger.get("artifact_id", ""), + "challenger_engine_identity": challenger.get("engine_identity", {}), + "decision_scope": "assessment-pair-comparison", + } + _write_outcome(record, base) + return record + + +def _created_at() -> str: + return ( + datetime.now(UTC) + .replace(microsecond=0) + .isoformat() + .replace("+00:00", "Z") + ) + + +def _write_outcome(record: dict[str, Any], base: Path) -> None: + outcomes_dir = base / "outcomes" + outcomes_dir.mkdir(parents=True, exist_ok=True) + output_path = outcomes_dir / f"{record['outcome_id']}.json" + output_path.write_text( + json.dumps(record, indent=2, sort_keys=True) + "\n", + encoding="utf-8", + ) + + +def _list_artifacts(kind: str, root: str | Path | None = None) -> list[ReviewArtifact]: + base = self_scoping_root(root) + artifacts: list[ReviewArtifact] = [] + for path in sorted((base / kind).glob("*.json")): + try: + payload = json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError: + continue + artifacts.append( + ReviewArtifact( + path=path.relative_to(base).as_posix(), + artifact_id=str( + payload.get("artifact_id") or payload.get("profile_id") or path.stem + ), + title=str( + payload.get("title") + or payload.get("assessment", {}).get("summary") + or payload.get("artifact_type") + or path.stem + ), + updated_at=str( + payload.get("updated_at") or payload.get("created_at") or "" + ), + ) + ) + return artifacts + + +def _safe_artifact_path(relative_path: str, root: str | Path | None = None) -> Path: + base = self_scoping_root(root) + artifact_path = (base / relative_path).resolve() + try: + artifact_path.relative_to(base) + except ValueError as exc: + raise ValueError(f"artifact path escapes self-scoping root: {relative_path}") from exc + if artifact_path.suffix != ".json": + raise ValueError(f"artifact path is not JSON: {relative_path}") + if not artifact_path.exists(): + raise FileNotFoundError(relative_path) + return artifact_path + + +def _outcome_id(created_at: str, assessment_path: str, outcome: str) -> str: + timestamp = ( + created_at.replace("-", "") + .replace(":", "") + .replace("T", "-") + .replace("Z", "") + ) + assessment_stem = Path(assessment_path).stem.replace(".", "-") + return f"{timestamp}__{assessment_stem}__{outcome}__{uuid4().hex[:8]}" diff --git a/src/repo_registry/web_ui/views.py b/src/repo_registry/web_ui/views.py index 7eda0ef..3c04644 100644 --- a/src/repo_registry/web_ui/views.py +++ b/src/repo_registry/web_ui/views.py @@ -10,12 +10,36 @@ from fastapi import APIRouter, Depends, Form, HTTPException, Query from fastapi.responses import HTMLResponse, PlainTextResponse, RedirectResponse from repo_registry.core.service import RegistryService +from repo_registry.self_scoping.comparison import compare_assessment_to_golden +from repo_registry.self_scoping.review_store import ( + ALLOWED_OUTCOMES, + list_assessment_artifacts, + list_golden_profiles, + list_outcome_records, + load_json_artifact, + record_assessment_outcome, + record_assessment_pair_outcome, +) from repo_registry.storage.sqlite import NotFoundError from repo_registry.web_api.app import get_service router = APIRouter(include_in_schema=False) APP_NAME = "Repository Scoping" +REVIEW_OUTCOME_LABELS = { + "prefer_golden": "Prefer Golden", + "prefer_assessment": "Prefer Assessment", + "tie": "Tie", + "needs_human": "Needs Human Review", + "reject_assessment": "Reject Assessment", +} +PAIR_REVIEW_OUTCOME_LABELS = { + "prefer_baseline": "Prefer Baseline", + "prefer_challenger": "Prefer Challenger", + "tie": "Tie", + "needs_human": "Needs Human Review", + "reject_challenger": "Reject Challenger", +} def repository_directory_name(url: str, fallback: str) -> str: @@ -188,6 +212,29 @@ def page( }} .tree ul {{ margin: 8px 0 0 20px; padding: 0; }} .tree li {{ margin: 6px 0; }} + .review-grid {{ + display: grid; + grid-template-columns: minmax(0, 1fr) minmax(0, 1fr); + gap: 18px; + align-items: start; + }} + .review-item {{ + border-left: 3px solid var(--line); + padding: 8px 0 8px 10px; + margin: 8px 0; + }} + .review-item.match {{ border-color: #10b981; }} + .review-item.problem {{ border-color: var(--danger); background: #fffafa; }} + .review-item.warn {{ border-color: #f59e0b; background: #fffaf0; }} + .review-item h3 {{ margin-top: 0; }} + .review-list {{ margin: 8px 0 0 18px; padding: 0; }} + .review-list .warn {{ color: var(--warn); font-weight: 650; }} + .review-meta {{ + display: flex; + gap: 8px; + flex-wrap: wrap; + align-items: center; + }} .source {{ color: var(--muted); font-family: ui-monospace, SFMono-Regular, Consolas, monospace; font-size: 12px; }} .scope-document {{ margin: 0; @@ -279,6 +326,7 @@ def page( header {{ padding: 12px 16px; }} main {{ padding: 16px; }} .grid {{ grid-template-columns: 1fr; }} + .review-grid {{ grid-template-columns: 1fr; }} .graph-shell {{ grid-template-columns: 1fr; }} .graph-canvas {{ min-height: 560px; }} table, tbody, tr, td {{ display: block; width: 100%; }} @@ -297,6 +345,7 @@ def page( @@ -405,6 +454,614 @@ def scope_document() -> HTMLResponse: return page("SCOPE.md", body) +def render_self_scoping_index( + *, + error_message: str | None = None, + status_code: int = 200, +) -> HTMLResponse: + golden_profiles = list_golden_profiles() + assessments = list_assessment_artifacts() + outcomes = list_outcome_records() + error = ( + f""" + + """ + if error_message + else "" + ) + missing_inputs = "" + if not golden_profiles or not assessments: + missing_inputs = """ +
+ Add at least one golden profile and one assessment artifact under + docs/self-scoping before opening a comparison. +
+ """ + body = f""" +

Self-Scoping Review

+ {error} + {missing_inputs} +
+
+

Compare To Golden

+
+ + +
+ +
+
+

Compare Two Runs

+
+ + +
+ +
+
+
+
+

Recorded Outcomes

+ {_render_outcome_table(outcomes)} +
+
+ """ + response = page("Self-Scoping Review", body) + response.status_code = status_code + return response + + +@router.get("/ui/self-scoping") +def self_scoping_index() -> HTMLResponse: + return render_self_scoping_index() + + +@router.get("/ui/self-scoping/review") +def self_scoping_review( + golden: str = Query(...), + assessment: str = Query(...), + saved: str | None = Query(default=None), +) -> HTMLResponse: + try: + golden_profile = load_json_artifact(golden) + assessment_artifact = load_json_artifact(assessment) + except (FileNotFoundError, ValueError, json.JSONDecodeError) as exc: + return render_self_scoping_index(error_message=str(exc), status_code=400) + + comparison = compare_assessment_to_golden(golden_profile, assessment_artifact) + comparison_status = comparison["status"] + comparison_summary = comparison["summary"] + matched_count = len(comparison["matched_expected_capabilities"]) + missing_count = len(comparison["missing_expected_capabilities"]) + forbidden_count = len(comparison["forbidden_native_capabilities_present"]) + misplaced_count = len(comparison["misplaced_features"]) + saved_notice = ( + f""" +
+ Saved assessment outcome {escape(saved)}. +
+ """ + if saved + else "" + ) + body = f""" +

Self-Scoping Comparison

+ {saved_notice} +
+
+ Back +
+
+ {escape(comparison_status)} +

{escape(comparison_summary)}

+
+
+ Matched {matched_count} + Missing {missing_count} + Forbidden {forbidden_count} + Misplaced {misplaced_count} +
+
+
+
+

Golden Profile

+ {_render_golden_tree(golden_profile, comparison)} +
+
+

Assessment Output

+ {_render_assessment_tree(assessment_artifact, comparison)} +
+
+
+

Record Review Outcome

+
+ + + + + + +
+ + Saving outcome... +
+
+
+ """ + return page("Self-Scoping Comparison", body) + + +@router.post("/ui/self-scoping/review") +def save_self_scoping_review( + golden_path: str = Form(...), + assessment_path: str = Form(...), + outcome: str = Form(...), + reviewer: str = Form("codex"), + notes: str = Form(""), + comparison_status: str = Form(""), +): + try: + record = record_assessment_outcome( + golden_path=golden_path, + assessment_path=assessment_path, + outcome=outcome, + reviewer=reviewer, + notes=notes, + comparison_status=comparison_status, + ) + except (FileNotFoundError, ValueError, json.JSONDecodeError) as exc: + return render_self_scoping_index(error_message=str(exc), status_code=400) + return RedirectResponse( + ( + "/ui/self-scoping/review" + f"?golden={quote_plus(golden_path)}" + f"&assessment={quote_plus(assessment_path)}" + f"&saved={quote_plus(record['outcome_id'])}" + ), + status_code=303, + ) + + +@router.get("/ui/self-scoping/run-review") +def self_scoping_run_review( + baseline: str = Query(...), + challenger: str = Query(...), + saved: str | None = Query(default=None), +) -> HTMLResponse: + try: + baseline_artifact = load_json_artifact(baseline) + challenger_artifact = load_json_artifact(challenger) + except (FileNotFoundError, ValueError, json.JSONDecodeError) as exc: + return render_self_scoping_index(error_message=str(exc), status_code=400) + + comparison = _assessment_tree_diff(baseline_artifact, challenger_artifact) + comparison_status = comparison["status"] + comparison_summary = comparison["summary"] + shared_count = len(comparison["shared_capabilities"]) + baseline_only_count = len(comparison["baseline_only_capabilities"]) + challenger_only_count = len(comparison["challenger_only_capabilities"]) + moved_feature_count = len(comparison["moved_feature_names"]) + saved_notice = ( + f""" +
+ Saved assessment outcome {escape(saved)}. +
+ """ + if saved + else "" + ) + body = f""" +

Assessment Run Comparison

+ {saved_notice} +
+
+ Back +
+
+ {escape(comparison_status)} +

{escape(comparison_summary)}

+
+
+ Shared {shared_count} + Baseline only {baseline_only_count} + Challenger only {challenger_only_count} + Moved features {moved_feature_count} +
+
+
+
+

Baseline Run

+ {_render_assessment_tree_for_run_diff(baseline_artifact, comparison, role="baseline")} +
+
+

Challenger Run

+ {_render_assessment_tree_for_run_diff(challenger_artifact, comparison, role="challenger")} +
+
+
+

Record Review Outcome

+
+ + + + + + +
+ + Saving outcome... +
+
+
+ """ + return page("Assessment Run Comparison", body) + + +@router.post("/ui/self-scoping/run-review") +def save_self_scoping_run_review( + baseline_path: str = Form(...), + challenger_path: str = Form(...), + outcome: str = Form(...), + reviewer: str = Form("codex"), + notes: str = Form(""), + comparison_status: str = Form(""), +): + try: + record = record_assessment_pair_outcome( + baseline_path=baseline_path, + challenger_path=challenger_path, + outcome=outcome, + reviewer=reviewer, + notes=notes, + comparison_status=comparison_status, + ) + except (FileNotFoundError, ValueError, json.JSONDecodeError) as exc: + return render_self_scoping_index(error_message=str(exc), status_code=400) + return RedirectResponse( + ( + "/ui/self-scoping/run-review" + f"?baseline={quote_plus(baseline_path)}" + f"&challenger={quote_plus(challenger_path)}" + f"&saved={quote_plus(record['outcome_id'])}" + ), + status_code=303, + ) + + +def _assessment_tree_diff(baseline: dict, challenger: dict) -> dict: + baseline_capabilities = set(_assessment_capability_names(baseline)) + challenger_capabilities = set(_assessment_capability_names(challenger)) + baseline_only = sorted(baseline_capabilities - challenger_capabilities) + challenger_only = sorted(challenger_capabilities - baseline_capabilities) + shared = sorted(baseline_capabilities & challenger_capabilities) + + baseline_feature_index = _assessment_feature_index(baseline) + challenger_feature_index = _assessment_feature_index(challenger) + moved_feature_names = sorted( + feature_name + for feature_name in set(baseline_feature_index) & set(challenger_feature_index) + if baseline_feature_index[feature_name] != challenger_feature_index[feature_name] + ) + baseline_moved_pairs = { + (capability_name, feature_name) + for feature_name in moved_feature_names + for capability_name in baseline_feature_index[feature_name] + } + challenger_moved_pairs = { + (capability_name, feature_name) + for feature_name in moved_feature_names + for capability_name in challenger_feature_index[feature_name] + } + status = ( + "candidate_improvement" + if not baseline_only and not challenger_only and not moved_feature_names + else "needs_review" + ) + return { + "status": status, + "summary": _assessment_tree_diff_summary( + baseline_only, + challenger_only, + moved_feature_names, + ), + "baseline_only_capabilities": baseline_only, + "challenger_only_capabilities": challenger_only, + "shared_capabilities": shared, + "moved_feature_names": moved_feature_names, + "baseline_moved_feature_pairs": baseline_moved_pairs, + "challenger_moved_feature_pairs": challenger_moved_pairs, + } + + +def _assessment_tree_diff_summary( + baseline_only: list[str], + challenger_only: list[str], + moved_feature_names: list[str], +) -> str: + if not baseline_only and not challenger_only and not moved_feature_names: + return "Assessment hierarchy names match between baseline and challenger." + return ( + "Assessment runs differ: " + f"{len(baseline_only)} baseline-only capability(s), " + f"{len(challenger_only)} challenger-only capability(s), and " + f"{len(moved_feature_names)} moved feature name(s)." + ) + + +def _assessment_capability_names(assessment: dict) -> list[str]: + names: list[str] = [] + for ability in assessment.get("generated_tree", {}).get("abilities", []): + for capability in ability.get("capabilities", []): + name = capability.get("name") + if name: + names.append(name) + return names + + +def _assessment_feature_index(assessment: dict) -> dict[str, set[str]]: + index: dict[str, set[str]] = {} + for ability in assessment.get("generated_tree", {}).get("abilities", []): + for capability in ability.get("capabilities", []): + capability_name = capability.get("name", "") + for feature in capability.get("features", []): + feature_name = feature.get("name") + if feature_name: + index.setdefault(feature_name, set()).add(capability_name) + return index + + +def _render_assessment_tree_for_run_diff( + assessment: dict, + comparison: dict, + *, + role: str, +) -> str: + if role == "baseline": + changed_capabilities = set(comparison["baseline_only_capabilities"]) + moved_pairs = comparison["baseline_moved_feature_pairs"] + changed_reason = "Baseline only" + else: + changed_capabilities = set(comparison["challenger_only_capabilities"]) + moved_pairs = comparison["challenger_moved_feature_pairs"] + changed_reason = "Challenger only" + shared = set(comparison["shared_capabilities"]) + ability_blocks = [] + for ability in assessment.get("generated_tree", {}).get("abilities", []): + capability_blocks = [] + for capability in ability.get("capabilities", []): + name = capability.get("name", "") + item_class = "warn" if name in changed_capabilities else "match" if name in shared else "" + reason = changed_reason if name in changed_capabilities else "Shared capability" + capability_blocks.append( + f""" +
+

{escape(name)}

+
+ {escape(reason)} + {escape(capability.get("primary_class", ""))} +
+ {_render_generated_features(name, capability.get("features", []), moved_pairs)} +
+ """ + ) + ability_blocks.append( + f""" +
+

{escape(ability.get("name", ""))}

+ {"".join(capability_blocks) or '

No capabilities generated.

'} +
+ """ + ) + return "\n".join(ability_blocks) or '

No generated abilities found.

' + + +def _review_artifact_options(artifacts) -> str: + if not artifacts: + return '' + return "\n".join( + f""" + + """ + for artifact in artifacts + ) + + +def _render_outcome_table(outcomes: list[dict]) -> str: + if not outcomes: + return '

No review outcomes have been recorded yet.

' + rows = "\n".join( + f""" + + {escape(record.get("created_at", ""))} + {escape(record.get("outcome", ""))} + {escape(record.get("comparison_status", ""))} + {escape(_outcome_record_subject(record))} + + """ + for record in outcomes[:8] + ) + return f""" + + + {rows} +
CreatedOutcomeStatusAssessment
+ """ + + +def _outcome_record_subject(record: dict) -> str: + if record.get("assessment_artifact_id"): + return str(record["assessment_artifact_id"]) + if record.get("challenger_assessment_artifact_id"): + return str(record["challenger_assessment_artifact_id"]) + if record.get("outcome_id"): + return str(record["outcome_id"]) + return "" + + +def _comparison_notice_class(comparison: dict) -> str: + if comparison["status"] == "regression": + return "error" + if comparison["status"] == "needs_review": + return "warn" + return "success" + + +def _render_golden_tree(golden_profile: dict, comparison: dict) -> str: + missing = set(comparison["missing_expected_capabilities"]) + matched = set(comparison["matched_expected_capabilities"]) + capabilities = golden_profile.get("ability", {}).get("expected_capabilities", []) + items = [] + for capability in capabilities: + name = capability.get("name", "") + item_class = "problem" if name in missing else "match" if name in matched else "" + features = _render_expected_features(capability.get("expected_features", [])) + state = "Missing" if name in missing else "Matched" if name in matched else "Expected" + items.append( + f""" +
+

{escape(name)}

+
+ {escape(state)} + {escape(capability.get("primary_class", ""))} +
+ {features} +
+ """ + ) + return "\n".join(items) or '

No expected capabilities found.

' + + +def _render_expected_features(features: list[dict]) -> str: + if not features: + return "" + rows = [] + for feature in features: + sources = ", ".join(feature.get("source_paths", [])[:3]) + rows.append( + f""" +
  • + {escape(feature.get("name", ""))} + {escape(feature.get("primary_class", ""))} +
    {escape(sources)}
    +
  • + """ + ) + return f'' + + +def _render_assessment_tree(assessment: dict, comparison: dict) -> str: + forbidden = set(comparison["forbidden_native_capabilities_present"]) + unexpected = set(comparison["unexpected_native_capabilities"]) + misplaced = { + (item.get("capability", ""), item.get("feature", "")) + for item in comparison["misplaced_features"] + } + abilities = assessment.get("generated_tree", {}).get("abilities", []) + ability_blocks = [] + for ability in abilities: + capabilities = ability.get("capabilities", []) + capability_blocks = [] + for capability in capabilities: + name = capability.get("name", "") + item_class = "problem" if name in forbidden else "warn" if name in unexpected else "" + reason = ( + "Forbidden native capability" + if name in forbidden + else "Unexpected native capability" + if name in unexpected + else "Generated capability" + ) + capability_blocks.append( + f""" +
    +

    {escape(name)}

    +
    + {escape(reason)} + {escape(capability.get("primary_class", ""))} +
    + {_render_generated_features(name, capability.get("features", []), misplaced)} +
    + """ + ) + ability_blocks.append( + f""" +
    +

    {escape(ability.get("name", ""))}

    + {"".join(capability_blocks) or '

    No capabilities generated.

    '} +
    + """ + ) + return "\n".join(ability_blocks) or '

    No generated abilities found.

    ' + + +def _render_generated_features( + capability_name: str, + features: list[dict], + misplaced: set[tuple[str, str]], +) -> str: + if not features: + return "" + rows = [] + for feature in features: + feature_name = feature.get("name", "") + feature_class = "warn" if (capability_name, feature_name) in misplaced else "" + rows.append( + f""" +
  • + {escape(feature_name)} + {escape(feature.get("type") or feature.get("primary_class") or "")} +
    {escape(feature.get("location", ""))}
    +
  • + """ + ) + return f'' + + +def _review_outcome_options() -> str: + return "\n".join( + f'' + for value in REVIEW_OUTCOME_LABELS + if value in ALLOWED_OUTCOMES + ) + + +def _pair_review_outcome_options() -> str: + return "\n".join( + f'' + for value in PAIR_REVIEW_OUTCOME_LABELS + if value in ALLOWED_OUTCOMES + ) + + @router.get("/ui/repos/{repository_id}/scope") def repository_scope_document( repository_id: int, diff --git a/tests/test_self_scoping_review_store.py b/tests/test_self_scoping_review_store.py new file mode 100644 index 0000000..1896544 --- /dev/null +++ b/tests/test_self_scoping_review_store.py @@ -0,0 +1,105 @@ +import json + +import pytest + +from repo_registry.self_scoping.review_store import ( + list_assessment_artifacts, + list_golden_profiles, + load_json_artifact, + record_assessment_outcome, + record_assessment_pair_outcome, +) + + +def write_json(path, payload): + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload), encoding="utf-8") + + +def test_review_store_lists_and_loads_artifacts(tmp_path): + root = tmp_path / "self-scoping" + write_json( + root / "golden" / "profile.json", + {"profile_id": "repo-scoping-golden-v1", "title": "Golden", "updated_at": "2026-05-15"}, + ) + write_json( + root / "assessments" / "run.json", + {"artifact_id": "known-bad-run", "created_at": "2026-05-15T10:00:00Z"}, + ) + + assert list_golden_profiles(root)[0].path == "golden/profile.json" + assert list_assessment_artifacts(root)[0].artifact_id == "known-bad-run" + assert load_json_artifact("assessments/run.json", root)["artifact_id"] == "known-bad-run" + + +def test_review_store_rejects_paths_outside_root(tmp_path): + root = tmp_path / "self-scoping" + write_json(root / "golden" / "profile.json", {"profile_id": "profile"}) + + with pytest.raises(ValueError): + load_json_artifact("../outside.json", root) + + +def test_record_assessment_outcome_is_append_only_json(tmp_path): + root = tmp_path / "self-scoping" + write_json(root / "golden" / "profile.json", {"profile_id": "profile-v1"}) + write_json( + root / "assessments" / "run.json", + { + "artifact_id": "run-1", + "engine_identity": {"engine_commit": "abc123", "engine_release": "v1"}, + }, + ) + + record = record_assessment_outcome( + golden_path="golden/profile.json", + assessment_path="assessments/run.json", + outcome="prefer_golden", + reviewer="codex", + notes="Known provider-routing regression remains present.", + comparison_status="regression", + root=root, + ) + + outcome_files = list((root / "outcomes").glob("*.json")) + assert len(outcome_files) == 1 + persisted = json.loads(outcome_files[0].read_text(encoding="utf-8")) + assert persisted == record + assert persisted["schema_version"] == "self-scoping-review-outcome/v1" + assert persisted["golden_profile_id"] == "profile-v1" + assert persisted["assessment_artifact_id"] == "run-1" + assert persisted["engine_identity"]["engine_commit"] == "abc123" + + +def test_record_assessment_pair_outcome_keeps_both_release_bindings(tmp_path): + root = tmp_path / "self-scoping" + write_json( + root / "assessments" / "baseline.json", + { + "artifact_id": "baseline-run", + "engine_identity": {"engine_commit": "old"}, + }, + ) + write_json( + root / "assessments" / "challenger.json", + { + "artifact_id": "challenger-run", + "engine_identity": {"engine_commit": "new"}, + }, + ) + + record = record_assessment_pair_outcome( + baseline_path="assessments/baseline.json", + challenger_path="assessments/challenger.json", + outcome="prefer_challenger", + reviewer="codex", + notes="Hierarchy is closer to native repo-scoping utility.", + comparison_status="needs_review", + root=root, + ) + + assert record["decision_scope"] == "assessment-pair-comparison" + assert record["baseline_assessment_artifact_id"] == "baseline-run" + assert record["challenger_assessment_artifact_id"] == "challenger-run" + assert record["baseline_engine_identity"]["engine_commit"] == "old" + assert record["challenger_engine_identity"]["engine_commit"] == "new" diff --git a/tests/test_self_scoping_web_ui.py b/tests/test_self_scoping_web_ui.py new file mode 100644 index 0000000..10e4115 --- /dev/null +++ b/tests/test_self_scoping_web_ui.py @@ -0,0 +1,158 @@ +import json + +from fastapi.testclient import TestClient + +from repo_registry.web_api.app import app + + +def write_json(path, payload): + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(payload), encoding="utf-8") + + +def seed_review_artifacts(root): + write_json( + root / "golden" / "profile.json", + { + "profile_id": "profile-v1", + "ability": { + "expected_capabilities": [ + { + "name": "Scan Repositories Into Observed Facts", + "primary_class": "analysis", + "expected_features": [], + } + ] + }, + "forbidden_native_capabilities": [ + {"name": "Route LLM Requests Across Providers"} + ], + }, + ) + write_json( + root / "assessments" / "baseline.json", + { + "artifact_id": "baseline-run", + "target_repository": {"repo_slug": "repo-scoping"}, + "engine_identity": {"engine_commit": "old"}, + "generated_tree": { + "abilities": [ + { + "name": "Map Repositories Into Reviewable Scope Profiles", + "capabilities": [ + { + "name": "Scan Repositories Into Observed Facts", + "primary_class": "analysis", + "features": [ + { + "name": "HTTP API surface", + "type": "API", + "location": "src/repo_registry/web_api/app.py", + } + ], + } + ], + } + ] + }, + "known_regression_patterns": [], + }, + ) + write_json( + root / "assessments" / "run.json", + { + "artifact_id": "run-1", + "target_repository": {"repo_slug": "repo-scoping"}, + "engine_identity": {"engine_commit": "abc123"}, + "generated_tree": { + "abilities": [ + { + "name": "Support Repo Registry", + "capabilities": [ + { + "name": "Route LLM Requests Across Providers", + "primary_class": "llm-integration", + "features": [ + { + "name": "HTTP API surface", + "type": "API", + "location": "src/repo_registry/web_api/app.py", + } + ], + } + ], + } + ] + }, + "known_regression_patterns": [], + }, + ) + + +def test_self_scoping_ui_compares_and_records_outcome(tmp_path, monkeypatch): + root = tmp_path / "self-scoping" + seed_review_artifacts(root) + monkeypatch.setenv("REPO_REGISTRY_SELF_SCOPING_ROOT", str(root)) + client = TestClient(app) + + index = client.get("/ui/self-scoping") + assert index.status_code == 200 + assert "Self-Scoping Review" in index.text + + review = client.get( + "/ui/self-scoping/review", + params={"golden": "golden/profile.json", "assessment": "assessments/run.json"}, + ) + assert review.status_code == 200 + assert "Route LLM Requests Across Providers" in review.text + assert "regression" in review.text + + saved = client.post( + "/ui/self-scoping/review", + data={ + "golden_path": "golden/profile.json", + "assessment_path": "assessments/run.json", + "outcome": "prefer_golden", + "reviewer": "codex", + "notes": "Provider routing is not native scope.", + "comparison_status": "regression", + }, + follow_redirects=False, + ) + assert saved.status_code == 303 + assert list((root / "outcomes").glob("*.json")) + + +def test_self_scoping_ui_compares_two_assessment_runs(tmp_path, monkeypatch): + root = tmp_path / "self-scoping" + seed_review_artifacts(root) + monkeypatch.setenv("REPO_REGISTRY_SELF_SCOPING_ROOT", str(root)) + client = TestClient(app) + + review = client.get( + "/ui/self-scoping/run-review", + params={ + "baseline": "assessments/baseline.json", + "challenger": "assessments/run.json", + }, + ) + assert review.status_code == 200 + assert "Assessment Run Comparison" in review.text + assert "Baseline only" in review.text + assert "Challenger only" in review.text + + saved = client.post( + "/ui/self-scoping/run-review", + data={ + "baseline_path": "assessments/baseline.json", + "challenger_path": "assessments/run.json", + "outcome": "prefer_baseline", + "reviewer": "codex", + "notes": "Baseline preserves native scanning capability.", + "comparison_status": "needs_review", + }, + follow_redirects=False, + ) + assert saved.status_code == 303 + records = [json.loads(path.read_text()) for path in (root / "outcomes").glob("*.json")] + assert any(record["decision_scope"] == "assessment-pair-comparison" for record in records) diff --git a/workplans/RREG-WP-0013-self-scoping-baseline-evaluation.md b/workplans/RREG-WP-0013-self-scoping-baseline-evaluation.md index 3635bad..4842f8d 100644 --- a/workplans/RREG-WP-0013-self-scoping-baseline-evaluation.md +++ b/workplans/RREG-WP-0013-self-scoping-baseline-evaluation.md @@ -4,7 +4,7 @@ type: workplan title: "Self-Scoping Baseline Evaluation" domain: capabilities repo: repo-scoping -status: active +status: done owner: codex topic_slug: foerster-capabilities created: "2026-05-15" @@ -208,7 +208,7 @@ capabilities. Reports can be emitted as JSON or Markdown. ```task id: RREG-WP-0013-T06 -status: todo +status: done priority: medium state_hub_task_id: "16a60b7c-7e2c-4bb0-b4ab-2381289dba0b" ``` @@ -223,6 +223,12 @@ Acceptance criteria: - Review decisions are persisted as assessment outcomes, not as changes to the underlying historical artifacts. +Implementation note 2026-05-15: added a file-backed `/ui/self-scoping` curator +surface that reads golden profiles and assessment artifacts from +`docs/self-scoping`, renders side-by-side hierarchy comparisons with regression +highlights, compares two assessment runs directly for old-vs-new judgement, and +records append-only review outcome JSON under `docs/self-scoping/outcomes/`. + ## T07: Add Self-Scoping Regression Command ```task