generated from coulomb/repo-seed
Add self-scoping regression command
This commit is contained in:
@@ -60,3 +60,18 @@ repo-scoping compare-assessment \
|
||||
|
||||
The first comparison report highlights missing expected capabilities, forbidden
|
||||
native capabilities, known regression patterns, and misplaced API/CLI features.
|
||||
|
||||
Run the full self-assessment loop:
|
||||
|
||||
```bash
|
||||
repo-scoping self-assess \
|
||||
--source-path . \
|
||||
--assessment-output docs/self-scoping/assessments/repo-scoping-challenger.json \
|
||||
--comparison-output docs/self-scoping/assessments/repo-scoping-challenger.md
|
||||
```
|
||||
|
||||
By default this path is deterministic-only and leaves generated candidates
|
||||
pending review. Add `--with-llm` only when a provider is configured and the run
|
||||
should include LLM-assisted candidate extraction. Add `--fail-on-regression` in
|
||||
CI when known regressions should fail the command; ordinary `needs_review`
|
||||
comparisons still exit successfully.
|
||||
|
||||
@@ -99,6 +99,58 @@ def build_parser() -> argparse.ArgumentParser:
|
||||
default="markdown",
|
||||
help="Comparison report format.",
|
||||
)
|
||||
self_assess = subparsers.add_parser(
|
||||
"self-assess",
|
||||
help="Run repo-scoping against a source tree and compare the result to a golden profile.",
|
||||
)
|
||||
self_assess.add_argument(
|
||||
"--repo",
|
||||
default="repo-scoping",
|
||||
help="Repository id or exact repository name to reuse; created by name when absent.",
|
||||
)
|
||||
self_assess.add_argument(
|
||||
"--source-path",
|
||||
default=".",
|
||||
help="Source tree to analyze; defaults to the current working directory.",
|
||||
)
|
||||
self_assess.add_argument(
|
||||
"--golden",
|
||||
default="docs/self-scoping/golden/repo-scoping-golden-profile.v1.json",
|
||||
help="Golden profile JSON path.",
|
||||
)
|
||||
self_assess.add_argument(
|
||||
"--assessment-output",
|
||||
help="Write challenger assessment artifact JSON to this path.",
|
||||
)
|
||||
self_assess.add_argument(
|
||||
"--comparison-output",
|
||||
help="Write comparison report to this path instead of stdout.",
|
||||
)
|
||||
self_assess.add_argument(
|
||||
"--format",
|
||||
choices=["json", "markdown"],
|
||||
default="markdown",
|
||||
help="Comparison report format.",
|
||||
)
|
||||
self_assess.add_argument(
|
||||
"--with-llm",
|
||||
action="store_false",
|
||||
dest="no_llm",
|
||||
help="Use configured LLM assistance during the self-assessment run.",
|
||||
)
|
||||
self_assess.add_argument(
|
||||
"--agentic-review",
|
||||
action="store_true",
|
||||
help="Reserved for a configured agentic reviewer; currently errors when requested.",
|
||||
)
|
||||
self_assess.add_argument(
|
||||
"--fail-on-regression",
|
||||
action="store_true",
|
||||
help="Return exit code 1 only when comparison status is regression.",
|
||||
)
|
||||
self_assess.add_argument("--database-path", help="Override REPO_REGISTRY_DATABASE_PATH.")
|
||||
self_assess.add_argument("--checkout-root", help="Override REPO_REGISTRY_CHECKOUT_ROOT.")
|
||||
self_assess.set_defaults(no_llm=True)
|
||||
return parser
|
||||
|
||||
|
||||
@@ -111,6 +163,8 @@ def main(argv: Sequence[str] | None = None) -> int:
|
||||
return export_assessment_command(args, parser)
|
||||
if args.command == "compare-assessment":
|
||||
return compare_assessment_command(args)
|
||||
if args.command == "self-assess":
|
||||
return self_assess_command(args, parser)
|
||||
parser.error(f"unknown command: {args.command}")
|
||||
return 2
|
||||
|
||||
@@ -158,12 +212,57 @@ def compare_assessment_command(args: argparse.Namespace) -> int:
|
||||
else comparison_markdown(comparison)
|
||||
)
|
||||
if args.output:
|
||||
Path(args.output).write_text(content, encoding="utf-8")
|
||||
write_text(args.output, content)
|
||||
else:
|
||||
print(content, end="" if content.endswith("\n") else "\n")
|
||||
return 0
|
||||
|
||||
|
||||
def self_assess_command(
    args: argparse.Namespace,
    parser: argparse.ArgumentParser,
) -> int:
    """Analyze a source tree and compare the result to a golden profile.

    The command validates its inputs, analyzes ``args.source_path`` under the
    repository selected (or registered) for ``args.repo``, exports the run as
    a challenger assessment artifact, and compares that artifact to the golden
    profile at ``args.golden``.

    Args:
        args: Parsed ``self-assess`` arguments.
        parser: Parser used to report usage and runtime errors via
            ``parser.error``, which exits instead of returning.

    Returns:
        ``1`` when ``--fail-on-regression`` was given and the comparison
        status is ``"regression"``; ``0`` otherwise.
    """
    if args.agentic_review:
        parser.error("agentic review is not configured yet")

    # Validate every cheap input before touching the registry: a bad source
    # path or an unreadable golden profile should not cost a full analysis run
    # or leave a freshly registered repository behind.
    source_path = Path(args.source_path).expanduser().resolve()
    if not source_path.is_dir():
        parser.error(f"source path does not exist or is not a directory: {source_path}")
    golden = load_json(args.golden)

    service = service_from_args(args)
    repository = self_assessment_repository(service, args.repo, source_path)
    summary = service.analyze_repository(
        repository.id,
        source_path=str(source_path),
        use_llm_assistance=not args.no_llm,
        # Self-assessment never auto-approves generated candidates.
        trusted_auto_approve=False,
    )
    if summary.analysis_run.status != "completed":
        parser.error(summary.analysis_run.error_message or "analysis failed")

    artifact = export_assessment_artifact(
        service,
        repository.id,
        summary.analysis_run.id,
        role="challenger",
        outcome="challenger",
        reviewer="self-assess",
    )
    comparison = compare_assessment_to_golden(golden, artifact)

    if args.assessment_output:
        write_text(args.assessment_output, artifact_json(artifact))

    report = (
        comparison_json(comparison)
        if args.format == "json"
        else comparison_markdown(comparison)
    )
    if args.comparison_output:
        write_text(args.comparison_output, report)
    else:
        # Emit exactly one trailing newline regardless of the report's ending.
        print(report, end="" if report.endswith("\n") else "\n")

    # Only an explicit regression is CI-blocking; other statuses exit cleanly.
    if args.fail_on_regression and comparison["status"] == "regression":
        return 1
    return 0
|
||||
|
||||
|
||||
def export_assessment_command(
|
||||
args: argparse.Namespace,
|
||||
parser: argparse.ArgumentParser,
|
||||
@@ -190,7 +289,7 @@ def export_assessment_command(
|
||||
|
||||
content = artifact_json(artifact)
|
||||
if args.output:
|
||||
Path(args.output).write_text(content, encoding="utf-8")
|
||||
write_text(args.output, content)
|
||||
else:
|
||||
print(content, end="")
|
||||
return 0
|
||||
@@ -231,6 +330,29 @@ def selected_repositories(
|
||||
return [repository for repository in repositories if repository.name == repo]
|
||||
|
||||
|
||||
def self_assessment_repository(
    service: RegistryService,
    repo: str,
    source_path: Path,
) -> Repository:
    """Return the repository to self-assess, registering it when absent.

    The lookup goes through ``selected_repositories``. When nothing matches,
    an all-digit ``repo`` raises ``NotFoundError``; any other value is
    registered as a new repository named ``repo`` whose URL is the source
    path.
    """
    lookup = argparse.Namespace(repo=repo, all=False)
    matches = selected_repositories(service, lookup)
    if not matches:
        if repo.isdigit():
            raise NotFoundError(f"repository {repo} was not found")
        return service.register_repository(
            name=repo,
            url=str(source_path),
            description="Self-scoping assessment target.",
        )
    return matches[0]
|
||||
|
||||
|
||||
def write_text(path: str | Path, content: str) -> None:
    """Write ``content`` to ``path`` as UTF-8, creating parent directories.

    Missing parent directories are created first; existing ones are left
    untouched.
    """
    destination = Path(path)
    parent_dir = destination.parent
    parent_dir.mkdir(parents=True, exist_ok=True)
    destination.write_text(content, encoding="utf-8")
|
||||
|
||||
|
||||
def rebuild_summary_line(
|
||||
service: RegistryService,
|
||||
result: CharacteristicRebuildResult,
|
||||
|
||||
@@ -154,3 +154,100 @@ def test_compare_assessment_cli_writes_markdown_report(tmp_path):
|
||||
assert exit_code == 0
|
||||
assert "Status: `regression`" in report
|
||||
assert "Route LLM Requests Across Providers" in report
|
||||
|
||||
|
||||
def test_self_assess_cli_exports_challenger_and_comparison(tmp_path):
    """self-assess writes both the challenger artifact and the JSON comparison."""
    source = write_repo(tmp_path)
    golden_profile = {
        "profile_id": "test-golden",
        "ability": {
            "expected_capabilities": [
                {"name": "Expose Repository Interface"}
            ]
        },
        "forbidden_native_capabilities": [],
    }
    golden_path = tmp_path / "golden.json"
    golden_path.write_text(json.dumps(golden_profile), encoding="utf-8")

    # Outputs go into a directory that does not exist yet.
    output_dir = tmp_path / "out"
    assessment_path = output_dir / "assessment.json"
    comparison_path = output_dir / "comparison.json"

    argv = [
        "self-assess",
        "--repo", "Self Assess Repo",
        "--source-path", str(source),
        "--golden", str(golden_path),
        "--assessment-output", str(assessment_path),
        "--comparison-output", str(comparison_path),
        "--format", "json",
        "--database-path", str(tmp_path / "registry.sqlite3"),
        "--checkout-root", str(tmp_path / "checkouts"),
    ]
    exit_code = main(argv)

    assessment = json.loads(assessment_path.read_text(encoding="utf-8"))
    comparison = json.loads(comparison_path.read_text(encoding="utf-8"))
    assert exit_code == 0
    assert assessment["target_repository"]["repo_slug"] == "self-assess-repo"
    assert assessment["execution"]["mode"] == "deterministic-only"
    assert comparison["status"] == "candidate_improvement"
    assert comparison["matched_expected_capabilities"] == [
        "Expose Repository Interface"
    ]
|
||||
|
||||
|
||||
def test_self_assess_cli_can_fail_on_regression(tmp_path):
    """--fail-on-regression makes self-assess exit with code 1 for this repo."""
    source = tmp_path / "provider-repo"
    source.mkdir()
    (source / "README.md").write_text("# Provider Repo\n", encoding="utf-8")
    (source / "providers.py").write_text(
        "provider_registry = {'openrouter': OpenRouterAdapter}\n",
        encoding="utf-8",
    )

    golden_profile = {
        "profile_id": "test-golden",
        "ability": {"expected_capabilities": []},
        "forbidden_native_capabilities": [
            {"name": "Route LLM Requests Across Providers"}
        ],
    }
    golden_path = tmp_path / "golden.json"
    golden_path.write_text(json.dumps(golden_profile), encoding="utf-8")

    argv = [
        "self-assess",
        "--repo", "Provider Repo",
        "--source-path", str(source),
        "--golden", str(golden_path),
        "--format", "json",
        "--fail-on-regression",
        "--database-path", str(tmp_path / "registry.sqlite3"),
        "--checkout-root", str(tmp_path / "checkouts"),
    ]
    exit_code = main(argv)

    assert exit_code == 1
|
||||
|
||||
@@ -227,7 +227,7 @@ Acceptance criteria:
|
||||
|
||||
```task
|
||||
id: RREG-WP-0013-T07
|
||||
status: todo
|
||||
status: done
|
||||
priority: medium
|
||||
state_hub_task_id: "af1fcecd-686d-4592-b739-4698abc98c55"
|
||||
```
|
||||
@@ -242,6 +242,14 @@ Acceptance criteria:
|
||||
- The command emits a comparison report and exits non-zero only for explicit
|
||||
CI-blocking regressions, not for ordinary "needs review" assessment outcomes.
|
||||
|
||||
Implementation note 2026-05-15: added `repo-scoping self-assess`. The command
|
||||
analyzes a source tree, exports a challenger assessment artifact, compares it to
|
||||
the golden profile, emits JSON or Markdown, and returns non-zero only with
|
||||
`--fail-on-regression` when the comparison status is `regression`. The command
|
||||
defaults to deterministic-only; `--with-llm` opts into configured LLM assistance.
|
||||
`--agentic-review` is reserved for RREG-WP-0014 and currently always errors when
|
||||
requested, because no agentic reviewer can be configured yet.
|
||||
|
||||
## T08: Document Assessment Workflow
|
||||
|
||||
```task
|
||||
|
||||
Reference in New Issue
Block a user