From 2dfe5c6dd68d190c93f4f4649d9da21e649056f7 Mon Sep 17 00:00:00 2001 From: tegwick Date: Fri, 15 May 2026 13:37:19 +0200 Subject: [PATCH] Document self-scoping assessment workflow --- docs/self-scoping/README.md | 2 + docs/self-scoping/workflow.md | 121 ++++++++++++++++++ tests/test_self_scoping_artifacts.py | 10 ++ ...P-0013-self-scoping-baseline-evaluation.md | 7 +- 4 files changed, 139 insertions(+), 1 deletion(-) create mode 100644 docs/self-scoping/workflow.md diff --git a/docs/self-scoping/README.md b/docs/self-scoping/README.md index 09fd8f1..9c8d50d 100644 --- a/docs/self-scoping/README.md +++ b/docs/self-scoping/README.md @@ -11,6 +11,8 @@ instead of relying on memory or screenshots. - `assessments/repo-scoping-known-bad-2026-05-15-run-39.json` captures the known-bad self-analysis that promoted LLM-provider vocabulary into native repo-scoping capability truth. +- `workflow.md` explains how to run challenger assessments, interpret outcomes, + and decide whether to update the golden profile or fix the engine. - `../schemas/self-scoping-assessment.schema.json` defines the immutable assessment-run artifact shape. diff --git a/docs/self-scoping/workflow.md b/docs/self-scoping/workflow.md new file mode 100644 index 0000000..e7ad494 --- /dev/null +++ b/docs/self-scoping/workflow.md @@ -0,0 +1,121 @@ +# Self-Scoping Assessment Workflow + +Self-scoping is the feedback loop for improving repo-scoping with evidence. The +loop is simple: run the current engine against repo-scoping itself, compare the +result to a curated golden profile and known bad runs, then record whether the +new result is better. + +## Outcome Terms + +- `baseline`: a result accepted as a reference point for later comparisons. +- `challenger`: a fresh result from a new engine version or configuration. +- `preferred`: the reviewer chose this result over the prior baseline. +- `tied`: the reviewer judged old and new results roughly equivalent. 
+- `rejected`: the result is known bad and should not become baseline truth. +- `superseded`: the result used to be useful but was replaced by a newer + preferred assessment. +- `needs-human`: the result cannot be judged confidently without curator + review. + +The known 2026-05-15 run 39 artifact is a `rejected` negative regression seed, +not a baseline to imitate. + +## Release Binding + +Assessment output is only useful if it is bound to the engine that generated it. +Comparable challenger artifacts should record: + +- repo-scoping package version +- engine git commit +- engine release or tag when available +- engine dirty state +- scanner version +- candidate generator version +- quality criteria version +- prompt version when LLM or agentic review is used + +An artifact with `release_binding_status=complete` can be compared as a real +challenger. An artifact with `release_binding_status=historical_incomplete` can +still be useful as a negative seed, but it should not become a preferred +baseline. An artifact with `release_binding_status=unbound` is diagnostic only. + +Dirty state does not automatically make an artifact useless, but it must be +visible. A dirty challenger should usually be rerun after the relevant changes +are committed. + +## Standard Loop + +1. Run the self-assessment command: + + ```bash + repo-scoping self-assess \ + --source-path . \ + --assessment-output docs/self-scoping/assessments/repo-scoping-challenger.json \ + --comparison-output docs/self-scoping/assessments/repo-scoping-challenger.md + ``` + +2. Read the comparison report. + +3. If the report says `regression`, inspect forbidden capabilities, misplaced + features, and known regression patterns first. + +4. If the report says `needs_review`, inspect missing expected capabilities and + source evidence before choosing old or new output. + +5. If the report says `candidate_improvement`, still confirm that the + hierarchy, source refs, and native-utility boundaries make sense. + +6. 
Record the decision as an assessment outcome before changing the active + baseline. + +## CI Use + +Use `--fail-on-regression` only when regressions should block the command: + +```bash +repo-scoping self-assess \ + --source-path . \ + --comparison-output /tmp/repo-scoping-self-assessment.md \ + --fail-on-regression +``` + +The command should not fail for ordinary `needs_review` results. Review-needed +output is signal, not a broken build. + +## Updating The Golden Profile + +Update `golden/repo-scoping-golden-profile.v1.json` when the repository's real +product utility has changed. Examples: + +- repo-scoping adds a genuinely new user-facing capability. +- a capability is renamed after curator agreement. +- a former out-of-scope behavior becomes product intent and has supporting + implementation evidence. + +Do not update the golden profile just because the engine failed to find an +expected capability. That is usually an engine issue. + +## Fixing The Engine + +Fix the engine when a challenger: + +- repeats a known regression pattern +- promotes dependency, fixture, schema, scanner-rule, or workplan vocabulary as + native capability truth +- places features under a capability they do not support +- loses source refs or cites evidence that does not support the abstraction +- relies on generated `SCOPE.md` as primary proof for rebuilding the same model + +The 2026-05-15 run 39 failure is the canonical example: provider vocabulary from +scanner code, tests, fixtures, and schema examples became the false native +capability `Route LLM Requests Across Providers`. The correct action is to fix +scanner/generator/acceptance behavior, not to teach the golden profile that +repo-scoping is an LLM router. + +## Relationship To Agentic Acceptance + +Deterministic assessment can reject, downgrade, or flag output with transparent +criteria. It should not approve candidate characteristics as registry truth. 
+When automation stands in for human review, the decision belongs to an agentic +reviewer that inspects evidence, applies versioned criteria, and records a +rationale. That acceptance redesign is tracked in `RREG-WP-0014`. diff --git a/tests/test_self_scoping_artifacts.py b/tests/test_self_scoping_artifacts.py index 60c64d3..70b379c 100644 --- a/tests/test_self_scoping_artifacts.py +++ b/tests/test_self_scoping_artifacts.py @@ -18,6 +18,7 @@ GOLDEN_PROFILE_PATH = ( / "golden" / "repo-scoping-golden-profile.v1.json" ) +WORKFLOW_PATH = ROOT / "docs" / "self-scoping" / "workflow.md" def load_json(path: Path) -> dict: @@ -117,3 +118,12 @@ def test_golden_profile_names_expected_native_capabilities_and_forbidden_false_p assert profile["comparison_rules"]["must_not_have_native_capability_names"] == [ "Route LLM Requests Across Providers" ] + + +def test_self_scoping_workflow_documents_decision_policy(): + content = WORKFLOW_PATH.read_text(encoding="utf-8") + + assert "release_binding_status=complete" in content + assert "Update `golden/repo-scoping-golden-profile.v1.json`" in content + assert "Fix the engine when a challenger" in content + assert "Deterministic assessment can reject, downgrade, or flag" in content diff --git a/workplans/RREG-WP-0013-self-scoping-baseline-evaluation.md b/workplans/RREG-WP-0013-self-scoping-baseline-evaluation.md index 0ae75b8..3635bad 100644 --- a/workplans/RREG-WP-0013-self-scoping-baseline-evaluation.md +++ b/workplans/RREG-WP-0013-self-scoping-baseline-evaluation.md @@ -254,7 +254,7 @@ agentic reviewer is configured. ```task id: RREG-WP-0013-T08 -status: todo +status: done priority: medium state_hub_task_id: "30d71946-3598-4dc7-9970-c7c18126cad7" ``` @@ -272,6 +272,11 @@ Acceptance criteria: - Documentation describes when to update the golden profile versus when to fix the engine. +Implementation note 2026-05-15: added `docs/self-scoping/workflow.md`. 
The +workflow documents assessment outcomes, release binding, the standard +self-assessment loop, CI use, when to update the golden profile, when to fix the +engine, and the relationship to RREG-WP-0014 agentic acceptance. + ## Completion Criteria - repo-scoping has an immutable, release-bound self-scoping assessment format.