generated from coulomb/repo-seed
Add self-scoping baseline workplans and artifacts
This commit is contained in:
442
docs/schemas/self-scoping-assessment.schema.json
Normal file
442
docs/schemas/self-scoping-assessment.schema.json
Normal file
@@ -0,0 +1,442 @@
|
||||
{
|
||||
"$schema": "https://json-schema.org/draft/2020-12/schema",
|
||||
"$id": "https://repo-scoping.local/schemas/self-scoping-assessment.schema.json",
|
||||
"title": "Self-Scoping Assessment Artifact",
|
||||
"description": "Immutable artifact used to compare repo-scoping self-analysis results across engine releases.",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"schema_version",
|
||||
"artifact_id",
|
||||
"artifact_type",
|
||||
"created_at",
|
||||
"target_repository",
|
||||
"engine_identity",
|
||||
"execution",
|
||||
"assessment",
|
||||
"fact_summary",
|
||||
"generated_tree",
|
||||
"known_regression_patterns"
|
||||
],
|
||||
"properties": {
|
||||
"schema_version": {
|
||||
"const": "self-scoping-assessment/v1"
|
||||
},
|
||||
"artifact_id": {
|
||||
"type": "string",
|
||||
"description": "Stable artifact identifier."
|
||||
},
|
||||
"artifact_type": {
|
||||
"enum": ["assessment_run"]
|
||||
},
|
||||
"created_at": {
|
||||
"type": "string",
|
||||
"format": "date-time"
|
||||
},
|
||||
"target_repository": {
|
||||
"$ref": "#/$defs/targetRepository"
|
||||
},
|
||||
"engine_identity": {
|
||||
"$ref": "#/$defs/engineIdentity"
|
||||
},
|
||||
"execution": {
|
||||
"$ref": "#/$defs/execution"
|
||||
},
|
||||
"assessment": {
|
||||
"$ref": "#/$defs/assessment"
|
||||
},
|
||||
"fact_summary": {
|
||||
"$ref": "#/$defs/factSummary"
|
||||
},
|
||||
"generated_tree": {
|
||||
"$ref": "#/$defs/generatedTree"
|
||||
},
|
||||
"known_regression_patterns": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/$defs/regressionPattern"
|
||||
}
|
||||
},
|
||||
"notes": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"$defs": {
|
||||
"targetRepository": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"repo_slug",
|
||||
"repository_id",
|
||||
"source",
|
||||
"target_commit",
|
||||
"target_branch",
|
||||
"dirty_state",
|
||||
"file_count"
|
||||
],
|
||||
"properties": {
|
||||
"repo_slug": {
|
||||
"type": "string"
|
||||
},
|
||||
"repository_id": {
|
||||
"type": ["integer", "null"]
|
||||
},
|
||||
"source": {
|
||||
"type": "string"
|
||||
},
|
||||
"target_commit": {
|
||||
"type": "string"
|
||||
},
|
||||
"target_branch": {
|
||||
"type": "string"
|
||||
},
|
||||
"dirty_state": {
|
||||
"enum": ["clean", "dirty", "unknown"]
|
||||
},
|
||||
"file_count": {
|
||||
"type": ["integer", "null"],
|
||||
"minimum": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"engineIdentity": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"repo_scoping_version",
|
||||
"engine_commit",
|
||||
"engine_release",
|
||||
"engine_dirty_state",
|
||||
"scanner_version",
|
||||
"candidate_generator_version",
|
||||
"quality_criteria_version",
|
||||
"prompt_version",
|
||||
"release_binding_status"
|
||||
],
|
||||
"properties": {
|
||||
"repo_scoping_version": {
|
||||
"type": "string"
|
||||
},
|
||||
"engine_commit": {
|
||||
"type": ["string", "null"]
|
||||
},
|
||||
"engine_release": {
|
||||
"type": ["string", "null"]
|
||||
},
|
||||
"engine_dirty_state": {
|
||||
"enum": ["clean", "dirty", "unknown"]
|
||||
},
|
||||
"scanner_version": {
|
||||
"type": "string"
|
||||
},
|
||||
"candidate_generator_version": {
|
||||
"type": "string"
|
||||
},
|
||||
"quality_criteria_version": {
|
||||
"type": "string"
|
||||
},
|
||||
"prompt_version": {
|
||||
"type": ["string", "null"]
|
||||
},
|
||||
"release_binding_status": {
|
||||
"enum": ["complete", "historical_incomplete", "unbound"]
|
||||
},
|
||||
"release_binding_note": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"execution": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"mode",
|
||||
"analysis_run_id",
|
||||
"candidate_source",
|
||||
"acceptance_mode"
|
||||
],
|
||||
"properties": {
|
||||
"mode": {
|
||||
"enum": [
|
||||
"deterministic-only",
|
||||
"llm-assisted",
|
||||
"agent-reviewed",
|
||||
"manual-review",
|
||||
"trusted-auto-review",
|
||||
"mixed"
|
||||
]
|
||||
},
|
||||
"analysis_run_id": {
|
||||
"type": ["integer", "null"]
|
||||
},
|
||||
"candidate_source": {
|
||||
"type": "string"
|
||||
},
|
||||
"acceptance_mode": {
|
||||
"type": "string"
|
||||
},
|
||||
"started_at": {
|
||||
"type": ["string", "null"],
|
||||
"format": "date-time"
|
||||
},
|
||||
"completed_at": {
|
||||
"type": ["string", "null"],
|
||||
"format": "date-time"
|
||||
}
|
||||
}
|
||||
},
|
||||
"assessment": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required": [
|
||||
"role",
|
||||
"outcome",
|
||||
"summary",
|
||||
"reviewer",
|
||||
"comparison_eligibility"
|
||||
],
|
||||
"properties": {
|
||||
"role": {
|
||||
"enum": ["baseline", "challenger", "negative_regression_seed"]
|
||||
},
|
||||
"outcome": {
|
||||
"enum": ["baseline", "challenger", "preferred", "tied", "rejected", "superseded", "needs-human"]
|
||||
},
|
||||
"summary": {
|
||||
"type": "string"
|
||||
},
|
||||
"reviewer": {
|
||||
"type": "string"
|
||||
},
|
||||
"comparison_eligibility": {
|
||||
"enum": ["eligible", "eligible_as_negative_seed", "not_comparable"]
|
||||
},
|
||||
"rationale": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"factSummary": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required": ["counts_by_kind"],
|
||||
"properties": {
|
||||
"counts_by_kind": {
|
||||
"type": "object",
|
||||
"additionalProperties": {
|
||||
"type": "integer",
|
||||
"minimum": 0
|
||||
}
|
||||
},
|
||||
"contamination_sources": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/$defs/contaminationSource"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"contaminationSource": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required": ["path", "reason"],
|
||||
"properties": {
|
||||
"path": {
|
||||
"type": "string"
|
||||
},
|
||||
"reason": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"generatedTree": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required": ["abilities"],
|
||||
"properties": {
|
||||
"abilities": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/$defs/ability"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"ability": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required": ["name", "status", "primary_class", "capabilities"],
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"status": {
|
||||
"type": "string"
|
||||
},
|
||||
"primary_class": {
|
||||
"type": "string"
|
||||
},
|
||||
"capabilities": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/$defs/capability"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"capability": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required": ["name", "status", "primary_class", "features"],
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"status": {
|
||||
"type": "string"
|
||||
},
|
||||
"primary_class": {
|
||||
"type": "string"
|
||||
},
|
||||
"features": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/$defs/feature"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"feature": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required": ["name", "type", "status", "primary_class", "location"],
|
||||
"properties": {
|
||||
"name": {
|
||||
"type": "string"
|
||||
},
|
||||
"type": {
|
||||
"type": "string"
|
||||
},
|
||||
"status": {
|
||||
"type": "string"
|
||||
},
|
||||
"primary_class": {
|
||||
"type": "string"
|
||||
},
|
||||
"location": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
},
|
||||
"regressionPattern": {
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"required": ["id", "title", "severity", "description", "detection_hint"],
|
||||
"properties": {
|
||||
"id": {
|
||||
"type": "string"
|
||||
},
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"severity": {
|
||||
"enum": ["low", "medium", "high", "critical"]
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"detection_hint": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"examples": [
|
||||
{
|
||||
"schema_version": "self-scoping-assessment/v1",
|
||||
"artifact_id": "repo-scoping-known-bad-2026-05-15-run-39",
|
||||
"artifact_type": "assessment_run",
|
||||
"created_at": "2026-05-15T09:28:48Z",
|
||||
"target_repository": {
|
||||
"repo_slug": "repo-scoping",
|
||||
"repository_id": 16,
|
||||
"source": "/home/worsch/repo-scoping/var/checkouts/repo-scoping-8a9c4168485c",
|
||||
"target_commit": "00b57d509124789059639fedc724d9314edbb7b2",
|
||||
"target_branch": "main",
|
||||
"dirty_state": "unknown",
|
||||
"file_count": 96
|
||||
},
|
||||
"engine_identity": {
|
||||
"repo_scoping_version": "0.1.0",
|
||||
"engine_commit": null,
|
||||
"engine_release": null,
|
||||
"engine_dirty_state": "unknown",
|
||||
"scanner_version": "deterministic-v0.1",
|
||||
"candidate_generator_version": "unversioned",
|
||||
"quality_criteria_version": "none",
|
||||
"prompt_version": null,
|
||||
"release_binding_status": "historical_incomplete",
|
||||
"release_binding_note": "Historical database run did not record engine commit."
|
||||
},
|
||||
"execution": {
|
||||
"mode": "trusted-auto-review",
|
||||
"analysis_run_id": 39,
|
||||
"candidate_source": "deterministic",
|
||||
"acceptance_mode": "trusted_auto_approve_candidate_graph",
|
||||
"started_at": "2026-05-15T09:28:47Z",
|
||||
"completed_at": "2026-05-15T09:28:48Z"
|
||||
},
|
||||
"assessment": {
|
||||
"role": "negative_regression_seed",
|
||||
"outcome": "rejected",
|
||||
"summary": "Provider vocabulary was promoted into a false native LLM routing capability.",
|
||||
"reviewer": "codex",
|
||||
"comparison_eligibility": "eligible_as_negative_seed",
|
||||
"rationale": ["The generated tree misclassified scanner vocabulary as product behavior."]
|
||||
},
|
||||
"fact_summary": {
|
||||
"counts_by_kind": {
|
||||
"llm_provider": 41
|
||||
},
|
||||
"contamination_sources": [
|
||||
{
|
||||
"path": "src/repo_registry/repo_scanning/scanner.py",
|
||||
"reason": "Scanner rule vocabulary was treated as repo-owned capability evidence."
|
||||
}
|
||||
]
|
||||
},
|
||||
"generated_tree": {
|
||||
"abilities": [
|
||||
{
|
||||
"name": "Support Repo Registry",
|
||||
"status": "approved",
|
||||
"primary_class": "repository-intelligence",
|
||||
"capabilities": [
|
||||
{
|
||||
"name": "Route LLM Requests Across Providers",
|
||||
"status": "approved",
|
||||
"primary_class": "llm-integration",
|
||||
"features": []
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"known_regression_patterns": [
|
||||
{
|
||||
"id": "RREG-SELF-REG-001",
|
||||
"title": "LLM provider vocabulary promoted as native capability",
|
||||
"severity": "critical",
|
||||
"description": "Scanner or fixture vocabulary becomes a repo-scoping product capability.",
|
||||
"detection_hint": "Flag Route LLM Requests Across Providers when parented as a native repo-scoping capability."
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
35
docs/self-scoping/README.md
Normal file
35
docs/self-scoping/README.md
Normal file
@@ -0,0 +1,35 @@
|
||||
# Self-Scoping Assessment Artifacts
|
||||
|
||||
This directory contains repo-scoping's own baseline and assessment artifacts.
|
||||
These files are meant to make scoping-engine changes comparable across releases
|
||||
instead of relying on memory or screenshots.
|
||||
|
||||
## Artifact Types
|
||||
|
||||
- `golden/repo-scoping-golden-profile.v1.json` is the curated target profile for
|
||||
repo-scoping itself.
|
||||
- `assessments/repo-scoping-known-bad-2026-05-15-run-39.json` captures the
|
||||
known-bad self-analysis that promoted LLM-provider vocabulary into native
|
||||
repo-scoping capability truth.
|
||||
- `../schemas/self-scoping-assessment.schema.json` defines the immutable
|
||||
assessment-run artifact shape.
|
||||
|
||||
## Release Binding
|
||||
|
||||
Comparable assessment artifacts must bind generated results to the repo-scoping
|
||||
engine release that produced them. A complete binding records package version,
|
||||
engine git commit or release tag, dirty state, scanner version, candidate
|
||||
generator version, quality criteria version, and prompt version when applicable.
|
||||
|
||||
The current known-bad artifact is marked `historical_incomplete` because the
|
||||
original database run did not record the engine commit. It remains useful as a
|
||||
negative regression seed, but future challenger artifacts should be fully bound
|
||||
before they are accepted as comparable baselines.
|
||||
|
||||
## Review Use
|
||||
|
||||
When the engine changes, run repo-scoping against itself and export a challenger
|
||||
assessment. Compare the challenger to the golden profile and to the negative
|
||||
seed. Reviewers should be able to choose whether the old result, new result, or
|
||||
neither is better, then store that judgement as a new assessment outcome.
|
||||
|
||||
209
docs/self-scoping/assessments/repo-scoping-known-bad-2026-05-15-run-39.json
Normal file
@@ -0,0 +1,209 @@
|
||||
{
|
||||
"schema_version": "self-scoping-assessment/v1",
|
||||
"artifact_id": "repo-scoping-known-bad-2026-05-15-run-39",
|
||||
"artifact_type": "assessment_run",
|
||||
"created_at": "2026-05-15T09:28:48Z",
|
||||
"target_repository": {
|
||||
"repo_slug": "repo-scoping",
|
||||
"repository_id": 16,
|
||||
"source": "/home/worsch/repo-scoping/var/checkouts/repo-scoping-8a9c4168485c",
|
||||
"target_commit": "00b57d509124789059639fedc724d9314edbb7b2",
|
||||
"target_branch": "main",
|
||||
"dirty_state": "unknown",
|
||||
"file_count": 96
|
||||
},
|
||||
"engine_identity": {
|
||||
"repo_scoping_version": "0.1.0",
|
||||
"engine_commit": null,
|
||||
"engine_release": null,
|
||||
"engine_dirty_state": "unknown",
|
||||
"scanner_version": "deterministic-v0.1",
|
||||
"candidate_generator_version": "unversioned-pre-self-scoping-baseline",
|
||||
"quality_criteria_version": "none",
|
||||
"prompt_version": null,
|
||||
"release_binding_status": "historical_incomplete",
|
||||
"release_binding_note": "This historical database run recorded scanner version and target commit, but not the repo-scoping engine commit or release tag that generated the candidate graph."
|
||||
},
|
||||
"execution": {
|
||||
"mode": "trusted-auto-review",
|
||||
"analysis_run_id": 39,
|
||||
"candidate_source": "deterministic",
|
||||
"acceptance_mode": "trusted_auto_approve_candidate_graph",
|
||||
"started_at": "2026-05-15T09:28:47Z",
|
||||
"completed_at": "2026-05-15T09:28:48Z"
|
||||
},
|
||||
"assessment": {
|
||||
"role": "negative_regression_seed",
|
||||
"outcome": "rejected",
|
||||
"summary": "The self-analysis promoted LLM-provider vocabulary into a false native repo-scoping capability and attached API/CLI features below it.",
|
||||
"reviewer": "codex",
|
||||
"comparison_eligibility": "eligible_as_negative_seed",
|
||||
"rationale": [
|
||||
"repo-scoping uses llm-connect as optional extraction infrastructure; it does not natively route LLM requests across providers.",
|
||||
"Provider names came from scanner rules, normalization tokens, schema examples, tests, fixtures, and workplan text rather than product-facing provider-routing behavior.",
|
||||
"The generated tree placed native API and CLI surfaces under the false LLM-provider capability, which makes the feature hierarchy misleading."
|
||||
]
|
||||
},
|
||||
"fact_summary": {
|
||||
"counts_by_kind": {
|
||||
"config": 1,
|
||||
"credential_config": 13,
|
||||
"documentation": 14,
|
||||
"fallback_policy": 10,
|
||||
"framework": 2,
|
||||
"intent": 1,
|
||||
"interface": 127,
|
||||
"language": 1,
|
||||
"llm_provider": 41,
|
||||
"manifest": 1,
|
||||
"provider_registry": 7,
|
||||
"scope": 1,
|
||||
"test": 19
|
||||
},
|
||||
"contamination_sources": [
|
||||
{
|
||||
"path": "src/repo_registry/repo_scanning/scanner.py",
|
||||
"reason": "Provider detector constants, credential hint constants, and fallback/provider-registry scanner logic were treated as repo-owned LLM routing evidence."
|
||||
},
|
||||
{
|
||||
"path": "src/repo_registry/candidate_graph/normalization.py",
|
||||
"reason": "Provider names used as distinctive candidate-normalization tokens were treated as implementation evidence for provider support."
|
||||
},
|
||||
{
|
||||
"path": "src/repo_registry/web_api/schemas.py",
|
||||
"reason": "An OpenRouter example in an expectation-gap schema was treated as provider evidence."
|
||||
},
|
||||
{
|
||||
"path": "tests/expectations/llm_connect_provider_expectations.json",
|
||||
"reason": "A fixture describing llm-connect expectations was treated as repo-scoping product behavior."
|
||||
},
|
||||
{
|
||||
"path": "tests/fixtures.py",
|
||||
"reason": "Regression fixture vocabulary was treated as native repo-scoping capability evidence."
|
||||
},
|
||||
{
|
||||
"path": "tests/test_candidate_graph.py",
|
||||
"reason": "Unit-test examples for LLM-provider detection were treated as product evidence."
|
||||
},
|
||||
{
|
||||
"path": "tests/test_repository_scanner.py",
|
||||
"reason": "Scanner tests for provider facts were treated as native provider-routing evidence."
|
||||
}
|
||||
]
|
||||
},
|
||||
"generated_tree": {
|
||||
"abilities": [
|
||||
{
|
||||
"name": "Support Repo Registry",
|
||||
"status": "approved",
|
||||
"primary_class": "repository-intelligence",
|
||||
"capabilities": [
|
||||
{
|
||||
"name": "Route LLM Requests Across Providers",
|
||||
"status": "approved",
|
||||
"primary_class": "llm-integration",
|
||||
"features": [
|
||||
{
|
||||
"name": "Use Anthropic Models",
|
||||
"type": "integration",
|
||||
"status": "approved",
|
||||
"primary_class": "integration",
|
||||
"location": "multiple files"
|
||||
},
|
||||
{
|
||||
"name": "Use Claude Models",
|
||||
"type": "integration",
|
||||
"status": "approved",
|
||||
"primary_class": "integration",
|
||||
"location": "multiple files"
|
||||
},
|
||||
{
|
||||
"name": "Use Gemini Models",
|
||||
"type": "integration",
|
||||
"status": "approved",
|
||||
"primary_class": "integration",
|
||||
"location": "multiple files"
|
||||
},
|
||||
{
|
||||
"name": "Use OpenAI Models",
|
||||
"type": "integration",
|
||||
"status": "approved",
|
||||
"primary_class": "integration",
|
||||
"location": "multiple files"
|
||||
},
|
||||
{
|
||||
"name": "Use OpenRouter Models",
|
||||
"type": "integration",
|
||||
"status": "approved",
|
||||
"primary_class": "integration",
|
||||
"location": "multiple files"
|
||||
},
|
||||
{
|
||||
"name": "Configure LLM Provider Credentials",
|
||||
"type": "configuration",
|
||||
"status": "approved",
|
||||
"primary_class": "configuration",
|
||||
"location": "multiple files"
|
||||
},
|
||||
{
|
||||
"name": "Maintain LLM Provider Registry",
|
||||
"type": "backend",
|
||||
"status": "approved",
|
||||
"primary_class": "backend",
|
||||
"location": "src/repo_registry/repo_scanning/scanner.py"
|
||||
},
|
||||
{
|
||||
"name": "Apply LLM Provider Fallback Policy",
|
||||
"type": "backend",
|
||||
"status": "approved",
|
||||
"primary_class": "backend",
|
||||
"location": "src/repo_registry/repo_scanning/scanner.py"
|
||||
},
|
||||
{
|
||||
"name": "HTTP API surface: possible API surface, GET /health, @app.post(, +43 more",
|
||||
"type": "API",
|
||||
"status": "approved",
|
||||
"primary_class": "API",
|
||||
"location": "multiple files"
|
||||
},
|
||||
{
|
||||
"name": "CLI command surface: CLI command build_parser, CLI command make_service",
|
||||
"type": "CLI",
|
||||
"status": "approved",
|
||||
"primary_class": "CLI",
|
||||
"location": "multiple files"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"known_regression_patterns": [
|
||||
{
|
||||
"id": "RREG-SELF-REG-001",
|
||||
"title": "LLM provider vocabulary promoted as native capability",
|
||||
"severity": "critical",
|
||||
"description": "Scanner, normalization, schema, fixture, test, or workplan vocabulary becomes the native repo-scoping capability Route LLM Requests Across Providers.",
|
||||
"detection_hint": "Flag any top-level/native repo-scoping capability named Route LLM Requests Across Providers unless product intent and public implementation explicitly show provider routing as a repo-scoping feature."
|
||||
},
|
||||
{
|
||||
"id": "RREG-SELF-REG-002",
|
||||
"title": "Native API and CLI surfaces attached under false capability",
|
||||
"severity": "high",
|
||||
"description": "General repo-scoping API/CLI interface features are nested below a capability they do not support.",
|
||||
"detection_hint": "Flag API or CLI surface features when their parent capability is llm-integration or provider-routing."
|
||||
},
|
||||
{
|
||||
"id": "RREG-SELF-REG-003",
|
||||
"title": "Deterministic trusted auto-approval accepted candidate truth",
|
||||
"severity": "high",
|
||||
"description": "A deterministic rule path approves candidate characteristics without human or agentic judgement.",
|
||||
"detection_hint": "Flag trusted_auto_approve_candidate_graph review decisions in self-scoping assessment artifacts."
|
||||
}
|
||||
],
|
||||
"notes": [
|
||||
"This artifact is a negative regression seed, not a desirable baseline.",
|
||||
"The historical run is useful for pattern detection but is not fully release-bound because the engine commit was not recorded in the original analysis metadata."
|
||||
]
|
||||
}
|
||||
311
docs/self-scoping/golden/repo-scoping-golden-profile.v1.json
Normal file
311
docs/self-scoping/golden/repo-scoping-golden-profile.v1.json
Normal file
@@ -0,0 +1,311 @@
|
||||
{
|
||||
"schema_version": "self-scoping-golden-profile/v1",
|
||||
"profile_id": "repo-scoping-golden-profile-v1",
|
||||
"repo_slug": "repo-scoping",
|
||||
"status": "active",
|
||||
"created_at": "2026-05-15",
|
||||
"updated_at": "2026-05-15",
|
||||
"curation": {
|
||||
"curator": "codex",
|
||||
"workplan_id": "RREG-WP-0013",
|
||||
"summary": "Curated target profile for evaluating repo-scoping self-analysis quality."
|
||||
},
|
||||
"ability": {
|
||||
"name": "Map Repositories Into Reviewable Scope Profiles",
|
||||
"primary_class": "repository-intelligence",
|
||||
"attributes": [
|
||||
"capability-mapping",
|
||||
"source-linked-review",
|
||||
"scope-generation"
|
||||
],
|
||||
"description": "repo-scoping turns repository source, documentation, and review decisions into source-linked maps of repository utility.",
|
||||
"expected_capabilities": [
|
||||
{
|
||||
"name": "Register And Track Repositories",
|
||||
"primary_class": "ingestion",
|
||||
"attributes": ["metadata", "git", "analysis-run"],
|
||||
"expected_features": [
|
||||
{
|
||||
"name": "Create and update repository records",
|
||||
"primary_class": "api",
|
||||
"source_paths": [
|
||||
"src/repo_registry/core/service.py",
|
||||
"src/repo_registry/web_api/app.py",
|
||||
"src/repo_registry/web_ui/views.py"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Resolve local or remote Git checkouts",
|
||||
"primary_class": "backend",
|
||||
"source_paths": [
|
||||
"src/repo_registry/repo_ingestion/git.py",
|
||||
"tests/test_git_ingestion.py"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Import repository metadata",
|
||||
"primary_class": "backend",
|
||||
"source_paths": [
|
||||
"src/repo_registry/repo_ingestion/metadata.py",
|
||||
"tests/test_repository_metadata.py"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Scan Repositories Into Observed Facts",
|
||||
"primary_class": "analysis",
|
||||
"attributes": ["deterministic", "facts", "provenance"],
|
||||
"expected_features": [
|
||||
{
|
||||
"name": "Detect source languages, manifests, docs, tests, config, and interfaces",
|
||||
"primary_class": "backend",
|
||||
"source_paths": [
|
||||
"src/repo_registry/repo_scanning/scanner.py",
|
||||
"tests/test_repository_scanner.py"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Classify source roles for facts",
|
||||
"primary_class": "backend",
|
||||
"source_paths": [
|
||||
"src/repo_registry/repo_scanning/scanner.py",
|
||||
"docs/characteristic-evidence-model.md"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Preserve analysis snapshots and fact records",
|
||||
"primary_class": "storage",
|
||||
"source_paths": [
|
||||
"src/repo_registry/storage/sqlite.py",
|
||||
"migrations/0001_initial.sql"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Index Source Content With Provenance",
|
||||
"primary_class": "analysis",
|
||||
"attributes": ["content-chunks", "source-role"],
|
||||
"expected_features": [
|
||||
{
|
||||
"name": "Create source-linked content chunks from observed facts",
|
||||
"primary_class": "backend",
|
||||
"source_paths": [
|
||||
"src/repo_registry/content_indexing/extractor.py",
|
||||
"tests/test_content_indexing.py"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Carry source-role metadata into downstream generation",
|
||||
"primary_class": "backend",
|
||||
"source_paths": [
|
||||
"src/repo_registry/content_indexing/extractor.py",
|
||||
"src/repo_registry/llm_extraction/extractor.py"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Generate Reviewable Candidate Characteristics",
|
||||
"primary_class": "analysis",
|
||||
"attributes": ["candidate-graph", "review-required"],
|
||||
"expected_features": [
|
||||
{
|
||||
"name": "Build candidate abilities, capabilities, features, and evidence",
|
||||
"primary_class": "backend",
|
||||
"source_paths": [
|
||||
"src/repo_registry/candidate_graph/generator.py",
|
||||
"src/repo_registry/candidate_graph/normalization.py",
|
||||
"tests/test_candidate_graph.py"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Optionally map structured LLM extraction into candidates",
|
||||
"primary_class": "integration",
|
||||
"source_paths": [
|
||||
"src/repo_registry/llm_extraction/extractor.py",
|
||||
"src/repo_registry/llm_extraction/mapper.py",
|
||||
"tests/test_llm_extraction.py"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Review And Approve Candidate Characteristics",
|
||||
"primary_class": "review",
|
||||
"attributes": ["curation", "approval", "audit"],
|
||||
"expected_features": [
|
||||
{
|
||||
"name": "Edit, reject, merge, and relink candidate graph entries",
|
||||
"primary_class": "api",
|
||||
"source_paths": [
|
||||
"src/repo_registry/core/service.py",
|
||||
"src/repo_registry/web_api/app.py",
|
||||
"src/repo_registry/web_ui/views.py",
|
||||
"tests/test_registry_service.py"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Publish approved characteristic maps after review",
|
||||
"primary_class": "storage",
|
||||
"source_paths": [
|
||||
"src/repo_registry/core/service.py",
|
||||
"src/repo_registry/storage/sqlite.py"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Record review decisions and expectation gaps",
|
||||
"primary_class": "audit",
|
||||
"source_paths": [
|
||||
"src/repo_registry/core/service.py",
|
||||
"src/repo_registry/web_api/schemas.py"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Search Compare And Export Approved Profiles",
|
||||
"primary_class": "discovery",
|
||||
"attributes": ["search", "comparison", "export"],
|
||||
"expected_features": [
|
||||
{
|
||||
"name": "Search approved abilities, capabilities, features, and evidence",
|
||||
"primary_class": "api",
|
||||
"source_paths": [
|
||||
"src/repo_registry/core/service.py",
|
||||
"src/repo_registry/semantic/embeddings.py",
|
||||
"tests/test_registry_service.py"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Compare repositories and identify capability gaps",
|
||||
"primary_class": "api",
|
||||
"source_paths": [
|
||||
"src/repo_registry/core/service.py",
|
||||
"src/repo_registry/web_api/app.py"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Export repository profiles",
|
||||
"primary_class": "api",
|
||||
"source_paths": [
|
||||
"src/repo_registry/web_api/app.py",
|
||||
"docs/api-contract.md"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Generate And Maintain SCOPE.md",
|
||||
"primary_class": "scope-generation",
|
||||
"attributes": ["scope-md", "diff", "validation"],
|
||||
"expected_features": [
|
||||
{
|
||||
"name": "Render SCOPE.md from approved characteristics",
|
||||
"primary_class": "backend",
|
||||
"source_paths": [
|
||||
"src/repo_registry/scope/generator.py",
|
||||
"tests/test_scope_generator.py",
|
||||
"docs/scope-md-spec.md"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Diff, validate, and write scope files",
|
||||
"primary_class": "api",
|
||||
"source_paths": [
|
||||
"src/repo_registry/scope/validator.py",
|
||||
"src/repo_registry/web_api/app.py",
|
||||
"src/repo_registry/web_ui/views.py"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Explore Dependency And Impact Graphs",
|
||||
"primary_class": "dependency-analysis",
|
||||
"attributes": ["graph", "impact", "visualization"],
|
||||
"expected_features": [
|
||||
{
|
||||
"name": "Model dependencies between facts, evidence, features, capabilities, abilities, and scope",
|
||||
"primary_class": "backend",
|
||||
"source_paths": [
|
||||
"src/repo_registry/core/service.py",
|
||||
"docs/dependency-aware-scope-propagation.md",
|
||||
"docs/dependency-visualization-exploration.md"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Render dependency graph views and profiles",
|
||||
"primary_class": "ui",
|
||||
"source_paths": [
|
||||
"src/repo_registry/web_ui/views.py",
|
||||
"tests/test_web_api.py"
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "Provide Scope Context To Downstream Agents",
|
||||
"primary_class": "coordination",
|
||||
"attributes": ["activity-core", "api-contract"],
|
||||
"expected_features": [
|
||||
{
|
||||
"name": "Return compact JSON scope context by repository slug",
|
||||
"primary_class": "api",
|
||||
"source_paths": [
|
||||
"src/repo_registry/web_api/app.py",
|
||||
"docs/schemas/repo-scope-context-response.json",
|
||||
"tests/test_scope_context_api.py"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
},
|
||||
"forbidden_native_capabilities": [
|
||||
{
|
||||
"name": "Route LLM Requests Across Providers",
|
||||
"reason": "repo-scoping may use llm-connect as optional extraction infrastructure, but provider routing is not a native repo-scoping product capability.",
|
||||
"allowed_only_if": "Future product intent and public implementation explicitly add provider routing as repo-scoping-owned behavior."
|
||||
}
|
||||
],
|
||||
"non_native_context": [
|
||||
{
|
||||
"name": "LLM provider names in scanner, normalization, schemas, tests, fixtures, docs, or workplans",
|
||||
"classification": "scanner-rule-or-fixture-context",
|
||||
"expected_handling": "May support scanner behavior facts or test coverage, but must not become native capability truth."
|
||||
},
|
||||
{
|
||||
"name": "llm-connect integration",
|
||||
"classification": "optional dependency / adapter consumer",
|
||||
"expected_handling": "May appear as optional extraction infrastructure, not as repo-scoping-owned provider routing."
|
||||
},
|
||||
{
|
||||
"name": "SCOPE.md content",
|
||||
"classification": "derived scope",
|
||||
"expected_handling": "Can be comparison or bootstrap context, not primary evidence for regenerating the same characteristic model."
|
||||
}
|
||||
],
|
||||
"comparison_rules": {
|
||||
"must_have_capability_names": [
|
||||
"Register And Track Repositories",
|
||||
"Scan Repositories Into Observed Facts",
|
||||
"Index Source Content With Provenance",
|
||||
"Generate Reviewable Candidate Characteristics",
|
||||
"Review And Approve Candidate Characteristics",
|
||||
"Search Compare And Export Approved Profiles",
|
||||
"Generate And Maintain SCOPE.md",
|
||||
"Explore Dependency And Impact Graphs",
|
||||
"Provide Scope Context To Downstream Agents"
|
||||
],
|
||||
"must_not_have_native_capability_names": [
|
||||
"Route LLM Requests Across Providers"
|
||||
],
|
||||
"known_regression_ids": [
|
||||
"RREG-SELF-REG-001",
|
||||
"RREG-SELF-REG-002",
|
||||
"RREG-SELF-REG-003"
|
||||
]
|
||||
}
|
||||
}
|
||||
112
tests/test_self_scoping_artifacts.py
Normal file
112
tests/test_self_scoping_artifacts.py
Normal file
@@ -0,0 +1,112 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Repository root: this test module lives one directory below it.
ROOT = Path(__file__).resolve().parents[1]

# JSON Schema that assessment artifacts are expected to follow.
SCHEMA_PATH = ROOT.joinpath("docs", "schemas", "self-scoping-assessment.schema.json")

# Captured known-bad self-analysis artifact (run 39).
KNOWN_BAD_PATH = ROOT.joinpath(
    "docs",
    "self-scoping",
    "assessments",
    "repo-scoping-known-bad-2026-05-15-run-39.json",
)

# Curated golden profile for repo-scoping itself.
GOLDEN_PROFILE_PATH = ROOT.joinpath(
    "docs",
    "self-scoping",
    "golden",
    "repo-scoping-golden-profile.v1.json",
)
|
||||
|
||||
|
||||
def load_json(path: Path) -> dict:
    """Parse the UTF-8 encoded JSON document stored at ``path``.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON value (the callers in this module index it as a dict).
    """
    with path.open(encoding="utf-8") as handle:
        return json.load(handle)
|
||||
|
||||
|
||||
def test_self_scoping_assessment_schema_requires_release_binding_metadata():
    """The assessment schema must require the release-binding sections and fields.

    Checks the top-level ``required`` list, the ``engineIdentity`` ``required``
    list, and the exact ``release_binding_status`` enum (order included).
    """
    schema = load_json(SCHEMA_PATH)
    engine_identity = schema["$defs"]["engineIdentity"]

    top_level_required = schema["required"]
    engine_identity_required = engine_identity["required"]

    expected_top_level = {
        "target_repository",
        "engine_identity",
        "execution",
        "assessment",
        "fact_summary",
        "generated_tree",
        "known_regression_patterns",
    }
    expected_engine_fields = {
        "repo_scoping_version",
        "engine_commit",
        "engine_release",
        "engine_dirty_state",
        "scanner_version",
        "candidate_generator_version",
        "quality_criteria_version",
        "prompt_version",
        "release_binding_status",
    }

    assert expected_top_level.issubset(top_level_required)
    assert expected_engine_fields.issubset(engine_identity_required)

    # Compared as a list on purpose: the enum order is part of the expectation.
    binding_status = engine_identity["properties"]["release_binding_status"]
    assert binding_status["enum"] == ["complete", "historical_incomplete", "unbound"]
|
||||
|
||||
|
||||
def test_known_bad_self_scoping_artifact_captures_rejected_regression_seed():
    """The run-39 artifact must stay recorded as a rejected negative regression seed.

    Verifies the identifying metadata, the ``historical_incomplete`` release
    binding, the presence of the false-positive capability in the generated
    tree, the three known regression ids, and the ``llm_provider`` fact count.
    """
    artifact = load_json(KNOWN_BAD_PATH)

    # Identity and outcome metadata.
    assert artifact["schema_version"] == "self-scoping-assessment/v1"
    assert artifact["artifact_id"] == "repo-scoping-known-bad-2026-05-15-run-39"
    assert artifact["target_repository"]["repo_slug"] == "repo-scoping"
    assert artifact["execution"]["analysis_run_id"] == 39
    assert artifact["assessment"]["role"] == "negative_regression_seed"
    assert artifact["assessment"]["outcome"] == "rejected"
    binding_status = artifact["engine_identity"]["release_binding_status"]
    assert binding_status == "historical_incomplete"

    # Flatten every capability name found under every ability in the tree.
    capability_names = set()
    for ability in artifact["generated_tree"]["abilities"]:
        for capability in ability["capabilities"]:
            capability_names.add(capability["name"])

    regression_ids = set()
    for pattern in artifact["known_regression_patterns"]:
        regression_ids.add(pattern["id"])

    assert "Route LLM Requests Across Providers" in capability_names
    assert regression_ids.issuperset(
        {"RREG-SELF-REG-001", "RREG-SELF-REG-002", "RREG-SELF-REG-003"}
    )
    assert artifact["fact_summary"]["counts_by_kind"]["llm_provider"] == 41
|
||||
|
||||
|
||||
def test_golden_profile_names_expected_native_capabilities_and_forbidden_false_positive():
    """The golden profile must list the native capabilities and forbid the false positive.

    Verifies the profile identity, that all nine expected capability names are
    present, and that ``Route LLM Requests Across Providers`` is both a
    forbidden native capability and the sole ``must_not_have`` comparison rule.
    """
    profile = load_json(GOLDEN_PROFILE_PATH)

    expected_capability_names = set()
    for capability in profile["ability"]["expected_capabilities"]:
        expected_capability_names.add(capability["name"])

    forbidden_names = set()
    for capability in profile["forbidden_native_capabilities"]:
        forbidden_names.add(capability["name"])

    required_native_names = {
        "Register And Track Repositories",
        "Scan Repositories Into Observed Facts",
        "Index Source Content With Provenance",
        "Generate Reviewable Candidate Characteristics",
        "Review And Approve Candidate Characteristics",
        "Search Compare And Export Approved Profiles",
        "Generate And Maintain SCOPE.md",
        "Explore Dependency And Impact Graphs",
        "Provide Scope Context To Downstream Agents",
    }

    assert profile["schema_version"] == "self-scoping-golden-profile/v1"
    assert profile["repo_slug"] == "repo-scoping"
    assert required_native_names.issubset(expected_capability_names)
    assert "Route LLM Requests Across Providers" in forbidden_names

    # Exact list comparison: no other forbidden name may be present in the rule.
    comparison_rules = profile["comparison_rules"]
    assert comparison_rules["must_not_have_native_capability_names"] == [
        "Route LLM Requests Across Providers"
    ]
|
||||
|
||||
258
workplans/RREG-WP-0013-self-scoping-baseline-evaluation.md
Normal file
258
workplans/RREG-WP-0013-self-scoping-baseline-evaluation.md
Normal file
@@ -0,0 +1,258 @@
|
||||
---
|
||||
id: RREG-WP-0013
|
||||
type: workplan
|
||||
title: "Self-Scoping Baseline Evaluation"
|
||||
domain: capabilities
|
||||
repo: repo-scoping
|
||||
status: active
|
||||
owner: codex
|
||||
topic_slug: foerster-capabilities
|
||||
created: "2026-05-15"
|
||||
updated: "2026-05-15"
|
||||
state_hub_workstream_id: "1c740db0-1999-478b-b3e3-c0fdfec1e9dd"
|
||||
---
|
||||
|
||||
# Self-Scoping Baseline Evaluation
|
||||
|
||||
repo-scoping should become self-improving infrastructure: every meaningful
|
||||
change to the scoping engine should be testable against a known baseline for
|
||||
repo-scoping itself. The goal is not just to assert that output changed, but to
|
||||
make it easy for a human or trusted agent to decide whether an old or new
|
||||
result is better and preserve that assessment as signal for future engine
|
||||
iterations.
|
||||
|
||||
The motivating failure is the 2026-05-15 self-analysis where deterministic
|
||||
provider-vocabulary facts were promoted into an approved `Route LLM Requests
|
||||
Across Providers` capability and the repo's native API/CLI features were
|
||||
attached under that incorrect capability. Future reruns should make regressions
|
||||
like that obvious, reviewable, and attributable to the exact repo-scoping
|
||||
release that generated them.
|
||||
|
||||
## T01: Define Self-Scoping Assessment Model
|
||||
|
||||
```task
|
||||
id: RREG-WP-0013-T01
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "af633b76-3356-4480-8108-d996eeda5a31"
|
||||
```
|
||||
|
||||
Define the data model for immutable self-scoping assessment runs.
|
||||
|
||||
Each assessment must bind together:
|
||||
|
||||
- The target repository identity: repo slug, source URL/path, target commit,
|
||||
target branch, and dirty-state marker when applicable.
|
||||
- The engine identity: repo-scoping package version, git commit, git tag or
|
||||
release name when available, dirty-state marker, scanner version, candidate
|
||||
generator version, quality-gate/ruleset version, schema version, and prompt
|
||||
version/hash when LLM or agentic review is used.
|
||||
- The execution mode: deterministic-only, LLM-assisted, agent-reviewed,
|
||||
trusted-auto-review, manual-review, or mixed.
|
||||
- The generated artifacts: observed fact summary, candidate graph, approved map
|
||||
or proposed approval set, rejected/downgraded items, source refs, and review
|
||||
notes.
|
||||
- The assessment outcome: baseline, challenger, preferred, tied, rejected,
|
||||
superseded, or needs-human.
|
||||
|
||||
Acceptance criteria:
|
||||
- A documented schema exists for self-scoping assessment runs.
|
||||
- Assessment runs are append-only; reruns create new records instead of
|
||||
rewriting old judgements.
|
||||
- Engine release binding is required before an assessment can be compared.
|
||||
- Dirty working trees are visible in the assessment metadata.
|
||||
|
||||
## T02: Capture Current Bad Self-Run As A Regression Seed
|
||||
|
||||
```task
|
||||
id: RREG-WP-0013-T02
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "98258aea-65bb-4709-921f-711c6cc6ee48"
|
||||
```
|
||||
|
||||
Import or recreate the known-bad repo-scoping self-analysis as a named
|
||||
regression seed.
|
||||
|
||||
Known bad pattern:
|
||||
|
||||
- Candidate/approved capability: `Route LLM Requests Across Providers`.
|
||||
- Incorrect feature attachment: repo-scoping API/CLI surfaces nested under that
|
||||
LLM-provider capability.
|
||||
- Incorrect evidence: scanner vocabulary, schema examples, tests, and
|
||||
provider-name normalization code treated as repo-owned LLM routing behavior.
|
||||
|
||||
Acceptance criteria:
|
||||
- The bad run can be inspected as a historical assessment artifact.
|
||||
- It is clearly marked as a negative baseline, not a desired golden output.
|
||||
- The failure explanation is stored next to the captured graph.
|
||||
- Future comparison reports can flag when a challenger repeats the same pattern.
|
||||
|
||||
## T03: Create Desired Repo-Scoping Golden Profile
|
||||
|
||||
```task
|
||||
id: RREG-WP-0013-T03
|
||||
status: done
|
||||
priority: high
|
||||
state_hub_task_id: "f3ef1711-a115-4368-a97e-98abd1eda521"
|
||||
```
|
||||
|
||||
Author a curated golden profile for repo-scoping itself. This should be compact
|
||||
enough for comparison but expressive enough to catch hierarchy errors.
|
||||
|
||||
Expected native capabilities should cover at least:
|
||||
|
||||
- Repository registration and metadata import.
|
||||
- Deterministic repository scanning into observed facts.
|
||||
- Source-role and provenance-aware content indexing.
|
||||
- Candidate characteristic generation from facts and content.
|
||||
- Candidate review, edit, reject, merge, relink, and approval workflow.
|
||||
- Approved characteristic search, comparison, export, and capability-gap
|
||||
exploration.
|
||||
- SCOPE.md generation, diffing, validation, and write/update flows.
|
||||
- Dependency graph and characteristic impact exploration.
|
||||
- Scope context API support for downstream agents such as activity-core.
|
||||
|
||||
Forbidden top-level/native capabilities should include:
|
||||
|
||||
- `Route LLM Requests Across Providers`, unless repo-scoping later genuinely
|
||||
implements provider routing as a product feature rather than using
|
||||
`llm-connect` as optional extraction infrastructure.
|
||||
|
||||
Acceptance criteria:
|
||||
- The golden profile includes ability, capability, feature, and evidence
|
||||
expectations with source paths.
|
||||
- The profile distinguishes native utility from dependencies, fixtures, test
|
||||
vocabulary, schema examples, and optional LLM extraction infrastructure.
|
||||
- The profile is stored in a stable, reviewable fixture location.
|
||||
- The profile can evolve through explicit assessment decisions.
|
||||
|
||||
Implementation note 2026-05-15: added
|
||||
`docs/schemas/self-scoping-assessment.schema.json`,
|
||||
`docs/self-scoping/assessments/repo-scoping-known-bad-2026-05-15-run-39.json`,
|
||||
`docs/self-scoping/golden/repo-scoping-golden-profile.v1.json`, and
|
||||
`tests/test_self_scoping_artifacts.py`. The known-bad artifact is marked as a
|
||||
negative regression seed with `historical_incomplete` release binding because
|
||||
the original analysis run did not record the engine commit.
|
||||
|
||||
## T04: Export Assessment Artifacts From Analysis Runs
|
||||
|
||||
```task
|
||||
id: RREG-WP-0013-T04
|
||||
status: todo
|
||||
priority: high
|
||||
state_hub_task_id: "51e01d45-7574-4c97-994d-dabb2bcf9a00"
|
||||
```
|
||||
|
||||
Add a CLI and/or API workflow that exports a completed analysis run as a
|
||||
self-scoping assessment artifact.
|
||||
|
||||
Acceptance criteria:
|
||||
- Export includes repository metadata, analysis run metadata, engine identity,
|
||||
candidate graph, observed fact summary, content chunk summary, approved map
|
||||
if present, review decisions, and quality-gate outcomes when available.
|
||||
- Export format is deterministic JSON with a documented schema.
|
||||
- Export refuses to mark an artifact comparable when engine identity is
|
||||
incomplete.
|
||||
- Export can target repo-scoping itself without requiring network access.
|
||||
|
||||
## T05: Compare Baseline And Challenger Runs
|
||||
|
||||
```task
|
||||
id: RREG-WP-0013-T05
|
||||
status: todo
|
||||
priority: high
|
||||
state_hub_task_id: "2b71069b-6150-45f4-84a2-59f5ec1e04c0"
|
||||
```
|
||||
|
||||
Implement comparison between an existing baseline and a later challenger run.
|
||||
|
||||
Comparison should report:
|
||||
|
||||
- Added, removed, renamed, and moved abilities/capabilities/features.
|
||||
- Hierarchy quality changes, especially misplaced features under the wrong
|
||||
capability.
|
||||
- Native-utility precision: whether generated capabilities are repo-owned,
|
||||
facade/adapter, dependency, tooling, fixture, schema-example, or mention-only.
|
||||
- Coverage against the repo-scoping golden profile.
|
||||
- Regression flags for known-bad patterns.
|
||||
- Source-ref quality: whether claims cite product intent, docs, source, tests,
|
||||
fixtures, examples, or generated/derived scope.
|
||||
|
||||
Acceptance criteria:
|
||||
- Comparison output is useful in both machine-readable JSON and human-readable
|
||||
Markdown.
|
||||
- The report makes it easy to choose "old better", "new better", "tie", or
|
||||
"needs review".
|
||||
- It does not require candidates to have stable database IDs across runs.
|
||||
- It can compare deterministic-only and agent-reviewed runs without losing
|
||||
provenance.
|
||||
|
||||
## T06: Add Side-By-Side Review UI
|
||||
|
||||
```task
|
||||
id: RREG-WP-0013-T06
|
||||
status: todo
|
||||
priority: medium
|
||||
state_hub_task_id: "16a60b7c-7e2c-4bb0-b4ab-2381289dba0b"
|
||||
```
|
||||
|
||||
Expose baseline/challenger comparison in the curator UI.
|
||||
|
||||
Acceptance criteria:
|
||||
- Reviewers can select two assessment artifacts for repo-scoping.
|
||||
- The UI shows the two hierarchy trees side by side with moved/misplaced items
|
||||
highlighted.
|
||||
- Reviewers can record preference, tie, rejection, and notes.
|
||||
- Review decisions are persisted as assessment outcomes, not as changes to the
|
||||
underlying historical artifacts.
|
||||
|
||||
## T07: Add Self-Scoping Regression Command
|
||||
|
||||
```task
|
||||
id: RREG-WP-0013-T07
|
||||
status: todo
|
||||
priority: medium
|
||||
state_hub_task_id: "af1fcecd-686d-4592-b739-4698abc98c55"
|
||||
```
|
||||
|
||||
Add a repeatable command for running repo-scoping against itself and comparing
|
||||
the result to the active baseline.
|
||||
|
||||
Acceptance criteria:
|
||||
- The command captures engine identity before running analysis.
|
||||
- The command can run deterministic-only without LLM or agentic review.
|
||||
- The command can optionally invoke agentic review when configured.
|
||||
- The command emits a comparison report and exits non-zero only for explicit
|
||||
CI-blocking regressions, not for ordinary "needs review" assessment outcomes.
|
||||
|
||||
## T08: Document Assessment Workflow
|
||||
|
||||
```task
|
||||
id: RREG-WP-0013-T08
|
||||
status: todo
|
||||
priority: medium
|
||||
state_hub_task_id: "30d71946-3598-4dc7-9970-c7c18126cad7"
|
||||
```
|
||||
|
||||
Document how maintainers should use self-scoping assessment artifacts while
|
||||
evolving the engine.
|
||||
|
||||
Acceptance criteria:
|
||||
- Documentation explains baseline, challenger, preferred, tied, rejected, and
|
||||
superseded outcomes.
|
||||
- Documentation explains engine release binding and why unbound output is not
|
||||
comparable.
|
||||
- Documentation gives examples for the known-bad LLM-provider regression and a
|
||||
desired native repo-scoping profile.
|
||||
- Documentation describes when to update the golden profile versus when to fix
|
||||
the engine.
|
||||
|
||||
## Completion Criteria
|
||||
|
||||
- repo-scoping has an immutable, release-bound self-scoping assessment format.
|
||||
- The current known-bad output is captured as a negative regression seed.
|
||||
- A curated desired repo-scoping profile exists.
|
||||
- Maintainers can rerun repo-scoping on itself, compare old/new results, and
|
||||
record which output is better.
|
||||
- Comparison results are bound to the repo-scoping release that generated them.
|
||||
225
workplans/RREG-WP-0014-agentic-characteristic-acceptance.md
Normal file
225
workplans/RREG-WP-0014-agentic-characteristic-acceptance.md
Normal file
@@ -0,0 +1,225 @@
|
||||
---
|
||||
id: RREG-WP-0014
|
||||
type: workplan
|
||||
title: "Agentic Characteristic Acceptance"
|
||||
domain: capabilities
|
||||
repo: repo-scoping
|
||||
status: active
|
||||
owner: codex
|
||||
topic_slug: foerster-capabilities
|
||||
created: "2026-05-15"
|
||||
updated: "2026-05-15"
|
||||
state_hub_workstream_id: "7feaa5b5-32d8-4b8e-b377-cbb3ddacf64a"
|
||||
---
|
||||
|
||||
# Agentic Characteristic Acceptance
|
||||
|
||||
Deterministic rules should not automatically accept candidate
|
||||
characteristics. Determinism is strongest at fast, source-linked observation and
|
||||
at applying transparent rejection or downgrade criteria: facts, provenance,
|
||||
formal quality checks, schema validation, duplicate detection, and clear
|
||||
negative filters.
|
||||
|
||||
Acceptance is a judgement step. When automation stands in for human judgement,
|
||||
it should be agentic: inspect the evidence, apply the visible quality criteria,
|
||||
explain the decision, and leave a reviewable trace. Deterministic rules may
|
||||
invalidate, downgrade, or require review, but they should not silently promote a
|
||||
candidate into approved registry truth.
|
||||
|
||||
## T01: Define Acceptance Policy Boundary
|
||||
|
||||
```task
|
||||
id: RREG-WP-0014-T01
|
||||
status: todo
|
||||
priority: high
|
||||
state_hub_task_id: "4bc2e749-ec9e-45d4-8095-63181efb752b"
|
||||
```
|
||||
|
||||
Write the policy boundary between deterministic gates and acceptance
|
||||
judgement.
|
||||
|
||||
Policy principles:
|
||||
|
||||
- Deterministic scanners generate observed facts and source refs.
|
||||
- Deterministic quality gates can reject, downgrade, merge, flag, or require
|
||||
review when criteria are formally expressible.
|
||||
- Deterministic quality gates cannot approve candidate characteristics.
|
||||
- Human reviewers can approve.
|
||||
- Trusted agentic reviewers can approve only after producing an evidence-based
|
||||
rationale.
|
||||
- All automated review outcomes must be inspectable and reversible.
|
||||
|
||||
Acceptance criteria:
|
||||
- Documentation states that deterministic auto-approval is prohibited.
|
||||
- Existing "trusted auto-approve" terminology is marked for replacement or
|
||||
migration.
|
||||
- The allowed deterministic outcomes are explicitly listed.
|
||||
- The allowed agentic outcomes are explicitly listed.
|
||||
|
||||
## T02: Create Transparent Quality Criteria Registry
|
||||
|
||||
```task
|
||||
id: RREG-WP-0014-T02
|
||||
status: todo
|
||||
priority: high
|
||||
state_hub_task_id: "101998a4-8cf8-4df0-8d05-c4e2041c0cac"
|
||||
```
|
||||
|
||||
Create a reviewable quality criteria registry for candidate characteristics.
|
||||
|
||||
Initial criteria should cover:
|
||||
|
||||
- Source-role quality: intent/docs/source/tests are stronger than fixtures,
|
||||
schema examples, agent guidance, CI/tooling, dependency declarations, or
|
||||
derived scope.
|
||||
- Native utility: owned/facade/adapter claims require explicit product evidence;
|
||||
dependency, tooling, configuration, fixture, schema-example, and mention-only
|
||||
claims are not native capabilities.
|
||||
- Hierarchy fit: features should support their parent capability; misplaced
|
||||
API/CLI surfaces should be flagged.
|
||||
- Evidence sufficiency: candidate claims need source refs that support the
|
||||
actual abstraction, not just matching vocabulary.
|
||||
- Circularity: generated `SCOPE.md` text cannot be primary proof for rebuilding
|
||||
the same characteristic model.
|
||||
- Fixture contamination: tests and expectation files can prove scanner behavior
|
||||
but should not become repo-native product capability claims.
|
||||
|
||||
Acceptance criteria:
|
||||
- Criteria are stored in a versioned, human-readable format.
|
||||
- Each criterion has an identifier, description, severity, deterministic action
|
||||
if applicable, and reviewer guidance.
|
||||
- Criteria can be listed through CLI and/or API.
|
||||
- Assessment and review records include the criteria version used.
|
||||
|
||||
## T03: Implement Deterministic Quality Gate Outcomes
|
||||
|
||||
```task
|
||||
id: RREG-WP-0014-T03
|
||||
status: todo
|
||||
priority: high
|
||||
state_hub_task_id: "d599c084-a207-4910-9d0b-578d0c50f282"
|
||||
```
|
||||
|
||||
Apply quality criteria before any human or agentic acceptance step.
|
||||
|
||||
Acceptance criteria:
|
||||
- Candidate abilities, capabilities, features, and evidence can carry gate
|
||||
outcomes such as `pass`, `downgraded`, `rejected`, `requires_review`, and
|
||||
`invalidated`.
|
||||
- Rejected or invalidated candidates remain auditable with reason codes.
|
||||
- Downgraded candidates remain visible but cannot be accepted without explicit
|
||||
reviewer override.
|
||||
- Deterministic gates never mark a candidate as approved.
|
||||
- The known repo-scoping LLM-provider self-scan failure is flagged before
|
||||
acceptance.
|
||||
|
||||
## T04: Replace Trusted Auto-Approval With Agentic Review
|
||||
|
||||
```task
|
||||
id: RREG-WP-0014-T04
|
||||
status: todo
|
||||
priority: high
|
||||
state_hub_task_id: "b0d29756-7460-4ffa-8d56-d94cfb34e94f"
|
||||
```
|
||||
|
||||
Replace `trusted_auto_approve_candidate_graph` behavior with an agentic review
|
||||
workflow.
|
||||
|
||||
Acceptance criteria:
|
||||
- Existing API/CLI/UI affordances no longer present deterministic
|
||||
auto-approval as a safe path.
|
||||
- A configured agentic reviewer receives the candidate graph, source refs,
|
||||
quality-gate outcomes, criteria version, and repository context.
|
||||
- The reviewer can approve, reject, downgrade, request human review, relink,
|
||||
or propose edits.
|
||||
- Each agentic approval includes a rationale tied to evidence and criteria.
|
||||
- If no agentic reviewer is configured, candidates remain pending review.
|
||||
|
||||
## T05: Add Review Decision Audit Trail
|
||||
|
||||
```task
|
||||
id: RREG-WP-0014-T05
|
||||
status: todo
|
||||
priority: high
|
||||
state_hub_task_id: "0d12559a-831e-40ff-bf82-85f45b763f07"
|
||||
```
|
||||
|
||||
Extend review decisions so acceptance history is useful for later audits and
|
||||
self-scoping assessments.
|
||||
|
||||
Acceptance criteria:
|
||||
- Review decisions record reviewer type: human, agent, deterministic-gate, or
|
||||
migration.
|
||||
- Agentic decisions record reviewer identity/configuration, criteria version,
|
||||
prompt or policy version, evidence inspected, and rationale.
|
||||
- Deterministic gate decisions record rule IDs and outcomes, not approval.
|
||||
- Review records distinguish "candidate accepted as-is" from "accepted after
|
||||
edits/relinks".
|
||||
- Existing decisions remain readable through a migration or compatibility view.
|
||||
|
||||
## T06: Add Human Override And Criteria Refinement Flow
|
||||
|
||||
```task
|
||||
id: RREG-WP-0014-T06
|
||||
status: todo
|
||||
priority: medium
|
||||
state_hub_task_id: "bcba3237-fb87-4a38-8e96-12b872d5e6a9"
|
||||
```
|
||||
|
||||
Make quality criteria reviewable and refinable instead of hidden in code.
|
||||
|
||||
Acceptance criteria:
|
||||
- Reviewers can inspect which criteria fired for a candidate.
|
||||
- Reviewers can override a gate with a reason.
|
||||
- Overrides are searchable so repeated overrides can drive criteria changes.
|
||||
- Criteria changes are versioned and linked to workplans or decisions.
|
||||
- The UI makes it clear when a candidate is blocked by formal criteria versus
|
||||
merely awaiting judgement.
|
||||
|
||||
## T07: Regression Coverage For Acceptance Boundary
|
||||
|
||||
```task
|
||||
id: RREG-WP-0014-T07
|
||||
status: todo
|
||||
priority: high
|
||||
state_hub_task_id: "37a22c89-ded5-42dd-aaa9-ece79477fcff"
|
||||
```
|
||||
|
||||
Add tests that lock in the new acceptance boundary.
|
||||
|
||||
Acceptance criteria:
|
||||
- Deterministic analysis can generate facts and candidates but cannot approve
|
||||
them.
|
||||
- Deterministic gates can reject/downgrade/require review with reason codes.
|
||||
- Agentic review can approve only with a rationale and criteria version.
|
||||
- The repo-scoping self-scan LLM-provider failure is not accepted by
|
||||
deterministic rules.
|
||||
- Existing manual review and approval paths keep working.
|
||||
|
||||
## T08: Migration And Compatibility Plan
|
||||
|
||||
```task
|
||||
id: RREG-WP-0014-T08
|
||||
status: todo
|
||||
priority: medium
|
||||
state_hub_task_id: "3d5475f6-71a7-4ca7-aa69-573e91d1fe1e"
|
||||
```
|
||||
|
||||
Plan the migration away from trusted deterministic auto-approval.
|
||||
|
||||
Acceptance criteria:
|
||||
- Existing approved maps created by trusted auto-approval can be identified.
|
||||
- Users can rebuild or re-review those maps without losing audit history.
|
||||
- API and CLI changes are documented with compatibility notes.
|
||||
- The old behavior is either removed or guarded behind an explicit deprecated
|
||||
migration mode that cannot run by default.
|
||||
|
||||
## Completion Criteria
|
||||
|
||||
- Deterministic rules no longer approve candidate characteristics.
|
||||
- Transparent, versioned quality criteria can reject, downgrade, invalidate, or
|
||||
require review.
|
||||
- Agentic review is the only automated path that can stand in for human
|
||||
acceptance.
|
||||
- Acceptance decisions are auditable, evidence-bound, and useful as training
|
||||
signal for future self-scoping assessment.
|
||||
Reference in New Issue
Block a user