From 90bae27237f104e9813ec6cd24f6d966ed31c50e Mon Sep 17 00:00:00 2001 From: tegwick Date: Fri, 15 May 2026 12:26:36 +0200 Subject: [PATCH] Add self-scoping baseline workplans and artifacts --- .../self-scoping-assessment.schema.json | 442 ++++++++++++++++++ docs/self-scoping/README.md | 35 ++ ...o-scoping-known-bad-2026-05-15-run-39.json | 209 +++++++++ .../repo-scoping-golden-profile.v1.json | 311 ++++++++++++ tests/test_self_scoping_artifacts.py | 112 +++++ ...P-0013-self-scoping-baseline-evaluation.md | 258 ++++++++++ ...-0014-agentic-characteristic-acceptance.md | 225 +++++++++ 7 files changed, 1592 insertions(+) create mode 100644 docs/schemas/self-scoping-assessment.schema.json create mode 100644 docs/self-scoping/README.md create mode 100644 docs/self-scoping/assessments/repo-scoping-known-bad-2026-05-15-run-39.json create mode 100644 docs/self-scoping/golden/repo-scoping-golden-profile.v1.json create mode 100644 tests/test_self_scoping_artifacts.py create mode 100644 workplans/RREG-WP-0013-self-scoping-baseline-evaluation.md create mode 100644 workplans/RREG-WP-0014-agentic-characteristic-acceptance.md diff --git a/docs/schemas/self-scoping-assessment.schema.json b/docs/schemas/self-scoping-assessment.schema.json new file mode 100644 index 0000000..aedf8bc --- /dev/null +++ b/docs/schemas/self-scoping-assessment.schema.json @@ -0,0 +1,442 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://repo-scoping.local/schemas/self-scoping-assessment.schema.json", + "title": "Self-Scoping Assessment Artifact", + "description": "Immutable artifact used to compare repo-scoping self-analysis results across engine releases.", + "type": "object", + "additionalProperties": false, + "required": [ + "schema_version", + "artifact_id", + "artifact_type", + "created_at", + "target_repository", + "engine_identity", + "execution", + "assessment", + "fact_summary", + "generated_tree", + "known_regression_patterns" + ], + "properties": { + 
"schema_version": { + "const": "self-scoping-assessment/v1" + }, + "artifact_id": { + "type": "string", + "description": "Stable artifact identifier." + }, + "artifact_type": { + "enum": ["assessment_run"] + }, + "created_at": { + "type": "string", + "format": "date-time" + }, + "target_repository": { + "$ref": "#/$defs/targetRepository" + }, + "engine_identity": { + "$ref": "#/$defs/engineIdentity" + }, + "execution": { + "$ref": "#/$defs/execution" + }, + "assessment": { + "$ref": "#/$defs/assessment" + }, + "fact_summary": { + "$ref": "#/$defs/factSummary" + }, + "generated_tree": { + "$ref": "#/$defs/generatedTree" + }, + "known_regression_patterns": { + "type": "array", + "items": { + "$ref": "#/$defs/regressionPattern" + } + }, + "notes": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "$defs": { + "targetRepository": { + "type": "object", + "additionalProperties": false, + "required": [ + "repo_slug", + "repository_id", + "source", + "target_commit", + "target_branch", + "dirty_state", + "file_count" + ], + "properties": { + "repo_slug": { + "type": "string" + }, + "repository_id": { + "type": ["integer", "null"] + }, + "source": { + "type": "string" + }, + "target_commit": { + "type": "string" + }, + "target_branch": { + "type": "string" + }, + "dirty_state": { + "enum": ["clean", "dirty", "unknown"] + }, + "file_count": { + "type": ["integer", "null"], + "minimum": 0 + } + } + }, + "engineIdentity": { + "type": "object", + "additionalProperties": false, + "required": [ + "repo_scoping_version", + "engine_commit", + "engine_release", + "engine_dirty_state", + "scanner_version", + "candidate_generator_version", + "quality_criteria_version", + "prompt_version", + "release_binding_status" + ], + "properties": { + "repo_scoping_version": { + "type": "string" + }, + "engine_commit": { + "type": ["string", "null"] + }, + "engine_release": { + "type": ["string", "null"] + }, + "engine_dirty_state": { + "enum": ["clean", "dirty", "unknown"] + }, 
+ "scanner_version": { + "type": "string" + }, + "candidate_generator_version": { + "type": "string" + }, + "quality_criteria_version": { + "type": "string" + }, + "prompt_version": { + "type": ["string", "null"] + }, + "release_binding_status": { + "enum": ["complete", "historical_incomplete", "unbound"] + }, + "release_binding_note": { + "type": "string" + } + } + }, + "execution": { + "type": "object", + "additionalProperties": false, + "required": [ + "mode", + "analysis_run_id", + "candidate_source", + "acceptance_mode" + ], + "properties": { + "mode": { + "enum": [ + "deterministic-only", + "llm-assisted", + "agent-reviewed", + "manual-review", + "trusted-auto-review", + "mixed" + ] + }, + "analysis_run_id": { + "type": ["integer", "null"] + }, + "candidate_source": { + "type": "string" + }, + "acceptance_mode": { + "type": "string" + }, + "started_at": { + "type": ["string", "null"], + "format": "date-time" + }, + "completed_at": { + "type": ["string", "null"], + "format": "date-time" + } + } + }, + "assessment": { + "type": "object", + "additionalProperties": false, + "required": [ + "role", + "outcome", + "summary", + "reviewer", + "comparison_eligibility" + ], + "properties": { + "role": { + "enum": ["baseline", "challenger", "negative_regression_seed"] + }, + "outcome": { + "enum": ["baseline", "challenger", "preferred", "tied", "rejected", "superseded", "needs-human"] + }, + "summary": { + "type": "string" + }, + "reviewer": { + "type": "string" + }, + "comparison_eligibility": { + "enum": ["eligible", "eligible_as_negative_seed", "not_comparable"] + }, + "rationale": { + "type": "array", + "items": { + "type": "string" + } + } + } + }, + "factSummary": { + "type": "object", + "additionalProperties": false, + "required": ["counts_by_kind"], + "properties": { + "counts_by_kind": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "contamination_sources": { + "type": "array", + "items": { + "$ref": 
"#/$defs/contaminationSource" + } + } + } + }, + "contaminationSource": { + "type": "object", + "additionalProperties": false, + "required": ["path", "reason"], + "properties": { + "path": { + "type": "string" + }, + "reason": { + "type": "string" + } + } + }, + "generatedTree": { + "type": "object", + "additionalProperties": false, + "required": ["abilities"], + "properties": { + "abilities": { + "type": "array", + "items": { + "$ref": "#/$defs/ability" + } + } + } + }, + "ability": { + "type": "object", + "additionalProperties": false, + "required": ["name", "status", "primary_class", "capabilities"], + "properties": { + "name": { + "type": "string" + }, + "status": { + "type": "string" + }, + "primary_class": { + "type": "string" + }, + "capabilities": { + "type": "array", + "items": { + "$ref": "#/$defs/capability" + } + } + } + }, + "capability": { + "type": "object", + "additionalProperties": false, + "required": ["name", "status", "primary_class", "features"], + "properties": { + "name": { + "type": "string" + }, + "status": { + "type": "string" + }, + "primary_class": { + "type": "string" + }, + "features": { + "type": "array", + "items": { + "$ref": "#/$defs/feature" + } + } + } + }, + "feature": { + "type": "object", + "additionalProperties": false, + "required": ["name", "type", "status", "primary_class", "location"], + "properties": { + "name": { + "type": "string" + }, + "type": { + "type": "string" + }, + "status": { + "type": "string" + }, + "primary_class": { + "type": "string" + }, + "location": { + "type": "string" + } + } + }, + "regressionPattern": { + "type": "object", + "additionalProperties": false, + "required": ["id", "title", "severity", "description", "detection_hint"], + "properties": { + "id": { + "type": "string" + }, + "title": { + "type": "string" + }, + "severity": { + "enum": ["low", "medium", "high", "critical"] + }, + "description": { + "type": "string" + }, + "detection_hint": { + "type": "string" + } + } + } + }, + "examples": 
[ + { + "schema_version": "self-scoping-assessment/v1", + "artifact_id": "repo-scoping-known-bad-2026-05-15-run-39", + "artifact_type": "assessment_run", + "created_at": "2026-05-15T09:28:48Z", + "target_repository": { + "repo_slug": "repo-scoping", + "repository_id": 16, + "source": "/home/worsch/repo-scoping/var/checkouts/repo-scoping-8a9c4168485c", + "target_commit": "00b57d509124789059639fedc724d9314edbb7b2", + "target_branch": "main", + "dirty_state": "unknown", + "file_count": 96 + }, + "engine_identity": { + "repo_scoping_version": "0.1.0", + "engine_commit": null, + "engine_release": null, + "engine_dirty_state": "unknown", + "scanner_version": "deterministic-v0.1", + "candidate_generator_version": "unversioned", + "quality_criteria_version": "none", + "prompt_version": null, + "release_binding_status": "historical_incomplete", + "release_binding_note": "Historical database run did not record engine commit." + }, + "execution": { + "mode": "trusted-auto-review", + "analysis_run_id": 39, + "candidate_source": "deterministic", + "acceptance_mode": "trusted_auto_approve_candidate_graph", + "started_at": "2026-05-15T09:28:47Z", + "completed_at": "2026-05-15T09:28:48Z" + }, + "assessment": { + "role": "negative_regression_seed", + "outcome": "rejected", + "summary": "Provider vocabulary was promoted into a false native LLM routing capability.", + "reviewer": "codex", + "comparison_eligibility": "eligible_as_negative_seed", + "rationale": ["The generated tree misclassified scanner vocabulary as product behavior."] + }, + "fact_summary": { + "counts_by_kind": { + "llm_provider": 41 + }, + "contamination_sources": [ + { + "path": "src/repo_registry/repo_scanning/scanner.py", + "reason": "Scanner rule vocabulary was treated as repo-owned capability evidence." 
+ } + ] + }, + "generated_tree": { + "abilities": [ + { + "name": "Support Repo Registry", + "status": "approved", + "primary_class": "repository-intelligence", + "capabilities": [ + { + "name": "Route LLM Requests Across Providers", + "status": "approved", + "primary_class": "llm-integration", + "features": [] + } + ] + } + ] + }, + "known_regression_patterns": [ + { + "id": "RREG-SELF-REG-001", + "title": "LLM provider vocabulary promoted as native capability", + "severity": "critical", + "description": "Scanner or fixture vocabulary becomes a repo-scoping product capability.", + "detection_hint": "Flag Route LLM Requests Across Providers when parented as a native repo-scoping capability." + } + ] + } + ] +} diff --git a/docs/self-scoping/README.md b/docs/self-scoping/README.md new file mode 100644 index 0000000..9b688c4 --- /dev/null +++ b/docs/self-scoping/README.md @@ -0,0 +1,35 @@ +# Self-Scoping Assessment Artifacts + +This directory contains repo-scoping's own baseline and assessment artifacts. +These files are meant to make scoping-engine changes comparable across releases +instead of relying on memory or screenshots. + +## Artifact Types + +- `golden/repo-scoping-golden-profile.v1.json` is the curated target profile for + repo-scoping itself. +- `assessments/repo-scoping-known-bad-2026-05-15-run-39.json` captures the + known-bad self-analysis that promoted LLM-provider vocabulary into native + repo-scoping capability truth. +- `../schemas/self-scoping-assessment.schema.json` defines the immutable + assessment-run artifact shape. + +## Release Binding + +Comparable assessment artifacts must bind generated results to the repo-scoping +engine release that produced them. A complete binding records package version, +engine git commit or release tag, dirty state, scanner version, candidate +generator version, quality criteria version, and prompt version when applicable. 
+ +The current known-bad artifact is marked `historical_incomplete` because the +original database run did not record the engine commit. It remains useful as a +negative regression seed, but future challenger artifacts should be fully bound +before they are accepted as comparable baselines. + +## Review Use + +When the engine changes, run repo-scoping against itself and export a challenger +assessment. Compare the challenger to the golden profile and to the negative +seed. Reviewers should be able to choose whether the old result, new result, or +neither is better, then store that judgement as a new assessment outcome. + diff --git a/docs/self-scoping/assessments/repo-scoping-known-bad-2026-05-15-run-39.json b/docs/self-scoping/assessments/repo-scoping-known-bad-2026-05-15-run-39.json new file mode 100644 index 0000000..f800ecd --- /dev/null +++ b/docs/self-scoping/assessments/repo-scoping-known-bad-2026-05-15-run-39.json @@ -0,0 +1,209 @@ +{ + "schema_version": "self-scoping-assessment/v1", + "artifact_id": "repo-scoping-known-bad-2026-05-15-run-39", + "artifact_type": "assessment_run", + "created_at": "2026-05-15T09:28:48Z", + "target_repository": { + "repo_slug": "repo-scoping", + "repository_id": 16, + "source": "/home/worsch/repo-scoping/var/checkouts/repo-scoping-8a9c4168485c", + "target_commit": "00b57d509124789059639fedc724d9314edbb7b2", + "target_branch": "main", + "dirty_state": "unknown", + "file_count": 96 + }, + "engine_identity": { + "repo_scoping_version": "0.1.0", + "engine_commit": null, + "engine_release": null, + "engine_dirty_state": "unknown", + "scanner_version": "deterministic-v0.1", + "candidate_generator_version": "unversioned-pre-self-scoping-baseline", + "quality_criteria_version": "none", + "prompt_version": null, + "release_binding_status": "historical_incomplete", + "release_binding_note": "This historical database run recorded scanner version and target commit, but not the repo-scoping engine commit or release tag that generated the 
candidate graph." + }, + "execution": { + "mode": "trusted-auto-review", + "analysis_run_id": 39, + "candidate_source": "deterministic", + "acceptance_mode": "trusted_auto_approve_candidate_graph", + "started_at": "2026-05-15T09:28:47Z", + "completed_at": "2026-05-15T09:28:48Z" + }, + "assessment": { + "role": "negative_regression_seed", + "outcome": "rejected", + "summary": "The self-analysis promoted LLM-provider vocabulary into a false native repo-scoping capability and attached API/CLI features below it.", + "reviewer": "codex", + "comparison_eligibility": "eligible_as_negative_seed", + "rationale": [ + "repo-scoping uses llm-connect as optional extraction infrastructure; it does not natively route LLM requests across providers.", + "Provider names came from scanner rules, normalization tokens, schema examples, tests, fixtures, and workplan text rather than product-facing provider-routing behavior.", + "The generated tree placed native API and CLI surfaces under the false LLM-provider capability, which makes the feature hierarchy misleading." + ] + }, + "fact_summary": { + "counts_by_kind": { + "config": 1, + "credential_config": 13, + "documentation": 14, + "fallback_policy": 10, + "framework": 2, + "intent": 1, + "interface": 127, + "language": 1, + "llm_provider": 41, + "manifest": 1, + "provider_registry": 7, + "scope": 1, + "test": 19 + }, + "contamination_sources": [ + { + "path": "src/repo_registry/repo_scanning/scanner.py", + "reason": "Provider detector constants, credential hint constants, and fallback/provider-registry scanner logic were treated as repo-owned LLM routing evidence." + }, + { + "path": "src/repo_registry/candidate_graph/normalization.py", + "reason": "Provider names used as distinctive candidate-normalization tokens were treated as implementation evidence for provider support." + }, + { + "path": "src/repo_registry/web_api/schemas.py", + "reason": "An OpenRouter example in an expectation-gap schema was treated as provider evidence." 
+ }, + { + "path": "tests/expectations/llm_connect_provider_expectations.json", + "reason": "A fixture describing llm-connect expectations was treated as repo-scoping product behavior." + }, + { + "path": "tests/fixtures.py", + "reason": "Regression fixture vocabulary was treated as native repo-scoping capability evidence." + }, + { + "path": "tests/test_candidate_graph.py", + "reason": "Unit-test examples for LLM-provider detection were treated as product evidence." + }, + { + "path": "tests/test_repository_scanner.py", + "reason": "Scanner tests for provider facts were treated as native provider-routing evidence." + } + ] + }, + "generated_tree": { + "abilities": [ + { + "name": "Support Repo Registry", + "status": "approved", + "primary_class": "repository-intelligence", + "capabilities": [ + { + "name": "Route LLM Requests Across Providers", + "status": "approved", + "primary_class": "llm-integration", + "features": [ + { + "name": "Use Anthropic Models", + "type": "integration", + "status": "approved", + "primary_class": "integration", + "location": "multiple files" + }, + { + "name": "Use Claude Models", + "type": "integration", + "status": "approved", + "primary_class": "integration", + "location": "multiple files" + }, + { + "name": "Use Gemini Models", + "type": "integration", + "status": "approved", + "primary_class": "integration", + "location": "multiple files" + }, + { + "name": "Use OpenAI Models", + "type": "integration", + "status": "approved", + "primary_class": "integration", + "location": "multiple files" + }, + { + "name": "Use OpenRouter Models", + "type": "integration", + "status": "approved", + "primary_class": "integration", + "location": "multiple files" + }, + { + "name": "Configure LLM Provider Credentials", + "type": "configuration", + "status": "approved", + "primary_class": "configuration", + "location": "multiple files" + }, + { + "name": "Maintain LLM Provider Registry", + "type": "backend", + "status": "approved", + "primary_class": 
"backend", + "location": "src/repo_registry/repo_scanning/scanner.py" + }, + { + "name": "Apply LLM Provider Fallback Policy", + "type": "backend", + "status": "approved", + "primary_class": "backend", + "location": "src/repo_registry/repo_scanning/scanner.py" + }, + { + "name": "HTTP API surface: possible API surface, GET /health, @app.post(, +43 more", + "type": "API", + "status": "approved", + "primary_class": "API", + "location": "multiple files" + }, + { + "name": "CLI command surface: CLI command build_parser, CLI command make_service", + "type": "CLI", + "status": "approved", + "primary_class": "CLI", + "location": "multiple files" + } + ] + } + ] + } + ] + }, + "known_regression_patterns": [ + { + "id": "RREG-SELF-REG-001", + "title": "LLM provider vocabulary promoted as native capability", + "severity": "critical", + "description": "Scanner, normalization, schema, fixture, test, or workplan vocabulary becomes the native repo-scoping capability Route LLM Requests Across Providers.", + "detection_hint": "Flag any top-level/native repo-scoping capability named Route LLM Requests Across Providers unless product intent and public implementation explicitly show provider routing as a repo-scoping feature." + }, + { + "id": "RREG-SELF-REG-002", + "title": "Native API and CLI surfaces attached under false capability", + "severity": "high", + "description": "General repo-scoping API/CLI interface features are nested below a capability they do not support.", + "detection_hint": "Flag API or CLI surface features when their parent capability is llm-integration or provider-routing." + }, + { + "id": "RREG-SELF-REG-003", + "title": "Deterministic trusted auto-approval accepted candidate truth", + "severity": "high", + "description": "A deterministic rule path approves candidate characteristics without human or agentic judgement.", + "detection_hint": "Flag trusted_auto_approve_candidate_graph review decisions in self-scoping assessment artifacts." 
+ } + ], + "notes": [ + "This artifact is a negative regression seed, not a desirable baseline.", + "The historical run is useful for pattern detection but is not fully release-bound because the engine commit was not recorded in the original analysis metadata." + ] +} diff --git a/docs/self-scoping/golden/repo-scoping-golden-profile.v1.json b/docs/self-scoping/golden/repo-scoping-golden-profile.v1.json new file mode 100644 index 0000000..58bb3a3 --- /dev/null +++ b/docs/self-scoping/golden/repo-scoping-golden-profile.v1.json @@ -0,0 +1,311 @@ +{ + "schema_version": "self-scoping-golden-profile/v1", + "profile_id": "repo-scoping-golden-profile-v1", + "repo_slug": "repo-scoping", + "status": "active", + "created_at": "2026-05-15", + "updated_at": "2026-05-15", + "curation": { + "curator": "codex", + "workplan_id": "RREG-WP-0013", + "summary": "Curated target profile for evaluating repo-scoping self-analysis quality." + }, + "ability": { + "name": "Map Repositories Into Reviewable Scope Profiles", + "primary_class": "repository-intelligence", + "attributes": [ + "capability-mapping", + "source-linked-review", + "scope-generation" + ], + "description": "repo-scoping turns repository source, documentation, and review decisions into source-linked maps of repository utility.", + "expected_capabilities": [ + { + "name": "Register And Track Repositories", + "primary_class": "ingestion", + "attributes": ["metadata", "git", "analysis-run"], + "expected_features": [ + { + "name": "Create and update repository records", + "primary_class": "api", + "source_paths": [ + "src/repo_registry/core/service.py", + "src/repo_registry/web_api/app.py", + "src/repo_registry/web_ui/views.py" + ] + }, + { + "name": "Resolve local or remote Git checkouts", + "primary_class": "backend", + "source_paths": [ + "src/repo_registry/repo_ingestion/git.py", + "tests/test_git_ingestion.py" + ] + }, + { + "name": "Import repository metadata", + "primary_class": "backend", + "source_paths": [ + 
"src/repo_registry/repo_ingestion/metadata.py", + "tests/test_repository_metadata.py" + ] + } + ] + }, + { + "name": "Scan Repositories Into Observed Facts", + "primary_class": "analysis", + "attributes": ["deterministic", "facts", "provenance"], + "expected_features": [ + { + "name": "Detect source languages, manifests, docs, tests, config, and interfaces", + "primary_class": "backend", + "source_paths": [ + "src/repo_registry/repo_scanning/scanner.py", + "tests/test_repository_scanner.py" + ] + }, + { + "name": "Classify source roles for facts", + "primary_class": "backend", + "source_paths": [ + "src/repo_registry/repo_scanning/scanner.py", + "docs/characteristic-evidence-model.md" + ] + }, + { + "name": "Preserve analysis snapshots and fact records", + "primary_class": "storage", + "source_paths": [ + "src/repo_registry/storage/sqlite.py", + "migrations/0001_initial.sql" + ] + } + ] + }, + { + "name": "Index Source Content With Provenance", + "primary_class": "analysis", + "attributes": ["content-chunks", "source-role"], + "expected_features": [ + { + "name": "Create source-linked content chunks from observed facts", + "primary_class": "backend", + "source_paths": [ + "src/repo_registry/content_indexing/extractor.py", + "tests/test_content_indexing.py" + ] + }, + { + "name": "Carry source-role metadata into downstream generation", + "primary_class": "backend", + "source_paths": [ + "src/repo_registry/content_indexing/extractor.py", + "src/repo_registry/llm_extraction/extractor.py" + ] + } + ] + }, + { + "name": "Generate Reviewable Candidate Characteristics", + "primary_class": "analysis", + "attributes": ["candidate-graph", "review-required"], + "expected_features": [ + { + "name": "Build candidate abilities, capabilities, features, and evidence", + "primary_class": "backend", + "source_paths": [ + "src/repo_registry/candidate_graph/generator.py", + "src/repo_registry/candidate_graph/normalization.py", + "tests/test_candidate_graph.py" + ] + }, + { + "name": 
"Optionally map structured LLM extraction into candidates", + "primary_class": "integration", + "source_paths": [ + "src/repo_registry/llm_extraction/extractor.py", + "src/repo_registry/llm_extraction/mapper.py", + "tests/test_llm_extraction.py" + ] + } + ] + }, + { + "name": "Review And Approve Candidate Characteristics", + "primary_class": "review", + "attributes": ["curation", "approval", "audit"], + "expected_features": [ + { + "name": "Edit, reject, merge, and relink candidate graph entries", + "primary_class": "api", + "source_paths": [ + "src/repo_registry/core/service.py", + "src/repo_registry/web_api/app.py", + "src/repo_registry/web_ui/views.py", + "tests/test_registry_service.py" + ] + }, + { + "name": "Publish approved characteristic maps after review", + "primary_class": "storage", + "source_paths": [ + "src/repo_registry/core/service.py", + "src/repo_registry/storage/sqlite.py" + ] + }, + { + "name": "Record review decisions and expectation gaps", + "primary_class": "audit", + "source_paths": [ + "src/repo_registry/core/service.py", + "src/repo_registry/web_api/schemas.py" + ] + } + ] + }, + { + "name": "Search Compare And Export Approved Profiles", + "primary_class": "discovery", + "attributes": ["search", "comparison", "export"], + "expected_features": [ + { + "name": "Search approved abilities, capabilities, features, and evidence", + "primary_class": "api", + "source_paths": [ + "src/repo_registry/core/service.py", + "src/repo_registry/semantic/embeddings.py", + "tests/test_registry_service.py" + ] + }, + { + "name": "Compare repositories and identify capability gaps", + "primary_class": "api", + "source_paths": [ + "src/repo_registry/core/service.py", + "src/repo_registry/web_api/app.py" + ] + }, + { + "name": "Export repository profiles", + "primary_class": "api", + "source_paths": [ + "src/repo_registry/web_api/app.py", + "docs/api-contract.md" + ] + } + ] + }, + { + "name": "Generate And Maintain SCOPE.md", + "primary_class": 
"scope-generation", + "attributes": ["scope-md", "diff", "validation"], + "expected_features": [ + { + "name": "Render SCOPE.md from approved characteristics", + "primary_class": "backend", + "source_paths": [ + "src/repo_registry/scope/generator.py", + "tests/test_scope_generator.py", + "docs/scope-md-spec.md" + ] + }, + { + "name": "Diff, validate, and write scope files", + "primary_class": "api", + "source_paths": [ + "src/repo_registry/scope/validator.py", + "src/repo_registry/web_api/app.py", + "src/repo_registry/web_ui/views.py" + ] + } + ] + }, + { + "name": "Explore Dependency And Impact Graphs", + "primary_class": "dependency-analysis", + "attributes": ["graph", "impact", "visualization"], + "expected_features": [ + { + "name": "Model dependencies between facts, evidence, features, capabilities, abilities, and scope", + "primary_class": "backend", + "source_paths": [ + "src/repo_registry/core/service.py", + "docs/dependency-aware-scope-propagation.md", + "docs/dependency-visualization-exploration.md" + ] + }, + { + "name": "Render dependency graph views and profiles", + "primary_class": "ui", + "source_paths": [ + "src/repo_registry/web_ui/views.py", + "tests/test_web_api.py" + ] + } + ] + }, + { + "name": "Provide Scope Context To Downstream Agents", + "primary_class": "coordination", + "attributes": ["activity-core", "api-contract"], + "expected_features": [ + { + "name": "Return compact JSON scope context by repository slug", + "primary_class": "api", + "source_paths": [ + "src/repo_registry/web_api/app.py", + "docs/schemas/repo-scope-context-response.json", + "tests/test_scope_context_api.py" + ] + } + ] + } + ] + }, + "forbidden_native_capabilities": [ + { + "name": "Route LLM Requests Across Providers", + "reason": "repo-scoping may use llm-connect as optional extraction infrastructure, but provider routing is not a native repo-scoping product capability.", + "allowed_only_if": "Future product intent and public implementation explicitly add provider 
routing as repo-scoping-owned behavior." + } + ], + "non_native_context": [ + { + "name": "LLM provider names in scanner, normalization, schemas, tests, fixtures, docs, or workplans", + "classification": "scanner-rule-or-fixture-context", + "expected_handling": "May support scanner behavior facts or test coverage, but must not become native capability truth." + }, + { + "name": "llm-connect integration", + "classification": "optional dependency / adapter consumer", + "expected_handling": "May appear as optional extraction infrastructure, not as repo-scoping-owned provider routing." + }, + { + "name": "SCOPE.md content", + "classification": "derived scope", + "expected_handling": "Can be comparison or bootstrap context, not primary evidence for regenerating the same characteristic model." + } + ], + "comparison_rules": { + "must_have_capability_names": [ + "Register And Track Repositories", + "Scan Repositories Into Observed Facts", + "Index Source Content With Provenance", + "Generate Reviewable Candidate Characteristics", + "Review And Approve Candidate Characteristics", + "Search Compare And Export Approved Profiles", + "Generate And Maintain SCOPE.md", + "Explore Dependency And Impact Graphs", + "Provide Scope Context To Downstream Agents" + ], + "must_not_have_native_capability_names": [ + "Route LLM Requests Across Providers" + ], + "known_regression_ids": [ + "RREG-SELF-REG-001", + "RREG-SELF-REG-002", + "RREG-SELF-REG-003" + ] + } +} diff --git a/tests/test_self_scoping_artifacts.py b/tests/test_self_scoping_artifacts.py new file mode 100644 index 0000000..e4dad18 --- /dev/null +++ b/tests/test_self_scoping_artifacts.py @@ -0,0 +1,112 @@ +import json +from pathlib import Path + + +ROOT = Path(__file__).resolve().parents[1] +SCHEMA_PATH = ROOT / "docs" / "schemas" / "self-scoping-assessment.schema.json" +KNOWN_BAD_PATH = ( + ROOT + / "docs" + / "self-scoping" + / "assessments" + / "repo-scoping-known-bad-2026-05-15-run-39.json" +) +GOLDEN_PROFILE_PATH = ( + 
ROOT + / "docs" + / "self-scoping" + / "golden" + / "repo-scoping-golden-profile.v1.json" +) + + +def load_json(path: Path) -> dict: + return json.loads(path.read_text(encoding="utf-8")) + + +def test_self_scoping_assessment_schema_requires_release_binding_metadata(): + schema = load_json(SCHEMA_PATH) + + required = set(schema["required"]) + engine_required = set(schema["$defs"]["engineIdentity"]["required"]) + + assert { + "target_repository", + "engine_identity", + "execution", + "assessment", + "fact_summary", + "generated_tree", + "known_regression_patterns", + } <= required + assert { + "repo_scoping_version", + "engine_commit", + "engine_release", + "engine_dirty_state", + "scanner_version", + "candidate_generator_version", + "quality_criteria_version", + "prompt_version", + "release_binding_status", + } <= engine_required + assert schema["$defs"]["engineIdentity"]["properties"]["release_binding_status"][ + "enum" + ] == ["complete", "historical_incomplete", "unbound"] + + +def test_known_bad_self_scoping_artifact_captures_rejected_regression_seed(): + artifact = load_json(KNOWN_BAD_PATH) + + assert artifact["schema_version"] == "self-scoping-assessment/v1" + assert artifact["artifact_id"] == "repo-scoping-known-bad-2026-05-15-run-39" + assert artifact["target_repository"]["repo_slug"] == "repo-scoping" + assert artifact["execution"]["analysis_run_id"] == 39 + assert artifact["assessment"]["role"] == "negative_regression_seed" + assert artifact["assessment"]["outcome"] == "rejected" + assert ( + artifact["engine_identity"]["release_binding_status"] + == "historical_incomplete" + ) + + capability_names = { + capability["name"] + for ability in artifact["generated_tree"]["abilities"] + for capability in ability["capabilities"] + } + regression_ids = {item["id"] for item in artifact["known_regression_patterns"]} + + assert "Route LLM Requests Across Providers" in capability_names + assert {"RREG-SELF-REG-001", "RREG-SELF-REG-002", "RREG-SELF-REG-003"} <= 
regression_ids + assert artifact["fact_summary"]["counts_by_kind"]["llm_provider"] == 41 + + +def test_golden_profile_names_expected_native_capabilities_and_forbidden_false_positive(): + profile = load_json(GOLDEN_PROFILE_PATH) + + expected_capability_names = { + capability["name"] + for capability in profile["ability"]["expected_capabilities"] + } + forbidden_names = { + capability["name"] for capability in profile["forbidden_native_capabilities"] + } + + assert profile["schema_version"] == "self-scoping-golden-profile/v1" + assert profile["repo_slug"] == "repo-scoping" + assert { + "Register And Track Repositories", + "Scan Repositories Into Observed Facts", + "Index Source Content With Provenance", + "Generate Reviewable Candidate Characteristics", + "Review And Approve Candidate Characteristics", + "Search Compare And Export Approved Profiles", + "Generate And Maintain SCOPE.md", + "Explore Dependency And Impact Graphs", + "Provide Scope Context To Downstream Agents", + } <= expected_capability_names + assert "Route LLM Requests Across Providers" in forbidden_names + assert profile["comparison_rules"]["must_not_have_native_capability_names"] == [ + "Route LLM Requests Across Providers" + ] + diff --git a/workplans/RREG-WP-0013-self-scoping-baseline-evaluation.md b/workplans/RREG-WP-0013-self-scoping-baseline-evaluation.md new file mode 100644 index 0000000..676245b --- /dev/null +++ b/workplans/RREG-WP-0013-self-scoping-baseline-evaluation.md @@ -0,0 +1,258 @@ +--- +id: RREG-WP-0013 +type: workplan +title: "Self-Scoping Baseline Evaluation" +domain: capabilities +repo: repo-scoping +status: active +owner: codex +topic_slug: foerster-capabilities +created: "2026-05-15" +updated: "2026-05-15" +state_hub_workstream_id: "1c740db0-1999-478b-b3e3-c0fdfec1e9dd" +--- + +# Self-Scoping Baseline Evaluation + +repo-scoping should become a self-improving infrastructure: every meaningful +change to the scoping engine should be testable against a known baseline for 
+repo-scoping itself. The goal is not just to assert that output changed, but to +make it easy for a human or trusted agent to decide whether an old or new +result is better and preserve that assessment as signal for future engine +iterations. + +The motivating failure is the 2026-05-15 self-analysis where deterministic +provider-vocabulary facts were promoted into an approved `Route LLM Requests +Across Providers` capability and the repo's native API/CLI features were +attached under that incorrect capability. Future reruns should make regressions +like that obvious, reviewable, and attributable to the exact repo-scoping +release that generated them. + +## T01: Define Self-Scoping Assessment Model + +```task +id: RREG-WP-0013-T01 +status: done +priority: high +state_hub_task_id: "af633b76-3356-4480-8108-d996eeda5a31" +``` + +Define the data model for immutable self-scoping assessment runs. + +Each assessment must bind together: + +- The target repository identity: repo slug, source URL/path, target commit, + target branch, and dirty-state marker when applicable. +- The engine identity: repo-scoping package version, git commit, git tag or + release name when available, dirty-state marker, scanner version, candidate + generator version, quality-gate/ruleset version, schema version, and prompt + version/hash when LLM or agentic review is used. +- The execution mode: deterministic-only, LLM-assisted, agent-reviewed, + trusted-auto-review, manual-review, or mixed. +- The generated artifacts: observed fact summary, candidate graph, approved map + or proposed approval set, rejected/downgraded items, source refs, and review + notes. +- The assessment outcome: baseline, challenger, preferred, tied, rejected, + superseded, or needs-human. + +Acceptance criteria: +- A documented schema exists for self-scoping assessment runs. +- Assessment runs are append-only; reruns create new records instead of + rewriting old judgements. 
+- Engine release binding is required before an assessment can be compared. +- Dirty working trees are visible in the assessment metadata. + +## T02: Capture Current Bad Self-Run As A Regression Seed + +```task +id: RREG-WP-0013-T02 +status: done +priority: high +state_hub_task_id: "98258aea-65bb-4709-921f-711c6cc6ee48" +``` + +Import or recreate the known-bad repo-scoping self-analysis as a named +regression seed. + +Known bad pattern: + +- Candidate/approved capability: `Route LLM Requests Across Providers`. +- Incorrect feature attachment: repo-scoping API/CLI surfaces nested under that + LLM-provider capability. +- Incorrect evidence: scanner vocabulary, schema examples, tests, and + provider-name normalization code treated as repo-owned LLM routing behavior. + +Acceptance criteria: +- The bad run can be inspected as a historical assessment artifact. +- It is clearly marked as a negative baseline, not a desired golden output. +- The failure explanation is stored next to the captured graph. +- Future comparison reports can flag when a challenger repeats the same pattern. + +## T03: Create Desired Repo-Scoping Golden Profile + +```task +id: RREG-WP-0013-T03 +status: done +priority: high +state_hub_task_id: "f3ef1711-a115-4368-a97e-98abd1eda521" +``` + +Author a curated golden profile for repo-scoping itself. This should be compact +enough for comparison but expressive enough to catch hierarchy errors. + +Expected native capabilities should cover at least: + +- Repository registration and metadata import. +- Deterministic repository scanning into observed facts. +- Source-role and provenance-aware content indexing. +- Candidate characteristic generation from facts and content. +- Candidate review, edit, reject, merge, relink, and approval workflow. +- Approved characteristic search, comparison, export, and capability-gap + exploration. +- SCOPE.md generation, diffing, validation, and write/update flows. +- Dependency graph and characteristic impact exploration. 
+- Scope context API support for downstream agents such as activity-core. + +Forbidden top-level/native capabilities should include: + +- `Route LLM Requests Across Providers`, unless repo-scoping later genuinely + implements provider routing as a product feature rather than using + `llm-connect` as optional extraction infrastructure. + +Acceptance criteria: +- The golden profile includes ability, capability, feature, and evidence + expectations with source paths. +- The profile distinguishes native utility from dependencies, fixtures, test + vocabulary, schema examples, and optional LLM extraction infrastructure. +- The profile is stored in a stable, reviewable fixture location. +- The profile can evolve through explicit assessment decisions. + +Implementation note 2026-05-15: added +`docs/schemas/self-scoping-assessment.schema.json`, +`docs/self-scoping/assessments/repo-scoping-known-bad-2026-05-15-run-39.json`, +`docs/self-scoping/golden/repo-scoping-golden-profile.v1.json`, and +`tests/test_self_scoping_artifacts.py`. The known-bad artifact is marked as a +negative regression seed with `historical_incomplete` release binding because +the original analysis run did not record the engine commit. + +## T04: Export Assessment Artifacts From Analysis Runs + +```task +id: RREG-WP-0013-T04 +status: todo +priority: high +state_hub_task_id: "51e01d45-7574-4c97-994d-dabb2bcf9a00" +``` + +Add a CLI and/or API workflow that exports a completed analysis run as a +self-scoping assessment artifact. + +Acceptance criteria: +- Export includes repository metadata, analysis run metadata, engine identity, + candidate graph, observed fact summary, content chunk summary, approved map + if present, review decisions, and quality-gate outcomes when available. +- Export format is deterministic JSON with a documented schema. +- Export refuses to mark an artifact comparable when engine identity is + incomplete. +- Export can target repo-scoping itself without requiring network access. 
+ +## T05: Compare Baseline And Challenger Runs + +```task +id: RREG-WP-0013-T05 +status: todo +priority: high +state_hub_task_id: "2b71069b-6150-45f4-84a2-59f5ec1e04c0" +``` + +Implement comparison between an existing baseline and a later challenger run. + +Comparison should report: + +- Added, removed, renamed, and moved abilities/capabilities/features. +- Hierarchy quality changes, especially misplaced features under the wrong + capability. +- Native-utility precision: whether generated capabilities are repo-owned, + facade/adapter, dependency, tooling, fixture, schema-example, or mention-only. +- Coverage against the repo-scoping golden profile. +- Regression flags for known-bad patterns. +- Source-ref quality: whether claims cite product intent, docs, source, tests, + fixtures, examples, or generated/derived scope. + +Acceptance criteria: +- Comparison output is useful in both machine-readable JSON and human-readable + Markdown. +- The report makes it easy to choose "old better", "new better", "tie", or + "needs review". +- It does not require candidates to have stable database IDs across runs. +- It can compare deterministic-only and agent-reviewed runs without losing + provenance. + +## T06: Add Side-By-Side Review UI + +```task +id: RREG-WP-0013-T06 +status: todo +priority: medium +state_hub_task_id: "16a60b7c-7e2c-4bb0-b4ab-2381289dba0b" +``` + +Expose baseline/challenger comparison in the curator UI. + +Acceptance criteria: +- Reviewers can select two assessment artifacts for repo-scoping. +- The UI shows the two hierarchy trees side by side with moved/misplaced items + highlighted. +- Reviewers can record preference, tie, rejection, and notes. +- Review decisions are persisted as assessment outcomes, not as changes to the + underlying historical artifacts. 
+ +## T07: Add Self-Scoping Regression Command + +```task +id: RREG-WP-0013-T07 +status: todo +priority: medium +state_hub_task_id: "af1fcecd-686d-4592-b739-4698abc98c55" +``` + +Add a repeatable command for running repo-scoping against itself and comparing +the result to the active baseline. + +Acceptance criteria: +- The command captures engine identity before running analysis. +- The command can run deterministic-only without LLM or agentic review. +- The command can optionally invoke agentic review when configured. +- The command emits a comparison report and exits non-zero only for explicit + CI-blocking regressions, not for ordinary "needs review" assessment outcomes. + +## T08: Document Assessment Workflow + +```task +id: RREG-WP-0013-T08 +status: todo +priority: medium +state_hub_task_id: "30d71946-3598-4dc7-9970-c7c18126cad7" +``` + +Document how maintainers should use self-scoping assessment artifacts while +evolving the engine. + +Acceptance criteria: +- Documentation explains baseline, challenger, preferred, tied, rejected, + superseded, and needs-human outcomes. +- Documentation explains engine release binding and why unbound output is not + comparable. +- Documentation gives examples for the known-bad LLM-provider regression and a + desired native repo-scoping profile. +- Documentation describes when to update the golden profile versus when to fix + the engine. + +## Completion Criteria + +- repo-scoping has an immutable, release-bound self-scoping assessment format. +- The current known-bad output is captured as a negative regression seed. +- A curated desired repo-scoping profile exists. +- Maintainers can rerun repo-scoping on itself, compare old/new results, and + record which output is better. +- Comparison results are bound to the repo-scoping release that generated them. 
diff --git a/workplans/RREG-WP-0014-agentic-characteristic-acceptance.md b/workplans/RREG-WP-0014-agentic-characteristic-acceptance.md new file mode 100644 index 0000000..111b715 --- /dev/null +++ b/workplans/RREG-WP-0014-agentic-characteristic-acceptance.md @@ -0,0 +1,225 @@ +--- +id: RREG-WP-0014 +type: workplan +title: "Agentic Characteristic Acceptance" +domain: capabilities +repo: repo-scoping +status: active +owner: codex +topic_slug: foerster-capabilities +created: "2026-05-15" +updated: "2026-05-15" +state_hub_workstream_id: "7feaa5b5-32d8-4b8e-b377-cbb3ddacf64a" +--- + +# Agentic Characteristic Acceptance + +Deterministic rules should not automatically accept candidate +characteristics. Determinism is strongest at fast, source-linked observation and +at applying transparent rejection or downgrade criteria: facts, provenance, +formal quality checks, schema validation, duplicate detection, and clear +negative filters. + +Acceptance is a judgement step. When automation stands in for human judgement, +it should be agentic: inspect the evidence, apply the visible quality criteria, +explain the decision, and leave a reviewable trace. Deterministic rules may +invalidate, downgrade, or require review, but they should not silently promote a +candidate into approved registry truth. + +## T01: Define Acceptance Policy Boundary + +```task +id: RREG-WP-0014-T01 +status: todo +priority: high +state_hub_task_id: "4bc2e749-ec9e-45d4-8095-63181efb752b" +``` + +Write the policy boundary between deterministic gates and acceptance +judgement. + +Policy principles: + +- Deterministic scanners generate observed facts and source refs. +- Deterministic quality gates can reject, downgrade, merge, flag, or require + review when criteria are formally expressible. +- Deterministic quality gates cannot approve candidate characteristics. +- Human reviewers can approve. +- Trusted agentic reviewers can approve only after producing an evidence-based + rationale. 
+- All automated review outcomes must be inspectable and reversible. + +Acceptance criteria: +- Documentation states that deterministic auto-approval is prohibited. +- Existing "trusted auto-approve" terminology is marked for replacement or + migration. +- The allowed deterministic outcomes are explicitly listed. +- The allowed agentic outcomes are explicitly listed. + +## T02: Create Transparent Quality Criteria Registry + +```task +id: RREG-WP-0014-T02 +status: todo +priority: high +state_hub_task_id: "101998a4-8cf8-4df0-8d05-c4e2041c0cac" +``` + +Create a reviewable quality criteria registry for candidate characteristics. + +Initial criteria should cover: + +- Source-role quality: intent/docs/source/tests are stronger than fixtures, + schema examples, agent guidance, CI/tooling, dependency declarations, or + derived scope. +- Native utility: owned/facade/adapter claims require explicit product evidence; + dependency, tooling, configuration, fixture, schema-example, and mention-only + claims are not native capabilities. +- Hierarchy fit: features should support their parent capability; misplaced + API/CLI surfaces should be flagged. +- Evidence sufficiency: candidate claims need source refs that support the + actual abstraction, not just matching vocabulary. +- Circularity: generated `SCOPE.md` text cannot be primary proof for rebuilding + the same characteristic model. +- Fixture contamination: tests and expectation files can prove scanner behavior + but should not become repo-native product capability claims. + +Acceptance criteria: +- Criteria are stored in a versioned, human-readable format. +- Each criterion has an identifier, description, severity, deterministic action + if applicable, and reviewer guidance. +- Criteria can be listed through CLI and/or API. +- Assessment and review records include the criteria version used. 
+ +## T03: Implement Deterministic Quality Gate Outcomes + +```task +id: RREG-WP-0014-T03 +status: todo +priority: high +state_hub_task_id: "d599c084-a207-4910-9d0b-578d0c50f282" +``` + +Apply quality criteria before any human or agentic acceptance step. + +Acceptance criteria: +- Candidate abilities, capabilities, features, and evidence can carry gate + outcomes such as `pass`, `downgraded`, `rejected`, `requires_review`, and + `invalidated`. +- Rejected or invalidated candidates remain auditable with reason codes. +- Downgraded candidates remain visible but cannot be accepted without explicit + reviewer override. +- Deterministic gates never mark a candidate as approved. +- The known repo-scoping LLM-provider self-scan failure is flagged before + acceptance. + +## T04: Replace Trusted Auto-Approval With Agentic Review + +```task +id: RREG-WP-0014-T04 +status: todo +priority: high +state_hub_task_id: "b0d29756-7460-4ffa-8d56-d94cfb34e94f" +``` + +Replace `trusted_auto_approve_candidate_graph` behavior with an agentic review +workflow. + +Acceptance criteria: +- Existing API/CLI/UI affordances no longer present deterministic + auto-approval as a safe path. +- A configured agentic reviewer receives the candidate graph, source refs, + quality-gate outcomes, criteria version, and repository context. +- The reviewer can approve, reject, downgrade, request human review, relink, + or propose edits. +- Each agentic approval includes a rationale tied to evidence and criteria. +- If no agentic reviewer is configured, candidates remain pending review. + +## T05: Add Review Decision Audit Trail + +```task +id: RREG-WP-0014-T05 +status: todo +priority: high +state_hub_task_id: "0d12559a-831e-40ff-bf82-85f45b763f07" +``` + +Extend review decisions so acceptance history is useful for later audits and +self-scoping assessments. + +Acceptance criteria: +- Review decisions record reviewer type: human, agent, deterministic-gate, or + migration. 
+- Agentic decisions record reviewer identity/configuration, criteria version, + prompt or policy version, evidence inspected, and rationale. +- Deterministic gate decisions record rule IDs and outcomes, not approval. +- Review records distinguish "candidate accepted as-is" from "accepted after + edits/relinks". +- Existing decisions remain readable through a migration or compatibility view. + +## T06: Add Human Override And Criteria Refinement Flow + +```task +id: RREG-WP-0014-T06 +status: todo +priority: medium +state_hub_task_id: "bcba3237-fb87-4a38-8e96-12b872d5e6a9" +``` + +Make quality criteria reviewable and refinable instead of hidden in code. + +Acceptance criteria: +- Reviewers can inspect which criteria fired for a candidate. +- Reviewers can override a gate with a reason. +- Overrides are searchable so repeated overrides can drive criteria changes. +- Criteria changes are versioned and linked to workplans or decisions. +- The UI makes it clear when a candidate is blocked by formal criteria versus + merely awaiting judgement. + +## T07: Regression Coverage For Acceptance Boundary + +```task +id: RREG-WP-0014-T07 +status: todo +priority: high +state_hub_task_id: "37a22c89-ded5-42dd-aaa9-ece79477fcff" +``` + +Add tests that lock in the new acceptance boundary. + +Acceptance criteria: +- Deterministic analysis can generate facts and candidates but cannot approve + them. +- Deterministic gates can reject/downgrade/require review with reason codes. +- Agentic review can approve only with a rationale and criteria version. +- The repo-scoping self-scan LLM-provider failure is not accepted by + deterministic rules. +- Existing manual review and approval paths keep working. + +## T08: Migration And Compatibility Plan + +```task +id: RREG-WP-0014-T08 +status: todo +priority: medium +state_hub_task_id: "3d5475f6-71a7-4ca7-aa69-573e91d1fe1e" +``` + +Plan the migration away from trusted deterministic auto-approval. 
+ +Acceptance criteria: +- Existing approved maps created by trusted auto-approval can be identified. +- Users can rebuild or re-review those maps without losing audit history. +- API and CLI changes are documented with compatibility notes. +- The old behavior is either removed or guarded behind an explicit deprecated + migration mode that cannot run by default. + +## Completion Criteria + +- Deterministic rules no longer approve candidate characteristics. +- Transparent, versioned quality criteria can reject, downgrade, invalidate, or + require review. +- Agentic review is the only automated path that can stand in for human + acceptance. +- Acceptance decisions are auditable, evidence-bound, and useful as training + signal for future self-scoping assessment.