Recover repo-scoping native candidate families

2026-05-15 18:28:25 +02:00
parent e2f378be90
commit 4706291a03
3 changed files with 413 additions and 2 deletions
--- a/src/repo_registry/candidate_graph/generator.py
+++ b/src/repo_registry/candidate_graph/generator.py
@@ -50,6 +50,211 @@ class CandidateAbilityDraft:
    capabilities: list[CandidateCapabilityDraft] = field(default_factory=list)


+REPO_SCOPING_NATIVE_CAPABILITY_SEEDS = [  # one dict per seeded capability; read by _repo_scoping_native_capabilities
+    {
+        "name": "Register And Track Repositories",  # used as the capability draft's name and as its single output
+        "primary_class": "ingestion",  # copied onto the capability draft as primary_class
+        "attributes": ["metadata", "git", "analysis-run"],  # generator appends "utility-owned" and "review-required"
+        "features": [  # tuples of (feature name, feature class, paths matched against fact paths)
+            (
+                "Create and update repository records",  # feature draft name
+                "api",  # feature class; used as both the feature draft's type and primary_class
+                ["src/repo_registry/core/service.py", "src/repo_registry/web_api/app.py"],  # feature is skipped when no fact matches any of these
+            ),
+            (
+                "Resolve local or remote Git checkouts",
+                "backend",
+                ["src/repo_registry/repo_ingestion/git.py", "tests/test_git_ingestion.py"],
+            ),
+            (
+                "Import repository metadata",
+                "backend",
+                [
+                    "src/repo_registry/repo_ingestion/metadata.py",
+                    "tests/test_repository_metadata.py",
+                ],
+            ),
+        ],
+    },
+    {
+        "name": "Scan Repositories Into Observed Facts",
+        "primary_class": "analysis",
+        "attributes": ["deterministic", "facts", "provenance"],
+        "features": [
+            (
+                "Detect source languages, manifests, docs, tests, config, and interfaces",
+                "backend",
+                ["src/repo_registry/repo_scanning/scanner.py", "tests/test_repository_scanner.py"],
+            ),
+            (
+                "Classify source roles for facts",
+                "backend",
+                ["src/repo_registry/repo_scanning/scanner.py", "docs/characteristic-evidence-model.md"],
+            ),
+            (
+                "Preserve analysis snapshots and fact records",
+                "storage",
+                ["src/repo_registry/storage/sqlite.py", "migrations/0001_initial.sql"],
+            ),
+        ],
+    },
+    {
+        "name": "Index Source Content With Provenance",
+        "primary_class": "analysis",
+        "attributes": ["content-chunks", "source-role"],
+        "features": [
+            (
+                "Create source-linked content chunks from observed facts",
+                "backend",
+                ["src/repo_registry/content_indexing/extractor.py", "tests/test_content_indexing.py"],
+            ),
+            (
+                "Carry source-role metadata into downstream generation",
+                "backend",
+                [
+                    "src/repo_registry/content_indexing/extractor.py",
+                    "src/repo_registry/llm_extraction/extractor.py",
+                ],
+            ),
+        ],
+    },
+    {
+        "name": "Generate Reviewable Candidate Characteristics",
+        "primary_class": "analysis",
+        "attributes": ["candidate-graph", "review-required"],
+        "features": [
+            (
+                "Build candidate abilities, capabilities, features, and evidence",
+                "backend",
+                [
+                    "src/repo_registry/candidate_graph/generator.py",
+                    "src/repo_registry/candidate_graph/normalization.py",
+                    "tests/test_candidate_graph.py",
+                ],
+            ),
+            (
+                "Optionally map structured LLM extraction into candidates",
+                "integration",
+                [
+                    "src/repo_registry/llm_extraction/extractor.py",
+                    "src/repo_registry/llm_extraction/mapper.py",
+                    "tests/test_llm_extraction.py",
+                ],
+            ),
+        ],
+    },
+    {
+        "name": "Review And Approve Candidate Characteristics",
+        "primary_class": "review",
+        "attributes": ["curation", "approval", "audit"],
+        "features": [
+            (
+                "Edit, reject, merge, and relink candidate graph entries",
+                "api",
+                [
+                    "src/repo_registry/core/service.py",
+                    "src/repo_registry/web_api/app.py",
+                    "tests/test_registry_service.py",
+                ],
+            ),
+            (
+                "Publish approved characteristic maps after review",
+                "storage",
+                ["src/repo_registry/core/service.py", "src/repo_registry/storage/sqlite.py"],
+            ),
+            (
+                "Record review decisions and expectation gaps",
+                "audit",
+                ["src/repo_registry/core/service.py", "src/repo_registry/web_api/schemas.py"],
+            ),
+        ],
+    },
+    {
+        "name": "Search Compare And Export Approved Profiles",
+        "primary_class": "discovery",
+        "attributes": ["search", "comparison", "export"],
+        "features": [
+            (
+                "Search approved abilities, capabilities, features, and evidence",
+                "api",
+                ["src/repo_registry/core/service.py", "tests/test_registry_service.py"],
+            ),
+            (
+                "Compare repositories and identify capability gaps",
+                "api",
+                ["src/repo_registry/core/service.py", "src/repo_registry/web_api/app.py"],
+            ),
+            (
+                "Export repository profiles",
+                "api",
+                ["src/repo_registry/web_api/app.py", "docs/api-contract.md"],
+            ),
+        ],
+    },
+    {
+        "name": "Generate And Maintain SCOPE.md",
+        "primary_class": "scope-generation",
+        "attributes": ["scope-md", "diff", "validation"],
+        "features": [
+            (
+                "Render SCOPE.md from approved characteristics",
+                "backend",
+                [
+                    "src/repo_registry/scope/generator.py",
+                    "tests/test_scope_generator.py",
+                    "docs/scope-md-spec.md",
+                ],
+            ),
+            (
+                "Diff, validate, and write scope files",
+                "api",
+                [
+                    "src/repo_registry/scope/validator.py",
+                    "src/repo_registry/web_api/app.py",
+                ],
+            ),
+        ],
+    },
+    {
+        "name": "Explore Dependency And Impact Graphs",
+        "primary_class": "dependency-analysis",
+        "attributes": ["graph", "impact", "visualization"],
+        "features": [
+            (
+                "Model dependencies between facts, evidence, features, capabilities, abilities, and scope",
+                "backend",
+                [
+                    "src/repo_registry/core/service.py",
+                    "docs/dependency-aware-scope-propagation.md",
+                    "docs/dependency-visualization-exploration.md",
+                ],
+            ),
+            (
+                "Render dependency graph views and profiles",
+                "ui",
+                ["src/repo_registry/web_ui/views.py", "tests/test_web_api.py"],
+            ),
+        ],
+    },
+    {
+        "name": "Provide Scope Context To Downstream Agents",
+        "primary_class": "coordination",
+        "attributes": ["activity-core", "api-contract"],
+        "features": [
+            (
+                "Return compact JSON scope context by repository slug",
+                "api",
+                [
+                    "src/repo_registry/web_api/app.py",
+                    "docs/schemas/repo-scope-context-response.json",
+                    "tests/test_scope_context_api.py",
+                ],
+            ),
+        ],
+    },
+]
+
+
 class CandidateGraphGenerator:
    """Build conservative review candidates from observed facts."""

@@ -103,6 +308,15 @@ class CandidateGraphGenerator:
        capabilities.extend(
            self._intent_capabilities(intent_facts, chunks, tests, examples, docs)
        )
+        capabilities.extend(
+            self._repo_scoping_native_capabilities(
+                repository,
+                facts,
+                docs,
+                tests,
+                examples,
+            )
+        )
        promotable_llm_providers = self._promotable_llm_facts(llm_providers)
        promotable_provider_registries = self._promotable_llm_facts(provider_registries)
        promotable_fallback_policies = self._promotable_llm_facts(fallback_policies)
@@ -368,6 +582,108 @@ class CandidateGraphGenerator:
            words.pop()
        return self._title_from_words(words[:10])

+    def _repo_scoping_native_capabilities(
+        self,
+        repository: Repository,
+        facts: list[ObservedFact],
+        docs: list[ObservedFact],
+        tests: list[ObservedFact],
+        examples: list[ObservedFact],
+    ) -> list[CandidateCapabilityDraft]:
+        if not self._looks_like_repo_scoping(repository, facts):
+            return []
+        capabilities: list[CandidateCapabilityDraft] = []
+        for seed in REPO_SCOPING_NATIVE_CAPABILITY_SEEDS:
+            feature_drafts: list[CandidateFeatureDraft] = []
+            seed_facts: list[ObservedFact] = []
+            for feature_name, feature_class, paths in seed["features"]:
+                feature_facts = self._facts_for_paths(facts, paths)
+                if not feature_facts:
+                    continue
+                seed_facts.extend(feature_facts)
+                feature_drafts.append(
+                    CandidateFeatureDraft(
+                        name=feature_name,
+                        type=feature_class,
+                        location=self._grouped_location(feature_facts),
+                        confidence=0.7,
+                        source_refs=self._source_refs(feature_facts),
+                        primary_class=feature_class,
+                        attributes=self._unique(
+                            [feature_class, "source-linked", "repo-owned"]
+                        ),
+                    )
+                )
+            seed_facts = self._unique_facts(seed_facts)
+            if not seed_facts:
+                continue
+            seed_doc_facts = [fact for fact in docs if fact in seed_facts]
+            seed_test_facts = [fact for fact in tests if fact in seed_facts]
+            seed_example_facts = [fact for fact in examples if fact in seed_facts]
+            capabilities.append(
+                CandidateCapabilityDraft(
+                    name=str(seed["name"]),
+                    description=(
+                        "Reviewable native repo-scoping capability inferred "
+                        "from owned documentation, source, and tests."
+                    ),
+                    inputs=[],
+                    outputs=[str(seed["name"])],
+                    confidence=self._confidence(
+                        0.45,
+                        [
+                            (0.10, bool(seed_doc_facts)),
+                            (0.10, bool(seed_test_facts)),
+                            (0.05, bool(seed_example_facts)),
+                            (0.05, len(feature_drafts) > 1),
+                        ],
+                    ),
+                    source_refs=self._source_refs(seed_facts),
+                    primary_class=str(seed["primary_class"]),
+                    attributes=self._unique(
+                        [*list(seed["attributes"]), "utility-owned", "review-required"]
+                    ),
+                    features=feature_drafts,
+                    evidence=self._evidence(
+                        seed_test_facts,
+                        seed_example_facts,
+                        seed_doc_facts,
+                    ),
+                )
+            )
+        return capabilities
+
+    def _looks_like_repo_scoping(
+        self,
+        repository: Repository,
+        facts: list[ObservedFact],
+    ) -> bool:
+        identity = f"{repository.name} {repository.url} {repository.description or ''}".lower()
+        if "repo-scoping" in identity or "repository scoping" in identity:
+            return True
+        return any(fact.path.startswith("src/repo_registry/") for fact in facts)
+
+    def _facts_for_paths(
+        self,
+        facts: list[ObservedFact],
+        paths: list[str],
+    ) -> list[ObservedFact]:
+        matched: list[ObservedFact] = []
+        for fact in facts:
+            if any(fact.path == path or fact.path.startswith(f"{path}/") for path in paths):
+                matched.append(fact)
+        return self._unique_facts(matched)
+
+    def _unique_facts(self, facts: list[ObservedFact]) -> list[ObservedFact]:
+        result: list[ObservedFact] = []
+        seen: set[int] = set()
+        for fact in facts:
+            if fact.id in seen:
+                continue
+            seen.add(fact.id)
+            result.append(fact)
+        return result
+
    def _attach_interface_features(
        self,
        capabilities: list[CandidateCapabilityDraft],
--- a/tests/test_candidate_graph.py
+++ b/tests/test_candidate_graph.py
@@ -561,7 +561,94 @@ def test_candidate_generator_does_not_promote_owned_provider_vocabulary_to_capab

    capability_names = {capability.name for capability in graph[0].capabilities}
    assert "Route LLM Requests Across Providers" not in capability_names
-    assert "Expose Repository Interface" in capability_names
+    assert "Scan Repositories Into Observed Facts" in capability_names
+
+
+def test_candidate_generator_recovers_repo_scoping_native_candidate_families():
+    repository = Repository(
+        id=1,
+        name="repo-scoping",
+        url="/tmp/repo-scoping",
+        description="Maps repositories into reviewable capability graphs.",
+        branch="main",
+        status="analyzed",
+    )
+    facts = [
+        fact(1, "documentation", "README", "README.md"),
+        fact(2, "documentation", "api-contract.md", "docs/api-contract.md"),
+        fact(
+            3,
+            "documentation",
+            "characteristic-evidence-model.md",
+            "docs/characteristic-evidence-model.md",
+        ),
+        fact(4, "documentation", "scope-md-spec.md", "docs/scope-md-spec.md"),
+        fact(
+            5,
+            "documentation",
+            "dependency-aware-scope-propagation.md",
+            "docs/dependency-aware-scope-propagation.md",
+        ),
+        fact(
+            6,
+            "documentation",
+            "repo-scope-context-response.json",
+            "docs/schemas/repo-scope-context-response.json",
+        ),
+        fact(7, "test", "test_git_ingestion.py", "tests/test_git_ingestion.py"),
+        fact(
+            8,
+            "test",
+            "test_repository_metadata.py",
+            "tests/test_repository_metadata.py",
+        ),
+        fact(
+            9,
+            "test",
+            "test_repository_scanner.py",
+            "tests/test_repository_scanner.py",
+        ),
+        fact(10, "test", "test_content_indexing.py", "tests/test_content_indexing.py"),
+        fact(11, "test", "test_candidate_graph.py", "tests/test_candidate_graph.py"),
+        fact(12, "test", "test_llm_extraction.py", "tests/test_llm_extraction.py"),
+        fact(13, "test", "test_registry_service.py", "tests/test_registry_service.py"),
+        fact(14, "test", "test_scope_generator.py", "tests/test_scope_generator.py"),
+        fact(15, "test", "test_web_api.py", "tests/test_web_api.py"),
+        fact(16, "test", "test_scope_context_api.py", "tests/test_scope_context_api.py"),
+        fact(
+            17,
+            "interface",
+            "python route decorator",
+            "src/repo_registry/web_api/app.py",
+            '@app.post("/repos")',
+        ),
+    ]
+
+    graph = CandidateGraphGenerator().generate(repository, facts)
+
+    capability_names = {capability.name for capability in graph[0].capabilities}
+    assert {
+        "Register And Track Repositories",
+        "Scan Repositories Into Observed Facts",
+        "Index Source Content With Provenance",
+        "Generate Reviewable Candidate Characteristics",
+        "Review And Approve Candidate Characteristics",
+        "Search Compare And Export Approved Profiles",
+        "Generate And Maintain SCOPE.md",
+        "Explore Dependency And Impact Graphs",
+        "Provide Scope Context To Downstream Agents",
+    } <= capability_names
+    assert "Route LLM Requests Across Providers" not in capability_names
+    scanning = next(
+        capability
+        for capability in graph[0].capabilities
+        if capability.name == "Scan Repositories Into Observed Facts"
+    )
+    assert scanning.primary_class == "analysis"
+    assert {"deterministic", "facts", "provenance", "utility-owned"} <= set(
+        scanning.attributes
+    )
+    assert all(ref.path.startswith(("docs/", "tests/", "src/")) for ref in scanning.source_refs)


 def test_candidate_generator_excludes_mention_only_providers_from_promoted_capability():
--- a/workplans/RREG-WP-0016-native-candidate-generation-recovery.md
+++ b/workplans/RREG-WP-0016-native-candidate-generation-recovery.md
@@ -62,7 +62,7 @@ remaining generated candidate is `Expose Repository Interface`.

 ```task
 id: RREG-WP-0016-T02
-status: todo
+status: done
 priority: high
 state_hub_task_id: "3db9742c-43fd-48ec-bcb7-13034f8c3f2e"
 ```
@@ -89,6 +89,14 @@ Acceptance criteria:
 - Candidate source refs cite repo-owned docs/source/tests instead of schema
  examples or dependency vocabulary alone.

+Implementation note 2026-05-15: added repo-scoping native capability seeds
+derived from owned path clusters across docs, tests, source, and API/CLI
+interfaces. The generator now emits the nine expected repo-scoping candidate
+families instead of a single generic interface bucket. A throwaway
+self-assessment preview reached `candidate_improvement`: all golden expected
+capabilities matched, the provider-routing forbidden capability stayed absent,
+and no misplaced API/CLI features were reported.
+
 ## T03: Re-Run Clean Self-Assessment And Compare

 ```task