From 07c837d6bbc2d9f42256ec6a44a5bb06a3dd5cff Mon Sep 17 00:00:00 2001
From: tegwick <bernd.worsch@gmail.com>
Date: Sun, 26 Apr 2026 02:49:58 +0200
Subject: [PATCH] candidate generation: use content chunks

---
 .../candidate_graph/generator.py              | 70 ++++++++++++++++---
 src/repo_registry/core/service.py             |  7 +-
 tests/test_candidate_graph.py                 | 53 +++++++++++++-
 tests/test_registry_service.py                |  2 +
 4 files changed, 119 insertions(+), 13 deletions(-)

diff --git a/src/repo_registry/candidate_graph/generator.py b/src/repo_registry/candidate_graph/generator.py
index a257592..c1b567e 100644
--- a/src/repo_registry/candidate_graph/generator.py
+++ b/src/repo_registry/candidate_graph/generator.py
@@ -2,7 +2,7 @@ from __future__ import annotations
 
 from dataclasses import dataclass, field
 
-from repo_registry.core.models import ObservedFact, Repository, SourceReference
+from repo_registry.core.models import ContentChunk, ObservedFact, Repository, SourceReference
 
 
 @dataclass(frozen=True)
@@ -50,9 +50,11 @@ class CandidateGraphGenerator:
         self,
         repository: Repository,
         facts: list[ObservedFact],
+        chunks: list[ContentChunk] | None = None,
     ) -> list[CandidateAbilityDraft]:
         if not facts:
             return []
+        chunks = chunks or []
 
         docs = self._facts(facts, "documentation")
         tests = self._facts(facts, "test")
@@ -65,11 +67,7 @@ class CandidateGraphGenerator:
         ability_sources = docs or manifests or languages
         ability = CandidateAbilityDraft(
             name=f"Review {repository.name} Repository Usefulness",
-            description=(
-                "Candidate usefulness summary generated from observed repository "
-                "documentation, manifests, languages, and interfaces. This is a "
-                "review seed, not an asserted domain ability."
-            ),
+            description=self._ability_description(chunks),
             confidence=0.55 if docs else 0.35,
             source_refs=self._source_refs(ability_sources),
             capabilities=[],
@@ -77,7 +75,9 @@ class CandidateGraphGenerator:
 
         capabilities: list[CandidateCapabilityDraft] = []
         if interfaces:
-            capabilities.append(self._interface_capability(interfaces, tests, examples, docs))
+            capabilities.append(
+                self._interface_capability(interfaces, tests, examples, docs, chunks)
+            )
         if manifests or frameworks or languages:
             capabilities.append(
                 CandidateCapabilityDraft(
@@ -110,6 +110,7 @@ class CandidateGraphGenerator:
         tests: list[ObservedFact],
         examples: list[ObservedFact],
         docs: list[ObservedFact],
+        chunks: list[ContentChunk],
     ) -> CandidateCapabilityDraft:
         features = [
             CandidateFeatureDraft(
@@ -123,10 +124,7 @@ class CandidateGraphGenerator:
         ]
         return CandidateCapabilityDraft(
             name="Expose Repository Interface",
-            description=(
-                "Expose one or more likely user-facing API or CLI entry points. "
-                "Review is required to name the concrete domain behavior."
-            ),
+            description=self._interface_description(chunks),
             inputs=[],
             outputs=["callable interface"],
             confidence=0.65,
@@ -179,6 +177,56 @@ class CandidateGraphGenerator:
             return "API"
         return "interface"
 
+    def _ability_description(self, chunks: list[ContentChunk]) -> str:
+        doc_summary = self._document_summary(chunks)
+        if doc_summary:
+            return (
+                "Candidate usefulness summary seeded from repository content: "
+                f"{doc_summary} This is a review seed, not an asserted domain ability."
+            )
+        return (
+            "Candidate usefulness summary generated from observed repository "
+            "documentation, manifests, languages, and interfaces. This is a "
+            "review seed, not an asserted domain ability."
+        )
+
+    def _interface_description(self, chunks: list[ContentChunk]) -> str:
+        interface_summary = self._interface_summary(chunks)
+        if interface_summary:
+            return (
+                "Expose one or more likely user-facing API or CLI entry points. "
+                f"Source context: {interface_summary} Review is required to name "
+                "the concrete domain behavior."
+            )
+        return (
+            "Expose one or more likely user-facing API or CLI entry points. "
+            "Review is required to name the concrete domain behavior."
+        )
+
+    def _document_summary(self, chunks: list[ContentChunk]) -> str:
+        for chunk in chunks:
+            if chunk.kind != "documentation":
+                continue
+            lines = [line.strip() for line in chunk.text.splitlines() if line.strip()]
+            if not lines:
+                continue
+            heading = next((line.lstrip("#").strip() for line in lines if line.startswith("#")), "")
+            paragraph = next((line for line in lines if not line.startswith("#")), "")
+            if heading and paragraph:
+                return f"{heading}. {paragraph}"
+            return heading or paragraph
+        return ""
+
+    def _interface_summary(self, chunks: list[ContentChunk]) -> str:
+        for chunk in chunks:
+            if chunk.kind != "interface":
+                continue
+            lines = [line.strip() for line in chunk.text.splitlines() if line.strip()]
+            if not lines:
+                continue
+            return " ".join(lines[:3])
+        return ""
+
     def _facts(self, facts: list[ObservedFact], kind: str) -> list[ObservedFact]:
         return [fact for fact in facts if fact.kind == kind]
 
diff --git a/src/repo_registry/core/service.py b/src/repo_registry/core/service.py
index 79bfe0f..586f40c 100644
--- a/src/repo_registry/core/service.py
+++ b/src/repo_registry/core/service.py
@@ -121,7 +121,12 @@ class RegistryService:
             completed_run.snapshot_id,
             chunks,
         )
-        candidates = self.candidate_generator.generate(repository, facts)
+        stored_chunks = self.store.list_content_chunks(repository_id, completed_run.id)
+        candidates = self.candidate_generator.generate(
+            repository,
+            facts,
+            stored_chunks,
+        )
         self.store.replace_candidate_graph(repository_id, completed_run.id, candidates)
         return ScanSummary(
             analysis_run=completed_run,
diff --git a/tests/test_candidate_graph.py b/tests/test_candidate_graph.py
index 7714931..d3ea051 100644
--- a/tests/test_candidate_graph.py
+++ b/tests/test_candidate_graph.py
@@ -1,5 +1,5 @@
 from repo_registry.candidate_graph.generator import CandidateGraphGenerator
-from repo_registry.core.models import ObservedFact, Repository
+from repo_registry.core.models import ContentChunk, ObservedFact, Repository
 
 
 def fact(id, kind, name, path="", value=""):
@@ -16,6 +16,20 @@ def fact(id, kind, name, path="", value=""):
     )
 
 
+def chunk(id, kind, path, text, start_line=1, end_line=1):
+    return ContentChunk(
+        id=id,
+        repository_id=1,
+        analysis_run_id=1,
+        snapshot_id=1,
+        path=path,
+        kind=kind,
+        start_line=start_line,
+        end_line=end_line,
+        text=text,
+    )
+
+
 def test_candidate_generator_builds_review_seed_from_observed_facts():
     repository = Repository(
         id=1,
@@ -43,3 +57,40 @@ def test_candidate_generator_builds_review_seed_from_observed_facts():
     assert interface_capability.features[0].type == "API"
     assert interface_capability.features[0].location == "app.py"
     assert interface_capability.evidence[0].strength == "strong"
+
+
+def test_candidate_generator_enriches_descriptions_from_content_chunks():
+    repository = Repository(
+        id=1,
+        name="MailRouter",
+        url="/tmp/mail-router",
+        description=None,
+        branch="main",
+        status="analyzed",
+    )
+    facts = [
+        fact(1, "documentation", "README", "README.md"),
+        fact(2, "interface", "python route decorator", "app.py", '@app.post("/classify")'),
+    ]
+    chunks = [
+        chunk(
+            1,
+            "documentation",
+            "README.md",
+            "# MailRouter\nRoutes incoming customer email to the right team.",
+            end_line=2,
+        ),
+        chunk(
+            2,
+            "interface",
+            "app.py",
+            '@app.post("/classify")\ndef classify_email():\n    return {}',
+            start_line=5,
+            end_line=7,
+        ),
+    ]
+
+    graph = CandidateGraphGenerator().generate(repository, facts, chunks)
+
+    assert "MailRouter. Routes incoming customer email" in graph[0].description
+    assert '@app.post("/classify")' in graph[0].capabilities[0].description
diff --git a/tests/test_registry_service.py b/tests/test_registry_service.py
index f25f4a1..b041087 100644
--- a/tests/test_registry_service.py
+++ b/tests/test_registry_service.py
@@ -352,6 +352,8 @@ def test_analyze_repository_records_snapshot_and_observed_facts(tmp_path):
     candidate_graph = service.candidate_graph(repository.id, summary.analysis_run.id)
     assert candidate_graph.repository.name == "Example"
     assert candidate_graph.abilities
+    assert "Example" in candidate_graph.abilities[0].description
+    assert "@app.get" in candidate_graph.abilities[0].capabilities[0].description
     capability_names = {
         capability.name
         for ability in candidate_graph.abilities