From 07c837d6bbc2d9f42256ec6a44a5bb06a3dd5cff Mon Sep 17 00:00:00 2001 From: tegwick Date: Sun, 26 Apr 2026 02:49:58 +0200 Subject: [PATCH] candidate generation use content chunks --- .../candidate_graph/generator.py | 70 ++++++++++++++++--- src/repo_registry/core/service.py | 7 +- tests/test_candidate_graph.py | 53 +++++++++++++- tests/test_registry_service.py | 2 + 4 files changed, 119 insertions(+), 13 deletions(-) diff --git a/src/repo_registry/candidate_graph/generator.py b/src/repo_registry/candidate_graph/generator.py index a257592..c1b567e 100644 --- a/src/repo_registry/candidate_graph/generator.py +++ b/src/repo_registry/candidate_graph/generator.py @@ -2,7 +2,7 @@ from __future__ import annotations from dataclasses import dataclass, field -from repo_registry.core.models import ObservedFact, Repository, SourceReference +from repo_registry.core.models import ContentChunk, ObservedFact, Repository, SourceReference @dataclass(frozen=True) @@ -50,9 +50,11 @@ class CandidateGraphGenerator: self, repository: Repository, facts: list[ObservedFact], + chunks: list[ContentChunk] | None = None, ) -> list[CandidateAbilityDraft]: if not facts: return [] + chunks = chunks or [] docs = self._facts(facts, "documentation") tests = self._facts(facts, "test") @@ -65,11 +67,7 @@ class CandidateGraphGenerator: ability_sources = docs or manifests or languages ability = CandidateAbilityDraft( name=f"Review {repository.name} Repository Usefulness", - description=( - "Candidate usefulness summary generated from observed repository " - "documentation, manifests, languages, and interfaces. This is a " - "review seed, not an asserted domain ability." - ), + description=self._ability_description(chunks), confidence=0.55 if docs else 0.35, source_refs=self._source_refs(ability_sources), capabilities=[], @@ -77,7 +75,9 @@ class CandidateGraphGenerator: capabilities: list[CandidateCapabilityDraft] = [] if interfaces: - capabilities.append(self._interface_capability(interfaces, tests, examples, docs)) + capabilities.append( + self._interface_capability(interfaces, tests, examples, docs, chunks) + ) if manifests or frameworks or languages: capabilities.append( CandidateCapabilityDraft( @@ -110,6 +110,7 @@ class CandidateGraphGenerator: tests: list[ObservedFact], examples: list[ObservedFact], docs: list[ObservedFact], + chunks: list[ContentChunk], ) -> CandidateCapabilityDraft: features = [ CandidateFeatureDraft( @@ -123,10 +124,7 @@ class CandidateGraphGenerator: ] return CandidateCapabilityDraft( name="Expose Repository Interface", - description=( - "Expose one or more likely user-facing API or CLI entry points. " - "Review is required to name the concrete domain behavior." - ), + description=self._interface_description(chunks), inputs=[], outputs=["callable interface"], confidence=0.65, @@ -179,6 +177,56 @@ class CandidateGraphGenerator: return "API" return "interface" + def _ability_description(self, chunks: list[ContentChunk]) -> str: + doc_summary = self._document_summary(chunks) + if doc_summary: + return ( + "Candidate usefulness summary seeded from repository content: " + f"{doc_summary} This is a review seed, not an asserted domain ability." + ) + return ( + "Candidate usefulness summary generated from observed repository " + "documentation, manifests, languages, and interfaces. This is a " + "review seed, not an asserted domain ability." + ) + + def _interface_description(self, chunks: list[ContentChunk]) -> str: + interface_summary = self._interface_summary(chunks) + if interface_summary: + return ( + "Expose one or more likely user-facing API or CLI entry points. " + f"Source context: {interface_summary} Review is required to name " + "the concrete domain behavior." + ) + return ( + "Expose one or more likely user-facing API or CLI entry points. " + "Review is required to name the concrete domain behavior." + ) + + def _document_summary(self, chunks: list[ContentChunk]) -> str: + for chunk in chunks: + if chunk.kind != "documentation": + continue + lines = [line.strip() for line in chunk.text.splitlines() if line.strip()] + if not lines: + continue + heading = next((line.lstrip("#").strip() for line in lines if line.startswith("#")), "") + paragraph = next((line for line in lines if not line.startswith("#")), "") + if heading and paragraph: + return f"{heading}. {paragraph}" + return heading or paragraph + return "" + + def _interface_summary(self, chunks: list[ContentChunk]) -> str: + for chunk in chunks: + if chunk.kind != "interface": + continue + lines = [line.strip() for line in chunk.text.splitlines() if line.strip()] + if not lines: + continue + return " ".join(lines[:3]) + return "" + def _facts(self, facts: list[ObservedFact], kind: str) -> list[ObservedFact]: return [fact for fact in facts if fact.kind == kind] diff --git a/src/repo_registry/core/service.py b/src/repo_registry/core/service.py index 79bfe0f..586f40c 100644 --- a/src/repo_registry/core/service.py +++ b/src/repo_registry/core/service.py @@ -121,7 +121,12 @@ class RegistryService: completed_run.snapshot_id, chunks, ) - candidates = self.candidate_generator.generate(repository, facts) + stored_chunks = self.store.list_content_chunks(repository_id, completed_run.id) + candidates = self.candidate_generator.generate( + repository, + facts, + stored_chunks, + ) self.store.replace_candidate_graph(repository_id, completed_run.id, candidates) return ScanSummary( analysis_run=completed_run, diff --git a/tests/test_candidate_graph.py b/tests/test_candidate_graph.py index 7714931..d3ea051 100644 --- a/tests/test_candidate_graph.py +++ b/tests/test_candidate_graph.py @@ -1,5 +1,5 @@ from repo_registry.candidate_graph.generator import CandidateGraphGenerator -from repo_registry.core.models import ObservedFact, Repository +from repo_registry.core.models import ContentChunk, ObservedFact, Repository def fact(id, kind, name, path="", value=""): @@ -16,6 +16,20 @@ def fact(id, kind, name, path="", value=""): ) +def chunk(id, kind, path, text, start_line=1, end_line=1): + return ContentChunk( + id=id, + repository_id=1, + analysis_run_id=1, + snapshot_id=1, + path=path, + kind=kind, + start_line=start_line, + end_line=end_line, + text=text, + ) + + def test_candidate_generator_builds_review_seed_from_observed_facts(): repository = Repository( id=1, @@ -43,3 +57,40 @@ def test_candidate_generator_builds_review_seed_from_observed_facts(): assert interface_capability.features[0].type == "API" assert interface_capability.features[0].location == "app.py" assert interface_capability.evidence[0].strength == "strong" + + +def test_candidate_generator_enriches_descriptions_from_content_chunks(): + repository = Repository( + id=1, + name="MailRouter", + url="/tmp/mail-router", + description=None, + branch="main", + status="analyzed", + ) + facts = [ + fact(1, "documentation", "README", "README.md"), + fact(2, "interface", "python route decorator", "app.py", '@app.post("/classify")'), + ] + chunks = [ + chunk( + 1, + "documentation", + "README.md", + "# MailRouter\nRoutes incoming customer email to the right team.", + end_line=2, + ), + chunk( + 2, + "interface", + "app.py", + '@app.post("/classify")\ndef classify_email():\n return {}', + start_line=5, + end_line=7, + ), + ] + + graph = CandidateGraphGenerator().generate(repository, facts, chunks) + + assert "MailRouter. Routes incoming customer email" in graph[0].description + assert '@app.post("/classify")' in graph[0].capabilities[0].description diff --git a/tests/test_registry_service.py b/tests/test_registry_service.py index f25f4a1..b041087 100644 --- a/tests/test_registry_service.py +++ b/tests/test_registry_service.py @@ -352,6 +352,8 @@ def test_analyze_repository_records_snapshot_and_observed_facts(tmp_path): candidate_graph = service.candidate_graph(repository.id, summary.analysis_run.id) assert candidate_graph.repository.name == "Example" assert candidate_graph.abilities + assert "Example" in candidate_graph.abilities[0].description + assert "@app.get" in candidate_graph.abilities[0].capabilities[0].description capability_names = { capability.name for ability in candidate_graph.abilities