diff --git a/README.md b/README.md index 798a6f2..14eee4e 100644 --- a/README.md +++ b/README.md @@ -122,6 +122,11 @@ entries while preserving source paths where they match observed facts or content chunks. Tests use fake adapters, so the default test suite does not call external providers. +Application code can inject an `LLMCandidateExtractor` into `RegistryService`. +When an extractor is present and returns candidates, analysis stores those +reviewable candidates; when it returns no candidates, the deterministic +heuristic generator remains the fallback. + ## Agent-Facing Endpoints The v0.1 API covers the main registration, analysis, review, search, and inspection loop: diff --git a/src/repo_registry/core/service.py b/src/repo_registry/core/service.py index 586f40c..e6adf95 100644 --- a/src/repo_registry/core/service.py +++ b/src/repo_registry/core/service.py @@ -17,6 +17,8 @@ from repo_registry.core.models import ( ) from repo_registry.candidate_graph.generator import CandidateGraphGenerator from repo_registry.content_indexing.extractor import ContentExtractor +from repo_registry.llm_extraction.extractor import LLMCandidateExtractor +from repo_registry.llm_extraction.mapper import LLMExtractionMapper from repo_registry.repo_ingestion.git import GitIngestionService from repo_registry.repo_ingestion.metadata import RepositoryMetadataExtractor from repo_registry.repo_scanning.scanner import DeterministicScanner @@ -30,6 +32,7 @@ class RegistryService: self, store: RegistryStore, ingestion: GitIngestionService | None = None, + llm_extractor: LLMCandidateExtractor | None = None, ) -> None: self.store = store self.scanner = DeterministicScanner() @@ -37,6 +40,8 @@ class RegistryService: self.metadata_extractor = RepositoryMetadataExtractor() self.candidate_generator = CandidateGraphGenerator() self.content_extractor = ContentExtractor() + self.llm_extractor = llm_extractor + self.llm_mapper = LLMExtractionMapper() def register_repository( self, @@ -122,11 +127,7 @@ 
def _generate_candidates(
    self,
    repository: Repository,
    facts: list[ObservedFact],
    chunks: list[ContentChunk],
):
    """Produce the candidate graph for one analysis run.

    If an LLM extractor was injected and it yields a non-empty result,
    that result is passed through ``self.llm_mapper`` together with the
    observed facts and content chunks. In every other case (no extractor,
    or an empty/falsy extraction) the deterministic
    ``self.candidate_generator`` builds the candidates instead.
    """
    extractor = self.llm_extractor
    # Only consult the extractor when one was actually configured.
    llm_output = None if extractor is None else extractor.extract(repository, chunks)
    if not llm_output:
        # Heuristic fallback: nothing usable came back from the LLM path.
        return self.candidate_generator.generate(repository, facts, chunks)
    return self.llm_mapper.map(llm_output, facts, chunks)
def test_analyze_repository_can_use_optional_llm_extractor(tmp_path):
    """Candidates returned by an injected extractor end up in the stored graph."""
    source = tmp_path / "repo"
    source.mkdir()
    (source / "README.md").write_text(
        "# Email Router\nRoutes incoming customer email.\n",
        encoding="utf-8",
    )
    store = RegistryStore(tmp_path / "registry.sqlite3")
    store.initialize()
    extractor = FakeLLMExtractor(
        [
            ExtractedAbility(
                name="Business Email Routing",
                description="Route incoming messages.",
                source_paths=["README.md"],
                capabilities=[
                    ExtractedCapability(
                        name="Classify Incoming Email",
                        description="Classify messages by intent.",
                        source_paths=["README.md"],
                    )
                ],
            )
        ]
    )
    service = RegistryService(
        store,
        ingestion=GitIngestionService(tmp_path / "checkouts"),
        llm_extractor=extractor,
    )
    repository = service.register_repository(name="Email Router", url=str(source))

    summary = service.analyze_repository(repository.id)
    graph = service.candidate_graph(repository.id, summary.analysis_run.id)

    # The extractor must have been consulted, and with a non-empty chunk list.
    assert extractor.calls
    assert extractor.calls[0][1]
    assert graph.abilities[0].name == "Business Email Routing"
    assert graph.abilities[0].capabilities[0].name == "Classify Incoming Email"
    assert graph.abilities[0].source_refs[0].path == "README.md"


def test_analyze_repository_falls_back_when_optional_llm_extractor_returns_no_candidates(tmp_path):
    """An extractor that yields nothing leaves the heuristic generator in charge."""
    source = tmp_path / "repo"
    source.mkdir()
    (source / "README.md").write_text("# Fallback\n", encoding="utf-8")
    store = RegistryStore(tmp_path / "registry.sqlite3")
    store.initialize()
    # Keep a handle on the fake so the test can prove it was actually invoked.
    extractor = FakeLLMExtractor([])
    service = RegistryService(
        store,
        ingestion=GitIngestionService(tmp_path / "checkouts"),
        llm_extractor=extractor,
    )
    repository = service.register_repository(name="Fallback", url=str(source))

    summary = service.analyze_repository(repository.id)
    graph = service.candidate_graph(repository.id, summary.analysis_run.id)

    # Without this check the test would also pass if the service never
    # wired the extractor in at all, which is not the fallback path.
    assert extractor.calls
    assert graph.abilities[0].name == "Review Fallback Repository Usefulness"
+ + def test_approve_candidate_graph_publishes_ability_map_once(tmp_path): source = tmp_path / "repo" source.mkdir()