Wired optional LLM extraction into the analysis path

This commit is contained in:
2026-04-26 03:11:45 +02:00
parent 3aa0c08ab9
commit 0f10ca6065
3 changed files with 95 additions and 5 deletions

View File

@@ -122,6 +122,11 @@ entries while preserving source paths where they match observed facts or
content chunks. Tests use fake adapters, so the default test suite does not call
external providers.
Application code can inject an `LLMCandidateExtractor` into `RegistryService`.
When an extractor is present and returns candidates, analysis stores those
candidates for review; when no extractor is configured or it returns no
candidates, the deterministic heuristic generator is used instead.
## Agent-Facing Endpoints
The v0.1 API covers the main registration, analysis, review, search, and inspection loop:

View File

@@ -17,6 +17,8 @@ from repo_registry.core.models import (
)
from repo_registry.candidate_graph.generator import CandidateGraphGenerator
from repo_registry.content_indexing.extractor import ContentExtractor
from repo_registry.llm_extraction.extractor import LLMCandidateExtractor
from repo_registry.llm_extraction.mapper import LLMExtractionMapper
from repo_registry.repo_ingestion.git import GitIngestionService
from repo_registry.repo_ingestion.metadata import RepositoryMetadataExtractor
from repo_registry.repo_scanning.scanner import DeterministicScanner
@@ -30,6 +32,7 @@ class RegistryService:
self,
store: RegistryStore,
ingestion: GitIngestionService | None = None,
llm_extractor: LLMCandidateExtractor | None = None,
) -> None:
self.store = store
self.scanner = DeterministicScanner()
@@ -37,6 +40,8 @@ class RegistryService:
self.metadata_extractor = RepositoryMetadataExtractor()
self.candidate_generator = CandidateGraphGenerator()
self.content_extractor = ContentExtractor()
self.llm_extractor = llm_extractor
self.llm_mapper = LLMExtractionMapper()
def register_repository(
self,
@@ -122,11 +127,7 @@ class RegistryService:
chunks,
)
stored_chunks = self.store.list_content_chunks(repository_id, completed_run.id)
candidates = self.candidate_generator.generate(
repository,
facts,
stored_chunks,
)
candidates = self._generate_candidates(repository, facts, stored_chunks)
self.store.replace_candidate_graph(repository_id, completed_run.id, candidates)
return ScanSummary(
analysis_run=completed_run,
@@ -134,6 +135,18 @@ class RegistryService:
facts=facts,
)
def _generate_candidates(
    self,
    repository: Repository,
    facts: list[ObservedFact],
    chunks: list[ContentChunk],
):
    """Produce the candidate graph for one analysis run.

    If an LLM extractor was injected and its extraction is non-empty, the
    extraction is mapped against ``facts`` and ``chunks`` and that mapping
    is returned. In every other case (no extractor configured, or a falsy
    extraction result) the deterministic candidate generator is used.
    """
    extractor = self.llm_extractor
    llm_result = (
        extractor.extract(repository, chunks) if extractor is not None else None
    )
    # Truthiness, not ``is None``: an empty extraction also takes the
    # heuristic path.
    if not llm_result:
        return self.candidate_generator.generate(repository, facts, chunks)
    return self.llm_mapper.map(llm_result, facts, chunks)
def list_analysis_runs(self, repository_id: int) -> list[AnalysisRun]:
    """Return whatever the store lists as analysis runs for ``repository_id``."""
    runs = self.store.list_analysis_runs(repository_id)
    return runs

View File

@@ -1,6 +1,7 @@
import subprocess
from repo_registry.core.service import RegistryService
from repo_registry.llm_extraction import ExtractedAbility, ExtractedCapability
from repo_registry.repo_ingestion.git import GitIngestionService
from repo_registry.storage.sqlite import NotFoundError, RegistryStore
@@ -11,6 +12,16 @@ def make_service(tmp_path):
return RegistryService(store, ingestion=GitIngestionService(tmp_path / "checkouts"))
class FakeLLMExtractor:
    """Test double for an LLM extractor: replays a canned result and logs calls."""

    def __init__(self, abilities):
        # ``calls`` collects one (repository, chunks) tuple per extract() call.
        self.calls = []
        self.abilities = abilities

    def extract(self, repository, chunks):
        """Record the arguments, then return the canned ``abilities`` as-is."""
        invocation = (repository, chunks)
        self.calls.append(invocation)
        return self.abilities
def test_manual_registry_builds_ability_map(tmp_path):
service = make_service(tmp_path)
@@ -363,6 +374,67 @@ def test_analyze_repository_records_snapshot_and_observed_facts(tmp_path):
assert "Expose Repository Interface" in capability_names
def test_analyze_repository_can_use_optional_llm_extractor(tmp_path):
    """A non-empty result from an injected extractor ends up in the candidate graph."""
    # Arrange: a directory holding only a README, registered below by path.
    source = tmp_path / "repo"
    source.mkdir()
    readme = source / "README.md"
    readme.write_text(
        "# Email Router\nRoutes incoming customer email.\n",
        encoding="utf-8",
    )
    store = RegistryStore(tmp_path / "registry.sqlite3")
    store.initialize()

    classify_capability = ExtractedCapability(
        name="Classify Incoming Email",
        description="Classify messages by intent.",
        source_paths=["README.md"],
    )
    routing_ability = ExtractedAbility(
        name="Business Email Routing",
        description="Route incoming messages.",
        source_paths=["README.md"],
        capabilities=[classify_capability],
    )
    extractor = FakeLLMExtractor([routing_ability])
    checkout_root = tmp_path / "checkouts"
    service = RegistryService(
        store,
        ingestion=GitIngestionService(checkout_root),
        llm_extractor=extractor,
    )

    # Act: register the repository, analyse it, and load the resulting graph.
    repository = service.register_repository(name="Email Router", url=str(source))
    summary = service.analyze_repository(repository.id)
    graph = service.candidate_graph(repository.id, summary.analysis_run.id)

    # Assert: the fake was invoked, and with a non-empty ``chunks`` argument
    # (index 1 of the recorded (repository, chunks) tuple).
    recorded_calls = extractor.calls
    assert recorded_calls
    assert recorded_calls[0][1]

    # The stored graph carries the names and source path the fake supplied.
    top_ability = graph.abilities[0]
    assert top_ability.name == "Business Email Routing"
    assert top_ability.capabilities[0].name == "Classify Incoming Email"
    assert top_ability.source_refs[0].path == "README.md"
def test_analyze_repository_falls_back_when_optional_llm_extractor_returns_no_candidates(tmp_path):
    """An extractor that returns an empty list leaves the heuristic candidate in place."""
    # Arrange: a directory with a heading-only README and an extractor that
    # always returns an empty list.
    source = tmp_path / "repo"
    source.mkdir()
    readme = source / "README.md"
    readme.write_text("# Fallback\n", encoding="utf-8")
    store = RegistryStore(tmp_path / "registry.sqlite3")
    store.initialize()
    empty_extractor = FakeLLMExtractor([])
    service = RegistryService(
        store,
        ingestion=GitIngestionService(tmp_path / "checkouts"),
        llm_extractor=empty_extractor,
    )

    # Act
    repository = service.register_repository(name="Fallback", url=str(source))
    summary = service.analyze_repository(repository.id)
    graph = service.candidate_graph(repository.id, summary.analysis_run.id)

    # Assert: the first ability carries the name expected from the
    # non-LLM generator.
    first_ability = graph.abilities[0]
    assert first_ability.name == "Review Fallback Repository Usefulness"
def test_approve_candidate_graph_publishes_ability_map_once(tmp_path):
source = tmp_path / "repo"
source.mkdir()