From 7e66c5735091e13831b93bcb8a8e09ad61b0e045 Mon Sep 17 00:00:00 2001 From: tegwick Date: Sun, 26 Apr 2026 03:05:48 +0200 Subject: [PATCH] llm_extraction boundary --- README.md | 15 ++ src/repo_registry/llm_extraction/__init__.py | 19 ++ src/repo_registry/llm_extraction/extractor.py | 214 ++++++++++++++++++ tests/test_llm_extraction.py | 124 ++++++++++ 4 files changed, 372 insertions(+) create mode 100644 src/repo_registry/llm_extraction/__init__.py create mode 100644 src/repo_registry/llm_extraction/extractor.py create mode 100644 tests/test_llm_extraction.py diff --git a/README.md b/README.md index f89482c..a0d06e7 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,21 @@ Candidate graphs are meant to be corrected before publication. The API supports: Examples are available in the generated OpenAPI docs at `/docs`. +## Optional LLM Extraction + +The `llm_extraction` module is designed to work with the sibling `llm-connect` +project without making it a hard dependency. To enable provider-backed +extraction locally: + +```bash +python -m pip install -e ../llm-connect +``` + +The integration accepts any `llm-connect` style adapter with +`execute_prompt(prompt, config)` and parses strict JSON candidate drafts from +model responses. Tests use a fake adapter, so the default test suite does not +call external providers. 
from __future__ import annotations

# Structured ability-map extraction over llm-connect-style adapters.
#
# The module sends a prompt built from repository content chunks to an
# adapter exposing ``execute_prompt(prompt, config)`` and parses the strict
# JSON reply into frozen dataclasses.  ``llm_connect`` is imported lazily so it
# stays an optional dependency.

import json
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any, Protocol

if TYPE_CHECKING:
    # Referenced in annotations only (kept lazy by the __future__ import), so
    # the models module is not required when this module is imported.
    from repo_registry.core.models import ContentChunk, Repository


# Fixed instruction header sent ahead of the repository details; it shows the
# model the exact JSON shape that ``parse_response`` understands.
_PROMPT_PREAMBLE = (
    "Extract a conservative, source-linked repository ability map.\n"
    "Return strict JSON only with this shape:\n"
    "{\n"
    '  "abilities": [\n'
    "    {\n"
    '      "name": "...",\n'
    '      "description": "...",\n'
    '      "source_paths": ["README.md"],\n'
    '      "capabilities": [\n'
    "        {\n"
    '          "name": "...",\n'
    '          "description": "...",\n'
    '          "inputs": ["..."],\n'
    '          "outputs": ["..."],\n'
    '          "source_paths": ["..."],\n'
    '          "features": [{"name": "...", "type": "...", "location": "...", "source_paths": ["..."]}],\n'
    '          "evidence": [{"type": "documentation", "reference": "...", "strength": "medium", "source_paths": ["..."]}]\n'
    "        }\n"
    "      ]\n"
    "    }\n"
    "  ]\n"
    "}\n"
    "Do not invent unsupported claims. If sources are weak, keep names generic.\n\n"
)


class LLMExtractionError(ValueError):
    """Raised when the model output is unusable or llm-connect is missing."""


class LLMResponseLike(Protocol):
    """Minimal response shape: the raw model text lives in ``content``."""

    content: str


class LLMAdapterLike(Protocol):
    """Minimal adapter shape: any object exposing ``execute_prompt``."""

    def execute_prompt(self, prompt: str, config: Any) -> LLMResponseLike:
        """Run ``prompt`` with ``config`` and return the model response."""
        ...


# NOTE: ``frozen=True`` only blocks attribute rebinding.  The list fields below
# stay mutable, and because lists are unhashable, hashing an instance raises
# ``TypeError``.


@dataclass(frozen=True)
class ExtractedEvidence:
    """One piece of evidence backing a capability."""

    type: str
    reference: str
    strength: str = "medium"
    source_paths: list[str] = field(default_factory=list)


@dataclass(frozen=True)
class ExtractedFeature:
    """A concrete feature attached to a capability."""

    name: str
    type: str
    location: str = ""
    source_paths: list[str] = field(default_factory=list)


@dataclass(frozen=True)
class ExtractedCapability:
    """A capability with its inputs, outputs, features and evidence."""

    name: str
    description: str = ""
    inputs: list[str] = field(default_factory=list)
    outputs: list[str] = field(default_factory=list)
    features: list[ExtractedFeature] = field(default_factory=list)
    evidence: list[ExtractedEvidence] = field(default_factory=list)
    source_paths: list[str] = field(default_factory=list)


@dataclass(frozen=True)
class ExtractedAbility:
    """A top-level ability grouping related capabilities."""

    name: str
    description: str = ""
    capabilities: list[ExtractedCapability] = field(default_factory=list)
    source_paths: list[str] = field(default_factory=list)


class LLMCandidateExtractor:
    """Structured candidate extraction over llm-connect-style adapters."""

    def __init__(
        self,
        adapter: LLMAdapterLike,
        run_config: Any | None = None,
        *,
        max_chunks: int = 12,
    ) -> None:
        """Store the adapter and the settings used for every extraction.

        Args:
            adapter: Object exposing ``execute_prompt(prompt, config)``.
            run_config: Passed through to the adapter unchanged.  When it is
                ``None`` a default config is built if ``llm_connect`` is
                importable; otherwise ``None`` is passed to the adapter.
            max_chunks: Upper bound on how many chunks go into the prompt.

        Raises:
            ValueError: If ``max_chunks`` is negative.
        """
        if max_chunks < 0:
            # A negative bound would silently slice from the end instead.
            raise ValueError("max_chunks must be non-negative")
        self.adapter = adapter
        # Compare against None so an explicit but falsy config is honoured.
        self.run_config = (
            run_config if run_config is not None else self._default_run_config()
        )
        self.max_chunks = max_chunks

    def extract(
        self,
        repository: Repository,
        chunks: list[ContentChunk],
    ) -> list[ExtractedAbility]:
        """Prompt the adapter and parse its reply.

        Raises:
            LLMExtractionError: If the reply is not the expected JSON shape.
        """
        prompt = self.build_prompt(repository, chunks)
        response = self.adapter.execute_prompt(prompt, self.run_config)
        return self.parse_response(response.content)

    def build_prompt(self, repository: Repository, chunks: list[ContentChunk]) -> str:
        """Render the prompt from the repository and the first ``max_chunks`` chunks."""
        chunk_text = "\n\n".join(
            (
                f"Source: {chunk.path}:{chunk.start_line}-{chunk.end_line} "
                f"({chunk.kind})\n{chunk.text}"
            )
            for chunk in chunks[: self.max_chunks]
        )
        return (
            f"{_PROMPT_PREAMBLE}"
            f"Repository: {repository.name}\n"
            f"Description: {repository.description or ''}\n\n"
            f"{chunk_text}\n"
        )

    def parse_response(self, content: str) -> list[ExtractedAbility]:
        """Parse model output into abilities.

        Args:
            content: Raw model text, optionally wrapped in a Markdown fence.

        Raises:
            LLMExtractionError: If ``content`` is not text, is not valid JSON,
                is not a JSON object with an ``abilities`` list, or contains
                an entry without its required string fields.
        """
        if not isinstance(content, str):
            raise LLMExtractionError("LLM response content must be text")
        try:
            payload = json.loads(self._json_text(content))
        except json.JSONDecodeError as exc:
            raise LLMExtractionError(f"LLM response was not valid JSON: {exc}") from exc
        # Valid JSON may still be a list, string, number or null.
        if not isinstance(payload, dict):
            raise LLMExtractionError("LLM response must be a JSON object")
        abilities = payload.get("abilities")
        if not isinstance(abilities, list):
            raise LLMExtractionError("LLM response must contain an abilities list")
        return [self._ability(item) for item in abilities]

    def _ability(self, item: dict[str, Any]) -> ExtractedAbility:
        """Build one ability; a non-object entry is rejected."""
        if not isinstance(item, dict):
            raise LLMExtractionError("Each ability must be a JSON object")
        return ExtractedAbility(
            name=self._required_str(item, "name"),
            description=self._optional_str(item, "description"),
            source_paths=self._str_list(item.get("source_paths")),
            capabilities=[
                self._capability(capability)
                for capability in self._dict_items(item.get("capabilities"))
            ],
        )

    def _capability(self, item: dict[str, Any]) -> ExtractedCapability:
        """Build one capability, including its nested features and evidence."""
        return ExtractedCapability(
            name=self._required_str(item, "name"),
            description=self._optional_str(item, "description"),
            inputs=self._str_list(item.get("inputs")),
            outputs=self._str_list(item.get("outputs")),
            source_paths=self._str_list(item.get("source_paths")),
            features=[
                self._feature(feature)
                for feature in self._dict_items(item.get("features"))
            ],
            evidence=[
                self._evidence(evidence)
                for evidence in self._dict_items(item.get("evidence"))
            ],
        )

    def _feature(self, item: dict[str, Any]) -> ExtractedFeature:
        """Build one feature; ``name`` and ``type`` are required."""
        return ExtractedFeature(
            name=self._required_str(item, "name"),
            type=self._required_str(item, "type"),
            location=self._optional_str(item, "location"),
            source_paths=self._str_list(item.get("source_paths")),
        )

    def _evidence(self, item: dict[str, Any]) -> ExtractedEvidence:
        """Build one evidence entry; a blank ``strength`` becomes "medium"."""
        return ExtractedEvidence(
            type=self._required_str(item, "type"),
            reference=self._required_str(item, "reference"),
            strength=self._optional_str(item, "strength") or "medium",
            source_paths=self._str_list(item.get("source_paths")),
        )

    def _json_text(self, content: str) -> str:
        """Strip surrounding whitespace and one enclosing Markdown code fence."""
        stripped = content.strip()
        if stripped.startswith("```"):
            lines = stripped.splitlines()
            if lines and lines[0].startswith("```"):
                lines = lines[1:]
            if lines and lines[-1].startswith("```"):
                lines = lines[:-1]
            return "\n".join(lines).strip()
        return stripped

    def _required_str(self, item: dict[str, Any], key: str) -> str:
        """Return the stripped string at ``key`` or raise if absent or blank."""
        value = item.get(key)
        if not isinstance(value, str) or not value.strip():
            raise LLMExtractionError(f"Missing required string field: {key}")
        return value.strip()

    def _optional_str(self, item: dict[str, Any], key: str) -> str:
        """Return the stripped string at ``key``, or "" when it is not a string."""
        value = item.get(key, "")
        return value.strip() if isinstance(value, str) else ""

    def _str_list(self, value: Any) -> list[str]:
        """Keep only the non-blank strings of a list; anything else gives []."""
        if not isinstance(value, list):
            return []
        return [item.strip() for item in value if isinstance(item, str) and item.strip()]

    def _dict_items(self, value: Any) -> list[dict[str, Any]]:
        """Keep only the object entries of a list; anything else gives [].

        Guards the nested collections so ``null`` or a scalar in place of a
        list is treated as empty instead of failing during iteration.
        """
        if not isinstance(value, list):
            return []
        return [entry for entry in value if isinstance(entry, dict)]

    def _default_run_config(self) -> Any:
        """Build the default run config, or ``None`` without ``llm_connect``."""
        try:
            from llm_connect import RunConfig
        except ModuleNotFoundError:
            return None
        return RunConfig(temperature=0.1, max_tokens=2000)


def create_llm_connect_adapter(
    provider: str,
    model: str | None = None,
    **kwargs: Any,
) -> LLMAdapterLike:
    """Create an adapter through ``llm_connect.create_adapter``.

    Args:
        provider: Provider name forwarded to ``create_adapter``.
        model: Optional model name forwarded as ``model=``.
        **kwargs: Extra keyword arguments forwarded unchanged.

    Raises:
        LLMExtractionError: If ``llm_connect`` cannot be imported.
    """
    try:
        from llm_connect import create_adapter
    except ModuleNotFoundError as exc:
        raise LLMExtractionError(
            "llm-connect is not installed. Install the sibling project with "
            "`python -m pip install -e ../llm-connect`."
        ) from exc
    return create_adapter(provider, model=model, **kwargs)
"""Tests for the llm_extraction boundary; no external provider is called."""

import sys

import pytest

from repo_registry.core.models import ContentChunk, Repository
from repo_registry.llm_extraction import (
    LLMCandidateExtractor,
    LLMExtractionError,
    create_llm_connect_adapter,
)


class Response:
    """Stand-in response object exposing only ``content``."""

    def __init__(self, content):
        self.content = content


class FakeAdapter:
    """Adapter double that records its last call and returns canned content."""

    def __init__(self, content):
        self.content = content
        self.last_prompt = ""
        self.last_config = object()

    def execute_prompt(self, prompt, config):
        self.last_prompt = prompt
        self.last_config = config
        return Response(self.content)


def repository():
    """Return a sample repository record."""
    return Repository(
        id=1,
        name="MailRouter",
        url="/tmp/mail-router",
        description="Routes inbound email.",
        branch="main",
        status="analyzed",
    )


def chunk():
    """Return a sample two-line README chunk."""
    return ContentChunk(
        id=1,
        repository_id=1,
        analysis_run_id=1,
        snapshot_id=1,
        path="README.md",
        kind="documentation",
        start_line=1,
        end_line=2,
        text="# MailRouter\nRoutes incoming customer email.",
    )


def test_llm_candidate_extractor_parses_structured_response():
    adapter = FakeAdapter(
        """
        {
          "abilities": [
            {
              "name": "Business Email Routing",
              "description": "Routes inbound customer email.",
              "source_paths": ["README.md"],
              "capabilities": [
                {
                  "name": "Classify Incoming Email",
                  "description": "Classify messages.",
                  "inputs": ["email body"],
                  "outputs": ["intent"],
                  "source_paths": ["README.md"],
                  "features": [
                    {
                      "name": "POST /classify",
                      "type": "REST endpoint",
                      "location": "app.py",
                      "source_paths": ["app.py"]
                    }
                  ],
                  "evidence": [
                    {
                      "type": "documentation",
                      "reference": "README.md",
                      "strength": "medium",
                      "source_paths": ["README.md"]
                    }
                  ]
                }
              ]
            }
          ]
        }
        """
    )
    extractor = LLMCandidateExtractor(adapter)

    abilities = extractor.extract(repository(), [chunk()])

    assert "Return strict JSON only" in adapter.last_prompt
    assert "README.md:1-2" in adapter.last_prompt
    assert abilities[0].name == "Business Email Routing"
    assert abilities[0].capabilities[0].features[0].name == "POST /classify"
    assert abilities[0].capabilities[0].evidence[0].reference == "README.md"


def test_llm_candidate_extractor_accepts_fenced_json():
    adapter = FakeAdapter(
        '```json\n{"abilities": [{"name": "A", "capabilities": []}]}\n```'
    )

    abilities = LLMCandidateExtractor(adapter).extract(repository(), [])

    assert abilities[0].name == "A"


def test_llm_candidate_extractor_rejects_invalid_json():
    adapter = FakeAdapter("not json")

    with pytest.raises(LLMExtractionError):
        LLMCandidateExtractor(adapter).extract(repository(), [])


def test_llm_connect_factory_reports_missing_dependency(monkeypatch):
    # A None entry in sys.modules makes `import llm_connect` fail with
    # ModuleNotFoundError even when the package is installed, so the test no
    # longer depends on the local environment.  monkeypatch restores the
    # entry afterwards.
    monkeypatch.setitem(sys.modules, "llm_connect", None)

    with pytest.raises(LLMExtractionError) as exc:
        create_llm_connect_adapter("mock")

    assert "llm-connect is not installed" in str(exc.value)