llm_extraction boundary

2026-04-26 03:05:48 +02:00
parent c6d1ee55e6
commit 7e66c57350
4 changed files with 372 additions and 0 deletions
--- a/README.md
+++ b/README.md
@@ -105,6 +105,21 @@ Candidate graphs are meant to be corrected before publication. The API supports:

 Examples are available in the generated OpenAPI docs at `/docs`.

+## Optional LLM Extraction
+
+The `llm_extraction` module is designed to work with the sibling `llm-connect`
+project without making it a hard dependency. To enable provider-backed
+extraction locally:
+
+```bash
+python -m pip install -e ../llm-connect
+```
+
+The integration accepts any `llm-connect` style adapter with
+`execute_prompt(prompt, config)` and parses strict JSON candidate drafts from
+model responses. Tests use a fake adapter, so the default test suite does not
+call external providers.
+
 ## Agent-Facing Endpoints

 The v0.1 API covers the main registration, analysis, review, search, and inspection loop:
--- a/src/repo_registry/llm_extraction/__init__.py
+++ b/src/repo_registry/llm_extraction/__init__.py
@@ -0,0 +1,19 @@
+from repo_registry.llm_extraction.extractor import (
+    ExtractedAbility,
+    ExtractedCapability,
+    ExtractedEvidence,
+    ExtractedFeature,
+    LLMCandidateExtractor,
+    LLMExtractionError,
+    create_llm_connect_adapter,
+)
+
+# Public API of the package: every name listed here is re-exported from
+# repo_registry.llm_extraction.extractor by the import statement above.
+__all__ = [
+    "ExtractedAbility",
+    "ExtractedCapability",
+    "ExtractedEvidence",
+    "ExtractedFeature",
+    "LLMCandidateExtractor",
+    "LLMExtractionError",
+    "create_llm_connect_adapter",
+]
--- a/src/repo_registry/llm_extraction/extractor.py
+++ b/src/repo_registry/llm_extraction/extractor.py
@@ -0,0 +1,214 @@
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from typing import Any, Protocol
+
+from repo_registry.core.models import ContentChunk, Repository
+
+
+class LLMExtractionError(ValueError):
+    pass
+
+
+class LLMResponseLike(Protocol):
+    """Structural type for adapter responses; only ``content`` is read here."""
+
+    # Raw model output text that the extractor hands to ``parse_response``.
+    content: str
+
+
+class LLMAdapterLike(Protocol):
+    def execute_prompt(self, prompt: str, config: Any) -> LLMResponseLike:
+        pass
+
+
+@dataclass(frozen=True)
+class ExtractedEvidence:
+    """Evidence entry attached to a capability, as parsed from an LLM reply.
+
+    NOTE: ``frozen=True`` only blocks attribute reassignment. The list field
+    stays mutable, and hashing an instance raises ``TypeError`` because the
+    generated ``__hash__`` includes the (unhashable) list.
+    """
+
+    # Evidence category; the prompt's example value is "documentation".
+    type: str
+    # What the evidence points at, as reported by the model.
+    reference: str
+    # Strength label; "medium" is used when the reply omits it.
+    strength: str = "medium"
+    # Source paths reported by the model for this evidence entry.
+    source_paths: list[str] = field(default_factory=list)
+
+
+@dataclass(frozen=True)
+class ExtractedFeature:
+    """Feature entry attached to a capability, as parsed from an LLM reply.
+
+    NOTE: ``frozen=True`` only blocks attribute reassignment. The list field
+    stays mutable, and hashing an instance raises ``TypeError`` because the
+    generated ``__hash__`` includes the (unhashable) list.
+    """
+
+    # Feature name as reported by the model (required in the reply).
+    name: str
+    # Free-form feature kind as reported by the model (required in the reply).
+    type: str
+    # Where the feature lives; empty string when the reply omits it.
+    location: str = ""
+    # Source paths reported by the model for this feature.
+    source_paths: list[str] = field(default_factory=list)
+
+
+@dataclass(frozen=True)
+class ExtractedCapability:
+    """Capability entry nested under an ability, as parsed from an LLM reply.
+
+    NOTE: ``frozen=True`` only blocks attribute reassignment. The list fields
+    stay mutable, and hashing an instance raises ``TypeError`` because the
+    generated ``__hash__`` includes the (unhashable) lists.
+    """
+
+    # Capability name as reported by the model (required in the reply).
+    name: str
+    # Optional free-text description; empty string when absent.
+    description: str = ""
+    # Input labels reported by the model.
+    inputs: list[str] = field(default_factory=list)
+    # Output labels reported by the model.
+    outputs: list[str] = field(default_factory=list)
+    # Features listed under this capability.
+    features: list[ExtractedFeature] = field(default_factory=list)
+    # Evidence entries listed under this capability.
+    evidence: list[ExtractedEvidence] = field(default_factory=list)
+    # Source paths reported by the model for this capability.
+    source_paths: list[str] = field(default_factory=list)
+
+
+@dataclass(frozen=True)
+class ExtractedAbility:
+    """Top-level ability entry, as parsed from the ``abilities`` list of a reply.
+
+    NOTE: ``frozen=True`` only blocks attribute reassignment. The list fields
+    stay mutable, and hashing an instance raises ``TypeError`` because the
+    generated ``__hash__`` includes the (unhashable) lists.
+    """
+
+    # Ability name as reported by the model (required in the reply).
+    name: str
+    # Optional free-text description; empty string when absent.
+    description: str = ""
+    # Capabilities listed under this ability.
+    capabilities: list[ExtractedCapability] = field(default_factory=list)
+    # Source paths reported by the model for this ability.
+    source_paths: list[str] = field(default_factory=list)
+
+
+class LLMCandidateExtractor:
+    """Structured candidate extraction over llm-connect-style adapters."""
+
+    def __init__(self, adapter: LLMAdapterLike, run_config: Any | None = None) -> None:
+        self.adapter = adapter
+        self.run_config = run_config or self._default_run_config()
+
+    def extract(
+        self,
+        repository: Repository,
+        chunks: list[ContentChunk],
+    ) -> list[ExtractedAbility]:
+        prompt = self.build_prompt(repository, chunks)
+        response = self.adapter.execute_prompt(prompt, self.run_config)
+        return self.parse_response(response.content)
+
+    def build_prompt(self, repository: Repository, chunks: list[ContentChunk]) -> str:
+        chunk_text = "\n\n".join(
+            (
+                f"Source: {chunk.path}:{chunk.start_line}-{chunk.end_line} "
+                f"({chunk.kind})\n{chunk.text}"
+            )
+            for chunk in chunks[:12]
+        )
+        return (
+            "Extract a conservative, source-linked repository ability map.\n"
+            "Return strict JSON only with this shape:\n"
+            "{\n"
+            '  "abilities": [\n'
+            "    {\n"
+            '      "name": "...",\n'
+            '      "description": "...",\n'
+            '      "source_paths": ["README.md"],\n'
+            '      "capabilities": [\n'
+            "        {\n"
+            '          "name": "...",\n'
+            '          "description": "...",\n'
+            '          "inputs": ["..."],\n'
+            '          "outputs": ["..."],\n'
+            '          "source_paths": ["..."],\n'
+            '          "features": [{"name": "...", "type": "...", "location": "...", "source_paths": ["..."]}],\n'
+            '          "evidence": [{"type": "documentation", "reference": "...", "strength": "medium", "source_paths": ["..."]}]\n'
+            "        }\n"
+            "      ]\n"
+            "    }\n"
+            "  ]\n"
+            "}\n"
+            "Do not invent unsupported claims. If sources are weak, keep names generic.\n\n"
+            f"Repository: {repository.name}\n"
+            f"Description: {repository.description or ''}\n\n"
+            f"{chunk_text}\n"
+        )
+
+    def parse_response(self, content: str) -> list[ExtractedAbility]:
+        try:
+            payload = json.loads(self._json_text(content))
+        except json.JSONDecodeError as exc:
+            raise LLMExtractionError(f"LLM response was not valid JSON: {exc}") from exc
+        abilities = payload.get("abilities")
+        if not isinstance(abilities, list):
+            raise LLMExtractionError("LLM response must contain an abilities list")
+        return [self._ability(item) for item in abilities]
+
+    def _ability(self, item: dict[str, Any]) -> ExtractedAbility:
+        return ExtractedAbility(
+            name=self._required_str(item, "name"),
+            description=self._optional_str(item, "description"),
+            source_paths=self._str_list(item.get("source_paths")),
+            capabilities=[
+                self._capability(capability)
+                for capability in item.get("capabilities", [])
+                if isinstance(capability, dict)
+            ],
+        )
+
+    def _capability(self, item: dict[str, Any]) -> ExtractedCapability:
+        return ExtractedCapability(
+            name=self._required_str(item, "name"),
+            description=self._optional_str(item, "description"),
+            inputs=self._str_list(item.get("inputs")),
+            outputs=self._str_list(item.get("outputs")),
+            source_paths=self._str_list(item.get("source_paths")),
+            features=[
+                self._feature(feature)
+                for feature in item.get("features", [])
+                if isinstance(feature, dict)
+            ],
+            evidence=[
+                self._evidence(evidence)
+                for evidence in item.get("evidence", [])
+                if isinstance(evidence, dict)
+            ],
+        )
+
+    def _feature(self, item: dict[str, Any]) -> ExtractedFeature:
+        return ExtractedFeature(
+            name=self._required_str(item, "name"),
+            type=self._required_str(item, "type"),
+            location=self._optional_str(item, "location"),
+            source_paths=self._str_list(item.get("source_paths")),
+        )
+
+    def _evidence(self, item: dict[str, Any]) -> ExtractedEvidence:
+        return ExtractedEvidence(
+            type=self._required_str(item, "type"),
+            reference=self._required_str(item, "reference"),
+            strength=self._optional_str(item, "strength") or "medium",
+            source_paths=self._str_list(item.get("source_paths")),
+        )
+
+    def _json_text(self, content: str) -> str:
+        stripped = content.strip()
+        if stripped.startswith("```"):
+            lines = stripped.splitlines()
+            if lines and lines[0].startswith("```"):
+                lines = lines[1:]
+            if lines and lines[-1].startswith("```"):
+                lines = lines[:-1]
+            return "\n".join(lines).strip()
+        return stripped
+
+    def _required_str(self, item: dict[str, Any], key: str) -> str:
+        value = item.get(key)
+        if not isinstance(value, str) or not value.strip():
+            raise LLMExtractionError(f"Missing required string field: {key}")
+        return value.strip()
+
+    def _optional_str(self, item: dict[str, Any], key: str) -> str:
+        value = item.get(key, "")
+        return value.strip() if isinstance(value, str) else ""
+
+    def _str_list(self, value: Any) -> list[str]:
+        if not isinstance(value, list):
+            return []
+        return [item.strip() for item in value if isinstance(item, str) and item.strip()]
+
+    def _default_run_config(self) -> Any:
+        try:
+            from llm_connect import RunConfig
+        except ModuleNotFoundError:
+            return None
+        return RunConfig(temperature=0.1, max_tokens=2000)
+
+
+def create_llm_connect_adapter(
+    provider: str,
+    model: str | None = None,
+    **kwargs: Any,
+) -> LLMAdapterLike:
+    try:
+        from llm_connect import create_adapter
+    except ModuleNotFoundError as exc:
+        raise LLMExtractionError(
+            "llm-connect is not installed. Install the sibling project with "
+            "`python -m pip install -e ../llm-connect`."
+        ) from exc
+    return create_adapter(provider, model=model, **kwargs)
--- a/tests/test_llm_extraction.py
+++ b/tests/test_llm_extraction.py
@@ -0,0 +1,124 @@
+import pytest
+
+from repo_registry.core.models import ContentChunk, Repository
+from repo_registry.llm_extraction import (
+    LLMCandidateExtractor,
+    LLMExtractionError,
+    create_llm_connect_adapter,
+)
+
+
+class Response:
+    """Minimal response stand-in that only carries a ``content`` attribute."""
+
+    def __init__(self, content):
+        self.content = content
+
+
+class FakeAdapter:
+    """Test double for an adapter: returns canned content and records its inputs."""
+
+    def __init__(self, content):
+        # Canned text returned by every ``execute_prompt`` call.
+        self.content = content
+        # Overwritten on each call so tests can inspect what the extractor sent.
+        self.last_prompt = ""
+        self.last_config = object()
+
+    def execute_prompt(self, prompt, config):
+        """Record ``prompt`` and ``config``, then return the canned response."""
+        self.last_prompt = prompt
+        self.last_config = config
+        return Response(self.content)
+
+
+def repository():
+    return Repository(
+        id=1,
+        name="MailRouter",
+        url="/tmp/mail-router",
+        description="Routes inbound email.",
+        branch="main",
+        status="analyzed",
+    )
+
+
+def chunk():
+    return ContentChunk(
+        id=1,
+        repository_id=1,
+        analysis_run_id=1,
+        snapshot_id=1,
+        path="README.md",
+        kind="documentation",
+        start_line=1,
+        end_line=2,
+        text="# MailRouter\nRoutes incoming customer email.",
+    )
+
+
+def test_llm_candidate_extractor_parses_structured_response():
+    adapter = FakeAdapter(
+        """
+        {
+          "abilities": [
+            {
+              "name": "Business Email Routing",
+              "description": "Routes inbound customer email.",
+              "source_paths": ["README.md"],
+              "capabilities": [
+                {
+                  "name": "Classify Incoming Email",
+                  "description": "Classify messages.",
+                  "inputs": ["email body"],
+                  "outputs": ["intent"],
+                  "source_paths": ["README.md"],
+                  "features": [
+                    {
+                      "name": "POST /classify",
+                      "type": "REST endpoint",
+                      "location": "app.py",
+                      "source_paths": ["app.py"]
+                    }
+                  ],
+                  "evidence": [
+                    {
+                      "type": "documentation",
+                      "reference": "README.md",
+                      "strength": "medium",
+                      "source_paths": ["README.md"]
+                    }
+                  ]
+                }
+              ]
+            }
+          ]
+        }
+        """
+    )
+    extractor = LLMCandidateExtractor(adapter)
+
+    abilities = extractor.extract(repository(), [chunk()])
+
+    assert "Return strict JSON only" in adapter.last_prompt
+    assert "README.md:1-2" in adapter.last_prompt
+    assert abilities[0].name == "Business Email Routing"
+    assert abilities[0].capabilities[0].features[0].name == "POST /classify"
+    assert abilities[0].capabilities[0].evidence[0].reference == "README.md"
+
+
+def test_llm_candidate_extractor_accepts_fenced_json():
+    adapter = FakeAdapter(
+        '```json\n{"abilities": [{"name": "A", "capabilities": []}]}\n```'
+    )
+
+    abilities = LLMCandidateExtractor(adapter).extract(repository(), [])
+
+    assert abilities[0].name == "A"
+
+
+def test_llm_candidate_extractor_rejects_invalid_json():
+    adapter = FakeAdapter("not json")
+
+    with pytest.raises(LLMExtractionError):
+        LLMCandidateExtractor(adapter).extract(repository(), [])
+
+
+def test_llm_connect_factory_reports_missing_dependency():
+    with pytest.raises(LLMExtractionError) as exc:
+        create_llm_connect_adapter("mock")
+
+    assert "llm-connect is not installed" in str(exc.value)