llm_extraction boundary

2026-04-26 03:05:48 +02:00
parent c6d1ee55e6
commit 7e66c57350
4 changed files with 372 additions and 0 deletions
--- a/src/repo_registry/llm_extraction/__init__.py
+++ b/src/repo_registry/llm_extraction/__init__.py
@@ -0,0 +1,19 @@
+from repo_registry.llm_extraction.extractor import (
+    ExtractedAbility,
+    ExtractedCapability,
+    ExtractedEvidence,
+    ExtractedFeature,
+    LLMCandidateExtractor,
+    LLMExtractionError,
+    create_llm_connect_adapter,
+)
+
+# Explicit export list: exactly the names imported from the extractor module
+# above, kept in the same order.
+__all__ = [
+    "ExtractedAbility",
+    "ExtractedCapability",
+    "ExtractedEvidence",
+    "ExtractedFeature",
+    "LLMCandidateExtractor",
+    "LLMExtractionError",
+    "create_llm_connect_adapter",
+]
--- a/src/repo_registry/llm_extraction/extractor.py
+++ b/src/repo_registry/llm_extraction/extractor.py
@@ -0,0 +1,214 @@
+from __future__ import annotations
+
+import json
+from dataclasses import dataclass, field
+from typing import Any, Protocol
+
+from repo_registry.core.models import ContentChunk, Repository
+
+
+class LLMExtractionError(ValueError):
+    """Error raised by this module when LLM-based extraction cannot proceed.
+
+    Subclasses ``ValueError``, so callers that already catch ``ValueError``
+    also catch this exception.
+    """
+
+
+class LLMResponseLike(Protocol):
+    """Structural type for an adapter reply: any object with a ``content`` string."""
+
+    # Raw text returned by the model.
+    content: str
+
+
+class LLMAdapterLike(Protocol):
+    """Structural type for adapters that can run a single prompt."""
+
+    def execute_prompt(self, prompt: str, config: Any) -> LLMResponseLike:
+        """Run ``prompt`` with ``config`` and return an object exposing ``content``."""
+        ...
+
+
+@dataclass(frozen=True)
+class ExtractedEvidence:
+    """Immutable record of one evidence entry parsed from an LLM reply."""
+
+    # Evidence category string as returned by the model.
+    type: str
+    # What the evidence points at.
+    reference: str
+    # Strength label; "medium" unless stated otherwise.
+    strength: str = "medium"
+    # Source paths cited for this entry; a fresh empty list per instance.
+    source_paths: list[str] = field(default_factory=list)
+
+
+@dataclass(frozen=True)
+class ExtractedFeature:
+    """Immutable record of one feature entry parsed from an LLM reply."""
+
+    # Feature name.
+    name: str
+    # Feature category string as returned by the model.
+    type: str
+    # Where the feature lives; empty string when not given.
+    location: str = ""
+    # Source paths cited for this entry; a fresh empty list per instance.
+    source_paths: list[str] = field(default_factory=list)
+
+
+@dataclass(frozen=True)
+class ExtractedCapability:
+    """Immutable record of one capability entry parsed from an LLM reply."""
+
+    # Capability name.
+    name: str
+    # Free-text description; empty string when not given.
+    description: str = ""
+    # Input and output labels listed for the capability.
+    inputs: list[str] = field(default_factory=list)
+    outputs: list[str] = field(default_factory=list)
+    # Nested feature and evidence records attached to the capability.
+    features: list[ExtractedFeature] = field(default_factory=list)
+    evidence: list[ExtractedEvidence] = field(default_factory=list)
+    # Source paths cited for this entry; a fresh empty list per instance.
+    source_paths: list[str] = field(default_factory=list)
+
+
+@dataclass(frozen=True)
+class ExtractedAbility:
+    """Immutable record of one top-level ability entry parsed from an LLM reply."""
+
+    # Ability name.
+    name: str
+    # Free-text description; empty string when not given.
+    description: str = ""
+    # Capability records grouped under this ability.
+    capabilities: list[ExtractedCapability] = field(default_factory=list)
+    # Source paths cited for this entry; a fresh empty list per instance.
+    source_paths: list[str] = field(default_factory=list)
+
+
+class LLMCandidateExtractor:
+    """Structured candidate extraction over llm-connect-style adapters.
+
+    Builds a prompt from a repository and its content chunks, sends it through
+    the adapter, and converts the JSON reply into ``ExtractedAbility`` objects.
+    Replies that cannot be parsed into that shape raise ``LLMExtractionError``.
+    """
+
+    # Only the first MAX_PROMPT_CHUNKS chunks are embedded in a prompt;
+    # subclasses may override the limit.
+    MAX_PROMPT_CHUNKS = 12
+
+    def __init__(self, adapter: LLMAdapterLike, run_config: Any | None = None) -> None:
+        """Store the adapter and the run configuration passed to every call.
+
+        Args:
+            adapter: Object exposing ``execute_prompt(prompt, config)``.
+            run_config: Configuration forwarded to the adapter. When ``None``,
+                the result of ``_default_run_config()`` is used instead.
+        """
+        self.adapter = adapter
+        # Compare against None (not truthiness) so a deliberately supplied
+        # falsy configuration is not silently replaced by the default.
+        self.run_config = (
+            run_config if run_config is not None else self._default_run_config()
+        )
+
+    def extract(
+        self,
+        repository: Repository,
+        chunks: list[ContentChunk],
+    ) -> list[ExtractedAbility]:
+        """Run one extraction round-trip and return the parsed abilities.
+
+        Raises:
+            LLMExtractionError: If the adapter's reply cannot be parsed.
+                Exceptions raised by the adapter itself propagate unchanged.
+        """
+        prompt = self.build_prompt(repository, chunks)
+        response = self.adapter.execute_prompt(prompt, self.run_config)
+        return self.parse_response(response.content)
+
+    def build_prompt(self, repository: Repository, chunks: list[ContentChunk]) -> str:
+        """Return the prompt: JSON shape instructions, repository header, chunks.
+
+        Only the first ``MAX_PROMPT_CHUNKS`` chunks are included; each one is
+        preceded by a ``Source: path:start-end (kind)`` line.
+        """
+        chunk_text = "\n\n".join(
+            (
+                f"Source: {chunk.path}:{chunk.start_line}-{chunk.end_line} "
+                f"({chunk.kind})\n{chunk.text}"
+            )
+            for chunk in chunks[: self.MAX_PROMPT_CHUNKS]
+        )
+        return (
+            "Extract a conservative, source-linked repository ability map.\n"
+            "Return strict JSON only with this shape:\n"
+            "{\n"
+            '  "abilities": [\n'
+            "    {\n"
+            '      "name": "...",\n'
+            '      "description": "...",\n'
+            '      "source_paths": ["README.md"],\n'
+            '      "capabilities": [\n'
+            "        {\n"
+            '          "name": "...",\n'
+            '          "description": "...",\n'
+            '          "inputs": ["..."],\n'
+            '          "outputs": ["..."],\n'
+            '          "source_paths": ["..."],\n'
+            '          "features": [{"name": "...", "type": "...", "location": "...", "source_paths": ["..."]}],\n'
+            '          "evidence": [{"type": "documentation", "reference": "...", "strength": "medium", "source_paths": ["..."]}]\n'
+            "        }\n"
+            "      ]\n"
+            "    }\n"
+            "  ]\n"
+            "}\n"
+            "Do not invent unsupported claims. If sources are weak, keep names generic.\n\n"
+            f"Repository: {repository.name}\n"
+            f"Description: {repository.description or ''}\n\n"
+            f"{chunk_text}\n"
+        )
+
+    def parse_response(self, content: str) -> list[ExtractedAbility]:
+        """Parse the model's reply text into ``ExtractedAbility`` objects.
+
+        The reply may be wrapped in a Markdown code fence. Entries of the
+        ``abilities`` list that are not JSON objects are skipped, matching how
+        nested capability, feature and evidence lists are handled.
+
+        Raises:
+            LLMExtractionError: If ``content`` is not a string, is not valid
+                JSON, is not a JSON object holding an ``abilities`` list, or
+                if an entry lacks a required string field.
+        """
+        if not isinstance(content, str):
+            raise LLMExtractionError("LLM response content must be a string")
+        try:
+            payload = json.loads(self._json_text(content))
+        except json.JSONDecodeError as exc:
+            raise LLMExtractionError(f"LLM response was not valid JSON: {exc}") from exc
+        # json.loads may return a list, string, number, bool or None; only a
+        # JSON object can carry the "abilities" key.
+        if not isinstance(payload, dict):
+            raise LLMExtractionError(
+                "LLM response must be a JSON object with an abilities list"
+            )
+        abilities = payload.get("abilities")
+        if not isinstance(abilities, list):
+            raise LLMExtractionError("LLM response must contain an abilities list")
+        return [self._ability(item) for item in self._dict_list(abilities)]
+
+    def _ability(self, item: dict[str, Any]) -> ExtractedAbility:
+        """Build an ``ExtractedAbility`` from one JSON object."""
+        return ExtractedAbility(
+            name=self._required_str(item, "name"),
+            description=self._optional_str(item, "description"),
+            source_paths=self._str_list(item.get("source_paths")),
+            capabilities=[
+                self._capability(capability)
+                for capability in self._dict_list(item.get("capabilities"))
+            ],
+        )
+
+    def _capability(self, item: dict[str, Any]) -> ExtractedCapability:
+        """Build an ``ExtractedCapability`` from one JSON object."""
+        return ExtractedCapability(
+            name=self._required_str(item, "name"),
+            description=self._optional_str(item, "description"),
+            inputs=self._str_list(item.get("inputs")),
+            outputs=self._str_list(item.get("outputs")),
+            source_paths=self._str_list(item.get("source_paths")),
+            features=[
+                self._feature(feature)
+                for feature in self._dict_list(item.get("features"))
+            ],
+            evidence=[
+                self._evidence(evidence)
+                for evidence in self._dict_list(item.get("evidence"))
+            ],
+        )
+
+    def _feature(self, item: dict[str, Any]) -> ExtractedFeature:
+        """Build an ``ExtractedFeature``; ``name`` and ``type`` are required."""
+        return ExtractedFeature(
+            name=self._required_str(item, "name"),
+            type=self._required_str(item, "type"),
+            location=self._optional_str(item, "location"),
+            source_paths=self._str_list(item.get("source_paths")),
+        )
+
+    def _evidence(self, item: dict[str, Any]) -> ExtractedEvidence:
+        """Build an ``ExtractedEvidence``; ``type`` and ``reference`` are required."""
+        return ExtractedEvidence(
+            type=self._required_str(item, "type"),
+            reference=self._required_str(item, "reference"),
+            # A missing, blank or non-string strength falls back to "medium".
+            strength=self._optional_str(item, "strength") or "medium",
+            source_paths=self._str_list(item.get("source_paths")),
+        )
+
+    def _json_text(self, content: str) -> str:
+        """Return ``content`` stripped of whitespace and a surrounding code fence."""
+        stripped = content.strip()
+        if not stripped.startswith("```"):
+            return stripped
+        # The first line is the opening fence (possibly "```json"); drop it.
+        lines = stripped.splitlines()[1:]
+        # Drop the closing fence when present, tolerating leading indentation.
+        if lines and lines[-1].lstrip().startswith("```"):
+            lines = lines[:-1]
+        return "\n".join(lines).strip()
+
+    def _required_str(self, item: dict[str, Any], key: str) -> str:
+        """Return ``item[key]`` stripped, or raise if missing, blank or non-string."""
+        value = item.get(key)
+        if not isinstance(value, str) or not value.strip():
+            raise LLMExtractionError(f"Missing required string field: {key}")
+        return value.strip()
+
+    def _optional_str(self, item: dict[str, Any], key: str) -> str:
+        """Return ``item[key]`` stripped, or ``""`` when missing or non-string."""
+        value = item.get(key, "")
+        return value.strip() if isinstance(value, str) else ""
+
+    def _str_list(self, value: Any) -> list[str]:
+        """Return the non-blank strings of ``value``, stripped; ``[]`` if not a list."""
+        if not isinstance(value, list):
+            return []
+        return [item.strip() for item in value if isinstance(item, str) and item.strip()]
+
+    def _dict_list(self, value: Any) -> list[dict[str, Any]]:
+        """Return the dict entries of ``value``; ``[]`` if it is not a list.
+
+        Guards against the model returning ``null`` or a scalar where a list
+        of objects is expected, which would otherwise fail during iteration.
+        """
+        if not isinstance(value, list):
+            return []
+        return [entry for entry in value if isinstance(entry, dict)]
+
+    def _default_run_config(self) -> Any:
+        """Return a low-temperature ``llm_connect.RunConfig``.
+
+        Returns ``None`` when ``llm_connect`` cannot be imported.
+        """
+        try:
+            from llm_connect import RunConfig
+        except ModuleNotFoundError:
+            return None
+        return RunConfig(temperature=0.1, max_tokens=2000)
+
+
+def create_llm_connect_adapter(
+    provider: str,
+    model: str | None = None,
+    **kwargs: Any,
+) -> LLMAdapterLike:
+    """Create an adapter by delegating to ``llm_connect.create_adapter``.
+
+    Args:
+        provider: Provider identifier forwarded as the first positional argument.
+        model: Optional model name forwarded as the ``model`` keyword.
+        **kwargs: Extra keyword arguments forwarded unchanged.
+
+    Raises:
+        LLMExtractionError: If the ``llm_connect`` module cannot be found.
+    """
+    try:
+        # Imported inside the function, so a missing llm-connect is reported
+        # here instead of at module import time.
+        from llm_connect import create_adapter as make_adapter
+    except ModuleNotFoundError as missing:
+        install_hint = (
+            "llm-connect is not installed. Install the sibling project with "
+            "`python -m pip install -e ../llm-connect`."
+        )
+        raise LLMExtractionError(install_hint) from missing
+    else:
+        return make_adapter(provider, model=model, **kwargs)