generated from coulomb/repo-seed
llm_extraction boundary
This commit is contained in:
15
README.md
15
README.md
@@ -105,6 +105,21 @@ Candidate graphs are meant to be corrected before publication. The API supports:
|
||||
|
||||
Examples are available in the generated OpenAPI docs at `/docs`.
|
||||
|
||||
## Optional LLM Extraction
|
||||
|
||||
The `llm_extraction` module is designed to work with the sibling `llm-connect`
|
||||
project without making it a hard dependency. To enable provider-backed
|
||||
extraction locally:
|
||||
|
||||
```bash
|
||||
python -m pip install -e ../llm-connect
|
||||
```
|
||||
|
||||
The integration accepts any `llm-connect` style adapter with
|
||||
`execute_prompt(prompt, config)` and parses strict JSON candidate drafts from
|
||||
model responses. Tests use a fake adapter, so the default test suite does not
|
||||
call external providers.
|
||||
|
||||
## Agent-Facing Endpoints
|
||||
|
||||
The v0.1 API covers the main registration, analysis, review, search, and inspection loop:
|
||||
|
||||
19
src/repo_registry/llm_extraction/__init__.py
Normal file
19
src/repo_registry/llm_extraction/__init__.py
Normal file
@@ -0,0 +1,19 @@
|
||||
from repo_registry.llm_extraction.extractor import (
|
||||
ExtractedAbility,
|
||||
ExtractedCapability,
|
||||
ExtractedEvidence,
|
||||
ExtractedFeature,
|
||||
LLMCandidateExtractor,
|
||||
LLMExtractionError,
|
||||
create_llm_connect_adapter,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"ExtractedAbility",
|
||||
"ExtractedCapability",
|
||||
"ExtractedEvidence",
|
||||
"ExtractedFeature",
|
||||
"LLMCandidateExtractor",
|
||||
"LLMExtractionError",
|
||||
"create_llm_connect_adapter",
|
||||
]
|
||||
214
src/repo_registry/llm_extraction/extractor.py
Normal file
214
src/repo_registry/llm_extraction/extractor.py
Normal file
@@ -0,0 +1,214 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Protocol
|
||||
|
||||
from repo_registry.core.models import ContentChunk, Repository
|
||||
|
||||
|
||||
class LLMExtractionError(ValueError):
    """Error raised by the LLM extraction module.

    Subclasses ``ValueError`` so callers that already handle ``ValueError``
    keep working.
    """
|
||||
|
||||
|
||||
class LLMResponseLike(Protocol):
    """Structural type for adapter responses: anything with a ``content`` string."""

    # Raw text returned by the model.
    content: str
|
||||
|
||||
|
||||
class LLMAdapterLike(Protocol):
    """Structural type for adapters exposing ``execute_prompt(prompt, config)``."""

    def execute_prompt(self, prompt: str, config: Any) -> LLMResponseLike:
        """Send ``prompt`` with ``config`` and return a response carrying ``content``."""
        ...
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ExtractedEvidence:
    """One piece of evidence attached to an extracted capability."""

    # Evidence category, e.g. "documentation" in the prompt template.
    type: str
    # What the evidence points at.
    reference: str
    # Strength label; "medium" when not supplied.
    strength: str = "medium"
    # Source paths backing this evidence.
    source_paths: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ExtractedFeature:
    """A concrete feature listed under an extracted capability."""

    # Feature name.
    name: str
    # Feature category.
    type: str
    # Where the feature lives; empty string when not supplied.
    location: str = ""
    # Source paths backing this feature.
    source_paths: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ExtractedCapability:
    """A capability extracted for an ability, with its features and evidence."""

    # Capability name.
    name: str
    # Free-text description; empty string when not supplied.
    description: str = ""
    # Input labels.
    inputs: list[str] = field(default_factory=list)
    # Output labels.
    outputs: list[str] = field(default_factory=list)
    # Features listed under this capability.
    features: list[ExtractedFeature] = field(default_factory=list)
    # Evidence listed under this capability.
    evidence: list[ExtractedEvidence] = field(default_factory=list)
    # Source paths backing this capability.
    source_paths: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class ExtractedAbility:
    """A top-level ability extracted from a model response."""

    # Ability name.
    name: str
    # Free-text description; empty string when not supplied.
    description: str = ""
    # Capabilities grouped under this ability.
    capabilities: list[ExtractedCapability] = field(default_factory=list)
    # Source paths backing this ability.
    source_paths: list[str] = field(default_factory=list)
|
||||
|
||||
|
||||
class LLMCandidateExtractor:
    """Structured candidate extraction over llm-connect-style adapters.

    The extractor renders a prompt from a repository and its content chunks,
    sends it through ``adapter.execute_prompt`` and converts the JSON reply
    into ``ExtractedAbility`` objects. Every structural problem in the reply
    is reported as ``LLMExtractionError``.
    """

    # Only the first MAX_PROMPT_CHUNKS chunks are embedded in a prompt.
    MAX_PROMPT_CHUNKS = 12

    def __init__(self, adapter: LLMAdapterLike, run_config: Any | None = None) -> None:
        """Store the adapter and the run configuration.

        Args:
            adapter: Object exposing ``execute_prompt(prompt, config)``.
            run_config: Configuration forwarded to the adapter. When ``None``
                the result of ``_default_run_config()`` is used.
        """
        self.adapter = adapter
        # Compare against None explicitly so a caller-supplied config that
        # happens to be falsy is not silently replaced by the default.
        self.run_config = (
            run_config if run_config is not None else self._default_run_config()
        )

    def extract(
        self,
        repository: Repository,
        chunks: list[ContentChunk],
    ) -> list[ExtractedAbility]:
        """Build the prompt, run it through the adapter and parse the reply.

        Raises:
            LLMExtractionError: If the reply cannot be parsed into abilities.
        """
        prompt = self.build_prompt(repository, chunks)
        response = self.adapter.execute_prompt(prompt, self.run_config)
        return self.parse_response(response.content)

    def build_prompt(self, repository: Repository, chunks: list[ContentChunk]) -> str:
        """Render the extraction prompt.

        The prompt holds the instructions, the expected JSON shape, the
        repository name and description, and at most ``MAX_PROMPT_CHUNKS``
        chunks, each prefixed with its path, line range and kind.
        """
        chunk_text = "\n\n".join(
            (
                f"Source: {chunk.path}:{chunk.start_line}-{chunk.end_line} "
                f"({chunk.kind})\n{chunk.text}"
            )
            for chunk in chunks[: self.MAX_PROMPT_CHUNKS]
        )
        return (
            "Extract a conservative, source-linked repository ability map.\n"
            "Return strict JSON only with this shape:\n"
            "{\n"
            ' "abilities": [\n'
            " {\n"
            ' "name": "...",\n'
            ' "description": "...",\n'
            ' "source_paths": ["README.md"],\n'
            ' "capabilities": [\n'
            " {\n"
            ' "name": "...",\n'
            ' "description": "...",\n'
            ' "inputs": ["..."],\n'
            ' "outputs": ["..."],\n'
            ' "source_paths": ["..."],\n'
            ' "features": [{"name": "...", "type": "...", "location": "...", "source_paths": ["..."]}],\n'
            ' "evidence": [{"type": "documentation", "reference": "...", "strength": "medium", "source_paths": ["..."]}]\n'
            " }\n"
            " ]\n"
            " }\n"
            " ]\n"
            "}\n"
            "Do not invent unsupported claims. If sources are weak, keep names generic.\n\n"
            f"Repository: {repository.name}\n"
            f"Description: {repository.description or ''}\n\n"
            f"{chunk_text}\n"
        )

    def parse_response(self, content: str) -> list[ExtractedAbility]:
        """Parse a model reply into abilities.

        Args:
            content: Raw reply text, optionally wrapped in a Markdown code fence.

        Returns:
            One ``ExtractedAbility`` per JSON object in the ``abilities`` list.
            Entries that are not JSON objects are skipped, matching how
            capabilities, features and evidence are filtered.

        Raises:
            LLMExtractionError: If ``content`` is not a string, is not valid
                JSON, is not a JSON object, lacks an ``abilities`` list, or
                an entry misses a required string field.
        """
        if not isinstance(content, str):
            raise LLMExtractionError("LLM response content must be a string")
        try:
            payload = json.loads(self._json_text(content))
        except json.JSONDecodeError as exc:
            raise LLMExtractionError(f"LLM response was not valid JSON: {exc}") from exc
        # Valid JSON may still be a list, string or number; calling .get on
        # those would raise AttributeError instead of the documented error.
        if not isinstance(payload, dict):
            raise LLMExtractionError("LLM response must be a JSON object")
        abilities = payload.get("abilities")
        if not isinstance(abilities, list):
            raise LLMExtractionError("LLM response must contain an abilities list")
        return [self._ability(item) for item in self._dict_items(abilities)]

    def _ability(self, item: dict[str, Any]) -> ExtractedAbility:
        """Convert one ability object; ``name`` is required."""
        return ExtractedAbility(
            name=self._required_str(item, "name"),
            description=self._optional_str(item, "description"),
            source_paths=self._str_list(item.get("source_paths")),
            capabilities=[
                self._capability(capability)
                for capability in self._dict_items(item.get("capabilities"))
            ],
        )

    def _capability(self, item: dict[str, Any]) -> ExtractedCapability:
        """Convert one capability object; ``name`` is required."""
        return ExtractedCapability(
            name=self._required_str(item, "name"),
            description=self._optional_str(item, "description"),
            inputs=self._str_list(item.get("inputs")),
            outputs=self._str_list(item.get("outputs")),
            source_paths=self._str_list(item.get("source_paths")),
            features=[
                self._feature(feature)
                for feature in self._dict_items(item.get("features"))
            ],
            evidence=[
                self._evidence(evidence)
                for evidence in self._dict_items(item.get("evidence"))
            ],
        )

    def _feature(self, item: dict[str, Any]) -> ExtractedFeature:
        """Convert one feature object; ``name`` and ``type`` are required."""
        return ExtractedFeature(
            name=self._required_str(item, "name"),
            type=self._required_str(item, "type"),
            location=self._optional_str(item, "location"),
            source_paths=self._str_list(item.get("source_paths")),
        )

    def _evidence(self, item: dict[str, Any]) -> ExtractedEvidence:
        """Convert one evidence object; ``type`` and ``reference`` are required."""
        return ExtractedEvidence(
            type=self._required_str(item, "type"),
            reference=self._required_str(item, "reference"),
            # A missing or blank strength falls back to "medium".
            strength=self._optional_str(item, "strength") or "medium",
            source_paths=self._str_list(item.get("source_paths")),
        )

    def _json_text(self, content: str) -> str:
        """Strip surrounding whitespace and one enclosing Markdown code fence."""
        stripped = content.strip()
        if not stripped.startswith("```"):
            return stripped
        lines = stripped.splitlines()
        # Drop the opening fence line (which may carry a language tag).
        if lines and lines[0].startswith("```"):
            lines = lines[1:]
        # Drop the closing fence line when present.
        if lines and lines[-1].startswith("```"):
            lines = lines[:-1]
        return "\n".join(lines).strip()

    def _required_str(self, item: dict[str, Any], key: str) -> str:
        """Return ``item[key]`` stripped, or raise if it is not a non-blank string."""
        value = item.get(key)
        if not isinstance(value, str) or not value.strip():
            raise LLMExtractionError(f"Missing required string field: {key}")
        return value.strip()

    def _optional_str(self, item: dict[str, Any], key: str) -> str:
        """Return ``item[key]`` stripped, or ``""`` when absent or not a string."""
        value = item.get(key, "")
        return value.strip() if isinstance(value, str) else ""

    def _str_list(self, value: Any) -> list[str]:
        """Return the stripped, non-blank strings of ``value``; ``[]`` if not a list."""
        if not isinstance(value, list):
            return []
        return [item.strip() for item in value if isinstance(item, str) and item.strip()]

    def _dict_items(self, value: Any) -> list[dict[str, Any]]:
        """Return the dict entries of ``value``; ``[]`` if ``value`` is not a list.

        Guards against ``null`` or scalar values where the prompt asked for a
        list of objects, which would otherwise fail during iteration.
        """
        if not isinstance(value, list):
            return []
        return [item for item in value if isinstance(item, dict)]

    def _default_run_config(self) -> Any:
        """Return an llm-connect ``RunConfig`` or ``None`` if it is not installed."""
        try:
            from llm_connect import RunConfig
        except ModuleNotFoundError:
            # llm-connect is optional; adapters then receive ``None``.
            return None
        return RunConfig(temperature=0.1, max_tokens=2000)
|
||||
|
||||
|
||||
def create_llm_connect_adapter(
    provider: str,
    model: str | None = None,
    **kwargs: Any,
) -> LLMAdapterLike:
    """Create an adapter through ``llm_connect.create_adapter``.

    Args:
        provider: Provider name forwarded positionally.
        model: Optional model name forwarded as ``model=``.
        **kwargs: Extra keyword arguments forwarded unchanged.

    Raises:
        LLMExtractionError: If the ``llm_connect`` module cannot be found.
    """
    # Imported lazily so llm-connect stays an optional dependency.
    try:
        from llm_connect import create_adapter
    except ModuleNotFoundError as exc:
        message = (
            "llm-connect is not installed. Install the sibling project with "
            "`python -m pip install -e ../llm-connect`."
        )
        raise LLMExtractionError(message) from exc
    else:
        return create_adapter(provider, model=model, **kwargs)
|
||||
124
tests/test_llm_extraction.py
Normal file
124
tests/test_llm_extraction.py
Normal file
@@ -0,0 +1,124 @@
|
||||
import pytest
|
||||
|
||||
from repo_registry.core.models import ContentChunk, Repository
|
||||
from repo_registry.llm_extraction import (
|
||||
LLMCandidateExtractor,
|
||||
LLMExtractionError,
|
||||
create_llm_connect_adapter,
|
||||
)
|
||||
|
||||
|
||||
class Response:
    """Minimal response double exposing only ``content``."""

    def __init__(self, content):
        # The extractor reads nothing but ``content`` from a response.
        self.content = content
|
||||
|
||||
|
||||
class FakeAdapter:
    """Adapter double that returns canned content and records its last call."""

    def __init__(self, content):
        # Placeholder values until ``execute_prompt`` is called.
        self.last_config = object()
        self.last_prompt = ""
        self.content = content

    def execute_prompt(self, prompt, config):
        """Record the call arguments and return the canned content."""
        self.last_prompt, self.last_config = prompt, config
        return Response(self.content)
|
||||
|
||||
|
||||
def repository():
    """Return the ``Repository`` instance shared by the tests in this module."""
    attributes = {
        "id": 1,
        "name": "MailRouter",
        "url": "/tmp/mail-router",
        "description": "Routes inbound email.",
        "branch": "main",
        "status": "analyzed",
    }
    return Repository(**attributes)
|
||||
|
||||
|
||||
def chunk():
    """Return a two-line README ``ContentChunk`` used as prompt input."""
    attributes = {
        "id": 1,
        "repository_id": 1,
        "analysis_run_id": 1,
        "snapshot_id": 1,
        "path": "README.md",
        "kind": "documentation",
        "start_line": 1,
        "end_line": 2,
        "text": "# MailRouter\nRoutes incoming customer email.",
    }
    return ContentChunk(**attributes)
|
||||
|
||||
|
||||
def test_llm_candidate_extractor_parses_structured_response():
    """A full JSON reply is parsed into nested abilities and the prompt cites the chunk."""
    canned_reply = """
{
"abilities": [
{
"name": "Business Email Routing",
"description": "Routes inbound customer email.",
"source_paths": ["README.md"],
"capabilities": [
{
"name": "Classify Incoming Email",
"description": "Classify messages.",
"inputs": ["email body"],
"outputs": ["intent"],
"source_paths": ["README.md"],
"features": [
{
"name": "POST /classify",
"type": "REST endpoint",
"location": "app.py",
"source_paths": ["app.py"]
}
],
"evidence": [
{
"type": "documentation",
"reference": "README.md",
"strength": "medium",
"source_paths": ["README.md"]
}
]
}
]
}
]
}
"""
    adapter = FakeAdapter(canned_reply)

    abilities = LLMCandidateExtractor(adapter).extract(repository(), [chunk()])

    # The prompt carries the instructions and the chunk's source reference.
    sent_prompt = adapter.last_prompt
    assert "Return strict JSON only" in sent_prompt
    assert "README.md:1-2" in sent_prompt

    # The nested structure survives parsing.
    first_ability = abilities[0]
    assert first_ability.name == "Business Email Routing"
    first_capability = first_ability.capabilities[0]
    assert first_capability.features[0].name == "POST /classify"
    assert first_capability.evidence[0].reference == "README.md"
|
||||
|
||||
|
||||
def test_llm_candidate_extractor_accepts_fenced_json():
    """JSON wrapped in a Markdown code fence is still parsed."""
    fenced_reply = '```json\n{"abilities": [{"name": "A", "capabilities": []}]}\n```'
    extractor = LLMCandidateExtractor(FakeAdapter(fenced_reply))

    parsed = extractor.extract(repository(), [])

    assert parsed[0].name == "A"
|
||||
|
||||
|
||||
def test_llm_candidate_extractor_rejects_invalid_json():
    """A reply that is not JSON surfaces as ``LLMExtractionError``."""
    extractor = LLMCandidateExtractor(FakeAdapter("not json"))

    with pytest.raises(LLMExtractionError):
        extractor.extract(repository(), [])
|
||||
|
||||
|
||||
def test_llm_connect_factory_reports_missing_dependency(monkeypatch):
    """The factory raises a helpful error when ``llm_connect`` cannot be imported.

    The import failure is forced rather than assumed, so the test gives the
    same result whether or not llm-connect is installed in the environment.
    """
    import sys

    # A ``None`` entry in ``sys.modules`` makes ``import llm_connect`` raise
    # ModuleNotFoundError; monkeypatch restores the entry after the test.
    monkeypatch.setitem(sys.modules, "llm_connect", None)

    with pytest.raises(LLMExtractionError) as exc:
        create_llm_connect_adapter("mock")

    assert "llm-connect is not installed" in str(exc.value)
|
||||
Reference in New Issue
Block a user