generated from coulomb/repo-seed
llm_extraction integration by adding the bridge into candidate graph drafts
This commit is contained in:
@@ -117,8 +117,10 @@ python -m pip install -e ../llm-connect
|
||||
|
||||
The integration accepts any `llm-connect`-style adapter with
|
||||
`execute_prompt(prompt, config)` and parses strict JSON candidate drafts from
|
||||
model responses. Tests use a fake adapter, so the default test suite does not
|
||||
call external providers.
|
||||
model responses. Parsed drafts can be mapped into reviewable candidate graph
|
||||
entries while preserving source paths where they match observed facts or
|
||||
content chunks. Tests use fake adapters, so the default test suite does not call
|
||||
external providers.
|
||||
|
||||
## Agent-Facing Endpoints
|
||||
|
||||
|
||||
@@ -7,6 +7,7 @@ from repo_registry.llm_extraction.extractor import (
|
||||
LLMExtractionError,
|
||||
create_llm_connect_adapter,
|
||||
)
|
||||
from repo_registry.llm_extraction.mapper import LLMExtractionMapper
|
||||
|
||||
__all__ = [
|
||||
"ExtractedAbility",
|
||||
@@ -15,5 +16,6 @@ __all__ = [
|
||||
"ExtractedFeature",
|
||||
"LLMCandidateExtractor",
|
||||
"LLMExtractionError",
|
||||
"LLMExtractionMapper",
|
||||
"create_llm_connect_adapter",
|
||||
]
|
||||
|
||||
145
src/repo_registry/llm_extraction/mapper.py
Normal file
145
src/repo_registry/llm_extraction/mapper.py
Normal file
@@ -0,0 +1,145 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from repo_registry.candidate_graph.generator import (
|
||||
CandidateAbilityDraft,
|
||||
CandidateCapabilityDraft,
|
||||
CandidateEvidenceDraft,
|
||||
CandidateFeatureDraft,
|
||||
)
|
||||
from repo_registry.core.models import ContentChunk, ObservedFact, SourceReference
|
||||
from repo_registry.llm_extraction.extractor import ExtractedAbility
|
||||
|
||||
|
||||
class LLMExtractionMapper:
    """Map structured LLM extraction drafts into reviewable candidate drafts.

    Every extracted ability, capability, feature and evidence item carries
    ``source_paths``.  Those paths are resolved against the observed facts and
    content chunks supplied by the caller: whatever resolves becomes the
    draft's ``source_refs``, and (for abilities, capabilities and features)
    raises the draft's confidence above its base value.
    """

    # Confidence given to a draft before any of its source paths is verified.
    _ABILITY_BASE = 0.45
    _CAPABILITY_BASE = 0.5
    _FEATURE_BASE = 0.45
    # Added once when at least one source path resolves to a reference.
    _RESOLVED_BONUS = 0.15
    # Added once per corroborating kind group (docs, tests/examples, interface).
    _KIND_BONUS = 0.10
    # LLM-derived drafts never reach full confidence.
    _MAX_CONFIDENCE = 0.95

    def map(
        self,
        abilities: list[ExtractedAbility],
        facts: list[ObservedFact],
        chunks: list[ContentChunk],
    ) -> list[CandidateAbilityDraft]:
        """Convert extracted abilities into candidate ability drafts.

        Args:
            abilities: Abilities parsed from the LLM response.
            facts: Observed facts used to verify source paths.
            chunks: Content chunks used as a fallback when no fact matches.

        Returns:
            One ``CandidateAbilityDraft`` per input ability, in input order,
            each with its nested capability, feature and evidence drafts.
        """
        return [self._ability_draft(ability, facts, chunks) for ability in abilities]

    def _ability_draft(
        self,
        ability: ExtractedAbility,
        facts: list[ObservedFact],
        chunks: list[ContentChunk],
    ) -> CandidateAbilityDraft:
        """Build the draft for one ability and all of its capabilities."""
        # Resolve once; the same references feed both confidence and the draft.
        refs = self._source_refs(ability.source_paths, facts, chunks)
        return CandidateAbilityDraft(
            name=ability.name,
            description=ability.description,
            confidence=self._score(refs, self._ABILITY_BASE),
            source_refs=refs,
            capabilities=[
                self._capability_draft(capability, facts, chunks)
                for capability in ability.capabilities
            ],
        )

    def _capability_draft(
        self,
        capability,
        facts: list[ObservedFact],
        chunks: list[ContentChunk],
    ) -> CandidateCapabilityDraft:
        """Build the draft for one extracted capability, its features and evidence."""
        refs = self._source_refs(capability.source_paths, facts, chunks)
        return CandidateCapabilityDraft(
            name=capability.name,
            description=capability.description,
            inputs=capability.inputs,
            outputs=capability.outputs,
            confidence=self._score(refs, self._CAPABILITY_BASE),
            source_refs=refs,
            features=[
                self._feature_draft(feature, facts, chunks)
                for feature in capability.features
            ],
            evidence=[
                self._evidence_draft(evidence, facts, chunks)
                for evidence in capability.evidence
            ],
        )

    def _feature_draft(
        self,
        feature,
        facts: list[ObservedFact],
        chunks: list[ContentChunk],
    ) -> CandidateFeatureDraft:
        """Build the draft for one extracted feature."""
        # Without explicit source paths the feature's own location is the
        # only hint about where it lives.
        refs = self._source_refs(
            feature.source_paths or [feature.location], facts, chunks
        )
        return CandidateFeatureDraft(
            name=feature.name,
            type=feature.type,
            location=feature.location,
            confidence=self._score(refs, self._FEATURE_BASE),
            source_refs=refs,
        )

    def _evidence_draft(
        self,
        evidence,
        facts: list[ObservedFact],
        chunks: list[ContentChunk],
    ) -> CandidateEvidenceDraft:
        """Build the draft for one extracted evidence item (no confidence field)."""
        # Without explicit source paths the evidence reference doubles as one.
        refs = self._source_refs(
            evidence.source_paths or [evidence.reference], facts, chunks
        )
        return CandidateEvidenceDraft(
            type=evidence.type,
            reference=evidence.reference,
            strength=evidence.strength,
            source_refs=refs,
        )

    def _confidence(
        self,
        source_paths: list[str],
        facts: list[ObservedFact],
        chunks: list[ContentChunk],
        base: float,
    ) -> float:
        """Return the confidence for ``source_paths`` starting from ``base``.

        Resolves the paths and scores the result; see ``_score`` for the
        scoring rules.
        """
        return self._score(self._source_refs(source_paths, facts, chunks), base)

    def _score(self, refs: list[SourceReference], base: float) -> float:
        """Score already-resolved references.

        Returns ``base`` unchanged when nothing resolved.  Otherwise adds the
        resolved bonus plus one kind bonus for each of: a ``documentation``
        reference, a ``test`` or ``example`` reference, an ``interface``
        reference.  The result is rounded to two decimals and capped.
        """
        if not refs:
            return base
        kinds = {ref.kind for ref in refs}
        score = base + self._RESOLVED_BONUS
        if "documentation" in kinds:
            score += self._KIND_BONUS
        if "test" in kinds or "example" in kinds:
            score += self._KIND_BONUS
        if "interface" in kinds:
            score += self._KIND_BONUS
        return min(self._MAX_CONFIDENCE, round(score, 2))

    def _source_refs(
        self,
        source_paths: list[str],
        facts: list[ObservedFact],
        chunks: list[ContentChunk],
    ) -> list[SourceReference]:
        """Resolve source paths into de-duplicated source references.

        Each entry is matched on the text before its first ``:`` so that a
        suffix such as ``README.md:1-3`` still matches ``README.md``.  Every
        fact with that path yields a reference.  Chunks are consulted only
        when no reference for the path exists yet, and then only the first
        chunk with that path is used.  Missing or empty entries are skipped:
        they name no file and must not match facts that have an empty path.

        Returns:
            References in discovery order, unique by
            ``(fact_id, path, kind, line)``.
        """
        refs: list[SourceReference] = []
        seen: set[tuple[int | None, str, str, int | None]] = set()
        for path in source_paths:
            normalized = path.split(":", 1)[0] if path else ""
            if not normalized:
                # None, "" or ":12" -- nothing to look up.
                continue
            for fact in facts:
                if fact.path != normalized:
                    continue
                ref = SourceReference(
                    fact_id=fact.id,
                    path=fact.path,
                    kind=fact.kind,
                    name=fact.name,
                    line=fact.metadata.get("line"),
                )
                key = (ref.fact_id, ref.path, ref.kind, ref.line)
                if key not in seen:
                    seen.add(key)
                    refs.append(ref)
            if any(ref.path == normalized for ref in refs):
                # Already covered for this path; the chunk fallback is not needed.
                continue
            for chunk in chunks:
                if chunk.path != normalized:
                    continue
                ref = SourceReference(
                    fact_id=None,
                    path=chunk.path,
                    kind=chunk.kind,
                    name=chunk.path,
                    line=chunk.start_line,
                )
                key = (ref.fact_id, ref.path, ref.kind, ref.line)
                if key not in seen:
                    seen.add(key)
                    refs.append(ref)
                # Only the first chunk for a path is referenced.
                break
        return refs
|
||||
110
tests/test_llm_extraction_mapper.py
Normal file
110
tests/test_llm_extraction_mapper.py
Normal file
@@ -0,0 +1,110 @@
|
||||
from repo_registry.core.models import ContentChunk, ObservedFact
|
||||
from repo_registry.llm_extraction import (
|
||||
ExtractedAbility,
|
||||
ExtractedCapability,
|
||||
ExtractedEvidence,
|
||||
ExtractedFeature,
|
||||
LLMExtractionMapper,
|
||||
)
|
||||
|
||||
|
||||
def fact(id, kind, name, path, line=None):
    """Build an ``ObservedFact`` fixture.

    ``line`` is recorded under the ``"line"`` metadata key only when it is
    given; otherwise the metadata dict is empty.
    """
    return ObservedFact(
        id=id,
        repository_id=1,
        analysis_run_id=1,
        snapshot_id=1,
        kind=kind,
        path=path,
        name=name,
        value="",
        metadata={} if line is None else {"line": line},
    )
|
||||
|
||||
|
||||
def chunk(id, kind, path, start_line=1):
    """Build a ``ContentChunk`` fixture that ends two lines after ``start_line``."""
    last_line = start_line + 2
    return ContentChunk(
        id=id,
        repository_id=1,
        analysis_run_id=1,
        snapshot_id=1,
        path=path,
        kind=kind,
        start_line=start_line,
        end_line=last_line,
        text="source text",
    )
|
||||
|
||||
|
||||
def test_llm_extraction_mapper_builds_candidate_drafts_with_source_refs():
    """Drafts keep the extracted fields and gain refs from facts on the same paths."""
    classify_endpoint = ExtractedFeature(
        name="POST /classify",
        type="REST endpoint",
        location="app.py",
        source_paths=["app.py"],
    )
    app_test = ExtractedEvidence(
        type="test",
        reference="tests/test_app.py",
        strength="strong",
        source_paths=["tests/test_app.py"],
    )
    classify_email = ExtractedCapability(
        name="Classify Incoming Email",
        description="Classifies email.",
        inputs=["email body"],
        outputs=["intent"],
        source_paths=["README.md"],
        features=[classify_endpoint],
        evidence=[app_test],
    )
    email_routing = ExtractedAbility(
        name="Business Email Routing",
        description="Routes email.",
        source_paths=["README.md"],
        capabilities=[classify_email],
    )
    observed = [
        fact(1, "documentation", "README", "README.md"),
        fact(2, "interface", "python route decorator", "app.py", line=4),
        fact(3, "test", "test_app.py", "tests/test_app.py"),
    ]

    drafts = LLMExtractionMapper().map([email_routing], observed, [])

    ability_draft = drafts[0]
    capability_draft = ability_draft.capabilities[0]
    assert ability_draft.name == "Business Email Routing"
    # 0.45 base + 0.15 for a resolved path + 0.10 for a documentation ref.
    assert ability_draft.confidence == 0.7
    assert ability_draft.source_refs[0].fact_id == 1
    assert capability_draft.inputs == ["email body"]
    assert capability_draft.outputs == ["intent"]
    assert capability_draft.features[0].source_refs[0].line == 4
    assert capability_draft.evidence[0].source_refs[0].kind == "test"
|
||||
|
||||
|
||||
def test_llm_extraction_mapper_can_use_chunk_refs_without_facts():
    """With no facts, a path carrying a line-range suffix resolves to its chunk."""
    readme_ability = ExtractedAbility(
        name="Readme Ability",
        source_paths=["README.md:1-3"],
    )
    readme_chunk = chunk(1, "documentation", "README.md", start_line=1)

    drafts = LLMExtractionMapper().map(
        [readme_ability],
        facts=[],
        chunks=[readme_chunk],
    )

    first_ref = drafts[0].source_refs[0]
    assert first_ref.fact_id is None
    assert first_ref.path == "README.md"
    assert first_ref.line == 1
|
||||
Reference in New Issue
Block a user