Integrate llm_extraction by adding a bridge that maps extracted drafts into candidate graph drafts

This commit is contained in:
2026-04-26 03:08:55 +02:00
parent 7e66c57350
commit 3aa0c08ab9
4 changed files with 261 additions and 2 deletions

View File

@@ -117,8 +117,10 @@ python -m pip install -e ../llm-connect
The integration accepts any `llm-connect` style adapter with
`execute_prompt(prompt, config)` and parses strict JSON candidate drafts from
model responses. Tests use a fake adapter, so the default test suite does not
call external providers.
model responses. Parsed drafts can be mapped into reviewable candidate graph
entries while preserving source paths where they match observed facts or
content chunks. Tests use fake adapters, so the default test suite does not call
external providers.
## Agent-Facing Endpoints

View File

@@ -7,6 +7,7 @@ from repo_registry.llm_extraction.extractor import (
LLMExtractionError,
create_llm_connect_adapter,
)
from repo_registry.llm_extraction.mapper import LLMExtractionMapper
__all__ = [
"ExtractedAbility",
@@ -15,5 +16,6 @@ __all__ = [
"ExtractedFeature",
"LLMCandidateExtractor",
"LLMExtractionError",
"LLMExtractionMapper",
"create_llm_connect_adapter",
]

View File

@@ -0,0 +1,145 @@
from __future__ import annotations
from repo_registry.candidate_graph.generator import (
CandidateAbilityDraft,
CandidateCapabilityDraft,
CandidateEvidenceDraft,
CandidateFeatureDraft,
)
from repo_registry.core.models import ContentChunk, ObservedFact, SourceReference
from repo_registry.llm_extraction.extractor import ExtractedAbility
class LLMExtractionMapper:
    """Map structured LLM extraction drafts into reviewable candidate drafts.

    Every extracted ability, capability, feature and evidence entry is copied
    into the matching candidate draft type. The paths reported by the model
    are resolved against the observed facts and content chunks supplied by the
    caller; the resolved references are attached to the draft and, for
    abilities, capabilities and features, also drive the confidence score.
    """

    # Base confidence used when no source path could be resolved.
    _ABILITY_BASE_CONFIDENCE = 0.45
    _CAPABILITY_BASE_CONFIDENCE = 0.5
    _FEATURE_BASE_CONFIDENCE = 0.45

    # Bonus for having at least one resolved reference, plus a bonus per
    # group of reference kinds that is represented among the references.
    _RESOLVED_BONUS = 0.15
    _KIND_BONUS = 0.10
    _MAX_CONFIDENCE = 0.95

    def map(
        self,
        abilities: list[ExtractedAbility],
        facts: list[ObservedFact],
        chunks: list[ContentChunk],
    ) -> list[CandidateAbilityDraft]:
        """Convert extracted abilities into candidate ability drafts.

        Args:
            abilities: Drafts parsed from the LLM response.
            facts: Observed facts that source paths are matched against first.
            chunks: Content chunks used as a fallback when no fact matches.

        Returns:
            One candidate draft per extracted ability, in input order, with
            nested capability, feature and evidence drafts.
        """
        drafts: list[CandidateAbilityDraft] = []
        for ability in abilities:
            # Resolve once; the same references feed both the score and the draft.
            refs = self._source_refs(ability.source_paths, facts, chunks)
            drafts.append(
                CandidateAbilityDraft(
                    name=ability.name,
                    description=ability.description,
                    confidence=self._score(refs, self._ABILITY_BASE_CONFIDENCE),
                    source_refs=refs,
                    capabilities=[
                        self._map_capability(capability, facts, chunks)
                        for capability in ability.capabilities
                    ],
                )
            )
        return drafts

    def _map_capability(
        self,
        capability,
        facts: list[ObservedFact],
        chunks: list[ContentChunk],
    ) -> CandidateCapabilityDraft:
        """Build the candidate draft for one extracted capability."""
        refs = self._source_refs(capability.source_paths, facts, chunks)
        return CandidateCapabilityDraft(
            name=capability.name,
            description=capability.description,
            inputs=capability.inputs,
            outputs=capability.outputs,
            confidence=self._score(refs, self._CAPABILITY_BASE_CONFIDENCE),
            source_refs=refs,
            features=[
                self._map_feature(feature, facts, chunks)
                for feature in capability.features
            ],
            evidence=[
                self._map_evidence(evidence, facts, chunks)
                for evidence in capability.evidence
            ],
        )

    def _map_feature(
        self,
        feature,
        facts: list[ObservedFact],
        chunks: list[ContentChunk],
    ) -> CandidateFeatureDraft:
        """Build the candidate draft for one extracted feature."""
        # Without explicit source paths, the feature location is the only
        # path available to resolve.
        paths = feature.source_paths or [feature.location]
        refs = self._source_refs(paths, facts, chunks)
        return CandidateFeatureDraft(
            name=feature.name,
            type=feature.type,
            location=feature.location,
            confidence=self._score(refs, self._FEATURE_BASE_CONFIDENCE),
            source_refs=refs,
        )

    def _map_evidence(
        self,
        evidence,
        facts: list[ObservedFact],
        chunks: list[ContentChunk],
    ) -> CandidateEvidenceDraft:
        """Build the candidate draft for one extracted evidence entry."""
        # Same fallback as features: use the reference itself as the path.
        paths = evidence.source_paths or [evidence.reference]
        return CandidateEvidenceDraft(
            type=evidence.type,
            reference=evidence.reference,
            strength=evidence.strength,
            source_refs=self._source_refs(paths, facts, chunks),
        )

    def _confidence(
        self,
        source_paths: list[str],
        facts: list[ObservedFact],
        chunks: list[ContentChunk],
        base: float,
    ) -> float:
        """Resolve ``source_paths`` and score the resulting references.

        Returns ``base`` unchanged when nothing resolves; otherwise the score
        described in ``_score``.
        """
        return self._score(self._source_refs(source_paths, facts, chunks), base)

    @classmethod
    def _score(cls, refs: list[SourceReference], base: float) -> float:
        """Score already-resolved references against a base confidence.

        With no references the base is returned as is. Otherwise the base is
        raised by a fixed bonus, plus one bonus for each of these groups of
        reference kinds that is present: ``documentation``; ``test`` or
        ``example``; ``interface``. The result is rounded to two decimals and
        capped at 0.95.
        """
        if not refs:
            return base
        kinds = {ref.kind for ref in refs}
        score = base + cls._RESOLVED_BONUS
        if "documentation" in kinds:
            score += cls._KIND_BONUS
        if "test" in kinds or "example" in kinds:
            score += cls._KIND_BONUS
        if "interface" in kinds:
            score += cls._KIND_BONUS
        return min(cls._MAX_CONFIDENCE, round(score, 2))

    def _source_refs(
        self,
        source_paths: list[str],
        facts: list[ObservedFact],
        chunks: list[ContentChunk],
    ) -> list[SourceReference]:
        """Resolve model-reported paths to source references.

        Anything after the first ``:`` in a path (for example a line range
        such as ``README.md:1-3``) is ignored. Each path yields a reference
        for every fact with that exact path; when no reference for the path
        exists yet, the first chunk with that path is used instead, with a
        ``fact_id`` of ``None``. Blank or missing paths are skipped, and
        duplicates are dropped while preserving first-seen order.
        """
        if not source_paths:
            return []

        refs: list[SourceReference] = []
        seen: set[tuple[int | None, str, str, int | None]] = set()

        def remember(ref: SourceReference) -> None:
            key = (ref.fact_id, ref.path, ref.kind, ref.line)
            if key not in seen:
                seen.add(key)
                refs.append(ref)

        for path in source_paths:
            # A blank path carries no source information; it can come from
            # the location/reference fallback when the model omitted a value.
            if not path:
                continue
            normalized = path.split(":", 1)[0]

            for fact in facts:
                if fact.path == normalized:
                    remember(
                        SourceReference(
                            fact_id=fact.id,
                            path=fact.path,
                            kind=fact.kind,
                            name=fact.name,
                            line=fact.metadata.get("line"),
                        )
                    )

            # Chunks are only a fallback for paths that no fact covers.
            if any(ref.path == normalized for ref in refs):
                continue
            for chunk in chunks:
                if chunk.path == normalized:
                    remember(
                        SourceReference(
                            fact_id=None,
                            path=chunk.path,
                            kind=chunk.kind,
                            name=chunk.path,
                            line=chunk.start_line,
                        )
                    )
                    break
        return refs

View File

@@ -0,0 +1,110 @@
from repo_registry.core.models import ContentChunk, ObservedFact
from repo_registry.llm_extraction import (
ExtractedAbility,
ExtractedCapability,
ExtractedEvidence,
ExtractedFeature,
LLMExtractionMapper,
)
def fact(id, kind, name, path, line=None):
    """Build an ``ObservedFact`` fixture with fixed repository/run/snapshot ids.

    ``line`` is stored under the ``"line"`` metadata key only when it is given;
    otherwise the metadata dict is left empty.
    """
    extra = {} if line is None else {"line": line}
    fields = dict(
        id=id,
        repository_id=1,
        analysis_run_id=1,
        snapshot_id=1,
        kind=kind,
        path=path,
        name=name,
        value="",
        metadata=extra,
    )
    return ObservedFact(**fields)
def chunk(id, kind, path, start_line=1):
    """Build a ``ContentChunk`` fixture spanning three lines from ``start_line``."""
    last_line = start_line + 2
    fields = dict(
        id=id,
        repository_id=1,
        analysis_run_id=1,
        snapshot_id=1,
        path=path,
        kind=kind,
        start_line=start_line,
        end_line=last_line,
        text="source text",
    )
    return ContentChunk(**fields)
def test_llm_extraction_mapper_builds_candidate_drafts_with_source_refs():
    """A full ability tree is mapped and each level picks up its fact refs."""
    # Build the extracted tree bottom-up so each level is easy to read.
    endpoint = ExtractedFeature(
        name="POST /classify",
        type="REST endpoint",
        location="app.py",
        source_paths=["app.py"],
    )
    test_evidence = ExtractedEvidence(
        type="test",
        reference="tests/test_app.py",
        strength="strong",
        source_paths=["tests/test_app.py"],
    )
    classify = ExtractedCapability(
        name="Classify Incoming Email",
        description="Classifies email.",
        inputs=["email body"],
        outputs=["intent"],
        source_paths=["README.md"],
        features=[endpoint],
        evidence=[test_evidence],
    )
    routing = ExtractedAbility(
        name="Business Email Routing",
        description="Routes email.",
        source_paths=["README.md"],
        capabilities=[classify],
    )
    observed = [
        fact(1, "documentation", "README", "README.md"),
        fact(2, "interface", "python route decorator", "app.py", line=4),
        fact(3, "test", "test_app.py", "tests/test_app.py"),
    ]

    mapper = LLMExtractionMapper()
    (ability,) = mapper.map([routing], observed, [])[:1]
    capability = ability.capabilities[0]

    assert ability.name == "Business Email Routing"
    assert ability.confidence == 0.7
    assert ability.source_refs[0].fact_id == 1
    assert capability.inputs == ["email body"]
    assert capability.outputs == ["intent"]
    assert capability.features[0].source_refs[0].line == 4
    assert capability.evidence[0].source_refs[0].kind == "test"
def test_llm_extraction_mapper_can_use_chunk_refs_without_facts():
    """With no facts available, a matching content chunk supplies the ref."""
    readme_ability = ExtractedAbility(
        name="Readme Ability",
        source_paths=["README.md:1-3"],
    )
    readme_chunk = chunk(1, "documentation", "README.md", start_line=1)

    candidates = LLMExtractionMapper().map(
        [readme_ability],
        facts=[],
        chunks=[readme_chunk],
    )

    first_ref = candidates[0].source_refs[0]
    assert first_ref.fact_id is None
    assert first_ref.path == "README.md"
    assert first_ref.line == 1