Extend llm_extraction integration with a bridge into candidate graph drafts

2026-04-26 03:08:55 +02:00
parent 7e66c57350
commit 3aa0c08ab9
4 changed files with 261 additions and 2 deletions
--- a/README.md
+++ b/README.md
@@ -117,8 +117,10 @@ python -m pip install -e ../llm-connect

 The integration accepts any `llm-connect` style adapter with
 `execute_prompt(prompt, config)` and parses strict JSON candidate drafts from
-model responses. Tests use a fake adapter, so the default test suite does not
-call external providers.
+model responses. Parsed drafts can be mapped into reviewable candidate graph
+entries while preserving source paths where they match observed facts or
+content chunks. Tests use fake adapters, so the default test suite does not call
+external providers.

 ## Agent-Facing Endpoints

--- a/src/repo_registry/llm_extraction/__init__.py
+++ b/src/repo_registry/llm_extraction/__init__.py
@@ -7,6 +7,7 @@ from repo_registry.llm_extraction.extractor import (
    LLMExtractionError,
    create_llm_connect_adapter,
 )
+from repo_registry.llm_extraction.mapper import LLMExtractionMapper

 __all__ = [
    "ExtractedAbility",
@@ -15,5 +16,6 @@ __all__ = [
    "ExtractedFeature",
    "LLMCandidateExtractor",
    "LLMExtractionError",
+    "LLMExtractionMapper",
    "create_llm_connect_adapter",
 ]
--- a/src/repo_registry/llm_extraction/mapper.py
+++ b/src/repo_registry/llm_extraction/mapper.py
@@ -0,0 +1,145 @@
+from __future__ import annotations
+
+from repo_registry.candidate_graph.generator import (
+    CandidateAbilityDraft,
+    CandidateCapabilityDraft,
+    CandidateEvidenceDraft,
+    CandidateFeatureDraft,
+)
+from repo_registry.core.models import ContentChunk, ObservedFact, SourceReference
+from repo_registry.llm_extraction.extractor import ExtractedAbility
+
+
+class LLMExtractionMapper:
+    """Map structured LLM extraction drafts into reviewable candidate drafts."""
+
+    def map(
+        self,
+        abilities: list[ExtractedAbility],
+        facts: list[ObservedFact],
+        chunks: list[ContentChunk],
+    ) -> list[CandidateAbilityDraft]:
+        return [
+            CandidateAbilityDraft(
+                name=ability.name,
+                description=ability.description,
+                confidence=self._confidence(ability.source_paths, facts, chunks, 0.45),
+                source_refs=self._source_refs(ability.source_paths, facts, chunks),
+                capabilities=[
+                    CandidateCapabilityDraft(
+                        name=capability.name,
+                        description=capability.description,
+                        inputs=capability.inputs,
+                        outputs=capability.outputs,
+                        confidence=self._confidence(
+                            capability.source_paths,
+                            facts,
+                            chunks,
+                            0.5,
+                        ),
+                        source_refs=self._source_refs(
+                            capability.source_paths,
+                            facts,
+                            chunks,
+                        ),
+                        features=[
+                            CandidateFeatureDraft(
+                                name=feature.name,
+                                type=feature.type,
+                                location=feature.location,
+                                confidence=self._confidence(
+                                    feature.source_paths or [feature.location],
+                                    facts,
+                                    chunks,
+                                    0.45,
+                                ),
+                                source_refs=self._source_refs(
+                                    feature.source_paths or [feature.location],
+                                    facts,
+                                    chunks,
+                                ),
+                            )
+                            for feature in capability.features
+                        ],
+                        evidence=[
+                            CandidateEvidenceDraft(
+                                type=evidence.type,
+                                reference=evidence.reference,
+                                strength=evidence.strength,
+                                source_refs=self._source_refs(
+                                    evidence.source_paths or [evidence.reference],
+                                    facts,
+                                    chunks,
+                                ),
+                            )
+                            for evidence in capability.evidence
+                        ],
+                    )
+                    for capability in ability.capabilities
+                ],
+            )
+            for ability in abilities
+        ]
+
+    def _confidence(
+        self,
+        source_paths: list[str],
+        facts: list[ObservedFact],
+        chunks: list[ContentChunk],
+        base: float,
+    ) -> float:
+        refs = self._source_refs(source_paths, facts, chunks)
+        if not refs:
+            return base
+        fact_kinds = {ref.kind for ref in refs}
+        score = base + 0.15
+        if "documentation" in fact_kinds:
+            score += 0.10
+        if "test" in fact_kinds or "example" in fact_kinds:
+            score += 0.10
+        if "interface" in fact_kinds:
+            score += 0.10
+        return min(0.95, round(score, 2))
+
+    def _source_refs(
+        self,
+        source_paths: list[str],
+        facts: list[ObservedFact],
+        chunks: list[ContentChunk],
+    ) -> list[SourceReference]:
+        refs: list[SourceReference] = []
+        seen: set[tuple[int | None, str, str, int | None]] = set()
+        for path in source_paths:
+            normalized = path.split(":", 1)[0]
+            for fact in facts:
+                if fact.path != normalized:
+                    continue
+                ref = SourceReference(
+                    fact_id=fact.id,
+                    path=fact.path,
+                    kind=fact.kind,
+                    name=fact.name,
+                    line=fact.metadata.get("line"),
+                )
+                key = (ref.fact_id, ref.path, ref.kind, ref.line)
+                if key not in seen:
+                    seen.add(key)
+                    refs.append(ref)
+            if any(ref.path == normalized for ref in refs):
+                continue
+            for chunk in chunks:
+                if chunk.path != normalized:
+                    continue
+                ref = SourceReference(
+                    fact_id=None,
+                    path=chunk.path,
+                    kind=chunk.kind,
+                    name=chunk.path,
+                    line=chunk.start_line,
+                )
+                key = (ref.fact_id, ref.path, ref.kind, ref.line)
+                if key not in seen:
+                    seen.add(key)
+                    refs.append(ref)
+                    break
+        return refs
--- a/tests/test_llm_extraction_mapper.py
+++ b/tests/test_llm_extraction_mapper.py
@@ -0,0 +1,110 @@
+from repo_registry.core.models import ContentChunk, ObservedFact
+from repo_registry.llm_extraction import (
+    ExtractedAbility,
+    ExtractedCapability,
+    ExtractedEvidence,
+    ExtractedFeature,
+    LLMExtractionMapper,
+)
+
+
+def fact(id, kind, name, path, line=None):
+    metadata = {}
+    if line is not None:
+        metadata["line"] = line
+    return ObservedFact(
+        id=id,
+        repository_id=1,
+        analysis_run_id=1,
+        snapshot_id=1,
+        kind=kind,
+        path=path,
+        name=name,
+        value="",
+        metadata=metadata,
+    )
+
+
+def chunk(id, kind, path, start_line=1):
+    return ContentChunk(
+        id=id,
+        repository_id=1,
+        analysis_run_id=1,
+        snapshot_id=1,
+        path=path,
+        kind=kind,
+        start_line=start_line,
+        end_line=start_line + 2,
+        text="source text",
+    )
+
+
+def test_llm_extraction_mapper_builds_candidate_drafts_with_source_refs():
+    extracted = [
+        ExtractedAbility(
+            name="Business Email Routing",
+            description="Routes email.",
+            source_paths=["README.md"],
+            capabilities=[
+                ExtractedCapability(
+                    name="Classify Incoming Email",
+                    description="Classifies email.",
+                    inputs=["email body"],
+                    outputs=["intent"],
+                    source_paths=["README.md"],
+                    features=[
+                        ExtractedFeature(
+                            name="POST /classify",
+                            type="REST endpoint",
+                            location="app.py",
+                            source_paths=["app.py"],
+                        )
+                    ],
+                    evidence=[
+                        ExtractedEvidence(
+                            type="test",
+                            reference="tests/test_app.py",
+                            strength="strong",
+                            source_paths=["tests/test_app.py"],
+                        )
+                    ],
+                )
+            ],
+        )
+    ]
+    facts = [
+        fact(1, "documentation", "README", "README.md"),
+        fact(2, "interface", "python route decorator", "app.py", line=4),
+        fact(3, "test", "test_app.py", "tests/test_app.py"),
+    ]
+
+    candidates = LLMExtractionMapper().map(extracted, facts, [])
+
+    ability = candidates[0]
+    capability = ability.capabilities[0]
+    assert ability.name == "Business Email Routing"
+    assert ability.confidence == 0.7
+    assert ability.source_refs[0].fact_id == 1
+    assert capability.inputs == ["email body"]
+    assert capability.outputs == ["intent"]
+    assert capability.features[0].source_refs[0].line == 4
+    assert capability.evidence[0].source_refs[0].kind == "test"
+
+
+def test_llm_extraction_mapper_can_use_chunk_refs_without_facts():
+    extracted = [
+        ExtractedAbility(
+            name="Readme Ability",
+            source_paths=["README.md:1-3"],
+        )
+    ]
+
+    candidates = LLMExtractionMapper().map(
+        extracted,
+        facts=[],
+        chunks=[chunk(1, "documentation", "README.md", start_line=1)],
+    )
+
+    assert candidates[0].source_refs[0].fact_id is None
+    assert candidates[0].source_refs[0].path == "README.md"
+    assert candidates[0].source_refs[0].line == 1