Integrate llm_extraction by adding a mapper that bridges extraction results into candidate graph drafts

2026-04-26 03:08:55 +02:00
parent 7e66c57350
commit 3aa0c08ab9
4 changed files with 261 additions and 2 deletions
--- a/src/repo_registry/llm_extraction/__init__.py
+++ b/src/repo_registry/llm_extraction/__init__.py
@@ -7,6 +7,7 @@ from repo_registry.llm_extraction.extractor import (
    LLMExtractionError,
    create_llm_connect_adapter,
 )
+from repo_registry.llm_extraction.mapper import LLMExtractionMapper

 __all__ = [
    "ExtractedAbility",
@@ -15,5 +16,6 @@ __all__ = [
    "ExtractedFeature",
    "LLMCandidateExtractor",
    "LLMExtractionError",
+    "LLMExtractionMapper",
    "create_llm_connect_adapter",
 ]
--- a/src/repo_registry/llm_extraction/mapper.py
+++ b/src/repo_registry/llm_extraction/mapper.py
@@ -0,0 +1,145 @@
+from __future__ import annotations
+
+from repo_registry.candidate_graph.generator import (
+    CandidateAbilityDraft,
+    CandidateCapabilityDraft,
+    CandidateEvidenceDraft,
+    CandidateFeatureDraft,
+)
+from repo_registry.core.models import ContentChunk, ObservedFact, SourceReference
+from repo_registry.llm_extraction.extractor import ExtractedAbility
+
+
+class LLMExtractionMapper:
+    """Map structured LLM extraction drafts into reviewable candidate drafts."""
+
+    def map(
+        self,
+        abilities: list[ExtractedAbility],
+        facts: list[ObservedFact],
+        chunks: list[ContentChunk],
+    ) -> list[CandidateAbilityDraft]:
+        return [
+            CandidateAbilityDraft(
+                name=ability.name,
+                description=ability.description,
+                confidence=self._confidence(ability.source_paths, facts, chunks, 0.45),
+                source_refs=self._source_refs(ability.source_paths, facts, chunks),
+                capabilities=[
+                    CandidateCapabilityDraft(
+                        name=capability.name,
+                        description=capability.description,
+                        inputs=capability.inputs,
+                        outputs=capability.outputs,
+                        confidence=self._confidence(
+                            capability.source_paths,
+                            facts,
+                            chunks,
+                            0.5,
+                        ),
+                        source_refs=self._source_refs(
+                            capability.source_paths,
+                            facts,
+                            chunks,
+                        ),
+                        features=[
+                            CandidateFeatureDraft(
+                                name=feature.name,
+                                type=feature.type,
+                                location=feature.location,
+                                confidence=self._confidence(
+                                    feature.source_paths or [feature.location],
+                                    facts,
+                                    chunks,
+                                    0.45,
+                                ),
+                                source_refs=self._source_refs(
+                                    feature.source_paths or [feature.location],
+                                    facts,
+                                    chunks,
+                                ),
+                            )
+                            for feature in capability.features
+                        ],
+                        evidence=[
+                            CandidateEvidenceDraft(
+                                type=evidence.type,
+                                reference=evidence.reference,
+                                strength=evidence.strength,
+                                source_refs=self._source_refs(
+                                    evidence.source_paths or [evidence.reference],
+                                    facts,
+                                    chunks,
+                                ),
+                            )
+                            for evidence in capability.evidence
+                        ],
+                    )
+                    for capability in ability.capabilities
+                ],
+            )
+            for ability in abilities
+        ]
+
+    def _confidence(
+        self,
+        source_paths: list[str],
+        facts: list[ObservedFact],
+        chunks: list[ContentChunk],
+        base: float,
+    ) -> float:
+        refs = self._source_refs(source_paths, facts, chunks)
+        if not refs:
+            return base
+        fact_kinds = {ref.kind for ref in refs}
+        score = base + 0.15
+        if "documentation" in fact_kinds:
+            score += 0.10
+        if "test" in fact_kinds or "example" in fact_kinds:
+            score += 0.10
+        if "interface" in fact_kinds:
+            score += 0.10
+        return min(0.95, round(score, 2))
+
+    def _source_refs(
+        self,
+        source_paths: list[str],
+        facts: list[ObservedFact],
+        chunks: list[ContentChunk],
+    ) -> list[SourceReference]:
+        refs: list[SourceReference] = []
+        seen: set[tuple[int | None, str, str, int | None]] = set()
+        for path in source_paths:
+            normalized = path.split(":", 1)[0]
+            for fact in facts:
+                if fact.path != normalized:
+                    continue
+                ref = SourceReference(
+                    fact_id=fact.id,
+                    path=fact.path,
+                    kind=fact.kind,
+                    name=fact.name,
+                    line=fact.metadata.get("line"),
+                )
+                key = (ref.fact_id, ref.path, ref.kind, ref.line)
+                if key not in seen:
+                    seen.add(key)
+                    refs.append(ref)
+            if any(ref.path == normalized for ref in refs):
+                continue
+            for chunk in chunks:
+                if chunk.path != normalized:
+                    continue
+                ref = SourceReference(
+                    fact_id=None,
+                    path=chunk.path,
+                    kind=chunk.kind,
+                    name=chunk.path,
+                    line=chunk.start_line,
+                )
+                key = (ref.fact_id, ref.path, ref.kind, ref.line)
+                if key not in seen:
+                    seen.add(key)
+                    refs.append(ref)
+                    break
+        return refs