From 3aa0c08ab9b2532fc24c0f5ed17ed34378999de8 Mon Sep 17 00:00:00 2001 From: tegwick Date: Sun, 26 Apr 2026 03:08:55 +0200 Subject: [PATCH] llm_extraction integration by adding the bridge into candidate graph drafts --- README.md | 6 +- src/repo_registry/llm_extraction/__init__.py | 2 + src/repo_registry/llm_extraction/mapper.py | 145 +++++++++++++++++++ tests/test_llm_extraction_mapper.py | 110 ++++++++++++++ 4 files changed, 261 insertions(+), 2 deletions(-) create mode 100644 src/repo_registry/llm_extraction/mapper.py create mode 100644 tests/test_llm_extraction_mapper.py diff --git a/README.md b/README.md index a0d06e7..798a6f2 100644 --- a/README.md +++ b/README.md @@ -117,8 +117,10 @@ python -m pip install -e ../llm-connect The integration accepts any `llm-connect` style adapter with `execute_prompt(prompt, config)` and parses strict JSON candidate drafts from -model responses. Tests use a fake adapter, so the default test suite does not -call external providers. +model responses. Parsed drafts can be mapped into reviewable candidate graph +entries while preserving source paths where they match observed facts or +content chunks. Tests use fake adapters, so the default test suite does not call +external providers. 
class LLMExtractionMapper:
    """Map structured LLM extraction drafts into reviewable candidate drafts.

    Every extracted ability, capability, feature and evidence entry is copied
    into the matching ``Candidate*Draft`` and annotated with source references
    resolved against the observed facts and content chunks. Abilities,
    capabilities and features additionally get a confidence score that starts
    from a per-type base value and rises when their source paths resolve.
    """

    # Starting confidence per node type, used as-is when nothing resolves.
    _ABILITY_BASE = 0.45
    _CAPABILITY_BASE = 0.5
    _FEATURE_BASE = 0.45
    # Bonus for resolving at least one reference, bonus per recognised kind
    # group, and the ceiling applied to the final score.
    _MATCH_BONUS = 0.15
    _KIND_BONUS = 0.10
    _MAX_CONFIDENCE = 0.95

    def map(
        self,
        abilities: list[ExtractedAbility],
        facts: list[ObservedFact],
        chunks: list[ContentChunk],
    ) -> list[CandidateAbilityDraft]:
        """Convert extracted abilities into candidate ability drafts.

        Args:
            abilities: Extraction results to convert, in output order.
            facts: Observed facts that source paths are matched against first.
            chunks: Content chunks used when no fact matches a path.

        Returns:
            One ``CandidateAbilityDraft`` per input ability, each carrying its
            nested capability, feature and evidence drafts.
        """
        # Index once per call: every node then resolves its paths through a
        # dictionary lookup instead of rescanning all facts and chunks.
        index = _SourceIndex(facts, chunks)
        return [self._map_ability(ability, index) for ability in abilities]

    def _map_ability(
        self,
        ability: ExtractedAbility,
        index: _SourceIndex,
    ) -> CandidateAbilityDraft:
        """Build the draft for one ability and its capabilities."""
        # Resolve once; the same references feed the score and the draft.
        refs = index.resolve(ability.source_paths)
        return CandidateAbilityDraft(
            name=ability.name,
            description=ability.description,
            confidence=self._score(refs, self._ABILITY_BASE),
            source_refs=refs,
            capabilities=[
                self._map_capability(capability, index)
                for capability in ability.capabilities
            ],
        )

    def _map_capability(
        self,
        capability,
        index: _SourceIndex,
    ) -> CandidateCapabilityDraft:
        """Build the draft for one capability with its features and evidence."""
        refs = index.resolve(capability.source_paths)
        return CandidateCapabilityDraft(
            name=capability.name,
            description=capability.description,
            inputs=capability.inputs,
            outputs=capability.outputs,
            confidence=self._score(refs, self._CAPABILITY_BASE),
            source_refs=refs,
            features=[
                self._map_feature(feature, index)
                for feature in capability.features
            ],
            evidence=[
                self._map_evidence(evidence, index)
                for evidence in capability.evidence
            ],
        )

    def _map_feature(self, feature, index: _SourceIndex) -> CandidateFeatureDraft:
        """Build the draft for one feature."""
        # A feature without explicit source paths falls back to its location.
        refs = index.resolve(feature.source_paths or [feature.location])
        return CandidateFeatureDraft(
            name=feature.name,
            type=feature.type,
            location=feature.location,
            confidence=self._score(refs, self._FEATURE_BASE),
            source_refs=refs,
        )

    def _map_evidence(self, evidence, index: _SourceIndex) -> CandidateEvidenceDraft:
        """Build the draft for one evidence entry (evidence has no confidence)."""
        # Evidence without explicit source paths falls back to its reference.
        return CandidateEvidenceDraft(
            type=evidence.type,
            reference=evidence.reference,
            strength=evidence.strength,
            source_refs=index.resolve(
                evidence.source_paths or [evidence.reference]
            ),
        )

    def _score(self, refs: list[SourceReference], base: float) -> float:
        """Return the confidence for a node given its resolved references.

        With no references the base value is returned unchanged. Otherwise the
        score is the base plus a match bonus, plus one kind bonus for each of:
        a "documentation" reference, a "test" or "example" reference, and an
        "interface" reference. The result is rounded to two decimals and
        capped at the maximum confidence.
        """
        if not refs:
            return base
        kinds = {ref.kind for ref in refs}
        score = base + self._MATCH_BONUS
        if "documentation" in kinds:
            score += self._KIND_BONUS
        if "test" in kinds or "example" in kinds:
            score += self._KIND_BONUS
        if "interface" in kinds:
            score += self._KIND_BONUS
        return min(self._MAX_CONFIDENCE, round(score, 2))

    def _confidence(
        self,
        source_paths: list[str],
        facts: list[ObservedFact],
        chunks: list[ContentChunk],
        base: float,
    ) -> float:
        """Resolve ``source_paths`` and score the resulting references.

        Kept for callers that hold raw lists; ``map`` scores the references
        it has already resolved instead of resolving them a second time.
        """
        return self._score(self._source_refs(source_paths, facts, chunks), base)

    def _source_refs(
        self,
        source_paths: list[str],
        facts: list[ObservedFact],
        chunks: list[ContentChunk],
    ) -> list[SourceReference]:
        """Resolve ``source_paths`` against ``facts`` and ``chunks``.

        Anything after the first ``:`` in a path (for example a line range) is
        ignored for matching. Every fact whose path matches yields one
        reference; the first chunk with that path is used only when no
        reference for the path exists yet. Duplicate references are dropped,
        and blank or non-string paths are skipped.
        """
        return _SourceIndex(facts, chunks).resolve(source_paths)


class _SourceIndex:
    """Path-keyed lookup over observed facts and content chunks."""

    def __init__(self, facts: list[ObservedFact], chunks: list[ContentChunk]) -> None:
        # Facts keep their input order within each path.
        self._facts_by_path: dict[str, list[ObservedFact]] = {}
        for fact in facts:
            self._facts_by_path.setdefault(fact.path, []).append(fact)
        # Only the first chunk per path is ever referenced, so keep just that.
        self._first_chunk_by_path: dict[str, ContentChunk] = {}
        for chunk in chunks:
            self._first_chunk_by_path.setdefault(chunk.path, chunk)

    def resolve(self, source_paths: list[str]) -> list[SourceReference]:
        """Return de-duplicated source references for ``source_paths``.

        References appear in path order; within a path, fact references come
        in fact order. A chunk reference is produced only for a path that has
        no reference yet.
        """
        refs: list[SourceReference] = []
        seen: set[tuple[int | None, str, str, int | None]] = set()
        covered: set[str] = set()  # normalized paths that already have a ref
        for raw_path in source_paths or ():
            # The location/reference fallbacks can hand over a missing value;
            # such an entry identifies no source, so skip it rather than crash.
            if not isinstance(raw_path, str):
                continue
            path = raw_path.partition(":")[0]  # drop a ":<line-range>" suffix
            if not path:
                # An empty path must not match facts recorded without a path;
                # that would earn a confidence bonus with no real source.
                continue
            for fact in self._facts_by_path.get(path, ()):
                ref = SourceReference(
                    fact_id=fact.id,
                    path=fact.path,
                    kind=fact.kind,
                    name=fact.name,
                    line=fact.metadata.get("line"),
                )
                key = (ref.fact_id, ref.path, ref.kind, ref.line)
                if key not in seen:
                    seen.add(key)
                    refs.append(ref)
                    covered.add(path)
            if path in covered:
                continue
            chunk = self._first_chunk_by_path.get(path)
            if chunk is None:
                continue
            ref = SourceReference(
                fact_id=None,
                path=chunk.path,
                kind=chunk.kind,
                name=chunk.path,
                line=chunk.start_line,
            )
            seen.add((ref.fact_id, ref.path, ref.kind, ref.line))
            refs.append(ref)
            covered.add(path)
        return refs
"README", "README.md"), + fact(2, "interface", "python route decorator", "app.py", line=4), + fact(3, "test", "test_app.py", "tests/test_app.py"), + ] + + candidates = LLMExtractionMapper().map(extracted, facts, []) + + ability = candidates[0] + capability = ability.capabilities[0] + assert ability.name == "Business Email Routing" + assert ability.confidence == 0.7 + assert ability.source_refs[0].fact_id == 1 + assert capability.inputs == ["email body"] + assert capability.outputs == ["intent"] + assert capability.features[0].source_refs[0].line == 4 + assert capability.evidence[0].source_refs[0].kind == "test" + + +def test_llm_extraction_mapper_can_use_chunk_refs_without_facts(): + extracted = [ + ExtractedAbility( + name="Readme Ability", + source_paths=["README.md:1-3"], + ) + ] + + candidates = LLMExtractionMapper().map( + extracted, + facts=[], + chunks=[chunk(1, "documentation", "README.md", start_line=1)], + ) + + assert candidates[0].source_refs[0].fact_id is None + assert candidates[0].source_refs[0].path == "README.md" + assert candidates[0].source_refs[0].line == 1