Add llm-assisted discovery extraction

This commit is contained in:
2026-05-19 04:35:35 +02:00
parent bc25eb6871
commit a76c6a4aea
7 changed files with 981 additions and 4 deletions

View File

@@ -0,0 +1,193 @@
from __future__ import annotations
import json
import sys
import types
from pathlib import Path
from types import SimpleNamespace
from railiance_fabric.llm_extraction import LLMExtractionConfig, PROMPT_VERSION
from railiance_fabric.scanner import ScanOptions, scan_repo
from railiance_fabric.schema_validation import draft202012_validator
def test_llm_extraction_uses_llm_connect_boundary_with_mock_adapter(tmp_path: Path, monkeypatch) -> None:
    """Scan with LLM extraction enabled against a stubbed ``llm_connect`` module.

    A throwaway module named ``llm_connect`` is placed in ``sys.modules`` for
    the duration of the test.  Its ``create_adapter`` factory and the adapter
    it returns record the arguments they receive, and the adapter always
    answers with one canned JSON document holding a single node, edge and
    attribute.  The test then checks what was recorded and how the canned
    candidates show up in the schema-validated snapshot.
    """
    fixture_repo = _minimal_repo(tmp_path)

    # The canned model reply: one candidate of each kind.
    canned_node = {
        "kind": "CapabilityDeclaration",
        "label": "Fixture Operations",
        "confidence": 0.82,
        "evidence_refs": [],
        "aliases": ["fixture-ops"],
        "attributes": {"capability_type": "operations"},
        "rationale": "README describes operational responsibility.",
    }
    canned_edge = {
        "edge_type": "suggests_capability",
        "source_label": "Fixture Repo",
        "target_label": "Fixture Operations",
        "confidence": 0.78,
        "evidence_refs": [],
        "rationale": "The repository appears to own this capability.",
    }
    canned_attribute = {
        "entity_label": "Fixture Operations",
        "name": "uncertainty",
        "value": "needs human review",
        "confidence": 0.75,
        "evidence_refs": [],
        "rationale": "LLM-only extraction should remain review-gated.",
    }
    canned_json = json.dumps(
        {
            "nodes": [canned_node],
            "edges": [canned_edge],
            "attributes": [canned_attribute],
        }
    )

    # Everything the stub observes is collected here for the assertions below.
    recorded: dict[str, object] = {}

    class RunConfig:
        """Stub run configuration that keeps the keyword arguments it was built with."""

        def __init__(self, **kwargs: object) -> None:
            self.kwargs = kwargs
            self.model_name = str(kwargs["model_name"])

    class MockLLMAdapter:
        """Stub adapter that replies with the canned JSON and records each call."""

        def __init__(self, mock_response: str = canned_json) -> None:
            self.mock_response = mock_response

        def execute_prompt(self, prompt: str, config: RunConfig) -> SimpleNamespace:
            recorded.update(prompt=prompt, config=config)
            token_usage = {"prompt_tokens": 10, "completion_tokens": 5, "total_tokens": 15}
            return SimpleNamespace(
                content=self.mock_response,
                model=config.model_name,
                usage=token_usage,
                metadata={"mock": True},
            )

    def create_adapter(provider: str, model: str | None = None, api_key: str | None = None) -> MockLLMAdapter:
        recorded.update(provider=provider, model=model, api_key=api_key)
        return MockLLMAdapter()

    # Expose the three stubs under their own names on the fake module and
    # register it so that ``import llm_connect`` resolves to it.
    stub_module = types.ModuleType("llm_connect")
    for exported in (RunConfig, MockLLMAdapter, create_adapter):
        setattr(stub_module, exported.__name__, exported)
    monkeypatch.setitem(sys.modules, "llm_connect", stub_module)

    extraction_config = LLMExtractionConfig(provider="mock", model="mock-model", min_confidence=0.6)
    options = ScanOptions(
        repo_path=fixture_repo,
        repo_slug="fixture-repo",
        repo_name="Fixture Repo",
        commit="abc123",
        llm_enabled=True,
        deterministic_only=False,
        llm_config=extraction_config,
    )
    snapshot = scan_repo(options)
    _validate_schema("discovery-snapshot.schema.yaml", snapshot)

    # The adapter factory and the adapter saw the configured provider/model.
    assert calls_match(recorded, "provider", "mock") if False else recorded["provider"] == "mock"
    assert recorded["model"] == "mock-model"
    assert isinstance(recorded["config"], RunConfig)
    sent_prompt = str(recorded["prompt"])
    assert "Evidence bundle:" in sent_prompt
    assert "Use only the JSON evidence bundle below" in sent_prompt

    # Scan metadata reflects the LLM-enabled run.
    scan_meta = snapshot["scan"]
    assert scan_meta["llm_enabled"] is True
    assert scan_meta["deterministic_only"] is False
    assert scan_meta["llm_budget"]["prompt_version"] == PROMPT_VERSION

    # The canned node is present, attributed to the LLM and awaiting review.
    candidates = snapshot["candidates"]
    llm_node = next(node for node in candidates["nodes"] if node["label"] == "Fixture Operations")
    assert llm_node["origin"] == "llm"
    assert llm_node["review_state"] == "needs_review"
    assert llm_node["confidence"] == 0.82
    first_provenance = llm_node["provenance"][0]
    assert first_provenance["provider"] == "mock"
    assert first_provenance["model"] == "mock-model"
    assert first_provenance["usage"]["total_tokens"] == 15

    # An additive LLM replacement scope plus the canned edge and attribute exist.
    assert any(
        scope["source_kind"] == "llm" and scope["mode"] == "additive"
        for scope in snapshot["replacement_scopes"]
    )
    assert any(edge["edge_type"] == "suggests_capability" for edge in candidates["edges"])
    assert any(attribute["name"] == "uncertainty" for attribute in candidates["attributes"])
def test_llm_extraction_fails_closed_for_bad_or_low_confidence_output(tmp_path: Path) -> None:
    """Check that unusable LLM output becomes a review artifact, not a candidate.

    Two scans are run against the same fixture repository with an injected
    adapter:

    * a reply that is not JSON must leave ``llm_output_invalid`` as the only
      review artifact type;
    * a node whose confidence (0.2) is below ``min_confidence`` (0.6) must be
      absent from the candidate nodes and leave ``llm_low_confidence`` as the
      only review artifact type.

    Both snapshots must still validate against the snapshot schema.
    """
    fixture_repo = _minimal_repo(tmp_path)

    # Options common to both scans; only the LLM config and adapter differ.
    shared_options = {
        "repo_path": fixture_repo,
        "repo_slug": "fixture-repo",
        "repo_name": "Fixture Repo",
        "commit": "abc123",
        "llm_enabled": True,
        "deterministic_only": False,
    }

    # Case 1: the adapter returns text that cannot be parsed as JSON.
    bad_snapshot = scan_repo(
        ScanOptions(
            **shared_options,
            llm_config=LLMExtractionConfig(provider="mock", model="mock-model"),
            llm_adapter=_Adapter("not json"),
        )
    )
    _validate_schema("discovery-snapshot.schema.yaml", bad_snapshot)
    bad_artifact_types = {artifact["artifact_type"] for artifact in bad_snapshot["review_artifacts"]}
    assert bad_artifact_types == {"llm_output_invalid"}

    # Case 2: well-formed JSON whose only node sits below the confidence floor.
    weak_reply = json.dumps(
        {
            "nodes": [
                {
                    "kind": "CapabilityDeclaration",
                    "label": "Too Uncertain",
                    "confidence": 0.2,
                    "evidence_refs": [],
                    "rationale": "Weak signal.",
                }
            ],
            "edges": [],
            "attributes": [],
        }
    )
    low_confidence_snapshot = scan_repo(
        ScanOptions(
            **shared_options,
            llm_config=LLMExtractionConfig(provider="mock", model="mock-model", min_confidence=0.6),
            llm_adapter=_Adapter(weak_reply),
        )
    )
    _validate_schema("discovery-snapshot.schema.yaml", low_confidence_snapshot)
    labels = {node["label"] for node in low_confidence_snapshot["candidates"]["nodes"]}
    assert "Too Uncertain" not in labels
    low_artifact_types = {
        artifact["artifact_type"] for artifact in low_confidence_snapshot["review_artifacts"]
    }
    assert low_artifact_types == {"llm_low_confidence"}
class _Adapter:
    """Minimal adapter double that answers every prompt with one fixed string."""

    def __init__(self, response: str) -> None:
        # The text handed back verbatim as ``content`` on every call.
        self.response = response

    def execute_prompt(self, prompt: str, config: object) -> SimpleNamespace:
        """Return the canned response wrapped in a result namespace.

        ``prompt`` is ignored.  ``model`` echoes ``config.model_name`` when the
        attribute exists and falls back to ``"mock-model"`` otherwise.
        """
        model_name = getattr(config, "model_name", "mock-model")
        result_fields = {
            "content": self.response,
            "model": model_name,
            "usage": {"total_tokens": 1},
            "metadata": {"mock": True},
        }
        return SimpleNamespace(**result_fields)
def _minimal_repo(tmp_path: Path) -> Path:
    """Create a ``fixture-repo`` directory under ``tmp_path`` and return it.

    The directory contains a single UTF-8 ``README.md``.  ``mkdir`` is called
    without ``exist_ok``, so the directory must not exist beforehand.
    """
    repo_dir = tmp_path.joinpath("fixture-repo")
    repo_dir.mkdir()
    readme = repo_dir / "README.md"
    readme.write_text("# Fixture Repo\n\nOwns operational repo signals.\n", encoding="utf-8")
    return repo_dir
def _validate_schema(schema_name: str, payload: dict[str, object]) -> None:
    """Validate ``payload`` against the schema file ``schemas/<schema_name>``.

    Whatever the validator's ``validate`` call raises is propagated unchanged.
    """
    # NOTE(review): the path is relative to the current working directory --
    # presumably the tests are run from the repository root; confirm.
    schema_path = Path("schemas") / schema_name
    draft202012_validator(schema_path).validate(payload)