# repo-scoping/tests/test_llm_extraction.py

import pytest

from repo_registry.core.models import ContentChunk, Repository
from repo_registry.llm_extraction import (
    LLMCandidateExtractor,
    LLMExtractionError,
    create_llm_connect_adapter,
)


class Response:
    """Bare-bones response object returned by ``FakeAdapter.execute_prompt``.

    It carries a single attribute, ``content``, holding whatever value was
    passed to the constructor.
    """

    def __init__(self, content):
        """Store ``content`` unchanged on the instance."""
        self.content = content


class FakeAdapter:
    """Test double for an LLM adapter that replays a canned reply.

    The adapter remembers the most recent ``prompt`` and ``config`` handed to
    ``execute_prompt`` so tests can inspect what the extractor sent.
    """

    def __init__(self, content):
        """Remember the canned ``content`` and reset the recorded call state."""
        self.content = content
        # Placeholders until execute_prompt is called for the first time.
        self.last_config = object()
        self.last_prompt = ""

    def execute_prompt(self, prompt, config):
        """Record the call arguments and return the canned reply.

        Returns a ``Response`` whose ``content`` is the value given to the
        constructor.
        """
        self.last_prompt, self.last_config = prompt, config
        canned_reply = Response(self.content)
        return canned_reply


def repository():
    """Build the ``Repository`` fixture ("MailRouter") shared by these tests."""
    fields = {
        "id": 1,
        "name": "MailRouter",
        "url": "/tmp/mail-router",
        "description": "Routes inbound email.",
        "branch": "main",
        "status": "analyzed",
    }
    return Repository(**fields)


def chunk():
    """Build a two-line README.md documentation ``ContentChunk`` fixture."""
    fields = {
        "id": 1,
        "repository_id": 1,
        "analysis_run_id": 1,
        "snapshot_id": 1,
        "path": "README.md",
        "kind": "documentation",
        "start_line": 1,
        "end_line": 2,
        "text": "# MailRouter\nRoutes incoming customer email.",
    }
    return ContentChunk(**fields)


def test_llm_prompt_filters_derived_scope_and_labels_source_roles():
    """Check which chunks reach the prompt and how they are labelled.

    Three chunks are supplied (SCOPE.md, INTENT.md, CLAUDE.md). The prompt
    captured by the fake adapter must cite INTENT.md together with its
    source role, must omit the SCOPE.md and CLAUDE.md chunks, and must carry
    the "Do not use SCOPE.md" instruction.
    """

    def make_chunk(chunk_id, path, kind, last_line, text, role):
        # All chunks share the repository/run/snapshot ids and start at
        # line 1; only the arguments passed in vary between them.
        return ContentChunk(
            id=chunk_id,
            repository_id=1,
            analysis_run_id=1,
            snapshot_id=1,
            path=path,
            kind=kind,
            start_line=1,
            end_line=last_line,
            text=text,
            metadata={"source_role": role},
        )

    adapter = FakeAdapter('{"abilities": []}')
    extractor = LLMCandidateExtractor(adapter)
    chunks = [
        make_chunk(
            1,
            "SCOPE.md",
            "scope",
            3,
            "# SCOPE\n\nOld approved LLM routing entry.",
            "derived_scope",
        ),
        make_chunk(
            2,
            "INTENT.md",
            "intent",
            3,
            "# INTENT\n\nProvide lightweight IAM.",
            "intent_summary",
        ),
        make_chunk(
            3,
            "CLAUDE.md",
            "documentation",
            2,
            "# CLAUDE\n\nAgent guidance.",
            "agent_guidance",
        ),
    ]

    extractor.extract(repository(), chunks)

    # (fragment, must_be_present) pairs, verified in this order.
    expectations = [
        ("Source: INTENT.md", True),
        ("source_role=intent_summary", True),
        ("Source: SCOPE.md", False),
        ("Old approved LLM routing entry", False),
        ("Source: CLAUDE.md", False),
        ("Do not use SCOPE.md", True),
    ]
    for fragment, must_be_present in expectations:
        assert (fragment in adapter.last_prompt) is must_be_present


def test_llm_candidate_extractor_parses_structured_response():
    """A nested JSON reply is parsed into abilities/capabilities/features.

    Also checks that the prompt sent to the adapter contains the
    "Return strict JSON only" instruction and the chunk's
    ``README.md:1-2`` location.
    """
    # Canned adapter reply: one ability with one capability, which in turn
    # has one feature and one piece of evidence.
    payload = """
        {
          "abilities": [
            {
              "name": "Business Email Routing",
              "description": "Routes inbound customer email.",
              "source_paths": ["README.md"],
              "capabilities": [
                {
                  "name": "Classify Incoming Email",
                  "description": "Classify messages.",
                  "inputs": ["email body"],
                  "outputs": ["intent"],
                  "source_paths": ["README.md"],
                  "features": [
                    {
                      "name": "POST /classify",
                      "type": "REST endpoint",
                      "location": "app.py",
                      "source_paths": ["app.py"]
                    }
                  ],
                  "evidence": [
                    {
                      "type": "documentation",
                      "reference": "README.md",
                      "strength": "medium",
                      "source_paths": ["README.md"]
                    }
                  ]
                }
              ]
            }
          ]
        }
        """
    adapter = FakeAdapter(payload)
    extractor = LLMCandidateExtractor(adapter)

    abilities = extractor.extract(repository(), [chunk()])

    # Prompt contents first, then the parsed structure.
    sent_prompt = adapter.last_prompt
    assert "Return strict JSON only" in sent_prompt
    assert "README.md:1-2" in sent_prompt

    first_ability = abilities[0]
    assert first_ability.name == "Business Email Routing"
    assert first_ability.capabilities[0].features[0].name == "POST /classify"
    assert first_ability.capabilities[0].evidence[0].reference == "README.md"


def test_llm_candidate_extractor_accepts_fenced_json():
    """A reply wrapped in a ```json code fence is still parsed."""
    fenced_reply = '```json\n{"abilities": [{"name": "A", "capabilities": []}]}\n```'
    adapter = FakeAdapter(fenced_reply)
    extractor = LLMCandidateExtractor(adapter)

    abilities = extractor.extract(repository(), [])

    first_ability = abilities[0]
    assert first_ability.name == "A"


def test_llm_candidate_extractor_rejects_invalid_json():
    """A non-JSON reply must surface as ``LLMExtractionError``."""
    bad_reply_adapter = FakeAdapter("not json")

    # Construction and extraction both stay inside the raises-block, so an
    # error from either step satisfies the expectation.
    with pytest.raises(LLMExtractionError):
        extractor = LLMCandidateExtractor(bad_reply_adapter)
        extractor.extract(repository(), [])


def test_llm_connect_factory_reports_missing_dependency():
    """The factory raises ``LLMExtractionError`` with an install hint.

    NOTE(review): this presumably relies on llm-connect being absent from
    the test environment -- confirm how the suite behaves when it is
    installed.
    """
    with pytest.raises(LLMExtractionError) as excinfo:
        create_llm_connect_adapter("mock")

    message = str(excinfo.value)
    assert "llm-connect is not installed" in message