# markitect-main/tests/unit/infospace/test_evaluate.py

"""Tests for markitect.infospace.evaluate."""

from datetime import datetime
from pathlib import Path

import pytest

from markitect.infospace.config import InfospaceConfig, TopicConfig
from markitect.infospace.evaluate import (
    build_evaluation_prompt,
    content_digest,
    parse_evaluation_response,
    run_entity_evaluation,
)
from markitect.infospace.evaluation import ScoreEntry
from markitect.infospace.models import EntityMeta
from markitect.prompts.execution.llm_adapter import MockLLMAdapter
from markitect.prompts.execution.models import RunConfig


# ── Helpers ──────────────────────────────────────────────────────────


def _entity(**overrides) -> EntityMeta:
    """Build an ``EntityMeta`` for tests.

    Starts from a fixed set of field values and lets any keyword argument
    replace (or add to) them before the object is constructed.
    """
    base_fields = {
        "slug": "division-of-labour",
        "title": "Division Of Labour",
        "h1_raw": "Division Of Labour",
        "definition": "Splitting work into specialised tasks.",
        "source_chapter": "Book I Chapter 1",
        "context": "Smith introduces the concept early.",
        "domain": "Production",
        "source_path": "entities/division-of-labour.md",
    }
    # Later entries win, so caller-supplied overrides take precedence.
    merged = {**base_fields, **overrides}
    return EntityMeta(**merged)


def _config() -> InfospaceConfig:
    """Return an ``InfospaceConfig`` whose topic is named "The Wealth of Nations"."""
    topic = TopicConfig(name="The Wealth of Nations")
    return InfospaceConfig(topic=topic)


_MOCK_RESPONSE = """\
DIMENSION: definition_precision
SCORE: 4.5
RATIONALE: Clear and specific definition of the concept.

DIMENSION: source_grounding
SCORE: 4.0
RATIONALE: Well grounded in Smith's text.

DIMENSION: domain_relevance
SCORE: 5.0
RATIONALE: Directly relevant to production economics.
"""


# ── build_evaluation_prompt ──────────────────────────────────────────


class TestBuildPrompt:
    """Checks on the text returned by ``build_evaluation_prompt``."""

    def test_contains_entity_fields(self):
        """Slug, title, domain and definition text all show up in the prompt."""
        subject = _entity()
        rendered = build_evaluation_prompt(subject, "Test Topic")
        expected_fragments = (
            "division-of-labour",
            "Division Of Labour",
            "Production",
            "Splitting work",
        )
        for fragment in expected_fragments:
            assert fragment in rendered

    def test_contains_topic(self):
        """The topic string passed in appears in the prompt."""
        rendered = build_evaluation_prompt(_entity(), "WoN")
        assert "WoN" in rendered

    def test_contains_dimensions(self):
        """Without explicit dimensions, these two dimension names are present."""
        rendered = build_evaluation_prompt(_entity(), "T")
        for dimension in ("definition_precision", "source_grounding"):
            assert dimension in rendered

    def test_custom_dimensions(self):
        """Explicit dimensions appear, and ``definition_precision`` does not."""
        requested = ["novelty", "coherence"]
        rendered = build_evaluation_prompt(_entity(), "T", dimensions=requested)
        assert "novelty" in rendered
        assert "coherence" in rendered
        assert "definition_precision" not in rendered

    def test_handles_missing_fields(self):
        """Empty definition/context/domain are rendered as placeholder text."""
        sparse = _entity(definition="", context="", domain="")
        rendered = build_evaluation_prompt(sparse, "T")
        for placeholder in ("(no definition)", "(no context)", "(unspecified)"):
            assert placeholder in rendered


# ── content_digest ───────────────────────────────────────────────────


class TestContentDigest:
    """Checks on ``content_digest``."""

    def test_deterministic(self):
        """Digesting the same entity twice yields equal results."""
        subject = _entity()
        first = content_digest(subject)
        second = content_digest(subject)
        assert first == second

    def test_changes_with_content(self):
        """Entities differing only in ``definition`` get different digests."""
        with_a = _entity(definition="A")
        with_b = _entity(definition="B")
        digest_a = content_digest(with_a)
        digest_b = content_digest(with_b)
        assert digest_a != digest_b


# ── parse_evaluation_response ────────────────────────────────────────


class TestParseResponse:
    """Checks on ``parse_evaluation_response`` using the canned mock response."""

    def test_parses_three_dimensions(self):
        """The mock response yields exactly three parsed entries."""
        parsed = parse_evaluation_response(_MOCK_RESPONSE)
        assert len(parsed) == 3

    def test_correct_names(self):
        """Each dimension name from the mock response is among the parsed names."""
        parsed = parse_evaluation_response(_MOCK_RESPONSE)
        found = [entry.name for entry in parsed]
        for expected in ("definition_precision", "source_grounding", "domain_relevance"):
            assert expected in found

    def test_correct_scores(self):
        """Parsed values match the SCORE lines of the mock response."""
        parsed = parse_evaluation_response(_MOCK_RESPONSE)
        lookup = {}
        for entry in parsed:
            lookup[entry.name] = entry
        assert lookup["definition_precision"].value == 4.5
        assert lookup["source_grounding"].value == 4.0
        assert lookup["domain_relevance"].value == 5.0

    def test_correct_rationales(self):
        """The rationale text is carried through for ``definition_precision``."""
        parsed = parse_evaluation_response(_MOCK_RESPONSE)
        lookup = {}
        for entry in parsed:
            lookup[entry.name] = entry
        rationale = lookup["definition_precision"].rationale
        assert "Clear" in rationale

    def test_empty_response(self):
        """An empty string parses to an empty list."""
        parsed = parse_evaluation_response("")
        assert parsed == []

    def test_malformed_score_skipped(self):
        """A record whose SCORE is not numeric produces no entries."""
        raw = (
            "DIMENSION: x\n"
            "SCORE: not-a-number\n"
            "RATIONALE: oops"
        )
        parsed = parse_evaluation_response(raw)
        assert len(parsed) == 0


# ── run_entity_evaluation ────────────────────────────────────────────


class TestRunEntityEvaluation:
    """Checks on ``run_entity_evaluation`` driven by a ``MockLLMAdapter``."""

    def test_evaluates_entities(self, tmp_path):
        """Two entities give total == succeeded == 2 and two adapter calls."""
        mock_llm = MockLLMAdapter(_MOCK_RESPONSE)
        settings = _config()
        batch = [_entity(), _entity(slug="pin-factory", title="Pin Factory")]
        target_dir = tmp_path / "evals"

        outcome = run_entity_evaluation(
            config=settings,
            entities=batch,
            adapter=mock_llm,
            output_dir=target_dir,
        )

        assert outcome.total == 2
        assert outcome.succeeded == 2
        assert mock_llm.call_count == 2

    def test_writes_evaluation_files(self, tmp_path):
        """A ``<slug>.md`` file mentioning a dimension name lands in the output dir."""
        mock_llm = MockLLMAdapter(_MOCK_RESPONSE)
        settings = _config()
        batch = [_entity()]

        run_entity_evaluation(
            config=settings,
            entities=batch,
            adapter=mock_llm,
            output_dir=tmp_path / "evals",
        )

        written = tmp_path / "evals" / "division-of-labour.md"
        assert written.exists()
        contents = written.read_text()
        assert "definition_precision" in contents

    def test_incremental_skip(self, tmp_path):
        """An entity whose digest matches ``previous_digests`` is skipped without an adapter call."""
        mock_llm = MockLLMAdapter(_MOCK_RESPONSE)
        settings = _config()
        subject = _entity()
        known_digest = content_digest(subject)

        outcome = run_entity_evaluation(
            config=settings,
            entities=[subject],
            adapter=mock_llm,
            output_dir=tmp_path,
            previous_digests={subject.slug: known_digest},
        )

        assert outcome.skipped == 1
        assert mock_llm.call_count == 0

    def test_progress_callback_called(self, tmp_path):
        """The progress callback fires once with (1, 1, result) for a single entity."""
        mock_llm = MockLLMAdapter(_MOCK_RESPONSE)
        settings = _config()
        seen = []

        def record(d, t, r):
            # Keep only the comparable parts of each notification.
            seen.append((d, t, r.key))

        run_entity_evaluation(
            config=settings,
            entities=[_entity()],
            adapter=mock_llm,
            output_dir=tmp_path,
            progress_callback=record,
        )

        assert len(seen) == 1
        assert seen[0] == (1, 1, "division-of-labour")

    def test_passes_run_config(self, tmp_path):
        """The ``RunConfig`` given to the runner is what the adapter last saw."""
        mock_llm = MockLLMAdapter(_MOCK_RESPONSE)
        settings = _config()
        llm_options = RunConfig(temperature=0.1, max_tokens=500)

        run_entity_evaluation(
            config=settings,
            entities=[_entity()],
            adapter=mock_llm,
            run_config=llm_options,
            output_dir=tmp_path,
        )

        assert mock_llm.last_config.temperature == 0.1