markitect-main/tests/unit/infospace/test_evaluation.py

"""Tests for markitect.infospace evaluation models and I/O."""

from datetime import datetime
from pathlib import Path

import pytest

from markitect.infospace import (
    EntityEvaluation,
    EvaluationSnapshot,
    MetricChange,
    MetricValue,
    ScoreChange,
    ScoreEntry,
    SnapshotDiff,
    append_to_history,
    diff_snapshots,
    read_entity_evaluation,
    read_history,
    read_snapshot,
    write_entity_evaluation,
    write_snapshot,
)


# ── Helpers ──────────────────────────────────────────────────────────

# Fixed, naive (no tzinfo) timestamp shared by the sample builders below so
# that timestamp comparisons in the round-trip tests are deterministic.
_NOW = datetime(2026, 2, 19, 12, 0, 0)


def _sample_scores() -> list:
    """Build three sample score entries; only the first two carry a rationale."""
    precision = ScoreEntry("definition_precision", 4.5, rationale="Clear and specific.")
    grounding = ScoreEntry("source_grounding", 4.0, rationale="Well grounded.")
    relevance = ScoreEntry("domain_relevance", 4.5)
    return [precision, grounding, relevance]


def _sample_evaluation(**overrides) -> EntityEvaluation:
    """Create a sample ``EntityEvaluation``.

    Any keyword argument given in *overrides* replaces the corresponding
    default field before the model is constructed.
    """
    base_fields = {
        "entity_slug": "division-of-labour",
        "evaluator": "openrouter/anthropic/claude-3.5-sonnet",
        "scores": _sample_scores(),
        "evaluated_at": _NOW,
        "notes": ["Strong entity with clear provenance"],
    }
    # Later keys win, so caller-supplied overrides shadow the defaults.
    return EntityEvaluation(**{**base_fields, **overrides})


def _sample_metric() -> MetricValue:
    """Create a sample collection metric with a concern tag and details."""
    metric = MetricValue(
        "coverage_ratio",
        0.85,
        concern="C2",
        details={"checked": 85},
    )
    return metric


def _sample_snapshot(**overrides) -> EvaluationSnapshot:
    """Create a sample ``EvaluationSnapshot``.

    The default snapshot holds one sample entity evaluation and one sample
    metric; keyword arguments in *overrides* replace individual fields.
    """
    base_fields = {
        "snapshot_id": "2026-02-19",
        "created_at": _NOW,
        "schema_name": "Economic Entity",
        "entity_count": 1,
        "entity_evaluations": [_sample_evaluation()],
        "collection_metrics": [_sample_metric()],
        "metadata": {"version": "1.0"},
    }
    # Later keys win, so caller-supplied overrides shadow the defaults.
    return EvaluationSnapshot(**{**base_fields, **overrides})


# ── Model tests ──────────────────────────────────────────────────────


class TestScoreEntry:
    """Dict serialisation behaviour of ``ScoreEntry``."""

    def test_to_dict_from_dict_round_trip(self):
        """Every field survives ``to_dict`` followed by ``from_dict``."""
        original = ScoreEntry("precision", 4.5, 5.0, "Good definition.")
        rebuilt = ScoreEntry.from_dict(original.to_dict())
        for attr in ("name", "value", "max_value", "rationale"):
            assert getattr(rebuilt, attr) == getattr(original, attr)

    def test_to_dict_omits_empty_rationale(self):
        """An entry built without a rationale serialises without that key."""
        serialised = ScoreEntry("precision", 4.5).to_dict()
        assert "rationale" not in serialised

    def test_from_dict_defaults(self):
        """Missing optional keys fall back to ``max_value=5.0`` and an empty rationale."""
        minimal = {"name": "x", "value": 3.0}
        entry = ScoreEntry.from_dict(minimal)
        assert entry.max_value == 5.0
        assert entry.rationale == ""


class TestEntityEvaluation:
    """Behaviour of ``EntityEvaluation``: derived score and dict round trip."""

    def test_overall_score_is_mean(self):
        """``overall_score`` is the arithmetic mean of the score values."""
        ev = _sample_evaluation()
        # Sample scores are 4.5, 4.0 and 4.5, so the mean is ~4.333.
        expected_mean = (4.5 + 4.0 + 4.5) / 3
        # pytest.approx reports both values on failure, unlike a bare abs() check.
        assert ev.overall_score == pytest.approx(expected_mean, abs=0.001)

    def test_overall_score_zero_scores(self):
        """With no scores at all, ``overall_score`` is 0.0."""
        ev = _sample_evaluation(scores=[])
        assert ev.overall_score == 0.0

    def test_to_dict_from_dict_round_trip(self):
        """Identity fields, timestamp, notes and score count survive a dict round trip."""
        ev = _sample_evaluation()
        d = ev.to_dict()
        restored = EntityEvaluation.from_dict(d)
        assert restored.entity_slug == ev.entity_slug
        assert restored.evaluator == ev.evaluator
        assert len(restored.scores) == len(ev.scores)
        assert restored.evaluated_at == ev.evaluated_at
        assert restored.notes == ev.notes

    def test_to_dict_includes_overall_score(self):
        """The serialised dict carries the derived ``overall_score`` value."""
        ev = _sample_evaluation()
        d = ev.to_dict()
        assert "overall_score" in d
        # Loose tolerance so the check does not depend on exact serialised precision.
        assert d["overall_score"] == pytest.approx(4.3333, abs=0.01)


class TestMetricValue:
    """Dict serialisation behaviour of ``MetricValue``."""

    def test_to_dict_from_dict_round_trip(self):
        """Name, value, concern and details survive a dict round trip."""
        original = _sample_metric()
        rebuilt = MetricValue.from_dict(original.to_dict())
        for attr in ("name", "value", "concern", "details"):
            assert getattr(rebuilt, attr) == getattr(original, attr)

    def test_to_dict_omits_empty_concern(self):
        """A metric built with only name and value serialises without optional keys."""
        serialised = MetricValue("x", 1.0).to_dict()
        for optional_key in ("concern", "details"):
            assert optional_key not in serialised


class TestEvaluationSnapshot:
    """Dict serialisation behaviour of ``EvaluationSnapshot``."""

    def test_to_dict_from_dict_round_trip(self):
        """Scalar fields, metadata and collection sizes survive a dict round trip."""
        original = _sample_snapshot()
        rebuilt = EvaluationSnapshot.from_dict(original.to_dict())
        for attr in ("snapshot_id", "created_at", "schema_name", "entity_count"):
            assert getattr(rebuilt, attr) == getattr(original, attr)
        assert len(rebuilt.entity_evaluations) == 1
        assert len(rebuilt.collection_metrics) == 1
        assert rebuilt.metadata == original.metadata

    def test_from_dict_empty_lists(self):
        """Omitted collection and metadata keys load as empty containers."""
        minimal = {
            "snapshot_id": "test",
            "created_at": _NOW.isoformat(),
            "schema_name": "Test",
            "entity_count": 0,
        }
        loaded = EvaluationSnapshot.from_dict(minimal)
        assert loaded.entity_evaluations == []
        assert loaded.collection_metrics == []
        assert loaded.metadata == {}


# ── Per-entity file I/O ──────────────────────────────────────────────


class TestEntityEvaluationIO:
    """Writing an ``EntityEvaluation`` to a file and reading it back."""

    @staticmethod
    def _write_sample(path: Path) -> EntityEvaluation:
        """Write the sample evaluation to *path* and return the evaluation written."""
        ev = _sample_evaluation()
        write_entity_evaluation(ev, path)
        return ev

    def test_write_creates_file(self, tmp_path: Path):
        """Writing produces a file at the requested path."""
        p = tmp_path / "eval.md"
        self._write_sample(p)
        assert p.exists()

    def test_file_has_yaml_frontmatter(self, tmp_path: Path):
        """The file opens with a ``---`` line and contains a closing ``---`` line."""
        p = tmp_path / "eval.md"
        self._write_sample(p)
        text = p.read_text()
        assert text.startswith("---\n")
        assert "\n---\n" in text

    def test_frontmatter_contains_expected_keys(self, tmp_path: Path):
        """Each expected key name appears somewhere in the written file."""
        p = tmp_path / "eval.md"
        self._write_sample(p)
        text = p.read_text()
        for key in ["entity_slug", "evaluator", "evaluated_at", "overall_score", "scores"]:
            assert key in text

    def test_markdown_body_contains_rationales(self, tmp_path: Path):
        """Rationale texts and a per-dimension heading appear in the file."""
        p = tmp_path / "eval.md"
        self._write_sample(p)
        text = p.read_text()
        assert "Clear and specific." in text
        assert "Well grounded." in text
        assert "## definition_precision" in text

    def test_read_back_matches_original(self, tmp_path: Path):
        """Reading the written file restores the top-level fields and score count."""
        p = tmp_path / "eval.md"
        ev = self._write_sample(p)
        restored = read_entity_evaluation(p)
        assert restored.entity_slug == ev.entity_slug
        assert restored.evaluator == ev.evaluator
        assert restored.evaluated_at == ev.evaluated_at
        assert restored.notes == ev.notes
        assert len(restored.scores) == len(ev.scores)

    def test_round_trip_preserves_scores(self, tmp_path: Path):
        """Each score keeps its name, value and max value across a file round trip."""
        p = tmp_path / "eval.md"
        ev = self._write_sample(p)
        restored = read_entity_evaluation(p)
        # zip() stops at the shorter sequence, so without this check the loop
        # below would pass vacuously if scores were dropped on read.
        assert len(restored.scores) == len(ev.scores)
        for orig, rest in zip(ev.scores, restored.scores):
            assert rest.name == orig.name
            assert rest.value == orig.value
            assert rest.max_value == orig.max_value

    def test_round_trip_preserves_rationales(self, tmp_path: Path):
        """Rationales are restored per score; a score without one reads back as ``""``."""
        p = tmp_path / "eval.md"
        self._write_sample(p)
        restored = read_entity_evaluation(p)
        assert restored.scores[0].rationale == "Clear and specific."
        assert restored.scores[1].rationale == "Well grounded."
        # Third score has no rationale
        assert restored.scores[2].rationale == ""

    def test_write_creates_parent_dirs(self, tmp_path: Path):
        """Writing to a path whose parent directories do not exist still succeeds."""
        p = tmp_path / "deep" / "nested" / "eval.md"
        self._write_sample(p)
        assert p.exists()


# ── Snapshot I/O ─────────────────────────────────────────────────────


class TestSnapshotIO:
    """Writing an ``EvaluationSnapshot`` to a file and reading it back."""

    @staticmethod
    def _round_trip(tmp_path):
        """Write the sample snapshot under *tmp_path*; return ``(original, restored)``."""
        original = _sample_snapshot()
        target = tmp_path / "snapshot.yaml"
        write_snapshot(original, target)
        return original, read_snapshot(target)

    def test_write_creates_file(self, tmp_path):
        """Writing produces a file at the requested path."""
        target = tmp_path / "snapshot.yaml"
        write_snapshot(_sample_snapshot(), target)
        assert target.exists()

    def test_read_back_matches_original(self, tmp_path):
        """Scalar snapshot fields are identical after a file round trip."""
        original, loaded = self._round_trip(tmp_path)
        for attr in ("snapshot_id", "created_at", "schema_name", "entity_count"):
            assert getattr(loaded, attr) == getattr(original, attr)

    def test_round_trip_preserves_entity_evaluations(self, tmp_path):
        """The single entity evaluation is restored with its slug and three scores."""
        _, loaded = self._round_trip(tmp_path)
        assert len(loaded.entity_evaluations) == 1
        evaluation = loaded.entity_evaluations[0]
        assert evaluation.entity_slug == "division-of-labour"
        assert len(evaluation.scores) == 3

    def test_round_trip_preserves_collection_metrics(self, tmp_path):
        """The single collection metric is restored with name, value and concern."""
        _, loaded = self._round_trip(tmp_path)
        assert len(loaded.collection_metrics) == 1
        metric = loaded.collection_metrics[0]
        assert (metric.name, metric.value, metric.concern) == ("coverage_ratio", 0.85, "C2")


# ── History ──────────────────────────────────────────────────────────


class TestHistory:
    """Appending snapshots to a history file and reading them back."""

    def test_append_creates_new_file(self, tmp_path):
        """Appending to a missing history file creates it with one entry."""
        history_file = tmp_path / "history.yaml"
        append_to_history(_sample_snapshot(), history_file)
        assert history_file.exists()
        assert len(read_history(history_file)) == 1

    def test_append_adds_to_existing(self, tmp_path):
        """A second append extends the file instead of replacing the first entry."""
        history_file = tmp_path / "history.yaml"
        first = _sample_snapshot(snapshot_id="snap-1")
        second = _sample_snapshot(snapshot_id="snap-2")
        for snapshot in (first, second):
            append_to_history(snapshot, history_file)
        entries = read_history(history_file)
        assert len(entries) == 2
        assert entries[0].snapshot_id == "snap-1"
        assert entries[1].snapshot_id == "snap-2"

    def test_multiple_appends_all_preserved(self, tmp_path):
        """Five consecutive appends are all present, in append order."""
        history_file = tmp_path / "history.yaml"
        for index in range(5):
            append_to_history(_sample_snapshot(snapshot_id=f"snap-{index}"), history_file)
        entries = read_history(history_file)
        assert len(entries) == 5
        stored_ids = [entry.snapshot_id for entry in entries]
        assert stored_ids == [f"snap-{index}" for index in range(5)]

    def test_read_history_returns_list_in_order(self, tmp_path):
        """Entries come back in the order they were appended."""
        history_file = tmp_path / "history.yaml"
        earlier = _sample_snapshot(snapshot_id="a")
        later = _sample_snapshot(snapshot_id="b")
        append_to_history(earlier, history_file)
        append_to_history(later, history_file)
        entries = read_history(history_file)
        assert entries[0].snapshot_id == "a"
        assert entries[1].snapshot_id == "b"


# ── Diffing ──────────────────────────────────────────────────────────


class TestDiffSnapshots:
    """Behaviour of ``diff_snapshots`` and the summary text of its result."""

    def test_identical_snapshots_empty_diff(self):
        """Diffing a snapshot against itself yields four empty change lists."""
        snapshot = _sample_snapshot()
        result = diff_snapshots(snapshot, snapshot)
        assert result.added_entities == []
        assert result.removed_entities == []
        assert result.score_changes == []
        assert result.metric_changes == []

    def test_added_entity(self):
        """An entity present only in the later snapshot is reported as added."""
        empty = _sample_snapshot(entity_evaluations=[])
        populated = _sample_snapshot()
        result = diff_snapshots(empty, populated)
        assert "division-of-labour" in result.added_entities
        assert result.removed_entities == []

    def test_removed_entity(self):
        """An entity present only in the earlier snapshot is reported as removed."""
        populated = _sample_snapshot()
        empty = _sample_snapshot(entity_evaluations=[])
        result = diff_snapshots(populated, empty)
        assert "division-of-labour" in result.removed_entities
        assert result.added_entities == []

    def test_changed_score(self):
        """A changed dimension value produces one score change with before/after."""
        old_eval = _sample_evaluation(scores=[ScoreEntry("precision", 4.0)])
        new_eval = _sample_evaluation(scores=[ScoreEntry("precision", 4.8)])
        result = diff_snapshots(
            _sample_snapshot(entity_evaluations=[old_eval]),
            _sample_snapshot(entity_evaluations=[new_eval]),
        )
        assert len(result.score_changes) == 1
        change = result.score_changes[0]
        assert change.entity_slug == "division-of-labour"
        assert change.dimension == "precision"
        assert change.before == 4.0
        assert change.after == 4.8

    def test_changed_metric(self):
        """A changed metric value produces one metric change with before/after."""
        old_metrics = [MetricValue("coverage_ratio", 0.80)]
        new_metrics = [MetricValue("coverage_ratio", 0.90)]
        result = diff_snapshots(
            _sample_snapshot(collection_metrics=old_metrics),
            _sample_snapshot(collection_metrics=new_metrics),
        )
        assert len(result.metric_changes) == 1
        change = result.metric_changes[0]
        assert change.name == "coverage_ratio"
        assert change.before == 0.80
        assert change.after == 0.90

    def test_summary_readable(self):
        """The summary mentions both snapshot ids and the changed score and metric."""
        old_eval = _sample_evaluation(scores=[ScoreEntry("precision", 4.0)])
        new_eval = _sample_evaluation(scores=[ScoreEntry("precision", 4.8)])
        earlier = _sample_snapshot(
            snapshot_id="snap-1",
            entity_evaluations=[old_eval],
            collection_metrics=[MetricValue("coverage", 0.80)],
        )
        later = _sample_snapshot(
            snapshot_id="snap-2",
            entity_evaluations=[new_eval],
            collection_metrics=[MetricValue("coverage", 0.90)],
        )
        summary_text = diff_snapshots(earlier, later).summary()
        for fragment in ("snap-1", "snap-2", "precision", "coverage"):
            assert fragment in summary_text

    def test_summary_no_changes(self):
        """With nothing changed, the summary says so explicitly."""
        snapshot = _sample_snapshot()
        summary_text = diff_snapshots(snapshot, snapshot).summary()
        assert "No changes" in summary_text