eval history and metrics

2026-05-14 15:35:04 +02:00
parent d0c1f82863
commit 7f54dec585
9 changed files with 870 additions and 16 deletions
--- a/tests/test_evaluation_history.py
+++ b/tests/test_evaluation_history.py
@@ -0,0 +1,196 @@
+import json
+import os
+import subprocess
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+import yaml
+
+from infospace_bench import add_artifact, create_infospace, load_infospace
+from infospace_bench.checks import run_collection_checks
+from infospace_bench.evaluation import (
+    EntityEvaluation,
+    EvaluationSnapshot,
+    MetricValue,
+    ScoreEntry,
+)
+from infospace_bench.evaluation_io import (
+    append_to_history,
+    read_entity_evaluation,
+    read_history,
+    read_snapshot,
+    write_entity_evaluation,
+    write_snapshot,
+)
+from infospace_bench.history import (
+    find_snapshot,
+    get_history,
+    metric_trend,
+    read_metrics_file,
+    record_check_results,
+    write_metrics_file,
+)
+
+
+NOW = datetime(2026, 5, 14, 10, 30, tzinfo=timezone.utc)  # fixed UTC timestamp used as evaluated_at by evaluation()
+
+
+def cli_env() -> dict[str, str]:  # NOTE(review): the absolute markitect-tool path is machine-specific
+    """Return the environment for CLI subprocesses: source roots first, then any inherited PYTHONPATH."""
+    roots = ("src", "/home/worsch/markitect-tool/src", os.environ.get("PYTHONPATH", ""))
+    return {**os.environ, "PYTHONPATH": os.pathsep.join(root for root in roots if root)}
+
+
+def evaluation(artifact_id: str = "entity/division.md", value: float = 4.0) -> EntityEvaluation:
+    """Build the EntityEvaluation fixture shared by these tests, stamped with NOW.
+
+    ``value`` becomes the ``definition_precision`` score; ``source_grounding`` stays at 5.0.
+    """
+    precision = ScoreEntry("definition_precision", value, rationale="Grounded and clear.")
+    grounding = ScoreEntry("source_grounding", 5.0)
+    return EntityEvaluation(
+        artifact_id=artifact_id,
+        evaluator="test",
+        evaluated_at=NOW,
+        scores=[precision, grounding],
+        notes=["reviewed"],
+    )
+
+
+def snapshot(snapshot_id: str, value: float, day: int) -> EvaluationSnapshot:
+    """Build a one-artifact snapshot created at 09:00 UTC on ``day`` May 2026."""
+    created = datetime(2026, 5, day, 9, tzinfo=timezone.utc)
+    coverage = MetricValue("coverage_ratio", value / 5.0, concern="C2")
+    return EvaluationSnapshot(
+        snapshot_id=snapshot_id, created_at=created, schema_name="baseline", artifact_count=1,
+        artifact_evaluations=[evaluation(value=value)],
+        collection_metrics=[coverage],
+        metadata={"source": "test"},
+    )
+
+
+def test_entity_evaluation_file_preserves_frontmatter_and_markdown_body(tmp_path: Path) -> None:
+    """One evaluation written to disk has front matter, a Markdown body, and reloads equal."""
+    target = tmp_path / "output" / "evaluations" / "division.md"
+    write_entity_evaluation(evaluation(), target)
+    reloaded = read_entity_evaluation(target)
+    rendered = target.read_text(encoding="utf-8")
+
+    # The file opens with a front-matter delimiter and carries the artifact id.
+    assert rendered.startswith("---\n")
+    assert "artifact_id: entity/division.md" in rendered
+    # The Markdown body has a title heading and a section for the first score.
+    assert "# Evaluation: entity/division.md" in rendered
+    assert "## definition_precision" in rendered
+    assert reloaded == evaluation()
+    assert reloaded.scores[0].rationale == "Grounded and clear."
+
+
+def test_snapshot_and_history_round_trip_and_diff_from_named_refs(tmp_path: Path) -> None:
+    """Snapshot and history files reload equal; a diff is built from an id ref and a date ref."""
+    metrics_dir = tmp_path / "output" / "metrics"
+    snapshot_path, history_path = metrics_dir / "snap-a.yaml", metrics_dir / "history.yaml"
+    first, second = snapshot("snap-a", 3.0, 1), snapshot("snap-b", 4.0, 2)
+
+    write_snapshot(first, snapshot_path)
+    for entry in (first, second):
+        append_to_history(entry, history_path)
+    history = read_history(history_path)
+    older, newer = find_snapshot(history, "snap-a"), find_snapshot(history, "2026-05-02")
+    diff = older.diff(newer)
+
+    assert read_snapshot(snapshot_path) == first
+    assert [entry.snapshot_id for entry in history] == ["snap-a", "snap-b"]
+    assert get_history(tmp_path) == history
+    assert metric_trend(history, "coverage_ratio") == [
+        {"date": "2026-05-01T09:00:00+00:00", "value": 0.6},
+        {"date": "2026-05-02T09:00:00+00:00", "value": 0.8},
+    ]
+    assert diff.score_changes[0].artifact_id == "entity/division.md"
+    assert diff.metric_changes[0].name == "coverage_ratio"
+
+
+def test_metrics_file_preserves_structured_values_and_recording_merges(
+    tmp_path: Path,
+) -> None:
+    """Recording check results keeps values already in metrics.yaml and adds computed ones."""
+    infospace = create_infospace(tmp_path, "pilot", name="Pilot")
+    root = infospace.root
+    chapter = tmp_path / "chapter.md"
+    chapter.write_text("# Chapter\n\nSource text.\n", encoding="utf-8")
+    add_artifact(root, chapter, kind="source", title="Chapter")
+
+    # Add viability thresholds to the infospace config.
+    config_path = root / "infospace.yaml"
+    config = yaml.safe_load(config_path.read_text(encoding="utf-8"))
+    thresholds = {"coverage_ratio": {"min": 0.5}, "redundancy_ratio": {"max": 0.1}}
+    config["viability"] = thresholds
+    config_path.write_text(yaml.safe_dump(config, sort_keys=False), encoding="utf-8")
+
+    # Seed the metrics file with hand-written values before anything is recorded.
+    metrics_path = root / "output" / "metrics" / "metrics.yaml"
+    seeded = {"type_distribution": {"source": 1}, "manual_metric": 7, "rounded": 1.1234567}
+    write_metrics_file(seeded, metrics_path)
+
+    # Record the collection checks together with one entity evaluation.
+    check_results = run_collection_checks(load_infospace(root).artifacts)
+    report = record_check_results(root, check_results, artifact_evaluations=[evaluation()])
+    recorded = read_metrics_file(metrics_path)
+
+    # Seeded entries are still present (the float comes back with six decimals) ...
+    assert recorded["type_distribution"] == {"source": 1}
+    assert recorded["manual_metric"] == 7
+    assert recorded["rounded"] == 1.123457
+    # ... next to the metrics that were not seeded above.
+    assert recorded["per_artifact_mean"] == 4.5
+    assert recorded["coverage_ratio"] == 1.0
+    assert report.viability is not None
+    assert report.viability["passed"] is True
+
+
+def test_cli_history_history_diff_and_metrics_json(tmp_path: Path) -> None:
+    """Run the ``history``, ``history-diff`` and ``metrics`` subcommands as subprocesses.
+
+    Each command must exit with status 0 and print JSON on stdout.
+    """
+    infospace = create_infospace(tmp_path, "pilot", name="Pilot")
+    chapter = tmp_path / "chapter.md"
+    chapter.write_text("# Chapter\n\nSource text.\n", encoding="utf-8")
+    add_artifact(infospace.root, chapter, kind="source", title="Chapter")
+
+    # Seed two history entries for the commands to read.
+    history_path = infospace.root / "output" / "metrics" / "history.yaml"
+    for snapshot_id, value, day in (
+        ("snap-a", 3.0, 1),
+        ("snap-b", 4.0, 2),
+    ):
+        append_to_history(snapshot(snapshot_id, value, day), history_path)
+
+    def run_cli(*arguments: str) -> subprocess.CompletedProcess[str]:
+        """Invoke ``-m infospace_bench`` on the current interpreter and capture its text output."""
+        return subprocess.run(
+            [sys.executable, "-m", "infospace_bench", *arguments],
+            check=False,  # return codes are asserted below so stderr can be reported
+            env=cli_env(),
+            text=True,
+            capture_output=True,
+        )
+
+    root = str(infospace.root)
+    history = run_cli("history", root)
+    diff = run_cli("history-diff", root, "snap-a", "snap-b")
+    metrics = run_cli("metrics", root)
+
+    # A failing command shows its stderr as the assertion message.
+    assert history.returncode == 0, history.stderr
+    assert diff.returncode == 0, diff.stderr
+    assert metrics.returncode == 0, metrics.stderr
+
+    # Each command nests its result under its own top-level key.
+    history_payload = json.loads(history.stdout)
+    assert history_payload["history"][1]["snapshot_id"] == "snap-b"
+    diff_payload = json.loads(diff.stdout)
+    assert diff_payload["diff"]["after_id"] == "snap-b"
+    metrics_payload = json.loads(metrics.stdout)
+    assert metrics_payload["snapshot"]["artifact_count"] == 1