generated from coulomb/repo-seed
eval history and metrics
This commit is contained in:
196
tests/test_evaluation_history.py
Normal file
196
tests/test_evaluation_history.py
Normal file
@@ -0,0 +1,196 @@
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import yaml
|
||||
|
||||
from infospace_bench import add_artifact, create_infospace, load_infospace
|
||||
from infospace_bench.checks import run_collection_checks
|
||||
from infospace_bench.evaluation import (
|
||||
EntityEvaluation,
|
||||
EvaluationSnapshot,
|
||||
MetricValue,
|
||||
ScoreEntry,
|
||||
)
|
||||
from infospace_bench.evaluation_io import (
|
||||
append_to_history,
|
||||
read_entity_evaluation,
|
||||
read_history,
|
||||
read_snapshot,
|
||||
write_entity_evaluation,
|
||||
write_snapshot,
|
||||
)
|
||||
from infospace_bench.history import (
|
||||
find_snapshot,
|
||||
get_history,
|
||||
metric_trend,
|
||||
read_metrics_file,
|
||||
record_check_results,
|
||||
write_metrics_file,
|
||||
)
|
||||
|
||||
|
||||
NOW = datetime(2026, 5, 14, 10, 30, tzinfo=timezone.utc)
|
||||
|
||||
|
||||
def cli_env() -> dict[str, str]:
    """Return a copy of the process environment for spawning the CLI.

    ``PYTHONPATH`` in the returned mapping lists ``src`` and the
    markitect-tool source checkout first. Entries are joined with
    ``os.pathsep`` so the value is valid on every platform, and any
    ``PYTHONPATH`` inherited from the parent process is kept after the
    project entries instead of being discarded.

    Returns:
        A new ``dict``; ``os.environ`` itself is not modified.
    """
    env = os.environ.copy()
    # NOTE(review): the second entry is a machine-specific absolute path;
    # confirm whether it should come from configuration instead.
    search_path = ["src", "/home/worsch/markitect-tool/src"]
    inherited = env.get("PYTHONPATH")
    if inherited:
        # Project paths stay in front so they win over inherited entries.
        search_path.append(inherited)
    env["PYTHONPATH"] = os.pathsep.join(search_path)
    return env
|
||||
|
||||
|
||||
def evaluation(
    artifact_id: str = "entity/division.md",
    value: float = 4.0,
) -> EntityEvaluation:
    """Build the ``EntityEvaluation`` fixture shared by the tests below.

    Args:
        artifact_id: Identifier stored on the evaluation.
        value: Score assigned to the ``definition_precision`` entry; the
            ``source_grounding`` entry is always ``5.0``.

    Returns:
        An evaluation by evaluator ``"test"`` stamped with ``NOW``, carrying
        the two score entries and a single ``"reviewed"`` note.
    """
    precision_score = ScoreEntry(
        "definition_precision",
        value,
        rationale="Grounded and clear.",
    )
    grounding_score = ScoreEntry("source_grounding", 5.0)
    return EntityEvaluation(
        artifact_id=artifact_id,
        evaluator="test",
        evaluated_at=NOW,
        scores=[precision_score, grounding_score],
        notes=["reviewed"],
    )
|
||||
|
||||
|
||||
def snapshot(snapshot_id: str, value: float, day: int) -> EvaluationSnapshot:
    """Build an ``EvaluationSnapshot`` fixture for a given day in May 2026.

    Args:
        snapshot_id: Identifier stored on the snapshot.
        value: Score passed to the ``evaluation`` fixture; also determines
            the ``coverage_ratio`` metric as ``value / 5.0``.
        day: Day of the month used for ``created_at`` (09:00 UTC).

    Returns:
        A snapshot with schema ``"baseline"``, one artifact evaluation, one
        collection metric tagged with concern ``"C2"``, and test metadata.
    """
    created = datetime(2026, 5, day, 9, tzinfo=timezone.utc)
    entity_evaluation = evaluation(value=value)
    coverage = MetricValue("coverage_ratio", value / 5.0, concern="C2")
    return EvaluationSnapshot(
        snapshot_id=snapshot_id,
        created_at=created,
        schema_name="baseline",
        artifact_count=1,
        artifact_evaluations=[entity_evaluation],
        collection_metrics=[coverage],
        metadata={"source": "test"},
    )
|
||||
|
||||
|
||||
def test_entity_evaluation_file_preserves_frontmatter_and_markdown_body(
    tmp_path: Path,
) -> None:
    """Write an entity evaluation to disk and check both file text and round trip.

    The written file must open with a ``---`` line, contain the artifact id
    line and the expected Markdown headings, and reading it back must yield
    an object equal to a freshly built fixture, rationale included.
    """
    target = tmp_path / "output" / "evaluations" / "division.md"

    write_entity_evaluation(evaluation(), target)
    round_tripped = read_entity_evaluation(target)
    rendered = target.read_text(encoding="utf-8")

    assert rendered.startswith("---\n")
    expected_fragments = (
        "artifact_id: entity/division.md",
        "# Evaluation: entity/division.md",
        "## definition_precision",
    )
    for fragment in expected_fragments:
        assert fragment in rendered
    assert round_tripped == evaluation()
    first_score = round_tripped.scores[0]
    assert first_score.rationale == "Grounded and clear."
|
||||
|
||||
|
||||
def test_snapshot_and_history_round_trip_and_diff_from_named_refs(tmp_path: Path) -> None:
    """Persist two snapshots, then read, look up, trend, and diff them.

    One snapshot is written as a standalone file and both are appended to the
    history file. The history is then read back, snapshots are resolved by
    id (``"snap-a"``) and by date string (``"2026-05-02"``), and the
    resulting diff and metric trend are checked.
    """
    metrics_dir = tmp_path / "output" / "metrics"
    snapshot_path = metrics_dir / "snap-a.yaml"
    history_path = metrics_dir / "history.yaml"
    earlier = snapshot("snap-a", 3.0, 1)
    later = snapshot("snap-b", 4.0, 2)

    write_snapshot(earlier, snapshot_path)
    for entry in (earlier, later):
        append_to_history(entry, history_path)

    history = read_history(history_path)
    before = find_snapshot(history, "snap-a")
    after = find_snapshot(history, "2026-05-02")
    diff = before.diff(after)

    expected_trend = [
        {"date": "2026-05-01T09:00:00+00:00", "value": 0.6},
        {"date": "2026-05-02T09:00:00+00:00", "value": 0.8},
    ]

    assert read_snapshot(snapshot_path) == earlier
    assert [item.snapshot_id for item in history] == ["snap-a", "snap-b"]
    assert get_history(tmp_path) == history
    assert metric_trend(history, "coverage_ratio") == expected_trend
    assert diff.score_changes[0].artifact_id == "entity/division.md"
    assert diff.metric_changes[0].name == "coverage_ratio"
|
||||
|
||||
|
||||
def test_metrics_file_preserves_structured_values_and_recording_merges(
    tmp_path: Path,
) -> None:
    """Seed a metrics file, record check results, and verify the merged file.

    The infospace config is given ``viability`` thresholds, a metrics file
    with a nested mapping, an integer, and a long float is written, and
    ``record_check_results`` is run. Afterwards the metrics file must still
    hold the seeded values (the float as ``1.123457``) alongside the
    recorded ``per_artifact_mean`` and ``coverage_ratio``, and the report's
    viability must have passed.
    """
    infospace = create_infospace(tmp_path, "pilot", name="Pilot")
    chapter = tmp_path / "chapter.md"
    chapter.write_text("# Chapter\n\nSource text.\n", encoding="utf-8")
    add_artifact(infospace.root, chapter, kind="source", title="Chapter")

    # Add viability thresholds to the generated infospace configuration.
    config_path = infospace.root / "infospace.yaml"
    config = yaml.safe_load(config_path.read_text(encoding="utf-8"))
    config["viability"] = {
        "coverage_ratio": {"min": 0.5},
        "redundancy_ratio": {"max": 0.1},
    }
    serialized_config = yaml.safe_dump(config, sort_keys=False)
    config_path.write_text(serialized_config, encoding="utf-8")

    metrics_path = infospace.root / "output" / "metrics" / "metrics.yaml"
    seeded_metrics = {
        "type_distribution": {"source": 1},
        "manual_metric": 7,
        "rounded": 1.1234567,
    }
    write_metrics_file(seeded_metrics, metrics_path)

    artifacts = load_infospace(infospace.root).artifacts
    check_results = run_collection_checks(artifacts)
    report = record_check_results(
        infospace.root,
        check_results,
        artifact_evaluations=[evaluation()],
    )
    metrics = read_metrics_file(metrics_path)

    expected_values = {
        "type_distribution": {"source": 1},
        "manual_metric": 7,
        "rounded": 1.123457,
        "per_artifact_mean": 4.5,
        "coverage_ratio": 1.0,
    }
    for key, expected in expected_values.items():
        assert metrics[key] == expected
    assert report.viability is not None
    assert report.viability["passed"] is True
|
||||
|
||||
|
||||
def test_cli_history_history_diff_and_metrics_json(tmp_path: Path) -> None:
    """Run the ``history``, ``history-diff`` and ``metrics`` CLI commands.

    An infospace with one source artifact and a two-entry history file is
    prepared, each command is run via ``python -m infospace_bench`` in a
    subprocess, and the exit codes and JSON printed on stdout are checked.
    """
    infospace = create_infospace(tmp_path, "pilot", name="Pilot")
    chapter = tmp_path / "chapter.md"
    chapter.write_text("# Chapter\n\nSource text.\n", encoding="utf-8")
    add_artifact(infospace.root, chapter, kind="source", title="Chapter")

    history_path = infospace.root / "output" / "metrics" / "history.yaml"
    for snapshot_id, value, day in (("snap-a", 3.0, 1), ("snap-b", 4.0, 2)):
        append_to_history(snapshot(snapshot_id, value, day), history_path)

    root = str(infospace.root)

    def run_cli(*arguments: str) -> "subprocess.CompletedProcess[str]":
        # Every command runs with the same interpreter and environment;
        # failures are asserted below rather than raised by check=True.
        return subprocess.run(
            [sys.executable, "-m", "infospace_bench", *arguments],
            check=False,
            env=cli_env(),
            text=True,
            capture_output=True,
        )

    history = run_cli("history", root)
    diff = run_cli("history-diff", root, "snap-a", "snap-b")
    metrics = run_cli("metrics", root)

    for completed in (history, diff, metrics):
        assert completed.returncode == 0, completed.stderr

    history_payload = json.loads(history.stdout)
    assert history_payload["history"][1]["snapshot_id"] == "snap-b"
    diff_payload = json.loads(diff.stdout)
    assert diff_payload["diff"]["after_id"] == "snap-b"
    metrics_payload = json.loads(metrics.stdout)
    assert metrics_payload["snapshot"]["artifact_count"] == 1
|
||||
Reference in New Issue
Block a user