"""Tests for evaluation files, snapshot history, metrics files and the CLI."""

import json
import os
import subprocess
import sys
from datetime import datetime, timezone
from pathlib import Path

import yaml

from infospace_bench import add_artifact, create_infospace, load_infospace
from infospace_bench.checks import run_collection_checks
from infospace_bench.evaluation import (
    EntityEvaluation,
    EvaluationSnapshot,
    MetricValue,
    ScoreEntry,
)
from infospace_bench.evaluation_io import (
    append_to_history,
    read_entity_evaluation,
    read_history,
    read_snapshot,
    write_entity_evaluation,
    write_snapshot,
)
from infospace_bench.history import (
    find_snapshot,
    get_history,
    metric_trend,
    read_metrics_file,
    record_check_results,
    write_metrics_file,
)

# Fixed timestamp so evaluation fixtures compare equal across calls.
NOW = datetime(2026, 5, 14, 10, 30, tzinfo=timezone.utc)


def cli_env() -> dict[str, str]:
    """Return a copy of the process environment with the source trees on PYTHONPATH.

    The two source directories are placed first so they take precedence; any
    PYTHONPATH inherited from the parent process is appended rather than
    discarded. Entries are joined with ``os.pathsep`` so the value is valid on
    platforms whose separator is not ``":"``.
    """
    env = os.environ.copy()
    # NOTE(review): the second entry is a machine-specific absolute path and
    # "src" is resolved against the current working directory — confirm both
    # are intended for every environment that runs these tests.
    search_path = ["src", "/home/worsch/markitect-tool/src"]
    inherited = env.get("PYTHONPATH")
    if inherited:
        search_path.append(inherited)
    env["PYTHONPATH"] = os.pathsep.join(search_path)
    return env


def _run_cli(*args: str) -> "subprocess.CompletedProcess[str]":
    """Run ``python -m infospace_bench`` with *args* and capture its text output.

    The call never raises on a non-zero exit status (``check=False``); callers
    assert on ``returncode`` so that stderr can be shown in the failure message.
    """
    return subprocess.run(
        [sys.executable, "-m", "infospace_bench", *args],
        check=False,
        env=cli_env(),
        text=True,
        capture_output=True,
    )


def evaluation(
    artifact_id: str = "entity/division.md",
    value: float = 4.0,
) -> EntityEvaluation:
    """Build an evaluation fixture for *artifact_id*.

    *value* is used as the ``definition_precision`` score; the
    ``source_grounding`` score is always 5.0 and the timestamp is ``NOW``.
    """
    return EntityEvaluation(
        artifact_id=artifact_id,
        evaluator="test",
        evaluated_at=NOW,
        scores=[
            ScoreEntry("definition_precision", value, rationale="Grounded and clear."),
            ScoreEntry("source_grounding", 5.0),
        ],
        notes=["reviewed"],
    )


def snapshot(snapshot_id: str, value: float, day: int) -> EvaluationSnapshot:
    """Build a one-artifact snapshot fixture created on 2026-05-*day* at 09:00 UTC.

    *value* feeds both the embedded evaluation and the ``coverage_ratio``
    metric, which is stored as ``value / 5.0``.
    """
    return EvaluationSnapshot(
        snapshot_id=snapshot_id,
        created_at=datetime(2026, 5, day, 9, tzinfo=timezone.utc),
        schema_name="baseline",
        artifact_count=1,
        artifact_evaluations=[evaluation(value=value)],
        collection_metrics=[MetricValue("coverage_ratio", value / 5.0, concern="C2")],
        metadata={"source": "test"},
    )


def test_entity_evaluation_file_preserves_frontmatter_and_markdown_body(
    tmp_path: Path,
) -> None:
    """A written evaluation file has front matter, a markdown body, and round-trips."""
    path = tmp_path / "output" / "evaluations" / "division.md"

    write_entity_evaluation(evaluation(), path)
    restored = read_entity_evaluation(path)
    text = path.read_text(encoding="utf-8")

    assert text.startswith("---\n")
    assert "artifact_id: entity/division.md" in text
    assert "# Evaluation: entity/division.md" in text
    assert "## definition_precision" in text
    assert restored == evaluation()
    assert restored.scores[0].rationale == "Grounded and clear."


def test_snapshot_and_history_round_trip_and_diff_from_named_refs(tmp_path: Path) -> None:
    """Snapshots round-trip through files and can be looked up by id or date."""
    first = snapshot("snap-a", 3.0, 1)
    second = snapshot("snap-b", 4.0, 2)
    snapshot_path = tmp_path / "output" / "metrics" / "snap-a.yaml"
    history_path = tmp_path / "output" / "metrics" / "history.yaml"

    write_snapshot(first, snapshot_path)
    append_to_history(first, history_path)
    append_to_history(second, history_path)
    history = read_history(history_path)
    # The second reference is a date string rather than a snapshot id.
    diff = find_snapshot(history, "snap-a").diff(find_snapshot(history, "2026-05-02"))

    assert read_snapshot(snapshot_path) == first
    assert [item.snapshot_id for item in history] == ["snap-a", "snap-b"]
    assert get_history(tmp_path) == history
    assert metric_trend(history, "coverage_ratio") == [
        {"date": "2026-05-01T09:00:00+00:00", "value": 0.6},
        {"date": "2026-05-02T09:00:00+00:00", "value": 0.8},
    ]
    assert diff.score_changes[0].artifact_id == "entity/division.md"
    assert diff.metric_changes[0].name == "coverage_ratio"


def test_metrics_file_preserves_structured_values_and_recording_merges(
    tmp_path: Path,
) -> None:
    """Recording check results keeps pre-existing metrics and adds computed ones."""
    infospace = create_infospace(tmp_path, "pilot", name="Pilot")
    source = tmp_path / "chapter.md"
    source.write_text("# Chapter\n\nSource text.\n", encoding="utf-8")
    add_artifact(infospace.root, source, kind="source", title="Chapter")

    # Add viability thresholds to the infospace configuration.
    config_path = infospace.root / "infospace.yaml"
    config = yaml.safe_load(config_path.read_text(encoding="utf-8"))
    config["viability"] = {
        "coverage_ratio": {"min": 0.5},
        "redundancy_ratio": {"max": 0.1},
    }
    config_path.write_text(yaml.safe_dump(config, sort_keys=False), encoding="utf-8")

    # Seed the metrics file before recording so the merge can be observed.
    metrics_path = infospace.root / "output" / "metrics" / "metrics.yaml"
    write_metrics_file(
        {"type_distribution": {"source": 1}, "manual_metric": 7, "rounded": 1.1234567},
        metrics_path,
    )

    report = record_check_results(
        infospace.root,
        run_collection_checks(load_infospace(infospace.root).artifacts),
        artifact_evaluations=[evaluation()],
    )
    metrics = read_metrics_file(metrics_path)

    assert metrics["type_distribution"] == {"source": 1}
    assert metrics["manual_metric"] == 7
    assert metrics["rounded"] == 1.123457
    assert metrics["per_artifact_mean"] == 4.5
    assert metrics["coverage_ratio"] == 1.0
    assert report.viability is not None
    assert report.viability["passed"] is True


def test_cli_history_history_diff_and_metrics_json(tmp_path: Path) -> None:
    """The history, history-diff and metrics subcommands exit 0 and print JSON."""
    infospace = create_infospace(tmp_path, "pilot", name="Pilot")
    source = tmp_path / "chapter.md"
    source.write_text("# Chapter\n\nSource text.\n", encoding="utf-8")
    add_artifact(infospace.root, source, kind="source", title="Chapter")
    history_path = infospace.root / "output" / "metrics" / "history.yaml"
    append_to_history(snapshot("snap-a", 3.0, 1), history_path)
    append_to_history(snapshot("snap-b", 4.0, 2), history_path)

    history = _run_cli("history", str(infospace.root))
    diff = _run_cli("history-diff", str(infospace.root), "snap-a", "snap-b")
    metrics = _run_cli("metrics", str(infospace.root))

    assert history.returncode == 0, history.stderr
    assert diff.returncode == 0, diff.stderr
    assert metrics.returncode == 0, metrics.stderr
    assert json.loads(history.stdout)["history"][1]["snapshot_id"] == "snap-b"
    assert json.loads(diff.stdout)["diff"]["after_id"] == "snap-b"
    assert json.loads(metrics.stdout)["snapshot"]["artifact_count"] == 1