eval history and metrics

2026-05-14 15:35:04 +02:00
parent d0c1f82863
commit 7f54dec585
9 changed files with 870 additions and 16 deletions
--- a/README.md
+++ b/README.md
@@ -20,6 +20,7 @@ Start with:
 - `docs/markitect-main-scope-assessment.md`
 - `docs/markitect-tool-adapter.md`
 - `docs/entity-relation-model.md`
+- `docs/evaluation-history-and-metrics.md`
 - `docs/orthogonal-successor-roadmap.md`
 - `docs/legacy-infospace-feature-inventory.md`
 - `docs/successor-boundary-interface-map.md`
--- a/docs/evaluation-history-and-metrics.md
+++ b/docs/evaluation-history-and-metrics.md
@@ -0,0 +1,43 @@
+# Evaluation History And Metrics
+
+`infospace-bench` keeps evaluation history as committed, inspectable files under
+each infospace root. This replaces the legacy `markitect-project` history
+workflow while retaining the useful behaviors: Markdown evaluation files,
+append-only snapshot history, metric merging, and viability checks.
+
+## Files
+
+- `output/evaluations/*.md`: per-artifact evaluation files with YAML
+  frontmatter and a human-readable Markdown body.
+- `output/metrics/metrics.yaml`: latest merged metrics. Collection metrics,
+  evaluation-derived metrics, and structured non-numeric values are preserved.
+- `output/metrics/history.yaml`: append-only list of evaluation snapshots.
+- `output/metrics/snapshots/<snapshot-id>.yaml`: named snapshot copies for
+  reproducible diffs.
+- `output/metrics/viability.yaml`: structured viability report generated from
+  `infospace.yaml` thresholds and the current metrics file.
+
+## Replacement Mapping
+
+The old infospace history code used entity-oriented names such as
+`entity_count`, `entity_evaluations`, and `entity_slug`. The successor model
+uses artifact-oriented names:
+
+- `artifact_count` replaces `entity_count`
+- `artifact_evaluations` replaces `entity_evaluations`
+- `artifact_id` replaces `entity_slug`
+
+Readers accept the old snapshot aliases where practical so legacy fixtures can
+be inspected, but new files should use the artifact-oriented vocabulary.
+
+## CLI
+
+```bash
+python3 -m infospace_bench metrics infospaces/bootstrap-pilot
+python3 -m infospace_bench history infospaces/bootstrap-pilot
+python3 -m infospace_bench history infospaces/bootstrap-pilot --metric coverage_ratio
+python3 -m infospace_bench history-diff infospaces/bootstrap-pilot snap-a snap-b
+```
+
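+Snapshot references may be exact snapshot IDs or ISO-like dates such as
+`2026-05-14`. An exact ID match wins; otherwise the reference is parsed as a
+date (midnight when no time is given, UTC when no offset is given) and resolves
+to the snapshot whose creation time is closest to it.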
--- a/src/infospace_bench/init.py
+++ b/src/infospace_bench/init.py
@@ -1,5 +1,22 @@
 from .errors import InfospaceError
 from .evaluation import EntityEvaluation, EvaluationSnapshot, MetricValue, ScoreEntry
+from .evaluation_io import (
+    append_to_history,
+    read_entity_evaluation,
+    read_history,
+    read_snapshot,
+    write_entity_evaluation,
+    write_snapshot,
+)
+from .history import (
+    find_snapshot,
+    get_history,
+    get_latest_snapshot,
+    metric_trend,
+    read_metrics_file,
+    record_check_results,
+    write_metrics_file,
+)
 from .lifecycle import add_artifact, create_infospace, load_infospace
 from .models import (
    DisciplineBinding,
@@ -26,8 +43,21 @@ __all__ = [
    "TopicConfig",
    "ViabilityThreshold",
    "add_artifact",
+    "append_to_history",
    "create_infospace",
+    "find_snapshot",
+    "get_history",
+    "get_latest_snapshot",
    "list_entities",
    "list_relations",
    "load_infospace",
+    "metric_trend",
+    "read_entity_evaluation",
+    "read_history",
+    "read_metrics_file",
+    "read_snapshot",
+    "record_check_results",
+    "write_entity_evaluation",
+    "write_metrics_file",
+    "write_snapshot",
 ]
--- a/src/infospace_bench/cli.py
+++ b/src/infospace_bench/cli.py
@@ -5,7 +5,9 @@ import json
 import sys
 from pathlib import Path

+from .checks import run_collection_checks
 from .errors import InfospaceError
+from .history import find_snapshot, get_history, metric_trend, record_check_results
 from .lifecycle import add_artifact, create_infospace, load_infospace
 from .markdown_adapter import validate_infospace_artifacts
 from .semantics import list_entities, list_relations
@@ -42,6 +44,24 @@ def build_parser() -> argparse.ArgumentParser:
    relations = sub.add_parser("relations", help="List parsed relation artifacts")
    relations.add_argument("root")

+    history = sub.add_parser("history", help="List evaluation snapshot history")
+    history.add_argument("root")
+    history.add_argument("--metric", default="")
+
+    history_diff = sub.add_parser(
+        "history-diff",
+        help="Diff two evaluation snapshots by snapshot ID or date",
+    )
+    history_diff.add_argument("root")
+    history_diff.add_argument("before")
+    history_diff.add_argument("after")
+
+    metrics = sub.add_parser(
+        "metrics",
+        help="Run collection checks and persist metrics/history",
+    )
+    metrics.add_argument("root")
+
    return parser


@@ -96,6 +116,40 @@ def main(argv: list[str] | None = None) -> int:
                    ]
                }
            )
+        elif args.command == "history":
+            history = get_history(Path(args.root))
+            if args.metric:
+                _write_json(
+                    {
+                        "metric": args.metric,
+                        "trend": metric_trend(history, args.metric),
+                    }
+                )
+            else:
+                _write_json({"history": [item.to_dict() for item in history]})
+        elif args.command == "history-diff":
+            history = get_history(Path(args.root))
+            before = find_snapshot(history, args.before)
+            after = find_snapshot(history, args.after)
+            if before is None or after is None:
+                missing = []
+                if before is None:
+                    missing.append(args.before)
+                if after is None:
+                    missing.append(args.after)
+                raise InfospaceError(
+                    "missing_snapshot",
+                    "Could not resolve requested snapshot reference",
+                    {"missing_refs": missing},
+                )
+            _write_json({"diff": before.diff(after).to_dict()})
+        elif args.command == "metrics":
+            infospace = load_infospace(Path(args.root))
+            result = record_check_results(
+                infospace.root,
+                run_collection_checks(infospace.artifacts),
+            )
+            _write_json(result.to_dict())
        else:
            parser.error(f"Unhandled command: {args.command}")
    except InfospaceError as exc:
--- a/src/infospace_bench/evaluation.py
+++ b/src/infospace_bench/evaluation.py
@@ -40,6 +40,11 @@ class EntityEvaluation:
    evaluated_at: datetime
    notes: list[str] = field(default_factory=list)

+    @property
+    def entity_slug(self) -> str:
+        """Legacy alias for readers moving from entity-oriented history files."""
+        return self.artifact_id
+
    @property
    def overall_score(self) -> float:
        if not self.scores:
@@ -102,6 +107,16 @@ class EvaluationSnapshot:
    collection_metrics: list[MetricValue] = field(default_factory=list)
    metadata: dict[str, Any] = field(default_factory=dict)

+    @property
+    def entity_count(self) -> int:
+        """Legacy alias retained for old infospace history readers."""
+        return self.artifact_count
+
+    @property
+    def entity_evaluations(self) -> list[EntityEvaluation]:
+        """Legacy alias retained for old infospace history readers."""
+        return self.artifact_evaluations
+
    def to_dict(self) -> dict[str, Any]:
        return {
            "snapshot_id": self.snapshot_id,
@@ -122,11 +137,14 @@ class EvaluationSnapshot:
        return cls(
            snapshot_id=str(data["snapshot_id"]),
            created_at=datetime.fromisoformat(str(data["created_at"])),
-            schema_name=str(data["schema_name"]),
-            artifact_count=int(data["artifact_count"]),
+            schema_name=str(data.get("schema_name") or "default"),
+            artifact_count=int(data.get("artifact_count", data.get("entity_count", 0))),
            artifact_evaluations=[
                EntityEvaluation.from_dict(item)
-                for item in data.get("artifact_evaluations", [])
+                for item in data.get(
+                    "artifact_evaluations",
+                    data.get("entity_evaluations", []),
+                )
            ],
            collection_metrics=[
                MetricValue.from_dict(item) for item in data.get("collection_metrics", [])
@@ -134,6 +152,9 @@ class EvaluationSnapshot:
            metadata=dict(data.get("metadata") or {}),
        )

+    def diff(self, after: "EvaluationSnapshot") -> "SnapshotDiff":
+        """Return the diff that treats this snapshot as "before" and ``after`` as "after"."""
+        return diff_snapshots(self, after)
+

@dataclass(frozen=True)
 class ScoreChange:
@@ -146,6 +167,20 @@ class ScoreChange:
    def delta(self) -> float:
        return self.after - self.before

+    @property
+    def entity_slug(self) -> str:
+        """Legacy alias for old diff consumers."""
+        return self.artifact_id
+
+    def to_dict(self) -> dict[str, Any]:
+        """Serialize the score change, including the derived ``delta``."""
+        exported = ("artifact_id", "dimension", "before", "after", "delta")
+        return {field_name: getattr(self, field_name) for field_name in exported}
+

@dataclass(frozen=True)
 class MetricChange:
@@ -157,6 +192,14 @@ class MetricChange:
    def delta(self) -> float:
        return self.after - self.before

+    def to_dict(self) -> dict[str, Any]:
+        """Serialize the metric change, including the derived ``delta``."""
+        return dict(
+            name=self.name,
+            before=self.before,
+            after=self.after,
+            delta=self.delta,
+        )
+

@dataclass(frozen=True)
 class SnapshotDiff:
@@ -167,6 +210,51 @@ class SnapshotDiff:
    score_changes: list[ScoreChange] = field(default_factory=list)
    metric_changes: list[MetricChange] = field(default_factory=list)

+    @property
+    def added_entities(self) -> list[str]:
+        """Legacy alias for old history diff output."""
+        return self.added_artifacts
+
+    @property
+    def removed_entities(self) -> list[str]:
+        """Legacy alias for old history diff output."""
+        return self.removed_artifacts
+
+    def to_dict(self) -> dict[str, Any]:
+        """Serialize the diff; nested changes are converted with their ``to_dict``."""
+        serialized_scores = [item.to_dict() for item in self.score_changes]
+        serialized_metrics = [item.to_dict() for item in self.metric_changes]
+        return {
+            "before_id": self.before_id,
+            "after_id": self.after_id,
+            "added_artifacts": self.added_artifacts,
+            "removed_artifacts": self.removed_artifacts,
+            "score_changes": serialized_scores,
+            "metric_changes": serialized_metrics,
+        }
+
+    def summary(self) -> str:
+        """Render the diff as newline-joined, human-readable text.
+
+        The first line names both snapshots. An empty diff is followed by a
+        single ``No changes.`` line; otherwise added artifacts, removed
+        artifacts, score changes, and metric changes follow in that order.
+        """
+        header = f"Snapshot diff: {self.before_id} -> {self.after_id}"
+        details = [
+            f"Added artifact: {artifact_id}" for artifact_id in self.added_artifacts
+        ]
+        details += [
+            f"Removed artifact: {artifact_id}"
+            for artifact_id in self.removed_artifacts
+        ]
+        details += [
+            f"Score {item.artifact_id} {item.dimension}: "
+            f"{item.before} -> {item.after} ({item.delta:+.4f})"
+            for item in self.score_changes
+        ]
+        details += [
+            f"Metric {item.name}: "
+            f"{item.before} -> {item.after} ({item.delta:+.4f})"
+            for item in self.metric_changes
+        ]
+        if not details:
+            details = ["No changes."]
+        return "\n".join([header, *details])
+

 def diff_snapshots(
    before: EvaluationSnapshot,
@@ -174,22 +262,29 @@ def diff_snapshots(
 ) -> SnapshotDiff:
    before_scores = _score_index(before)
    after_scores = _score_index(after)
-    before_artifacts = {artifact_id for artifact_id, _ in before_scores}
-    after_artifacts = {artifact_id for artifact_id, _ in after_scores}
+    before_artifacts = {
+        evaluation.artifact_id for evaluation in before.artifact_evaluations
+    }
+    after_artifacts = {evaluation.artifact_id for evaluation in after.artifact_evaluations}

    score_changes = [
-        ScoreChange(artifact_id, dimension, before_scores[key], after_scores[key])
-        for key in sorted(before_scores.keys() & after_scores.keys())
+        ScoreChange(
+            artifact_id,
+            dimension,
+            before_scores.get(key, 0.0),
+            after_scores.get(key, 0.0),
+        )
+        for key in sorted(before_scores.keys() | after_scores.keys())
        for artifact_id, dimension in [key]
-        if before_scores[key] != after_scores[key]
+        if before_scores.get(key) != after_scores.get(key)
    ]

    before_metrics = {metric.name: metric.value for metric in before.collection_metrics}
    after_metrics = {metric.name: metric.value for metric in after.collection_metrics}
    metric_changes = [
-        MetricChange(name, before_metrics[name], after_metrics[name])
-        for name in sorted(before_metrics.keys() & after_metrics.keys())
-        if before_metrics[name] != after_metrics[name]
+        MetricChange(name, before_metrics.get(name, 0.0), after_metrics.get(name, 0.0))
+        for name in sorted(before_metrics.keys() | after_metrics.keys())
+        if before_metrics.get(name) != after_metrics.get(name)
    ]

    return SnapshotDiff(
--- a/src/infospace_bench/evaluation_io.py
+++ b/src/infospace_bench/evaluation_io.py
@@ -0,0 +1,181 @@
+from __future__ import annotations
+
+from datetime import datetime
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+from .errors import InfospaceError
+from .evaluation import (
+    EntityEvaluation,
+    EvaluationSnapshot,
+    ScoreEntry,
+    diff_snapshots,
+)
+
+FRONTMATTER_MARKER = "---"
+
+
+def write_entity_evaluation(
+    evaluation: EntityEvaluation,
+    path: str | Path,
+) -> None:
+    """Write ``evaluation`` as a Markdown file with YAML frontmatter.
+
+    The frontmatter carries the structured data: artifact ID, evaluator,
+    timestamp, overall score rounded to four places, the scores, and the notes
+    when there are any. The body holds a title followed by one
+    ``## <name> - <value> / <max_value>`` section per score, with the rationale
+    underneath when the score has one. Parent directories are created as needed.
+    """
+    destination = Path(path)
+    meta: dict[str, Any] = {
+        "artifact_id": evaluation.artifact_id,
+        "evaluator": evaluation.evaluator,
+        "evaluated_at": evaluation.evaluated_at.isoformat(),
+        "overall_score": round(evaluation.overall_score, 4),
+        "scores": [entry.to_dict() for entry in evaluation.scores],
+    }
+    if evaluation.notes:
+        meta["notes"] = evaluation.notes
+
+    sections: list[str] = []
+    for entry in evaluation.scores:
+        sections += [f"## {entry.name} - {entry.value} / {entry.max_value}", ""]
+        if entry.rationale:
+            sections += [entry.rationale, ""]
+
+    document = "\n".join(
+        [
+            FRONTMATTER_MARKER,
+            yaml.safe_dump(meta, sort_keys=False).rstrip(),
+            FRONTMATTER_MARKER,
+            "",
+            f"# Evaluation: {evaluation.artifact_id}",
+            "",
+            *sections,
+        ]
+    )
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    destination.write_text(document, encoding="utf-8")
+
+
+def read_entity_evaluation(path: str | Path) -> EntityEvaluation:
+    """Load an evaluation from a Markdown file with YAML frontmatter.
+
+    Structured fields come from the frontmatter; the legacy ``entity_slug`` key
+    is accepted when ``artifact_id`` is absent. When the body has a ``## ``
+    section named after a score, that section's text is used as the score's
+    rationale.
+
+    Raises:
+        InfospaceError: ``invalid_evaluation_file`` when the frontmatter is
+            missing or malformed, or when ``artifact_id``, ``evaluator`` or
+            ``evaluated_at`` is absent.
+    """
+    source = Path(path)
+    frontmatter, body = _read_frontmatter_markdown(source)
+    artifact_id = frontmatter.get("artifact_id", frontmatter.get("entity_slug"))
+    # Report every absent required field through the same structured error
+    # instead of letting the later lookups fail with a bare KeyError.
+    missing = [] if artifact_id else ["artifact_id"]
+    missing += [key for key in ("evaluator", "evaluated_at") if key not in frontmatter]
+    if missing:
+        raise InfospaceError(
+            "invalid_evaluation_file",
+            f"Missing {', '.join(missing)} in evaluation file: {source}",
+            {"path": str(source)},
+        )
+    rationales = _parse_rationales(body)
+    scores = [
+        _score_with_body_rationale(ScoreEntry.from_dict(item), rationales)
+        # ``scores:`` with no value loads as None; treat it like an empty list.
+        for item in frontmatter.get("scores") or []
+    ]
+    return EntityEvaluation(
+        artifact_id=str(artifact_id),
+        evaluator=str(frontmatter["evaluator"]),
+        scores=scores,
+        evaluated_at=datetime.fromisoformat(str(frontmatter["evaluated_at"])),
+        notes=list(frontmatter.get("notes") or []),
+    )
+
+
+def write_snapshot(snapshot: EvaluationSnapshot, path: str | Path) -> None:
+    """Serialize ``snapshot`` to a YAML file at ``path``, creating parent directories."""
+    destination = Path(path)
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    serialized = yaml.safe_dump(snapshot.to_dict(), sort_keys=False)
+    destination.write_text(serialized, encoding="utf-8")
+
+
+def read_snapshot(path: str | Path) -> EvaluationSnapshot:
+    """Load one snapshot from a YAML file.
+
+    Raises:
+        InfospaceError: ``invalid_snapshot_file`` when the file's top-level YAML
+            value is not a mapping.
+    """
+    source = Path(path)
+    loaded = yaml.safe_load(source.read_text(encoding="utf-8"))
+    if isinstance(loaded, dict):
+        return EvaluationSnapshot.from_dict(loaded)
+    raise InfospaceError(
+        "invalid_snapshot_file",
+        f"Expected mapping in snapshot file: {source}",
+        {"path": str(source)},
+    )
+
+
+def append_to_history(snapshot: EvaluationSnapshot, history_path: str | Path) -> None:
+    """Add ``snapshot`` to the end of the YAML history list at ``history_path``.
+
+    Existing entries are read back, re-serialized, and written together with the
+    new one, so the whole file is rewritten on every call. Parent directories
+    are created as needed.
+    """
+    target = Path(history_path)
+    entries: list[dict[str, Any]] = []
+    if target.exists():
+        entries.extend(item.to_dict() for item in read_history(target))
+    entries.append(snapshot.to_dict())
+    target.parent.mkdir(parents=True, exist_ok=True)
+    serialized = yaml.safe_dump(entries, sort_keys=False)
+    target.write_text(serialized, encoding="utf-8")
+
+
+def read_history(history_path: str | Path) -> list[EvaluationSnapshot]:
+    """Load every snapshot stored in the YAML history file.
+
+    A path that is not a regular file, or a file with no YAML content, yields
+    an empty list.
+
+    Raises:
+        InfospaceError: ``invalid_history_file`` when the top-level YAML value
+            is not a list.
+    """
+    source = Path(history_path)
+    if not source.is_file():
+        return []
+    loaded = yaml.safe_load(source.read_text(encoding="utf-8"))
+    if loaded is None:
+        return []
+    if isinstance(loaded, list):
+        return [EvaluationSnapshot.from_dict(entry) for entry in loaded]
+    raise InfospaceError(
+        "invalid_history_file",
+        f"Expected list in history file: {source}",
+        {"path": str(source)},
+    )
+
+
+def _read_frontmatter_markdown(path: Path) -> tuple[dict[str, Any], str]:
+    """Split a Markdown file into its YAML frontmatter mapping and body text.
+
+    The file must open with a ``---`` line; the frontmatter runs up to the next
+    line that is exactly ``---`` and everything after that line is the body.
+
+    Raises:
+        InfospaceError: ``invalid_evaluation_file`` when the opening marker is
+            missing, the block is never closed, or the frontmatter is not a
+            YAML mapping.
+    """
+    text = path.read_text(encoding="utf-8")
+    lines = text.split("\n")
+    if len(lines) < 2 or lines[0] != FRONTMATTER_MARKER:
+        raise InfospaceError(
+            "invalid_evaluation_file",
+            f"Missing YAML frontmatter in evaluation file: {path}",
+            {"path": str(path)},
+        )
+    # Match the closing marker as a whole line rather than searching for
+    # "\n---\n": this also accepts a marker on the last line of a file without
+    # a trailing newline, and an empty frontmatter block can no longer skip
+    # ahead to a ``---`` rule inside the body.
+    try:
+        closing = lines.index(FRONTMATTER_MARKER, 1)
+    except ValueError:
+        raise InfospaceError(
+            "invalid_evaluation_file",
+            f"Unclosed YAML frontmatter in evaluation file: {path}",
+            {"path": str(path)},
+        ) from None
+    raw = "\n".join(lines[1:closing])
+    body = "\n".join(lines[closing + 1 :])
+    data = yaml.safe_load(raw)
+    if not isinstance(data, dict):
+        raise InfospaceError(
+            "invalid_evaluation_file",
+            f"Expected mapping frontmatter in evaluation file: {path}",
+            {"path": str(path)},
+        )
+    return data, body
+
+
+def _parse_rationales(body: str) -> dict[str, str]:
+    """Map each ``## `` heading in ``body`` to the stripped text beneath it.
+
+    The key is the heading text before the first `` - `` separator. Lines ahead
+    of the first heading are ignored, and a repeated heading name keeps the
+    text of its last section.
+    """
+    sections: dict[str, list[str]] = {}
+    active: list[str] | None = None
+    for raw_line in body.splitlines():
+        if raw_line.startswith("## "):
+            title = raw_line[3:].strip()
+            name = title.split(" - ", maxsplit=1)[0].strip()
+            # Start a fresh bucket; later lines accumulate into it.
+            active = sections[name] = []
+        elif active is not None:
+            active.append(raw_line)
+    return {name: "\n".join(chunk).strip() for name, chunk in sections.items()}
+
+
+def _score_with_body_rationale(
+    score: ScoreEntry,
+    rationales: dict[str, str],
+) -> ScoreEntry:
+    """Return a copy of ``score`` whose rationale comes from ``rationales``.
+
+    When ``rationales`` has no entry for the score's name, the score's own
+    rationale is kept.
+    """
+    if score.name in rationales:
+        chosen = rationales[score.name]
+    else:
+        chosen = score.rationale
+    return ScoreEntry(
+        name=score.name,
+        value=score.value,
+        max_value=score.max_value,
+        rationale=chosen,
+    )
--- a/src/infospace_bench/history.py
+++ b/src/infospace_bench/history.py
@@ -0,0 +1,254 @@
+from __future__ import annotations
+
+import uuid
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+import yaml
+
+from .checks import CollectionCheckReport
+from .evaluation import EntityEvaluation, EvaluationSnapshot, MetricValue
+from .evaluation_io import append_to_history, read_history, write_snapshot
+from .lifecycle import load_infospace
+from .viability import evaluate_viability
+
+METRICS_PATH = Path("output/metrics/metrics.yaml")
+HISTORY_PATH = Path("output/metrics/history.yaml")
+VIABILITY_PATH = Path("output/metrics/viability.yaml")
+SNAPSHOT_DIR = Path("output/metrics/snapshots")
+
+
+@dataclass(frozen=True)
+class RecordedCheckResult:
+    """Outcome of recording one check run: snapshot, merged metrics, viability."""
+
+    # Snapshot built for this run.
+    snapshot: EvaluationSnapshot
+    # Metrics mapping associated with the run.
+    metrics: dict[str, Any]
+    # Viability report as a plain dict, or None when none was produced.
+    viability: dict[str, Any] | None = None
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return a plain-dict form with the snapshot serialized via ``to_dict``."""
+        return dict(
+            snapshot=self.snapshot.to_dict(),
+            metrics=self.metrics,
+            viability=self.viability,
+        )
+
+
+def snapshot_from_checks(
+    check_report: CollectionCheckReport,
+    artifact_count: int,
+    *,
+    schema_name: str = "default",
+    metadata: dict[str, Any] | None = None,
+    artifact_evaluations: list[EntityEvaluation] | None = None,
+) -> EvaluationSnapshot:
+    """Build a new snapshot from a collection check report.
+
+    Numeric check metrics come first, sorted by name and tagged with their
+    concern code; metrics derived from ``artifact_evaluations`` follow, sorted
+    by name and tagged ``evaluation``. The snapshot ID is the first eight
+    characters of a random UUID4 and ``created_at`` is the current UTC time.
+    """
+    evaluations = artifact_evaluations or []
+    check_metrics = _numeric_metrics(check_report.metrics)
+    derived_metrics = _evaluation_metrics(evaluations)
+
+    collected = [
+        MetricValue(name=name, value=value, concern=_concern_for_metric(name))
+        for name, value in sorted(check_metrics.items())
+    ]
+    collected += [
+        MetricValue(name=name, value=value, concern="evaluation")
+        for name, value in sorted(derived_metrics.items())
+    ]
+    return EvaluationSnapshot(
+        snapshot_id=str(uuid.uuid4())[:8],
+        created_at=datetime.now(timezone.utc),
+        schema_name=schema_name,
+        artifact_count=artifact_count,
+        artifact_evaluations=evaluations,
+        collection_metrics=collected,
+        metadata=metadata or {},
+    )
+
+
+def write_metrics_file(metrics: dict[str, Any], path: str | Path) -> None:
+    """Write ``metrics`` as key-sorted YAML, creating parent directories.
+
+    Each top-level value goes through ``_normalize_metric_value`` before it is
+    dumped.
+    """
+    destination = Path(path)
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    normalized = {
+        key: _normalize_metric_value(metrics[key]) for key in sorted(metrics)
+    }
+    serialized = yaml.safe_dump(normalized, sort_keys=True)
+    destination.write_text(serialized, encoding="utf-8")
+
+
+def read_metrics_file(path: str | Path) -> dict[str, Any]:
+    """Load the metrics mapping from YAML.
+
+    Returns an empty dict when ``path`` is not a regular file or when its
+    top-level YAML value is not a mapping.
+    """
+    source = Path(path)
+    if source.is_file():
+        loaded = yaml.safe_load(source.read_text(encoding="utf-8"))
+        if isinstance(loaded, dict):
+            return loaded
+    return {}
+
+
+def record_check_results(
+    root: str | Path,
+    check_report: CollectionCheckReport,
+    *,
+    artifact_evaluations: list[EntityEvaluation] | None = None,
+    schema_name: str = "default",
+    metadata: dict[str, Any] | None = None,
+) -> RecordedCheckResult:
+    """Persist one check run: metrics file, history entry, snapshot copy, viability.
+
+    Files are written under the loaded infospace root in this order: the merged
+    metrics file, the history list, a per-snapshot YAML copy named after the
+    snapshot ID, and the viability report.
+
+    Returns:
+        The new snapshot, the merged metrics mapping, and the viability report.
+        The returned metrics are the in-memory merge; ``write_metrics_file``
+        normalizes values when writing, so floats on disk are rounded.
+    """
+    infospace = load_infospace(root)
+    # Prefer the count reported by the checks; fall back to the loaded artifacts.
+    artifact_count = int(
+        check_report.details.get("artifact_count", len(infospace.artifacts))
+    )
+    snapshot = snapshot_from_checks(
+        check_report,
+        artifact_count,
+        schema_name=schema_name,
+        # Caller metadata is applied last, so it can override the default source.
+        metadata={"source": "collection-checks", **(metadata or {})},
+        artifact_evaluations=artifact_evaluations,
+    )
+    metrics_file = infospace.root / METRICS_PATH
+    existing = read_metrics_file(metrics_file)
+    # Later entries win: stored values are kept unless this run's check metrics
+    # or evaluation-derived metrics supply the same key.
+    merged = {
+        **existing,
+        **check_report.metrics,
+        **_evaluation_metrics(artifact_evaluations or []),
+    }
+    write_metrics_file(merged, metrics_file)
+
+    history_path = infospace.root / HISTORY_PATH
+    append_to_history(snapshot, history_path)
+    write_snapshot(
+        snapshot,
+        infospace.root / SNAPSHOT_DIR / f"{snapshot.snapshot_id}.yaml",
+    )
+
+    # Viability is evaluated against the merged metrics of this run.
+    viability = build_viability_report(infospace.root, merged)
+    write_viability_report(infospace.root, viability)
+    return RecordedCheckResult(snapshot=snapshot, metrics=merged, viability=viability)
+
+
+def get_history(root: str | Path) -> list[EvaluationSnapshot]:
+    """Return the snapshots stored in the history file under ``root``."""
+    history_file = Path(root) / HISTORY_PATH
+    return read_history(history_file)
+
+
+def get_latest_snapshot(root: str | Path) -> EvaluationSnapshot | None:
+    """Return the last snapshot in the history under ``root``, or None if it is empty."""
+    snapshots = get_history(root)
+    if not snapshots:
+        return None
+    return snapshots[-1]
+
+
+def find_snapshot(
+    history: list[EvaluationSnapshot],
+    ref: str,
+) -> EvaluationSnapshot | None:
+    """Resolve ``ref`` to a snapshot: exact ID first, then date lookup.
+
+    The first snapshot whose ID equals ``ref`` wins; when none matches, the
+    result of ``find_snapshot_by_date`` is returned.
+    """
+    exact = next((item for item in history if item.snapshot_id == ref), None)
+    if exact is not None:
+        return exact
+    return find_snapshot_by_date(history, ref)
+
+
+def find_snapshot_by_date(
+    history: list[EvaluationSnapshot],
+    date_ref: str,
+) -> EvaluationSnapshot | None:
+    """Return the snapshot created closest in time to ``date_ref``.
+
+    ``date_ref`` is an ISO date or timestamp. A bare date means midnight, and a
+    reference or snapshot time without an offset is treated as UTC.
+
+    Returns:
+        The nearest snapshot, or None when ``history`` is empty or ``date_ref``
+        cannot be parsed.
+    """
+    if not history:
+        return None
+    try:
+        # ``fromisoformat`` already reads a bare date as midnight, so no time
+        # suffix is appended. Parsing the reference as given also accepts
+        # space-separated timestamps such as ``2026-05-14 09:00``.
+        target = datetime.fromisoformat(date_ref.strip())
+    except ValueError:
+        return None
+    if target.tzinfo is None:
+        target = target.replace(tzinfo=timezone.utc)
+
+    def delta(snapshot: EvaluationSnapshot) -> float:
+        # Naive and aware datetimes cannot be subtracted; normalize first.
+        created_at = snapshot.created_at
+        if created_at.tzinfo is None:
+            created_at = created_at.replace(tzinfo=timezone.utc)
+        return abs((created_at - target).total_seconds())
+
+    return min(history, key=delta)
+
+
+def metric_trend(
+    history: list[EvaluationSnapshot],
+    metric_name: str,
+) -> list[dict[str, Any]]:
+    """List ``{"date", "value"}`` points for ``metric_name`` across ``history``.
+
+    Snapshots without the metric are skipped; when a snapshot lists the metric
+    more than once, its first occurrence is used.
+    """
+    points: list[dict[str, Any]] = []
+    for snapshot in history:
+        found = next(
+            (item for item in snapshot.collection_metrics if item.name == metric_name),
+            None,
+        )
+        if found is None:
+            continue
+        points.append({"date": snapshot.created_at.isoformat(), "value": found.value})
+    return points
+
+
+def build_viability_report(
+    root: str | Path,
+    metrics: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    """Evaluate the infospace's viability thresholds and return a plain-dict report.
+
+    When ``metrics`` is None the stored metrics file under the infospace root is
+    used. Only numeric metrics are passed to the viability evaluation.
+    """
+    infospace = load_infospace(root)
+    if metrics is None:
+        metrics = read_metrics_file(infospace.root / METRICS_PATH)
+    outcome = evaluate_viability(_numeric_metrics(metrics), infospace.config.viability)
+
+    passed = outcome.passed
+    results: dict[str, Any] = {}
+    for name, result in outcome.results.items():
+        results[name] = {
+            "metric": result.metric,
+            "value": result.value,
+            "threshold": result.threshold.to_dict(),
+            "passed": result.passed,
+        }
+    return {"passed": passed, "results": results}
+
+
+def write_viability_report(root: str | Path, report: dict[str, Any]) -> None:
+    """Write ``report`` as YAML to the viability file under ``root``."""
+    destination = Path(root) / VIABILITY_PATH
+    destination.parent.mkdir(parents=True, exist_ok=True)
+    serialized = yaml.safe_dump(report, sort_keys=False)
+    destination.write_text(serialized, encoding="utf-8")
+
+
+def _evaluation_metrics(evaluations: list[EntityEvaluation]) -> dict[str, float | int]:
+    """Return the evaluation count and mean overall score, or ``{}`` when empty."""
+    count = len(evaluations)
+    if count == 0:
+        return {}
+    total = sum(item.overall_score for item in evaluations)
+    return {
+        "evaluated_artifact_count": count,
+        "per_artifact_mean": total / count,
+    }
+
+
+def _numeric_metrics(metrics: dict[str, Any]) -> dict[str, float]:
+    """Keep only int/float values (booleans excluded), as floats under string keys."""
+    numeric: dict[str, float] = {}
+    for name, value in metrics.items():
+        # bool is a subclass of int, so it has to be rejected explicitly.
+        if isinstance(value, bool) or not isinstance(value, (int, float)):
+            continue
+        numeric[str(name)] = float(value)
+    return numeric
+
+
+def _normalize_metric_value(value: Any) -> Any:
+    """Round floats to six decimal places; return every other value unchanged."""
+    # Booleans are not floats, so they pass through untouched like other types.
+    return round(value, 6) if isinstance(value, float) else value
+
+
+def _concern_for_metric(name: str) -> str:
+    """Return the concern code (``C1``-``C5``) for a known metric name, else ``""``."""
+    concerns = (
+        ("redundancy_ratio", "C1"),
+        ("coverage_ratio", "C2"),
+        ("coherence_components", "C3"),
+        ("consistency_cycles", "C4"),
+        ("granularity_entropy", "C5"),
+    )
+    for metric_name, concern in concerns:
+        if metric_name == name:
+            return concern
+    return ""
--- a/tests/test_evaluation_history.py
+++ b/tests/test_evaluation_history.py
@@ -0,0 +1,196 @@
+import json
+import os
+import subprocess
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+
+import yaml
+
+from infospace_bench import add_artifact, create_infospace, load_infospace
+from infospace_bench.checks import run_collection_checks
+from infospace_bench.evaluation import (
+    EntityEvaluation,
+    EvaluationSnapshot,
+    MetricValue,
+    ScoreEntry,
+)
+from infospace_bench.evaluation_io import (
+    append_to_history,
+    read_entity_evaluation,
+    read_history,
+    read_snapshot,
+    write_entity_evaluation,
+    write_snapshot,
+)
+from infospace_bench.history import (
+    find_snapshot,
+    get_history,
+    metric_trend,
+    read_metrics_file,
+    record_check_results,
+    write_metrics_file,
+)
+
+
+NOW = datetime(2026, 5, 14, 10, 30, tzinfo=timezone.utc)
+
+
+def cli_env() -> dict[str, str]:
+    """Return the environment used to run the CLI in a subprocess.
+
+    ``PYTHONPATH`` lists this repository's ``src`` directory (resolved from the
+    test file's location, so it does not depend on the working directory), then
+    the markitect-tool checkout, then any ``PYTHONPATH`` already set by the
+    caller. Entries are joined with ``os.pathsep`` rather than a literal ``:``.
+    """
+    env = os.environ.copy()
+    entries = [
+        str(Path(__file__).resolve().parents[1] / "src"),
+        # NOTE(review): machine-specific checkout path kept from the original;
+        # confirm whether markitect-tool should be an installed dependency.
+        "/home/worsch/markitect-tool/src",
+    ]
+    inherited = env.get("PYTHONPATH")
+    if inherited:
+        entries.append(inherited)
+    env["PYTHONPATH"] = os.pathsep.join(entries)
+    return env
+
+
+def evaluation(
+    artifact_id: str = "entity/division.md",
+    value: float = 4.0,
+) -> EntityEvaluation:
+    """Build a two-score evaluation fixture; ``value`` sets the first score."""
+    precision = ScoreEntry("definition_precision", value, rationale="Grounded and clear.")
+    grounding = ScoreEntry("source_grounding", 5.0)
+    return EntityEvaluation(
+        artifact_id=artifact_id,
+        evaluator="test",
+        evaluated_at=NOW,
+        scores=[precision, grounding],
+        notes=["reviewed"],
+    )
+
+
+def snapshot(snapshot_id: str, value: float, day: int) -> EvaluationSnapshot:
+    """Build a one-artifact snapshot fixture created at 09:00 UTC on May ``day``, 2026.
+
+    ``value`` drives both the first score of the evaluation and the
+    ``coverage_ratio`` metric (``value / 5.0``).
+    """
+    created = datetime(2026, 5, day, 9, tzinfo=timezone.utc)
+    coverage = MetricValue("coverage_ratio", value / 5.0, concern="C2")
+    return EvaluationSnapshot(
+        snapshot_id=snapshot_id,
+        created_at=created,
+        schema_name="baseline",
+        artifact_count=1,
+        artifact_evaluations=[evaluation(value=value)],
+        collection_metrics=[coverage],
+        metadata={"source": "test"},
+    )
+
+
+def test_entity_evaluation_file_preserves_frontmatter_and_markdown_body(
+    tmp_path: Path,
+) -> None:
+    """Writing then reading an evaluation file round-trips the evaluation."""
+    path = tmp_path / "output" / "evaluations" / "division.md"
+
+    write_entity_evaluation(evaluation(), path)
+    restored = read_entity_evaluation(path)
+    text = path.read_text(encoding="utf-8")
+
+    # The file must carry both the YAML frontmatter and the Markdown body.
+    assert text.startswith("---\n")
+    assert "artifact_id: entity/division.md" in text
+    assert "# Evaluation: entity/division.md" in text
+    assert "## definition_precision" in text
+    assert restored == evaluation()
+    assert restored.scores[0].rationale == "Grounded and clear."
+
+
+def test_snapshot_and_history_round_trip_and_diff_from_named_refs(tmp_path: Path) -> None:
+    """Snapshots and history round-trip, and refs resolve by ID and by date."""
+    first = snapshot("snap-a", 3.0, 1)
+    second = snapshot("snap-b", 4.0, 2)
+    snapshot_path = tmp_path / "output" / "metrics" / "snap-a.yaml"
+    history_path = tmp_path / "output" / "metrics" / "history.yaml"
+
+    write_snapshot(first, snapshot_path)
+    append_to_history(first, history_path)
+    append_to_history(second, history_path)
+
+    history = read_history(history_path)
+    # "snap-a" resolves by exact ID; "2026-05-02" resolves by date.
+    diff = find_snapshot(history, "snap-a").diff(find_snapshot(history, "2026-05-02"))
+
+    assert read_snapshot(snapshot_path) == first
+    assert [item.snapshot_id for item in history] == ["snap-a", "snap-b"]
+    assert get_history(tmp_path) == history
+    assert metric_trend(history, "coverage_ratio") == [
+        {"date": "2026-05-01T09:00:00+00:00", "value": 0.6},
+        {"date": "2026-05-02T09:00:00+00:00", "value": 0.8},
+    ]
+    assert diff.score_changes[0].artifact_id == "entity/division.md"
+    assert diff.metric_changes[0].name == "coverage_ratio"
+
+
+def test_metrics_file_preserves_structured_values_and_recording_merges(
+    tmp_path: Path,
+) -> None:
+    """Recording check results merges into, rather than replaces, stored metrics."""
+    infospace = create_infospace(tmp_path, "pilot", name="Pilot")
+    source = tmp_path / "chapter.md"
+    source.write_text("# Chapter\n\nSource text.\n", encoding="utf-8")
+    add_artifact(infospace.root, source, kind="source", title="Chapter")
+
+    # Configure viability thresholds directly in the infospace config file.
+    config_path = infospace.root / "infospace.yaml"
+    config = yaml.safe_load(config_path.read_text(encoding="utf-8"))
+    config["viability"] = {
+        "coverage_ratio": {"min": 0.5},
+        "redundancy_ratio": {"max": 0.1},
+    }
+    config_path.write_text(yaml.safe_dump(config, sort_keys=False), encoding="utf-8")
+
+    # Seed the metrics file with values that the recording step must keep.
+    metrics_path = infospace.root / "output" / "metrics" / "metrics.yaml"
+    write_metrics_file(
+        {"type_distribution": {"source": 1}, "manual_metric": 7, "rounded": 1.1234567},
+        metrics_path,
+    )
+
+    report = record_check_results(
+        infospace.root,
+        run_collection_checks(load_infospace(infospace.root).artifacts),
+        artifact_evaluations=[evaluation()],
+    )
+    metrics = read_metrics_file(metrics_path)
+
+    assert metrics["type_distribution"] == {"source": 1}
+    assert metrics["manual_metric"] == 7
+    assert metrics["rounded"] == 1.123457
+    assert metrics["per_artifact_mean"] == 4.5
+    assert metrics["coverage_ratio"] == 1.0
+    assert report.viability is not None
+    assert report.viability["passed"] is True
+
+
+def test_cli_history_history_diff_and_metrics_json(tmp_path: Path) -> None:
+    """The ``history``, ``history-diff`` and ``metrics`` commands exit 0 and emit JSON."""
+    infospace = create_infospace(tmp_path, "pilot", name="Pilot")
+    source = tmp_path / "chapter.md"
+    source.write_text("# Chapter\n\nSource text.\n", encoding="utf-8")
+    add_artifact(infospace.root, source, kind="source", title="Chapter")
+    history_path = infospace.root / "output" / "metrics" / "history.yaml"
+    append_to_history(snapshot("snap-a", 3.0, 1), history_path)
+    append_to_history(snapshot("snap-b", 4.0, 2), history_path)
+
+    # Each command runs in its own interpreter through ``python -m``.
+    history = subprocess.run(
+        [sys.executable, "-m", "infospace_bench", "history", str(infospace.root)],
+        check=False,
+        env=cli_env(),
+        text=True,
+        capture_output=True,
+    )
+    diff = subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "infospace_bench",
+            "history-diff",
+            str(infospace.root),
+            "snap-a",
+            "snap-b",
+        ],
+        check=False,
+        env=cli_env(),
+        text=True,
+        capture_output=True,
+    )
+    metrics = subprocess.run(
+        [sys.executable, "-m", "infospace_bench", "metrics", str(infospace.root)],
+        check=False,
+        env=cli_env(),
+        text=True,
+        capture_output=True,
+    )
+
+    # stderr is attached to each assertion so a failing command is diagnosable.
+    assert history.returncode == 0, history.stderr
+    assert diff.returncode == 0, diff.stderr
+    assert metrics.returncode == 0, metrics.stderr
+    assert json.loads(history.stdout)["history"][1]["snapshot_id"] == "snap-b"
+    assert json.loads(diff.stdout)["diff"]["after_id"] == "snap-b"
+    assert json.loads(metrics.stdout)["snapshot"]["artifact_count"] == 1
--- a/workplans/IB-WP-0008-evaluation-history-metrics-parity.md
+++ b/workplans/IB-WP-0008-evaluation-history-metrics-parity.md
@@ -4,7 +4,7 @@ type: workplan
 title: "Evaluation History And Metrics Parity"
 domain: markitect
 repo: infospace-bench
-status: planned
+status: completed
 owner: markitect
 topic_slug: markitect
 created: "2026-05-14"
@@ -26,7 +26,7 @@ infospace evaluation history and metrics behavior.

 ```task
 id: IB-WP-0008-T01
-status: todo
+status: done
 priority: high
 state_hub_task_id: "95b48ad3-c4d1-442c-9bc0-7591d948d23e"
 ```
@@ -39,7 +39,7 @@ state_hub_task_id: "95b48ad3-c4d1-442c-9bc0-7591d948d23e"

 ```task
 id: IB-WP-0008-T02
-status: todo
+status: done
 priority: high
 state_hub_task_id: "b4800ba8-5b86-44bb-bf47-e893bae36b22"
 ```
@@ -52,7 +52,7 @@ state_hub_task_id: "b4800ba8-5b86-44bb-bf47-e893bae36b22"

 ```task
 id: IB-WP-0008-T03
-status: todo
+status: done
 priority: high
 state_hub_task_id: "7abcbd63-0147-4ae8-85f0-4af51882f476"
 ```
@@ -65,7 +65,7 @@ state_hub_task_id: "7abcbd63-0147-4ae8-85f0-4af51882f476"

 ```task
 id: IB-WP-0008-T04
-status: todo
+status: done
 priority: medium
 state_hub_task_id: "675d1d45-39d9-4ddd-9ab7-5d7de8a0f601"
 ```