markitect-main/markitect/infospace/evaluation_io.py

"""
Read/write utilities for evaluation output files.

Per-entity evaluations are stored as markdown with YAML frontmatter.
Snapshots and history are stored as pure YAML files.
"""

from pathlib import Path
from typing import List

import yaml

from .evaluation import (
    EntityEvaluation,
    EvaluationSnapshot,
    MetricChange,
    MetricValue,
    ScoreChange,
    SnapshotDiff,
)

# Delimiter line written before and after the YAML frontmatter block of a
# per-entity evaluation file; the reader splits on it to find the frontmatter.
_FRONTMATTER_SEP = "---"


def write_entity_evaluation(evaluation: EntityEvaluation, path: Path) -> None:
    """Serialize *evaluation* to *path* as YAML frontmatter plus markdown.

    The frontmatter carries the entity slug, evaluator, ISO-formatted
    evaluation time, the overall score rounded to four decimals, the list of
    score dicts and, when non-empty, the notes. The markdown body starts with
    a title derived from the slug and then has one ``##`` section per score,
    followed by that score's rationale when it has one.

    Parent directories of *path* are created if they do not exist.
    """
    meta = {
        "entity_slug": evaluation.entity_slug,
        "evaluator": evaluation.evaluator,
        "evaluated_at": evaluation.evaluated_at.isoformat(),
        "overall_score": round(evaluation.overall_score, 4),
        "scores": [entry.to_dict() for entry in evaluation.scores],
    }
    # Notes are optional: the key is omitted entirely when there are none.
    if evaluation.notes:
        meta["notes"] = evaluation.notes

    # rstrip() drops the dump's trailing newline so the closing separator
    # follows the YAML directly once everything is joined with "\n".
    meta_yaml = yaml.safe_dump(meta, default_flow_style=False, sort_keys=False).rstrip()

    # Human-readable title: separators become spaces, words are capitalized.
    heading = evaluation.entity_slug.replace("_", " ").replace("-", " ").title()

    out: List[str] = [
        _FRONTMATTER_SEP,
        meta_yaml,
        _FRONTMATTER_SEP,
        "",
        f"# Evaluation: {heading}",
        "",
    ]

    # One section per score; the rationale paragraph is emitted only if set.
    for entry in evaluation.scores:
        out += [f"## {entry.name} — {entry.value} / {entry.max_value}", ""]
        if entry.rationale:
            out += [entry.rationale, ""]

    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(out), encoding="utf-8")


def read_entity_evaluation(path: Path) -> EntityEvaluation:
    """Read a per-entity evaluation from a YAML-frontmatter markdown file.

    The frontmatter is the text between the first two lines that consist
    solely of the separator (``---``); everything after the second separator
    line is the markdown body. Rationales found in the body (one ``##``
    section per score) replace the ``rationale`` of the score with the same
    name.

    Args:
        path: File to read (UTF-8).

    Returns:
        The reconstructed ``EntityEvaluation``.

    Raises:
        ValueError: If fewer than two separator lines are present, or the
            frontmatter does not parse to a YAML mapping.
        KeyError: If a required frontmatter key (``entity_slug``,
            ``evaluator``, ``scores``, ``evaluated_at``) is missing.
    """
    from datetime import datetime

    from .evaluation import ScoreEntry

    text = path.read_text(encoding="utf-8")

    # Locate separators as whole lines. A plain substring split on "---\n"
    # would also match a YAML value that merely ends in "---" and cut the
    # frontmatter short.
    lines = text.split("\n")
    sep_indexes = [
        idx for idx, line in enumerate(lines) if line.rstrip() == _FRONTMATTER_SEP
    ]
    if len(sep_indexes) < 2:
        raise ValueError(f"Invalid frontmatter in {path}")
    opening, closing = sep_indexes[0], sep_indexes[1]
    fm_text = "\n".join(lines[opening + 1:closing])
    body = "\n".join(lines[closing + 1:])

    fm = yaml.safe_load(fm_text)
    if not isinstance(fm, dict):
        # Empty or scalar frontmatter would otherwise fail later with an
        # unhelpful TypeError on subscripting.
        raise ValueError(f"Invalid frontmatter in {path}")

    # Parse rationales from body
    rationales = _parse_rationales(body)

    scores = []
    for s_data in fm["scores"]:
        se = ScoreEntry.from_dict(s_data)
        if se.name in rationales:
            se.rationale = rationales[se.name]
        scores.append(se)

    # The writer stores an ISO string, but an unquoted timestamp in a
    # hand-edited file is resolved by YAML into a datetime/date object.
    raw_evaluated_at = fm["evaluated_at"]
    if isinstance(raw_evaluated_at, datetime):
        evaluated_at = raw_evaluated_at
    else:
        evaluated_at = datetime.fromisoformat(str(raw_evaluated_at))

    return EntityEvaluation(
        entity_slug=fm["entity_slug"],
        evaluator=fm["evaluator"],
        scores=scores,
        evaluated_at=evaluated_at,
        notes=fm.get("notes", []),
    )


def _parse_rationales(body: str) -> dict:
    """Extract rationale text per dimension from the markdown body."""
    rationales: dict = {}
    current_name = None
    current_lines: List[str] = []

    for line in body.splitlines():
        if line.startswith("## "):
            # Save previous
            if current_name is not None:
                rationales[current_name] = "\n".join(current_lines).strip()
            # Parse "## dimension_name — 4.5 / 5.0"
            heading = line[3:].strip()
            name = heading.split("—")[0].strip() if "—" in heading else heading
            current_name = name
            current_lines = []
        elif current_name is not None:
            current_lines.append(line)

    if current_name is not None:
        rationales[current_name] = "\n".join(current_lines).strip()

    return rationales


def write_snapshot(snapshot: EvaluationSnapshot, path: Path) -> None:
    """Dump *snapshot* to *path* as a block-style YAML document.

    Parent directories of *path* are created first if they are missing. Keys
    are written in the order ``snapshot.to_dict()`` provides them.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = snapshot.to_dict()
    rendered = yaml.safe_dump(payload, default_flow_style=False, sort_keys=False)
    path.write_text(rendered, encoding="utf-8")


def read_snapshot(path: Path) -> EvaluationSnapshot:
    """Load the YAML file at *path* and rebuild an ``EvaluationSnapshot``."""
    raw_text = path.read_text(encoding="utf-8")
    payload = yaml.safe_load(raw_text)
    return EvaluationSnapshot.from_dict(payload)


def append_to_history(snapshot: EvaluationSnapshot, history_path: Path) -> None:
    """Add *snapshot* to the YAML list stored at *history_path*.

    The file and its parent directories are created when absent. An existing
    file is loaded in full, the new snapshot dict is appended, and the whole
    list is written back.
    """
    history_path.parent.mkdir(parents=True, exist_ok=True)

    # An absent file and an empty file (which loads as None) both start a
    # fresh list.
    previous = (
        yaml.safe_load(history_path.read_text(encoding="utf-8"))
        if history_path.exists()
        else None
    )
    entries: List[dict] = [] if previous is None else previous
    entries.append(snapshot.to_dict())

    serialized = yaml.safe_dump(entries, default_flow_style=False, sort_keys=False)
    history_path.write_text(serialized, encoding="utf-8")


def read_history(history_path: Path) -> List[EvaluationSnapshot]:
    """Load every snapshot recorded in the YAML history file.

    An empty file (one that loads as ``None``) yields an empty list.
    """
    raw_entries = yaml.safe_load(history_path.read_text(encoding="utf-8"))
    entries = [] if raw_entries is None else raw_entries
    return [EvaluationSnapshot.from_dict(entry) for entry in entries]


def diff_snapshots(before: EvaluationSnapshot, after: EvaluationSnapshot) -> SnapshotDiff:
    """Describe what changed between the *before* and *after* snapshots.

    The result lists, all in sorted order:

    * entity slugs present only in *after* (added) or only in *before*
      (removed);
    * per-dimension score changes for entities present in both snapshots;
    * changes in collection-level metrics.

    A dimension or metric that exists on only one side is reported with
    ``0.0`` standing in for the missing value.
    """

    def _or_zero(value):
        # Missing (None) values are reported as 0.0 in change records.
        return 0.0 if value is None else value

    # Per-entity score tables: {slug: {dimension: value}}.
    old_scores = {
        ev.entity_slug: {s.name: s.value for s in ev.scores}
        for ev in before.entity_evaluations
    }
    new_scores = {
        ev.entity_slug: {s.name: s.value for s in ev.scores}
        for ev in after.entity_evaluations
    }

    old_slugs = set(old_scores)
    new_slugs = set(new_scores)
    added = sorted(new_slugs - old_slugs)
    removed = sorted(old_slugs - new_slugs)

    # Score changes are only computed for entities present on both sides.
    score_changes: List[ScoreChange] = []
    for slug in sorted(old_slugs & new_slugs):
        old_dims = old_scores[slug]
        new_dims = new_scores[slug]
        for dim in sorted({*old_dims, *new_dims}):
            old_val = old_dims.get(dim)
            new_val = new_dims.get(dim)
            if old_val != new_val:
                score_changes.append(
                    ScoreChange(
                        entity_slug=slug,
                        dimension=dim,
                        before=_or_zero(old_val),
                        after=_or_zero(new_val),
                    )
                )

    # Collection-level metrics, compared by name.
    old_metrics = {m.name: m.value for m in before.collection_metrics}
    new_metrics = {m.name: m.value for m in after.collection_metrics}
    metric_changes: List[MetricChange] = []
    for name in sorted({*old_metrics, *new_metrics}):
        old_val = old_metrics.get(name)
        new_val = new_metrics.get(name)
        if old_val != new_val:
            metric_changes.append(
                MetricChange(
                    name=name,
                    before=_or_zero(old_val),
                    after=_or_zero(new_val),
                )
            )

    return SnapshotDiff(
        before_id=before.snapshot_id,
        after_id=after.snapshot_id,
        added_entities=added,
        removed_entities=removed,
        score_changes=score_changes,
        metric_changes=metric_changes,
    )