Files
markitect-main/tests/unit/infospace/test_history.py
tegwick c0615c2d50
Some checks failed
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
feat(infospace,llm): stabilize free-tier eval workflow
Five improvements that eliminate most of the agent-in-the-loop friction
observed while closing out the 988-entity WoN evaluation (C.1):

1. Gemini adapter now retries on 429 + 5xx with exponential backoff
   (same pattern already used by OpenRouter/OpenAI). Removes the need
   for shell-level retry wrappers when hitting free-tier rate limits.

2. evaluate CLI prints the underlying error ("ERROR — HTTP 503 …")
   instead of a bare "ERROR", so agents don't have to drop into Python
   to diagnose transient failures.

3. --entity/--chapter now respect existing evaluation files by default
   (previously only the full-collection pass did). New --force flag
   opts into re-evaluation. Stops silently burning free-tier quota on
   re-runs of the same slug.

4. --entity accepts hyphenated slugs (matching entity filenames) and
   normalizes them to the underscore form used on disk. On a miss the
   CLI suggests near matches instead of a bare "not found".

5. eval-summary --update-metrics is no longer destructive:
   read_metrics_file/write_metrics_file preserve structured values
   (type_distribution) and don't flatten ints to floats. Fixes a
   silent data loss observed on every run.

Bonus: the evaluator field in written evaluation frontmatter now
falls back from run_config.model_name to the adapter's resolved model
(or the model echoed back in the API response), so rows no longer
show `evaluator: null` when --model is omitted.

Tests: new tests/unit/llm/test_gemini.py covers retry behavior;
tests/unit/infospace/test_history.py gains a round-trip test that
pins the type_distribution / int-preservation invariants.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-22 00:51:00 +02:00

286 lines
10 KiB
Python

"""
Tests for metrics history and viability tracking (S2.5).
"""
from __future__ import annotations
import json
from datetime import datetime, timezone
from pathlib import Path
import pytest
import yaml
from markitect.infospace.checks.orchestrator import CheckReport
from markitect.infospace.checks.granularity import GranularityReport
from markitect.infospace.checks.redundancy import RedundancyReport
from markitect.infospace.config import InfospaceConfig, TopicConfig, ViabilityThreshold
from markitect.infospace.evaluation import EvaluationSnapshot, MetricValue
from markitect.infospace.history import (
find_snapshot_by_date,
get_history,
get_latest_snapshot,
metric_trend,
read_metrics_file,
record_check_results,
snapshot_from_checks,
write_metrics_file,
)
# ── helpers ──────────────────────────────────────────────────────────
def _check_report() -> CheckReport:
    """Build a CheckReport carrying fixed redundancy and granularity sub-reports.

    Both sub-reports use ``entity_count=10``; the redundancy ratio is 0.1 and
    the domain entropy is 1.5.
    """
    redundancy_part = RedundancyReport(redundancy_ratio=0.1, entity_count=10)
    granularity_part = GranularityReport(domain_entropy=1.5, entity_count=10)
    return CheckReport(redundancy=redundancy_part, granularity=granularity_part)
def _config(tmp_path: Path) -> InfospaceConfig:
    """Build an InfospaceConfig whose metrics_dir is ``tmp_path / "metrics"``.

    The directory is passed to the config as a string.
    """
    topic_cfg = TopicConfig(name="Test Topic", domain="Testing")
    metrics_location = str(tmp_path / "metrics")
    return InfospaceConfig(topic=topic_cfg, metrics_dir=metrics_location)
def _snapshot(snap_id: str, date_str: str, metrics: dict) -> EvaluationSnapshot:
    """Build an EvaluationSnapshot from an id, an ISO date string and metrics.

    ``date_str`` is parsed with ``datetime.fromisoformat`` and then stamped as
    UTC via ``replace``. Each ``metrics`` item becomes one ``MetricValue`` in
    ``collection_metrics``; schema name and entity count are fixed.
    """
    stamped = datetime.fromisoformat(date_str).replace(tzinfo=timezone.utc)

    metric_values = []
    for metric_name, metric_value in metrics.items():
        metric_values.append(MetricValue(name=metric_name, value=metric_value))

    return EvaluationSnapshot(
        snapshot_id=snap_id,
        created_at=stamped,
        schema_name="default",
        entity_count=10,
        collection_metrics=metric_values,
    )
# ── snapshot_from_checks ────────────────────────────────────────────
class TestSnapshotFromChecks:
    """Tests for building a snapshot from a check report."""

    def test_creates_snapshot(self):
        """The snapshot carries the entity count, an id and a creation time."""
        result = snapshot_from_checks(_check_report(), entity_count=10)

        assert result.entity_count == 10
        assert result.snapshot_id  # non-empty
        assert result.created_at is not None

    def test_contains_metrics(self):
        """Both check-derived metric names appear in the collection metrics."""
        result = snapshot_from_checks(_check_report(), entity_count=10)

        seen_names = set()
        for metric in result.collection_metrics:
            seen_names.add(metric.name)

        assert "redundancy_ratio" in seen_names
        assert "granularity_entropy" in seen_names

    def test_concern_labels(self):
        """Each metric is tagged with the expected concern label."""
        result = snapshot_from_checks(_check_report(), entity_count=10)

        metrics_by_name = {}
        for metric in result.collection_metrics:
            metrics_by_name[metric.name] = metric

        assert metrics_by_name["redundancy_ratio"].concern == "C1"
        assert metrics_by_name["granularity_entropy"].concern == "C5"

    def test_custom_schema(self):
        """An explicit schema_name is stored on the snapshot."""
        result = snapshot_from_checks(
            _check_report(), entity_count=5, schema_name="custom"
        )

        assert result.schema_name == "custom"

    def test_metadata(self):
        """Metadata passed in is stored on the snapshot unchanged."""
        result = snapshot_from_checks(
            _check_report(), entity_count=5, metadata={"key": "val"}
        )

        assert result.metadata == {"key": "val"}

    def test_empty_report(self):
        """A CheckReport with no sub-reports yields no collection metrics."""
        result = snapshot_from_checks(CheckReport(), entity_count=0)

        assert result.collection_metrics == []
# ── write_metrics_file / read_metrics_file ──────────────────────────
class TestMetricsFileIO:
    """Tests for writing and reading the metrics YAML file."""

    def test_round_trip(self, tmp_path):
        """Float metrics written to disk are read back with the same values."""
        target = tmp_path / "metrics.yaml"
        written = {"redundancy_ratio": 0.05, "coverage_ratio": 0.85}

        write_metrics_file(written, target)
        reloaded = read_metrics_file(target)

        assert reloaded["redundancy_ratio"] == pytest.approx(0.05)
        assert reloaded["coverage_ratio"] == pytest.approx(0.85)

    def test_creates_parent_dirs(self, tmp_path):
        """Writing to a path under missing directories still produces the file."""
        target = tmp_path / "deep" / "nested" / "metrics.yaml"

        write_metrics_file({"x": 1.0}, target)

        assert target.is_file()

    def test_read_missing_file(self, tmp_path):
        """Reading a path that does not exist returns an empty dict."""
        missing = tmp_path / "nonexistent.yaml"

        assert read_metrics_file(missing) == {}

    def test_read_invalid_content(self, tmp_path):
        """A file holding a bare string instead of a mapping reads as empty."""
        target = tmp_path / "bad.yaml"
        target.write_text("just a string", encoding="utf-8")

        assert read_metrics_file(target) == {}

    def test_round_trip_preserves_structured_values(self, tmp_path):
        """Structured values such as type_distribution must survive a round-trip.

        Regression: eval-summary --update-metrics used to discard every key
        whose value was not a plain number, so type_distribution was silently
        removed from the file on each run.
        """
        target = tmp_path / "metrics.yaml"
        expected_distribution = {
            "Element": 315,
            "Institution": 122,
            "Principle": 102,
        }
        written = {
            "per_entity_mean": 3.9567,
            "vsm_type_matrix_cells": 29,
            # Pass a copy so the expectation stays independent of the input.
            "type_distribution": dict(expected_distribution),
        }

        write_metrics_file(written, target)
        reloaded = read_metrics_file(target)

        assert reloaded["type_distribution"] == expected_distribution

        # The integer must be serialized as an int on disk, not as 29.0.
        on_disk = target.read_text(encoding="utf-8")
        assert "vsm_type_matrix_cells: 29\n" in on_disk
        assert "vsm_type_matrix_cells: 29.0" not in on_disk
# ── record_check_results ────────────────────────────────────────────
class TestRecordCheckResults:
    """Tests for recording check results into the metrics and history files."""

    def test_creates_metrics_file(self, tmp_path):
        """Recording results leaves a metrics.yaml in the metrics directory."""
        config = _config(tmp_path)

        record_check_results(_check_report(), config, tmp_path, entity_count=10)

        expected_file = (tmp_path / config.metrics_dir).joinpath("metrics.yaml")
        assert expected_file.is_file()

    def test_creates_history_file(self, tmp_path):
        """Recording results leaves a history.yaml in the metrics directory."""
        config = _config(tmp_path)

        record_check_results(_check_report(), config, tmp_path, entity_count=10)

        expected_file = (tmp_path / config.metrics_dir).joinpath("history.yaml")
        assert expected_file.is_file()

    def test_appends_to_history(self, tmp_path):
        """Two recordings give two history entries in recording order."""
        config = _config(tmp_path)
        report = _check_report()

        for count in (10, 12):
            record_check_results(report, config, tmp_path, entity_count=count)

        recorded = get_history(config, tmp_path)
        assert [entry.entity_count for entry in recorded] == [10, 12]

    def test_returns_snapshot(self, tmp_path):
        """The return value is a snapshot with an id and the given entity count."""
        config = _config(tmp_path)

        result = record_check_results(
            _check_report(), config, tmp_path, entity_count=10
        )

        assert result.snapshot_id
        assert result.entity_count == 10
# ── get_history / get_latest_snapshot ────────────────────────────────
class TestGetHistory:
    """Tests for reading the recorded history and its latest snapshot."""

    def test_empty_history(self, tmp_path):
        """With nothing recorded, the history is an empty list."""
        config = _config(tmp_path)

        assert get_history(config, tmp_path) == []

    def test_get_latest(self, tmp_path):
        """The latest snapshot is the one recorded last."""
        config = _config(tmp_path)
        report = _check_report()

        for count in (5, 10):
            record_check_results(report, config, tmp_path, entity_count=count)

        newest = get_latest_snapshot(config, tmp_path)
        assert newest is not None
        assert newest.entity_count == 10

    def test_latest_none_when_empty(self, tmp_path):
        """With nothing recorded, there is no latest snapshot."""
        config = _config(tmp_path)

        assert get_latest_snapshot(config, tmp_path) is None
# ── find_snapshot_by_date ────────────────────────────────────────────
class TestFindSnapshotByDate:
    """Tests for looking up a snapshot in a history list by date string."""

    def test_finds_closest(self):
        """A date one day before snapshot "b" resolves to "b"."""
        specs = (
            ("a", "2026-01-01T10:00:00", 1.0),
            ("b", "2026-02-15T10:00:00", 2.0),
            ("c", "2026-03-01T10:00:00", 3.0),
        )
        snapshots = [
            _snapshot(snap_id, stamp, {"x": value})
            for snap_id, stamp, value in specs
        ]

        found = find_snapshot_by_date(snapshots, "2026-02-14")

        assert found is not None
        assert found.snapshot_id == "b"

    def test_exact_match(self):
        """A date equal to a snapshot's creation date resolves to that snapshot."""
        specs = (
            ("a", "2026-01-01T00:00:00", 1.0),
            ("b", "2026-02-01T00:00:00", 2.0),
        )
        snapshots = [
            _snapshot(snap_id, stamp, {"x": value})
            for snap_id, stamp, value in specs
        ]

        found = find_snapshot_by_date(snapshots, "2026-02-01")

        assert found is not None
        assert found.snapshot_id == "b"

    def test_empty_history(self):
        """An empty history yields None."""
        assert find_snapshot_by_date([], "2026-01-01") is None

    def test_invalid_date(self):
        """An unparseable date string yields None."""
        snapshots = [_snapshot("a", "2026-01-01T00:00:00", {"x": 1.0})]

        assert find_snapshot_by_date(snapshots, "not-a-date") is None

    def test_with_timestamp(self):
        """A full timestamp picks between two snapshots from the same day."""
        specs = (
            ("a", "2026-01-01T10:00:00", 1.0),
            ("b", "2026-01-01T14:00:00", 2.0),
        )
        snapshots = [
            _snapshot(snap_id, stamp, {"x": value})
            for snap_id, stamp, value in specs
        ]

        found = find_snapshot_by_date(snapshots, "2026-01-01T13:00:00")

        assert found is not None
        assert found.snapshot_id == "b"
# ── metric_trend ─────────────────────────────────────────────────────
class TestMetricTrend:
    """Tests for extracting one metric's values across a history list."""

    def test_extracts_trend(self):
        """A metric present in every snapshot gives one point per snapshot."""
        first = _snapshot("a", "2026-01-01T00:00:00", {"x": 1.0, "y": 2.0})
        second = _snapshot("b", "2026-02-01T00:00:00", {"x": 1.5, "y": 2.5})

        points = metric_trend([first, second], "x")

        assert [point["value"] for point in points] == [1.0, 1.5]

    def test_missing_metric(self):
        """A metric absent from every snapshot gives an empty trend."""
        only = _snapshot("a", "2026-01-01T00:00:00", {"x": 1.0})

        assert metric_trend([only], "nonexistent") == []

    def test_empty_history(self):
        """An empty history gives an empty trend."""
        assert metric_trend([], "x") == []

    def test_partial_presence(self):
        """Snapshots lacking the metric are left out of the trend."""
        first = _snapshot("a", "2026-01-01T00:00:00", {"x": 1.0})
        second = _snapshot("b", "2026-02-01T00:00:00", {"y": 2.0})  # x missing
        third = _snapshot("c", "2026-03-01T00:00:00", {"x": 3.0})

        points = metric_trend([first, second, third], "x")

        assert [point["value"] for point in points] == [1.0, 3.0]