Some checks failed
Test Suite / code-quality (push) Has been cancelled
Test Suite / security-scan (push) Has been cancelled
Test Suite / unit-tests (3.11) (push) Has been cancelled
Test Suite / unit-tests (3.12) (push) Has been cancelled
Test Suite / integration-tests (push) Has been cancelled
Test Suite / e2e-tests (push) Has been cancelled
Test Suite / performance-tests (push) Has been cancelled
Test Suite / test-summary (push) Has been cancelled
Five improvements that eliminate most of the agent-in-the-loop friction
observed while closing out the 988-entity WoN evaluation (C.1):
1. Gemini adapter now retries on 429 + 5xx with exponential backoff
(same pattern already used by OpenRouter/OpenAI). Removes the need
for shell-level retry wrappers when hitting free-tier rate limits.
2. evaluate CLI prints the underlying error ("ERROR — HTTP 503 …")
instead of a bare "ERROR", so agents don't have to drop into Python
to diagnose transient failures.
3. --entity/--chapter now respect existing evaluation files by default
(previously only the full-collection pass did). New --force flag
opts into re-evaluation. Stops silently burning free-tier quota on
re-runs of the same slug.
4. --entity accepts hyphenated slugs (matching entity filenames) and
normalizes them to the underscore form used on disk. On a miss the
CLI suggests near matches instead of a bare "not found".
5. eval-summary --update-metrics is no longer destructive:
read_metrics_file/write_metrics_file preserve structured values
(type_distribution) and don't flatten ints to floats. Fixes the
silent data loss that previously occurred on every run.
Bonus: the evaluator field in written evaluation frontmatter now
falls back from run_config.model_name to the adapter's resolved model
(or the model echoed back in the API response), so rows no longer
show `evaluator: null` when --model is omitted.
Tests: new tests/unit/llm/test_gemini.py covers retry behavior;
tests/unit/infospace/test_history.py gains a round-trip test that
pins the type_distribution / int-preservation invariants.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
286 lines
10 KiB
Python
286 lines
10 KiB
Python
"""
|
|
Tests for metrics history and viability tracking (S2.5).
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
import pytest
|
|
import yaml
|
|
|
|
from markitect.infospace.checks.orchestrator import CheckReport
|
|
from markitect.infospace.checks.granularity import GranularityReport
|
|
from markitect.infospace.checks.redundancy import RedundancyReport
|
|
from markitect.infospace.config import InfospaceConfig, TopicConfig, ViabilityThreshold
|
|
from markitect.infospace.evaluation import EvaluationSnapshot, MetricValue
|
|
from markitect.infospace.history import (
|
|
find_snapshot_by_date,
|
|
get_history,
|
|
get_latest_snapshot,
|
|
metric_trend,
|
|
read_metrics_file,
|
|
record_check_results,
|
|
snapshot_from_checks,
|
|
write_metrics_file,
|
|
)
|
|
|
|
|
|
# ── helpers ──────────────────────────────────────────────────────────
|
|
|
|
|
|
def _check_report() -> CheckReport:
    """Build a CheckReport holding one redundancy and one granularity sub-report.

    Both sub-reports use an entity count of 10; the redundancy ratio is 0.1
    and the domain entropy is 1.5.
    """
    redundancy_part = RedundancyReport(redundancy_ratio=0.1, entity_count=10)
    granularity_part = GranularityReport(domain_entropy=1.5, entity_count=10)
    return CheckReport(redundancy=redundancy_part, granularity=granularity_part)
|
|
|
|
|
|
def _config(tmp_path: Path) -> InfospaceConfig:
    """Return an InfospaceConfig whose metrics_dir is ``tmp_path / "metrics"``.

    The topic is fixed to name "Test Topic" in domain "Testing".
    """
    test_topic = TopicConfig(name="Test Topic", domain="Testing")
    metrics_location = tmp_path / "metrics"
    return InfospaceConfig(topic=test_topic, metrics_dir=str(metrics_location))
|
|
|
|
|
|
def _snapshot(snap_id: str, date_str: str, metrics: dict) -> EvaluationSnapshot:
    """Build an EvaluationSnapshot for tests from a plain name -> value mapping.

    Args:
        snap_id: Identifier stored as the snapshot's ``snapshot_id``.
        date_str: ISO-format date/time string; the parsed value is stamped
            with UTC as its timezone.
        metrics: Mapping of metric name to value; each entry becomes one
            ``MetricValue`` in ``collection_metrics``, in mapping order.

    Returns:
        A snapshot with schema name "default" and an entity count of 10.
    """
    # Parse first, then force the timezone to UTC via replace().
    stamped = datetime.fromisoformat(date_str).replace(tzinfo=timezone.utc)

    metric_values = []
    for metric_name, metric_value in metrics.items():
        metric_values.append(MetricValue(name=metric_name, value=metric_value))

    return EvaluationSnapshot(
        snapshot_id=snap_id,
        created_at=stamped,
        schema_name="default",
        entity_count=10,
        collection_metrics=metric_values,
    )
|
|
|
|
|
|
# ── snapshot_from_checks ────────────────────────────────────────────
|
|
|
|
|
|
class TestSnapshotFromChecks:
    """Tests for building a snapshot from a CheckReport via snapshot_from_checks."""

    def test_creates_snapshot(self):
        """The snapshot carries the entity count, an id and a creation time."""
        result = snapshot_from_checks(_check_report(), entity_count=10)
        assert result.entity_count == 10
        assert result.snapshot_id  # must not be empty
        assert result.created_at is not None

    def test_contains_metrics(self):
        """Both check metrics appear among the collection metrics."""
        result = snapshot_from_checks(_check_report(), entity_count=10)
        present = {metric.name for metric in result.collection_metrics}
        for expected_name in ("redundancy_ratio", "granularity_entropy"):
            assert expected_name in present

    def test_concern_labels(self):
        """Each metric is tagged with its expected concern label."""
        result = snapshot_from_checks(_check_report(), entity_count=10)
        lookup = dict((metric.name, metric) for metric in result.collection_metrics)
        assert lookup["redundancy_ratio"].concern == "C1"
        assert lookup["granularity_entropy"].concern == "C5"

    def test_custom_schema(self):
        """A caller-supplied schema name is stored on the snapshot."""
        result = snapshot_from_checks(
            _check_report(), entity_count=5, schema_name="custom"
        )
        assert result.schema_name == "custom"

    def test_metadata(self):
        """Caller-supplied metadata is stored on the snapshot unchanged."""
        result = snapshot_from_checks(
            _check_report(), entity_count=5, metadata={"key": "val"}
        )
        assert result.metadata == {"key": "val"}

    def test_empty_report(self):
        """A CheckReport with no sub-reports yields no collection metrics."""
        result = snapshot_from_checks(CheckReport(), entity_count=0)
        assert result.collection_metrics == []
|
|
|
|
|
|
# ── write_metrics_file / read_metrics_file ──────────────────────────
|
|
|
|
|
|
class TestMetricsFileIO:
    """Tests for write_metrics_file / read_metrics_file."""

    def test_round_trip(self, tmp_path):
        """Float metrics written to disk are read back with the same values."""
        target = tmp_path / "metrics.yaml"
        write_metrics_file(
            {"redundancy_ratio": 0.05, "coverage_ratio": 0.85}, target
        )
        reloaded = read_metrics_file(target)
        assert reloaded["redundancy_ratio"] == pytest.approx(0.05)
        assert reloaded["coverage_ratio"] == pytest.approx(0.85)

    def test_creates_parent_dirs(self, tmp_path):
        """Writing to a path under missing directories still produces the file."""
        target = tmp_path / "deep" / "nested" / "metrics.yaml"
        write_metrics_file({"x": 1.0}, target)
        assert target.is_file()

    def test_read_missing_file(self, tmp_path):
        """Reading a path that does not exist gives an empty mapping."""
        absent = tmp_path / "nonexistent.yaml"
        assert read_metrics_file(absent) == {}

    def test_read_invalid_content(self, tmp_path):
        """A file whose content is just a bare string gives an empty mapping."""
        target = tmp_path / "bad.yaml"
        target.write_text("just a string", encoding="utf-8")
        assert read_metrics_file(target) == {}

    def test_round_trip_preserves_structured_values(self, tmp_path):
        """Structured values and ints must come back from a write/read cycle.

        Regression guard: eval-summary --update-metrics previously discarded
        every key whose value was not a bare number, so type_distribution
        vanished from the file on each run. Integers must also be written
        as integers rather than floats.
        """
        target = tmp_path / "metrics.yaml"
        distribution = {"Element": 315, "Institution": 122, "Principle": 102}
        payload = {
            "per_entity_mean": 3.9567,
            "vsm_type_matrix_cells": 29,
            # Pass a copy so the expected mapping below stays independent.
            "type_distribution": dict(distribution),
        }
        write_metrics_file(payload, target)

        reloaded = read_metrics_file(target)
        assert reloaded["type_distribution"] == distribution

        # Inspect the raw text: the int must be stored as 29, never 29.0.
        on_disk = target.read_text(encoding="utf-8")
        assert "vsm_type_matrix_cells: 29\n" in on_disk
        assert "vsm_type_matrix_cells: 29.0" not in on_disk
|
|
|
|
|
|
# ── record_check_results ────────────────────────────────────────────
|
|
|
|
|
|
class TestRecordCheckResults:
    """Tests for record_check_results and the files it leaves behind."""

    def test_creates_metrics_file(self, tmp_path):
        """Recording results produces metrics.yaml in the metrics directory."""
        config = _config(tmp_path)
        record_check_results(_check_report(), config, tmp_path, entity_count=10)
        expected_file = tmp_path / config.metrics_dir / "metrics.yaml"
        assert expected_file.is_file()

    def test_creates_history_file(self, tmp_path):
        """Recording results produces history.yaml in the metrics directory."""
        config = _config(tmp_path)
        record_check_results(_check_report(), config, tmp_path, entity_count=10)
        expected_file = tmp_path / config.metrics_dir / "history.yaml"
        assert expected_file.is_file()

    def test_appends_to_history(self, tmp_path):
        """Two recordings give two history entries, in the order recorded."""
        config = _config(tmp_path)
        checks = _check_report()
        record_check_results(checks, config, tmp_path, entity_count=10)
        record_check_results(checks, config, tmp_path, entity_count=12)
        recorded = get_history(config, tmp_path)
        assert len(recorded) == 2
        assert [entry.entity_count for entry in recorded] == [10, 12]

    def test_returns_snapshot(self, tmp_path):
        """The call returns the snapshot it recorded."""
        config = _config(tmp_path)
        returned = record_check_results(
            _check_report(), config, tmp_path, entity_count=10
        )
        assert returned.snapshot_id
        assert returned.entity_count == 10
|
|
|
|
|
|
# ── get_history / get_latest_snapshot ────────────────────────────────
|
|
|
|
|
|
class TestGetHistory:
    """Tests for get_history and get_latest_snapshot."""

    def test_empty_history(self, tmp_path):
        """With nothing recorded, the history is an empty list."""
        config = _config(tmp_path)
        assert get_history(config, tmp_path) == []

    def test_get_latest(self, tmp_path):
        """The latest snapshot is the one recorded last."""
        config = _config(tmp_path)
        checks = _check_report()
        record_check_results(checks, config, tmp_path, entity_count=5)
        record_check_results(checks, config, tmp_path, entity_count=10)
        newest = get_latest_snapshot(config, tmp_path)
        assert newest is not None
        assert newest.entity_count == 10

    def test_latest_none_when_empty(self, tmp_path):
        """With nothing recorded, there is no latest snapshot."""
        config = _config(tmp_path)
        assert get_latest_snapshot(config, tmp_path) is None
|
|
|
|
|
|
# ── find_snapshot_by_date ────────────────────────────────────────────
|
|
|
|
|
|
class TestFindSnapshotByDate:
    """Tests for find_snapshot_by_date."""

    def test_finds_closest(self):
        """A date one day before snapshot "b" selects "b"."""
        snapshots = [
            _snapshot("a", "2026-01-01T10:00:00", {"x": 1.0}),
            _snapshot("b", "2026-02-15T10:00:00", {"x": 2.0}),
            _snapshot("c", "2026-03-01T10:00:00", {"x": 3.0}),
        ]
        found = find_snapshot_by_date(snapshots, "2026-02-14")
        assert found is not None
        assert found.snapshot_id == "b"

    def test_exact_match(self):
        """A date equal to a snapshot's creation date selects that snapshot."""
        snapshots = [
            _snapshot("a", "2026-01-01T00:00:00", {"x": 1.0}),
            _snapshot("b", "2026-02-01T00:00:00", {"x": 2.0}),
        ]
        found = find_snapshot_by_date(snapshots, "2026-02-01")
        assert found is not None
        assert found.snapshot_id == "b"

    def test_empty_history(self):
        """An empty history gives None."""
        assert find_snapshot_by_date([], "2026-01-01") is None

    def test_invalid_date(self):
        """An unparseable date string gives None."""
        snapshots = [_snapshot("a", "2026-01-01T00:00:00", {"x": 1.0})]
        assert find_snapshot_by_date(snapshots, "not-a-date") is None

    def test_with_timestamp(self):
        """A full timestamp picks between two snapshots on the same day."""
        snapshots = [
            _snapshot("a", "2026-01-01T10:00:00", {"x": 1.0}),
            _snapshot("b", "2026-01-01T14:00:00", {"x": 2.0}),
        ]
        found = find_snapshot_by_date(snapshots, "2026-01-01T13:00:00")
        assert found is not None
        assert found.snapshot_id == "b"
|
|
|
|
|
|
# ── metric_trend ─────────────────────────────────────────────────────
|
|
|
|
|
|
class TestMetricTrend:
    """Tests for metric_trend."""

    def test_extracts_trend(self):
        """One trend point per snapshot, carrying the named metric's value."""
        snapshots = [
            _snapshot("a", "2026-01-01T00:00:00", {"x": 1.0, "y": 2.0}),
            _snapshot("b", "2026-02-01T00:00:00", {"x": 1.5, "y": 2.5}),
        ]
        points = metric_trend(snapshots, "x")
        assert len(points) == 2
        assert [point["value"] for point in points] == [1.0, 1.5]

    def test_missing_metric(self):
        """A metric name no snapshot contains gives an empty trend."""
        snapshots = [_snapshot("a", "2026-01-01T00:00:00", {"x": 1.0})]
        assert metric_trend(snapshots, "nonexistent") == []

    def test_empty_history(self):
        """An empty history gives an empty trend."""
        assert metric_trend([], "x") == []

    def test_partial_presence(self):
        """Snapshots lacking the metric are left out of the trend."""
        snapshots = [
            _snapshot("a", "2026-01-01T00:00:00", {"x": 1.0}),
            _snapshot("b", "2026-02-01T00:00:00", {"y": 2.0}),  # no "x" here
            _snapshot("c", "2026-03-01T00:00:00", {"x": 3.0}),
        ]
        points = metric_trend(snapshots, "x")
        assert len(points) == 2
        assert [point["value"] for point in points] == [1.0, 3.0]
|