# agentic-resources/tests/test_retro_build.py

"""Weekly retro report tests (AGENTIC-WP-0010 T01)."""

import os
import sys

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from session_memory.curate.catalog import Catalog  # noqa: E402
from session_memory.curate.schema import Resolution, SolutionPattern  # noqa: E402
from session_memory.retro.build import weekly_retro  # noqa: E402


def _digest(uid, repo, ts, flavor="claude", retries=5):
    return {
        "session_uid": uid, "flavor": flavor, "repo": repo, "outcome": "fail",
        "started_at": ts, "event_count": 40,
        "first_prompt": "Fix the failing build and retry the suite",
        "cost": {"input_tokens": 100, "output_tokens": 10},
        "tool_histogram": {"Bash": 20, "Edit": 12, "Read": 8},
        "markers": {"errors": 0, "retries": retries, "test_runs": 0},
        "error_snippets": [],
    }


def test_window_excludes_old_sessions():
    """A session dated before ``since`` is expected to be left out of the count."""
    recent_a = _digest("claude:a", "r1", "2026-06-01T10:00:00Z")
    recent_b = _digest("claude:b", "r1", "2026-06-02T10:00:00Z")
    # Started months before the `since` bound passed below.
    stale = _digest("claude:old", "r1", "2026-01-01T10:00:00Z")

    report = weekly_retro(
        [recent_a, recent_b, stale],
        since="2026-05-30T00:00:00Z",
        until="2026-06-08T00:00:00Z",
    )

    assert report["n_sessions"] == 2
    assert report["window"]["days"] == 7


def test_retry_storm_becomes_suggestion():
    """Two retry-heavy sessions in one repo should yield a retry_storm suggestion."""
    sessions = [
        _digest(f"claude:{day - 1}", "r1", f"2026-06-0{day}T10:00:00Z")
        for day in (1, 2)
    ]
    report = weekly_retro(
        sessions,
        since="2026-05-30T00:00:00Z",
        until="2026-06-08T00:00:00Z",
    )

    suggestions = report["suggestions"]
    assert suggestions
    top = suggestions[0]
    assert top["repo"] == "r1"
    assert top["signal_type"] == "retry_storm"
    # No catalog was passed, so the default recommendation text is expected.
    assert "Investigate" in top["recommendation"]


def test_recommendation_from_catalog(tmp_path):
    """With a matching catalog pattern, its resolution summary is the recommendation."""
    catalog = Catalog(str(tmp_path / "catalog"))
    signal_key = "problem:retry_storm:retries"
    pattern = SolutionPattern(
        id=SolutionPattern.make_id(signal_key),
        name="Retry storm",
        version="1.0.0",
        polarity="problem",
        problem="repeated retries",
        resolutions=[Resolution(summary="Stop and diagnose before retrying")],
    )
    catalog.upsert(pattern)

    sessions = [
        _digest(f"claude:{day - 1}", "r1", f"2026-06-0{day}T10:00:00Z")
        for day in (1, 2)
    ]
    report = weekly_retro(
        sessions,
        catalog=catalog,
        since="2026-05-30T00:00:00Z",
        until="2026-06-08T00:00:00Z",
    )

    first = report["suggestions"][0]
    assert first["recommendation"] == "Stop and diagnose before retrying"


def test_recurring_error_inherits_recommendation_via_covers(tmp_path):
    """A recurring error should pick up the recommendation of a covering pattern.

    The catalog pattern lists ``"file has not been read"`` in ``covers``;
    that phrase occurs inside the error fingerprint attached to both
    sessions, and the test expects the pattern's resolution summary to be
    used for the resulting ``recurring_error`` suggestion.
    """
    catalog = Catalog(str(tmp_path / "catalog"))
    read_first = Resolution(summary="Read the file first before Edit/Write")
    catalog.upsert(SolutionPattern(
        id="sp-rbe",
        name="Read before edit",
        version="1.0.0",
        polarity="problem",
        problem="edit before read",
        resolutions=[read_first],
        covers=["file has not been read"],
    ))

    def _session_with_read_error(day):
        # Each session gets its own snippet list/dict (no sharing between digests).
        session = _digest(f"claude:{day - 1}", "r1", f"2026-06-0{day}T10:00:00Z")
        session["error_snippets"] = [{
            "fingerprint": "<tool_use_error>file has not been read yet. read it first...",
            "sample": "File has not been read yet",
            "count": 2,
            "tool": "Edit",
        }]
        return session

    sessions = [_session_with_read_error(day) for day in (1, 2)]
    report = weekly_retro(
        sessions,
        catalog=catalog,
        since="2026-05-30T00:00:00Z",
        until="2026-06-08T00:00:00Z",
    )

    recurring = [
        item for item in report["suggestions"]
        if item["signal_type"] == "recurring_error"
    ]
    assert recurring, "expected a recurring_error suggestion"
    assert recurring[0]["recommendation"] == "Read the file first before Edit/Write"


def test_caps_three_per_repo():
    """No more than three suggestions are expected for a single repo."""
    # The overrides below are meant to raise several distinct problem signals
    # for r1 (five, per the original author), more than the cap of three.
    sessions = []
    for day in (1, 2):
        session = _digest(f"claude:{day - 1}", "r1", f"2026-06-0{day}T10:00:00Z")
        session.update({
            "markers": {
                "errors": 5, "retries": 5, "test_runs": 0, "human_interventions": 0,
            },
            "tool_histogram": {
                "Bash": 120, "ToolSearch": 9, "mcp__state-hub__x": 30, "Edit": 5,
            },
            "outcome": "abandoned",
        })
        sessions.append(session)

    report = weekly_retro(
        sessions,
        since="2026-05-30T00:00:00Z",
        until="2026-06-08T00:00:00Z",
    )

    # NOTE(review): this bound also holds when no suggestions are produced at
    # all; consider additionally asserting a non-empty result once confirmed.
    r1_count = sum(1 for item in report["suggestions"] if item["repo"] == "r1")
    assert r1_count <= 3


def test_cross_flavor_ranks_first():
    """The top suggestion is expected to be cross-flavor and high priority."""
    claude_session = _digest(
        "claude:a", "r1", "2026-06-01T10:00:00Z", flavor="claude",
    )
    grok_session = _digest(
        "grok:b", "r2", "2026-06-02T10:00:00Z", flavor="grok",
    )

    report = weekly_retro(
        [claude_session, grok_session],
        since="2026-05-30T00:00:00Z",
        until="2026-06-08T00:00:00Z",
    )

    top = report["suggestions"][0]
    assert top["cross_flavor"] is True
    assert top["priority"] == "high"


def test_includes_measure_snapshot():
    """The report's ``measure`` section should count both in-window sessions."""
    sessions = [
        _digest(f"claude:{day - 1}", "r1", f"2026-06-0{day}T10:00:00Z")
        for day in (1, 2)
    ]
    report = weekly_retro(
        sessions,
        since="2026-05-30T00:00:00Z",
        until="2026-06-08T00:00:00Z",
    )
    assert report["measure"]["n_sessions"] == 2