# phase-memory/tests/test_evaluation_scenarios.py

import json
from datetime import datetime, timezone
from pathlib import Path

from phase_memory.adapters import InMemorySemanticIndex
from phase_memory.contracts import graph_from_markitect
from phase_memory.evaluation import (
    EVALUATION_REPORT_SCHEMA,
    EVALUATION_TREND_HISTORY_SCHEMA,
    EVALUATION_TREND_REGRESSION_GATE_SCHEMA,
    EVALUATION_TREND_SCHEMA,
    evaluation_threshold_report,
    evaluation_trend_artifact,
    evaluation_trend_regression_gate,
    load_evaluation_trend_history,
    write_evaluation_trend_history,
)
from phase_memory.models import ActivationPlan, MemoryPath
from phase_memory.retrieval import activation_quality_report, select_event_path
from phase_memory.runtime import PhaseMemoryRuntime


# Directory named "fixtures" located next to this module; the tests below read
# "evaluation-scenarios.json" from it.
FIXTURES = Path(__file__).parent.joinpath("fixtures")


def _scenarios():
    """Load the evaluation scenario fixture and index its scenarios by id.

    Returns:
        A dict mapping each scenario's ``"id"`` value to the scenario mapping
        itself, read from ``evaluation-scenarios.json`` under ``FIXTURES``.
    """
    fixture_path = FIXTURES / "evaluation-scenarios.json"
    payload = json.loads(fixture_path.read_text(encoding="utf-8"))
    by_id = {}
    for entry in payload["scenarios"]:
        # A later entry with the same id replaces an earlier one.
        by_id[entry["id"]] = entry
    return by_id


def test_policy_denied_activation_scenario_is_redacted_and_audited() -> None:
    """Plan an activation for the ``policy-denied-activation`` fixture scenario.

    The plan is requested with a policy context that denies the ``restricted``
    label. The test then checks that the response is still valid, that the
    denied item ids equal the fixture's ``denied_node_ids``, that the first
    denial's text is ``[REDACTED]``, that the only diagnostic code is
    ``activation_policy_denied``, and that the audit query filtered on
    ``graph.activation.plan`` reports a count of 1.
    """
    fixture = _scenarios()["policy-denied-activation"]
    memory_runtime = PhaseMemoryRuntime()
    policy = {"denied_labels": ["restricted"], "secrets_allowed": False, "trust_zone": "local"}

    result = memory_runtime.plan_activation(
        fixture["graph"],
        max_items=4,
        max_tokens=60,
        profile_id=fixture["profile"]["id"],
        policy_context=policy,
    )
    # The audit is queried only after planning so the plan call is included.
    audit_log = memory_runtime.query_audit({"operation": "graph.activation.plan"})

    denials = result["data"]["policy_denials"]
    denied = [entry["id"] for entry in denials]
    assert result["valid"] is True
    assert denied == fixture["expect"]["denied_node_ids"]
    assert denials[0]["text"] == "[REDACTED]"
    diagnostic_codes = [entry["code"] for entry in result["diagnostics"]]
    assert diagnostic_codes == ["activation_policy_denied"]
    assert audit_log["count"] == 1


def test_profile_lifecycle_rules_scenario_emits_expected_actions() -> None:
    """Plan lifecycle actions for the ``profile-lifecycle-rules`` scenario.

    The plan is computed with a fixed ``now`` (2026-05-18 UTC) and a refresh
    digest for ``life.decision``. The test checks that the response is valid,
    that every ``(target_id, action)`` pair listed in the fixture's
    expectations appears among the dry-run actions, and that the first
    ``compact`` action names the expected source node in its metadata.
    """
    fixture = _scenarios()["profile-lifecycle-rules"]
    memory_runtime = PhaseMemoryRuntime()
    fixed_now = datetime(2026, 5, 18, tzinfo=timezone.utc)

    result = memory_runtime.plan_lifecycle_with_profile(
        fixture["profile"],
        fixture["graph"],
        refresh_digests={"life.decision": "decision-new"},
        now=fixed_now,
    )

    # Collect the (target, action) pairs and the compact actions in one pass.
    emitted_pairs = []
    compactions = []
    for entry in result["data"]["dry_run_actions"]:
        emitted_pairs.append((entry["target_id"], entry["action"]))
        if entry["action"] == "compact":
            compactions.append(entry)

    assert result["valid"] is True
    for wanted in fixture["expect"]["actions"]:
        # Fixture pairs are converted to tuples before the membership check.
        assert tuple(wanted) in emitted_pairs
    first_compaction = compactions[0]
    assert first_compaction["metadata"]["source_node_ids"] == [fixture["expect"]["compact_source"]]


def test_budget_path_and_semantic_hint_scenario_meets_quality_thresholds() -> None:
    """Exercise budgeted activation, path selection and semantic lookup together.

    Uses the ``budget-path-and-semantic-hints`` fixture scenario. The graph's
    nodes are upserted into an in-memory semantic index, an activation plan is
    requested with the profile's item/token limits and the expected node ids
    as priorities, and the event path and a semantic query are evaluated.
    The test checks the selected and omitted node ids, the selected path
    events, the top semantic hit, and that both coverage figures in the
    quality report equal 1.0.
    """
    fixture = _scenarios()["budget-path-and-semantic-hints"]
    parsed_graph = graph_from_markitect(fixture["graph"]).value
    memory_runtime = PhaseMemoryRuntime()
    semantic_index = InMemorySemanticIndex()
    semantic_index.upsert_nodes(list(parsed_graph.nodes))

    limits = fixture["profile"]["activation"]
    # The same expected ids serve as planning priorities and as the quality baseline.
    wanted_ids = tuple(fixture["expect"]["selected_node_ids"])
    result = memory_runtime.plan_activation(
        fixture["graph"],
        max_items=limits["max_items"],
        max_tokens=limits["max_tokens"],
        profile_id=fixture["profile"]["id"],
        priority_node_ids=wanted_ids,
    )
    memory_path = MemoryPath.from_mapping(fixture["path"])
    path_events = select_event_path(parsed_graph.events, memory_path, max_events=2)
    hits = semantic_index.query(graph_id=parsed_graph.graph_id, query="semantic restart", limit=2)
    quality = activation_quality_report(_activation_plan(result), expected_node_ids=wanted_ids)

    activation = result["data"]["activation_plan"]
    expectations = fixture["expect"]
    assert activation["selected_node_ids"] == expectations["selected_node_ids"]
    omitted_ids = [entry["id"] for entry in activation["omitted"]]
    assert omitted_ids == expectations["omitted_node_ids"]
    assert path_events == tuple(expectations["event_ids"])
    assert hits[0]["id"] == expectations["semantic_top_id"]
    assert quality["source_span_coverage"] == 1.0
    assert quality["explanation_coverage"] == 1.0


def test_evaluation_threshold_report_summarizes_all_scenarios() -> None:
    """Build a threshold report from the full scenario fixture and check it.

    Checks the report's schema version and validity, the individual metric
    counts (three scenarios, one policy denial, at least three lifecycle
    actions, one path event, one semantic hit, one budget omission), and that
    no diagnostics are produced.
    """
    fixture_text = (FIXTURES / "evaluation-scenarios.json").read_text(encoding="utf-8")
    summary = evaluation_threshold_report(json.loads(fixture_text))

    assert summary["schema_version"] == EVALUATION_REPORT_SCHEMA
    assert summary["valid"] is True
    metrics = summary["metrics"]
    assert metrics["scenario_count"] == 3
    assert metrics["policy_denial_count"] == 1
    # Lifecycle actions are only bounded from below; the other counts are exact.
    assert metrics["lifecycle_action_count"] >= 3
    assert metrics["path_event_count"] == 1
    assert metrics["semantic_hit_count"] == 1
    assert metrics["budget_omission_count"] == 1
    assert summary["diagnostics"] == []


def test_evaluation_trend_artifact_tracks_threshold_and_metric_deltas() -> None:
    """Compare the current report against a synthetic previous report.

    The previous report copies the current metrics but has a
    ``policy_denial_count`` that is one higher. The test checks the artifact's
    schema version and run id, a threshold delta of 0.0 and a metric delta of
    -1.0 for that count, and that the first diagnostic code is
    ``evaluation_metric_regressed``.
    """
    fixture_text = (FIXTURES / "evaluation-scenarios.json").read_text(encoding="utf-8")
    current = evaluation_threshold_report(json.loads(fixture_text))

    # Copy the current metrics, then bump one count so the current run looks lower.
    earlier_metrics = dict(current["metrics"])
    earlier_metrics["policy_denial_count"] = current["metrics"]["policy_denial_count"] + 1
    earlier = {"id": "previous", "metrics": earlier_metrics}

    run_info = {"run_id": "pytest", "created_at": "2026-05-19T00:00:00+00:00"}
    artifact = evaluation_trend_artifact(current, previous_report=earlier, run_metadata=run_info)

    assert artifact["schema_version"] == EVALUATION_TREND_SCHEMA
    assert artifact["run"]["run_id"] == "pytest"
    assert artifact["threshold_deltas"]["policy_denial_count"] == 0.0
    assert artifact["metric_deltas"]["policy_denial_count"] == -1.0
    first_diagnostic = artifact["diagnostics"][0]
    assert first_diagnostic["code"] == "evaluation_metric_regressed"


def test_evaluation_trend_history_persists_without_duplicate_runs(tmp_path) -> None:
    """Write trend artifacts to a history file and reload it.

    The ``first`` artifact is written twice and the ``second`` once. The test
    checks the schema version of the last write's return value, that the
    reloaded history reports a count of 2 with the second artifact as latest,
    and that ``policy_denial_count`` is among the reloaded metric keys.

    Args:
        tmp_path: pytest-provided temporary directory for the history file.
    """
    fixture_text = (FIXTURES / "evaluation-scenarios.json").read_text(encoding="utf-8")
    report = evaluation_threshold_report(json.loads(fixture_text))
    first = evaluation_trend_artifact(
        report,
        run_metadata={"run_id": "first", "created_at": "2026-05-19T00:00:00+00:00"},
    )
    second = evaluation_trend_artifact(
        report,
        previous_report=report,
        run_metadata={"run_id": "second", "created_at": "2026-05-20T00:00:00+00:00"},
    )
    history_file = tmp_path / "evaluation-trend-history.json"

    # ``first`` is deliberately written twice; only the last return value is kept.
    written = None
    for artifact in (first, first, second):
        written = write_evaluation_trend_history(history_file, artifact)
    reloaded = load_evaluation_trend_history(history_file)

    assert written["schema_version"] == EVALUATION_TREND_HISTORY_SCHEMA
    assert reloaded["count"] == 2
    assert reloaded["latest_artifact_id"] == second["id"]
    assert "policy_denial_count" in reloaded["metric_keys"]


def test_evaluation_trend_regression_gate_flags_metric_declines() -> None:
    """Run the regression gate over a two-artifact history with a lowered metric.

    The latest artifact is built from a copy of the report whose
    ``policy_denial_count`` is one lower than the baseline. The test checks the
    gate's schema version, that it is marked invalid, and that it records a
    regression of -1.0 for that count.
    """
    fixture_text = (FIXTURES / "evaluation-scenarios.json").read_text(encoding="utf-8")
    baseline = evaluation_threshold_report(json.loads(fixture_text))
    earlier_artifact = evaluation_trend_artifact(
        baseline,
        run_metadata={"run_id": "previous", "created_at": "2026-05-18T00:00:00+00:00"},
    )

    # Copy the baseline report and lower one count; the baseline itself is left untouched.
    lowered_metrics = dict(baseline["metrics"])
    lowered_metrics["policy_denial_count"] = baseline["metrics"]["policy_denial_count"] - 1
    lowered_report = dict(baseline)
    lowered_report["metrics"] = lowered_metrics

    latest_artifact = evaluation_trend_artifact(
        lowered_report,
        previous_report=baseline,
        run_metadata={"run_id": "latest", "created_at": "2026-05-19T00:00:00+00:00"},
    )
    trend_history = {
        "schema_version": EVALUATION_TREND_HISTORY_SCHEMA,
        "artifacts": [earlier_artifact, latest_artifact],
    }

    verdict = evaluation_trend_regression_gate(trend_history)

    assert verdict["schema_version"] == EVALUATION_TREND_REGRESSION_GATE_SCHEMA
    assert verdict["valid"] is False
    assert verdict["metric_regressions"]["policy_denial_count"] == -1.0


def _activation_plan(response):
    """Build an ``ActivationPlan`` from a plan-activation response mapping.

    Args:
        response: Mapping with a ``"data"`` entry holding ``"activation_plan"``
            and ``"package_request"`` sub-mappings.

    Returns:
        An ``ActivationPlan`` whose id, limit and token fields are copied from
        the activation plan data, whose id/omitted collections are converted
        to tuples, whose ``selection`` comes from the package request, and
        whose ``diagnostics`` is an empty tuple.
    """
    plan_data = response["data"]["activation_plan"]
    fields = {
        "plan_id": plan_data["plan_id"],
        "graph_id": plan_data["graph_id"],
        "selected_node_ids": tuple(plan_data["selected_node_ids"]),
        "selected_event_ids": tuple(plan_data["selected_event_ids"]),
        "omitted": tuple(plan_data["omitted"]),
        "token_estimate": plan_data["token_estimate"],
        "max_items": plan_data["max_items"],
        "max_tokens": plan_data["max_tokens"],
        # The selection lives on the package request, not on the plan itself.
        "selection": response["data"]["package_request"]["selection"],
        "diagnostics": (),
    }
    return ActivationPlan(**fields)