Files
phase-memory/tests/test_evaluation_scenarios.py

176 lines
7.2 KiB
Python

import json
from datetime import datetime, timezone
from pathlib import Path
from phase_memory.adapters import InMemorySemanticIndex
from phase_memory.contracts import graph_from_markitect
from phase_memory.evaluation import (
EVALUATION_REPORT_SCHEMA,
EVALUATION_TREND_HISTORY_SCHEMA,
EVALUATION_TREND_SCHEMA,
evaluation_threshold_report,
evaluation_trend_artifact,
load_evaluation_trend_history,
write_evaluation_trend_history,
)
from phase_memory.models import ActivationPlan, MemoryPath
from phase_memory.retrieval import activation_quality_report, select_event_path
from phase_memory.runtime import PhaseMemoryRuntime
# Directory holding the JSON fixture files, resolved next to this test module.
FIXTURES = Path(__file__).parent.joinpath("fixtures")
def _scenarios():
    """Load ``evaluation-scenarios.json`` and return its scenarios keyed by ``id``.

    The fixture is re-read on every call. When two scenarios share an id,
    the later entry in the file replaces the earlier one.
    """
    fixture_file = FIXTURES / "evaluation-scenarios.json"
    payload = json.loads(fixture_file.read_text(encoding="utf-8"))
    by_id = {}
    for entry in payload["scenarios"]:
        by_id[entry["id"]] = entry
    return by_id
def test_policy_denied_activation_scenario_is_redacted_and_audited() -> None:
    """Plan an activation under a denying policy and check the reported outcome.

    Asserts that the response is valid, that the denied node ids match the
    scenario expectation, that the first denial carries redacted text, that
    exactly one ``activation_policy_denied`` diagnostic is emitted, and that
    the audit query for the planning operation reports a single record.
    """
    case = _scenarios()["policy-denied-activation"]
    runtime = PhaseMemoryRuntime()
    policy_context = {
        "denied_labels": ["restricted"],
        "secrets_allowed": False,
        "trust_zone": "local",
    }

    response = runtime.plan_activation(
        case["graph"],
        max_items=4,
        max_tokens=60,
        profile_id=case["profile"]["id"],
        policy_context=policy_context,
    )
    # Query the audit trail only after planning so the plan call is recorded.
    audit = runtime.query_audit({"operation": "graph.activation.plan"})

    denials = response["data"]["policy_denials"]
    denied_ids = [denial["id"] for denial in denials]

    assert response["valid"] is True
    assert denied_ids == case["expect"]["denied_node_ids"]
    assert denials[0]["text"] == "[REDACTED]"
    assert [entry["code"] for entry in response["diagnostics"]] == ["activation_policy_denied"]
    assert audit["count"] == 1
def test_profile_lifecycle_rules_scenario_emits_expected_actions() -> None:
    """Plan lifecycle actions for the profile scenario and check the dry run.

    Every ``(target_id, action)`` pair listed in the scenario expectation must
    appear among the dry-run actions, and the first ``compact`` action must
    name the expected source node in its metadata.
    """
    case = _scenarios()["profile-lifecycle-rules"]
    runtime = PhaseMemoryRuntime()
    # Fixed, timezone-aware clock passed to the planner as ``now``.
    frozen_now = datetime(2026, 5, 18, tzinfo=timezone.utc)

    response = runtime.plan_lifecycle_with_profile(
        case["profile"],
        case["graph"],
        refresh_digests={"life.decision": "decision-new"},
        now=frozen_now,
    )

    dry_run = response["data"]["dry_run_actions"]
    emitted_pairs = [(entry["target_id"], entry["action"]) for entry in dry_run]
    compactions = [entry for entry in dry_run if entry["action"] == "compact"]

    assert response["valid"] is True
    for wanted in case["expect"]["actions"]:
        # Fixture pairs are JSON arrays (lists); convert to compare with tuples.
        assert tuple(wanted) in emitted_pairs
    assert compactions[0]["metadata"]["source_node_ids"] == [case["expect"]["compact_source"]]
def test_budget_path_and_semantic_hint_scenario_meets_quality_thresholds() -> None:
    """Exercise budgeted activation, event-path selection and semantic lookup.

    Uses the ``budget-path-and-semantic-hints`` scenario to check which nodes
    are selected and omitted by the activation plan, which events the memory
    path selects, which node ranks first for the semantic query, and that the
    activation quality report shows full source-span and explanation coverage.
    """
    case = _scenarios()["budget-path-and-semantic-hints"]
    graph = graph_from_markitect(case["graph"]).value
    runtime = PhaseMemoryRuntime()
    index = InMemorySemanticIndex()
    index.upsert_nodes(list(graph.nodes))

    limits = case["profile"]["activation"]
    expected = case["expect"]
    response = runtime.plan_activation(
        case["graph"],
        max_items=limits["max_items"],
        max_tokens=limits["max_tokens"],
        profile_id=case["profile"]["id"],
        priority_node_ids=tuple(expected["selected_node_ids"]),
    )

    memory_path = MemoryPath.from_mapping(case["path"])
    path_events = select_event_path(graph.events, memory_path, max_events=2)
    semantic_hits = index.query(graph_id=graph.graph_id, query="semantic restart", limit=2)
    quality = activation_quality_report(
        _activation_plan(response),
        expected_node_ids=tuple(expected["selected_node_ids"]),
    )
    plan = response["data"]["activation_plan"]
    omitted_ids = [entry["id"] for entry in plan["omitted"]]

    assert plan["selected_node_ids"] == expected["selected_node_ids"]
    assert omitted_ids == expected["omitted_node_ids"]
    assert path_events == tuple(expected["event_ids"])
    assert semantic_hits[0]["id"] == expected["semantic_top_id"]
    assert quality["source_span_coverage"] == 1.0
    assert quality["explanation_coverage"] == 1.0
def test_evaluation_threshold_report_summarizes_all_scenarios() -> None:
    """Build the threshold report from the full fixture and check its summary.

    The report must carry the report schema version, be valid, count three
    scenarios, expose the expected per-category metrics, and contain no
    diagnostics.
    """
    fixture_text = (FIXTURES / "evaluation-scenarios.json").read_text(encoding="utf-8")
    report = evaluation_threshold_report(json.loads(fixture_text))

    assert report["schema_version"] == EVALUATION_REPORT_SCHEMA
    assert report["valid"] is True

    metrics = report["metrics"]
    assert metrics["scenario_count"] == 3
    assert metrics["policy_denial_count"] == 1
    assert metrics["lifecycle_action_count"] >= 3
    # Each of these metrics is expected to be exactly one for this fixture.
    for metric_name in ("path_event_count", "semantic_hit_count", "budget_omission_count"):
        assert metrics[metric_name] == 1
    assert report["diagnostics"] == []
def test_evaluation_trend_artifact_tracks_threshold_and_metric_deltas() -> None:
    """Compare the current report against a synthetic previous run.

    The previous run copies the current metrics but reports one more policy
    denial. The trend artifact is then expected to show a metric delta of
    ``-1.0`` and a threshold delta of ``0.0`` for ``policy_denial_count``, and
    its first diagnostic must be ``evaluation_metric_regressed``.
    """
    fixture_text = (FIXTURES / "evaluation-scenarios.json").read_text(encoding="utf-8")
    report = evaluation_threshold_report(json.loads(fixture_text))

    # Copy the current metrics, then raise the denial count by one.
    earlier_metrics = dict(report["metrics"])
    earlier_metrics["policy_denial_count"] = report["metrics"]["policy_denial_count"] + 1
    previous = {"id": "previous", "metrics": earlier_metrics}

    run_metadata = {"run_id": "pytest", "created_at": "2026-05-19T00:00:00+00:00"}
    trend = evaluation_trend_artifact(
        report,
        previous_report=previous,
        run_metadata=run_metadata,
    )

    assert trend["schema_version"] == EVALUATION_TREND_SCHEMA
    assert trend["run"]["run_id"] == "pytest"
    assert trend["threshold_deltas"]["policy_denial_count"] == 0.0
    assert trend["metric_deltas"]["policy_denial_count"] == -1.0
    assert trend["diagnostics"][0]["code"] == "evaluation_metric_regressed"
def test_evaluation_trend_history_persists_without_duplicate_runs(tmp_path) -> None:
    """Write trend artifacts to a history file and reload it.

    The ``first`` artifact is written twice and ``second`` once; the reloaded
    history must report two entries, name ``second`` as the latest artifact,
    and list ``policy_denial_count`` among its metric keys.
    """
    fixture_text = (FIXTURES / "evaluation-scenarios.json").read_text(encoding="utf-8")
    report = evaluation_threshold_report(json.loads(fixture_text))

    first = evaluation_trend_artifact(
        report,
        run_metadata={"run_id": "first", "created_at": "2026-05-19T00:00:00+00:00"},
    )
    second = evaluation_trend_artifact(
        report,
        previous_report=report,
        run_metadata={"run_id": "second", "created_at": "2026-05-20T00:00:00+00:00"},
    )

    history_file = tmp_path / "evaluation-trend-history.json"
    # ``first`` appears twice on purpose: the count assertion below expects
    # the repeated write not to add a second entry.
    for artifact in (first, first, second):
        history = write_evaluation_trend_history(history_file, artifact)
    reloaded = load_evaluation_trend_history(history_file)

    assert history["schema_version"] == EVALUATION_TREND_HISTORY_SCHEMA
    assert reloaded["count"] == 2
    assert reloaded["latest_artifact_id"] == second["id"]
    assert "policy_denial_count" in reloaded["metric_keys"]
def _activation_plan(response):
    """Rebuild an ``ActivationPlan`` from a ``plan_activation`` response mapping.

    Plan fields are read from ``response["data"]["activation_plan"]``; the
    sequence-valued ones are converted to tuples. ``selection`` comes from
    ``response["data"]["package_request"]`` and ``diagnostics`` is always
    passed as an empty tuple.
    """
    payload = response["data"]
    serialized = payload["activation_plan"]

    copied_as_is = ("plan_id", "graph_id", "token_estimate", "max_items", "max_tokens")
    copied_as_tuple = ("selected_node_ids", "selected_event_ids", "omitted")

    kwargs = {field: serialized[field] for field in copied_as_is}
    kwargs.update({field: tuple(serialized[field]) for field in copied_as_tuple})
    kwargs["selection"] = payload["package_request"]["selection"]
    kwargs["diagnostics"] = ()
    return ActivationPlan(**kwargs)