generated from coulomb/repo-seed
176 lines
7.2 KiB
Python
176 lines
7.2 KiB
Python
import json
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
from phase_memory.adapters import InMemorySemanticIndex
|
|
from phase_memory.contracts import graph_from_markitect
|
|
from phase_memory.evaluation import (
|
|
EVALUATION_REPORT_SCHEMA,
|
|
EVALUATION_TREND_HISTORY_SCHEMA,
|
|
EVALUATION_TREND_SCHEMA,
|
|
evaluation_threshold_report,
|
|
evaluation_trend_artifact,
|
|
load_evaluation_trend_history,
|
|
write_evaluation_trend_history,
|
|
)
|
|
from phase_memory.models import ActivationPlan, MemoryPath
|
|
from phase_memory.retrieval import activation_quality_report, select_event_path
|
|
from phase_memory.runtime import PhaseMemoryRuntime
|
|
|
|
|
|
# Fixture directory located next to this module; the tests below read
# "evaluation-scenarios.json" from it.
FIXTURES = Path(__file__).parent / "fixtures"
|
|
|
|
|
|
def _scenarios():
    """Load the evaluation scenario fixture and index its scenarios by ``id``.

    Returns:
        A dict mapping each scenario's ``"id"`` value to the scenario mapping
        itself, built from the ``"scenarios"`` list of the fixture file.
    """
    fixture_file = FIXTURES / "evaluation-scenarios.json"
    with fixture_file.open(encoding="utf-8") as handle:
        document = json.load(handle)

    by_id = {}
    for entry in document["scenarios"]:
        # Later entries overwrite earlier ones that share an id.
        by_id[entry["id"]] = entry
    return by_id
|
|
|
|
|
|
def test_policy_denied_activation_scenario_is_redacted_and_audited() -> None:
    """Plan activation under a denying policy and check the reported denials.

    Asserts that the response is valid, that the denied node ids match the
    scenario expectation, that the first denial's text is "[REDACTED]", that
    the only diagnostic code is "activation_policy_denied", and that the audit
    query for the activation-plan operation reports a count of 1.
    """
    case = _scenarios()["policy-denied-activation"]
    memory_runtime = PhaseMemoryRuntime()
    policy = {"denied_labels": ["restricted"], "secrets_allowed": False, "trust_zone": "local"}

    result = memory_runtime.plan_activation(
        case["graph"],
        max_items=4,
        max_tokens=60,
        profile_id=case["profile"]["id"],
        policy_context=policy,
    )
    # Query the audit trail only after the activation has been planned.
    audit_view = memory_runtime.query_audit({"operation": "graph.activation.plan"})

    denials = result["data"]["policy_denials"]
    denied_node_ids = [denial["id"] for denial in denials]

    assert result["valid"] is True
    assert denied_node_ids == case["expect"]["denied_node_ids"]
    assert denials[0]["text"] == "[REDACTED]"
    diagnostic_codes = [entry["code"] for entry in result["diagnostics"]]
    assert diagnostic_codes == ["activation_policy_denied"]
    assert audit_view["count"] == 1
|
|
|
|
|
|
def test_profile_lifecycle_rules_scenario_emits_expected_actions() -> None:
    """Plan a lifecycle dry run for the profile scenario and check its actions.

    Asserts that the response is valid, that every expected
    ``(target_id, action)`` pair from the scenario appears among the dry-run
    actions, and that the first "compact" action lists exactly the expected
    source node id in its metadata.
    """
    case = _scenarios()["profile-lifecycle-rules"]
    memory_runtime = PhaseMemoryRuntime()
    fixed_now = datetime(2026, 5, 18, tzinfo=timezone.utc)

    planned = memory_runtime.plan_lifecycle_with_profile(
        case["profile"],
        case["graph"],
        refresh_digests={"life.decision": "decision-new"},
        now=fixed_now,
    )

    dry_run = planned["data"]["dry_run_actions"]
    seen_pairs = [(entry["target_id"], entry["action"]) for entry in dry_run]
    compactions = [entry for entry in dry_run if entry["action"] == "compact"]

    assert planned["valid"] is True
    for wanted in case["expect"]["actions"]:
        # Fixture pairs are JSON lists; convert to tuples to compare.
        assert tuple(wanted) in seen_pairs
    first_compaction_sources = compactions[0]["metadata"]["source_node_ids"]
    assert first_compaction_sources == [case["expect"]["compact_source"]]
|
|
|
|
|
|
def test_budget_path_and_semantic_hint_scenario_meets_quality_thresholds() -> None:
    """Exercise budgeted activation, event-path selection and semantic lookup.

    Asserts that the selected and omitted node ids match the scenario
    expectation, that the selected path events equal the expected event ids,
    that the top semantic result has the expected id, and that the quality
    report shows source-span and explanation coverage of 1.0.
    """
    case = _scenarios()["budget-path-and-semantic-hints"]
    parsed_graph = graph_from_markitect(case["graph"]).value
    memory_runtime = PhaseMemoryRuntime()
    semantic_index = InMemorySemanticIndex()

    # Index the nodes before planning, mirroring the scenario's setup order.
    semantic_index.upsert_nodes(list(parsed_graph.nodes))

    budget = case["profile"]["activation"]
    expected = case["expect"]
    expected_selected = tuple(expected["selected_node_ids"])

    result = memory_runtime.plan_activation(
        case["graph"],
        max_items=budget["max_items"],
        max_tokens=budget["max_tokens"],
        profile_id=case["profile"]["id"],
        priority_node_ids=expected_selected,
    )
    memory_path = MemoryPath.from_mapping(case["path"])
    path_events = select_event_path(parsed_graph.events, memory_path, max_events=2)
    hits = semantic_index.query(graph_id=parsed_graph.graph_id, query="semantic restart", limit=2)
    quality = activation_quality_report(_activation_plan(result), expected_node_ids=expected_selected)

    activation = result["data"]["activation_plan"]
    assert activation["selected_node_ids"] == expected["selected_node_ids"]
    omitted_ids = [entry["id"] for entry in activation["omitted"]]
    assert omitted_ids == expected["omitted_node_ids"]
    assert path_events == tuple(expected["event_ids"])
    assert hits[0]["id"] == expected["semantic_top_id"]
    assert quality["source_span_coverage"] == 1.0
    assert quality["explanation_coverage"] == 1.0
|
|
|
|
|
|
def test_evaluation_threshold_report_summarizes_all_scenarios() -> None:
    """Build the threshold report for the whole fixture and check its summary.

    Asserts the report's schema version and validity, the per-metric counts
    recorded under ``"metrics"``, and that no diagnostics were produced.
    """
    with (FIXTURES / "evaluation-scenarios.json").open(encoding="utf-8") as handle:
        fixture = json.load(handle)

    summary = evaluation_threshold_report(fixture)

    assert summary["schema_version"] == EVALUATION_REPORT_SCHEMA
    assert summary["valid"] is True

    metrics = summary["metrics"]
    assert metrics["scenario_count"] == 3
    assert metrics["policy_denial_count"] == 1
    # Lifecycle actions are only bounded from below.
    assert metrics["lifecycle_action_count"] >= 3
    assert metrics["path_event_count"] == 1
    assert metrics["semantic_hit_count"] == 1
    assert metrics["budget_omission_count"] == 1
    assert summary["diagnostics"] == []
|
|
|
|
|
|
def test_evaluation_trend_artifact_tracks_threshold_and_metric_deltas() -> None:
    """Compare the current report against a baseline with one extra denial.

    The baseline copies the current metrics but raises
    ``policy_denial_count`` by one. Asserts the trend's schema version and
    run id, a threshold delta of 0.0 and a metric delta of -1.0 for
    ``policy_denial_count``, and that the first diagnostic code is
    "evaluation_metric_regressed".
    """
    with (FIXTURES / "evaluation-scenarios.json").open(encoding="utf-8") as handle:
        fixture = json.load(handle)
    current = evaluation_threshold_report(fixture)

    # Baseline metrics: identical to the current run except for one more denial.
    baseline_metrics = {**current["metrics"]}
    baseline_metrics["policy_denial_count"] = current["metrics"]["policy_denial_count"] + 1
    baseline = {"id": "previous", "metrics": baseline_metrics}

    run_info = {"run_id": "pytest", "created_at": "2026-05-19T00:00:00+00:00"}
    artifact = evaluation_trend_artifact(
        current,
        previous_report=baseline,
        run_metadata=run_info,
    )

    assert artifact["schema_version"] == EVALUATION_TREND_SCHEMA
    assert artifact["run"]["run_id"] == "pytest"
    assert artifact["threshold_deltas"]["policy_denial_count"] == 0.0
    assert artifact["metric_deltas"]["policy_denial_count"] == -1.0
    assert artifact["diagnostics"][0]["code"] == "evaluation_metric_regressed"
|
|
|
|
|
|
def test_evaluation_trend_history_persists_without_duplicate_runs(tmp_path) -> None:
    """Write trend artifacts to a history file and reload it.

    The "first" artifact is written twice and the "second" once. Asserts the
    history's schema version, that the reloaded history counts 2 entries,
    that its latest artifact id is the second artifact's id, and that
    ``policy_denial_count`` is among its metric keys.

    Args:
        tmp_path: pytest-provided temporary directory for the history file.
    """
    with (FIXTURES / "evaluation-scenarios.json").open(encoding="utf-8") as handle:
        fixture = json.load(handle)
    current = evaluation_threshold_report(fixture)

    first = evaluation_trend_artifact(
        current,
        run_metadata={"run_id": "first", "created_at": "2026-05-19T00:00:00+00:00"},
    )
    second = evaluation_trend_artifact(
        current,
        previous_report=current,
        run_metadata={"run_id": "second", "created_at": "2026-05-20T00:00:00+00:00"},
    )
    history_file = tmp_path / "evaluation-trend-history.json"

    # The repeated "first" write is deliberate: the count below must stay at 2.
    for artifact in (first, first, second):
        history = write_evaluation_trend_history(history_file, artifact)
    reloaded = load_evaluation_trend_history(history_file)

    assert history["schema_version"] == EVALUATION_TREND_HISTORY_SCHEMA
    assert reloaded["count"] == 2
    assert reloaded["latest_artifact_id"] == second["id"]
    assert "policy_denial_count" in reloaded["metric_keys"]
|
|
|
|
|
|
def _activation_plan(response):
    """Rebuild an ``ActivationPlan`` from an activation response mapping.

    Args:
        response: Mapping whose ``"data"`` entry holds an ``"activation_plan"``
            mapping and a ``"package_request"`` mapping with a ``"selection"``.

    Returns:
        An ``ActivationPlan`` populated from the response, with the sequence
        fields converted to tuples and an empty ``diagnostics`` tuple.
    """
    payload = response["data"]
    raw_plan = payload["activation_plan"]

    # Fields copied through unchanged.
    fields = {
        key: raw_plan[key]
        for key in ("plan_id", "graph_id", "token_estimate", "max_items", "max_tokens")
    }
    # Sequence fields are passed to the model as tuples.
    for key in ("selected_node_ids", "selected_event_ids", "omitted"):
        fields[key] = tuple(raw_plan[key])

    return ActivationPlan(
        selection=payload["package_request"]["selection"],
        diagnostics=(),
        **fields,
    )
|