generated from coulomb/repo-seed
Add credential routing advisories via warden route/access, live pilot evidence helpers, managed deployment pilot probes, evaluation trend regression gates, and expanded troubleshooting. Update operator runbook and maturity scorecard.
209 lines
8.3 KiB
Python
209 lines
8.3 KiB
Python
import json
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
|
|
from phase_memory.adapters import InMemorySemanticIndex
|
|
from phase_memory.contracts import graph_from_markitect
|
|
from phase_memory.evaluation import (
|
|
EVALUATION_REPORT_SCHEMA,
|
|
EVALUATION_TREND_HISTORY_SCHEMA,
|
|
EVALUATION_TREND_REGRESSION_GATE_SCHEMA,
|
|
EVALUATION_TREND_SCHEMA,
|
|
evaluation_threshold_report,
|
|
evaluation_trend_artifact,
|
|
evaluation_trend_regression_gate,
|
|
load_evaluation_trend_history,
|
|
write_evaluation_trend_history,
|
|
)
|
|
from phase_memory.models import ActivationPlan, MemoryPath
|
|
from phase_memory.retrieval import activation_quality_report, select_event_path
|
|
from phase_memory.runtime import PhaseMemoryRuntime
|
|
|
|
|
|
FIXTURES = Path(__file__).parent / "fixtures"
|
|
|
|
|
|
def _scenarios():
    """Load the evaluation scenario fixture and index its scenarios by id.

    Reads ``evaluation-scenarios.json`` from the fixtures directory and
    returns a dict mapping each scenario's ``"id"`` to the scenario mapping.
    """
    fixture_text = (FIXTURES / "evaluation-scenarios.json").read_text(encoding="utf-8")
    by_id = {}
    for entry in json.loads(fixture_text)["scenarios"]:
        by_id[entry["id"]] = entry
    return by_id
|
|
|
|
|
|
def test_policy_denied_activation_scenario_is_redacted_and_audited() -> None:
    """Check the policy-denied activation scenario end to end.

    Plans an activation with the ``restricted`` label denied, then asserts
    that the response is valid, the denied node ids match the fixture, the
    first denial's text is redacted, exactly one policy-denied diagnostic is
    reported, and the audit query for the planning operation counts one entry.
    """
    case = _scenarios()["policy-denied-activation"]
    memory_runtime = PhaseMemoryRuntime()
    policy = {"denied_labels": ["restricted"], "secrets_allowed": False, "trust_zone": "local"}

    result = memory_runtime.plan_activation(
        case["graph"],
        max_items=4,
        max_tokens=60,
        profile_id=case["profile"]["id"],
        policy_context=policy,
    )
    # Query the audit trail only after planning so the plan call is recorded.
    audit_log = memory_runtime.query_audit({"operation": "graph.activation.plan"})

    denials = result["data"]["policy_denials"]
    denied_node_ids = [denial["id"] for denial in denials]

    assert result["valid"] is True
    assert denied_node_ids == case["expect"]["denied_node_ids"]
    assert denials[0]["text"] == "[REDACTED]"
    assert [entry["code"] for entry in result["diagnostics"]] == ["activation_policy_denied"]
    assert audit_log["count"] == 1
|
|
|
|
|
|
def test_profile_lifecycle_rules_scenario_emits_expected_actions() -> None:
    """Check that profile lifecycle planning emits the fixture's actions.

    Plans the lifecycle for the ``profile-lifecycle-rules`` scenario at a
    fixed UTC timestamp with one refreshed digest, then asserts that every
    expected ``(target_id, action)`` pair appears in the dry-run actions and
    that the first compact action names the expected source node.
    """
    case = _scenarios()["profile-lifecycle-rules"]
    memory_runtime = PhaseMemoryRuntime()
    # Fixed clock so the planned actions do not depend on the wall clock.
    frozen_now = datetime(2026, 5, 18, tzinfo=timezone.utc)

    result = memory_runtime.plan_lifecycle_with_profile(
        case["profile"],
        case["graph"],
        refresh_digests={"life.decision": "decision-new"},
        now=frozen_now,
    )

    dry_run = result["data"]["dry_run_actions"]
    emitted = [(step["target_id"], step["action"]) for step in dry_run]
    compactions = [step for step in dry_run if step["action"] == "compact"]

    assert result["valid"] is True
    for wanted in case["expect"]["actions"]:
        # Fixture pairs are JSON lists; compare them as tuples.
        assert tuple(wanted) in emitted
    assert compactions[0]["metadata"]["source_node_ids"] == [case["expect"]["compact_source"]]
|
|
|
|
|
|
def test_budget_path_and_semantic_hint_scenario_meets_quality_thresholds() -> None:
    """Check budgeted selection, path events, semantic hits and quality.

    For the ``budget-path-and-semantic-hints`` scenario this plans an
    activation under the profile's item/token limits, selects events along
    the scenario's memory path, queries an in-memory semantic index, and
    builds an activation quality report. It asserts the selected and omitted
    node ids, the selected path events, the top semantic hit, and full
    source-span and explanation coverage.
    """
    case = _scenarios()["budget-path-and-semantic-hints"]
    parsed_graph = graph_from_markitect(case["graph"]).value
    memory_runtime = PhaseMemoryRuntime()
    semantic_index = InMemorySemanticIndex()

    semantic_index.upsert_nodes(list(parsed_graph.nodes))

    limits = case["profile"]["activation"]
    expected = case["expect"]
    result = memory_runtime.plan_activation(
        case["graph"],
        max_items=limits["max_items"],
        max_tokens=limits["max_tokens"],
        profile_id=case["profile"]["id"],
        priority_node_ids=tuple(expected["selected_node_ids"]),
    )
    memory_path = MemoryPath.from_mapping(case["path"])
    path_events = select_event_path(parsed_graph.events, memory_path, max_events=2)
    hits = semantic_index.query(graph_id=parsed_graph.graph_id, query="semantic restart", limit=2)
    quality = activation_quality_report(
        _activation_plan(result),
        expected_node_ids=tuple(expected["selected_node_ids"]),
    )

    serialized_plan = result["data"]["activation_plan"]
    omitted_ids = [entry["id"] for entry in serialized_plan["omitted"]]

    assert serialized_plan["selected_node_ids"] == expected["selected_node_ids"]
    assert omitted_ids == expected["omitted_node_ids"]
    assert path_events == tuple(expected["event_ids"])
    assert hits[0]["id"] == expected["semantic_top_id"]
    assert quality["source_span_coverage"] == 1.0
    assert quality["explanation_coverage"] == 1.0
|
|
|
|
|
|
def test_evaluation_threshold_report_summarizes_all_scenarios() -> None:
    """Check the threshold report built from the full scenario fixture.

    Asserts the report's schema version and validity, the per-metric counts
    for the three fixture scenarios, and that no diagnostics are emitted.
    """
    fixture_path = FIXTURES / "evaluation-scenarios.json"
    scenario_document = json.loads(fixture_path.read_text(encoding="utf-8"))

    summary = evaluation_threshold_report(scenario_document)

    assert summary["schema_version"] == EVALUATION_REPORT_SCHEMA
    assert summary["valid"] is True

    metrics = summary["metrics"]
    assert metrics["scenario_count"] == 3
    assert metrics["policy_denial_count"] == 1
    # Lifecycle actions have a lower bound only; the other counts are exact.
    assert metrics["lifecycle_action_count"] >= 3
    assert metrics["path_event_count"] == 1
    assert metrics["semantic_hit_count"] == 1
    assert metrics["budget_omission_count"] == 1
    assert summary["diagnostics"] == []
|
|
|
|
|
|
def test_evaluation_trend_artifact_tracks_threshold_and_metric_deltas() -> None:
    """Check deltas in a trend artifact built against a previous report.

    The previous report is the current one with ``policy_denial_count``
    raised by one. The test asserts the artifact's schema version and run id,
    a threshold delta of 0.0 and a metric delta of -1.0 for that metric, and
    that the first diagnostic is ``evaluation_metric_regressed``.
    """
    fixture_path = FIXTURES / "evaluation-scenarios.json"
    current = evaluation_threshold_report(json.loads(fixture_path.read_text(encoding="utf-8")))

    # Copy the current metrics and bump one so the current run looks lower.
    earlier_metrics = dict(current["metrics"])
    earlier_metrics["policy_denial_count"] = current["metrics"]["policy_denial_count"] + 1
    earlier = {"id": "previous", "metrics": earlier_metrics}

    artifact = evaluation_trend_artifact(
        current,
        previous_report=earlier,
        run_metadata={"run_id": "pytest", "created_at": "2026-05-19T00:00:00+00:00"},
    )

    assert artifact["schema_version"] == EVALUATION_TREND_SCHEMA
    assert artifact["run"]["run_id"] == "pytest"
    assert artifact["threshold_deltas"]["policy_denial_count"] == 0.0
    assert artifact["metric_deltas"]["policy_denial_count"] == -1.0
    assert artifact["diagnostics"][0]["code"] == "evaluation_metric_regressed"
|
|
|
|
|
|
def test_evaluation_trend_history_persists_without_duplicate_runs(tmp_path) -> None:
    """Check that trend history is persisted and repeated runs are not duplicated.

    Writes the first artifact twice and the second once to a history file
    under ``tmp_path``, reloads the file, and asserts the history schema
    version, a count of two, the latest artifact id, and that
    ``policy_denial_count`` is among the recorded metric keys.
    """
    fixture_path = FIXTURES / "evaluation-scenarios.json"
    summary = evaluation_threshold_report(json.loads(fixture_path.read_text(encoding="utf-8")))
    run_one = evaluation_trend_artifact(
        summary,
        run_metadata={"run_id": "first", "created_at": "2026-05-19T00:00:00+00:00"},
    )
    run_two = evaluation_trend_artifact(
        summary,
        previous_report=summary,
        run_metadata={"run_id": "second", "created_at": "2026-05-20T00:00:00+00:00"},
    )
    history_file = tmp_path / "evaluation-trend-history.json"

    # The first run is written twice on purpose to exercise de-duplication.
    for artifact in (run_one, run_one, run_two):
        history = write_evaluation_trend_history(history_file, artifact)
    reloaded = load_evaluation_trend_history(history_file)

    assert history["schema_version"] == EVALUATION_TREND_HISTORY_SCHEMA
    assert reloaded["count"] == 2
    assert reloaded["latest_artifact_id"] == run_two["id"]
    assert "policy_denial_count" in reloaded["metric_keys"]
|
|
|
|
|
|
def test_evaluation_trend_regression_gate_flags_metric_declines() -> None:
    """Check that the regression gate rejects a history with a metric decline.

    Builds a two-artifact history whose latest run has
    ``policy_denial_count`` lowered by one relative to the baseline report,
    then asserts the gate's schema version, that the gate is not valid, and
    that it records a -1.0 regression for that metric.
    """
    fixture_path = FIXTURES / "evaluation-scenarios.json"
    baseline = evaluation_threshold_report(json.loads(fixture_path.read_text(encoding="utf-8")))
    earlier_artifact = evaluation_trend_artifact(
        baseline,
        run_metadata={"run_id": "previous", "created_at": "2026-05-18T00:00:00+00:00"},
    )

    # Shallow-copy the baseline and swap in metrics with one count lowered.
    lowered_metrics = dict(baseline["metrics"])
    lowered_metrics["policy_denial_count"] = baseline["metrics"]["policy_denial_count"] - 1
    declined = dict(baseline)
    declined["metrics"] = lowered_metrics

    latest_artifact = evaluation_trend_artifact(
        declined,
        previous_report=baseline,
        run_metadata={"run_id": "latest", "created_at": "2026-05-19T00:00:00+00:00"},
    )
    trend_history = {
        "schema_version": EVALUATION_TREND_HISTORY_SCHEMA,
        "artifacts": [earlier_artifact, latest_artifact],
    }

    verdict = evaluation_trend_regression_gate(trend_history)

    assert verdict["schema_version"] == EVALUATION_TREND_REGRESSION_GATE_SCHEMA
    assert verdict["valid"] is False
    assert verdict["metric_regressions"]["policy_denial_count"] == -1.0
|
|
|
|
|
|
def _activation_plan(response):
    """Rebuild an ``ActivationPlan`` from a serialized activation response.

    Reads the plan fields from ``response["data"]["activation_plan"]`` and
    the selection from ``response["data"]["package_request"]["selection"]``.
    Sequence fields are converted to tuples and diagnostics are left empty.
    """
    payload = response["data"]
    serialized = payload["activation_plan"]

    # Fields copied through unchanged.
    passthrough = ("plan_id", "graph_id", "token_estimate", "max_items", "max_tokens")
    # Fields passed to the plan as tuples.
    as_tuples = ("selected_node_ids", "selected_event_ids", "omitted")

    fields = {name: serialized[name] for name in passthrough}
    for name in as_tuples:
        fields[name] = tuple(serialized[name])

    return ActivationPlan(
        selection=payload["package_request"]["selection"],
        diagnostics=(),
        **fields,
    )
|