generated from coulomb/repo-seed
Implement refinement hardening workplan
This commit is contained in:
170
tests/fixtures/evaluation-scenarios.json
vendored
Normal file
170
tests/fixtures/evaluation-scenarios.json
vendored
Normal file
@@ -0,0 +1,170 @@
|
||||
{
|
||||
"schema_version": "phase_memory.evaluation_scenarios.v1",
|
||||
"scenarios": [
|
||||
{
|
||||
"id": "policy-denied-activation",
|
||||
"profile": {
|
||||
"schema_version": "markitect.memory.profile.v1",
|
||||
"id": "eval-policy-profile",
|
||||
"memory_kinds": ["knowledge", "decision"],
|
||||
"activation": {"max_items": 4, "max_tokens": 60},
|
||||
"policy": {"mode": "allow-all", "trust_zone_labels": ["local"]},
|
||||
"observability": {"audit_sink": "recording"}
|
||||
},
|
||||
"graph": {
|
||||
"schema_version": "markitect.memory.graph.v1",
|
||||
"id": "eval-policy-graph",
|
||||
"nodes": [
|
||||
{
|
||||
"id": "policy.public",
|
||||
"kind": "knowledge",
|
||||
"text": "Public operating constraint that can be activated for local planning.",
|
||||
"phase": "stabilized",
|
||||
"policy": {"labels": ["public"], "trust_zone": "local"},
|
||||
"source_spans": [{"path": "policy.md", "line_start": 1}],
|
||||
"metadata": {"graph_id": "eval-policy-graph"}
|
||||
},
|
||||
{
|
||||
"id": "policy.secret",
|
||||
"kind": "knowledge",
|
||||
"text": "Sensitive credential note that must not enter restart context.",
|
||||
"phase": "stabilized",
|
||||
"policy": {"labels": ["restricted"], "trust_zone": "local", "secret": true},
|
||||
"metadata": {"graph_id": "eval-policy-graph"}
|
||||
}
|
||||
],
|
||||
"edges": [
|
||||
{
|
||||
"id": "edge.policy",
|
||||
"kind": "references",
|
||||
"source": "policy.public",
|
||||
"target": "policy.secret"
|
||||
}
|
||||
],
|
||||
"events": []
|
||||
},
|
||||
"expect": {"denied_node_ids": ["policy.secret"]}
|
||||
},
|
||||
{
|
||||
"id": "profile-lifecycle-rules",
|
||||
"profile": {
|
||||
"schema_version": "markitect.memory.profile.v1",
|
||||
"id": "eval-lifecycle-profile",
|
||||
"memory_kinds": ["episode", "decision"],
|
||||
"retention": {
|
||||
"episode": {"stale_after_days": 7},
|
||||
"decision": {"delete_after_days": 365}
|
||||
},
|
||||
"refresh": {"mode": "enabled"},
|
||||
"compaction": {"node_ids": ["life.old-episode"]},
|
||||
"metadata": {
|
||||
"phase_transitions": [
|
||||
{
|
||||
"node_kind": "decision",
|
||||
"from_phase": "fluid",
|
||||
"to_phase": "stabilized",
|
||||
"min_age_days": 2,
|
||||
"reason": "decision has stabilized"
|
||||
}
|
||||
]
|
||||
}
|
||||
},
|
||||
"graph": {
|
||||
"schema_version": "markitect.memory.graph.v1",
|
||||
"id": "eval-lifecycle-graph",
|
||||
"nodes": [
|
||||
{
|
||||
"id": "life.old-episode",
|
||||
"kind": "episode",
|
||||
"text": "An old episode ready to become stale and compacted.",
|
||||
"phase": "fluid",
|
||||
"freshness": {"updated_at": "2026-04-01T00:00:00+00:00", "source_digest": "old"},
|
||||
"metadata": {"graph_id": "eval-lifecycle-graph"}
|
||||
},
|
||||
{
|
||||
"id": "life.decision",
|
||||
"kind": "decision",
|
||||
"text": "A decision that should transition to stabilized after review.",
|
||||
"phase": "fluid",
|
||||
"freshness": {"updated_at": "2026-05-01T00:00:00+00:00", "source_digest": "decision-old"},
|
||||
"metadata": {"graph_id": "eval-lifecycle-graph"}
|
||||
}
|
||||
],
|
||||
"edges": [],
|
||||
"events": []
|
||||
},
|
||||
"expect": {
|
||||
"actions": [
|
||||
["life.old-episode", "mark_stale"],
|
||||
["life.decision", "transition_phase"],
|
||||
["life.decision", "refresh"]
|
||||
],
|
||||
"compact_source": "life.old-episode"
|
||||
}
|
||||
},
|
||||
{
|
||||
"id": "budget-path-and-semantic-hints",
|
||||
"profile": {
|
||||
"schema_version": "markitect.memory.profile.v1",
|
||||
"id": "eval-budget-profile",
|
||||
"memory_kinds": ["decision", "knowledge", "episode"],
|
||||
"activation": {"max_items": 2, "max_tokens": 16, "semantic_index": "memory"}
|
||||
},
|
||||
"graph": {
|
||||
"schema_version": "markitect.memory.graph.v1",
|
||||
"id": "eval-budget-graph",
|
||||
"nodes": [
|
||||
{
|
||||
"id": "budget.anchor",
|
||||
"kind": "decision",
|
||||
"text": "Restart anchor with source.",
|
||||
"phase": "stabilized",
|
||||
"source_spans": [{"path": "restart.md", "line_start": 3}],
|
||||
"metadata": {"graph_id": "eval-budget-graph"}
|
||||
},
|
||||
{
|
||||
"id": "budget.semantic",
|
||||
"kind": "knowledge",
|
||||
"text": "Semantic index hint for restart package selection.",
|
||||
"phase": "stabilized",
|
||||
"source_spans": [{"path": "retrieval.md", "line_start": 7}],
|
||||
"metadata": {"graph_id": "eval-budget-graph"}
|
||||
},
|
||||
{
|
||||
"id": "budget.long",
|
||||
"kind": "episode",
|
||||
"text": "This verbose episode is intentionally long enough to lose against the strict activation token budget pressure.",
|
||||
"phase": "fluid",
|
||||
"metadata": {"graph_id": "eval-budget-graph"}
|
||||
}
|
||||
],
|
||||
"edges": [
|
||||
{
|
||||
"id": "edge.budget",
|
||||
"kind": "supports",
|
||||
"source": "budget.anchor",
|
||||
"target": "budget.semantic"
|
||||
}
|
||||
],
|
||||
"events": [
|
||||
{
|
||||
"id": "budget.path-event",
|
||||
"kind": "activated",
|
||||
"timestamp": "2026-05-18T00:00:00+00:00",
|
||||
"activation_refs": ["activation.budget"]
|
||||
}
|
||||
]
|
||||
},
|
||||
"path": {
|
||||
"id": "path.budget",
|
||||
"event_ids": ["budget.path-event"]
|
||||
},
|
||||
"expect": {
|
||||
"selected_node_ids": ["budget.anchor", "budget.semantic"],
|
||||
"omitted_node_ids": ["budget.long"],
|
||||
"semantic_top_id": "budget.semantic",
|
||||
"event_ids": ["budget.path-event"]
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
101
tests/test_evaluation_scenarios.py
Normal file
101
tests/test_evaluation_scenarios.py
Normal file
@@ -0,0 +1,101 @@
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from phase_memory.adapters import InMemorySemanticIndex
|
||||
from phase_memory.contracts import graph_from_markitect
|
||||
from phase_memory.models import ActivationPlan, MemoryPath
|
||||
from phase_memory.retrieval import activation_quality_report, select_event_path
|
||||
from phase_memory.runtime import PhaseMemoryRuntime
|
||||
|
||||
|
||||
FIXTURES = Path(__file__).parent / "fixtures"
|
||||
|
||||
|
||||
def _scenarios():
    """Load the evaluation-scenarios fixture and index its scenarios by id.

    Returns:
        A dict mapping each scenario's ``"id"`` to the scenario mapping itself,
        as parsed from ``evaluation-scenarios.json`` under ``FIXTURES``.
    """
    fixture_path = FIXTURES / "evaluation-scenarios.json"
    payload = json.loads(fixture_path.read_text(encoding="utf-8"))
    by_id = {}
    for entry in payload["scenarios"]:
        by_id[entry["id"]] = entry
    return by_id
|
||||
|
||||
|
||||
def test_policy_denied_activation_scenario_is_redacted_and_audited() -> None:
    """Plan activation under a denying policy context and check the denial report.

    The graph and expectations come from the ``policy-denied-activation``
    fixture scenario. The call passes a policy context that lists
    ``restricted`` as a denied label and sets ``secrets_allowed`` to False.
    The test then asserts that the response is valid, that the denied ids match
    the fixture, that the first denial's text is ``[REDACTED]``, that the only
    diagnostic code is ``activation_policy_denied``, and that the audit query
    for ``graph.activation.plan`` counts exactly one record.
    """
    scenario = _scenarios()["policy-denied-activation"]
    # Read the budget from the fixture profile (as the budget scenario test
    # does) so the limits cannot drift from what the fixture declares.
    budget = scenario["profile"]["activation"]
    runtime = PhaseMemoryRuntime()

    response = runtime.plan_activation(
        scenario["graph"],
        max_items=budget["max_items"],
        max_tokens=budget["max_tokens"],
        profile_id=scenario["profile"]["id"],
        policy_context={"denied_labels": ["restricted"], "secrets_allowed": False, "trust_zone": "local"},
    )
    audit = runtime.query_audit({"operation": "graph.activation.plan"})

    denied_ids = [item["id"] for item in response["data"]["policy_denials"]]
    assert response["valid"] is True
    assert denied_ids == scenario["expect"]["denied_node_ids"]
    assert response["data"]["policy_denials"][0]["text"] == "[REDACTED]"
    assert [diagnostic["code"] for diagnostic in response["diagnostics"]] == ["activation_policy_denied"]
    assert audit["count"] == 1
|
||||
|
||||
|
||||
def test_profile_lifecycle_rules_scenario_emits_expected_actions() -> None:
    """Plan a profile-driven lifecycle and check the dry-run actions.

    Uses the ``profile-lifecycle-rules`` fixture scenario with a fixed ``now``
    of 2026-05-18 UTC and a refresh digest for ``life.decision``. Every
    ``(target_id, action)`` pair listed in the fixture's expectations must be
    present among the dry-run actions, and the first ``compact`` action must
    name the expected source node.
    """
    scenario = _scenarios()["profile-lifecycle-rules"]
    runtime = PhaseMemoryRuntime()

    response = runtime.plan_lifecycle_with_profile(
        scenario["profile"],
        scenario["graph"],
        refresh_digests={"life.decision": "decision-new"},
        now=datetime(2026, 5, 18, tzinfo=timezone.utc),
    )

    actions = [(action["target_id"], action["action"]) for action in response["data"]["dry_run_actions"]]
    compact_actions = [action for action in response["data"]["dry_run_actions"] if action["action"] == "compact"]
    assert response["valid"] is True
    for expected in scenario["expect"]["actions"]:
        # Fixture pairs are JSON lists; convert to tuples to match ``actions``.
        assert tuple(expected) in actions
    # Fail with a readable assertion instead of an IndexError when the plan
    # contains no compact action at all.
    assert compact_actions, "expected at least one compact action in dry_run_actions"
    assert compact_actions[0]["metadata"]["source_node_ids"] == [scenario["expect"]["compact_source"]]
|
||||
|
||||
|
||||
def test_budget_path_and_semantic_hint_scenario_meets_quality_thresholds() -> None:
    """Check budgeted selection, path events, semantic ranking, and coverage.

    Driven by the ``budget-path-and-semantic-hints`` fixture scenario: the
    activation budget comes from the scenario profile and the expected node
    ids are passed as priority nodes. The test compares the selected and
    omitted node ids, the selected path events, and the top semantic hit with
    the fixture, and requires both coverage figures in the quality report to
    equal 1.0.
    """
    scenario = _scenarios()["budget-path-and-semantic-hints"]
    expected = scenario["expect"]
    budget = scenario["profile"]["activation"]
    wanted_ids = tuple(expected["selected_node_ids"])

    graph = graph_from_markitect(scenario["graph"]).value
    runtime = PhaseMemoryRuntime()
    index = InMemorySemanticIndex()
    index.upsert_nodes(list(graph.nodes))

    response = runtime.plan_activation(
        scenario["graph"],
        max_items=budget["max_items"],
        max_tokens=budget["max_tokens"],
        profile_id=scenario["profile"]["id"],
        priority_node_ids=wanted_ids,
    )
    event_path = MemoryPath.from_mapping(scenario["path"])
    path_events = select_event_path(graph.events, event_path, max_events=2)
    hits = index.query(graph_id=graph.graph_id, query="semantic restart", limit=2)
    quality = activation_quality_report(_activation_plan(response), expected_node_ids=wanted_ids)

    plan = response["data"]["activation_plan"]
    assert plan["selected_node_ids"] == expected["selected_node_ids"]
    omitted_ids = [item["id"] for item in plan["omitted"]]
    assert omitted_ids == expected["omitted_node_ids"]
    assert path_events == tuple(expected["event_ids"])
    assert hits[0]["id"] == expected["semantic_top_id"]
    assert quality["source_span_coverage"] == 1.0
    assert quality["explanation_coverage"] == 1.0
|
||||
|
||||
|
||||
def _activation_plan(response):
    """Rebuild an ``ActivationPlan`` from a plan-activation response mapping.

    Scalar fields are copied from ``response["data"]["activation_plan"]`` and
    sequence fields are converted to tuples. ``selection`` comes from
    ``response["data"]["package_request"]`` and ``diagnostics`` is always
    empty.
    """
    payload = response["data"]
    plan = payload["activation_plan"]

    kwargs = {}
    for name in ("plan_id", "graph_id", "token_estimate", "max_items", "max_tokens"):
        kwargs[name] = plan[name]
    for name in ("selected_node_ids", "selected_event_ids", "omitted"):
        kwargs[name] = tuple(plan[name])
    kwargs["selection"] = payload["package_request"]["selection"]
    kwargs["diagnostics"] = ()
    return ActivationPlan(**kwargs)
|
||||
@@ -1,7 +1,14 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from phase_memory.external_adapters import fake_external_adapter_pack, fake_external_runtime_config
|
||||
from phase_memory.external_adapters import (
|
||||
ADAPTER_PACK_MANIFEST_SCHEMA,
|
||||
ExternalAdapterPack,
|
||||
adapter_pack_manifest,
|
||||
fake_external_adapter_pack,
|
||||
fake_external_runtime_config,
|
||||
validate_adapter_pack_manifest,
|
||||
)
|
||||
from phase_memory.service import (
|
||||
assert_audit_sink_conformance,
|
||||
assert_context_compiler_conformance,
|
||||
@@ -37,6 +44,35 @@ def test_fake_external_adapter_pack_satisfies_public_conformance_helpers() -> No
|
||||
assert pack.to_dict()["adapters"]["package_compiler"] == "FakeMarkitectPackageCompiler"
|
||||
|
||||
|
||||
def test_fake_external_adapter_pack_manifest_declares_compatibility() -> None:
    """The fake pack's manifest declares its schema, conformance, and capabilities.

    Also asserts that validating the unmodified fake pack yields an empty
    tuple of diagnostics.
    """
    fake_pack = fake_external_adapter_pack()

    manifest = adapter_pack_manifest(fake_pack)
    findings = validate_adapter_pack_manifest(fake_pack)

    assert manifest["schema_version"] == ADAPTER_PACK_MANIFEST_SCHEMA
    compiler_entry = manifest["adapters"]["package_compiler"]
    assert compiler_entry["required_conformance"] == "assert_context_compiler_conformance"
    audit_entry = manifest["adapters"]["audit_sink"]
    assert audit_entry["required_capabilities"] == ["telemetry.audit.fake"]
    assert findings == ()
|
||||
|
||||
|
||||
def test_adapter_pack_manifest_reports_missing_capabilities() -> None:
    """Validation flags a pack that lacks a capability.

    Builds a copy of the fake pack with ``telemetry.audit.fake`` removed from
    its capabilities and expects a single ``missing_adapter_capability``
    diagnostic whose metadata names that capability.
    """
    base = fake_external_adapter_pack()
    remaining = tuple(
        capability for capability in base.capabilities if capability != "telemetry.audit.fake"
    )
    incomplete = ExternalAdapterPack(
        name=base.name,
        adapters=base.adapters,
        capabilities=remaining,
        ownership_boundaries=base.ownership_boundaries,
        required_conformance=base.required_conformance,
        metadata=base.metadata,
    )

    findings = validate_adapter_pack_manifest(incomplete)

    codes = [finding.code for finding in findings]
    assert codes == ["missing_adapter_capability"]
    assert findings[0].metadata["capability"] == "telemetry.audit.fake"
|
||||
|
||||
|
||||
def test_external_runtime_config_resolves_supplied_fake_pack() -> None:
|
||||
config = fake_external_runtime_config()
|
||||
pack = fake_external_adapter_pack()
|
||||
|
||||
@@ -87,6 +87,44 @@ def test_repair_diagnostics_report_missing_edges_and_orphaned_path_events(tmp_pa
|
||||
assert [diagnostic["code"] for diagnostic in envelope["diagnostics"]] == ["missing_edge_target", "orphaned_path_event"]
|
||||
|
||||
|
||||
def test_file_backed_store_reports_migration_needs_and_uses_atomic_json_writes(tmp_path) -> None:
    """A v0 store metadata file yields migration diagnostics and no ``*.tmp`` files.

    Writes a ``phase-memory.json`` declaring the ``v0`` schema and a planned
    migration, saves one node through the store, then runs repair diagnostics.
    The envelope must be valid, contain both migration diagnostic codes, and
    leave no ``*.tmp`` files anywhere under ``tmp_path``.
    """
    store = FileBackedMemoryGraphStore(tmp_path)
    legacy_metadata = {
        "schema_version": "phase_memory.local_store.v0",
        "planned_migrations": ["v0-to-v1"],
    }
    (tmp_path / "phase-memory.json").write_text(json.dumps(legacy_metadata), encoding="utf-8")

    store.save_node(MemoryNode("node.atomic", "decision", "Atomic write target"))
    event_log = JsonlMemoryEventLog(tmp_path / "events.jsonl")
    runtime = PhaseMemoryRuntime(graph_store=store, event_log=event_log)

    envelope = runtime.repair_diagnostics(source_ref=str(tmp_path))

    codes = [diagnostic["code"] for diagnostic in envelope["diagnostics"]]
    assert envelope["valid"] is True
    assert "store_migration_required" in codes
    assert "planned_store_migrations" in codes
    leftover_temp_files = list(tmp_path.rglob("*.tmp"))
    assert not leftover_temp_files
|
||||
|
||||
|
||||
def test_repair_diagnostics_distinguish_corrupt_store_records(tmp_path) -> None:
    """An unparseable node file is reported as a corrupt node record.

    Writes invalid JSON to ``nodes/broken.json`` and expects an invalid
    envelope whose first diagnostic is ``corrupt_store_record`` with
    ``record_type`` set to ``node``.
    """
    store = FileBackedMemoryGraphStore(tmp_path)
    event_log = JsonlMemoryEventLog(tmp_path / "events.jsonl")
    runtime = PhaseMemoryRuntime(graph_store=store, event_log=event_log)

    broken_record = tmp_path / "nodes" / "broken.json"
    broken_record.write_text("{not-json}\n", encoding="utf-8")

    envelope = runtime.repair_diagnostics(source_ref=str(tmp_path))

    assert envelope["valid"] is False
    first = envelope["diagnostics"][0]
    assert first["code"] == "corrupt_store_record"
    assert envelope["diagnostics"][0]["metadata"]["record_type"] == "node"
|
||||
|
||||
|
||||
def test_lifecycle_apply_requires_approval_for_reviewable_actions(tmp_path) -> None:
|
||||
store = FileBackedMemoryGraphStore(tmp_path)
|
||||
runtime = PhaseMemoryRuntime(graph_store=store, event_log=JsonlMemoryEventLog(tmp_path / "events.jsonl"))
|
||||
|
||||
@@ -1,7 +1,8 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
from phase_memory.models import LifecycleState, MemoryNode
|
||||
from phase_memory.lifecycle import plan_compaction
|
||||
from phase_memory.models import LifecycleAction, LifecycleActionKind, LifecycleState, MemoryNode
|
||||
from phase_memory.service import (
|
||||
HEALTH_REPORT_SCHEMA,
|
||||
KONTEXTUAL_DELEGATION_SCHEMA,
|
||||
@@ -76,6 +77,58 @@ def test_service_runner_handles_profile_driven_lifecycle_plan() -> None:
|
||||
assert ("event.restart", "refresh") in actions
|
||||
|
||||
|
||||
def test_service_runner_handles_package_compile_and_audit_query() -> None:
    """The runner compiles a selection and exposes the matching audit record.

    Sends ``package.compile`` with a selection mapping, then queries the audit
    log filtered to that operation. The test asserts on the package reference,
    the audit count, the first event's source ref, and the retention mode.
    """
    runner = LocalServiceRunner()
    selection = {
        "schema_version": "markitect.memory.selection.v1",
        "id": "selection.service",
        "nodes": ["decision.boundary"],
        "events": ["event.activation"],
    }
    compile_request = {"selection": selection, "source_ref": "service-test"}
    audit_request = {"filters": {"operation": "package.compile"}}

    compiled = runner.handle("package.compile", compile_request)
    audit = runner.handle("audit.query", audit_request)

    assert compiled["operation"] == "package.compile"
    assert compiled["data"]["package_response"]["package_ref"] == "package:selection.service"
    assert audit["operation"] == "audit.query"
    assert audit["count"] == 1
    assert audit["events"][0]["source"]["ref"] == "service-test"
    assert audit["retention"]["mode"] == "in_memory"
|
||||
|
||||
|
||||
def test_service_runner_handles_review_gated_lifecycle_apply() -> None:
    """A compaction action is denied without an approval marker and applied with one.

    The same compaction action is submitted twice: first bare, then with
    ``approval_marker``. The first must be denied with ``review_required``.
    The second must be valid, and the node stored under the applied target id
    must have kind ``summary``. The audit query for non-dry-run
    ``lifecycle.apply`` must count two records.
    """
    runner = LocalServiceRunner()
    saved = runner.runtime.graph_store.save_node(MemoryNode("node.review", "episode", "Review gated content"))
    compact = plan_compaction([saved]).to_dict()

    denied = runner.handle("lifecycle.apply", {"actions": [compact]})
    applied = runner.handle(
        "lifecycle.apply",
        {"actions": [compact], "approval_marker": "review:service"},
    )
    audit = runner.handle(
        "audit.query",
        {"filters": {"operation": "lifecycle.apply", "dry_run": False}},
    )

    assert denied["valid"] is False
    assert denied["data"]["denied"][0]["reason"] == "review_required"
    assert applied["valid"] is True
    applied_target = applied["data"]["applied"][0]["target_id"]
    assert runner.runtime.graph_store.get_node(applied_target).kind == "summary"
    assert audit["count"] == 2
|
||||
|
||||
|
||||
def test_service_runner_handles_non_review_lifecycle_apply() -> None:
    """A mark-stale action is applied without any approval marker.

    After the apply call the stored node's lifecycle must be
    ``LifecycleState.STALE``.
    """
    runner = LocalServiceRunner()
    runner.runtime.graph_store.save_node(MemoryNode("node.stale.service", "episode"))
    mark_stale = LifecycleAction(
        LifecycleActionKind.MARK_STALE,
        "node.stale.service",
        from_state=LifecycleState.ACTIVE,
        to_state=LifecycleState.STALE,
    )
    request = {"actions": [mark_stale.to_dict()]}

    applied = runner.handle("lifecycle.apply", request)

    assert applied["valid"] is True
    stored = runner.runtime.graph_store.get_node("node.stale.service")
    assert stored.lifecycle == LifecycleState.STALE
|
||||
|
||||
|
||||
def test_profile_driven_runtime_config_resolves_file_backed_adapters(tmp_path) -> None:
|
||||
config = RuntimeConfig.from_profile(
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user