Implement refinement hardening workplan

2026-05-18 23:56:41 +02:00
parent 836acf7e01
commit 0eea94d05e
17 changed files with 1164 additions and 68 deletions
--- a/tests/fixtures/evaluation-scenarios.json
+++ b/tests/fixtures/evaluation-scenarios.json
@@ -0,0 +1,170 @@
+{
+  "schema_version": "phase_memory.evaluation_scenarios.v1",
+  "scenarios": [
+    {
+      "id": "policy-denied-activation",
+      "profile": {
+        "schema_version": "markitect.memory.profile.v1",
+        "id": "eval-policy-profile",
+        "memory_kinds": ["knowledge", "decision"],
+        "activation": {"max_items": 4, "max_tokens": 60},
+        "policy": {"mode": "allow-all", "trust_zone_labels": ["local"]},
+        "observability": {"audit_sink": "recording"}
+      },
+      "graph": {
+        "schema_version": "markitect.memory.graph.v1",
+        "id": "eval-policy-graph",
+        "nodes": [
+          {
+            "id": "policy.public",
+            "kind": "knowledge",
+            "text": "Public operating constraint that can be activated for local planning.",
+            "phase": "stabilized",
+            "policy": {"labels": ["public"], "trust_zone": "local"},
+            "source_spans": [{"path": "policy.md", "line_start": 1}],
+            "metadata": {"graph_id": "eval-policy-graph"}
+          },
+          {
+            "id": "policy.secret",
+            "kind": "knowledge",
+            "text": "Sensitive credential note that must not enter restart context.",
+            "phase": "stabilized",
+            "policy": {"labels": ["restricted"], "trust_zone": "local", "secret": true},
+            "metadata": {"graph_id": "eval-policy-graph"}
+          }
+        ],
+        "edges": [
+          {
+            "id": "edge.policy",
+            "kind": "references",
+            "source": "policy.public",
+            "target": "policy.secret"
+          }
+        ],
+        "events": []
+      },
+      "expect": {"denied_node_ids": ["policy.secret"]}
+    },
+    {
+      "id": "profile-lifecycle-rules",
+      "profile": {
+        "schema_version": "markitect.memory.profile.v1",
+        "id": "eval-lifecycle-profile",
+        "memory_kinds": ["episode", "decision"],
+        "retention": {
+          "episode": {"stale_after_days": 7},
+          "decision": {"delete_after_days": 365}
+        },
+        "refresh": {"mode": "enabled"},
+        "compaction": {"node_ids": ["life.old-episode"]},
+        "metadata": {
+          "phase_transitions": [
+            {
+              "node_kind": "decision",
+              "from_phase": "fluid",
+              "to_phase": "stabilized",
+              "min_age_days": 2,
+              "reason": "decision has stabilized"
+            }
+          ]
+        }
+      },
+      "graph": {
+        "schema_version": "markitect.memory.graph.v1",
+        "id": "eval-lifecycle-graph",
+        "nodes": [
+          {
+            "id": "life.old-episode",
+            "kind": "episode",
+            "text": "An old episode ready to become stale and compacted.",
+            "phase": "fluid",
+            "freshness": {"updated_at": "2026-04-01T00:00:00+00:00", "source_digest": "old"},
+            "metadata": {"graph_id": "eval-lifecycle-graph"}
+          },
+          {
+            "id": "life.decision",
+            "kind": "decision",
+            "text": "A decision that should transition to stabilized after review.",
+            "phase": "fluid",
+            "freshness": {"updated_at": "2026-05-01T00:00:00+00:00", "source_digest": "decision-old"},
+            "metadata": {"graph_id": "eval-lifecycle-graph"}
+          }
+        ],
+        "edges": [],
+        "events": []
+      },
+      "expect": {
+        "actions": [
+          ["life.old-episode", "mark_stale"],
+          ["life.decision", "transition_phase"],
+          ["life.decision", "refresh"]
+        ],
+        "compact_source": "life.old-episode"
+      }
+    },
+    {
+      "id": "budget-path-and-semantic-hints",
+      "profile": {
+        "schema_version": "markitect.memory.profile.v1",
+        "id": "eval-budget-profile",
+        "memory_kinds": ["decision", "knowledge", "episode"],
+        "activation": {"max_items": 2, "max_tokens": 16, "semantic_index": "memory"}
+      },
+      "graph": {
+        "schema_version": "markitect.memory.graph.v1",
+        "id": "eval-budget-graph",
+        "nodes": [
+          {
+            "id": "budget.anchor",
+            "kind": "decision",
+            "text": "Restart anchor with source.",
+            "phase": "stabilized",
+            "source_spans": [{"path": "restart.md", "line_start": 3}],
+            "metadata": {"graph_id": "eval-budget-graph"}
+          },
+          {
+            "id": "budget.semantic",
+            "kind": "knowledge",
+            "text": "Semantic index hint for restart package selection.",
+            "phase": "stabilized",
+            "source_spans": [{"path": "retrieval.md", "line_start": 7}],
+            "metadata": {"graph_id": "eval-budget-graph"}
+          },
+          {
+            "id": "budget.long",
+            "kind": "episode",
+            "text": "This verbose episode is intentionally long enough to lose against the strict activation token budget pressure.",
+            "phase": "fluid",
+            "metadata": {"graph_id": "eval-budget-graph"}
+          }
+        ],
+        "edges": [
+          {
+            "id": "edge.budget",
+            "kind": "supports",
+            "source": "budget.anchor",
+            "target": "budget.semantic"
+          }
+        ],
+        "events": [
+          {
+            "id": "budget.path-event",
+            "kind": "activated",
+            "timestamp": "2026-05-18T00:00:00+00:00",
+            "activation_refs": ["activation.budget"]
+          }
+        ]
+      },
+      "path": {
+        "id": "path.budget",
+        "event_ids": ["budget.path-event"]
+      },
+      "expect": {
+        "selected_node_ids": ["budget.anchor", "budget.semantic"],
+        "omitted_node_ids": ["budget.long"],
+        "semantic_top_id": "budget.semantic",
+        "event_ids": ["budget.path-event"]
+      }
+    }
+  ]
+}
--- a/tests/test_evaluation_scenarios.py
+++ b/tests/test_evaluation_scenarios.py
@@ -0,0 +1,101 @@
+import json
+from datetime import datetime, timezone
+from pathlib import Path
+
+from phase_memory.adapters import InMemorySemanticIndex
+from phase_memory.contracts import graph_from_markitect
+from phase_memory.models import ActivationPlan, MemoryPath
+from phase_memory.retrieval import activation_quality_report, select_event_path
+from phase_memory.runtime import PhaseMemoryRuntime
+
+
+FIXTURES = Path(__file__).parent / "fixtures"
+
+
+def _scenarios():
+    """Load ``evaluation-scenarios.json`` from the fixtures directory, keyed by scenario ``id``."""
+    fixture_path = FIXTURES / "evaluation-scenarios.json"
+    with fixture_path.open(encoding="utf-8") as handle:
+        document = json.load(handle)
+    return {entry["id"]: entry for entry in document["scenarios"]}
+
+
+def test_policy_denied_activation_scenario_is_redacted_and_audited() -> None:
+    """Plan activation for the policy scenario with the ``restricted`` label denied.
+
+    Checks that the denied node ids match the fixture expectation, that the first
+    denial's text is ``[REDACTED]``, that the only diagnostic code is
+    ``activation_policy_denied``, and that the audit query for
+    ``graph.activation.plan`` reports a count of 1.
+    """
+    scenario = _scenarios()["policy-denied-activation"]
+    profile = scenario["profile"]
+    # Read the budget from the fixture profile instead of repeating its numbers
+    # here, so the fixture and this test cannot drift apart.
+    budget = profile["activation"]
+    runtime = PhaseMemoryRuntime()
+
+    response = runtime.plan_activation(
+        scenario["graph"],
+        max_items=budget["max_items"],
+        max_tokens=budget["max_tokens"],
+        profile_id=profile["id"],
+        policy_context={"denied_labels": ["restricted"], "secrets_allowed": False, "trust_zone": "local"},
+    )
+    audit = runtime.query_audit({"operation": "graph.activation.plan"})
+
+    denied_ids = [item["id"] for item in response["data"]["policy_denials"]]
+    assert response["valid"] is True
+    assert denied_ids == scenario["expect"]["denied_node_ids"]
+    assert response["data"]["policy_denials"][0]["text"] == "[REDACTED]"
+    assert [diagnostic["code"] for diagnostic in response["diagnostics"]] == ["activation_policy_denied"]
+    assert audit["count"] == 1
+
+
+def test_profile_lifecycle_rules_scenario_emits_expected_actions() -> None:
+    """Plan lifecycle actions for the lifecycle scenario at a fixed ``now``.
+
+    Every ``(target_id, action)`` pair listed in the fixture must appear among the
+    dry-run actions, and the first ``compact`` action must name the expected
+    source node.
+    """
+    scenario = _scenarios()["profile-lifecycle-rules"]
+    expectation = scenario["expect"]
+    fixed_now = datetime(2026, 5, 18, tzinfo=timezone.utc)
+    runtime = PhaseMemoryRuntime()
+
+    response = runtime.plan_lifecycle_with_profile(
+        scenario["profile"],
+        scenario["graph"],
+        refresh_digests={"life.decision": "decision-new"},
+        now=fixed_now,
+    )
+
+    dry_run = response["data"]["dry_run_actions"]
+    planned_pairs = [(entry["target_id"], entry["action"]) for entry in dry_run]
+    compactions = [entry for entry in dry_run if entry["action"] == "compact"]
+    assert response["valid"] is True
+    for wanted in expectation["actions"]:
+        # Fixture pairs are JSON lists; compare them as tuples.
+        assert tuple(wanted) in planned_pairs
+    assert compactions[0]["metadata"]["source_node_ids"] == [expectation["compact_source"]]
+
+
+def test_budget_path_and_semantic_hint_scenario_meets_quality_thresholds() -> None:
+    """Run the budget scenario through activation planning, path selection and a semantic query.
+
+    Asserts the selected and omitted node ids, the selected path events, the top
+    semantic hit, and that both ``source_span_coverage`` and
+    ``explanation_coverage`` in the quality report equal 1.0.
+    """
+    scenario = _scenarios()["budget-path-and-semantic-hints"]
+    expectation = scenario["expect"]
+    budget = scenario["profile"]["activation"]
+    wanted_ids = tuple(expectation["selected_node_ids"])
+    graph = graph_from_markitect(scenario["graph"]).value
+    runtime = PhaseMemoryRuntime()
+    index = InMemorySemanticIndex()
+
+    index.upsert_nodes(list(graph.nodes))
+    response = runtime.plan_activation(
+        scenario["graph"],
+        max_items=budget["max_items"],
+        max_tokens=budget["max_tokens"],
+        profile_id=scenario["profile"]["id"],
+        priority_node_ids=wanted_ids,
+    )
+    event_path = MemoryPath.from_mapping(scenario["path"])
+    path_events = select_event_path(graph.events, event_path, max_events=2)
+    hits = index.query(graph_id=graph.graph_id, query="semantic restart", limit=2)
+    quality = activation_quality_report(_activation_plan(response), expected_node_ids=wanted_ids)
+
+    activation = response["data"]["activation_plan"]
+    omitted_ids = [item["id"] for item in activation["omitted"]]
+    assert activation["selected_node_ids"] == expectation["selected_node_ids"]
+    assert omitted_ids == expectation["omitted_node_ids"]
+    assert path_events == tuple(expectation["event_ids"])
+    assert hits[0]["id"] == expectation["semantic_top_id"]
+    assert quality["source_span_coverage"] == 1.0
+    assert quality["explanation_coverage"] == 1.0
+
+
+def _activation_plan(response):
+    """Rebuild an ``ActivationPlan`` from the ``activation_plan`` mapping of a runtime response.
+
+    Sequence fields are converted to tuples, ``selection`` is taken from the
+    response's ``package_request``, and ``diagnostics`` is passed as an empty tuple.
+    """
+    payload = response["data"]
+    plan = payload["activation_plan"]
+    copied_as_is = ("plan_id", "graph_id", "token_estimate", "max_items", "max_tokens")
+    copied_as_tuple = ("selected_node_ids", "selected_event_ids", "omitted")
+    return ActivationPlan(
+        **{name: plan[name] for name in copied_as_is},
+        **{name: tuple(plan[name]) for name in copied_as_tuple},
+        selection=payload["package_request"]["selection"],
+        diagnostics=(),
+    )
--- a/tests/test_external_adapter_packs.py
+++ b/tests/test_external_adapter_packs.py
@@ -1,7 +1,14 @@
 import json
 from pathlib import Path

-from phase_memory.external_adapters import fake_external_adapter_pack, fake_external_runtime_config
+from phase_memory.external_adapters import (
+    ADAPTER_PACK_MANIFEST_SCHEMA,
+    ExternalAdapterPack,
+    adapter_pack_manifest,
+    fake_external_adapter_pack,
+    fake_external_runtime_config,
+    validate_adapter_pack_manifest,
+)
 from phase_memory.service import (
    assert_audit_sink_conformance,
    assert_context_compiler_conformance,
@@ -37,6 +44,35 @@ def test_fake_external_adapter_pack_satisfies_public_conformance_helpers() -> No
    assert pack.to_dict()["adapters"]["package_compiler"] == "FakeMarkitectPackageCompiler"


+def test_fake_external_adapter_pack_manifest_declares_compatibility() -> None:
+    """The fake pack's manifest carries the manifest schema and adapter requirements, and validates cleanly."""
+    fake_pack = fake_external_adapter_pack()
+
+    manifest = adapter_pack_manifest(fake_pack)
+    findings = validate_adapter_pack_manifest(fake_pack)
+
+    assert manifest["schema_version"] == ADAPTER_PACK_MANIFEST_SCHEMA
+    compiler_entry = manifest["adapters"]["package_compiler"]
+    assert compiler_entry["required_conformance"] == "assert_context_compiler_conformance"
+    audit_entry = manifest["adapters"]["audit_sink"]
+    assert audit_entry["required_capabilities"] == ["telemetry.audit.fake"]
+    assert findings == ()
+
+
+def test_adapter_pack_manifest_reports_missing_capabilities() -> None:
+    """A pack without ``telemetry.audit.fake`` yields one ``missing_adapter_capability`` diagnostic naming it."""
+    complete = fake_external_adapter_pack()
+    remaining = tuple(name for name in complete.capabilities if name != "telemetry.audit.fake")
+    incomplete = ExternalAdapterPack(
+        name=complete.name,
+        adapters=complete.adapters,
+        capabilities=remaining,
+        ownership_boundaries=complete.ownership_boundaries,
+        required_conformance=complete.required_conformance,
+        metadata=complete.metadata,
+    )
+
+    findings = validate_adapter_pack_manifest(incomplete)
+
+    reported_codes = [finding.code for finding in findings]
+    assert reported_codes == ["missing_adapter_capability"]
+    assert findings[0].metadata["capability"] == "telemetry.audit.fake"
+
+
 def test_external_runtime_config_resolves_supplied_fake_pack() -> None:
    config = fake_external_runtime_config()
    pack = fake_external_adapter_pack()
--- a/tests/test_file_backed_runtime.py
+++ b/tests/test_file_backed_runtime.py
@@ -87,6 +87,44 @@ def test_repair_diagnostics_report_missing_edges_and_orphaned_path_events(tmp_pa
    assert [diagnostic["code"] for diagnostic in envelope["diagnostics"]] == ["missing_edge_target", "orphaned_path_event"]


+def test_file_backed_store_reports_migration_needs_and_uses_atomic_json_writes(tmp_path) -> None:
+    """Repair diagnostics flag a ``v0`` metadata file with planned migrations; no ``*.tmp`` files remain.
+
+    The metadata file is overwritten after the store is constructed and before a
+    node is saved through it.
+    """
+    store = FileBackedMemoryGraphStore(tmp_path)
+    legacy_metadata = {
+        "schema_version": "phase_memory.local_store.v0",
+        "planned_migrations": ["v0-to-v1"],
+    }
+    (tmp_path / "phase-memory.json").write_text(json.dumps(legacy_metadata), encoding="utf-8")
+
+    store.save_node(MemoryNode("node.atomic", "decision", "Atomic write target"))
+    event_log = JsonlMemoryEventLog(tmp_path / "events.jsonl")
+    runtime = PhaseMemoryRuntime(graph_store=store, event_log=event_log)
+
+    envelope = runtime.repair_diagnostics(source_ref=str(tmp_path))
+
+    reported_codes = [entry["code"] for entry in envelope["diagnostics"]]
+    assert envelope["valid"] is True
+    assert "store_migration_required" in reported_codes
+    assert "planned_store_migrations" in reported_codes
+    # Any leftover temporary file anywhere under the store root fails the test.
+    assert not any(tmp_path.rglob("*.tmp"))
+
+
+def test_repair_diagnostics_distinguish_corrupt_store_records(tmp_path) -> None:
+    """A non-JSON file under ``nodes/`` makes repair diagnostics invalid with a ``corrupt_store_record`` first."""
+    store = FileBackedMemoryGraphStore(tmp_path)
+    event_log = JsonlMemoryEventLog(tmp_path / "events.jsonl")
+    runtime = PhaseMemoryRuntime(graph_store=store, event_log=event_log)
+
+    # NOTE(review): relies on the ``nodes`` directory already existing at this
+    # point, presumably created when the store is constructed - confirm.
+    broken_record = tmp_path / "nodes" / "broken.json"
+    broken_record.write_text("{not-json}\n", encoding="utf-8")
+
+    envelope = runtime.repair_diagnostics(source_ref=str(tmp_path))
+
+    assert envelope["valid"] is False
+    first_diagnostic = envelope["diagnostics"][0]
+    assert first_diagnostic["code"] == "corrupt_store_record"
+    assert first_diagnostic["metadata"]["record_type"] == "node"
+
+
 def test_lifecycle_apply_requires_approval_for_reviewable_actions(tmp_path) -> None:
    store = FileBackedMemoryGraphStore(tmp_path)
    runtime = PhaseMemoryRuntime(graph_store=store, event_log=JsonlMemoryEventLog(tmp_path / "events.jsonl"))
--- a/tests/test_service_readiness.py
+++ b/tests/test_service_readiness.py
@@ -1,7 +1,8 @@
 import json
 from pathlib import Path

-from phase_memory.models import LifecycleState, MemoryNode
+from phase_memory.lifecycle import plan_compaction
+from phase_memory.models import LifecycleAction, LifecycleActionKind, LifecycleState, MemoryNode
 from phase_memory.service import (
    HEALTH_REPORT_SCHEMA,
    KONTEXTUAL_DELEGATION_SCHEMA,
@@ -76,6 +77,58 @@ def test_service_runner_handles_profile_driven_lifecycle_plan() -> None:
    assert ("event.restart", "refresh") in actions


+def test_service_runner_handles_package_compile_and_audit_query() -> None:
+    """``package.compile`` returns a package ref for the selection and ``audit.query`` reports one compile event."""
+    runner = LocalServiceRunner()
+    selection = dict(
+        schema_version="markitect.memory.selection.v1",
+        id="selection.service",
+        nodes=["decision.boundary"],
+        events=["event.activation"],
+    )
+    compile_payload = {"selection": selection, "source_ref": "service-test"}
+    audit_payload = {"filters": {"operation": "package.compile"}}
+
+    compiled = runner.handle("package.compile", compile_payload)
+    audit = runner.handle("audit.query", audit_payload)
+
+    assert compiled["operation"] == "package.compile"
+    assert compiled["data"]["package_response"]["package_ref"] == "package:selection.service"
+    assert audit["operation"] == "audit.query"
+    assert audit["count"] == 1
+    # The audited event carries the ``source_ref`` passed to the compile call.
+    assert audit["events"][0]["source"]["ref"] == "service-test"
+    assert audit["retention"]["mode"] == "in_memory"
+
+
+def test_service_runner_handles_review_gated_lifecycle_apply() -> None:
+    """A compaction action is denied without an approval marker and applied with one.
+
+    The audit query for non-dry-run ``lifecycle.apply`` operations then reports a
+    count of 2.
+    """
+    runner = LocalServiceRunner()
+    saved = runner.runtime.graph_store.save_node(MemoryNode("node.review", "episode", "Review gated content"))
+    compaction = plan_compaction([saved]).to_dict()
+    audit_filters = {"operation": "lifecycle.apply", "dry_run": False}
+
+    denied = runner.handle("lifecycle.apply", {"actions": [compaction]})
+    applied = runner.handle("lifecycle.apply", {"actions": [compaction], "approval_marker": "review:service"})
+    audit = runner.handle("audit.query", {"filters": audit_filters})
+
+    assert denied["valid"] is False
+    assert denied["data"]["denied"][0]["reason"] == "review_required"
+    assert applied["valid"] is True
+    applied_target = applied["data"]["applied"][0]["target_id"]
+    assert runner.runtime.graph_store.get_node(applied_target).kind == "summary"
+    assert audit["count"] == 2
+
+
+def test_service_runner_handles_non_review_lifecycle_apply() -> None:
+    """A ``MARK_STALE`` action applies without an approval marker and leaves the stored node ``STALE``."""
+    node_id = "node.stale.service"
+    runner = LocalServiceRunner()
+    runner.runtime.graph_store.save_node(MemoryNode(node_id, "episode"))
+    mark_stale = LifecycleAction(
+        LifecycleActionKind.MARK_STALE,
+        node_id,
+        from_state=LifecycleState.ACTIVE,
+        to_state=LifecycleState.STALE,
+    )
+    payload = {"actions": [mark_stale.to_dict()]}
+
+    applied = runner.handle("lifecycle.apply", payload)
+
+    assert applied["valid"] is True
+    stored = runner.runtime.graph_store.get_node(node_id)
+    assert stored.lifecycle == LifecycleState.STALE
+
+
 def test_profile_driven_runtime_config_resolves_file_backed_adapters(tmp_path) -> None:
    config = RuntimeConfig.from_profile(
        {