Implement refinement hardening workplan

This commit is contained in:
2026-05-18 23:56:41 +02:00
parent 836acf7e01
commit 0eea94d05e
17 changed files with 1164 additions and 68 deletions

170
tests/fixtures/evaluation-scenarios.json vendored Normal file
View File

@@ -0,0 +1,170 @@
{
"schema_version": "phase_memory.evaluation_scenarios.v1",
"scenarios": [
{
"id": "policy-denied-activation",
"profile": {
"schema_version": "markitect.memory.profile.v1",
"id": "eval-policy-profile",
"memory_kinds": ["knowledge", "decision"],
"activation": {"max_items": 4, "max_tokens": 60},
"policy": {"mode": "allow-all", "trust_zone_labels": ["local"]},
"observability": {"audit_sink": "recording"}
},
"graph": {
"schema_version": "markitect.memory.graph.v1",
"id": "eval-policy-graph",
"nodes": [
{
"id": "policy.public",
"kind": "knowledge",
"text": "Public operating constraint that can be activated for local planning.",
"phase": "stabilized",
"policy": {"labels": ["public"], "trust_zone": "local"},
"source_spans": [{"path": "policy.md", "line_start": 1}],
"metadata": {"graph_id": "eval-policy-graph"}
},
{
"id": "policy.secret",
"kind": "knowledge",
"text": "Sensitive credential note that must not enter restart context.",
"phase": "stabilized",
"policy": {"labels": ["restricted"], "trust_zone": "local", "secret": true},
"metadata": {"graph_id": "eval-policy-graph"}
}
],
"edges": [
{
"id": "edge.policy",
"kind": "references",
"source": "policy.public",
"target": "policy.secret"
}
],
"events": []
},
"expect": {"denied_node_ids": ["policy.secret"]}
},
{
"id": "profile-lifecycle-rules",
"profile": {
"schema_version": "markitect.memory.profile.v1",
"id": "eval-lifecycle-profile",
"memory_kinds": ["episode", "decision"],
"retention": {
"episode": {"stale_after_days": 7},
"decision": {"delete_after_days": 365}
},
"refresh": {"mode": "enabled"},
"compaction": {"node_ids": ["life.old-episode"]},
"metadata": {
"phase_transitions": [
{
"node_kind": "decision",
"from_phase": "fluid",
"to_phase": "stabilized",
"min_age_days": 2,
"reason": "decision has stabilized"
}
]
}
},
"graph": {
"schema_version": "markitect.memory.graph.v1",
"id": "eval-lifecycle-graph",
"nodes": [
{
"id": "life.old-episode",
"kind": "episode",
"text": "An old episode ready to become stale and compacted.",
"phase": "fluid",
"freshness": {"updated_at": "2026-04-01T00:00:00+00:00", "source_digest": "old"},
"metadata": {"graph_id": "eval-lifecycle-graph"}
},
{
"id": "life.decision",
"kind": "decision",
"text": "A decision that should transition to stabilized after review.",
"phase": "fluid",
"freshness": {"updated_at": "2026-05-01T00:00:00+00:00", "source_digest": "decision-old"},
"metadata": {"graph_id": "eval-lifecycle-graph"}
}
],
"edges": [],
"events": []
},
"expect": {
"actions": [
["life.old-episode", "mark_stale"],
["life.decision", "transition_phase"],
["life.decision", "refresh"]
],
"compact_source": "life.old-episode"
}
},
{
"id": "budget-path-and-semantic-hints",
"profile": {
"schema_version": "markitect.memory.profile.v1",
"id": "eval-budget-profile",
"memory_kinds": ["decision", "knowledge", "episode"],
"activation": {"max_items": 2, "max_tokens": 16, "semantic_index": "memory"}
},
"graph": {
"schema_version": "markitect.memory.graph.v1",
"id": "eval-budget-graph",
"nodes": [
{
"id": "budget.anchor",
"kind": "decision",
"text": "Restart anchor with source.",
"phase": "stabilized",
"source_spans": [{"path": "restart.md", "line_start": 3}],
"metadata": {"graph_id": "eval-budget-graph"}
},
{
"id": "budget.semantic",
"kind": "knowledge",
"text": "Semantic index hint for restart package selection.",
"phase": "stabilized",
"source_spans": [{"path": "retrieval.md", "line_start": 7}],
"metadata": {"graph_id": "eval-budget-graph"}
},
{
"id": "budget.long",
"kind": "episode",
"text": "This verbose episode is intentionally long enough to lose against the strict activation token budget pressure.",
"phase": "fluid",
"metadata": {"graph_id": "eval-budget-graph"}
}
],
"edges": [
{
"id": "edge.budget",
"kind": "supports",
"source": "budget.anchor",
"target": "budget.semantic"
}
],
"events": [
{
"id": "budget.path-event",
"kind": "activated",
"timestamp": "2026-05-18T00:00:00+00:00",
"activation_refs": ["activation.budget"]
}
]
},
"path": {
"id": "path.budget",
"event_ids": ["budget.path-event"]
},
"expect": {
"selected_node_ids": ["budget.anchor", "budget.semantic"],
"omitted_node_ids": ["budget.long"],
"semantic_top_id": "budget.semantic",
"event_ids": ["budget.path-event"]
}
}
]
}

View File

@@ -0,0 +1,101 @@
import json
from datetime import datetime, timezone
from pathlib import Path
from phase_memory.adapters import InMemorySemanticIndex
from phase_memory.contracts import graph_from_markitect
from phase_memory.models import ActivationPlan, MemoryPath
from phase_memory.retrieval import activation_quality_report, select_event_path
from phase_memory.runtime import PhaseMemoryRuntime
# Directory holding the test fixtures, resolved relative to this test module.
FIXTURES = Path(__file__).parent / "fixtures"
def _scenarios():
    """Load ``evaluation-scenarios.json`` and return its scenarios keyed by ``id``."""
    fixture_path = FIXTURES / "evaluation-scenarios.json"
    payload = json.loads(fixture_path.read_text(encoding="utf-8"))
    indexed = {}
    for entry in payload["scenarios"]:
        # A later scenario with a duplicate id replaces the earlier one.
        indexed[entry["id"]] = entry
    return indexed
def test_policy_denied_activation_scenario_is_redacted_and_audited() -> None:
    """Plan activation for the policy-denied scenario and check the denial output.

    The activation is planned with a policy context that denies the
    ``restricted`` label and disallows secrets.  The test then asserts that the
    response is valid, that the denied node ids match the fixture expectation,
    that the first denial's text is ``[REDACTED]``, that the only diagnostic
    code is ``activation_policy_denied``, and that the audit query for
    ``graph.activation.plan`` reports a count of one.
    """
    scenario = _scenarios()["policy-denied-activation"]
    profile = scenario["profile"]
    # Take the budget from the fixture profile instead of repeating the numbers
    # here, so the scenario file stays the single source of truth (the budget
    # scenario test reads its limits the same way).
    budget = profile["activation"]
    runtime = PhaseMemoryRuntime()

    response = runtime.plan_activation(
        scenario["graph"],
        max_items=budget["max_items"],
        max_tokens=budget["max_tokens"],
        profile_id=profile["id"],
        policy_context={"denied_labels": ["restricted"], "secrets_allowed": False, "trust_zone": "local"},
    )
    audit = runtime.query_audit({"operation": "graph.activation.plan"})
    denied_ids = [item["id"] for item in response["data"]["policy_denials"]]

    assert response["valid"] is True
    assert denied_ids == scenario["expect"]["denied_node_ids"]
    assert response["data"]["policy_denials"][0]["text"] == "[REDACTED]"
    assert [diagnostic["code"] for diagnostic in response["diagnostics"]] == ["activation_policy_denied"]
    assert audit["count"] == 1
def test_profile_lifecycle_rules_scenario_emits_expected_actions() -> None:
    """Plan a profile-driven lifecycle run and check the dry-run actions.

    Every ``(target_id, action)`` pair listed in the fixture expectation must
    appear among the dry-run actions, and the first ``compact`` action must
    name the expected source node.
    """
    scenario = _scenarios()["profile-lifecycle-rules"]
    runtime = PhaseMemoryRuntime()
    # Fixed clock so the age-based rules in the fixture evaluate the same way on every run.
    frozen_now = datetime(2026, 5, 18, tzinfo=timezone.utc)

    response = runtime.plan_lifecycle_with_profile(
        scenario["profile"],
        scenario["graph"],
        refresh_digests={"life.decision": "decision-new"},
        now=frozen_now,
    )

    planned_pairs = []
    for entry in response["data"]["dry_run_actions"]:
        planned_pairs.append((entry["target_id"], entry["action"]))
    compactions = [entry for entry in response["data"]["dry_run_actions"] if entry["action"] == "compact"]

    assert response["valid"] is True
    for wanted in scenario["expect"]["actions"]:
        # Fixture pairs are JSON arrays (lists); compare them as tuples.
        assert tuple(wanted) in planned_pairs
    assert compactions[0]["metadata"]["source_node_ids"] == [scenario["expect"]["compact_source"]]
def test_budget_path_and_semantic_hint_scenario_meets_quality_thresholds() -> None:
    """Exercise budgeted activation, event-path selection and semantic lookup.

    Activation is planned with the item/token budget from the fixture profile,
    path events are selected for the fixture path, an in-memory semantic index
    is queried, and an activation quality report is built.  The results are
    compared with the fixture expectations; both coverage figures in the report
    must equal ``1.0``.
    """
    scenario = _scenarios()["budget-path-and-semantic-hints"]
    graph = graph_from_markitect(scenario["graph"]).value
    runtime = PhaseMemoryRuntime()
    index = InMemorySemanticIndex()
    index.upsert_nodes(list(graph.nodes))

    activation_budget = scenario["profile"]["activation"]
    response = runtime.plan_activation(
        scenario["graph"],
        max_items=activation_budget["max_items"],
        max_tokens=activation_budget["max_tokens"],
        profile_id=scenario["profile"]["id"],
        priority_node_ids=tuple(scenario["expect"]["selected_node_ids"]),
    )

    memory_path = MemoryPath.from_mapping(scenario["path"])
    path_event_ids = select_event_path(graph.events, memory_path, max_events=2)
    hits = index.query(graph_id=graph.graph_id, query="semantic restart", limit=2)
    quality = activation_quality_report(
        _activation_plan(response),
        expected_node_ids=tuple(scenario["expect"]["selected_node_ids"]),
    )
    planned = response["data"]["activation_plan"]

    assert planned["selected_node_ids"] == scenario["expect"]["selected_node_ids"]
    omitted_ids = [item["id"] for item in planned["omitted"]]
    assert omitted_ids == scenario["expect"]["omitted_node_ids"]
    assert path_event_ids == tuple(scenario["expect"]["event_ids"])
    assert hits[0]["id"] == scenario["expect"]["semantic_top_id"]
    assert quality["source_span_coverage"] == 1.0
    assert quality["explanation_coverage"] == 1.0
def _activation_plan(response):
    """Rebuild an ``ActivationPlan`` from a ``plan_activation`` response mapping.

    Sequence fields are converted to tuples, the selection is taken from the
    response's ``package_request``, and diagnostics are left empty.
    """
    payload = response["data"]
    serialized = payload["activation_plan"]
    selected_nodes = tuple(serialized["selected_node_ids"])
    selected_events = tuple(serialized["selected_event_ids"])
    omitted_items = tuple(serialized["omitted"])
    return ActivationPlan(
        plan_id=serialized["plan_id"],
        graph_id=serialized["graph_id"],
        selected_node_ids=selected_nodes,
        selected_event_ids=selected_events,
        omitted=omitted_items,
        token_estimate=serialized["token_estimate"],
        max_items=serialized["max_items"],
        max_tokens=serialized["max_tokens"],
        selection=payload["package_request"]["selection"],
        diagnostics=(),
    )

View File

@@ -1,7 +1,14 @@
import json
from pathlib import Path
from phase_memory.external_adapters import fake_external_adapter_pack, fake_external_runtime_config
from phase_memory.external_adapters import (
ADAPTER_PACK_MANIFEST_SCHEMA,
ExternalAdapterPack,
adapter_pack_manifest,
fake_external_adapter_pack,
fake_external_runtime_config,
validate_adapter_pack_manifest,
)
from phase_memory.service import (
assert_audit_sink_conformance,
assert_context_compiler_conformance,
@@ -37,6 +44,35 @@ def test_fake_external_adapter_pack_satisfies_public_conformance_helpers() -> No
assert pack.to_dict()["adapters"]["package_compiler"] == "FakeMarkitectPackageCompiler"
def test_fake_external_adapter_pack_manifest_declares_compatibility() -> None:
    """Check the manifest generated for the fake adapter pack.

    The manifest must carry the current manifest schema version, name the
    conformance helper for the package compiler, list the audit sink's required
    capability, and validate without diagnostics.
    """
    fake_pack = fake_external_adapter_pack()

    manifest = adapter_pack_manifest(fake_pack)
    problems = validate_adapter_pack_manifest(fake_pack)

    assert manifest["schema_version"] == ADAPTER_PACK_MANIFEST_SCHEMA
    compiler_entry = manifest["adapters"]["package_compiler"]
    assert compiler_entry["required_conformance"] == "assert_context_compiler_conformance"
    audit_entry = manifest["adapters"]["audit_sink"]
    assert audit_entry["required_capabilities"] == ["telemetry.audit.fake"]
    assert problems == ()
def test_adapter_pack_manifest_reports_missing_capabilities() -> None:
    """Validate a pack lacking ``telemetry.audit.fake`` and expect one diagnostic.

    The pack is a copy of the fake pack with that single capability removed;
    validation must report exactly one ``missing_adapter_capability``
    diagnostic whose metadata names the removed capability.
    """
    reference = fake_external_adapter_pack()
    remaining_capabilities = tuple(
        name for name in reference.capabilities if name != "telemetry.audit.fake"
    )
    stripped_pack = ExternalAdapterPack(
        name=reference.name,
        adapters=reference.adapters,
        capabilities=remaining_capabilities,
        ownership_boundaries=reference.ownership_boundaries,
        required_conformance=reference.required_conformance,
        metadata=reference.metadata,
    )

    problems = validate_adapter_pack_manifest(stripped_pack)

    reported_codes = [problem.code for problem in problems]
    assert reported_codes == ["missing_adapter_capability"]
    assert problems[0].metadata["capability"] == "telemetry.audit.fake"
def test_external_runtime_config_resolves_supplied_fake_pack() -> None:
config = fake_external_runtime_config()
pack = fake_external_adapter_pack()

View File

@@ -87,6 +87,44 @@ def test_repair_diagnostics_report_missing_edges_and_orphaned_path_events(tmp_pa
assert [diagnostic["code"] for diagnostic in envelope["diagnostics"]] == ["missing_edge_target", "orphaned_path_event"]
def test_file_backed_store_reports_migration_needs_and_uses_atomic_json_writes(tmp_path) -> None:
    """Check migration diagnostics for a v0 store and that no ``*.tmp`` files remain.

    A ``phase-memory.json`` declaring the ``v0`` store schema and one planned
    migration is written into the store directory, a node is saved, and repair
    diagnostics are requested.  The envelope must stay valid while reporting
    both migration diagnostic codes.
    """
    store = FileBackedMemoryGraphStore(tmp_path)
    legacy_metadata = {
        "schema_version": "phase_memory.local_store.v0",
        "planned_migrations": ["v0-to-v1"],
    }
    (tmp_path / "phase-memory.json").write_text(json.dumps(legacy_metadata), encoding="utf-8")
    store.save_node(MemoryNode("node.atomic", "decision", "Atomic write target"))
    event_log = JsonlMemoryEventLog(tmp_path / "events.jsonl")
    runtime = PhaseMemoryRuntime(graph_store=store, event_log=event_log)

    envelope = runtime.repair_diagnostics(source_ref=str(tmp_path))

    codes = [entry["code"] for entry in envelope["diagnostics"]]
    assert envelope["valid"] is True
    assert "store_migration_required" in codes
    assert "planned_store_migrations" in codes
    # NOTE(review): presumably a leftover *.tmp file would indicate an
    # unfinished temp-file write; confirm against the store implementation.
    leftover_temp_files = list(tmp_path.rglob("*.tmp"))
    assert not leftover_temp_files
def test_repair_diagnostics_distinguish_corrupt_store_records(tmp_path) -> None:
    """Write an unparsable node file and expect a ``corrupt_store_record`` diagnostic.

    The envelope must be invalid, and its first diagnostic must identify the
    broken record as a ``node`` record.
    """
    store = FileBackedMemoryGraphStore(tmp_path)
    event_log = JsonlMemoryEventLog(tmp_path / "events.jsonl")
    runtime = PhaseMemoryRuntime(graph_store=store, event_log=event_log)
    broken_record = tmp_path / "nodes" / "broken.json"
    broken_record.write_text("{not-json}\n", encoding="utf-8")

    envelope = runtime.repair_diagnostics(source_ref=str(tmp_path))

    assert envelope["valid"] is False
    assert envelope["diagnostics"][0]["code"] == "corrupt_store_record"
    assert envelope["diagnostics"][0]["metadata"]["record_type"] == "node"
def test_lifecycle_apply_requires_approval_for_reviewable_actions(tmp_path) -> None:
store = FileBackedMemoryGraphStore(tmp_path)
runtime = PhaseMemoryRuntime(graph_store=store, event_log=JsonlMemoryEventLog(tmp_path / "events.jsonl"))

View File

@@ -1,7 +1,8 @@
import json
from pathlib import Path
from phase_memory.models import LifecycleState, MemoryNode
from phase_memory.lifecycle import plan_compaction
from phase_memory.models import LifecycleAction, LifecycleActionKind, LifecycleState, MemoryNode
from phase_memory.service import (
HEALTH_REPORT_SCHEMA,
KONTEXTUAL_DELEGATION_SCHEMA,
@@ -76,6 +77,58 @@ def test_service_runner_handles_profile_driven_lifecycle_plan() -> None:
assert ("event.restart", "refresh") in actions
def test_service_runner_handles_package_compile_and_audit_query() -> None:
    """Compile a package through the service runner, then query its audit trail.

    The compile response must echo the operation and reference the selection
    id; the audit query must return one event carrying the supplied source ref
    and report ``in_memory`` retention.
    """
    runner = LocalServiceRunner()
    selection = {
        "schema_version": "markitect.memory.selection.v1",
        "id": "selection.service",
        "nodes": ["decision.boundary"],
        "events": ["event.activation"],
    }
    compile_request = {"selection": selection, "source_ref": "service-test"}
    audit_request = {"filters": {"operation": "package.compile"}}

    compiled = runner.handle("package.compile", compile_request)
    audit = runner.handle("audit.query", audit_request)

    assert compiled["operation"] == "package.compile"
    assert compiled["data"]["package_response"]["package_ref"] == "package:selection.service"
    assert audit["operation"] == "audit.query"
    assert audit["count"] == 1
    assert audit["events"][0]["source"]["ref"] == "service-test"
    assert audit["retention"]["mode"] == "in_memory"
def test_service_runner_handles_review_gated_lifecycle_apply() -> None:
    """Apply a compaction action without and then with an approval marker.

    Without the marker the request must be denied with ``review_required``;
    with it the request must succeed and the applied target must be stored as
    a ``summary`` node.  The audit query for non-dry-run ``lifecycle.apply``
    operations must then report a count of two.
    """
    runner = LocalServiceRunner()
    graph_store = runner.runtime.graph_store
    saved = graph_store.save_node(MemoryNode("node.review", "episode", "Review gated content"))
    compaction = plan_compaction([saved]).to_dict()

    denied = runner.handle("lifecycle.apply", {"actions": [compaction]})
    applied = runner.handle(
        "lifecycle.apply",
        {"actions": [compaction], "approval_marker": "review:service"},
    )
    audit = runner.handle(
        "audit.query",
        {"filters": {"operation": "lifecycle.apply", "dry_run": False}},
    )

    assert denied["valid"] is False
    assert denied["data"]["denied"][0]["reason"] == "review_required"
    assert applied["valid"] is True
    applied_target_id = applied["data"]["applied"][0]["target_id"]
    assert runner.runtime.graph_store.get_node(applied_target_id).kind == "summary"
    assert audit["count"] == 2
def test_service_runner_handles_non_review_lifecycle_apply() -> None:
    """Apply a ``MARK_STALE`` action with no approval marker and expect success.

    After the call the stored node's lifecycle must be ``LifecycleState.STALE``.
    """
    runner = LocalServiceRunner()
    target_id = "node.stale.service"
    runner.runtime.graph_store.save_node(MemoryNode(target_id, "episode"))
    mark_stale = LifecycleAction(
        LifecycleActionKind.MARK_STALE,
        target_id,
        from_state=LifecycleState.ACTIVE,
        to_state=LifecycleState.STALE,
    )

    applied = runner.handle("lifecycle.apply", {"actions": [mark_stale.to_dict()]})

    assert applied["valid"] is True
    stored = runner.runtime.graph_store.get_node(target_id)
    assert stored.lifecycle == LifecycleState.STALE
def test_profile_driven_runtime_config_resolves_file_backed_adapters(tmp_path) -> None:
config = RuntimeConfig.from_profile(
{