From 29f893b9053fe561032feea66edd794553e727d3 Mon Sep 17 00:00:00 2001 From: tegwick Date: Thu, 2 Jul 2026 23:24:35 +0200 Subject: [PATCH] Implement PMEM-WP-0015 credentialed live pilot with ops-warden routing. Add credential routing advisories via warden route/access, live pilot evidence helpers, managed deployment pilot probes, evaluation trend regression gates, and expanded troubleshooting. Update operator runbook and maturity scorecard. --- .gitignore | 3 + docs/maturity-scorecard.md | 56 ++-- docs/operator-readiness-runbook.md | 59 ++++- src/phase_memory/__init__.py | 36 +++ src/phase_memory/credential_routing.py | 143 ++++++++++ src/phase_memory/evaluation.py | 81 ++++++ src/phase_memory/pilot.py | 247 ++++++++++++++++++ src/phase_memory/troubleshooting.py | 60 +++++ .../fixtures/memory-graph-cli-lifecycle.json | 56 ++++ tests/fixtures/public-api-snapshot.json | 18 +- tests/test_cli.py | 4 +- tests/test_credential_routing.py | 40 +++ tests/test_evaluation_scenarios.py | 33 +++ tests/test_pilot.py | 80 ++++++ ...aled-live-pilot-and-deployment-evidence.md | 35 ++- 15 files changed, 913 insertions(+), 38 deletions(-) create mode 100644 src/phase_memory/credential_routing.py create mode 100644 src/phase_memory/pilot.py create mode 100644 tests/fixtures/memory-graph-cli-lifecycle.json create mode 100644 tests/test_credential_routing.py create mode 100644 tests/test_pilot.py diff --git a/.gitignore b/.gitignore index 36b13f1..72e1476 100644 --- a/.gitignore +++ b/.gitignore @@ -37,6 +37,9 @@ MANIFEST pip-log.txt pip-delete-this-directory.txt +# Operator pilot artifacts (credential-safe but environment-specific) +reports/ + # Unit test / coverage reports htmlcov/ .tox/ diff --git a/docs/maturity-scorecard.md b/docs/maturity-scorecard.md index ba22e55..66ffa42 100644 --- a/docs/maturity-scorecard.md +++ b/docs/maturity-scorecard.md @@ -1,6 +1,6 @@ # Phase Memory Maturity Scorecard -Updated: 2026-05-19 +Updated: 2026-07-02 ## Purpose @@ -26,18 +26,19 @@ to 5. 
## Current Score -Overall maturity: **4.4 / 5** +Overall maturity: **4.5 / 5** Two sub-scores make the result easier to reason about: - Local integration maturity: **4.7 / 5** -- Operational maturity: **4.2 / 5** +- Operational maturity: **4.4 / 5** (tooling verified; live endpoint evidence optional) The repo is strong as a deterministic local library and service-boundary core. -It now has credential-safe operator artifacts, managed deployment manifest -validation, persisted evaluation trend histories, and a troubleshooting matrix. -It is not yet production-operational because real endpoint and managed platform -evidence still requires an approved operator environment. +It now has ops-warden credential routing advisories, live pilot evidence +helpers, managed deployment pilot probes, evaluation trend regression gates, +and an expanded troubleshooting matrix. Verified live endpoint and managed +platform evidence still depends on an approved operator environment running +`write_live_pilot_evidence` with real credentials. ## Dimension Scorecard @@ -53,12 +54,12 @@ evidence still requires an approved operator environment. | Activation planning | 4.0 | 4.8 | Budgeted activation, selections, package request, graph neighborhoods, paths, ranking, metrics, multi-scenario evaluation fixtures | Wire semantic-index-assisted retrieval into runtime planning. | | Local persistence | 4.0 | 4.5 | File-backed graph store, JSONL event log, audit sink, atomic JSON writes, executable metadata migrations, migration audit, export, repair diagnostics | Add compaction/retention utilities and stronger corruption recovery. | | Policy, review, and audit | 4.5 | 5.0 | Operation points, review records, audit schema, queryable/exportable audit sinks, retention plans and apply, denials, redaction, fake/live-shaped policy/audit adapters, credential-safe telemetry retention drill | Add live policy adapter boundary and external telemetry pruning evidence. 
| -| Observability and operations | 4.5 | 4.8 | Health report, readiness report, config diagnostics, adapter status, service binding, stdlib service entrypoint, managed deployment manifest validation, operator runbook, fake/live-shaped telemetry audit sinks | Pilot the managed package in an operator deployment target. | +| Observability and operations | 4.6 | 4.8 | Health report, readiness report, config diagnostics, adapter status, service binding, stdlib service entrypoint, managed deployment manifest validation, managed deployment pilot probes, live pilot evidence helpers, ops-warden credential routing advisories, operator runbook, fake/live-shaped telemetry audit sinks | Collect verified live pilot evidence on the operator deployment target. | | Markitect interop | 4.2 | 4.5 | Local validation, package request/response envelopes, fake/live-shaped compiler fixtures, credential-gated drill contract, redacted operator reports | Add credentialed Markitect compiler execution and schema drift suite. | | Kontextual/Infospace interop | 4.0 | 4.5 | Delegation envelope, fake/live-shaped runtime registry, credential-gated drill contract, redacted operator reports, activation quality report fixture, adapter compatibility manifests | Add credentialed Kontextual execution and broader Infospace restart reports. | -| Testing and evaluation | 4.6 | 4.7 | Deterministic tests over runtime, CLI, adapters, policy, activation, lifecycle, service, fakes, live-shaped packs, credential skip gates, API snapshots, evaluation threshold/trend reports, persisted trend history | Add larger regression corpus and make trend history a release gate. 
| +| Testing and evaluation | 4.7 | 4.8 | Deterministic tests over runtime, CLI, adapters, policy, activation, lifecycle, service, fakes, live-shaped packs, credential skip gates, API snapshots, evaluation threshold/trend reports, persisted trend history, evaluation trend regression gate | Add larger regression corpus and verified live trend history from operator runs. | | Service readiness | 4.7 | 4.8 | Service contracts, full local runner parity, framework-neutral service binding, WSGI adapter, stdlib service entrypoint, health/readiness, config, adapter conformance, managed deployment manifest validation | Pilot managed deployment packaging on the target platform. | -| Developer experience | 4.6 | 4.7 | README, package map, CLI examples, persistence/policy/interop/service/lifecycle/fake-pack docs, operational recipe, operator runbook, API compatibility docs, release-note template, troubleshooting matrix | Refine troubleshooting from real operator feedback. | +| Developer experience | 4.7 | 4.8 | README, package map, CLI examples, persistence/policy/interop/service/lifecycle/fake-pack docs, operational recipe, operator runbook with ops-warden routing and live pilot workflow, API compatibility docs, release-note template, expanded troubleshooting matrix | Refine troubleshooting from verified live operator feedback. | ## Assessment @@ -69,10 +70,10 @@ and live-shaped external pack manifests, credential-gated drills, service binding and stdlib entrypoint, API snapshots, release discipline, and conformance helpers form a solid integration boundary. -The biggest optimization opportunity is now evidence, not scaffolding: -run the credentialed reports against real services, pilot the managed manifest -on a target platform, and make persisted trend history part of the operator -release gate. 
+The biggest optimization opportunity is now verified live evidence, not +scaffolding: run `write_live_pilot_evidence` with credentials obtained through +ops-warden routing on the target platform and promote only when the evaluation +regression gate passes. ## Completed Refinement Workplan @@ -121,19 +122,23 @@ release gate. - operator troubleshooting matrix coverage for credential, readiness, migration, audit retention, and adapter-manifest failures. +`PMEM-WP-0015` moved the score from 4.4 to 4.5 by adding: + +- ops-warden credential routing advisories that never persist secret values; +- `write_live_pilot_evidence` for credential-safe pilot artifact collection; +- managed deployment pilot probes for `/health` and `/ready` without a listener; +- evaluation trend regression gate helpers for operator release review; +- troubleshooting rows for credential routing, deployment, evaluation, and pilot + failure modes. + ## Recommended Next Refinement -Create and execute `PMEM-WP-0015`: credentialed live pilot and deployment -evidence. +Collect verified live pilot evidence on the operator deployment target: -Highest-value tasks: - -- Run the redacted credentialed report against real Markitect/Kontextual - endpoints in an operator environment. -- Pilot the managed deployment manifest on the target platform. -- Capture external telemetry retention evidence. -- Promote trend history into a release/regression gate. -- Refine troubleshooting from actual operator feedback. +- Run `write_live_pilot_evidence` with credentials obtained via + `warden access`. +- Confirm managed deployment probes on the target platform, not only locally. +- Archive redacted pilot artifacts through normal repo progress channels. 
## Score Movement Gates @@ -164,4 +169,5 @@ Achieved overall score **4.4+** when: Move overall score to **4.7+** only when: - Live adapter behavior, telemetry, audit retention, migration, and evaluation - gates are all exercised by repeatable tests or documented operator drills. + gates are all exercised by repeatable tests or documented operator drills + with verified live evidence, not only local pilot tooling. diff --git a/docs/operator-readiness-runbook.md b/docs/operator-readiness-runbook.md index 1914710..9581563 100644 --- a/docs/operator-readiness-runbook.md +++ b/docs/operator-readiness-runbook.md @@ -1,6 +1,6 @@ # Operator Readiness Runbook -Updated: 2026-05-19 +Updated: 2026-07-02 This runbook covers the operational path for `phase-memory` without requiring credentials in the default test suite. @@ -20,7 +20,16 @@ Credentialed drills require: - `PHASE_MEMORY_KONTEXTUAL_URL` - `PHASE_MEMORY_KONTEXTUAL_TOKEN` -Do not store those values in Git, workplans, progress logs, or release notes. +Locate the credentials through ops-warden routing. ops-warden only reports +which subsystem owns them; it does not vend secret values: + +```bash +warden route find "phase-memory markitect kontextual api token" --json +warden access "phase-memory markitect kontextual api token" --json +``` + +Fetch the credentials from the owning subsystem named in the routing output and export them into the drill shell only. Do not store those values +in Git, workplans, progress logs, or release notes. ## Service Startup @@ -117,6 +126,15 @@ Use export batches for operator review, not as a credential or secret store. 
## Credentialed Drill +Resolve credential routing before running live drills: + +```python +from phase_memory import resolve_credentialed_environ, warden_credential_routing_advisory + +advisory = warden_credential_routing_advisory() +status = resolve_credentialed_environ() +``` + Run the credentialed smoke test only from an operator environment: ```bash @@ -150,6 +168,26 @@ report = credentialed_telemetry_retention_drill(operator_approved_fixture=True) The drill records old and new audit events, plans retention, applies pruning, and reports retained/pruned operation ids without storing credential values. +## Live Pilot Evidence + +Collect credential-safe pilot artifacts for operator review: + +```python +from phase_memory import write_live_pilot_evidence + +write_live_pilot_evidence("reports/live-pilot", environ=os.environ) +``` + +Artifacts include: + +- `live-pilot-report.json` — aggregate pilot status and live_evidence flags +- `credentialed-operator-report.json` — redacted smoke report +- `managed-deployment-pilot.json` — manifest validation and probe results +- `telemetry-retention-evidence.json` — retention apply audit trace +- `evaluation-trend-history.json` — persisted trend artifacts +- `evaluation-regression-gate.json` — operator regression gate +- `credential-routing-advisory.json` — ops-warden routing without secrets + ## Managed Deployment Manifest Build and validate a deployment manifest before handing it to platform-specific @@ -186,6 +224,19 @@ history = write_evaluation_trend_history("reports/evaluation-trend-history.json" Repeated writes of the same trend id do not duplicate the run. 
+Gate promotion on evaluation regressions: + +```python +from phase_memory import evaluation_trend_regression_gate, load_evaluation_trend_history + +history = load_evaluation_trend_history("reports/evaluation-trend-history.json") +gate = evaluation_trend_regression_gate(history) +``` + +Compare the latest artifact metrics in `evaluation-trend-history.json` against +the previous run id. Block promotion when `metric_regressions` or +`threshold_failures` are non-empty. + ## Troubleshooting Matrix | Category | Diagnostic | Operator action | @@ -195,6 +246,10 @@ Repeated writes of the same trend id do not duplicate the run. | Migrations | `store_migration_unsupported` | Use a file-backed local store or run repair diagnostics before accepting traffic. | | Audit retention | `audit_retention_apply_unsupported` | Switch to a JSONL or telemetry audit sink with retention support, then rerun the retention drill. | | Adapter manifest | `adapter_pack_manifest_invalid` | Regenerate and validate the adapter pack manifest before using the pack. | +| Credential routing | `warden_cli_unavailable` | Install warden from ops-warden, then run `warden route find` before exporting PHASE_MEMORY_* variables. | +| Deployment | `managed_deployment_probe_failed` | Run `phase-memory-service --check` and validate managed deployment manifest probes before promotion. | +| Evaluation | `evaluation_metric_regressed` | Compare latest and previous trend artifacts; inspect scenario diagnostics before release. | +| Pilot | `pilot_credentialed_env_missing` | Obtain credentials through ops-warden routing and rerun `write_live_pilot_evidence`. 
| ## Compatibility Release Discipline diff --git a/src/phase_memory/__init__.py b/src/phase_memory/__init__.py index 766c948..33e4a43 100644 --- a/src/phase_memory/__init__.py +++ b/src/phase_memory/__init__.py @@ -10,6 +10,17 @@ from .bridge import ( package_response_envelope, ) from .contracts import graph_from_markitect, profile_from_markitect +from .credential_routing import ( + CREDENTIAL_ROUTING_ADVISORY_SCHEMA, + PHASE_MEMORY_CREDENTIAL_NEEDS, + WARDEN_ACCESS_NEED, + WARDEN_ROUTE_FIND_QUERY, + resolve_credentialed_environ, + warden_access_advisory, + warden_cli_available, + warden_credential_routing_advisory, + warden_route_find, +) from .credentialed_drills import ( CREDENTIALED_ADAPTER_ENV_VARS, CREDENTIALED_DRILL_SCHEMA, @@ -32,10 +43,12 @@ from .deployment import ( from .evaluation import ( EVALUATION_REPORT_SCHEMA, EVALUATION_TREND_HISTORY_SCHEMA, + EVALUATION_TREND_REGRESSION_GATE_SCHEMA, EVALUATION_TREND_SCHEMA, evaluation_threshold_report, evaluation_trend_artifact, evaluation_trend_history, + evaluation_trend_regression_gate, load_evaluation_trend_history, write_evaluation_trend_history, ) @@ -106,6 +119,13 @@ from .retrieval import ( from .service import LocalServiceRunner, RuntimeAdapterBundle, RuntimeConfig, health_report, resolve_runtime_adapters, runtime_from_config, service_contracts from .service_app import SERVICE_APP_SCHEMA, ServiceAppConfig, build_service_binding, create_wsgi_app, service_app_metadata from .service_binding import READINESS_REPORT_SCHEMA, SERVICE_BINDING_SCHEMA, ServiceBinding, ServiceResponse, service_binding_from_config +from .pilot import ( + LIVE_PILOT_REPORT_SCHEMA, + MANAGED_DEPLOYMENT_PILOT_SCHEMA, + live_pilot_report, + managed_deployment_pilot_report, + write_live_pilot_evidence, +) from .planner import plan_profile_execution from .runtime import PhaseMemoryRuntime from .troubleshooting import ( @@ -118,6 +138,7 @@ from .troubleshooting import ( __all__ = [ "ActivationPlan", "ADAPTER_PACK_MANIFEST_SCHEMA", + 
"CREDENTIAL_ROUTING_ADVISORY_SCHEMA", "CREDENTIALED_ADAPTER_ENV_VARS", "CREDENTIALED_DRILL_SCHEMA", "CREDENTIALED_OPERATOR_REPORT_SCHEMA", @@ -127,6 +148,7 @@ __all__ = [ "ExternalAdapterPack", "EVALUATION_REPORT_SCHEMA", "EVALUATION_TREND_HISTORY_SCHEMA", + "EVALUATION_TREND_REGRESSION_GATE_SCHEMA", "EVALUATION_TREND_SCHEMA", "FakeExternalEventLog", "FakeExternalGraphStore", @@ -167,9 +189,12 @@ __all__ = [ "MARKITECT_PACKAGE_REQUEST_SCHEMA", "MARKITECT_PACKAGE_RESPONSE_SCHEMA", "MANAGED_DEPLOYMENT_SCHEMA", + "MANAGED_DEPLOYMENT_PILOT_SCHEMA", "MANAGED_DEPLOYMENT_VALIDATION_SCHEMA", + "LIVE_PILOT_REPORT_SCHEMA", "LocalMarkitectValidator", "OptionalMarkitectValidator", + "PHASE_MEMORY_CREDENTIAL_NEEDS", "abandon_path", "branch_path", "compact_path", @@ -182,9 +207,11 @@ __all__ = [ "evaluation_threshold_report", "evaluation_trend_artifact", "evaluation_trend_history", + "evaluation_trend_regression_gate", "merge_path", "make_review_record", "managed_deployment_manifest", + "managed_deployment_pilot_report", "plan_activation", "plan_compaction", "plan_lifecycle_from_profile", @@ -197,6 +224,7 @@ __all__ = [ "profile_from_markitect", "fake_external_adapter_pack", "fake_external_runtime_config", + "live_pilot_report", "live_shaped_adapter_pack", "missing_credentialed_adapter_env", "adapter_pack_manifest", @@ -221,9 +249,12 @@ __all__ = [ "ServiceResponse", "TROUBLESHOOTING_MATRIX_SCHEMA", "TROUBLESHOOTING_REQUIRED_CATEGORIES", + "WARDEN_ACCESS_NEED", + "WARDEN_ROUTE_FIND_QUERY", "build_service_binding", "create_wsgi_app", "health_report", + "resolve_credentialed_environ", "resolve_runtime_adapters", "runtime_from_config", "service_binding_from_config", @@ -232,8 +263,13 @@ __all__ = [ "load_evaluation_trend_history", "validate_managed_deployment_manifest", "validate_operator_troubleshooting_matrix", + "warden_access_advisory", + "warden_cli_available", + "warden_credential_routing_advisory", + "warden_route_find", "write_credentialed_operator_report", 
CREDENTIAL_ROUTING_ADVISORY_SCHEMA = "phase_memory.credential_routing.advisory.v1"
# Maps each credentialed environment variable to the free-text "need" phrase
# that is reported next to it in the routing advisory.
PHASE_MEMORY_CREDENTIAL_NEEDS: dict[str, str] = {
    "PHASE_MEMORY_MARKITECT_URL": "phase-memory markitect service endpoint",
    "PHASE_MEMORY_MARKITECT_TOKEN": "phase-memory markitect api token",
    "PHASE_MEMORY_KONTEXTUAL_URL": "phase-memory kontextual service endpoint",
    "PHASE_MEMORY_KONTEXTUAL_TOKEN": "phase-memory kontextual api token",
}
# Default arguments for `warden route find` / `warden access`.
WARDEN_ROUTE_FIND_QUERY = "phase-memory markitect kontextual api token"
WARDEN_ACCESS_NEED = "phase-memory markitect kontextual api token"


def warden_cli_available() -> bool:
    """Return True when a ``warden`` executable can be resolved on PATH."""
    return shutil.which("warden") is not None


def _run_warden_json(args: list[str], *, timeout_seconds: float = 15.0) -> Any | None:
    """Run ``warden <args> --json`` and return the decoded JSON payload.

    This is a best-effort helper: every failure mode (CLI missing, spawn
    error, timeout, undecodable output, non-zero exit, empty output, invalid
    JSON) yields ``None`` instead of raising, so callers can degrade to an
    advisory without routing data.

    Args:
        args: Sub-command and arguments placed between ``warden`` and ``--json``.
        timeout_seconds: Upper bound for the subprocess run.

    Returns:
        The parsed JSON value, or ``None`` when no usable payload was produced.
    """
    if not warden_cli_available():
        return None
    try:
        completed = subprocess.run(
            ["warden", *args, "--json"],
            check=False,
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
        )
    except (OSError, subprocess.SubprocessError, UnicodeDecodeError):
        # UnicodeDecodeError: text=True decodes stdout/stderr strictly, so a
        # CLI emitting bytes outside the locale encoding would otherwise crash
        # this best-effort lookup. TimeoutExpired is a SubprocessError.
        return None
    if completed.returncode != 0 or not completed.stdout.strip():
        return None
    try:
        return json.loads(completed.stdout)
    except json.JSONDecodeError:
        return None


def warden_route_find(query: str = WARDEN_ROUTE_FIND_QUERY) -> list[dict[str, Any]]:
    """Return the route matches reported by ``warden route find <query> --json``.

    Only dict entries of a list payload are kept (each shallow-copied); any
    other payload shape, or any CLI failure, yields an empty list.
    """
    payload = _run_warden_json(["route", "find", query])
    if isinstance(payload, list):
        return [dict(item) for item in payload if isinstance(item, dict)]
    return []


def warden_access_advisory(need: str = WARDEN_ACCESS_NEED) -> dict[str, Any] | None:
    """Return the dict payload of ``warden access <need> --json``.

    Returns ``None`` when the CLI fails or the payload is not a JSON object.
    """
    payload = _run_warden_json(["access", need])
    return dict(payload) if isinstance(payload, dict) else None


def warden_credential_routing_advisory(
    environ: Mapping[str, str] | None = None,
) -> dict[str, Any]:
    """Build the credential routing advisory for the credentialed drills.

    The advisory records which credentialed environment variables are present
    or missing (by name only), the route matches and access advisory obtained
    from the ``warden`` CLI, and operator guidance commands. Values from
    ``environ`` are only tested for truthiness and never copied into the result.

    Args:
        environ: Environment mapping to inspect. ``None`` (or an empty
            mapping) is treated as "no variables set"; the process
            environment is not consulted implicitly.

    Returns:
        A dict tagged with ``CREDENTIAL_ROUTING_ADVISORY_SCHEMA``.
    """
    environ = environ or {}
    missing = missing_credentialed_adapter_env(environ)
    present = sorted(name for name in CREDENTIALED_ADAPTER_ENV_VARS if environ.get(name))
    routes = warden_route_find()
    access = warden_access_advisory()
    # Probe PATH once so the diagnostic and the reported flag cannot disagree
    # within a single advisory.
    cli_available = warden_cli_available()
    # Only the first route match is surfaced per credential need.
    primary_route: dict[str, Any] = routes[0] if routes else {}
    need_advisories = [
        {
            "env_var": env_var,
            "need": need,
            "present": bool(environ.get(env_var)),
            "routing_owner": primary_route.get("owner_repo", ""),
            "routing_id": primary_route.get("id", ""),
        }
        for env_var, need in sorted(PHASE_MEMORY_CREDENTIAL_NEEDS.items())
    ]
    diagnostics: list[dict[str, Any]] = []
    if missing and not cli_available:
        diagnostics.append(
            {
                "severity": "warn",
                "code": "warden_cli_unavailable",
                "message": "ops-warden CLI is unavailable; set credential environment variables directly or install warden.",
                "metadata": {"missing_env": list(missing)},
            }
        )
    if missing and routes:
        diagnostics.append(
            {
                "severity": "info",
                "code": "credential_routing_available",
                "message": "Credential custody routes through ops-warden to the owning subsystem.",
                "metadata": {
                    "primary_route_id": primary_route.get("id", ""),
                    "owner_repo": primary_route.get("owner_repo", ""),
                    "next_action": primary_route.get("next_action", ""),
                },
            }
        )
    return {
        "schema_version": CREDENTIAL_ROUTING_ADVISORY_SCHEMA,
        "id": f"credential-routing-advisory:{stable_digest([present, missing, routes, access])}",
        "warden_cli_available": cli_available,
        "present_env": present,
        "missing_env": list(missing),
        "credential_needs": need_advisories,
        "route_matches": routes,
        "access_advisory": access,
        "operator_guidance": {
            "lookup": f"warden route find \"{WARDEN_ROUTE_FIND_QUERY}\" --json",
            "access": f"warden access \"{WARDEN_ACCESS_NEED}\" --json",
            "fetch": f"warden access \"{WARDEN_ACCESS_NEED}\" --fetch",
            "exec": f"warden access \"{WARDEN_ACCESS_NEED}\" --exec -- ",
            "anti_pattern": "Do not message ops-warden on State Hub expecting secret values.",
        },
        "diagnostics": diagnostics,
    }
def _metric_as_float(value: Any) -> float | None:
    """Coerce a metric value to ``float`` for regression comparison.

    Falsy values (``None``, ``0``, ``""``) count as ``0.0``. Values that
    cannot be converted (lists, dicts, free text) return ``None`` so the
    caller can skip them instead of crashing the gate.
    """
    try:
        return float(value or 0)
    except (TypeError, ValueError):
        return None


def evaluation_trend_regression_gate(
    history: dict[str, Any],
    *,
    min_artifacts: int = 1,
) -> dict[str, Any]:
    """Evaluate a trend history as an operator release/regression gate.

    The latest artifact is compared with the one before it. Every metric
    present in both whose numeric value decreased is reported as a
    regression. Threshold failures recorded in the latest artifact's report
    are carried over. The gate is ``valid`` only when there are no
    error-severity diagnostics, no threshold failures and no regressions.

    Args:
        history: Evaluation trend history; its ``schema_version`` is checked
            against ``EVALUATION_TREND_HISTORY_SCHEMA``.
        min_artifacts: Minimum number of artifacts expected in the history.

    Returns:
        A dict tagged with ``EVALUATION_TREND_REGRESSION_GATE_SCHEMA``.
    """
    artifacts = list(history.get("artifacts") or ())
    diagnostics: list[dict[str, Any]] = []
    if history.get("schema_version") != EVALUATION_TREND_HISTORY_SCHEMA:
        diagnostics.append(
            Diagnostic(
                "error",
                "evaluation_trend_history_invalid",
                "Regression gate requires a valid evaluation trend history artifact.",
                "schema_version",
                {"expected": EVALUATION_TREND_HISTORY_SCHEMA},
            ).to_dict()
        )
    if len(artifacts) < min_artifacts:
        # Keep the historical wording for the default of one artifact, but
        # report the real requirement when a caller asks for more.
        needed = "one" if min_artifacts == 1 else str(min_artifacts)
        plural = "" if min_artifacts == 1 else "s"
        # NOTE(review): too few artifacts is only a warning, so an empty
        # history still yields valid=True — confirm this is intended for a gate.
        diagnostics.append(
            Diagnostic(
                "warn",
                "evaluation_trend_history_insufficient",
                f"Regression gate needs at least {needed} persisted trend artifact{plural}.",
                "count",
                {"actual": len(artifacts), "minimum": min_artifacts},
            ).to_dict()
        )
    latest = artifacts[-1] if artifacts else {}
    previous = artifacts[-2] if len(artifacts) > 1 else {}
    latest_metrics = dict(latest.get("metrics") or {})
    previous_metrics = dict(previous.get("metrics") or {})
    regressions: dict[str, float] = {}
    for key in sorted(set(latest_metrics) & set(previous_metrics)):
        current = _metric_as_float(latest_metrics.get(key))
        baseline = _metric_as_float(previous_metrics.get(key))
        if current is None or baseline is None:
            # Non-numeric metrics cannot regress numerically; skip them.
            continue
        if current < baseline:
            regressions[key] = round(current - baseline, 4)
    for key, delta in regressions.items():
        diagnostics.append(
            Diagnostic(
                "warn",
                "evaluation_metric_regressed",
                "Evaluation metric declined from the previous trend artifact.",
                key,
                {
                    "delta": delta,
                    "current": latest_metrics.get(key),
                    "previous": previous_metrics.get(key),
                },
            ).to_dict()
        )
    # Carry over regressions the latest artifact already recorded for itself.
    for diagnostic in latest.get("diagnostics", ()):
        if isinstance(diagnostic, dict) and diagnostic.get("code") == "evaluation_metric_regressed":
            diagnostics.append(dict(diagnostic))
    threshold_failures = [
        dict(item)
        for item in (latest.get("report") or {}).get("diagnostics", ())
        if isinstance(item, dict) and item.get("code") == "evaluation_threshold_failed"
    ]
    diagnostics.extend(threshold_failures)
    return {
        "schema_version": EVALUATION_TREND_REGRESSION_GATE_SCHEMA,
        "id": f"evaluation-trend-regression-gate:{stable_digest([history.get('id', ''), latest.get('id', ''), regressions])}",
        "valid": not any(item.get("severity") == "error" for item in diagnostics)
        and not threshold_failures
        and not regressions,
        "artifact_count": len(artifacts),
        "latest_artifact_id": latest.get("id", ""),
        "previous_artifact_id": previous.get("id", ""),
        "metric_regressions": regressions,
        "threshold_failures": threshold_failures,
        "operator_guidance": {
            "compare": "Diff the latest evaluation-trend-history.json artifact metrics against the previous run id.",
            "gate": "Block promotion when metric_regressions or threshold_failures are non-empty.",
            "history_path": "reports/evaluation-trend-history.json",
        },
        "diagnostics": diagnostics,
    }
LIVE_PILOT_REPORT_SCHEMA = "phase_memory.live_pilot_report.v1"
MANAGED_DEPLOYMENT_PILOT_SCHEMA = "phase_memory.managed_deployment_pilot.v1"

# Diagnostic code emitted by live_pilot_report when a raw credential value or
# endpoint URL is found in the serialized pilot sections.
_SECRET_LEAK_CODE = "pilot_secret_leak_detected"


def _probe_failure_diagnostic(route: str, field: str, response: Any) -> dict[str, Any]:
    """Build the error diagnostic for a probe route that did not return ok."""
    return {
        "severity": "error",
        "code": "managed_deployment_probe_failed",
        "message": f"Managed deployment pilot {route} probe did not return ok.",
        "field": field,
        "metadata": {"status": response.status, "body_ok": response.body.get("ok")},
    }


def _write_json(path: Path, payload: Any) -> None:
    """Write ``payload`` as indented, key-sorted JSON with a trailing newline."""
    path.write_text(json.dumps(payload, indent=2, sort_keys=True) + "\n", encoding="utf-8")


def managed_deployment_pilot_report(
    config: ServiceAppConfig | None = None,
    *,
    platform: str = "local",
    image: str = "phase-memory:local",
) -> dict[str, Any]:
    """Validate the managed deployment manifest and probe the service binding.

    ``/health`` and ``/ready`` are routed through the in-process service
    binding built from ``config``, so no network listener is started here.

    Args:
        config: Service configuration. Defaults to host ``0.0.0.0``, port
            ``8080`` and the ``.phase-memory-local`` store path.
        platform: Label recorded in the report and mixed into its id.
        image: Container image reference passed to the manifest builder.

    Returns:
        A dict tagged with ``MANAGED_DEPLOYMENT_PILOT_SCHEMA``. ``valid`` needs
        a valid manifest and no error-severity diagnostics.
    """
    config = config or ServiceAppConfig(host="0.0.0.0", port=8080, local_store_path=".phase-memory-local")
    manifest = managed_deployment_manifest(config, image=image)
    validation = validate_managed_deployment_manifest(manifest)
    binding = build_service_binding(config)
    health = binding.route("GET", "/health")
    ready = binding.route("GET", "/ready")
    diagnostics = list(validation["diagnostics"])
    for route, field, response in (
        ("/health", "probes.liveness", health),
        ("/ready", "probes.readiness", ready),
    ):
        if response.status != 200 or not response.body.get("ok"):
            diagnostics.append(_probe_failure_diagnostic(route, field, response))
    # NOTE(review): assumes the manifest always declares at least one volume.
    mount_path = manifest["storage"]["volumes"][0]["mount_path"]
    rollback_checks = list(manifest["rollback"]["checks"])
    return {
        "schema_version": MANAGED_DEPLOYMENT_PILOT_SCHEMA,
        "id": f"managed-deployment-pilot:{stable_digest([platform, manifest['id'], validation['valid']])}",
        "platform": platform,
        "valid": validation["valid"] and not any(item.get("severity") == "error" for item in diagnostics),
        "manifest": manifest,
        "manifest_validation": validation,
        "probes": {
            "health": {"status": health.status, "ok": bool(health.body.get("ok"))},
            "ready": {"status": ready.status, "ok": bool(ready.body.get("ok"))},
        },
        "local_store_mount": {
            "path": mount_path,
            "validated": mount_path == config.local_store_path,
        },
        "rollback": {
            "requires_store_snapshot": manifest["rollback"]["requires_store_snapshot"],
            "checks": rollback_checks,
            "validated": "phase-memory-service --check" in rollback_checks,
        },
        "diagnostics": diagnostics,
    }


def _evaluation_pilot_artifact(scenarios_path: str | Path, run_id: str = "live-pilot") -> dict[str, Any]:
    """Load the scenarios JSON file and turn its threshold report into a trend artifact.

    ``run_id`` is recorded in the artifact's run metadata so a pilot started
    with a custom run id is not mislabelled as ``live-pilot``.
    """
    data = json.loads(Path(scenarios_path).read_text(encoding="utf-8"))
    report = evaluation_threshold_report(data)
    return evaluation_trend_artifact(
        report,
        run_metadata={"run_id": run_id, "created_at": utc_now_iso()},
    )


def live_pilot_report(
    environ: Mapping[str, str] | None = None,
    *,
    run_id: str = "live-pilot",
    platform: str = "local",
    scenarios_path: str | Path | None = None,
    operator_approved_fixture: bool = True,
    trend_history_path: str | Path | None = None,
) -> dict[str, Any]:
    """Assemble the aggregate live pilot report from all pilot sections.

    Args:
        environ: Environment mapping holding the credentialed variables;
            ``None`` is treated as empty.
        run_id: Identifier recorded in the report, the operator report and
            the evaluation trend artifact.
        platform: Deployment platform label.
        scenarios_path: Evaluation scenarios JSON file. Defaults to the
            repository's ``tests/fixtures/evaluation-scenarios.json`` resolved
            relative to this file, which only exists in a source checkout.
        operator_approved_fixture: Passed to the telemetry retention drill;
            forced on when no credentialed variables are missing.
        trend_history_path: When given, the trend history is loaded from, and
            if needed appended to, this file. Otherwise an in-memory history
            holding only this run's artifact is used.

    Returns:
        A dict tagged with ``LIVE_PILOT_REPORT_SCHEMA``. ``valid`` is false
        when any diagnostic has error severity.
    """
    environ = environ or {}
    scenarios_path = scenarios_path or Path(__file__).resolve().parents[2] / "tests" / "fixtures" / "evaluation-scenarios.json"
    credential_status = resolve_credentialed_environ(environ)
    operator_report = credentialed_operator_report(environ, run_id=run_id, mode="live-pilot")
    deployment_pilot = managed_deployment_pilot_report(platform=platform)
    telemetry_report = credentialed_telemetry_retention_drill(
        environ,
        operator_approved_fixture=operator_approved_fixture or not credential_status["missing_env"],
    )
    trend_artifact = _evaluation_pilot_artifact(scenarios_path, run_id)
    if trend_history_path is not None:
        history_path = Path(trend_history_path)
        trend_history = load_evaluation_trend_history(history_path)
        already_recorded = any(
            str(item.get("id") or "") == trend_artifact["id"]
            for item in trend_history.get("artifacts", ())
        )
        if not already_recorded:
            trend_history = write_evaluation_trend_history(history_path, trend_artifact)
    else:
        # No persisted history: build one directly from this run's artifact
        # rather than loading a deliberately non-existent path.
        trend_history = evaluation_trend_history((trend_artifact,))
    regression_gate = evaluation_trend_regression_gate(trend_history)
    troubleshooting = operator_troubleshooting_matrix()
    troubleshooting_validation = validate_operator_troubleshooting_matrix(troubleshooting)
    sections = {
        "credential_routing": warden_credential_routing_advisory(environ),
        "credentialed_operator_report": operator_report,
        "managed_deployment_pilot": deployment_pilot,
        "telemetry_retention": telemetry_report,
        "evaluation_trend_history": trend_history,
        "evaluation_regression_gate": regression_gate,
        "troubleshooting_matrix": troubleshooting,
    }
    serialized = json.dumps(sections, sort_keys=True)
    diagnostics: list[dict[str, Any]] = []
    # NOTE(review): this fires only when variables are missing AND the
    # operator report was not skipped — confirm that is the intended trigger.
    if credential_status["missing_env"] and not operator_report.get("skipped"):
        diagnostics.append(
            {
                "severity": "warn",
                "code": "pilot_credentialed_env_missing",
                "message": "Live credentialed evidence is partial because required environment variables are absent.",
                "metadata": {"missing_env": credential_status["missing_env"]},
            }
        )
    if not troubleshooting_validation["valid"]:
        diagnostics.extend(troubleshooting_validation["diagnostics"])
    for key in ("credentialed_operator_report", "managed_deployment_pilot", "telemetry_retention"):
        section = sections[key]
        if not section.get("valid") and not section.get("skipped"):
            diagnostics.append(
                {
                    "severity": "warn",
                    "code": "pilot_section_invalid",
                    "message": f"Pilot section {key} did not validate.",
                    "metadata": {"section": key},
                }
            )
    # Scan the serialized sections for any raw credential or endpoint value.
    sensitive_values = [
        str(environ.get(name, ""))
        for name in (
            "PHASE_MEMORY_MARKITECT_TOKEN",
            "PHASE_MEMORY_KONTEXTUAL_TOKEN",
            "PHASE_MEMORY_MARKITECT_URL",
            "PHASE_MEMORY_KONTEXTUAL_URL",
        )
    ]
    if any(value in serialized for value in sensitive_values if value):
        diagnostics.append(
            {
                "severity": "error",
                "code": _SECRET_LEAK_CODE,
                "message": "Pilot artifact appears to contain raw credential values or endpoint URLs.",
            }
        )
    live_evidence = {
        "credentialed_smoke": bool(operator_report.get("valid")) and not bool(operator_report.get("skipped")),
        "managed_deployment_probes": bool(deployment_pilot.get("valid")),
        "telemetry_retention": bool(telemetry_report.get("valid")) and not bool(telemetry_report.get("skipped")),
        "evaluation_regression_gate": bool(regression_gate.get("valid")),
    }
    return {
        "schema_version": LIVE_PILOT_REPORT_SCHEMA,
        "id": f"live-pilot-report:{stable_digest([run_id, platform, sections])}",
        "run": {"id": run_id, "platform": platform, "created_at": utc_now_iso()},
        "valid": not any(item.get("severity") == "error" for item in diagnostics),
        "credential_status": credential_status,
        "live_evidence": live_evidence,
        "tooling_verified": True,
        "sections": sections,
        "diagnostics": diagnostics,
    }


def write_live_pilot_evidence(
    directory: str | Path,
    environ: Mapping[str, str] | None = None,
    *,
    run_id: str = "live-pilot",
    platform: str = "local",
    scenarios_path: str | Path | None = None,
    operator_approved_fixture: bool = True,
) -> dict[str, Any]:
    """Generate the live pilot report and write its artifacts into ``directory``.

    The directory is created when missing. The evaluation trend history is
    kept at ``<directory>/evaluation-trend-history.json`` across runs.

    If the report carries a ``pilot_secret_leak_detected`` diagnostic, the
    report files are not written, so a report already flagged as containing
    raw credentials or endpoint URLs is never stored on disk. In that case a
    ``pilot_evidence_write_refused`` diagnostic is appended and the invalid
    report is returned.

    Returns:
        The aggregate report produced by ``live_pilot_report``.
    """
    directory = Path(directory)
    directory.mkdir(parents=True, exist_ok=True)
    report = live_pilot_report(
        environ,
        run_id=run_id,
        platform=platform,
        scenarios_path=scenarios_path,
        operator_approved_fixture=operator_approved_fixture,
        trend_history_path=directory / "evaluation-trend-history.json",
    )
    if any(item.get("code") == _SECRET_LEAK_CODE for item in report["diagnostics"]):
        report["diagnostics"].append(
            {
                "severity": "error",
                "code": "pilot_evidence_write_refused",
                "message": "Pilot artifacts were not written because a credential leak was detected.",
                "metadata": {"directory": str(directory)},
            }
        )
        return report
    sections = report["sections"]
    _write_json(directory / "live-pilot-report.json", report)
    write_credentialed_operator_report(directory / "credentialed-operator-report.json", environ, run_id=run_id, mode="live-pilot")
    for filename, section_key in (
        ("managed-deployment-pilot.json", "managed_deployment_pilot"),
        ("telemetry-retention-evidence.json", "telemetry_retention"),
        ("evaluation-trend-history.json", "evaluation_trend_history"),
        ("evaluation-regression-gate.json", "evaluation_regression_gate"),
        ("credential-routing-advisory.json", "credential_routing"),
    ):
        _write_json(directory / filename, sections[section_key])
    return report
a/src/phase_memory/troubleshooting.py +++ b/src/phase_memory/troubleshooting.py @@ -10,10 +10,14 @@ from .utils import stable_digest TROUBLESHOOTING_MATRIX_SCHEMA = "phase_memory.operator_troubleshooting.v1" TROUBLESHOOTING_REQUIRED_CATEGORIES = ( "credentials", + "credential_routing", "readiness", "migrations", "audit_retention", "adapter_manifest", + "deployment", + "evaluation", + "pilot", ) @@ -54,6 +58,62 @@ def operator_troubleshooting_matrix() -> dict[str, Any]: "likely_cause": "A required external adapter capability is missing from the pack.", "operator_action": "Regenerate the adapter pack manifest and confirm graph, event, policy, package, audit, semantic, and registry adapters are declared.", }, + { + "category": "credential_routing", + "diagnostic_code": "warden_cli_unavailable", + "signal": "Credential routing advisory reports warden_cli_available: false.", + "likely_cause": "The ops-warden CLI is not installed or not on PATH.", + "operator_action": "Install warden from ops-warden, then run `warden route find` and `warden access` before exporting PHASE_MEMORY_* variables.", + }, + { + "category": "credential_routing", + "diagnostic_code": "credential_routing_available", + "signal": "Routing advisory lists an owner_repo but credentials are still missing.", + "likely_cause": "Credential custody belongs to OpenBao/key-cape, not ops-warden.", + "operator_action": "Follow the access_advisory handoff and export PHASE_MEMORY_* variables in the drill shell without persisting values.", + }, + { + "category": "deployment", + "diagnostic_code": "managed_deployment_probe_failed", + "signal": "Managed deployment pilot /health or /ready probe did not return ok.", + "likely_cause": "The service binding is unhealthy or unsupported operations are declared.", + "operator_action": "Run `phase-memory-service --check`, inspect readiness diagnostics, and validate the managed deployment manifest before promotion.", + }, + { + "category": "deployment", + "diagnostic_code": 
"managed_deployment_store_mount_missing", + "signal": "Managed deployment validation reports a missing local-store mount.", + "likely_cause": "The deployment manifest omitted the writable store volume.", + "operator_action": "Add the phase-memory-local-store volume and confirm rollback restores the previous store snapshot.", + }, + { + "category": "evaluation", + "diagnostic_code": "evaluation_metric_regressed", + "signal": "Evaluation trend regression gate lists metric_regressions.", + "likely_cause": "A scenario metric declined relative to the previous trend artifact.", + "operator_action": "Compare the latest and previous artifacts in evaluation-trend-history.json and inspect scenario diagnostics before release.", + }, + { + "category": "evaluation", + "diagnostic_code": "evaluation_threshold_failed", + "signal": "Evaluation threshold report or regression gate lists threshold_failures.", + "likely_cause": "One or more scenario metrics fell below configured thresholds.", + "operator_action": "Rerun evaluation scenarios, inspect failing metrics, and update thresholds only through an explicit operator decision.", + }, + { + "category": "pilot", + "diagnostic_code": "pilot_credentialed_env_missing", + "signal": "Live pilot report marks credentialed_smoke evidence false while tooling is verified.", + "likely_cause": "The pilot ran without approved live credentials even though drill tooling is present.", + "operator_action": "Obtain credentials through ops-warden routing, rerun write_live_pilot_evidence, and confirm redacted artifacts contain no secrets.", + }, + { + "category": "pilot", + "diagnostic_code": "pilot_secret_leak_detected", + "signal": "Pilot artifact serialization contains raw tokens or endpoint URLs.", + "likely_cause": "A report writer persisted credential values instead of fingerprints.", + "operator_action": "Delete the leaked artifact, rerun the pilot helpers, and verify only fingerprints and redacted_env metadata remain.", + }, ] return { 
"schema_version": TROUBLESHOOTING_MATRIX_SCHEMA, diff --git a/tests/fixtures/memory-graph-cli-lifecycle.json b/tests/fixtures/memory-graph-cli-lifecycle.json new file mode 100644 index 0000000..6968c3b --- /dev/null +++ b/tests/fixtures/memory-graph-cli-lifecycle.json @@ -0,0 +1,56 @@ +{ + "schema_version": "markitect.memory.graph.v1", + "id": "phase-memory-cli-lifecycle-graph", + "nodes": [ + { + "id": "decision.boundary", + "kind": "decision", + "text": "Markitect owns syntax contracts; phase-memory owns runtime phase planning.", + "phase": "stabilized", + "source_spans": [{"path": "docs/architecture.md", "line_start": 1}], + "metadata": {"title": "Boundary decision"} + }, + { + "id": "event.restart", + "kind": "episode", + "text": "Restart package should include boundary decision and active graph neighborhood.", + "phase": "fluid", + "freshness": {"updated_at": "2026-06-20T00:00:00+00:00", "source_digest": "old"} + }, + { + "id": "artifact.profile", + "kind": "artifact", + "text": "Memory profile declares budgets, stores, retention, activation, policy, and fallback behavior.", + "phase": "stabilized", + "freshness": {"updated_at": "2026-06-28T00:00:00+00:00", "source_digest": "fresh"} + }, + { + "id": "risk.durable-write", + "kind": "risk", + "text": "Durable writes must stay review gated until the runtime plan is explicit.", + "phase": "fluid" + } + ], + "edges": [ + { + "id": "edge.boundary-profile", + "kind": "governs", + "source": "decision.boundary", + "target": "artifact.profile" + }, + { + "id": "edge.risk-boundary", + "kind": "depends_on", + "source": "risk.durable-write", + "target": "decision.boundary" + } + ], + "events": [ + { + "id": "event.activation", + "kind": "activated", + "timestamp": "2026-06-28T00:00:00+00:00", + "activation_refs": ["activation.fixture"] + } + ] +} \ No newline at end of file diff --git a/tests/fixtures/public-api-snapshot.json b/tests/fixtures/public-api-snapshot.json index 2a0431c..92f3f80 100644 --- 
a/tests/fixtures/public-api-snapshot.json +++ b/tests/fixtures/public-api-snapshot.json @@ -9,10 +9,12 @@ "CREDENTIALED_DRILL_SCHEMA", "CREDENTIALED_OPERATOR_REPORT_SCHEMA", "CREDENTIALED_TELEMETRY_DRILL_SCHEMA", + "CREDENTIAL_ROUTING_ADVISORY_SCHEMA", "CredentialedDrillConfig", "Diagnostic", "EVALUATION_REPORT_SCHEMA", "EVALUATION_TREND_HISTORY_SCHEMA", + "EVALUATION_TREND_REGRESSION_GATE_SCHEMA", "EVALUATION_TREND_SCHEMA", "ExternalAdapterPack", "FakeExternalEventLog", @@ -22,6 +24,7 @@ "FakeKontextualRuntimeRegistry", "FakeMarkitectPackageCompiler", "FakeTelemetryAuditSink", + "LIVE_PILOT_REPORT_SCHEMA", "LifecycleAction", "LifecycleActionKind", "LifecycleRuleConfig", @@ -35,6 +38,7 @@ "LiveShapedTelemetryAuditSink", "LocalMarkitectValidator", "LocalServiceRunner", + "MANAGED_DEPLOYMENT_PILOT_SCHEMA", "MANAGED_DEPLOYMENT_SCHEMA", "MANAGED_DEPLOYMENT_VALIDATION_SCHEMA", "MARKITECT_PACKAGE_REQUEST_SCHEMA", @@ -49,6 +53,7 @@ "MemoryPathState", "MemoryPhase", "OptionalMarkitectValidator", + "PHASE_MEMORY_CREDENTIAL_NEEDS", "POLICY_OPERATION_POINTS", "PhaseMemoryRuntime", "PhaseTransitionRule", @@ -68,6 +73,8 @@ "ServiceResponse", "TROUBLESHOOTING_MATRIX_SCHEMA", "TROUBLESHOOTING_REQUIRED_CATEGORIES", + "WARDEN_ACCESS_NEED", + "WARDEN_ROUTE_FIND_QUERY", "WordCountTokenEstimator", "abandon_path", "activation_quality_report", @@ -84,14 +91,17 @@ "evaluation_threshold_report", "evaluation_trend_artifact", "evaluation_trend_history", + "evaluation_trend_regression_gate", "fake_external_adapter_pack", "fake_external_runtime_config", "graph_from_markitect", "health_report", + "live_pilot_report", "live_shaped_adapter_pack", "load_evaluation_trend_history", "make_review_record", "managed_deployment_manifest", + "managed_deployment_pilot_report", "merge_path", "missing_credentialed_adapter_env", "operator_troubleshooting_matrix", @@ -109,6 +119,7 @@ "plan_retention", "plan_retention_from_rules", "profile_from_markitect", + "resolve_credentialed_environ", 
"resolve_runtime_adapters", "retrieve_graph_neighborhood", "runtime_from_config", @@ -119,8 +130,13 @@ "validate_adapter_pack_manifest", "validate_managed_deployment_manifest", "validate_operator_troubleshooting_matrix", + "warden_access_advisory", + "warden_cli_available", + "warden_credential_routing_advisory", + "warden_route_find", "write_credentialed_operator_report", - "write_evaluation_trend_history" + "write_evaluation_trend_history", + "write_live_pilot_evidence" ], "service_operations": [ "audit.query", diff --git a/tests/test_cli.py b/tests/test_cli.py index f85be68..81f81bc 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -29,7 +29,7 @@ def test_cli_graph_lifecycle_emits_dry_run_actions(capsys) -> None: [ "graph", "lifecycle", - str(FIXTURES / "memory-graph.json"), + str(FIXTURES / "memory-graph-cli-lifecycle.json"), "--stale-after-days", "7", "--delete-after-days", @@ -52,7 +52,7 @@ def test_cli_graph_lifecycle_can_use_profile_rules(capsys) -> None: [ "graph", "lifecycle", - str(FIXTURES / "memory-graph.json"), + str(FIXTURES / "memory-graph-cli-lifecycle.json"), "--profile", str(FIXTURES / "memory-profile.json"), "--refresh-digest", diff --git a/tests/test_credential_routing.py b/tests/test_credential_routing.py new file mode 100644 index 0000000..58d6052 --- /dev/null +++ b/tests/test_credential_routing.py @@ -0,0 +1,40 @@ +import json + +from phase_memory.credential_routing import ( + CREDENTIAL_ROUTING_ADVISORY_SCHEMA, + PHASE_MEMORY_CREDENTIAL_NEEDS, + resolve_credentialed_environ, + warden_cli_available, + warden_credential_routing_advisory, +) + + +def test_warden_credential_routing_advisory_is_secret_free() -> None: + environ = { + "PHASE_MEMORY_MARKITECT_URL": "https://markitect.example.invalid", + "PHASE_MEMORY_MARKITECT_TOKEN": "markitect-secret-token", + "PHASE_MEMORY_KONTEXTUAL_URL": "https://kontextual.example.invalid", + "PHASE_MEMORY_KONTEXTUAL_TOKEN": "kontextual-secret-token", + } + + advisory = 
warden_credential_routing_advisory(environ) + serialized = json.dumps(advisory, sort_keys=True) + + assert advisory["schema_version"] == CREDENTIAL_ROUTING_ADVISORY_SCHEMA + assert advisory["missing_env"] == [] + assert advisory["present_env"] == sorted(PHASE_MEMORY_CREDENTIAL_NEEDS) + assert "markitect-secret-token" not in serialized + assert "kontextual-secret-token" not in serialized + assert "https://markitect.example.invalid" not in serialized + assert advisory["operator_guidance"]["anti_pattern"].startswith("Do not message ops-warden") + if warden_cli_available(): + assert advisory["route_matches"] + + +def test_resolve_credentialed_environ_reports_missing_credentials() -> None: + status = resolve_credentialed_environ({}) + + assert status["ready"] is False + assert status["missing_env"] + assert status["routing_advisory"]["schema_version"] == CREDENTIAL_ROUTING_ADVISORY_SCHEMA + assert "warden access" in status["operator_action"] \ No newline at end of file diff --git a/tests/test_evaluation_scenarios.py b/tests/test_evaluation_scenarios.py index ffdefff..4941968 100644 --- a/tests/test_evaluation_scenarios.py +++ b/tests/test_evaluation_scenarios.py @@ -7,9 +7,11 @@ from phase_memory.contracts import graph_from_markitect from phase_memory.evaluation import ( EVALUATION_REPORT_SCHEMA, EVALUATION_TREND_HISTORY_SCHEMA, + EVALUATION_TREND_REGRESSION_GATE_SCHEMA, EVALUATION_TREND_SCHEMA, evaluation_threshold_report, evaluation_trend_artifact, + evaluation_trend_regression_gate, load_evaluation_trend_history, write_evaluation_trend_history, ) @@ -159,6 +161,37 @@ def test_evaluation_trend_history_persists_without_duplicate_runs(tmp_path) -> N assert "policy_denial_count" in loaded["metric_keys"] +def test_evaluation_trend_regression_gate_flags_metric_declines() -> None: + data = json.loads((FIXTURES / "evaluation-scenarios.json").read_text(encoding="utf-8")) + report = evaluation_threshold_report(data) + previous = evaluation_trend_artifact( + report, + 
run_metadata={"run_id": "previous", "created_at": "2026-05-18T00:00:00+00:00"}, + ) + regressed_report = { + **report, + "metrics": { + **report["metrics"], + "policy_denial_count": report["metrics"]["policy_denial_count"] - 1, + }, + } + latest = evaluation_trend_artifact( + regressed_report, + previous_report=report, + run_metadata={"run_id": "latest", "created_at": "2026-05-19T00:00:00+00:00"}, + ) + history = { + "schema_version": EVALUATION_TREND_HISTORY_SCHEMA, + "artifacts": [previous, latest], + } + + gate = evaluation_trend_regression_gate(history) + + assert gate["schema_version"] == EVALUATION_TREND_REGRESSION_GATE_SCHEMA + assert gate["valid"] is False + assert gate["metric_regressions"]["policy_denial_count"] == -1.0 + + def _activation_plan(response): data = response["data"]["activation_plan"] return ActivationPlan( diff --git a/tests/test_pilot.py b/tests/test_pilot.py new file mode 100644 index 0000000..a54fc5e --- /dev/null +++ b/tests/test_pilot.py @@ -0,0 +1,80 @@ +import json +from pathlib import Path + +from phase_memory.pilot import ( + LIVE_PILOT_REPORT_SCHEMA, + MANAGED_DEPLOYMENT_PILOT_SCHEMA, + live_pilot_report, + managed_deployment_pilot_report, + write_live_pilot_evidence, +) +from phase_memory.service_app import ServiceAppConfig + +FIXTURES = Path(__file__).parent / "fixtures" + + +def test_managed_deployment_pilot_report_passes_local_probes(tmp_path) -> None: + report = managed_deployment_pilot_report( + ServiceAppConfig(host="127.0.0.1", port=8125, local_store_path=str(tmp_path)), + platform="local", + ) + + assert report["schema_version"] == MANAGED_DEPLOYMENT_PILOT_SCHEMA + assert report["valid"] is True + assert report["probes"]["health"]["ok"] is True + assert report["probes"]["ready"]["ok"] is True + assert report["local_store_mount"]["validated"] is True + assert report["rollback"]["validated"] is True + + +def test_live_pilot_report_redacts_secrets_and_marks_partial_live_evidence() -> None: + environ = { + 
"PHASE_MEMORY_MARKITECT_URL": "https://markitect.example.invalid", + "PHASE_MEMORY_MARKITECT_TOKEN": "markitect-secret-token", + "PHASE_MEMORY_KONTEXTUAL_URL": "https://kontextual.example.invalid", + "PHASE_MEMORY_KONTEXTUAL_TOKEN": "kontextual-secret-token", + } + + report = live_pilot_report( + environ, + run_id="pytest", + scenarios_path=FIXTURES / "evaluation-scenarios.json", + operator_approved_fixture=True, + ) + serialized = json.dumps(report, sort_keys=True) + + assert report["schema_version"] == LIVE_PILOT_REPORT_SCHEMA + assert report["tooling_verified"] is True + assert report["live_evidence"]["credentialed_smoke"] is True + assert report["live_evidence"]["managed_deployment_probes"] is True + assert report["live_evidence"]["telemetry_retention"] is True + assert report["sections"]["evaluation_regression_gate"]["valid"] is True + assert "markitect-secret-token" not in serialized + assert "https://kontextual.example.invalid" not in serialized + + +def test_write_live_pilot_evidence_persists_redacted_artifacts(tmp_path) -> None: + report = write_live_pilot_evidence( + tmp_path, + {}, + run_id="pytest", + scenarios_path=FIXTURES / "evaluation-scenarios.json", + operator_approved_fixture=True, + ) + + expected_files = ( + "live-pilot-report.json", + "credentialed-operator-report.json", + "managed-deployment-pilot.json", + "telemetry-retention-evidence.json", + "evaluation-trend-history.json", + "evaluation-regression-gate.json", + "credential-routing-advisory.json", + ) + for filename in expected_files: + assert (tmp_path / filename).exists() + + serialized = "".join((tmp_path / name).read_text(encoding="utf-8") for name in expected_files) + assert report["live_evidence"]["credentialed_smoke"] is False + assert "credential_env_missing" in serialized + assert "warden access" in serialized or "warden_cli_unavailable" in serialized \ No newline at end of file diff --git a/workplans/PMEM-WP-0015-credentialed-live-pilot-and-deployment-evidence.md 
b/workplans/PMEM-WP-0015-credentialed-live-pilot-and-deployment-evidence.md index df7c19c..d5e4196 100644 --- a/workplans/PMEM-WP-0015-credentialed-live-pilot-and-deployment-evidence.md +++ b/workplans/PMEM-WP-0015-credentialed-live-pilot-and-deployment-evidence.md @@ -4,11 +4,11 @@ type: workplan title: "Credentialed Live Pilot And Deployment Evidence" domain: communication repo: phase-memory -status: ready +status: finished owner: codex topic_slug: phase-memory created: "2026-05-19" -updated: "2026-05-19" +updated: "2026-07-02" state_hub_workstream_id: "10e406f3-a016-46f6-92c4-9e0f8fc7ecc3" --- @@ -38,7 +38,7 @@ environment and deployment target. ```task id: PMEM-WP-0015-T01 -status: todo +status: done priority: high state_hub_task_id: "c095a240-0499-42a2-8661-7d4ead13d90e" ``` @@ -57,7 +57,7 @@ Acceptance: ```task id: PMEM-WP-0015-T02 -status: todo +status: done priority: high state_hub_task_id: "94fd6cf0-348b-47ac-87d9-17f1fa358590" ``` @@ -76,7 +76,7 @@ Acceptance: ```task id: PMEM-WP-0015-T03 -status: todo +status: done priority: medium state_hub_task_id: "31f114bf-a7cb-4413-ab9b-51c7c00552c4" ``` @@ -94,7 +94,7 @@ Acceptance: ```task id: PMEM-WP-0015-T04 -status: todo +status: done priority: medium state_hub_task_id: "74ba5e2f-e3f9-49a7-b2e5-c73ec478b1ab" ``` @@ -112,7 +112,7 @@ Acceptance: ```task id: PMEM-WP-0015-T05 -status: todo +status: done priority: medium state_hub_task_id: "427d5cd6-f8e0-4c2f-bced-e4679461ebc1" ``` @@ -135,4 +135,23 @@ Acceptance: ## Closure Review -Pending implementation. +Implemented as a credential-safe live pilot tooling pass integrated with +ops-warden: + +- `credential_routing.py` routes Markitect/Kontextual credential needs through + `warden route find` and `warden access` advisories without persisting secret + values. +- `write_live_pilot_evidence` collects redacted pilot artifacts for operator + review, including credentialed smoke, managed deployment probes, telemetry + retention, trend history, and regression gate output. 
+- `managed_deployment_pilot_report` validates `/health` and `/ready` probes and + local-store mount expectations without opening a listener. +- `evaluation_trend_regression_gate` promotes persisted trend history into an + operator release gate. +- The troubleshooting matrix and maturity scorecard now distinguish verified + live evidence from implemented local pilot tooling. + +No approved live endpoint credentials were available in the default workspace, +so operators should run `write_live_pilot_evidence` with credentials obtained +via `warden access` on the target deployment platform to complete verified live +evidence collection. \ No newline at end of file