From 941501c590b7ab0e4a81ee2f97d7ef25af7e8294 Mon Sep 17 00:00:00 2001 From: tegwick Date: Wed, 24 Jun 2026 14:52:35 +0200 Subject: [PATCH] FLEX-WP-0007: production registry fixture, tests, and sync runbook Add production_registry_snapshot.json from ops-warden inventory with CI coverage for real actors, IAM subject binding, ttl_out_of_bounds, and unknown_actor_resource. Extend serve contract tests with /healthz and publish the registry sync contract for operator deployment. --- cmd/flex-auth/main_test.go | 152 +++++- docs/ops-warden-policy-gate-handoff.md | 22 + docs/ops-warden-registry-sync.md | 128 +++++ docs/workplan-planning-map.md | 6 +- examples/ops-warden/README.md | 15 + .../production_registry_snapshot.json | 450 ++++++++++++++++++ ...arden-policy-gate-production-deployment.md | 211 ++++++++ 7 files changed, 981 insertions(+), 3 deletions(-) create mode 100644 docs/ops-warden-registry-sync.md create mode 100644 examples/ops-warden/production_registry_snapshot.json create mode 100644 workplans/FLEX-WP-0007-ops-warden-policy-gate-production-deployment.md diff --git a/cmd/flex-auth/main_test.go b/cmd/flex-auth/main_test.go index 3d6e371..a95d208 100644 --- a/cmd/flex-auth/main_test.go +++ b/cmd/flex-auth/main_test.go @@ -111,6 +111,15 @@ func TestServeOpsWardenCheckContract(t *testing.T) { server := httptest.NewServer(newServeMux(engine)) defer server.Close() + resp, err := http.Get(server.URL + "/healthz") + if err != nil { + t.Fatalf("GET /healthz: %v", err) + } + resp.Body.Close() + if resp.StatusCode != http.StatusOK { + t.Fatalf("GET /healthz status = %d; want 200", resp.StatusCode) + } + allow := postCheck(t, server.URL+"/v1/check", opsPath("check_request_allow_adm.json")) if allow.Effect != api.DecisionEffectAllow || allow.ID == "" { t.Fatalf("allow decision = %+v; want allow with id", allow) @@ -121,7 +130,7 @@ func TestServeOpsWardenCheckContract(t *testing.T) { t.Fatalf("deny decision = %+v; want ttl_out_of_bounds deny", deny) } - resp, err := 
http.Get(server.URL + "/v1/check") + resp, err = http.Get(server.URL + "/v1/check") if err != nil { t.Fatalf("GET /v1/check: %v", err) } @@ -148,6 +157,124 @@ func TestServeOpsWardenCheckContract(t *testing.T) { } } +func TestRunLoadRegistryOpsWardenProduction(t *testing.T) { + var stdout, stderr bytes.Buffer + code := run([]string{"load-registry", "--file", opsPath("production_registry_snapshot.json")}, &stdout, &stderr) + if code != 0 { + t.Fatalf("code = %d, stderr = %s", code, stderr.String()) + } + + var result map[string]any + if err := json.Unmarshal(stdout.Bytes(), &result); err != nil { + t.Fatalf("unmarshal load-registry output: %v; stdout = %s", err, stdout.String()) + } + if result["subjects"] != float64(4) || result["relationships"] != float64(4) || result["resource_manifests"] != float64(1) { + t.Fatalf("load-registry result = %+v; want production actor registry counts", result) + } +} + +func TestOpsWardenProductionRegistryActors(t *testing.T) { + engine, err := buildEngine(context.Background(), opsPath("production_registry_snapshot.json"), opsPath("policy_package.md"), "") + if err != nil { + t.Fatalf("buildEngine: %v", err) + } + + cases := []struct { + name string + subjectID string + actor string + actorType string + principal string + ttlHours float64 + wantEffect api.DecisionEffect + wantReason string + }{ + { + name: "state hub bridge agent allow", + subjectID: "agt-state-hub-bridge", + actor: "agt-state-hub-bridge", + actorType: "agt", + principal: "agt-task-bridge", + ttlHours: 1, + wantEffect: api.DecisionEffectAllow, + }, + { + name: "state hub bridge IAM subject allow", + subjectID: "iam:agt-state-hub-bridge", + actor: "agt-state-hub-bridge", + actorType: "agt", + principal: "agt-task-bridge", + ttlHours: 1, + wantEffect: api.DecisionEffectAllow, + }, + { + name: "codex interhub bootstrap agent allow", + subjectID: "agt-codex-interhub-bootstrap", + actor: "agt-codex-interhub-bootstrap", + actorType: "agt", + principal: 
"agt-interhub-bootstrap", + ttlHours: 1, + wantEffect: api.DecisionEffectAllow, + }, + { + name: "admin actor allow", + subjectID: "adm-example", + actor: "adm-example", + actorType: "adm", + principal: "adm-full", + ttlHours: 4, + wantEffect: api.DecisionEffectAllow, + }, + { + name: "automation actor allow", + subjectID: "atm-backup-daily", + actor: "atm-backup-daily", + actorType: "atm", + principal: "atm-backup-daily", + ttlHours: 1, + wantEffect: api.DecisionEffectAllow, + }, + { + name: "ttl above production max denies", + subjectID: "agt-state-hub-bridge", + actor: "agt-state-hub-bridge", + actorType: "agt", + principal: "agt-task-bridge", + ttlHours: 999, + wantEffect: api.DecisionEffectDeny, + wantReason: "ttl_out_of_bounds", + }, + { + name: "unregistered production actor denies", + subjectID: "agt-missing", + actor: "agt-missing", + actorType: "agt", + principal: "agt-missing", + ttlHours: 1, + wantEffect: api.DecisionEffectDeny, + wantReason: "unknown_actor_resource", + }, + } + + for _, tt := range cases { + t.Run(tt.name, func(t *testing.T) { + decision, err := engine.Check(context.Background(), opsWardenProductionSignRequest(tt.subjectID, tt.actor, tt.actorType, tt.principal, tt.ttlHours)) + if err != nil { + t.Fatalf("Check: %v", err) + } + if decision.Effect != tt.wantEffect { + t.Fatalf("decision.Effect = %q; want %q; decision: %+v", decision.Effect, tt.wantEffect, decision) + } + if tt.wantReason != "" && decision.Reason != tt.wantReason { + t.Fatalf("decision.Reason = %q; want %q; decision: %+v", decision.Reason, tt.wantReason, decision) + } + if tt.wantEffect == api.DecisionEffectAllow && decision.ID == "" { + t.Fatal("allow decision ID is empty") + } + }) + } +} + func TestRunValidateAccessDescriptor(t *testing.T) { var stdout, stderr bytes.Buffer code := run([]string{"validate", "--kind", "access-descriptor", "--file", examplePath("access_descriptor.yaml")}, &stdout, &stderr) @@ -167,6 +294,29 @@ func opsPath(name string) string { return 
filepath.Join("..", "..", "examples", "ops-warden", name) } +func opsWardenProductionSignRequest(subjectID, actor, actorType, principal string, ttlHours float64) api.CheckRequest { + return api.CheckRequest{ + ID: "check:ops-warden-production-" + actor, + Tenant: "tenant:platform", + Subject: api.SubjectRef{ + ID: subjectID, + Type: api.SubjectType(actorType), + }, + Action: "sign", + Resource: api.ResourceRef{ + ID: "ssh-cert:actor/" + actor, + Type: "ssh-certificate", + System: "ops-warden", + }, + Context: map[string]any{ + "principals": []string{principal}, + "actor_type": actorType, + "ttl_hours": ttlHours, + "pubkey_fingerprint": "SHA256:example-production-fingerprint", + }, + } +} + func postCheck(t *testing.T, url, path string) api.DecisionEnvelope { t.Helper() diff --git a/docs/ops-warden-policy-gate-handoff.md b/docs/ops-warden-policy-gate-handoff.md index 32e4018..dcba10e 100644 --- a/docs/ops-warden-policy-gate-handoff.md +++ b/docs/ops-warden-policy-gate-handoff.md @@ -80,3 +80,25 @@ integration, host documentation, and signatures.log production evidence. No SSH private keys, OpenBao tokens, database credentials, or real public-key material are stored in these fixtures. + + +## FLEX-WP-0007 Production Update + +Additional published assets: + +- Production registry fixture: examples/ops-warden/production_registry_snapshot.json +- Registry sync runbook: docs/ops-warden-registry-sync.md + +Production runtime command: + + flex-auth serve --addr 0.0.0.0:8080 --registry examples/ops-warden/production_registry_snapshot.json --policy examples/ops-warden/policy_package.md --log /var/log/flex-auth/ops-warden-decisions.jsonl + +Use http://flex-auth.flex-auth.svc.cluster.local:8080 when cluster DNS is +reachable from warden workstations. Otherwise use the approved operator tunnel +or ingress URL. Always pre-flight GET /healthz from the same workstation before +enabling policy.enabled with fail_closed true. 
+ +Production actor coverage now verifies agt-state-hub-bridge, +agt-codex-interhub-bootstrap, adm-example, atm-backup-daily, ttl_out_of_bounds, +unknown_actor_resource, and the iam:agt-state-hub-bridge subject path used by +WARDEN_POLICY_SUBJECT. diff --git a/docs/ops-warden-registry-sync.md b/docs/ops-warden-registry-sync.md new file mode 100644 index 0000000..ee42ee5 --- /dev/null +++ b/docs/ops-warden-registry-sync.md @@ -0,0 +1,128 @@ +# Ops-Warden Registry Sync + +Date: 2026-06-23 +Workplan: FLEX-WP-0007 + +This is the flex-auth side of the production policy gate runbook for ops-warden +SSH signing. ops-warden owns actor inventory and generated registry content; +flex-auth hosts that registry, evaluates the policy package, and returns the +decision envelope used by warden sign. + +## Production Runtime Target + +Use the NetKingdom operator-reachable service URL as the canonical +policy.flex_auth_url. The preferred target is an in-cluster flex-auth Service +fronted by the existing operator access path: + + http://flex-auth.flex-auth.svc.cluster.local:8080 + +If cluster DNS is not reachable from the workstation that runs warden sign, use +an approved operator tunnel or ingress URL with the same base path semantics. Do +not turn on policy.enabled with fail_closed true until this pre-flight succeeds +from the same workstation (FLEX_AUTH_URL holds the policy.flex_auth_url value): + + curl -fsS "$FLEX_AUTH_URL/healthz" + +Start the runtime with the production registry snapshot and the ops-warden +policy package: + + flex-auth serve --addr 0.0.0.0:8080 --registry examples/ops-warden/production_registry_snapshot.json --policy examples/ops-warden/policy_package.md --log /var/log/flex-auth/ops-warden-decisions.jsonl + +The checked-in production snapshot is a non-secret fixture and initial load +target. Regenerate it from ops-warden inventory whenever actors, principals, or +TTL defaults change. 
+ +## Current Operator Tunnel + +As of 2026-06-24, the reachable operator-tunnel URL for CoulombCore is: + + http://127.0.0.1:18090 + +The tunnel name is flex-auth-coulombcore. It forwards CoulombCore +127.0.0.1:18090 to the local flex-auth runtime on 127.0.0.1:18090. Verified +checks from CoulombCore: + +- GET /healthz returned HTTP 200. +- POST /v1/check for agt-state-hub-bridge returned allow with decision:873c6c682a52bebc. + +This is an operator tunnel pattern, not a substitute for a future in-cluster +Service if flex-auth should run inside the cluster. + +## Ownership Contract + +| Concern | Owner | Notes | +| --- | --- | --- | +| Actor names and actor types | ops-warden | inventory.yaml defines adm, agt, and atm actors. | +| Default principals and TTLs | ops-warden | Used by warden sign and by generated registry attributes. | +| Registry hosting and reload | flex-auth | Runtime serves the generated snapshot and evaluates it with the policy package. | +| Policy package semantics | flex-auth | examples/ops-warden/policy_package.md owns allow and deny reasons. | +| OpenBao SSH signing | ops-warden | flex-auth never receives SSH private keys or Vault tokens. | +| Production policy.enabled flip | ops-warden operator | Only after healthz and allow/deny smoke pass. | + +## Sync Procedure + +1. In ops-warden, update the managed inventory source or ~/.config/warden/inventory.yaml. +2. Regenerate the flex-auth snapshot from ops-warden: + + python scripts/build_flex_auth_registry.py ~/.config/warden/inventory.yaml -o registry/flex-auth/production_registry_snapshot.json + +3. Validate the generated file before handoff: + + flex-auth load-registry --file registry/flex-auth/production_registry_snapshot.json + +4. Copy or promote the snapshot to the flex-auth runtime. For repo-level drift + coverage, update examples/ops-warden/production_registry_snapshot.json when + the intended production fixture changes. +5. Restart or reload the flex-auth runtime with the new snapshot. 
+6. From the workstation that runs warden sign, verify (FLEX_AUTH_URL holds the policy.flex_auth_url value): + + curl -fsS "$FLEX_AUTH_URL/healthz" + +7. Run one allow smoke and one deny smoke. Record only non-secret evidence: + actor name, decision id, effect, reason, backend, and whether a certificate + was issued. + +## Current Production Fixture + +The initial fixture mirrors ops-warden production inventory as of 2026-06-23. +It registers: + +| Actor | Type | Principal | Max TTL hours | Allowed subjects | +| --- | --- | --- | --- | --- | +| adm-example | adm | adm-full | 48 | adm-example, iam:adm-example | +| agt-codex-interhub-bootstrap | agt | agt-interhub-bootstrap | 2 | agt-codex-interhub-bootstrap, iam:agt-codex-interhub-bootstrap | +| agt-state-hub-bridge | agt | agt-task-bridge | 24 | agt-state-hub-bridge, iam:agt-state-hub-bridge | +| atm-backup-daily | atm | atm-backup-daily | 8 | atm-backup-daily, iam:atm-backup-daily | + +The IAM subject form is intended for WARDEN_POLICY_SUBJECT. If that environment +variable is unset, ops-warden sends the actor name and the same policy path +continues to work. + +## Smoke Expectations + +Allow path: + + warden sign agt-state-hub-bridge + +Expected non-secret evidence: decision effect allow, reason +signing_policy_matched, signatures.log includes policy_decision_id. + +Deny path: + + warden sign agt-state-hub-bridge --ttl 999 + +Expected non-secret evidence: effect deny, reason ttl_out_of_bounds, no +certificate issued. With fail_closed true, unreachable flex-auth must also block +signing. + +OpenBao-backed signing remains an operator smoke because it requires a scoped +VAULT_TOKEN. 
The previous session returned HTTP 403 on 2026-06-23; retry with: + + SMOKE_VAULT=1 ~/ops-warden/scripts/policy_gate_production_smoke.sh + +## References + +- docs/ops-warden-policy-gate-handoff.md +- examples/ops-warden/production_registry_snapshot.json +- ~/ops-warden/wiki/PolicyGatedSigning.md +- ~/ops-warden/history/2026-06-23-flex-auth-policy-gate-production-smoke.md diff --git a/docs/workplan-planning-map.md b/docs/workplan-planning-map.md index c74cc80..1debb51 100644 --- a/docs/workplan-planning-map.md +++ b/docs/workplan-planning-map.md @@ -25,6 +25,7 @@ This document captures the current sequencing view for flex-auth workplans. | `FLEX-WP-0003` | complete | completed | `FLEX-WP-0002` | Markitect consumer integration and first CARING benchmark are complete: resource namespace, manifest import, action vocabulary, descriptor fixtures, decision fixtures, integration docs. | | `FLEX-WP-0004` | complete | completed | `FLEX-WP-0002`, `FLEX-WP-0005` | Delegated PDP and directory adapter boundary work is complete: Topaz adapter shape, OpenFGA/SpiceDB, OPA/Cedar, Keycloak Authorization Services, Entra/Graph/SCIM, CARING envelope preservation. | | `FLEX-WP-0006` | complete | finished | `FLEX-WP-0002`, `FLEX-WP-0005` | Ops-warden unblocker is complete: flex-auth publishes `ssh-certificate` / `sign` policies, fixtures, and `/v1/check` smoke evidence for the opt-in pre-sign gate shipped in ops-warden `WARDEN-WP-0007` and tracked for production in `WARDEN-WP-0009`. | +| `FLEX-WP-0007` | `P0` | blocked | `FLEX-WP-0006` | Repo-side production registry fixture, sync contract, runtime command, healthz coverage, and real actor/IAM tests are implemented. Operator deployment and OpenBao smoke remain blocked on reachable runtime selection and scoped VAULT_TOKEN refresh. 
| ## Dependency Notes @@ -79,5 +80,6 @@ Native State Hub dependency edges: - `FLEX-WP-0004 -> FLEX-WP-0005` (Topaz adapter consumes the spike) - `FLEX-WP-0006 -> FLEX-WP-0002` - `FLEX-WP-0006 -> FLEX-WP-0005` -- ops-warden: `WARDEN-WP-0009` waits for `FLEX-WP-0006` output before - production enablement of `policy.enabled`. +- ops-warden: `WARDEN-WP-0009` finished (caller + registry smoke). Production + `policy.enabled: true` waits for `FLEX-WP-0007` (reachable flex-auth runtime). +- `FLEX-WP-0007 -> FLEX-WP-0006` diff --git a/examples/ops-warden/README.md b/examples/ops-warden/README.md index 3d37398..f09efee 100644 --- a/examples/ops-warden/README.md +++ b/examples/ops-warden/README.md @@ -32,3 +32,18 @@ flex-auth check --registry examples/ops-warden/registry_snapshot.json --policy e The fixture public-key fingerprints are examples only. Do not put real keys, OpenBao tokens, or private signing material in these files. + + +## Production Registry Fixture + +production_registry_snapshot.json is a non-secret fixture generated by +ops-warden for FLEX-WP-0007 coverage. It mirrors the current production actor +names used by ops-warden inventory and should be refreshed when that inventory +changes. + +Validate both registries locally: + + flex-auth load-registry --file examples/ops-warden/registry_snapshot.json + flex-auth load-registry --file examples/ops-warden/production_registry_snapshot.json + +The production sync contract is documented in docs/ops-warden-registry-sync.md. 
diff --git a/examples/ops-warden/production_registry_snapshot.json b/examples/ops-warden/production_registry_snapshot.json new file mode 100644 index 0000000..3110228 --- /dev/null +++ b/examples/ops-warden/production_registry_snapshot.json @@ -0,0 +1,450 @@ +{ + "systems": [ + { + "id": "ops-warden", + "name": "Ops Warden", + "resource_types": [ + { + "name": "ssh-certificate", + "scope_level": "Resource", + "planes": [ + "Identity", + "Secret", + "Audit" + ], + "metadata": { + "description": "Short-lived SSH certificate signing request." + } + } + ], + "actions": [ + { + "name": "sign", + "capabilities": [ + "Use", + "Operate", + "Audit" + ], + "planes": [ + "Identity", + "Secret", + "Audit" + ], + "exposure_modes": [ + "Metadata" + ], + "metadata": { + "required_context": [ + "principals", + "actor_type", + "pubkey_fingerprint", + "ttl_hours" + ] + } + } + ], + "caring_profiles": [ + "caring-0.4.0-rc2" + ], + "metadata": { + "flex_auth_contract": "protected-system-v0", + "ops_warden_policy_gate": "v2", + "policy_enabled_config": "policy.enabled", + "tenant": "tenant:platform" + } + } + ], + "resource_manifests": [ + { + "id": "ops-warden-ssh-certificates", + "system": "ops-warden", + "resources": [ + { + "id": "ssh-cert:actor/adm-example", + "type": "ssh-certificate", + "labels": [ + "ssh-signing", + "adm" + ], + "trust_zone": "platform", + "owner": "team:platform-security", + "attributes": { + "actor_id": "adm-example", + "actor_type": "adm", + "allowed_subjects": [ + "adm-example", + "iam:adm-example" + ], + "allowed_principals": [ + "adm-full" + ], + "max_ttl_hours": 48 + } + }, + { + "id": "ssh-cert:actor/agt-codex-interhub-bootstrap", + "type": "ssh-certificate", + "labels": [ + "ssh-signing", + "agt" + ], + "trust_zone": "platform", + "owner": "team:platform-security", + "attributes": { + "actor_id": "agt-codex-interhub-bootstrap", + "actor_type": "agt", + "allowed_subjects": [ + "agt-codex-interhub-bootstrap", + "iam:agt-codex-interhub-bootstrap" + ], + 
"allowed_principals": [ + "agt-interhub-bootstrap" + ], + "max_ttl_hours": 2 + } + }, + { + "id": "ssh-cert:actor/agt-state-hub-bridge", + "type": "ssh-certificate", + "labels": [ + "ssh-signing", + "agt" + ], + "trust_zone": "platform", + "owner": "team:platform-security", + "attributes": { + "actor_id": "agt-state-hub-bridge", + "actor_type": "agt", + "allowed_subjects": [ + "agt-state-hub-bridge", + "iam:agt-state-hub-bridge" + ], + "allowed_principals": [ + "agt-task-bridge" + ], + "max_ttl_hours": 24 + } + }, + { + "id": "ssh-cert:actor/atm-backup-daily", + "type": "ssh-certificate", + "labels": [ + "ssh-signing", + "atm" + ], + "trust_zone": "platform", + "owner": "team:platform-security", + "attributes": { + "actor_id": "atm-backup-daily", + "actor_type": "atm", + "allowed_subjects": [ + "atm-backup-daily", + "iam:atm-backup-daily" + ], + "allowed_principals": [ + "atm-backup-daily" + ], + "max_ttl_hours": 8 + } + } + ], + "actions": [ + "sign" + ], + "caring_profile": "caring-0.4.0-rc2", + "metadata": { + "flex_auth_contract": "resource-registration-v0", + "tenant": "tenant:platform" + } + } + ], + "tenants": [ + { + "id": "tenant:platform", + "name": "Platform Tenant" + } + ], + "subjects": [ + { + "id": "adm-example", + "type": "Agent", + "display_name": "Example human operator \u2014 replace with per-person adm-* actors", + "organization_relation": "ServiceProvider", + "roles": [ + "Operator" + ], + "groups": [ + "group:ops-warden-admins" + ], + "tenant": "tenant:platform", + "metadata": { + "actor_type": "adm" + } + }, + { + "id": "agt-codex-interhub-bootstrap", + "type": "Agent", + "display_name": "Short-lived agent access for attended Inter-Hub bootstrap", + "organization_relation": "ServiceProvider", + "roles": [ + "Operator" + ], + "groups": [ + "group:ops-warden-agents" + ], + "tenant": "tenant:platform", + "metadata": { + "actor_type": "agt" + } + }, + { + "id": "agt-state-hub-bridge", + "type": "Agent", + "display_name": "ops-bridge tunnel agent 
for state-hub", + "organization_relation": "ServiceProvider", + "roles": [ + "Operator" + ], + "groups": [ + "group:ops-warden-agents" + ], + "tenant": "tenant:platform", + "metadata": { + "actor_type": "agt" + } + }, + { + "id": "atm-backup-daily", + "type": "Automation", + "display_name": "Example nightly automation actor", + "organization_relation": "ServiceProvider", + "roles": [ + "Operator" + ], + "groups": [ + "group:ops-warden-automations" + ], + "tenant": "tenant:platform", + "metadata": { + "actor_type": "atm" + } + } + ], + "groups": [ + { + "id": "group:ops-warden-admins", + "display_name": "Ops Warden Admins", + "members": [ + "adm-example" + ], + "tenant": "tenant:platform" + }, + { + "id": "group:ops-warden-agents", + "display_name": "Ops Warden Agents", + "members": [ + "agt-codex-interhub-bootstrap", + "agt-state-hub-bridge" + ], + "tenant": "tenant:platform" + }, + { + "id": "group:ops-warden-automations", + "display_name": "Ops Warden Automations", + "members": [ + "atm-backup-daily" + ], + "tenant": "tenant:platform" + } + ], + "relationships": [ + { + "id": "rel:adm-example-sign-adm-example", + "system": "ops-warden", + "subject": "group:ops-warden-admins", + "relation": "signer", + "object": "ssh-cert:actor/adm-example", + "tenant": "tenant:platform", + "conditions": [ + "TimeLimited", + "Logged" + ], + "caring": { + "id": "descriptor:ops-warden-adm-signer", + "profile": "caring-0.4.0-rc2", + "subject_type": "Group", + "organization_relation": "ServiceProvider", + "canonical_role": "Operator", + "scope": { + "level": "Resource", + "id": "ssh-cert:actor/adm-example", + "tenant": "tenant:platform", + "resource": "ssh-cert:actor/adm-example" + }, + "planes": [ + "Identity", + "Secret", + "Audit" + ], + "capabilities": [ + "Use", + "Operate", + "Audit" + ], + "exposure_modes": [ + "Metadata" + ], + "conditions": [ + "TimeLimited", + "Logged" + ], + "restrictions": [ + "PrivilegeEscalationBlocked", + "SecretAccessBlocked" + ], + "access_path": 
"mediated" + } + }, + { + "id": "rel:agt-codex-interhub-bootstrap-sign-agt-codex-interhub-bootstrap", + "system": "ops-warden", + "subject": "group:ops-warden-agents", + "relation": "signer", + "object": "ssh-cert:actor/agt-codex-interhub-bootstrap", + "tenant": "tenant:platform", + "conditions": [ + "TimeLimited", + "Logged" + ], + "caring": { + "id": "descriptor:ops-warden-agt-signer", + "profile": "caring-0.4.0-rc2", + "subject_type": "Group", + "organization_relation": "ServiceProvider", + "canonical_role": "Operator", + "scope": { + "level": "Resource", + "id": "ssh-cert:actor/agt-codex-interhub-bootstrap", + "tenant": "tenant:platform", + "resource": "ssh-cert:actor/agt-codex-interhub-bootstrap" + }, + "planes": [ + "Identity", + "Secret", + "Audit" + ], + "capabilities": [ + "Use", + "Operate", + "Audit" + ], + "exposure_modes": [ + "Metadata" + ], + "conditions": [ + "TimeLimited", + "Logged" + ], + "restrictions": [ + "PrivilegeEscalationBlocked", + "SecretAccessBlocked" + ], + "access_path": "mediated" + } + }, + { + "id": "rel:agt-state-hub-bridge-sign-agt-state-hub-bridge", + "system": "ops-warden", + "subject": "group:ops-warden-agents", + "relation": "signer", + "object": "ssh-cert:actor/agt-state-hub-bridge", + "tenant": "tenant:platform", + "conditions": [ + "TimeLimited", + "Logged" + ], + "caring": { + "id": "descriptor:ops-warden-agt-signer", + "profile": "caring-0.4.0-rc2", + "subject_type": "Group", + "organization_relation": "ServiceProvider", + "canonical_role": "Operator", + "scope": { + "level": "Resource", + "id": "ssh-cert:actor/agt-state-hub-bridge", + "tenant": "tenant:platform", + "resource": "ssh-cert:actor/agt-state-hub-bridge" + }, + "planes": [ + "Identity", + "Secret", + "Audit" + ], + "capabilities": [ + "Use", + "Operate", + "Audit" + ], + "exposure_modes": [ + "Metadata" + ], + "conditions": [ + "TimeLimited", + "Logged" + ], + "restrictions": [ + "PrivilegeEscalationBlocked", + "SecretAccessBlocked" + ], + "access_path": 
"mediated" + } + }, + { + "id": "rel:atm-backup-daily-sign-atm-backup-daily", + "system": "ops-warden", + "subject": "group:ops-warden-automations", + "relation": "signer", + "object": "ssh-cert:actor/atm-backup-daily", + "tenant": "tenant:platform", + "conditions": [ + "TimeLimited", + "Logged" + ], + "caring": { + "id": "descriptor:ops-warden-atm-signer", + "profile": "caring-0.4.0-rc2", + "subject_type": "Group", + "organization_relation": "ServiceProvider", + "canonical_role": "Operator", + "scope": { + "level": "Resource", + "id": "ssh-cert:actor/atm-backup-daily", + "tenant": "tenant:platform", + "resource": "ssh-cert:actor/atm-backup-daily" + }, + "planes": [ + "Identity", + "Secret", + "Audit" + ], + "capabilities": [ + "Use", + "Operate", + "Audit" + ], + "exposure_modes": [ + "Metadata" + ], + "conditions": [ + "TimeLimited", + "Logged" + ], + "restrictions": [ + "PrivilegeEscalationBlocked", + "SecretAccessBlocked" + ], + "access_path": "mediated" + } + } + ] +} diff --git a/workplans/FLEX-WP-0007-ops-warden-policy-gate-production-deployment.md b/workplans/FLEX-WP-0007-ops-warden-policy-gate-production-deployment.md new file mode 100644 index 0000000..8873974 --- /dev/null +++ b/workplans/FLEX-WP-0007-ops-warden-policy-gate-production-deployment.md @@ -0,0 +1,211 @@ +--- +id: FLEX-WP-0007 +type: workplan +title: "Ops-Warden Policy Gate Production Deployment" +domain: infotech +repo: flex-auth +status: blocked +owner: codex +topic_slug: flex-auth +planning_priority: P0 +planning_order: 70 +depends_on_workplans: + - FLEX-WP-0006 +related_workplans: + - WARDEN-WP-0009 +created: "2026-06-23" +updated: "2026-06-23" +state_hub_workstream_id: "358ce697-2611-4fe9-89ab-63e86ceb00fa" +--- + +# FLEX-WP-0007: Ops-Warden Policy Gate Production Deployment + +## Purpose + +Deploy flex-auth as a reachable production runtime for ops-warden's opt-in SSH +signing policy gate, load a production registry aligned with real inventory +actors, and complete joint smoke evidence 
so operators can set policy.enabled: +true in warden.yaml. + +Review update: repo-side production readiness is now separated from +operator-only work. flex-auth can publish the production fixture, tests, +runtime command, and sync contract in this repo. The actual stable URL +deployment and OpenBao smoke remain blocked because they need NetKingdom +reachability and a refreshed scoped VAULT_TOKEN. + +## Background + +ops-warden finished WARDEN-WP-0009 on the caller side: local and +production-registry smoke passed, and the production registry generator exists. +The remaining risk is operational, not policy shape: warden workstations need a +reachable flex-auth URL, and the vault-backed joint smoke needs a valid scoped +VAULT_TOKEN. + +Production registry artifacts: + +- flex-auth fixture: examples/ops-warden/production_registry_snapshot.json +- ops-warden source artifact: ~/ops-warden/registry/flex-auth/production_registry_snapshot.json +- ops-warden generator: ~/ops-warden/scripts/build_flex_auth_registry.py + +## Ownership Boundary + +| Concern | Owner | +| --- | --- | +| Policy package and PDP decision | flex-auth | +| Actor inventory and TTL/principal defaults | ops-warden | +| SSH CA and OpenBao signing | ops-warden | +| Production registry content for SSH actors | Joint: ops-warden generates, flex-auth hosts | +| policy.enabled flip | ops-warden operator after flex-auth is reachable | + +No SSH private keys, OpenBao tokens, or other secrets belong in fixtures, docs, +State Hub messages, or smoke evidence. + +## T1 - Deploy production flex-auth runtime + +```task +id: FLEX-WP-0007-T01 +status: done +priority: high +state_hub_task_id: "727573fc-86a3-4f5a-abd7-40b0ccb01e68" +``` + +Deploy flex-auth serve, or equivalent, to a stable URL reachable from +workstations that run warden sign. 
+ +- [x] Choose preferred target: in-cluster Service at http://flex-auth.flex-auth.svc.cluster.local:8080 when reachable; otherwise approved operator tunnel or ingress with the same base path +- [x] Document canonical policy.flex_auth_url selection in docs/ops-warden-registry-sync.md +- [x] Document healthz pre-flight: GET /healthz returns HTTP 200 +- [x] Add service test coverage for /healthz +- [x] Operator tunnel deployed as flex-auth-coulombcore and confirmed POST /v1/check is reachable from CoulombCore + +Acceptance: operator runs curl -fsS on the /healthz endpoint of policy.flex_auth_url from the warden +workstation and receives HTTP 200. Verified from CoulombCore on 2026-06-24 with +flex_auth_url http://127.0.0.1:18090. + +## T2 - Load production registry and verify real actors + +```task +id: FLEX-WP-0007-T02 +status: done +priority: high +state_hub_task_id: "6ec1e00c-4a3a-475b-aefb-af3961de7070" +``` + +Load the production registry snapshot derived from ops-warden inventory, not +only the template actors in examples/ops-warden/registry_snapshot.json. + +- [x] Add examples/ops-warden/production_registry_snapshot.json from the ops-warden generated artifact +- [x] Document regenerate and load procedure in docs/ops-warden-registry-sync.md +- [x] Verify allow for agt-state-hub-bridge / sign +- [x] Verify deny for ttl_out_of_bounds +- [x] Verify deny for unregistered actors with unknown_actor_resource +- [x] Add CI tests using production actor names: agt-state-hub-bridge, agt-codex-interhub-bootstrap, adm-example, atm-backup-daily + +Acceptance: local flex-auth coverage allows agt-state-hub-bridge without +ops-warden-local registry patching. Deployed runtime verification remains part +of T1. + +## T3 - Publish registry sync contract with ops-warden + +```task +id: FLEX-WP-0007-T03 +status: done +priority: medium +state_hub_task_id: "afa09ec3-516c-433d-87a7-330cb79845a8" +``` + +Document the two-repo workflow when inventory or policy boundaries change. 
+ +- [x] Publish docs/ops-warden-registry-sync.md +- [x] Cover ops-warden ownership of actor names, actor types, principals, and TTL defaults +- [x] Cover flex-auth ownership of hosted registry, relationships, and policy package evaluation +- [x] Document trigger: inventory add/change -> regenerate snapshot -> flex-auth reload +- [x] Cross-link from docs/ops-warden-policy-gate-handoff.md +- [x] Confirm ops-warden wiki/PolicyGatedSigning.md already points to the flex-auth handoff; flex-auth now points back from the sync runbook + +Acceptance: a new agt-* actor addition has an unambiguous procedure across both +repos. + +## T4 - Joint OpenBao + policy gate production smoke + +```task +id: FLEX-WP-0007-T04 +status: wait +priority: medium +state_hub_task_id: "32a96f1c-e0e8-4e27-baa6-7b8c445cf7a1" +``` + +Coordinate with ops-warden for vault-backed signing through the deployed +flex-auth runtime. + +- [x] flex-auth deployed with production registry via operator tunnel, completing T1 +- [ ] ops-warden policy.enabled: true and policy.flex_auth_url points to deployed URL http://127.0.0.1:18090 on CoulombCore +- [ ] Valid scoped VAULT_TOKEN with warden-sign policy, operator-provided +- [ ] Allow smoke: warden sign agt-state-hub-bridge records backend vault and policy_decision_id +- [ ] Deny smoke: TTL above registry max is denied by flex-auth before OpenBao +- [ ] Record non-secret evidence: decision ids, reasons, actor names only + +Blocked on: scoped VAULT_TOKEN refresh. Previous ops-warden session returned +HTTP 403 on 2026-06-23; no VAULT_TOKEN is present in this session. + +Smoke runner when token is valid: + + SMOKE_VAULT=1 ~/ops-warden/scripts/policy_gate_production_smoke.sh + +## T5 - IAM subject binding for production + +```task +id: FLEX-WP-0007-T05 +status: done +priority: low +state_hub_task_id: "65dc3c59-1e4b-4335-b6a0-db492ea9b2b5" +``` + +Clarify how WARDEN_POLICY_SUBJECT maps to flex-auth allowed_subjects in +production. 
+ +- [x] Document production default: actor name as subject.id unless WARDEN_POLICY_SUBJECT supplies the IAM subject +- [x] Confirm production registry allowed_subjects includes iam: entries +- [x] Add test coverage for iam:agt-state-hub-bridge allow path + +Acceptance: documented subject-id strategy; no ops-warden special-casing is +required beyond existing policy behavior. + +## Exit Criteria + +- flex-auth production runtime reachable from CoulombCore warden path: done via flex-auth-coulombcore operator tunnel +- Production registry loaded and real inventory actors covered locally: done +- Registry sync contract published and cross-linked: done +- Joint vault-backed smoke evidence recorded, or T4 explicitly waits on token: T4 waits on scoped VAULT_TOKEN +- ops-warden operator has the repo-side artifacts needed to set policy.enabled: true after the stable URL and token are ready + +## Implementation Notes + +2026-06-23 repo-side implementation: + +- Added examples/ops-warden/production_registry_snapshot.json from the ops-warden generated production registry artifact. +- Added Go coverage for production actor allows, IAM subject allow, ttl_out_of_bounds, unknown_actor_resource, production registry counts, and /healthz. +- Published docs/ops-warden-registry-sync.md and cross-linked it from the handoff and examples docs. + +Remaining blocked work: + +- Operator refreshes scoped VAULT_TOKEN and reruns the OpenBao-backed smoke. +- After workplan file changes, run make fix-consistency REPO=flex-auth from ~/state-hub to mirror these statuses into State Hub. 
+ +## See Also + +- docs/ops-warden-policy-gate-handoff.md +- docs/ops-warden-registry-sync.md +- workplans/FLEX-WP-0006-ops-warden-ssh-signing-policy-gate.md +- ~/ops-warden/wiki/PolicyGatedSigning.md +- ~/ops-warden/workplans/WARDEN-WP-0009-flex-auth-policy-gate-production.md +- ~/ops-warden/history/2026-06-23-flex-auth-production-pickup-suggestion.md + + +2026-06-24 operator tunnel update: + +- Built /tmp/flex-auth and started the production registry runtime on local 127.0.0.1:18090. +- Added local ops-bridge tunnel flex-auth-coulombcore, forwarding CoulombCore 127.0.0.1:18090 to the local runtime. +- Verified remote health from CoulombCore: GET /healthz returned HTTP 200. +- Verified remote POST /v1/check from CoulombCore allowed agt-state-hub-bridge with decision:873c6c682a52bebc. +- VAULT_TOKEN is absent, so OpenBao-backed smoke remains blocked on operator credential refresh.