Files
the-custodian/ops/service-inventory.yml
codex cf4be716e1 CUST-WP-0054 T01-T03: fleet architecture, de-hub runbook, drain plan
Documents the three-machine role model, fleet mesh topology, coulombcore
freeze policy, and ordered drain sequence. Adds railiance01 systemd tunnel
install assets and refreshes ops service inventory to reflect 2026-07-03
production placement (cluster State Hub, fleet mesh, draining coulombcore).
2026-07-04 00:29:55 +02:00

460 lines
14 KiB
YAML

---
# Ops service inventory header: schema version, last review date, and the
# content policy that applies to every entry below.
version: 1
# Quoted so the review date stays a string rather than a parsed timestamp.
last_reviewed: "2026-07-03"
policy:
  # The inventory is declared non-secret; secrets_rule lists what must never
  # be written into it.
  non_secret_inventory: true
  secrets_rule: "Do not store credentials, tokens, private addresses that are not already operationally documented, or command output containing secrets."
# Documents this inventory was compiled from. Each entry pairs a file path
# with a one-line summary of what that document contributes.
sources:
- path: "/home/worsch/helix-forge/wiki/OpsHubInventory.md"
  summary: "Initial ops-hub inventory draft with environments, hosts, services, endpoints, gaps, and first widget ids."
- path: "/home/worsch/the-custodian/workplans/CUST-WP-0025-fos-hub-bootstrap.md"
  summary: "Long-term ops-hub scaffold, models, health probes, access paths, and now-view work."
- path: "/home/worsch/the-custodian/workplans/CUST-WP-0046-hourly-recently-on-scope-activity-core.md"
  summary: "Evidence that activity-core runs on Railiance01 and can reach State Hub through the in-cluster bridge."
- path: "/home/worsch/the-custodian/infra/build-machines/README.md"
  summary: "Local workstation and build VM tunnel pattern."
# Environments that hosts, clusters, and services refer to by id.
# lifecycle_state values used in this list: observed, draining, planned.
environments:
- id: local
  name: "Local Workstation"
  role: "Workstation development and local operations"
  lifecycle_state: observed
# Frozen for new workloads; the role string names the drain work item.
- id: coulombcore
  name: "CoulombCore"
  role: "Legacy production host — frozen for new workloads; draining per CUST-WP-0054-T03"
  lifecycle_state: draining
- id: railiance01
  name: "Railiance01"
  role: "Production home — activity-core, fleet mesh, target for drain waves"
  lifecycle_state: observed
# Target topology only; recorded as planned, not observed.
- id: threephoenix-prod
  name: "ThreePhoenix Production"
  role: "Target governed production topology"
  lifecycle_state: planned
# Machines referenced by environments, clusters, and service runtimes. Each
# host names its environment and cites the document the entry is based on.
hosts:
- id: local-workstation
  environment: local
  address: "local/private"
  # NOTE(review): the state-hub service entry records its runtime as
  # coulombcore-k3s — confirm whether "State Hub" still belongs in this role.
  role: "State Hub and operator workstation runtime"
  evidence:
  - type: document
    source: "/home/worsch/the-custodian/infra/build-machines/README.md"
- id: coulombcore
  environment: coulombcore
  address: "92.205.130.254"
  # Mirrors the coulombcore environment role so host and environment agree
  # that this machine is frozen for new workloads and draining.
  role: "Legacy production server — frozen for new workloads; draining per CUST-WP-0054-T03"
  evidence:
  - type: document
    source: "/home/worsch/helix-forge/wiki/OpsHubInventory.md"
- id: railiance01
  environment: railiance01
  address: "92.205.62.239"
  role: "First ThreePhoenix foundation node"
  evidence:
  - type: document
    source: "/home/worsch/helix-forge/wiki/OpsHubInventory.md"
# Kubernetes clusters, each tied to an environment and, where one exists, to
# the host it runs on.
clusters:
- id: coulombcore-k3s
  environment: coulombcore
  host: coulombcore
  kind: k3s
  # Matches the coulombcore environment, which is recorded as draining, so
  # the cluster is not read as an available home for new workloads.
  lifecycle_state: draining
  notes: "Current operational Kubernetes runtime for Gitea and related services."
- id: railiance01-k3s
  environment: railiance01
  host: railiance01
  kind: k3s
  lifecycle_state: observed
  notes: "Runtime substrate for activity-core production service evidence."
# Planned cluster: no host is recorded for it.
- id: threephoenix-k3s
  environment: threephoenix-prod
  kind: k3s
  lifecycle_state: planned
  notes: "Target governed production cluster shape."
services:
# Gitea application in the default namespace of coulombcore-k3s.
# Recorded as draining; health has not been established (unknown).
- id: gitea
  name: "Gitea"
  kind: application
  lifecycle_state: draining
  health_status: unknown
  environment: coulombcore
  owner_repos:
  - railiance-apps
  desired_state_sources:
  - "/home/worsch/railiance-forge/docs/gitea-package-registry.md"
  - "/home/worsch/the-custodian/ops/runbooks/gitea-coulombcore.md"
  runtime:
    type: k3s
    cluster: coulombcore-k3s
    namespace: default
    # NOTE(review): nesting under runtime assumed — confirm.
    workload_refs:
    - "helm:gitea"
    - "nodePort:32166"
  endpoints:
  - id: gitea-oci-registry
    type: https
    url: "https://gitea.coulomb.social/v2/"
    # 401 is the expected healthy answer here: the signal is the registry's
    # auth challenge, not a successful response.
    expected_status: 401
    expected_signal: "OCI registry auth challenge"
    widget_ref: "ops:endpoint:gitea-registry"
  # Both stores have their own service entries in this inventory.
  backing_stores:
  - "database:gitea-db"
  - "pvc:default/gitea-shared-storage"
  access_paths:
  - type: k8s
    target: "coulombcore-k3s/default"
    status: unknown
  evidence:
  - type: document
    observed_at: "2026-05-16"
    source: "/home/worsch/helix-forge/wiki/OpsHubInventory.md"
    summary: "Inventory draft records Helm release gitea, namespace default, app version 1.25.4, NodePort 32166, and registry auth challenge."
  gaps:
  - "Package token and push/pull verification need current evidence."
  - "Backup and restore evidence for database and shared storage not recorded in ops inventory."
# Database backing Gitea, recorded in the databases namespace of
# coulombcore-k3s. No endpoints are probed; health is unknown.
# NOTE(review): still "observed" although the gitea service that lists
# database:gitea-db as a backing store is "draining" — confirm intent.
- id: gitea-database
  name: "Gitea Database"
  kind: datastore
  lifecycle_state: observed
  health_status: unknown
  environment: coulombcore
  owner_repos:
  - railiance-platform
  runtime:
    type: k3s
    cluster: coulombcore-k3s
    namespace: databases
    # NOTE(review): nesting under runtime assumed — confirm.
    workload_refs:
    - "database:gitea-db"
  # Explicit empty lists rather than bare keys, which would parse as null.
  endpoints: []
  backing_stores: []
  access_paths:
  - type: k8s
    target: "coulombcore-k3s/databases"
    status: unknown
  evidence:
  - type: document
    observed_at: "2026-05-16"
    source: "/home/worsch/helix-forge/wiki/OpsHubInventory.md"
  gaps:
  - "Backup and restore evidence not recorded in ops inventory."
# Persistent volume claim used by Gitea in the default namespace of
# coulombcore-k3s. No endpoints are probed; health is unknown.
# NOTE(review): still "observed" although the gitea service that lists this
# PVC as a backing store is "draining" — confirm intent.
- id: gitea-shared-storage
  name: "Gitea Shared Storage"
  kind: storage
  lifecycle_state: observed
  health_status: unknown
  environment: coulombcore
  owner_repos:
  - railiance-platform
  - railiance-apps
  runtime:
    type: k3s
    cluster: coulombcore-k3s
    namespace: default
    # NOTE(review): nesting under runtime assumed — confirm.
    workload_refs:
    - "pvc:default/gitea-shared-storage"
  # Explicit empty lists rather than bare keys, which would parse as null.
  endpoints: []
  backing_stores: []
  access_paths:
  - type: k8s
    target: "coulombcore-k3s/default/pvc/gitea-shared-storage"
    status: unknown
  evidence:
  - type: document
    observed_at: "2026-05-16"
    source: "/home/worsch/helix-forge/wiki/OpsHubInventory.md"
  gaps:
  - "Package blob backup and restore evidence not confirmed."
# State Hub coordination service, recorded as running in the state-hub
# namespace of coulombcore-k3s and reached through tunnels. Draining: the
# gaps list names railiance01 as the required new primary home.
- id: state-hub
  name: "State Hub"
  kind: coordination-service
  lifecycle_state: draining
  health_status: observed_ok
  environment: coulombcore
  owner_repos:
  - state-hub
  - the-custodian
  desired_state_sources:
  - "/home/worsch/state-hub"
  - "/home/worsch/the-custodian/state-hub/README.md"
  runtime:
    type: k3s
    cluster: coulombcore-k3s
    namespace: state-hub
    # NOTE(review): nesting under runtime assumed — confirm.
    workload_refs:
    - "cnpg:state-hub-db"
    # NOTE(review): looks like a cluster-internal service address — confirm
    # it counts as "already operationally documented" under secrets_rule.
    - "svc:10.43.170.94:8000"
  # Both probes use loopback URLs, so each is only meaningful on the machine
  # that holds the matching tunnel or forward.
  endpoints:
  - id: state-hub-cluster-api
    type: http
    url: "http://127.0.0.1:8000/state/health"
    expected_status: 200
    expected_signal: "health response"
  # Same URL as the fleet-state-hub-local endpoint of the fleet mesh entry.
  - id: state-hub-railiance01-fleet
    type: tunnel
    url: "http://127.0.0.1:18000/state/health"
    expected_status: 200
    expected_signal: "reachable from railiance01 fleet mesh"
  backing_stores:
  - "postgresql:state-hub-db"
  access_paths:
  - type: http
    target: "workstation tunnel state-hub-primary → cluster"
    status: observed_ok
  - type: tunnel
    target: "railiance01 systemd fleet-state-hub-coulombcore → cluster"
    status: observed_ok
  evidence:
  - type: session-probe
    observed_at: "2026-07-03"
    source: "CUST-WP-0054-T02 fleet mesh + cluster primary"
    summary: "Cluster hub healthy; railiance01 reaches via fleet forward tunnel."
  gaps:
  - "Primary home must move to railiance01 per CUST-WP-0054-T05."
  - "Consistency sweep writebacks still target workstation paths."
# issue-core application in the issue-core namespace of coulombcore-k3s.
# Draining: the gap below names a railiance01 overlay as the target.
- id: issue-core
  name: "issue-core"
  kind: application
  lifecycle_state: draining
  health_status: observed_ok
  environment: coulombcore
  owner_repos:
  - issue-core
  runtime:
    type: k3s
    cluster: coulombcore-k3s
    namespace: issue-core
    # NOTE(review): nesting under runtime assumed — confirm.
    workload_refs:
    - "svc:10.43.103.154:8765"
  endpoints:
  # Loopback URL: only meaningful where a tunnel or forward exposes it.
  - id: issue-core-api
    type: http
    url: "http://127.0.0.1:8765/healthz"
    expected_status: 200
    expected_signal: "version response"
  backing_stores:
  - "postgresql:issue-core"
  access_paths:
  - type: tunnel
    target: "railiance01 fleet-issue-core-coulombcore → cluster"
    status: observed_ok
  evidence:
  - type: workplan-note
    observed_at: "2026-07-02"
    source: "ISSUE-WP-0003 completion — Gitea issue 176 emission"
    summary: "REST emission live via cross-machine fleet path."
  gaps:
  - "Target railiance01 overlay per CUST-WP-0054 drain Wave 4."
# Core Hub governance service. The recorded runtime is the staging namespace
# on coulombcore-k3s; the evidence summary says production cutover is gated.
- id: core-hub
  name: "Core Hub"
  kind: governance-service
  lifecycle_state: draining
  health_status: observed_ok
  environment: coulombcore
  owner_repos:
  - core-hub
  runtime:
    type: k3s
    cluster: coulombcore-k3s
    namespace: core-hub-staging
  endpoints:
  - id: core-hub-public
    type: https
    url: "https://hub.coulomb.social/api/v2/hubs"
    # Per expected_signal, the 200 applies to an authenticated request.
    expected_status: 200
    expected_signal: "hub list when authenticated"
  backing_stores:
  - "postgresql:core-hub"
  access_paths:
  - type: k8s
    target: "coulombcore-k3s/core-hub-staging"
    status: observed_ok
  evidence:
  - type: workplan-note
    observed_at: "2026-07-02"
    source: "CUST-WP-0051 metaplan closeout"
    summary: "Staging deployed; production cutover gated on CORE-WP-0005-T04."
  gaps:
  - "Production cutover to railiance01 pending operator approval."
# systemd-managed forwards on railiance01 that expose State Hub and
# issue-core (both recorded on coulombcore) on local ports. The gaps list
# says to retire this once both services move to railiance01.
- id: fleet-mesh-railiance01
  name: "Fleet Mesh (railiance01)"
  kind: connectivity-service
  lifecycle_state: observed
  health_status: observed_ok
  environment: railiance01
  owner_repos:
  - the-custodian
  - ops-bridge
  desired_state_sources:
  - "/home/worsch/the-custodian/infra/fleet-mesh/"
  runtime:
    type: systemd
    host: railiance01
    # NOTE(review): nesting under runtime assumed — confirm.
    workload_refs:
    - "fleet-state-hub-coulombcore.service"
    - "fleet-issue-core-coulombcore.service"
  # expected_signal is set on both probes so they carry the same fields as
  # the other endpoints in this file that declare an expected_status; the
  # signal text is taken from the upstream endpoint each forward reaches.
  endpoints:
  - id: fleet-state-hub-local
    type: http
    url: "http://127.0.0.1:18000/state/health"
    expected_status: 200
    expected_signal: "health response"
  - id: fleet-issue-core-local
    type: http
    url: "http://127.0.0.1:18765/healthz"
    expected_status: 200
    expected_signal: "version response"
  backing_stores: []
  access_paths:
  - type: ssh-tunnel
    target: "railiance01 → coulombcore ClusterIPs"
    status: observed_ok
  evidence:
  - type: session-probe
    observed_at: "2026-07-03"
    source: "CUST-WP-0054-T02 cutover"
    summary: "Workstation reverse tunnels stopped; systemd forwards healthy."
  gaps:
  - "Migrate to atm-fleet-mesh cert_command when VAULT_TOKEN available."
  - "Retire when State Hub and issue-core move to railiance01."
# Inter-Hub governance service, recorded as an external runtime with a
# public endpoint. Health is unknown.
# NOTE(review): the service is "observed" while its environment,
# threephoenix-prod, is "planned"; the core-hub entry also probes
# hub.coulomb.social — confirm both are intended.
- id: inter-hub
  name: "Inter-Hub"
  kind: governance-service
  lifecycle_state: observed
  health_status: unknown
  environment: threephoenix-prod
  owner_repos:
  - inter-hub
  runtime:
    type: external
    # NOTE(review): nesting under runtime assumed — confirm.
    public_endpoint: "https://hub.coulomb.social"
  endpoints:
  - id: inter-hub-openapi
    type: https
    url: "https://hub.coulomb.social/api/v2/openapi.json"
    expected_status: 200
    expected_signal: "OpenAPI document"
  - id: inter-hub-ui
    type: https
    url: "https://hub.coulomb.social/Hubs"
    # 302 is the expected answer for an unauthenticated request.
    expected_status: 302
    expected_signal: "login redirect when unauthenticated"
  backing_stores: []
  access_paths:
  - type: https
    target: "https://hub.coulomb.social"
    status: unknown
  evidence:
  - type: document
    observed_at: "2026-05-16"
    source: "/home/worsch/helix-forge/wiki/OpsHubInventory.md"
  gaps:
  - "ops-hub bootstrap requires authenticated UI flow or deployment-side migration."
# activity-core automation service in the activity-core namespace of
# railiance01-k3s, with API and worker deployments plus Temporal schedules.
- id: activity-core
  name: "activity-core"
  kind: automation-service
  lifecycle_state: observed
  health_status: observed_ok
  environment: railiance01
  owner_repos:
  - activity-core
  - the-custodian
  desired_state_sources:
  - "/home/worsch/activity-core/k8s/railiance"
  - "/home/worsch/the-custodian/activity-definitions"
  runtime:
    type: k3s
    cluster: railiance01-k3s
    namespace: activity-core
    # NOTE(review): nesting under runtime assumed — confirm.
    workload_refs:
    - "deployment:activity-core-api"
    - "deployment:activity-core-worker"
    - "temporal:schedules"
  endpoints:
  - id: activity-core-api
    type: cluster-http
    # The url is a description, not an address; the gap at the bottom of
    # this entry tracks adding explicit probes.
    url: "activity-core API health endpoint"
    expected_status: 200
    expected_signal: "healthy DB and Temporal status"
  backing_stores:
  - "postgresql:activity-core"
  - "temporal:activity-core"
  - "nats:railiance01"
  access_paths:
  - type: k8s
    target: "railiance01-k3s/activity-core"
    status: observed_ok
  evidence:
  - type: workplan-note
    observed_at: "2026-05-23"
    source: "/home/worsch/the-custodian/workplans/CUST-WP-0046-hourly-recently-on-scope-activity-core.md"
    summary: "API health, worker rollout, Temporal CLI schedule listing, and State Hub bridge were verified."
  gaps:
  - "Add explicit ops inventory probes and evidence events."
# Ops Bridge on the local workstation. Per the evidence below, the
# production reverse tunnels were retired and only interactive dev tunnels
# remain, so no endpoints are probed.
- id: ops-bridge
  name: "Ops Bridge"
  kind: connectivity-service
  lifecycle_state: observed
  health_status: observed_ok
  environment: local
  owner_repos:
  - ops-bridge
  runtime:
    type: bridge
    host: local-workstation
  # Explicit empty lists rather than bare keys, which would parse as null.
  endpoints: []
  backing_stores: []
  access_paths:
  - type: ssh-tunnel
    target: "interactive dev tunnels only (k3s-api, state-hub-primary)"
    status: observed_ok
  evidence:
  - type: session-probe
    observed_at: "2026-07-03"
    source: "CUST-WP-0054-T02 — production reverse tunnels retired"
    summary: "state-hub-railiance01 and issue-core-railiance01 stopped; not production-critical."
  gaps:
  - "Install ops-bridge on railiance01 or keep systemd fleet-mesh units."
# Haskell build agent: a systemd service on a build VM that, per the
# evidence summary, registers with State Hub on boot. Health is unknown.
# NOTE(review): runtime.host haskell-build-vm has no entry under hosts —
# confirm whether one should be added.
- id: haskell-build-agent
  name: "Haskell Build Agent"
  kind: build-service
  lifecycle_state: observed
  health_status: unknown
  environment: local
  owner_repos:
  - the-custodian
  desired_state_sources:
  - "/home/worsch/the-custodian/infra/build-machines/haskell"
  runtime:
    type: systemd
    host: haskell-build-vm
    # NOTE(review): nesting under runtime assumed — confirm.
    tunnel:
      # Quoted: digit-and-colon values are otherwise open to implicit
      # numeric typing by YAML 1.1 parsers.
      reverse_ssh: "12222:localhost:22"
      forward_state_hub: "18000:localhost:8000"
  endpoints:
  # Probes the State Hub health path through the forward and declares the
  # expected status, matching the other State Hub endpoints in this file
  # (/state/health answering 200).
  - id: haskell-build-agent-state-hub-forward
    type: tunnel
    url: "http://127.0.0.1:18000/state/health"
    expected_status: 200
    expected_signal: "VM can reach State Hub through SSH forward"
  backing_stores: []
  access_paths:
  - type: ssh
    target: "local workstation reverse tunnel port 12222"
    status: unknown
  evidence:
  - type: document
    source: "/home/worsch/the-custodian/infra/build-machines/README.md"
    summary: "Build agent is a systemd service and registers with State Hub on boot."
  gaps:
  - "Current tunnel and capability registration need live evidence in ops-hub."