From 01d280120d8951f3f49b1e791aee218a3ada3047 Mon Sep 17 00:00:00 2001 From: tegwick Date: Wed, 11 Mar 2026 02:17:55 +0100 Subject: [PATCH] =?UTF-8?q?feat(platform):=20T01=20=E2=80=94=20standalone?= =?UTF-8?q?=20PostgreSQL=20HA=20chart=20scaffold?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lays out the S3 platform layer foundation for RAIL-PL-WP-0001 T01: - .sops.yaml: age encryption policy (shared key, *.sops.yaml pattern) - .gitignore: prevents accidental commit of decrypted values files - Makefile: pg-deploy, pg-status, pg-pgpool-check, valkey-deploy, valkey-status, backup targets with KUBECONFIG/HELM wiring - helm/postgresql-ha-values.yaml.template: annotated values schema with CHANGEME_ placeholders; includes pgpool-password fix from RAIL-BS-WP-0003; notes on single-node vs ThreePhoenix scaling - docs/postgresql-ha.md: connection strings, DB creation, password rotation, pgpool-password critical note, HA failover test ref, ThreePhoenix scaling path To complete T01: fill in CHANGEME_ values, encrypt with sops -e -i, then run make pg-deploy. 
Co-Authored-By: Claude Sonnet 4.6
---
 .gitignore                              |   7 ++
 .sops.yaml                              |  12 ++
 Makefile                                |  70 ++++++++++-
 docs/postgresql-ha.md                   | 151 ++++++++++++++++++++++++
 helm/postgresql-ha-values.yaml.template |  64 ++++++++++
 5 files changed, 303 insertions(+), 1 deletion(-)
 create mode 100644 .gitignore
 create mode 100644 .sops.yaml
 create mode 100644 docs/postgresql-ha.md
 create mode 100644 helm/postgresql-ha-values.yaml.template

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3eab557
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+# Decrypted helm values — never commit plaintext secrets
+helm/*.yaml
+!helm/*.sops.yaml
+!helm/*.yaml.template
+
+# Kubeconfig
+*.kubeconfig
diff --git a/.sops.yaml b/.sops.yaml
new file mode 100644
index 0000000..7d38f71
--- /dev/null
+++ b/.sops.yaml
@@ -0,0 +1,12 @@
+# SOPS encryption policy for railiance-platform
+# Encrypts any file matching *.sops.yaml using the shared age key.
+# Decrypt: sops -d helm/postgresql-ha-values.sops.yaml
+# Use with helm: helm upgrade postgresql-ha bitnami/postgresql-ha \
+#   -n platform -f <(sops -d helm/postgresql-ha-values.sops.yaml)
+# Encrypt: sops -e -i helm/postgresql-ha-values.sops.yaml
+
+creation_rules:
+  - path_regex: \.sops\.yaml$
+    key_groups:
+      - age:
+          - age1aq8twfd78wvpra0had8cezcnj96tj4q0068edrz5jez8d6xwmflqdepsh4
diff --git a/Makefile b/Makefile
index 6e6089c..c260e7a 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,73 @@
 SHELL := /usr/bin/env bash
 .DEFAULT_GOAL := help
 
+# Kubeconfig handed explicitly to every kubectl/helm call below; override it
+# from the environment or with `make KUBECONFIG=/path/to/config`.
+KUBECONFIG ?= $(HOME)/.kube/config-hosteurope
+KUBECTL := kubectl --kubeconfig=$(KUBECONFIG)
+HELM := helm --kubeconfig=$(KUBECONFIG)
+NAMESPACE := platform
+
+# Chart versions passed to `helm upgrade --version`; override on the command line.
+PG_CHART_VERSION ?= 16.2.2
+VALKEY_CHART_VERSION ?= 2.x
+
+##@ PostgreSQL HA
+
+# Each *-deploy target first decrypts its values file on its own: the exit
+# status of `<(sops -d ...)` is not propagated, so without this check a failed
+# decrypt would still let helm run, just without the intended values.
+pg-deploy: ## Deploy / upgrade standalone PostgreSQL HA to platform namespace
+	@sops -d helm/postgresql-ha-values.sops.yaml >/dev/null
+	$(KUBECTL) create namespace $(NAMESPACE) --dry-run=client -o yaml | $(KUBECTL) apply -f -
+	$(HELM) repo add bitnami https://charts.bitnami.com/bitnami --force-update
+	$(HELM) upgrade --install postgresql-ha bitnami/postgresql-ha \
+		--version $(PG_CHART_VERSION) \
+		--namespace $(NAMESPACE) \
+		-f <(sops -d helm/postgresql-ha-values.sops.yaml) \
+		--wait --timeout 5m
+
+pg-status: ## Check PostgreSQL HA pod status
+	$(KUBECTL) get pods -n $(NAMESPACE) -l app.kubernetes.io/name=postgresql-ha
+
+# Fails when the key is empty or the secret cannot be read: kubectl's stderr is
+# discarded, so an unreachable cluster is reported as a missing key as well.
+pg-pgpool-check: ## Verify pgpool-password secret key is present (see RAIL-BS-WP-0003)
+	@SECRET=$$($(KUBECTL) get secret -n $(NAMESPACE) postgresql-ha-postgresql \
+		-o jsonpath='{.data.pgpool-password}' 2>/dev/null); \
+	if [ -z "$$SECRET" ]; then \
+		echo "ERROR: pgpool-password key missing from secret — pgpool will CrashLoop on restart"; \
+		exit 1; \
+	else \
+		echo "OK: pgpool-password key present"; \
+	fi
+
+##@ Valkey (cache)
+
+valkey-deploy: ## Deploy / upgrade Valkey (Redis-compatible) to platform namespace
+	@sops -d helm/valkey-values.sops.yaml >/dev/null
+	$(KUBECTL) create namespace $(NAMESPACE) --dry-run=client -o yaml | $(KUBECTL) apply -f -
+	$(HELM) repo add bitnami https://charts.bitnami.com/bitnami --force-update
+	$(HELM) upgrade --install valkey bitnami/valkey \
+		--version $(VALKEY_CHART_VERSION) \
+		--namespace $(NAMESPACE) \
+		-f <(sops -d helm/valkey-values.sops.yaml) \
+		--wait --timeout 3m
+
+valkey-status: ## Check Valkey pod status
+	$(KUBECTL) get pods -n $(NAMESPACE) -l app.kubernetes.io/name=valkey
+
+##@ Backup
+
+backup: ## Backup platform services (PostgreSQL logical dump) — age-encrypted to Nextcloud
+	sudo tools/cmd/railiance-backup
+
+##@ Help
+
+# Lists every `target: ## text` line; `##@ text` lines become section headings.
 help: ## Show this help
-	@grep -E '^[a-zA-Z0-9_-]+:.*?## ' $(MAKEFILE_LIST) | sort | sed 's/:.*##/: /'
+	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} \
+		/^[a-zA-Z0-9_-]+:.*##/ { printf " \033[36m%-22s\033[0m %s\n", $$1, $$2 } \
+		/^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) }' $(MAKEFILE_LIST)
+
+.PHONY: pg-deploy pg-status pg-pgpool-check valkey-deploy valkey-status backup help
diff --git a/docs/postgresql-ha.md b/docs/postgresql-ha.md
new file mode 100644
index 0000000..0dfe978
--- /dev/null
+++ b/docs/postgresql-ha.md
@@ -0,0 +1,151 @@
+# PostgreSQL HA — Platform Service
+
+**Chart:** `bitnami/postgresql-ha`  
+**Namespace:** `platform` +**Managed by:** `railiance-platform` (S3) +**Workplan:** `RAIL-PL-WP-0001` + +--- + +## Architecture + +``` +Apps (S5) + └── pgpool (load balancer / connection pooler) + ├── postgresql-0 [Primary — repmgr] + ├── postgresql-1 [Standby — repmgr] + └── postgresql-2 [Standby — repmgr] +``` + +- **pgpool-II** distributes reads across standbys, routes writes to primary +- **repmgr** handles automatic failover if the primary disappears +- All pods in `platform` namespace; app pods connect via pgpool service + +## Connection string pattern + +``` +postgresql://DBUSER:DBPASS@postgresql-ha-pgpool.platform.svc.cluster.local:5432/DBNAME +``` + +Replace `DBUSER`, `DBPASS`, `DBNAME` with the database-specific credentials. + +--- + +## Initial deployment + +### Prerequisites + +- `railiance-cluster` converged (`make smoke` passes there) +- SOPS age key accessible: `sops -d helm/postgresql-ha-values.sops.yaml` returns plaintext +- `helm repo add bitnami https://charts.bitnami.com/bitnami && helm repo update` done on the node + +### Steps + +```bash +# 1. Ensure the platform namespace exists +kubectl create namespace platform --dry-run=client -o yaml | kubectl apply -f - + +# 2. Deploy (from railiance-platform/) +make pg-deploy + +# 3. Verify +make pg-status +# Expected: 3 postgresql pods + 1 pgpool pod, all Running + +# 4. Smoke test (from railiance-cluster/; the Makefile in this repo has no `smoke` target) +make smoke +``` + +--- + +## Creating a new database for an app + +```bash +# Connect via pgpool +kubectl exec -it -n platform \ + $(kubectl get pod -n platform -l app.kubernetes.io/component=pgpool -o name | head -1) \ + -- psql -U postgres + +# Inside psql: +CREATE DATABASE myapp; +CREATE USER myapp WITH PASSWORD 'strong-password'; +GRANT ALL PRIVILEGES ON DATABASE myapp TO myapp; +\c myapp +GRANT ALL ON SCHEMA public TO myapp; +\q +``` + +Add the user password to the app's own secrets (managed in the app's repo, +not here). 
The connection string for the app will be: +``` +postgresql://myapp:strong-password@postgresql-ha-pgpool.platform.svc.cluster.local:5432/myapp +``` + +--- + +## Password rotation + +1. Open the encrypted values file for editing: `sops helm/postgresql-ha-values.sops.yaml` (never put real passwords in the `.template` file) +2. Change the password, then save and quit — sops re-encrypts the file on exit, so no separate `sops -e -i` run is needed +3. Upgrade: `make pg-deploy` +4. Update the app's connection secret to match +5. Rolling-restart the app pods so they pick up the new credentials + +--- + +## pgpool-password — critical note + +The `postgresql.pgpoolPassword` value in the Helm chart maps to the +`pgpool-password` key in the `postgresql-ha-postgresql` Kubernetes Secret. +The pgpool container mounts this key at startup; if it is absent, pgpool +enters CrashLoopBackOff with **no log output**. + +**This was the root cause of the 2026-03-10 incident (RAIL-BS-WP-0003).** + +Always verify after `helm upgrade` (or run `make pg-pgpool-check`, which fails if the key is empty): +```bash +kubectl get secret -n platform postgresql-ha-postgresql \ + -o jsonpath='{.data.pgpool-password}' | base64 -d && echo +# Must print a non-empty string +``` + +--- + +## HA failover test + +Per Decision D3, any change to this service requires a passing failover test: + +```bash +# From railiance-cluster/ +make test-ha-failover GITEA_URL=https://<gitea-host> +``` + +The test kills the primary PostgreSQL pod and asserts: +1. repmgr promotes a standby within 60s +2. All pods return to Running within 120s +3. pgpool returns to Running (catches the missing-key bug) + +--- + +## Backup + +Platform backup (PostgreSQL logical dump) is handled by the `railiance-backup` +tool in this repo: + +```bash +make backup +``` + +This produces an age-encrypted dump uploaded to Nextcloud. For cluster-level +backup (etcd, kubeconfig), see `railiance-cluster/`. + +--- + +## Scaling to 3 nodes (ThreePhoenix) + +When Railiance02 and Railiance03 join the cluster: + +1. Switch StorageClass from `local-path` to `longhorn` in the values file +2. Change `postgresql.podAntiAffinityPreset` from `soft` to `hard` +3. 
Run `make pg-deploy` — Helm rolling update spreads pods across nodes +4. Run `make test-ha-failover` (from `railiance-cluster/`) to confirm HA is genuine (not just replicated on one node) diff --git a/helm/postgresql-ha-values.yaml.template b/helm/postgresql-ha-values.yaml.template new file mode 100644 index 0000000..ff85425 --- /dev/null +++ b/helm/postgresql-ha-values.yaml.template @@ -0,0 +1,64 @@ +# postgresql-ha-values.yaml.template +# +# Standalone PostgreSQL HA for railiance-platform (S3) +# Chart: bitnami/postgresql-ha version: ~16.x (pin to 16.2.2 or latest stable) +# +# Usage: +# 1. Copy this file: +# cp helm/postgresql-ha-values.yaml.template helm/postgresql-ha-values.sops.yaml +# 2. Fill in all CHANGEME_ values (passwords, storage class, replica count); the copy is plaintext and NOT gitignored until step 3 — do not stage it before then +# 3. Encrypt with SOPS (age key must be loaded): +# sops -e -i helm/postgresql-ha-values.sops.yaml +# 4. Deploy: +# make pg-deploy +# +# Never commit the plaintext .template file with real passwords. +# The .sops.yaml file (encrypted) is what gets committed. +# +# NOTE: postgresql.pgpoolPassword MUST be set — it populates the pgpool-password key of the release Secret. +# Its absence was the root cause of the 2026-03-10 incident (RAIL-BS-WP-0003). +# Do not omit it. + +global: + postgresql: + username: postgres + password: CHANGEME_postgres_password + database: postgres + repmgrUsername: repmgr + repmgrPassword: CHANGEME_repmgr_password + +postgresql: + replicaCount: 3 # all 3 pods on 1 node for now; set anti-affinity when 3 nodes exist + password: CHANGEME_postgres_password # must match global.postgresql.password + postgresPassword: CHANGEME_postgres_superuser_password + repmgrPassword: CHANGEME_repmgr_password # must match global.postgresql.repmgrPassword + # pgpoolPassword is the sr_check_password used by pgpool to probe replicas. + # It MUST be set here to survive helm upgrade (see incident RAIL-BS-WP-0003). 
+ pgpoolPassword: CHANGEME_pgpool_sr_check_password + + persistence: + enabled: true + storageClass: "" # use default StorageClass (local-path on single node; longhorn on 3 nodes) + size: 10Gi + + podAntiAffinityPreset: "soft" # soft = prefer spread; switch to "hard" when 3 nodes exist + +pgpool: + replicaCount: 1 + adminPassword: CHANGEME_pgpool_admin_password + # numInitChildren controls max connections; default 32 is fine for single node + numInitChildren: 32 + maxPool: 4 + # Connection load balancing + loadBalancingOnWrite: "transaction" + + readinessProbe: + enabled: true + livenessProbe: + enabled: true + +# Metrics (optional — enable when Prometheus is deployed) +metrics: + enabled: false + serviceMonitor: + enabled: false