feat(platform): T01 — standalone PostgreSQL HA chart scaffold
Lays out the S3 platform layer foundation for RAIL-PL-WP-0001 T01: - .sops.yaml: age encryption policy (shared key, *.sops.yaml pattern) - .gitignore: prevents accidental commit of decrypted values files - Makefile: pg-deploy, pg-status, pg-pgpool-check, valkey-deploy, valkey-status, backup targets with KUBECONFIG/HELM wiring - helm/postgresql-ha-values.yaml.template: annotated values schema with CHANGEME_ placeholders; includes pgpool-password fix from RAIL-BS-WP-0003; notes on single-node vs ThreePhoenix scaling - docs/postgresql-ha.md: connection strings, DB creation, password rotation, pgpool-password critical note, HA failover test ref, ThreePhoenix scaling path To complete T01: fill in CHANGEME_ values, encrypt with sops -e -i, then run make pg-deploy. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
7
.gitignore
vendored
Normal file
7
.gitignore
vendored
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
# Decrypted helm values — never commit plaintext secrets
|
||||||
|
helm/*.yaml
|
||||||
|
!helm/*.sops.yaml
|
||||||
|
!helm/*.yaml.template
|
||||||
|
|
||||||
|
# Kubeconfig
|
||||||
|
*.kubeconfig
|
||||||
12
.sops.yaml
Normal file
12
.sops.yaml
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
# SOPS encryption policy for railiance-platform
|
||||||
|
# Encrypts any file matching *.sops.yaml using the shared age key.
|
||||||
|
# Decrypt: sops -d helm/postgresql-ha-values.sops.yaml
|
||||||
|
# Use with helm: helm upgrade postgresql-ha bitnami/postgresql-ha \
|
||||||
|
# -n platform -f <(sops -d helm/postgresql-ha-values.sops.yaml)
|
||||||
|
# Encrypt: sops -e -i helm/postgresql-ha-values.sops.yaml
|
||||||
|
|
||||||
|
creation_rules:
|
||||||
|
- path_regex: \.sops\.yaml$
|
||||||
|
key_groups:
|
||||||
|
- age:
|
||||||
|
- age1aq8twfd78wvpra0had8cezcnj96tj4q0068edrz5jez8d6xwmflqdepsh4
|
||||||
57
Makefile
57
Makefile
@@ -1,5 +1,60 @@
|
|||||||
SHELL := /usr/bin/env bash
|
SHELL := /usr/bin/env bash
|
||||||
.DEFAULT_GOAL := help
|
.DEFAULT_GOAL := help
|
||||||
|
|
||||||
|
# Cluster access. `?=` keeps any KUBECONFIG already set in the environment
# or on the make command line; otherwise this per-user default is used.
KUBECONFIG ?= $(HOME)/.kube/config-hosteurope
|
||||||
|
# kubectl and helm wrappers that always pass the kubeconfig explicitly.
KUBECTL := kubectl --kubeconfig=$(KUBECONFIG)
|
||||||
|
HELM := helm --kubeconfig=$(KUBECONFIG)
|
||||||
|
# Kubernetes namespace that every target in this file operates on.
NAMESPACE := platform
|
||||||
|
|
||||||
|
# Chart version handed to `helm --version` by pg-deploy; override on the
# command line, e.g. `make pg-deploy PG_CHART_VERSION=...`.
PG_CHART_VERSION ?= 16.2.2
|
||||||
|
# Version constraint intended for the Valkey chart (overridable the same way).
VALKEY_CHART_VERSION ?= 2.x
|
||||||
|
|
||||||
|
##@ PostgreSQL HA
|
||||||
|
|
||||||
|
# Deploy or upgrade the Helm release "postgresql-ha" (chart bitnami/postgresql-ha).
#
#   1. Create the namespace idempotently (client-side dry-run piped to apply).
#   2. Add/refresh the bitnami chart repository.
#   3. Decrypt the SOPS values *before* helm runs. With a bare
#      `-f <(sops -d ...)` a decryption failure (e.g. missing age key) goes
#      unnoticed: helm reads an empty values file and upgrades the release
#      with chart defaults. Here `set -e` aborts on a sops error, and an
#      empty result is rejected explicitly.
#   4. Feed the decrypted values to helm through a process substitution so
#      the plaintext never touches disk (printf is a shell builtin, so the
#      values do not appear in any process argument list either).
#
# Requires bash (process substitution, pipefail); SHELL is set to bash above.
pg-deploy: ## Deploy / upgrade standalone PostgreSQL HA to platform namespace
	$(KUBECTL) create namespace $(NAMESPACE) --dry-run=client -o yaml | $(KUBECTL) apply -f -
	$(HELM) repo add bitnami https://charts.bitnami.com/bitnami --force-update
	set -euo pipefail; \
	values="$$(sops -d helm/postgresql-ha-values.sops.yaml)"; \
	[ -n "$$values" ] || { echo "ERROR: decrypted values are empty; refusing to deploy" >&2; exit 1; }; \
	$(HELM) upgrade --install postgresql-ha bitnami/postgresql-ha \
		--version $(PG_CHART_VERSION) \
		--namespace $(NAMESPACE) \
		-f <(printf '%s\n' "$$values") \
		--wait --timeout 5m
|
||||||
|
|
||||||
|
# List the pods in $(NAMESPACE) that carry the postgresql-ha name label.
pg-status: ## Check PostgreSQL HA pod status
	$(KUBECTL) get pods \
		--namespace=$(NAMESPACE) \
		--selector=app.kubernetes.io/name=postgresql-ha
|
||||||
|
|
||||||
|
# Read the `pgpool-password` key from the postgresql-ha-postgresql Secret and
# fail (exit 1) when it comes back empty. kubectl's stderr is discarded, so an
# unreachable cluster or a missing Secret is reported the same way as a
# missing key.
pg-pgpool-check: ## Verify pgpool-password secret key is present (see RAIL-BS-WP-0003)
	@pgpool_key=$$($(KUBECTL) get secret -n $(NAMESPACE) postgresql-ha-postgresql \
		-o jsonpath='{.data.pgpool-password}' 2>/dev/null); \
	if [ -n "$$pgpool_key" ]; then \
		echo "OK: pgpool-password key present"; \
	else \
		echo "ERROR: pgpool-password key missing from secret — pgpool will CrashLoop on restart"; \
		exit 1; \
	fi
|
||||||
|
|
||||||
|
##@ Valkey (cache)
|
||||||
|
|
||||||
|
# Deploy or upgrade the Helm release "valkey" (chart bitnami/valkey).
#
# Mirrors pg-deploy:
#   - the bitnami repository is added/refreshed first, so this target also
#     works on a machine where pg-deploy has never been run;
#   - the chart is constrained with --version $(VALKEY_CHART_VERSION)
#     (helm accepts semver ranges such as 2.x), so the variable declared at
#     the top of the file actually takes effect;
#   - the SOPS values are decrypted before helm runs; `set -e` aborts on a
#     sops failure and an empty result is rejected, so helm can never
#     upgrade the release from an empty values file.
#
# Requires bash (process substitution, pipefail); SHELL is set to bash above.
valkey-deploy: ## Deploy / upgrade Valkey (Redis-compatible) to platform namespace
	$(KUBECTL) create namespace $(NAMESPACE) --dry-run=client -o yaml | $(KUBECTL) apply -f -
	$(HELM) repo add bitnami https://charts.bitnami.com/bitnami --force-update
	set -euo pipefail; \
	values="$$(sops -d helm/valkey-values.sops.yaml)"; \
	[ -n "$$values" ] || { echo "ERROR: decrypted values are empty; refusing to deploy" >&2; exit 1; }; \
	$(HELM) upgrade --install valkey bitnami/valkey \
		--version $(VALKEY_CHART_VERSION) \
		--namespace $(NAMESPACE) \
		-f <(printf '%s\n' "$$values") \
		--wait --timeout 3m
|
||||||
|
|
||||||
|
# List the pods in $(NAMESPACE) that carry the valkey name label.
valkey-status: ## Check Valkey pod status
	$(KUBECTL) get pods \
		--namespace=$(NAMESPACE) \
		--selector=app.kubernetes.io/name=valkey
|
||||||
|
|
||||||
|
##@ Backup
|
||||||
|
|
||||||
|
# Run the backup tool tools/cmd/railiance-backup under sudo.
# The path is relative, so it is resolved against the directory make runs in.
# NOTE(review): what the tool dumps and where it uploads is not visible here;
# the description below is taken from the target's own help text.
backup: ## Backup platform services (PostgreSQL logical dump) — age-encrypted to Nextcloud
|
||||||
|
sudo tools/cmd/railiance-backup
|
||||||
|
|
||||||
|
##@ Help
|
||||||
|
|
||||||
# Self-documenting help: scans every parsed makefile for
#   - `target: ... ## description` lines -> one aligned entry per target
#   - `##@ Section` lines                -> bold section headings
# The target pattern accepts digits as well as letters, `_` and `-`, so a
# target such as `pg16-deploy` is not silently dropped from the listing.
# (awk EREs have no lazy quantifiers, hence plain `.*` before the `##`.)
help: ## Show this help
	@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n"} \
		/^[a-zA-Z0-9_-]+:.*##/ { printf " \033[36m%-22s\033[0m %s\n", $$1, $$2 } \
		/^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) }' $(MAKEFILE_LIST)
|
||||||
|
|
||||||
|
.PHONY: pg-deploy pg-status pg-pgpool-check valkey-deploy valkey-status backup help
|
||||||
|
|||||||
151
docs/postgresql-ha.md
Normal file
151
docs/postgresql-ha.md
Normal file
@@ -0,0 +1,151 @@
|
|||||||
|
# PostgreSQL HA — Platform Service
|
||||||
|
|
||||||
|
**Chart:** `bitnami/postgresql-ha`
|
||||||
|
**Namespace:** `platform`
|
||||||
|
**Managed by:** `railiance-platform` (S3)
|
||||||
|
**Workplan:** `RAIL-PL-WP-0001`
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
Apps (S5)
|
||||||
|
└── pgpool (load balancer / connection pooler)
|
||||||
|
├── postgresql-0 [Primary — repmgr]
|
||||||
|
├── postgresql-1 [Standby — repmgr]
|
||||||
|
└── postgresql-2 [Standby — repmgr]
|
||||||
|
```
|
||||||
|
|
||||||
|
- **pgpool-II** distributes reads across standbys, routes writes to primary
|
||||||
|
- **repmgr** handles automatic failover if the primary disappears
|
||||||
|
- All pods in `platform` namespace; app pods connect via pgpool service
|
||||||
|
|
||||||
|
## Connection string pattern
|
||||||
|
|
||||||
|
```
|
||||||
|
postgresql://DBUSER:DBPASS@postgresql-ha-pgpool.platform.svc.cluster.local:5432/DBNAME
|
||||||
|
```
|
||||||
|
|
||||||
|
Replace `DBUSER`, `DBPASS`, `DBNAME` with the database-specific credentials.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Initial deployment
|
||||||
|
|
||||||
|
### Prerequisites
|
||||||
|
|
||||||
|
- `railiance-cluster` converged (`make smoke` passes)
|
||||||
|
- SOPS age key accessible: `sops -d helm/postgresql-ha-values.sops.yaml` returns plaintext
|
||||||
|
- `helm repo add bitnami https://charts.bitnami.com/bitnami && helm repo update` done on the node
|
||||||
|
|
||||||
|
### Steps
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Ensure the platform namespace exists
|
||||||
|
kubectl create namespace platform --dry-run=client -o yaml | kubectl apply -f -
|
||||||
|
|
||||||
|
# 2. Deploy (from railiance-platform/)
|
||||||
|
make pg-deploy
|
||||||
|
|
||||||
|
# 3. Verify
|
||||||
|
make pg-status
|
||||||
|
# Expected: 3 postgresql pods + 1 pgpool pod, all Running
|
||||||
|
|
||||||
|
# 4. Smoke test (run from railiance-cluster/ — the railiance-platform Makefile has no `smoke` target)
|
||||||
|
make smoke
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Creating a new database for an app
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Connect via pgpool
|
||||||
|
kubectl exec -it -n platform \
|
||||||
|
$(kubectl get pod -n platform -l app.kubernetes.io/component=pgpool -o name | head -1) \
|
||||||
|
-- psql -U postgres
|
||||||
|
|
||||||
|
# Inside psql:
|
||||||
|
CREATE DATABASE myapp;
|
||||||
|
CREATE USER myapp WITH PASSWORD 'strong-password';
|
||||||
|
GRANT ALL PRIVILEGES ON DATABASE myapp TO myapp;
|
||||||
|
\c myapp
|
||||||
|
GRANT ALL ON SCHEMA public TO myapp;
|
||||||
|
\q
|
||||||
|
```
|
||||||
|
|
||||||
|
Add the user password to the app's own secrets (managed in the app's repo,
|
||||||
|
not here). The connection string for the app will be:
|
||||||
|
```
|
||||||
|
postgresql://myapp:strong-password@postgresql-ha-pgpool.platform.svc.cluster.local:5432/myapp
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Password rotation
|
||||||
|
|
||||||
|
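1. Decrypt the values file in place (`sops -d -i helm/postgresql-ha-values.sops.yaml`) and update the password there — never put real passwords in the committed `.yaml.template`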
|
||||||
|
2. Re-encrypt: `sops -e -i helm/postgresql-ha-values.sops.yaml`
|
||||||
|
3. Upgrade: `make pg-deploy`
|
||||||
|
4. Update the app's connection secret to match
|
||||||
|
5. Rolling restart the app pods to pick up the new connection
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## pgpool-password — critical note
|
||||||
|
|
||||||
|
The `postgresql.pgpoolPassword` value in the Helm chart maps to the
|
||||||
|
`pgpool-password` key in the `postgresql-ha-postgresql` Kubernetes Secret.
|
||||||
|
The pgpool container mounts this key at startup; if it is absent, pgpool
|
||||||
|
enters CrashLoopBackOff with **no log output**.
|
||||||
|
|
||||||
|
**This was the root cause of the 2026-03-10 incident (RAIL-BS-WP-0003).**
|
||||||
|
|
||||||
|
Always verify after `helm upgrade`:
|
||||||
|
```bash
|
||||||
|
kubectl get secret -n platform postgresql-ha-postgresql \
|
||||||
|
-o jsonpath='{.data.pgpool-password}' | base64 -d && echo
|
||||||
|
# Must print a non-empty string
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## HA failover test
|
||||||
|
|
||||||
|
Per Decision D3, any change to this service requires a passing failover test:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# From railiance-cluster/
|
||||||
|
make test-ha-failover GITEA_URL=https://<gitea-hostname>
|
||||||
|
```
|
||||||
|
|
||||||
|
The test kills the primary PostgreSQL pod and asserts:
|
||||||
|
1. repmgr promotes a standby within 60s
|
||||||
|
2. All pods return to Running within 120s
|
||||||
|
3. pgpool returns to Running (catches the missing-key bug)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Backup
|
||||||
|
|
||||||
|
Platform backup (PostgreSQL logical dump) is handled by the `railiance-backup`
|
||||||
|
tool in this repo:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
make backup
|
||||||
|
```
|
||||||
|
|
||||||
|
This produces an age-encrypted dump uploaded to Nextcloud. For cluster-level
|
||||||
|
backup (etcd, kubeconfig), see `railiance-cluster/`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Scaling to 3 nodes (ThreePhoenix)
|
||||||
|
|
||||||
|
When Railiance02 and Railiance03 join the cluster:
|
||||||
|
|
||||||
|
1. Switch StorageClass from `local-path` to `longhorn` in the values file
|
||||||
|
2. Change `postgresql.podAntiAffinityPreset` from `soft` to `hard`
|
||||||
|
3. Run `make pg-deploy` — Helm rolling update spreads pods across nodes
|
||||||
|
4. Run `make test-ha-failover` (from `railiance-cluster/`) to confirm HA is genuine (not just replicated on one node)
|
||||||
64
helm/postgresql-ha-values.yaml.template
Normal file
64
helm/postgresql-ha-values.yaml.template
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
# postgresql-ha-values.yaml.template
|
||||||
|
#
|
||||||
|
# Standalone PostgreSQL HA for railiance-platform (S3)
|
||||||
|
# Chart: bitnami/postgresql-ha version: ~16.x (pin to 16.2.2 or latest stable)
|
||||||
|
#
|
||||||
|
# Usage:
|
||||||
|
# 1. Copy this file:
|
||||||
|
# cp helm/postgresql-ha-values.yaml.template helm/postgresql-ha-values.sops.yaml
|
||||||
|
# 2. Fill in all CHANGEME_ values (passwords, storage class, replica count)
|
||||||
|
# 3. Encrypt with SOPS (age key must be loaded):
|
||||||
|
# sops -e -i helm/postgresql-ha-values.sops.yaml
|
||||||
|
# 4. Deploy:
|
||||||
|
# make pg-deploy
|
||||||
|
#
|
||||||
|
# Never commit the plaintext .template file with real passwords.
|
||||||
|
# The .sops.yaml file (encrypted) is what gets committed.
|
||||||
|
#
|
||||||
|
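# NOTE: postgresql.pgpoolPassword MUST be set. It populates the pgpool-password
# key of the postgresql-ha-postgresql Secret, which pgpool reads at startup;
# if the key is missing, pgpool goes into CrashLoopBackOff.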
|
||||||
|
# This was the root cause of the 2026-03-10 incident (RAIL-BS-WP-0003).
|
||||||
|
# Do not omit it.
|
||||||
|
|
||||||
|
global:
|
||||||
|
postgresql:
|
||||||
|
username: postgres
|
||||||
|
password: CHANGEME_postgres_password
|
||||||
|
database: postgres
|
||||||
|
repmgrUsername: repmgr
|
||||||
|
repmgrPassword: CHANGEME_repmgr_password
|
||||||
|
|
||||||
|
postgresql:
|
||||||
|
replicaCount: 3 # all 3 pods on 1 node for now; set anti-affinity when 3 nodes exist
|
||||||
|
password: CHANGEME_postgres_password # must match global.postgresql.password
|
||||||
|
postgresPassword: CHANGEME_postgres_superuser_password
|
||||||
|
repmgrPassword: CHANGEME_repmgr_password # must match global.postgresql.repmgrPassword
|
||||||
|
# pgpoolPassword is the sr_check_password used by pgpool to probe replicas.
|
||||||
|
# It MUST be set here to survive helm upgrade (see incident RAIL-BS-WP-0003).
|
||||||
|
pgpoolPassword: CHANGEME_pgpool_sr_check_password
|
||||||
|
|
||||||
|
persistence:
|
||||||
|
enabled: true
|
||||||
|
storageClass: "" # use default StorageClass (local-path on single node; longhorn on 3 nodes)
|
||||||
|
size: 10Gi
|
||||||
|
|
||||||
|
podAntiAffinityPreset: "soft" # soft = prefer spread; switch to "hard" when 3 nodes exist
|
||||||
|
|
||||||
|
pgpool:
|
||||||
|
replicaCount: 1
|
||||||
|
adminPassword: CHANGEME_pgpool_admin_password
|
||||||
|
# numInitChildren controls max connections; default 32 is fine for single node
|
||||||
|
numInitChildren: 32
|
||||||
|
maxPool: 4
|
||||||
|
# Connection load balancing
|
||||||
|
# The chart parameter is pgpool.disableLoadBalancingOnWrite; an unknown key
# such as loadBalancingOnWrite is silently ignored by Helm.
disableLoadBalancingOnWrite: "transaction"
|
||||||
|
|
||||||
|
readinessProbe:
|
||||||
|
enabled: true
|
||||||
|
livenessProbe:
|
||||||
|
enabled: true
|
||||||
|
|
||||||
|
# Metrics (optional — enable when Prometheus is deployed)
|
||||||
|
metrics:
|
||||||
|
enabled: false
|
||||||
|
serviceMonitor:
|
||||||
|
enabled: false
|
||||||
Reference in New Issue
Block a user