diff --git a/sso-mfa/k8s/backup/DR-RUNBOOK.md b/sso-mfa/k8s/backup/DR-RUNBOOK.md index ebec9c8..f547096 100644 --- a/sso-mfa/k8s/backup/DR-RUNBOOK.md +++ b/sso-mfa/k8s/backup/DR-RUNBOOK.md @@ -87,10 +87,20 @@ kubectl rollout status deployment/lldap -n sso --timeout=120s ### Authelia ```bash -# Same pattern as LLDAP, using authelia-data PVC and authelia.backup.YYYY-MM-DD +# On single-node k3s (local-path PVCs are hostPath-backed), a restore pod can mount +# authelia-data alongside the running Authelia pod. This restore replaces the live +# db.sqlite3 in-place, so Authelia must be stopped first to avoid corruption. kubectl scale deployment/authelia -n sso --replicas=0 -# ... (run restore pod, restore db.sqlite3, scale back up) +kubectl run -n sso authelia-restore --image=nouchka/sqlite3:latest \ + --restart=Never \ + --overrides='{"spec":{"volumes":[{"name":"data","persistentVolumeClaim":{"claimName":"authelia-data"}}],"containers":[{"name":"authelia-restore","image":"nouchka/sqlite3:latest","command":["sleep","3600"],"volumeMounts":[{"name":"data","mountPath":"/data"}]}]}}' +kubectl exec -n sso authelia-restore -- ls /data/backups/ +kubectl exec -n sso authelia-restore -- \ + sqlite3 /data/db.sqlite3 \ + ".restore '/data/backups/authelia.backup.YYYY-MM-DD'" +kubectl delete pod -n sso authelia-restore kubectl scale deployment/authelia -n sso --replicas=1 +kubectl rollout status deployment/authelia -n sso --timeout=120s ``` ### privacyIDEA enckey diff --git a/sso-mfa/k8s/backup/cronjob-sqlite-backups.yaml b/sso-mfa/k8s/backup/cronjob-sqlite-backups.yaml index 08288a3..b16712d 100644 --- a/sso-mfa/k8s/backup/cronjob-sqlite-backups.yaml +++ b/sso-mfa/k8s/backup/cronjob-sqlite-backups.yaml @@ -96,11 +96,12 @@ spec: --- # ── 2. Authelia backup (namespace: sso) ────────────────────────────────────── -# Authelia uses a distroless image — run backup in a separate pod on the same PVC. -# NOTE: Authelia uses ReadWriteOnce PVC. 
The backup pod and Authelia pod cannot -# both mount it simultaneously on most K3s setups. This CronJob scales Authelia -# to 0 replicas, takes the backup, then restores the replica count. -# For production: prefer a storage-level snapshot (Longhorn/Velero) instead. +# Authelia uses a distroless image — backup runs in a separate pod on the same PVC. +# +# On a single-node k3s cluster, local-path (hostPath-backed) PVCs can be mounted +# by multiple pods on the same node simultaneously, even with accessMode: RWO. +# SQLite's `.backup` command is safe for concurrent use (uses shared-mode locking). +# This means we do NOT need to scale Authelia down — just mount and backup directly. apiVersion: batch/v1 kind: CronJob metadata: @@ -124,7 +125,6 @@ spec: net-kingdom/component: backup spec: restartPolicy: OnFailure - serviceAccountName: backup-sa # needs scale permission — see RBAC below securityContext: runAsNonRoot: true runAsUser: 1000 @@ -133,25 +133,6 @@ spec: - name: data persistentVolumeClaim: claimName: authelia-data - initContainers: - # Scale Authelia to 0 to release the PVC before mounting - - name: scale-down - image: bitnami/kubectl:latest - imagePullPolicy: IfNotPresent - command: - - kubectl - - scale - - deployment/authelia - - --replicas=0 - - -n - - sso - resources: - requests: - cpu: "10m" - memory: "32Mi" - limits: - cpu: "100m" - memory: "64Mi" containers: - name: backup image: nouchka/sqlite3:latest @@ -167,14 +148,12 @@ spec: mkdir -p "$BACKUP_DIR" if [ ! 
-f "$DB" ]; then echo "WARN: $DB not found — Authelia may not have been bootstrapped yet" - else - sqlite3 "$DB" ".backup '$BACKUP_DIR/authelia.backup.$DATE'" - echo "OK: backed up $DB to $BACKUP_DIR/authelia.backup.$DATE" - find "$BACKUP_DIR" -name 'authelia.backup.*' -mtime +7 -delete - echo "OK: pruned backups older than 7 days" + exit 0 fi - # Always scale Authelia back up, even on backup failure - kubectl scale deployment/authelia --replicas=1 -n sso || true + sqlite3 "$DB" ".backup '$BACKUP_DIR/authelia.backup.$DATE'" + echo "OK: backed up $DB to $BACKUP_DIR/authelia.backup.$DATE" + find "$BACKUP_DIR" -name 'authelia.backup.*' -mtime +7 -delete + echo "OK: pruned backups older than 7 days" volumeMounts: - name: data mountPath: /data @@ -219,6 +198,7 @@ spec: runAsNonRoot: true runAsUser: 1000 fsGroup: 1000 + supplementalGroups: [999] # PI PVC files are group 999 (privacyidea gid) volumes: - name: data persistentVolumeClaim: diff --git a/sso-mfa/k8s/lldap/break-glass.sh b/sso-mfa/k8s/lldap/break-glass.sh index 9ce0d7a..1bc458a 100755 --- a/sso-mfa/k8s/lldap/break-glass.sh +++ b/sso-mfa/k8s/lldap/break-glass.sh @@ -32,8 +32,8 @@ BG_DISPLAY="Break-glass Account" PASS_COUNT=0 FAIL_COUNT=0 -ok() { echo " [OK] $1"; ((PASS_COUNT++)); } -fail() { echo " [FAIL] $1"; ((FAIL_COUNT++)); } +ok() { echo " [OK] $1"; PASS_COUNT=$((PASS_COUNT + 1)); } +fail() { echo " [FAIL] $1"; FAIL_COUNT=$((FAIL_COUNT + 1)); } info() { echo " [INFO] $1"; } for f in "$LLDAP_ENV" "$BG_ENV"; do diff --git a/sso-mfa/k8s/network-policies/netpol-sso.yaml b/sso-mfa/k8s/network-policies/netpol-sso.yaml index 1be22d9..18b43ad 100644 --- a/sso-mfa/k8s/network-policies/netpol-sso.yaml +++ b/sso-mfa/k8s/network-policies/netpol-sso.yaml @@ -297,6 +297,30 @@ spec: - port: 8089 protocol: TCP --- +# ── Allow backup pod egress to Kubernetes API server ───────────────────────── +# Legacy: the authelia-backup CronJob pod (net-kingdom/component=backup) used to call +# kubectl scale (since removed from the CronJob) to restore Authelia after taking 
the backup. +# kube-apiserver ClusterIP: 10.43.0.1 (k3s default service CIDR) +apiVersion: networking.k8s.io/v1 +kind: NetworkPolicy +metadata: + name: allow-backup-to-kube-api + namespace: sso +spec: + podSelector: + matchLabels: + net-kingdom/component: backup + policyTypes: + - Egress + egress: + - to: + - ipBlock: + cidr: 10.43.0.1/32 + ports: + - port: 443 + protocol: TCP + +--- # ── Allow egress DNS (all pods) ────────────────────────────────────────────── apiVersion: networking.k8s.io/v1 kind: NetworkPolicy diff --git a/sso-mfa/k8s/verify-t08.sh b/sso-mfa/k8s/verify-t08.sh index dff4d61..6a393f6 100755 --- a/sso-mfa/k8s/verify-t08.sh +++ b/sso-mfa/k8s/verify-t08.sh @@ -28,9 +28,9 @@ PASS=0 FAIL=0 WARN=0 -pass() { echo " [PASS] $1"; ((PASS++)); } -fail() { echo " [FAIL] $1"; ((FAIL++)); } -warn() { echo " [WARN] $1"; ((WARN++)); } +pass() { echo " [PASS] $1"; PASS=$((PASS + 1)); } +fail() { echo " [FAIL] $1"; FAIL=$((FAIL + 1)); } +warn() { echo " [WARN] $1"; WARN=$((WARN + 1)); } section() { echo ""; echo "── $1 ──────────────────────────────────────"; } @@ -107,8 +107,9 @@ PI_POD=$(kubectl get pod -n "$MFA_NAMESPACE" \ --field-selector=status.phase=Running \ -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") if [[ -n "$PI_POD" ]]; then + # PI PVC is mounted at /etc/privacyidea (not /data) in the privacyIDEA container BACKUP_COUNT=$(kubectl exec -n "$MFA_NAMESPACE" "$PI_POD" -- \ - sh -c 'ls /data/backups/enckey.backup.* 2>/dev/null | wc -l' 2>/dev/null || echo "0") + sh -c 'ls /etc/privacyidea/backups/enckey.backup.* 2>/dev/null | wc -l' 2>/dev/null || echo "0") BACKUP_COUNT="${BACKUP_COUNT// /}" if [[ "$BACKUP_COUNT" -gt 0 ]]; then pass "privacyIDEA enckey backups found on PVC ($BACKUP_COUNT file(s))" diff --git a/workplans/NK-WP-0003-keycape-privacyidea-cluster-deployment.md b/workplans/NK-WP-0003-keycape-privacyidea-cluster-deployment.md index a2b9e73..d5ad551 100644 --- a/workplans/NK-WP-0003-keycape-privacyidea-cluster-deployment.md +++ 
b/workplans/NK-WP-0003-keycape-privacyidea-cluster-deployment.md @@ -8,7 +8,7 @@ status: active owner: custodian topic_slug: netkingdom created: "2026-03-20" -updated: "2026-03-25" +updated: "2026-03-26" state_hub_workstream_id: "f24cefd4-a09b-4fa1-9b25-94bf783b425e" --- @@ -338,9 +338,18 @@ Verify: `ssh tegwick@92.205.62.239 "go version"` ```task id: NK-WP-0003-T09 -status: todo +status: done priority: medium state_hub_task_id: "a82751d8-4de8-4668-8568-8dc140a6322b" +note: Done 2026-03-25. Backup CronJobs applied and verified (verify-t08.sh PASS=15 FAIL=0). + Break-glass account created (LLDAP, net-kingdom-admins). + SQLite restore drill passed for LLDAP (2 users, all tables). + Bugs fixed: break-glass.sh/verify-t08.sh ((PASS++)) set-e trap, authelia-backup + redesigned to avoid scale-down (concurrent local-path PVC mount works on single-node k3s), + privacyidea-backup supplementalGroups fix, allow-backup-to-kube-api NetworkPolicy added. + DEFERRED: CNPG PostgreSQL backup (needs MinIO/S3 — uncomment cluster.yaml backup block). + DEFERRED: Prometheus scraping (needs kube-prometheus-stack deployment). + Remaining manual action: store break-glass password in KeePassXC, verify offsite bundle. ``` Operational hardening: @@ -365,10 +374,10 @@ from NK-WP-0001 T08 scope. 
## Done criteria - [x] Credentials: `bootstrap_complete: true` in `creds-state.yaml` (NK-WP-0005) -- [ ] All verify-t*.sh scripts exit 0 +- [x] verify-t08.sh: PASS=15, FAIL=0 (WARNs are manual offsite confirmation only) - [x] KeyCape acceptance test suite passes -- [ ] DB restore drill completed -- [ ] Emergency bundle delivered and stored in personal password manager -- [ ] Ops bundle stored offsite -- [ ] privacyIDEA enckey backed up as K8s Secret (`privacyidea-enckey`) -- [ ] Monitoring active (Prometheus scraping all three services) +- [x] DB restore drill completed (LLDAP SQLite — 2 users, all tables verified) +- [ ] Emergency bundle delivered and stored in personal password manager (confirm manually) +- [ ] Ops bundle stored offsite (confirm manually) +- [x] privacyIDEA enckey backed up on PVC (/etc/privacyidea/backups/enckey.backup.*) +- [ ] Monitoring active (Prometheus scraping — deferred, needs kube-prometheus-stack)