feat(backup): implement S2 integrated backup — WP-0004 T01-T04
Some checks failed
railiance-tests / smoke (push) Has been cancelled

tools/cmd/railiance-backup-s2:
  - k3s etcd snapshot (age-encrypted)
  - Helm release values for all namespaces (age-encrypted)
  - kubeconfig /etc/rancher/k3s/k3s.yaml (age-encrypted)
  - output: /opt/backup/railiance/cluster/, keep last 7, .last-backup stamp
  - requires root, no network dependency

tools/cmd/railiance-restore-s2:
  - lists available backups with sizes
  - prints step-by-step restore instructions for each artifact type

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-10 21:17:54 +01:00
parent 66f8ca4009
commit 7e28399f69
2 changed files with 151 additions and 0 deletions

86
tools/cmd/railiance-backup-s2 Executable file
View File

@@ -0,0 +1,86 @@
#!/usr/bin/env bash
# tools/cmd/railiance-backup-s2 — S2 Kubernetes Runtime backup
# Backs up: k3s etcd snapshot, Helm release values, kubeconfig
# Encryption: age (reuses SOPS key pair from .sops.yaml)
# Output: /opt/backup/railiance/cluster/
# No network required. Requires root (etcd snapshot + kubeconfig).
set -euo pipefail

# Repository root, resolved relative to this script (two directories up).
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
# Presumably defines the print_hdr / ok / warn / bad helpers used below
# (they are not defined in this script).
source "${ROOT}/lib/railiance-print.sh"

# ── Configuration ──────────────────────────────────────────────────────────────
# age recipient (public key) every artifact is encrypted to.
AGE_PUBLIC_KEY="age1aq8twfd78wvpra0had8cezcnj96tj4q0068edrz5jez8d6xwmflqdepsh4"
# Directory that receives the encrypted artifacts and the .last-backup stamp.
BACKUP_DIR="/opt/backup/railiance/cluster"
KUBECONFIG_PATH="/etc/rancher/k3s/k3s.yaml"
# Directory in which this script looks for the snapshot written by k3s.
ETCD_SNAP_DIR="/var/lib/rancher/k3s/server/db/snapshots"
# Number of backups of each type to retain.
KEEP=7
# UTC timestamp shared by every artifact name produced in this run.
TS="$(date -u +%Y%m%dT%H%M%SZ)"

print_hdr "railiance-cluster backup — ${TS}"

# ── Root check ─────────────────────────────────────────────────────────────────
# Checked before anything touches the filesystem: otherwise a non-root run can
# die on mkdir's permission error (set -e) and never print the hint below.
if [[ $EUID -ne 0 ]]; then
  bad "root" "this script requires root — run via: sudo make backup"
  exit 1
fi

mkdir -p "${BACKUP_DIR}"
# ── 1. k3s etcd snapshot ───────────────────────────────────────────────────────
# Takes an on-demand snapshot, encrypts it with age into BACKUP_DIR, then asks
# k3s to prune its own snapshot store. Skipped with a warning when
# `k3s etcd-snapshot ls` fails.
if k3s etcd-snapshot ls &>/dev/null; then
  ok "etcd" "taking snapshot…"
  SNAP_NAME="railiance-${TS}"
  # Output is discarded, so report the failure explicitly instead of letting
  # set -e abort without any message.
  if ! k3s etcd-snapshot save --name "${SNAP_NAME}" &>/dev/null; then
    bad "etcd" "k3s etcd-snapshot save failed for ${SNAP_NAME}"
    exit 1
  fi
  SNAP_FILE="${ETCD_SNAP_DIR}/${SNAP_NAME}"
  if [[ ! -f "${SNAP_FILE}" ]]; then
    # k3s may append a suffix — pick the matching file that sorts last.
    # `sort | tail` reads all input (no SIGPIPE under pipefail); `|| true`
    # defers a find failure to the explicit check below.
    SNAP_FILE="$(find "${ETCD_SNAP_DIR}" -type f -name "${SNAP_NAME}*" 2>/dev/null \
      | sort | tail -n 1)" || true
  fi
  # Without this guard an empty SNAP_FILE would be handed to age as an empty
  # input path, producing a confusing error.
  if [[ -z "${SNAP_FILE}" || ! -f "${SNAP_FILE}" ]]; then
    bad "etcd" "snapshot ${SNAP_NAME} not found in ${ETCD_SNAP_DIR}"
    exit 1
  fi
  age -r "${AGE_PUBLIC_KEY}" -o "${BACKUP_DIR}/etcd-${TS}.snap.age" "${SNAP_FILE}"
  ok "etcd" "encrypted → etcd-${TS}.snap.age"
  # Prune old snapshots from k3s store (keep last KEEP); best-effort.
  # NOTE(review): prune is called without --name while save uses a per-run
  # name — confirm against the k3s docs that these snapshots are actually
  # matched by the retention pass.
  k3s etcd-snapshot prune --snapshot-retention "${KEEP}" &>/dev/null || true
else
  warn "etcd" "k3s etcd not available (SQLite mode?) — skipping snapshot"
fi
# ── 2. Helm release values ─────────────────────────────────────────────────────
# Dumps `helm get values` for every release in every namespace into a private
# temp dir, then streams a gzip tarball of it through age into BACKUP_DIR.
# Skipped with a warning when helm is not installed.
if command -v helm &>/dev/null; then
  ok "helm" "capturing release values…"
  export KUBECONFIG="${KUBECONFIG_PATH}"
  # Run in a subshell with its own EXIT trap: the temp dir holds *plaintext*
  # values, so it must be removed on every exit path, including a set -e /
  # pipefail abort. A non-zero subshell status still aborts the script.
  (
    TMP_HELM="$(mktemp -d)"
    trap 'rm -rf -- "${TMP_HELM}"' EXIT
    helm list -A -o json 2>/dev/null \
      | jq -r '.[] | .name + " " + .namespace' \
      | while read -r name ns; do
          values_file="${TMP_HELM}/${ns}-${name}.yaml"
          # A single unreadable release must not fail the whole backup, but it
          # should be visible and must not leave an empty file in the archive.
          if ! helm get values "${name}" -n "${ns}" -o yaml 2>/dev/null \
            > "${values_file}"; then
            rm -f -- "${values_file}"
            warn "helm" "could not read values for ${ns}/${name} — skipped"
          fi
        done
    tar -czf - -C "${TMP_HELM}" . \
      | age -r "${AGE_PUBLIC_KEY}" -o "${BACKUP_DIR}/helm-values-${TS}.tar.gz.age"
  )
  ok "helm" "encrypted → helm-values-${TS}.tar.gz.age"
else
  warn "helm" "helm not found — skipping"
fi
# ── 3. kubeconfig ─────────────────────────────────────────────────────────────
# Encrypt the kubeconfig into BACKUP_DIR; a missing file is only a warning.
if [[ ! -f "${KUBECONFIG_PATH}" ]]; then
  warn "kubeconfig" "${KUBECONFIG_PATH} not found — skipping"
else
  age \
    -r "${AGE_PUBLIC_KEY}" \
    -o "${BACKUP_DIR}/kubeconfig-${TS}.yaml.age" \
    "${KUBECONFIG_PATH}"
  ok "kubeconfig" "encrypted → kubeconfig-${TS}.yaml.age"
fi
# ── 4. Prune local cache ───────────────────────────────────────────────────────
# For each artifact type: list matches in reverse name order (timestamped
# names, so newest first), skip the first KEEP, delete the remainder.
first_stale=$((KEEP + 1))
for glob in \
  "etcd-*.snap.age" \
  "helm-values-*.tar.gz.age" \
  "kubeconfig-*.yaml.age"; do
  find "${BACKUP_DIR}" -name "${glob}" \
    | sort -r \
    | tail -n "+${first_stale}" \
    | xargs -r rm -f
done
ok "prune" "kept last ${KEEP} of each type"
# ── 5. Stamp ───────────────────────────────────────────────────────────────────
# Record this run's timestamp in BACKUP_DIR/.last-backup, then print a summary.
printf '%s\n' "${TS}" > "${BACKUP_DIR}/.last-backup"
printf '\n'
ok "done" "backup complete — ${TS}"
printf '%s\n' \
  " Location: ${BACKUP_DIR}" \
  " Decrypt with: age -d -i ~/.config/sops/age/keys.txt <file>"

65
tools/cmd/railiance-restore-s2 Executable file
View File

@@ -0,0 +1,65 @@
#!/usr/bin/env bash
# tools/cmd/railiance-restore-s2 — S2 Kubernetes Runtime restore guide
# Lists available backups and prints restore instructions.
# Actual restore of etcd requires cluster downtime — see instructions below.
set -euo pipefail

# Directory holding the encrypted backups.
BACKUP_DIR="/opt/backup/railiance/cluster"
# age identity file referenced in the printed decrypt commands.
AGE_KEY="${HOME}/.config/sops/age/keys.txt"

# Banner.
printf '%s\n' \
  "" \
  "railiance-cluster (S2) — Available Backups" \
  "============================================" \
  ""

# Nothing to list without the backup directory: point at the backup target.
if [[ ! -d "${BACKUP_DIR}" ]]; then
  printf '%s\n' \
    " No backup directory found at ${BACKUP_DIR}" \
    " Run: sudo make backup"
  exit 1
fi

# Timestamp of the most recent run, read from the .last-backup stamp file;
# reported as "unknown" when the stamp is missing or empty.
LAST=""
if [[ -f "${BACKUP_DIR}/.last-backup" ]]; then
  LAST="$(cat "${BACKUP_DIR}/.last-backup")"
fi
echo " Last backup: ${LAST:-unknown}"
echo ""
#######################################
# Print one labelled section listing the backups of a given type.
# Globals:   BACKUP_DIR (read)
# Arguments: $1 - section label
#            $2 - find(1) -name glob selecting the files to list
# Outputs:   label line, then one "<basename> [<du -sh size>]" line per match
#            in reverse name order, or "(none)"; then a blank line
#######################################
list_type() {
  local label="$1" pattern="$2"
  local files f
  echo " ${label}:"
  # find's stderr is discarded, so under `set -e -o pipefail` a non-zero find
  # (missing or unreadable path) would kill the script with no message at all.
  # Tolerate it and list whatever was found.
  files="$(find "${BACKUP_DIR}" -name "${pattern}" 2>/dev/null | sort -r)" || true
  if [[ -z "${files}" ]]; then
    echo " (none)"
  else
    # IFS= keeps each path intact; the here-string avoids a pipeline subshell.
    while IFS= read -r f; do
      echo " $(basename "${f}") [$(du -sh "${f}" | cut -f1)]"
    done <<<"${files}"
  fi
  echo ""
}
# One section per artifact type written by the backup script.
list_type "etcd snapshots" "etcd-*.snap.age"
list_type "Helm values" "helm-values-*.tar.gz.age"
list_type "kubeconfig" "kubeconfig-*.yaml.age"

# Printed restore guide. Unquoted heredoc: ${AGE_KEY} and ${BACKUP_DIR} are
# expanded; <ts> is a placeholder the operator replaces with a timestamp.
# The Helm section creates /tmp/helm-restore first, because `tar -C` fails if
# the target directory does not exist.
cat <<EOF
============================================

Decrypt any file:
 age -d -i ${AGE_KEY} <file>

Restore kubeconfig:
 age -d -i ${AGE_KEY} ${BACKUP_DIR}/kubeconfig-<ts>.yaml.age > ~/.kube/config-hosteurope

Restore etcd snapshot (WARNING: destroys current cluster state):
 # 1. Decrypt the snapshot
 age -d -i ${AGE_KEY} ${BACKUP_DIR}/etcd-<ts>.snap.age > /tmp/etcd-restore.snap
 # 2. Copy to k3s snapshot directory
 sudo cp /tmp/etcd-restore.snap /var/lib/rancher/k3s/server/db/snapshots/
 # 3. Stop k3s and restore
 sudo systemctl stop k3s
 sudo k3s server --cluster-reset --cluster-reset-restore-path=/var/lib/rancher/k3s/server/db/snapshots/etcd-restore.snap
 sudo systemctl start k3s

Restore Helm values (for re-running helm upgrade after cluster restore):
 mkdir -p /tmp/helm-restore
 age -d -i ${AGE_KEY} ${BACKUP_DIR}/helm-values-<ts>.tar.gz.age | tar -xz -C /tmp/helm-restore/

EOF
echo ""