feat(backup): implement S2 integrated backup — WP-0004 T01-T04
Some checks failed
railiance-tests / smoke (push) Has been cancelled
Some checks failed
railiance-tests / smoke (push) Has been cancelled
tools/cmd/railiance-backup-s2: - k3s etcd snapshot (age-encrypted) - Helm release values for all namespaces (age-encrypted) - kubeconfig /etc/rancher/k3s/k3s.yaml (age-encrypted) - output: /opt/backup/railiance/cluster/, keep last 7, .last-backup stamp - requires root, no network dependency tools/cmd/railiance-restore-s2: - lists available backups with sizes - prints step-by-step restore instructions for each artifact type Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
86
tools/cmd/railiance-backup-s2
Executable file
86
tools/cmd/railiance-backup-s2
Executable file
@@ -0,0 +1,86 @@
|
||||
#!/usr/bin/env bash
# tools/cmd/railiance-backup-s2 — S2 Kubernetes Runtime backup
# Backs up: k3s etcd snapshot, Helm release values, kubeconfig
# Encryption: age (reuses SOPS key pair from .sops.yaml)
# Output: /opt/backup/railiance/cluster/
# No network required. Requires root (etcd snapshot + kubeconfig).
set -euo pipefail

# Repository root: this script sits two directory levels below it.
ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
# NOTE(review): presumably defines the print helpers used below
# (print_hdr, ok, warn, bad) — they are not defined in this script.
source "${ROOT}/lib/railiance-print.sh"

# ── Configuration ──────────────────────────────────────────────────────────────
# age recipient (public key) that every artifact is encrypted to.
AGE_PUBLIC_KEY="age1aq8twfd78wvpra0had8cezcnj96tj4q0068edrz5jez8d6xwmflqdepsh4"
# Destination directory for the encrypted artifacts.
BACKUP_DIR="/opt/backup/railiance/cluster"
KUBECONFIG_PATH="/etc/rancher/k3s/k3s.yaml"
# Directory where k3s stores the snapshots it takes.
ETCD_SNAP_DIR="/var/lib/rancher/k3s/server/db/snapshots"
# Number of backups retained per artifact type.
KEEP=7
# UTC timestamp embedded in every artifact name; sorts chronologically as text.
TS="$(date -u +%Y%m%dT%H%M%SZ)"

print_hdr "railiance-cluster backup — ${TS}"

# ── Root check ─────────────────────────────────────────────────────────────────
# Runs before anything touches the filesystem, so an unprivileged caller gets
# this message instead of a raw "Permission denied" from mkdir under `set -e`.
if [[ $EUID -ne 0 ]]; then
  bad "root" "this script requires root — run via: sudo make backup"
  exit 1
fi

mkdir -p "${BACKUP_DIR}"
|
||||
|
||||
# ── 1. k3s etcd snapshot ───────────────────────────────────────────────────────
# Takes a named k3s etcd snapshot, encrypts it with age into BACKUP_DIR and
# prunes older snapshots from the k3s store. Skipped with a warning when
# `k3s etcd-snapshot ls` fails.
if k3s etcd-snapshot ls &>/dev/null; then
  ok "etcd" "taking snapshot…"
  SNAP_PREFIX="railiance"
  SNAP_NAME="${SNAP_PREFIX}-${TS}"
  # Output is discarded, so report the failure explicitly instead of letting
  # `set -e` end the script without any message.
  if ! k3s etcd-snapshot save --name "${SNAP_NAME}" &>/dev/null; then
    bad "etcd" "k3s etcd-snapshot save failed for ${SNAP_NAME}"
    exit 1
  fi
  SNAP_FILE="${ETCD_SNAP_DIR}/${SNAP_NAME}"
  if [[ ! -f "${SNAP_FILE}" ]]; then
    # k3s may append a suffix — find the most recent matching file
    SNAP_FILE="$(find "${ETCD_SNAP_DIR}" -type f -name "${SNAP_NAME}*" | sort -r | head -n 1)"
  fi
  # Never hand age an empty or missing input path.
  if [[ -z "${SNAP_FILE}" || ! -f "${SNAP_FILE}" ]]; then
    bad "etcd" "snapshot ${SNAP_NAME} not found in ${ETCD_SNAP_DIR}"
    exit 1
  fi
  age -r "${AGE_PUBLIC_KEY}" -o "${BACKUP_DIR}/etcd-${TS}.snap.age" "${SNAP_FILE}"
  ok "etcd" "encrypted → etcd-${TS}.snap.age"
  # Prune old snapshots from k3s store (keep last KEEP).
  # --name scopes retention to this script's snapshots; without it k3s applies
  # retention to its default "on-demand" prefix. Best-effort: a prune failure
  # must not fail the backup.
  k3s etcd-snapshot prune --name "${SNAP_PREFIX}" --snapshot-retention "${KEEP}" &>/dev/null || true
else
  warn "etcd" "k3s etcd not available (SQLite mode?) — skipping snapshot"
fi
|
||||
|
||||
# ── 2. Helm release values ─────────────────────────────────────────────────────
# Writes `helm get values` for every release in every namespace into a private
# temp dir, then tars and age-encrypts the result as a single artifact.
if ! command -v helm &>/dev/null; then
  warn "helm" "helm not found — skipping"
elif ! command -v jq &>/dev/null; then
  # jq parses the `helm list` JSON below; skip cleanly rather than dying
  # mid-pipeline under `set -e -o pipefail`.
  warn "helm" "jq not found — skipping"
else
  ok "helm" "capturing release values…"
  export KUBECONFIG="${KUBECONFIG_PATH}"
  # Subshell: scopes the EXIT trap to this section, so the plaintext values
  # are removed on every exit path without replacing any trap of the caller.
  (
    TMP_HELM="$(mktemp -d)"
    trap 'rm -rf -- "${TMP_HELM}"' EXIT
    helm list -A -o json 2>/dev/null \
      | jq -r '.[] | .name + " " + .namespace' \
      | while read -r name ns; do
          values_file="${TMP_HELM}/${ns}-${name}.yaml"
          # </dev/null keeps helm from consuming the release list on stdin.
          if ! helm get values "${name}" -n "${ns}" -o yaml \
              </dev/null 2>/dev/null >"${values_file}"; then
            # Do not archive an empty file that looks like a valid capture.
            rm -f -- "${values_file}"
            warn "helm" "could not read values for ${ns}/${name} — skipped"
          fi
        done
    tar -czf - -C "${TMP_HELM}" . \
      | age -r "${AGE_PUBLIC_KEY}" -o "${BACKUP_DIR}/helm-values-${TS}.tar.gz.age"
  )
  ok "helm" "encrypted → helm-values-${TS}.tar.gz.age"
fi
|
||||
|
||||
# ── 3. kubeconfig ─────────────────────────────────────────────────────────────
# Encrypt the kubeconfig as-is; a missing file only produces a warning.
if [[ ! -f "${KUBECONFIG_PATH}" ]]; then
  warn "kubeconfig" "${KUBECONFIG_PATH} not found — skipping"
else
  age \
    -r "${AGE_PUBLIC_KEY}" \
    -o "${BACKUP_DIR}/kubeconfig-${TS}.yaml.age" \
    "${KUBECONFIG_PATH}"
  ok "kubeconfig" "encrypted → kubeconfig-${TS}.yaml.age"
fi
|
||||
|
||||
# ── 4. Prune local cache ───────────────────────────────────────────────────────
# Artifact names embed a sortable UTC timestamp, so a reverse sort lists the
# newest first; everything after the first KEEP entries is deleted.
first_stale=$((KEEP + 1))
artifact_globs=(
  "etcd-*.snap.age"
  "helm-values-*.tar.gz.age"
  "kubeconfig-*.yaml.age"
)
for glob in "${artifact_globs[@]}"; do
  find "${BACKUP_DIR}" -name "${glob}" \
    | sort -r \
    | tail -n "+${first_stale}" \
    | xargs -r rm -f
done
ok "prune" "kept last ${KEEP} of each type"
|
||||
|
||||
# ── 5. Stamp ───────────────────────────────────────────────────────────────────
# Record the timestamp of this run next to the artifacts.
printf '%s\n' "${TS}" > "${BACKUP_DIR}/.last-backup"

# Closing summary: where the files are and how to decrypt them.
printf '\n'
ok "done" "backup complete — ${TS}"
printf '%s\n' \
  " Location: ${BACKUP_DIR}" \
  " Decrypt with: age -d -i ~/.config/sops/age/keys.txt <file>"
|
||||
65
tools/cmd/railiance-restore-s2
Executable file
65
tools/cmd/railiance-restore-s2
Executable file
@@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env bash
# tools/cmd/railiance-restore-s2 — S2 Kubernetes Runtime restore guide
# Lists available backups and prints restore instructions.
# Actual restore of etcd requires cluster downtime — see instructions below.
set -euo pipefail

# Directory holding the encrypted backup artifacts and the .last-backup stamp.
BACKUP_DIR="/opt/backup/railiance/cluster"
# age identity file shown in the printed decrypt commands.
AGE_KEY="${HOME}/.config/sops/age/keys.txt"

echo ""
echo "railiance-cluster (S2) — Available Backups"
echo "============================================"
echo ""

# Nothing to list without a backup directory: point at the backup target.
if [[ ! -d "${BACKUP_DIR}" ]]; then
  echo " No backup directory found at ${BACKUP_DIR}"
  echo " Run: sudo make backup"
  exit 1
fi

# Timestamp of the most recent run, if the stamp file exists.
LAST=""
if [[ -f "${BACKUP_DIR}/.last-backup" ]]; then
  LAST="$(<"${BACKUP_DIR}/.last-backup")"
fi
# Explicit if/else: with `A && B || C` the fallback also runs when B fails.
if [[ -n "${LAST}" ]]; then
  echo " Last backup: ${LAST}"
else
  echo " Last backup: unknown"
fi
echo ""
|
||||
|
||||
#######################################
# Print one backup category: a heading, then each matching file (reverse
# path order, so the newest timestamp comes first) with its disk usage,
# or "(none)" when nothing matches.
# Globals:   BACKUP_DIR (read)
# Arguments: $1 - heading label
#            $2 - find(1) -name pattern selecting the files
# Outputs:   the listing on stdout, followed by a blank line
#######################################
list_type() {
  local label="$1" pattern="$2"
  echo " ${label}:"
  local files
  # `|| true`: a non-zero find (e.g. an unreadable subdirectory) must not
  # abort the caller under `set -e -o pipefail`; list whatever was found.
  # -type f keeps directories that happen to match out of the listing.
  files="$(find "${BACKUP_DIR}" -type f -name "${pattern}" 2>/dev/null | sort -r || true)"
  if [[ -z "${files}" ]]; then
    echo " (none)"
  else
    local f
    # IFS= keeps leading/trailing whitespace in paths intact.
    while IFS= read -r f; do
      echo " $(basename "${f}") [$(du -sh "${f}" | cut -f1)]"
    done <<< "${files}"
  fi
  echo ""
}
|
||||
|
||||
# One listing section per artifact type; labels and patterns are paired by index.
section_labels=("etcd snapshots" "Helm values" "kubeconfig")
section_globs=("etcd-*.snap.age" "helm-values-*.tar.gz.age" "kubeconfig-*.yaml.age")
for idx in "${!section_labels[@]}"; do
  list_type "${section_labels[idx]}" "${section_globs[idx]}"
done
|
||||
|
||||
# Printed restore guide. Nothing here is executed: the operator copies the
# commands. AGE_KEY and BACKUP_DIR are expanded; <ts>/<file> are placeholders.
# The Helm step creates its target directory first, because tar -C fails on a
# directory that does not exist.
echo "============================================"
cat <<EOF

Decrypt any file:
 age -d -i ${AGE_KEY} <file>

Restore kubeconfig:
 age -d -i ${AGE_KEY} ${BACKUP_DIR}/kubeconfig-<ts>.yaml.age > ~/.kube/config-hosteurope

Restore etcd snapshot (WARNING: destroys current cluster state):
 # 1. Decrypt the snapshot
 age -d -i ${AGE_KEY} ${BACKUP_DIR}/etcd-<ts>.snap.age > /tmp/etcd-restore.snap
 # 2. Copy to k3s snapshot directory
 sudo cp /tmp/etcd-restore.snap /var/lib/rancher/k3s/server/db/snapshots/
 # 3. Stop k3s and restore
 sudo systemctl stop k3s
 sudo k3s server --cluster-reset --cluster-reset-restore-path=/var/lib/rancher/k3s/server/db/snapshots/etcd-restore.snap
 sudo systemctl start k3s

Restore Helm values (for re-running helm upgrade after cluster restore):
 mkdir -p /tmp/helm-restore
 age -d -i ${AGE_KEY} ${BACKUP_DIR}/helm-values-<ts>.tar.gz.age | tar -xz -C /tmp/helm-restore/

EOF
|
||||
Reference in New Issue
Block a user