From 2420915d3079dc44da927e9db17223c23fab7a66 Mon Sep 17 00:00:00 2001
From: Bernd Worsch
Date: Thu, 26 Mar 2026 21:56:19 +0000
Subject: [PATCH] fix(backup): SQLite hot backup instead of etcd snapshot

k3s runs in SQLite mode (no --cluster-init). Replace etcd-snapshot with
sqlite3 .backup for a WAL-aware hot copy of state.db. Remove the
unencrypted temp copy on failure as well as on success. Update restore
guide to match. Cron installed under root crontab.

Co-Authored-By: Claude Sonnet 4.6
---
Notes (not part of the commit message):
  Hunk 2 of railiance-backup-s2 removes the unencrypted temp copy and any
  partial .age output when sqlite3 or age fails, then exits non-zero.
  The commit id and the blob ids on the "index" lines predate that change;
  regenerate the patch with git format-patch after applying.

 tools/cmd/railiance-backup-s2  | 38 ++++++++++++++++++++++-----------------
 tools/cmd/railiance-restore-s2 | 22 ++++++++++++----------
 2 files changed, 33 insertions(+), 27 deletions(-)

diff --git a/tools/cmd/railiance-backup-s2 b/tools/cmd/railiance-backup-s2
index f05a80c..79fe69e 100755
--- a/tools/cmd/railiance-backup-s2
+++ b/tools/cmd/railiance-backup-s2
@@ -13,7 +13,7 @@ source "${ROOT}/lib/railiance-print.sh"
 AGE_PUBLIC_KEY="age1aq8twfd78wvpra0had8cezcnj96tj4q0068edrz5jez8d6xwmflqdepsh4"
 BACKUP_DIR="/opt/backup/railiance/cluster"
 KUBECONFIG_PATH="/etc/rancher/k3s/k3s.yaml"
-ETCD_SNAP_DIR="/var/lib/rancher/k3s/server/db/snapshots"
+K3S_STATE_DB="/var/lib/rancher/k3s/server/db/state.db"
 KEEP=7
 
 TS="$(date -u +%Y%m%dT%H%M%SZ)"
@@ -27,22 +27,26 @@ fi
 
 mkdir -p "${BACKUP_DIR}"
 
-# ── 1. k3s etcd snapshot ───────────────────────────────────────────────────────
-if k3s etcd-snapshot ls &>/dev/null; then
-  ok "etcd" "taking snapshot…"
-  SNAP_NAME="railiance-${TS}"
-  k3s etcd-snapshot save --name "${SNAP_NAME}" &>/dev/null
-  SNAP_FILE="${ETCD_SNAP_DIR}/${SNAP_NAME}"
-  if [[ ! -f "${SNAP_FILE}" ]]; then
-    # k3s may append a suffix — find the most recent matching file
-    SNAP_FILE="$(find "${ETCD_SNAP_DIR}" -name "${SNAP_NAME}*" | sort -r | head -1)"
-  fi
-  age -r "${AGE_PUBLIC_KEY}" -o "${BACKUP_DIR}/etcd-${TS}.snap.age" "${SNAP_FILE}"
-  ok "etcd" "encrypted → etcd-${TS}.snap.age"
-  # Prune old snapshots from k3s store (keep last KEEP)
-  k3s etcd-snapshot prune --snapshot-retention "${KEEP}" &>/dev/null || true
+# ── 1. k3s state (SQLite hot backup) ──────────────────────────────────────────
+# This cluster runs k3s in SQLite mode (no --cluster-init).
+# sqlite3 .backup performs a WAL-aware hot copy — no k3s stop required.
+if [[ -f "${K3S_STATE_DB}" ]]; then
+  ok "state-db" "taking hot backup…"
+  STATE_OUT="${BACKUP_DIR}/k3s-state-${TS}.db.age"
+  TMP_STATE="$(mktemp /tmp/k3s-state-XXXXXX.db)"
+  # The temp file is the unencrypted copy of the state db: delete it on
+  # failure as well as on success, and never leave a partial .age behind.
+  if sqlite3 "${K3S_STATE_DB}" ".backup '${TMP_STATE}'" \
+    && age -r "${AGE_PUBLIC_KEY}" -o "${STATE_OUT}" "${TMP_STATE}"; then
+    rm -f -- "${TMP_STATE}"
+    ok "state-db" "encrypted → k3s-state-${TS}.db.age"
+  else
+    rm -f -- "${TMP_STATE}" "${STATE_OUT}"
+    warn "state-db" "hot backup failed — temp copy removed, no state backup stored"
+    exit 1
+  fi
 else
-  warn "etcd" "k3s etcd not available (SQLite mode?) — skipping snapshot"
+  warn "state-db" "${K3S_STATE_DB} not found — skipping"
 fi
 
 # ── 2. Helm release values ─────────────────────────────────────────────────────
@@ -73,7 +77,7 @@ else
 fi
 
 # ── 4. Prune local cache ───────────────────────────────────────────────────────
-for pattern in "etcd-*.snap.age" "helm-values-*.tar.gz.age" "kubeconfig-*.yaml.age"; do
+for pattern in "k3s-state-*.db.age" "helm-values-*.tar.gz.age" "kubeconfig-*.yaml.age"; do
   find "${BACKUP_DIR}" -name "${pattern}" | sort -r | tail -n +$((KEEP + 1)) | xargs -r rm -f
 done
 ok "prune" "kept last ${KEEP} of each type"
diff --git a/tools/cmd/railiance-restore-s2 b/tools/cmd/railiance-restore-s2
index d613054..0aa3eb4 100755
--- a/tools/cmd/railiance-restore-s2
+++ b/tools/cmd/railiance-restore-s2
@@ -38,9 +38,9 @@ list_type() {
   echo ""
 }
 
-list_type "etcd snapshots" "etcd-*.snap.age"
-list_type "Helm values"    "helm-values-*.tar.gz.age"
-list_type "kubeconfig"     "kubeconfig-*.yaml.age"
+list_type "k3s state (SQLite)" "k3s-state-*.db.age"
+list_type "Helm values"        "helm-values-*.tar.gz.age"
+list_type "kubeconfig"         "kubeconfig-*.yaml.age"
 
 echo "============================================"
 echo ""
@@ -50,14 +50,16 @@ echo ""
 echo "Restore kubeconfig:"
 echo " age -d -i ${AGE_KEY} ${BACKUP_DIR}/kubeconfig-.yaml.age > ~/.kube/config-hosteurope"
 echo ""
-echo "Restore etcd snapshot (WARNING: destroys current cluster state):"
-echo " # 1. Decrypt the snapshot"
-echo " age -d -i ${AGE_KEY} ${BACKUP_DIR}/etcd-.snap.age > /tmp/etcd-restore.snap"
-echo " # 2. Copy to k3s snapshot directory"
-echo " sudo cp /tmp/etcd-restore.snap /var/lib/rancher/k3s/server/db/snapshots/"
-echo " # 3. Stop k3s and restore"
+echo "Restore k3s state (SQLite) — WARNING: destroys current cluster state:"
+echo " # 1. Decrypt the state db"
+echo " age -d -i ${AGE_KEY} ${BACKUP_DIR}/k3s-state-.db.age > /tmp/k3s-restore.db"
+echo " # 2. Stop k3s"
 echo " sudo systemctl stop k3s"
-echo " sudo k3s server --cluster-reset --cluster-reset-restore-path=/var/lib/rancher/k3s/server/db/snapshots/etcd-restore.snap"
+echo " # 3. Replace the state db"
+echo " sudo cp /var/lib/rancher/k3s/server/db/state.db /var/lib/rancher/k3s/server/db/state.db.bak"
+echo " sudo cp /tmp/k3s-restore.db /var/lib/rancher/k3s/server/db/state.db"
+echo " sudo rm -f /var/lib/rancher/k3s/server/db/state.db-shm /var/lib/rancher/k3s/server/db/state.db-wal"
+echo " # 4. Start k3s"
 echo " sudo systemctl start k3s"
 echo ""
 echo "Restore Helm values (for re-running helm upgrade after cluster restore):"