fix(backup): SQLite hot backup instead of etcd snapshot
Some checks failed
railiance-tests / smoke (push) Has been cancelled

k3s runs in SQLite mode (no --cluster-init), so etcd snapshots are not
available. Replace etcd-snapshot with sqlite3 .backup, which takes a
WAL-aware hot copy of state.db. Update the restore guide to match.
The cron job is installed under root's crontab.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-26 21:56:19 +00:00
parent ee6d7b149e
commit 2420915d30
2 changed files with 25 additions and 27 deletions

View File

@@ -13,7 +13,7 @@ source "${ROOT}/lib/railiance-print.sh"
AGE_PUBLIC_KEY="age1aq8twfd78wvpra0had8cezcnj96tj4q0068edrz5jez8d6xwmflqdepsh4"
BACKUP_DIR="/opt/backup/railiance/cluster"
KUBECONFIG_PATH="/etc/rancher/k3s/k3s.yaml"
ETCD_SNAP_DIR="/var/lib/rancher/k3s/server/db/snapshots"
K3S_STATE_DB="/var/lib/rancher/k3s/server/db/state.db"
KEEP=7
TS="$(date -u +%Y%m%dT%H%M%SZ)"
@@ -27,22 +27,18 @@ fi
mkdir -p "${BACKUP_DIR}"
# ── 1. k3s etcd snapshot ───────────────────────────────────────────────────────
if k3s etcd-snapshot ls &>/dev/null; then
ok "etcd" "taking snapshot…"
SNAP_NAME="railiance-${TS}"
k3s etcd-snapshot save --name "${SNAP_NAME}" &>/dev/null
SNAP_FILE="${ETCD_SNAP_DIR}/${SNAP_NAME}"
if [[ ! -f "${SNAP_FILE}" ]]; then
# k3s may append a suffix — find the most recent matching file
SNAP_FILE="$(find "${ETCD_SNAP_DIR}" -name "${SNAP_NAME}*" | sort -r | head -1)"
fi
age -r "${AGE_PUBLIC_KEY}" -o "${BACKUP_DIR}/etcd-${TS}.snap.age" "${SNAP_FILE}"
ok "etcd" "encrypted → etcd-${TS}.snap.age"
# Prune old snapshots from k3s store (keep last KEEP)
k3s etcd-snapshot prune --snapshot-retention "${KEEP}" &>/dev/null || true
# ── 1. k3s state (SQLite hot backup) ──────────────────────────────────────────
# This cluster runs k3s in SQLite mode (no --cluster-init).
# sqlite3 .backup performs a WAL-aware hot copy — no k3s stop required.
# Back up the k3s SQLite datastore only when the state file exists;
# otherwise warn and continue with the remaining backup steps.
if [[ -f "${K3S_STATE_DB}" ]]; then
ok "state-db" "taking hot backup…"
# Stage the copy in a uniquely named temp file under /tmp; only the
# encrypted result is written to BACKUP_DIR.
TMP_STATE="$(mktemp /tmp/k3s-state-XXXXXX.db)"
# Run sqlite3's .backup dot-command against the live database, writing
# the copy into the temp file.
sqlite3 "${K3S_STATE_DB}" ".backup ${TMP_STATE}"
# Encrypt to the AGE_PUBLIC_KEY recipient. The timestamped output name is
# what the "k3s-state-*.db.age" prune pattern matches.
age -r "${AGE_PUBLIC_KEY}" -o "${BACKUP_DIR}/k3s-state-${TS}.db.age" "${TMP_STATE}"
# Remove the unencrypted copy.
# NOTE(review): there is no trap here — if sqlite3 or age fails and the
# script aborts before this line, the plaintext copy stays in /tmp.
# Confirm whether the script runs under `set -e` and consider a cleanup trap.
rm -f "${TMP_STATE}"
ok "state-db" "encrypted → k3s-state-${TS}.db.age"
else
warn "etcd" "k3s etcd not available (SQLite mode?) — skipping snapshot"
warn "state-db" "${K3S_STATE_DB} not found — skipping"
fi
# ── 2. Helm release values ─────────────────────────────────────────────────────
@@ -73,7 +69,7 @@ else
fi
# ── 4. Prune local cache ───────────────────────────────────────────────────────
for pattern in "etcd-*.snap.age" "helm-values-*.tar.gz.age" "kubeconfig-*.yaml.age"; do
for pattern in "k3s-state-*.db.age" "helm-values-*.tar.gz.age" "kubeconfig-*.yaml.age"; do
find "${BACKUP_DIR}" -name "${pattern}" | sort -r | tail -n +$((KEEP + 1)) | xargs -r rm -f
done
ok "prune" "kept last ${KEEP} of each type"

View File

@@ -38,9 +38,9 @@ list_type() {
echo ""
}
list_type "etcd snapshots" "etcd-*.snap.age"
list_type "Helm values" "helm-values-*.tar.gz.age"
list_type "kubeconfig" "kubeconfig-*.yaml.age"
list_type "k3s state (SQLite)" "k3s-state-*.db.age"
list_type "Helm values" "helm-values-*.tar.gz.age"
list_type "kubeconfig" "kubeconfig-*.yaml.age"
echo "============================================"
echo ""
@@ -50,14 +50,16 @@ echo ""
echo "Restore kubeconfig:"
echo " age -d -i ${AGE_KEY} ${BACKUP_DIR}/kubeconfig-<ts>.yaml.age > ~/.kube/config-hosteurope"
echo ""
echo "Restore etcd snapshot (WARNING: destroys current cluster state):"
echo " # 1. Decrypt the snapshot"
echo " age -d -i ${AGE_KEY} ${BACKUP_DIR}/etcd-<ts>.snap.age > /tmp/etcd-restore.snap"
echo " # 2. Copy to k3s snapshot directory"
echo " sudo cp /tmp/etcd-restore.snap /var/lib/rancher/k3s/server/db/snapshots/"
echo " # 3. Stop k3s and restore"
echo "Restore k3s state (SQLite) — WARNING: destroys current cluster state:"
echo " # 1. Decrypt the state db"
echo " age -d -i ${AGE_KEY} ${BACKUP_DIR}/k3s-state-<ts>.db.age > /tmp/k3s-restore.db"
echo " # 2. Stop k3s"
echo " sudo systemctl stop k3s"
echo " sudo k3s server --cluster-reset --cluster-reset-restore-path=/var/lib/rancher/k3s/server/db/snapshots/etcd-restore.snap"
echo " # 3. Replace the state db"
echo " sudo cp /var/lib/rancher/k3s/server/db/state.db /var/lib/rancher/k3s/server/db/state.db.bak"
echo " sudo cp /tmp/k3s-restore.db /var/lib/rancher/k3s/server/db/state.db"
echo " sudo rm -f /var/lib/rancher/k3s/server/db/state.db-shm /var/lib/rancher/k3s/server/db/state.db-wal"
echo " # 4. Start k3s"
echo " sudo systemctl start k3s"
echo ""
echo "Restore Helm values (for re-running helm upgrade after cluster restore):"