fix(backup): SQLite hot backup instead of etcd snapshot
Some checks failed
railiance-tests / smoke (push) Has been cancelled
Some checks failed
railiance-tests / smoke (push) Has been cancelled
k3s runs in SQLite mode (no --cluster-init), so etcd snapshots are not available. Replace etcd-snapshot with sqlite3 .backup, which takes a WAL-aware hot copy of state.db. Update the restore guide to match. The cron job is installed under the root crontab. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -13,7 +13,7 @@ source "${ROOT}/lib/railiance-print.sh"
|
||||
AGE_PUBLIC_KEY="age1aq8twfd78wvpra0had8cezcnj96tj4q0068edrz5jez8d6xwmflqdepsh4"
|
||||
BACKUP_DIR="/opt/backup/railiance/cluster"
|
||||
KUBECONFIG_PATH="/etc/rancher/k3s/k3s.yaml"
|
||||
ETCD_SNAP_DIR="/var/lib/rancher/k3s/server/db/snapshots"
|
||||
K3S_STATE_DB="/var/lib/rancher/k3s/server/db/state.db"
|
||||
KEEP=7
|
||||
TS="$(date -u +%Y%m%dT%H%M%SZ)"
|
||||
|
||||
@@ -27,22 +27,18 @@ fi
|
||||
|
||||
mkdir -p "${BACKUP_DIR}"
|
||||
|
||||
# ── 1. k3s etcd snapshot ───────────────────────────────────────────────────────
|
||||
if k3s etcd-snapshot ls &>/dev/null; then
|
||||
ok "etcd" "taking snapshot…"
|
||||
SNAP_NAME="railiance-${TS}"
|
||||
k3s etcd-snapshot save --name "${SNAP_NAME}" &>/dev/null
|
||||
SNAP_FILE="${ETCD_SNAP_DIR}/${SNAP_NAME}"
|
||||
if [[ ! -f "${SNAP_FILE}" ]]; then
|
||||
# k3s may append a suffix — find the most recent matching file
|
||||
SNAP_FILE="$(find "${ETCD_SNAP_DIR}" -name "${SNAP_NAME}*" | sort -r | head -1)"
|
||||
fi
|
||||
age -r "${AGE_PUBLIC_KEY}" -o "${BACKUP_DIR}/etcd-${TS}.snap.age" "${SNAP_FILE}"
|
||||
ok "etcd" "encrypted → etcd-${TS}.snap.age"
|
||||
# Prune old snapshots from k3s store (keep last KEEP)
|
||||
k3s etcd-snapshot prune --snapshot-retention "${KEEP}" &>/dev/null || true
|
||||
# ── 1. k3s state (SQLite hot backup) ──────────────────────────────────────────
|
||||
# This cluster runs k3s in SQLite mode (no --cluster-init).
|
||||
# sqlite3 .backup performs a WAL-aware hot copy — no k3s stop required.
|
||||
if [[ -f "${K3S_STATE_DB}" ]]; then
|
||||
ok "state-db" "taking hot backup…"
|
||||
TMP_STATE="$(mktemp /tmp/k3s-state-XXXXXX.db)"
|
||||
sqlite3 "${K3S_STATE_DB}" ".backup ${TMP_STATE}"
|
||||
age -r "${AGE_PUBLIC_KEY}" -o "${BACKUP_DIR}/k3s-state-${TS}.db.age" "${TMP_STATE}"
|
||||
rm -f "${TMP_STATE}"
|
||||
ok "state-db" "encrypted → k3s-state-${TS}.db.age"
|
||||
else
|
||||
warn "etcd" "k3s etcd not available (SQLite mode?) — skipping snapshot"
|
||||
warn "state-db" "${K3S_STATE_DB} not found — skipping"
|
||||
fi
|
||||
|
||||
# ── 2. Helm release values ─────────────────────────────────────────────────────
|
||||
@@ -73,7 +69,7 @@ else
|
||||
fi
|
||||
|
||||
# ── 4. Prune local cache ───────────────────────────────────────────────────────
|
||||
for pattern in "etcd-*.snap.age" "helm-values-*.tar.gz.age" "kubeconfig-*.yaml.age"; do
|
||||
for pattern in "k3s-state-*.db.age" "helm-values-*.tar.gz.age" "kubeconfig-*.yaml.age"; do
|
||||
find "${BACKUP_DIR}" -name "${pattern}" | sort -r | tail -n +$((KEEP + 1)) | xargs -r rm -f
|
||||
done
|
||||
ok "prune" "kept last ${KEEP} of each type"
|
||||
|
||||
@@ -38,9 +38,9 @@ list_type() {
|
||||
echo ""
|
||||
}
|
||||
|
||||
list_type "etcd snapshots" "etcd-*.snap.age"
|
||||
list_type "Helm values" "helm-values-*.tar.gz.age"
|
||||
list_type "kubeconfig" "kubeconfig-*.yaml.age"
|
||||
list_type "k3s state (SQLite)" "k3s-state-*.db.age"
|
||||
list_type "Helm values" "helm-values-*.tar.gz.age"
|
||||
list_type "kubeconfig" "kubeconfig-*.yaml.age"
|
||||
|
||||
echo "============================================"
|
||||
echo ""
|
||||
@@ -50,14 +50,16 @@ echo ""
|
||||
echo "Restore kubeconfig:"
|
||||
echo " age -d -i ${AGE_KEY} ${BACKUP_DIR}/kubeconfig-<ts>.yaml.age > ~/.kube/config-hosteurope"
|
||||
echo ""
|
||||
echo "Restore etcd snapshot (WARNING: destroys current cluster state):"
|
||||
echo " # 1. Decrypt the snapshot"
|
||||
echo " age -d -i ${AGE_KEY} ${BACKUP_DIR}/etcd-<ts>.snap.age > /tmp/etcd-restore.snap"
|
||||
echo " # 2. Copy to k3s snapshot directory"
|
||||
echo " sudo cp /tmp/etcd-restore.snap /var/lib/rancher/k3s/server/db/snapshots/"
|
||||
echo " # 3. Stop k3s and restore"
|
||||
echo "Restore k3s state (SQLite) — WARNING: destroys current cluster state:"
|
||||
echo " # 1. Decrypt the state db"
|
||||
echo " age -d -i ${AGE_KEY} ${BACKUP_DIR}/k3s-state-<ts>.db.age > /tmp/k3s-restore.db"
|
||||
echo " # 2. Stop k3s"
|
||||
echo " sudo systemctl stop k3s"
|
||||
echo " sudo k3s server --cluster-reset --cluster-reset-restore-path=/var/lib/rancher/k3s/server/db/snapshots/etcd-restore.snap"
|
||||
echo " # 3. Replace the state db"
|
||||
echo " sudo cp /var/lib/rancher/k3s/server/db/state.db /var/lib/rancher/k3s/server/db/state.db.bak"
|
||||
echo " sudo cp /tmp/k3s-restore.db /var/lib/rancher/k3s/server/db/state.db"
|
||||
echo " sudo rm -f /var/lib/rancher/k3s/server/db/state.db-shm /var/lib/rancher/k3s/server/db/state.db-wal"
|
||||
echo " # 4. Start k3s"
|
||||
echo " sudo systemctl start k3s"
|
||||
echo ""
|
||||
echo "Restore Helm values (for re-running helm upgrade after cluster restore):"
|
||||
|
||||
Reference in New Issue
Block a user