From 310c8199c40c9e3e5c924eea0a6fd27e7ad2616c Mon Sep 17 00:00:00 2001
From: Ngo Vu Minh Dat
Date: Sat, 20 Jun 2026 17:11:59 +0700
Subject: [PATCH 1/4] targetRevision for postgresql recovery

---
 disaster-recovery/dr-drill/config.sh | 98 +++++++++++++
 disaster-recovery/dr-drill/lib/log.sh | 21 +++
 disaster-recovery/dr-drill/lib/preflight.sh | 39 +++++
 disaster-recovery/dr-drill/readme.md | 93 ++++++++++++
 disaster-recovery/dr-drill/run.sh | 91 ++++++++++++
 .../scripts/00-create-doks-cluster.sh | 32 +++++
 .../dr-drill/scripts/01-install-argocd.sh | 35 +++++
 .../dr-drill/scripts/02-install-app.sh | 122 ++++++++++++++++
 .../dr-drill/scripts/03-recover-pg.sh | 63 ++++++++
 .../dr-drill/scripts/04-validate.sh | 56 ++++++++
 .../scripts/05-validate-vaultwarden.sh | 134 ++++++++++++++++++
 .../dr-drill/scripts/99-destroy.sh | 99 +++++++++++++
 .../postgresql/raw-manifests/instance.yaml | 134 ++++++++++--------
 13 files changed, 955 insertions(+), 62 deletions(-)
 create mode 100755 disaster-recovery/dr-drill/config.sh
 create mode 100644 disaster-recovery/dr-drill/lib/log.sh
 create mode 100644 disaster-recovery/dr-drill/lib/preflight.sh
 create mode 100644 disaster-recovery/dr-drill/readme.md
 create mode 100644 disaster-recovery/dr-drill/run.sh
 create mode 100755 disaster-recovery/dr-drill/scripts/00-create-doks-cluster.sh
 create mode 100755 disaster-recovery/dr-drill/scripts/01-install-argocd.sh
 create mode 100755 disaster-recovery/dr-drill/scripts/02-install-app.sh
 create mode 100755 disaster-recovery/dr-drill/scripts/03-recover-pg.sh
 create mode 100644 disaster-recovery/dr-drill/scripts/04-validate.sh
 create mode 100644 disaster-recovery/dr-drill/scripts/05-validate-vaultwarden.sh
 create mode 100755 disaster-recovery/dr-drill/scripts/99-destroy.sh

diff --git a/disaster-recovery/dr-drill/config.sh b/disaster-recovery/dr-drill/config.sh
new file mode 100755
index 0000000..31addf7
--- /dev/null
+++ b/disaster-recovery/dr-drill/config.sh
@@ -0,0 +1,98 @@
+#!/usr/bin/env bash
+# config.sh — central configuration for the PostgreSQL DR drill.
+# Sourced by run.sh and every script under scripts/.
+# All values can be overridden by exporting them before invoking run.sh.
+
+# ---------------------------------------------------------------------------
+# Run identity
+# ---------------------------------------------------------------------------
+# RUN_ID makes the cluster name unique per run. In GHA pass github.run_id;
+# locally it falls back to a UTC timestamp.
+: "${RUN_ID:=$(date -u +%Y%m%d%H%M%S)}"
+
+: "${CLUSTER_NAME:=pg-backup-test-${RUN_ID}}"
+
+# ---------------------------------------------------------------------------
+# DigitalOcean / DOKS
+# ---------------------------------------------------------------------------
+: "${REGION:=nyc3}"
+: "${NODE_SIZE:=s-4vcpu-8gb}"
+: "${NODE_COUNT:=2}"
+
+# DIGITALOCEAN_TOKEN must be exported by the caller (GHA secret or local env).
+# doctl auth is assumed to be already initialised by the CI dependency step,
+# but we re-init defensively if a token is present (see lib/preflight.sh).
+
+# ---------------------------------------------------------------------------
+# Kubernetes / app config
+# ---------------------------------------------------------------------------
+: "${NAMESPACE:=prod-postgresql}"
+: "${RECOVERY_TAG:=postgresql-first-recovery-test}"
+: "${ARGOCD_CHART_VERSION:=9.5.22}"
+
+# Repo-relative path to the app-of-app helm chart.
+# Resolved against REPO_ROOT (computed in run.sh).
+: "${APP_OF_APP_CHART:=kubernetes/argocd/app-of-app}"
+
+# ---------------------------------------------------------------------------
+# Timeouts (duration strings with a unit suffix) and health-poll settings
+# ---------------------------------------------------------------------------
+: "${ARGOCD_ROLLOUT_TIMEOUT:=120s}"
+: "${HELM_INSTALL_TIMEOUT:=3m}"
+: "${CLUSTER_HEALTHY_ATTEMPTS:=90}" # x CLUSTER_HEALTHY_INTERVAL (10s default) => 15 min
+: "${CLUSTER_HEALTHY_INTERVAL:=10}"
+
+# ---------------------------------------------------------------------------
+# JuiceFS (read-only DR mount)
+# ---------------------------------------------------------------------------
+: "${JUICEFS_ENABLED:=true}"
+: "${JUICEFS_READONLY:=true}" # injects the `ro` mount option
+: "${JUICEFS_NAMESPACE:=juicefs}"
+: "${JUICEFS_SECRET_NAME:=cloudflare-r2}"
+: "${JUICEFS_VOLUME_NAME:=cloudflare-r2-prod}"
+: "${JUICEFS_BUCKET:=https://4c8ad4e9fa8213af3fd284bb97b68b5e.r2.cloudflarestorage.com/juicefs-prod}"
+# Assigned with a plain conditional: the JSON braces collide with ${VAR:=...}.
+if [ -z "${JUICEFS_ENVS:-}" ]; then
+  JUICEFS_ENVS='{"JFS_MOUNT_TIMEOUT": 300}'
+fi
+
+# Metadata engine (the restored CNPG cluster holding the juicefs_prod DB).
+# The rw service for a CNPG cluster named "postgresql" is "postgresql-rw".
+: "${JUICEFS_META_USER:=juicefs}"
+: "${JUICEFS_META_HOST:=postgresql-rw.${NAMESPACE}.svc}"
+: "${JUICEFS_META_DB:=juicefs_prod}"
+# JUICEFS_META_PASSWORD must be exported (the juicefs DB role password from the
+# restored cluster). If you'd rather supply the whole URL, set JUICEFS_METAURL.
+: "${JUICEFS_METAURL:=postgres://${JUICEFS_META_USER}:${JUICEFS_META_PASSWORD:-}@${JUICEFS_META_HOST}:5432/${JUICEFS_META_DB}?sslmode=disable}"
+
+# Label selector for CSI node/controller readiness.
+: "${JUICEFS_CSI_SELECTOR:=app.kubernetes.io/name=juicefs-csi-driver}"
+
+# ---------------------------------------------------------------------------
+# Vaultwarden (validated against the read-only JuiceFS mount)
+# ---------------------------------------------------------------------------
+: "${VAULTWARDEN_NAMESPACE:=vaultwarden}"
+: "${VAULTWARDEN_DEPLOYMENT:=vaultwarden}"
+: "${VAULTWARDEN_SERVICE:=vaultwarden}"
+: "${VAULTWARDEN_DATA_PATH:=/data}" # JuiceFS-backed data dir in the pod
+: "${VAULTWARDEN_LOCAL_PORT:=8080}"
+: "${VAULTWARDEN_ROLLOUT_TIMEOUT:=300s}"
+# Optional: set VW_ADMIN_TOKEN to additionally assert restored user count.
+
+# ---------------------------------------------------------------------------
+# Validation expectations
+# ---------------------------------------------------------------------------
+# Space-separated list of databases that must exist after recovery.
+: "${EXPECTED_DBS:=sonarqube}"
+
+# ---------------------------------------------------------------------------
+# Secrets (required for the secrets step). Exported by caller.
+# ---------------------------------------------------------------------------
+# R2_ACCESS_KEY
+# R2_SECRET_KEY
+
+# ---------------------------------------------------------------------------
+# Behaviour flags
+# ---------------------------------------------------------------------------
+# Set SKIP_DESTROY=1 to leave the cluster running for inspection after a run.
+: "${SKIP_DESTROY:=0}"
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/lib/log.sh b/disaster-recovery/dr-drill/lib/log.sh
new file mode 100644
index 0000000..a31af38
--- /dev/null
+++ b/disaster-recovery/dr-drill/lib/log.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# lib/log.sh — minimal structured logging helpers.
+
+# Colours only when stdout is a TTY (keeps CI logs clean).
+if [ -t 1 ]; then
+  _C_RESET=$'\033[0m'; _C_BLUE=$'\033[34m'; _C_GREEN=$'\033[32m'
+  _C_YELLOW=$'\033[33m'; _C_RED=$'\033[31m'; _C_DIM=$'\033[2m'
+else
+  _C_RESET=''; _C_BLUE=''; _C_GREEN=''; _C_YELLOW=''; _C_RED=''; _C_DIM=''
+fi
+
+_ts() { date -u +'%H:%M:%S'; }
+
+log_info() { printf '%s%s%s %s\n' "$_C_DIM" "$(_ts)" "$_C_RESET" "$*"; }
+log_step() { printf '\n%s==> %s%s\n' "$_C_BLUE" "$*" "$_C_RESET"; }
+log_ok() { printf '%s[OK]%s %s\n' "$_C_GREEN" "$_C_RESET" "$*"; }
+log_warn() { printf '%s[WARN]%s %s\n' "$_C_YELLOW" "$_C_RESET" "$*" >&2; }
+log_error() { printf '%s[ERROR]%s %s\n' "$_C_RED" "$_C_RESET" "$*" >&2; }
+
+# die — log the message via log_error (stderr) and exit with status 1.
+die() { log_error "$*"; exit 1; }
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/lib/preflight.sh b/disaster-recovery/dr-drill/lib/preflight.sh
new file mode 100644
index 0000000..d1d7579
--- /dev/null
+++ b/disaster-recovery/dr-drill/lib/preflight.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# lib/preflight.sh — verify required tooling and environment before doing work.
+
+# require_cmd CMD... — log every command missing from PATH, then die if any was missing.
+require_cmd() {
+  local missing=0 c
+  for c in "$@"; do
+    if ! command -v "$c" >/dev/null 2>&1; then
+      log_error "required command not found: $c"
+      missing=1
+    fi
+  done
+  [ "$missing" -eq 0 ] || die "install missing CLIs (these are provided by the CI dependency step)"
+}
+
+# require_env VAR... — log every named variable that is unset or empty, then die if any was.
+require_env() {
+  local missing=0 v
+  for v in "$@"; do
+    if [ -z "${!v:-}" ]; then
+      log_error "required environment variable not set: $v"
+      missing=1
+    fi
+  done
+  [ "$missing" -eq 0 ] || die "export the missing variables (pass GHA secrets through env)"
+}
+
+# ensure_doctl_auth — re-init doctl auth if a token is present and not yet valid.
+ensure_doctl_auth() {
+  if doctl account get >/dev/null 2>&1; then
+    return 0
+  fi
+  if [ -n "${DIGITALOCEAN_TOKEN:-}" ]; then
+    log_info "initialising doctl auth from DIGITALOCEAN_TOKEN"
+    doctl auth init --access-token "$DIGITALOCEAN_TOKEN" >/dev/null
+  else
+    die "doctl is not authenticated and DIGITALOCEAN_TOKEN is unset"
+  fi
+}
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/readme.md b/disaster-recovery/dr-drill/readme.md
new file mode 100644
index 0000000..dfbfd85
--- /dev/null
+++ b/disaster-recovery/dr-drill/readme.md
@@ -0,0 +1,93 @@
+# PostgreSQL DR Drill
+
+Automated disaster-recovery test for the homelab PostgreSQL (CNPG) cluster.
+Provisions a throwaway DOKS cluster, restores PostgreSQL from R2 WAL archives
+via ArgoCD + CNPG bootstrap, validates the restored data, then tears everything
+down.
+
+All logic lives in bash so it runs **identically locally and in CI**. GitHub
+Actions only installs the CLIs and calls `run.sh`.
+
+## Layout
+
+```
+dr-drill/
+├── config.sh                      # all env vars + defaults (override by exporting)
+├── run.sh                         # orchestrator; guaranteed cleanup via EXIT trap
+├── lib/
+│   ├── log.sh                     # logging helpers
+│   └── preflight.sh               # CLI/env checks, doctl auth
+├── scripts/
+│   ├── 00-create-doks-cluster.sh  # provision DOKS + kubeconfig
+│   ├── 01-install-argocd.sh       # helm install ArgoCD, wait ready
+│   ├── 02-install-app.sh          # juicefs secret (pre-install), app-of-app,
+│   │                              # cert-manager, secrets, juicefs sync
+│   ├── 03-recover-pg.sh           # set recovery tag, sync, wait healthy
+│   ├── 04-validate.sh             # verify restored databases/tables
+│   ├── 05-validate-vaultwarden.sh # deploy + validate VW on read-only JuiceFS
+│   └── 99-destroy.sh              # cluster + orphaned DO volumes/LBs
+├── juicefs-application.yaml       # patched ArgoCD Application (conditional `ro`)
+└── .github/workflows/dr-drill.yml
+```
+
+Each `scripts/*.sh` is independently runnable (`./scripts/05-validate-vaultwarden.sh`)
+**and** sourceable by `run.sh`. The functions don't execute on source — only
+when the file is run directly (the `BASH_SOURCE` guard at the bottom).
+
+## Drill flow (run.sh)
+
+```
+create_cluster → install_argocd → wait_for_argocd
+create_juicefs_secret # BEFORE install: StorageClass references it
+install_app_of_app # juicefs.enabled=true, juicefs.readOnly=true
+sync_cert_manager → create_secrets
+recover_postgres → validate_data
+sync_juicefs # AFTER postgres: mount pod needs the metaurl DB
+sync_vaultwarden → validate_vaultwarden
+(EXIT trap) → destroy
+```
+
+## Usage
+
+```bash
+# full drill
+export DIGITALOCEAN_TOKEN=... R2_ACCESS_KEY=... R2_SECRET_KEY=...
+export JUICEFS_META_PASSWORD=... # juicefs DB role pw in the restored cluster
+export VW_ADMIN_TOKEN=... # optional: enables user-count assertion
+./run.sh
+
+# keep the cluster up for inspection (skips teardown)
+./run.sh --skip-destroy
+
+# clean up a leaked run later
+CLUSTER_NAME=pg-backup-test-123 ./run.sh destroy
+```
+
+## JuiceFS read-only protection
+
+The drill mounts JuiceFS **read-only** (`ro` injected into StorageClass
+`mountOptions`). This does two things at once:
+
+1. Writes to the prod R2 prefix are rejected — no split-brain corruption.
+2. Background GC / trash cleanup is disabled — the DR mount can never delete
+   production objects.
+
+`05-validate-vaultwarden.sh` proves this by exec-ing into the Vaultwarden pod
+and asserting a write into the data dir fails with `Read-only file system`.
+
+**Apply the patched Application:** replace your existing JuiceFS Application
+template with `juicefs-application.yaml` (it adds the conditional `ro` block
+gated on `.Values.juicefs.readOnly`). For normal homelab/prod sync, leave
+`juicefs.readOnly` unset/false so the mount stays writable.
+
+## Notes
+
+- `run.sh` registers an `EXIT INT TERM` trap, so the cluster is destroyed even
+  on failure or Ctrl-C — the equivalent of GHA's `if: always()`.
+- `99-destroy.sh` deliberately omits `set -e`: cleanup must continue past
+  individual failures.
+- The PG app revision override targets `--source-position 1` (the Git source).
+  Position 2 is the Helm chart source and requires a SemVer constraint, not a
+  Git tag.
+- Required CLIs: `doctl`, `kubectl`, `helm`, `argocd`, `jq`, `curl`
+  (checked by `require_cmd` in `run.sh`).
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/run.sh b/disaster-recovery/dr-drill/run.sh
new file mode 100644
index 0000000..882f57e
--- /dev/null
+++ b/disaster-recovery/dr-drill/run.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+# run.sh — orchestrate the full PostgreSQL DR drill.
+#
+# Usage:
+#   ./run.sh                 # full drill: create -> recover -> validate -> destroy
+#   ./run.sh --skip-destroy  # leave the cluster up for inspection
+#   ./run.sh destroy         # destroy only (e.g. to clean up a leaked run)
+#
+# Required env (typically GHA secrets passed through):
+#   DIGITALOCEAN_TOKEN, R2_ACCESS_KEY, R2_SECRET_KEY
+#
+# Optional overrides: see config.sh (REGION, NODE_SIZE, RUN_ID, etc.)
+set -euo pipefail
+
+# DRILL_DIR: directory holding this script, config.sh, lib/ and scripts/.
+DRILL_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+# --- load config + libs ----------------------------------------------------
+source "$DRILL_DIR/config.sh"
+source "$DRILL_DIR/lib/log.sh"
+source "$DRILL_DIR/lib/preflight.sh"
+
+# --- resolve REPO_ROOT -----------------------------------------------------
+# APP_OF_APP_CHART is repo-relative while this script lives in a subdirectory,
+# so walk up from DRILL_DIR to the nearest ancestor that contains the chart.
+# Falls back to DRILL_DIR (the previous behaviour) when no ancestor matches.
+REPO_ROOT="$DRILL_DIR"
+while [ ! -d "$REPO_ROOT/$APP_OF_APP_CHART" ] && [ "$REPO_ROOT" != "/" ]; do
+  REPO_ROOT="$(dirname "$REPO_ROOT")"
+done
+[ -d "$REPO_ROOT/$APP_OF_APP_CHART" ] || REPO_ROOT="$DRILL_DIR"
+export REPO_ROOT
+
+# --- load step functions ---------------------------------------------------
+source "$DRILL_DIR/scripts/00-create-doks-cluster.sh"
+source "$DRILL_DIR/scripts/01-install-argocd.sh"
+source "$DRILL_DIR/scripts/02-install-app.sh"
+source "$DRILL_DIR/scripts/03-recover-pg.sh"
+source "$DRILL_DIR/scripts/04-validate.sh"
+source "$DRILL_DIR/scripts/05-validate-vaultwarden.sh"
+source "$DRILL_DIR/scripts/99-destroy.sh"
+
+# --- argument parsing ------------------------------------------------------
+DESTROY_ONLY=0
+for arg in "$@"; do
+  case "$arg" in
+    --skip-destroy) SKIP_DESTROY=1 ;;
+    destroy) DESTROY_ONLY=1 ;;
+    *) die "unknown argument: $arg" ;;
+  esac
+done
+
+# --- cleanup trap: the `if: always()` equivalent ---------------------------
+# Runs destroy on ANY exit (success, failure, or interrupt) unless skipped.
+cleanup() {
+  local rc=$?
+  # Disarm first: the `exit` below would otherwise re-fire the EXIT trap after
+  # an INT/TERM-triggered run and attempt the teardown a second time.
+  trap - EXIT INT TERM
+  if [ "$SKIP_DESTROY" = "1" ]; then
+    log_warn "SKIP_DESTROY set — leaving cluster '$CLUSTER_NAME' running"
+    log_warn "clean up later with: ./run.sh destroy (CLUSTER_NAME=$CLUSTER_NAME)"
+  else
+    log_step "Cleanup (exit code: $rc)"
+    destroy_cluster || log_warn "destroy encountered errors (best-effort)"
+  fi
+  exit "$rc"
+}
+
+main() {
+  require_cmd doctl kubectl helm argocd jq curl
+  require_env DIGITALOCEAN_TOKEN
+  ensure_doctl_auth
+
+  if [ "$DESTROY_ONLY" = "1" ]; then
+    destroy_cluster
+    return 0
+  fi
+
+  # Register cleanup only for the full drill path.
+  trap cleanup EXIT INT TERM
+
+  create_cluster
+  install_argocd
+  wait_for_argocd
+
+  # JuiceFS secret must exist before the StorageClass (existingSecret) is created.
+  create_juicefs_secret
+  install_app_of_app
+  sync_cert_manager
+  create_secrets
+
+  # PostgreSQL first — JuiceFS metadata + Vaultwarden DB both live here.
+  recover_postgres
+  validate_data
+
+  # Data layer: JuiceFS (read-only) then Vaultwarden on top of it.
+  sync_juicefs
+  sync_vaultwarden
+  validate_vaultwarden
+
+  log_ok "DR drill succeeded for cluster $CLUSTER_NAME"
+}
+
+main
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/scripts/00-create-doks-cluster.sh b/disaster-recovery/dr-drill/scripts/00-create-doks-cluster.sh
new file mode 100755
index 0000000..d3f5af4
--- /dev/null
+++ b/disaster-recovery/dr-drill/scripts/00-create-doks-cluster.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# scripts/00-create-doks-cluster.sh — provision the DOKS cluster and load kubeconfig.
+set -euo pipefail
+
+create_cluster() {
+  log_step "Creating DOKS cluster: $CLUSTER_NAME ($REGION, ${NODE_COUNT}x $NODE_SIZE)"
+  doctl kubernetes cluster create "$CLUSTER_NAME" \
+    --region "$REGION" \
+    --size "$NODE_SIZE" \
+    --count "$NODE_COUNT" \
+    --wait
+  log_ok "Cluster created"
+
+  log_step "Saving kubeconfig"
+  doctl kubernetes cluster kubeconfig save "$CLUSTER_NAME"
+  kubectl config set-context --current --namespace=argocd
+  log_ok "kubeconfig loaded, default namespace set to argocd"
+}
+
+# Allow running standalone: ./scripts/00-create-doks-cluster.sh
+if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
+  HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+  # shellcheck source=../config.sh
+  source "$HERE/config.sh"
+  # shellcheck source=../lib/log.sh
+  source "$HERE/lib/log.sh"
+  # shellcheck source=../lib/preflight.sh
+  source "$HERE/lib/preflight.sh"
+  require_cmd doctl kubectl
+  ensure_doctl_auth
+  create_cluster
+fi
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/scripts/01-install-argocd.sh b/disaster-recovery/dr-drill/scripts/01-install-argocd.sh
new file mode 100755
index 0000000..42150f5
--- /dev/null
+++ b/disaster-recovery/dr-drill/scripts/01-install-argocd.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# scripts/01-install-argocd.sh — install ArgoCD via Helm and wait for it to be ready.
+set -euo pipefail
+
+install_argocd() {
+  log_step "Installing ArgoCD (chart $ARGOCD_CHART_VERSION)"
+  helm repo add argo https://argoproj.github.io/argo-helm >/dev/null 2>&1 || true
+  helm repo update argo >/dev/null
+
+  helm upgrade --install argocd argo/argo-cd \
+    --version "$ARGOCD_CHART_VERSION" \
+    --namespace argocd \
+    --create-namespace \
+    --debug \
+    --set 'configs.params.server\.insecure=true'
+  log_ok "ArgoCD installed"
+}
+
+wait_for_argocd() {
+  log_step "Waiting for ArgoCD core deployments"
+  local d
+  for d in argocd-server argocd-repo-server argocd-applicationset-controller; do
+    log_info "rollout: $d"
+    kubectl rollout status "deployment/$d" -n argocd --timeout="$ARGOCD_ROLLOUT_TIMEOUT"
+  done
+  log_ok "ArgoCD is ready"
+}
+
+if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
+  HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+  source "$HERE/config.sh"; source "$HERE/lib/log.sh"; source "$HERE/lib/preflight.sh"
+  require_cmd helm kubectl
+  install_argocd
+  wait_for_argocd
+fi
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/scripts/02-install-app.sh b/disaster-recovery/dr-drill/scripts/02-install-app.sh
new file mode 100755
index 0000000..89d51fc
--- /dev/null
+++ b/disaster-recovery/dr-drill/scripts/02-install-app.sh
@@ -0,0 +1,122 @@
+#!/usr/bin/env bash
+# scripts/02-install-app.sh — JuiceFS secret, app-of-app, cert-manager, secrets.
+set -euo pipefail
+
+# create_juicefs_secret — MUST run before install_app_of_app, because the
+# JuiceFS StorageClass references existingSecret: $JUICEFS_SECRET_NAME.
+# Field names match the wener/juicefs-csi-driver existingSecret schema.
+create_juicefs_secret() {
+  [ "$JUICEFS_ENABLED" = "true" ] || { log_info "JuiceFS disabled — skipping secret"; return 0; }
+
+  log_step "Creating JuiceFS namespace and credentials secret"
+  require_env R2_ACCESS_KEY R2_SECRET_KEY
+
+  if [ -z "${JUICEFS_META_PASSWORD:-}" ] && [[ "$JUICEFS_METAURL" == *"//${JUICEFS_META_USER}:@"* ]]; then
+    log_warn "JUICEFS_META_PASSWORD is empty — metaurl will have no password"
+  fi
+
+  kubectl create namespace "$JUICEFS_NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -
+
+  kubectl create secret generic "$JUICEFS_SECRET_NAME" \
+    -n "$JUICEFS_NAMESPACE" \
+    --from-literal=name="$JUICEFS_VOLUME_NAME" \
+    --from-literal=metaurl="$JUICEFS_METAURL" \
+    --from-literal=storage="s3" \
+    --from-literal=bucket="$JUICEFS_BUCKET" \
+    --from-literal=accessKey="$R2_ACCESS_KEY" \
+    --from-literal=secretKey="$R2_SECRET_KEY" \
+    --from-literal=envs="$JUICEFS_ENVS" \
+    --dry-run=client -o yaml | kubectl apply -f -
+
+  log_ok "JuiceFS secret '$JUICEFS_SECRET_NAME' applied in '$JUICEFS_NAMESPACE'"
+}
+
+install_app_of_app() {
+  log_step "Installing app-of-app chart (juicefs.enabled=$JUICEFS_ENABLED, readOnly=$JUICEFS_READONLY)"
+  helm upgrade --install app-of-app "$REPO_ROOT/$APP_OF_APP_CHART" \
+    --namespace argocd \
+    --set metallb.enabled=false \
+    --set traefik.enabled=true \
+    --set openebs.enabled=false \
+    --set postgresql.enabled=true \
+    --set certManager.enabled=true \
+    --set kubePrometheusStack.enabled=true \
+    --set customManifest.enabled=false \
+    --set loki.enabled=false \
+    --set alloy.enabled=false \
+    --set pgadmin4.enabled=false \
+    --set sonarqube.enabled=false \
+    --set harbor.enabled=false \
+    --set velero.enabled=false \
+    --set mongoOperator.enabled=false \
+    --set kafkaOperator.enabled=false \
+    --set juicefs.enabled="$JUICEFS_ENABLED" \
+    --set juicefs.readOnly="$JUICEFS_READONLY" \
+    --set vaultwarden.enabled=true
+  log_ok "app-of-app installed"
+}
+
+sync_cert_manager() {
+  log_step "Syncing cert-manager"
+  argocd app sync cert-manager --core \
+    --retry-limit 5 \
+    --retry-backoff-duration 10s \
+    --retry-backoff-max-duration 3m \
+    --retry-backoff-factor 2
+  log_ok "cert-manager synced"
+}
+
+create_secrets() {
+  log_step "Creating namespace and secrets in $NAMESPACE"
+  require_env R2_ACCESS_KEY R2_SECRET_KEY
+
+  kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -
+
+  kubectl create secret generic cloudflare-r2 \
+    -n "$NAMESPACE" \
+    --from-literal=ACCESS_KEY="$R2_ACCESS_KEY" \
+    --from-literal=SECRET_KEY="$R2_SECRET_KEY" \
+    --dry-run=client -o yaml | kubectl apply -f -
+
+  kubectl create secret generic postgres-admin \
+    -n "$NAMESPACE" \
+    --from-literal=username=postgres \
+    --from-literal=password=backup-test-dummy \
+    --dry-run=client -o yaml | kubectl apply -f -
+
+  log_ok "Secrets applied"
+}
+
+# sync_juicefs — call AFTER PostgreSQL is healthy (the mount pod needs the
+# metaurl DB reachable). Syncs the Application and waits for CSI readiness.
+sync_juicefs() {
+  [ "$JUICEFS_ENABLED" = "true" ] || { log_info "JuiceFS disabled — skipping sync"; return 0; }
+
+  log_step "Syncing juicefs Application"
+  argocd app sync juicefs --core \
+    --retry-limit 5 \
+    --retry-backoff-duration 10s \
+    --retry-backoff-max-duration 3m \
+    --retry-backoff-factor 2
+
+  log_step "Waiting for JuiceFS CSI components"
+  # Controller is a StatefulSet, node service a DaemonSet.
+  kubectl rollout status statefulset \
+    -n "$JUICEFS_NAMESPACE" -l "$JUICEFS_CSI_SELECTOR" --timeout=180s 2>/dev/null \
+    || log_warn "could not confirm CSI controller rollout (continuing)"
+  kubectl rollout status daemonset \
+    -n "$JUICEFS_NAMESPACE" -l "$JUICEFS_CSI_SELECTOR" --timeout=180s 2>/dev/null \
+    || log_warn "could not confirm CSI node rollout (continuing)"
+  log_ok "JuiceFS synced"
+}
+
+if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
+  HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+  REPO_ROOT="${REPO_ROOT:-$HERE}"
+  source "$HERE/config.sh"; source "$HERE/lib/log.sh"; source "$HERE/lib/preflight.sh"
+  require_cmd helm kubectl argocd
+  create_juicefs_secret
+  install_app_of_app
+  sync_cert_manager
+  create_secrets
+fi
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/scripts/03-recover-pg.sh b/disaster-recovery/dr-drill/scripts/03-recover-pg.sh
new file mode 100755
index 0000000..249b197
--- /dev/null
+++ b/disaster-recovery/dr-drill/scripts/03-recover-pg.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+# scripts/03-recover-pg.sh — point PG app at the recovery tag, sync, wait healthy.
+set -euo pipefail
+
+# override_revision — pin the app's Git source to $RECOVERY_TAG.
+# Per the drill readme, source position 1 is the Git source; position 2 is the
+# Helm chart source, which needs a SemVer constraint and rejects a Git tag.
+override_revision() {
+  log_step "Overriding prod-postgresql revision -> $RECOVERY_TAG"
+  argocd app set prod-postgresql \
+    --core \
+    --source-position 1 \
+    --revision "$RECOVERY_TAG"
+  log_ok "Revision set"
+}
+
+sync_postgres_app() {
+  log_step "Syncing prod-postgresql (app)"
+  # Non-fatal: both syncs only warn on failure; wait_for_healthy is the real gate.
+  argocd app sync prod-postgresql --core \
+    --retry-limit 3 \
+    --retry-backoff-duration 5s \
+    --retry-backoff-max-duration 1m \
+    --retry-backoff-factor 2 || log_warn "app sync returned non-zero (continuing)"
+
+  log_step "Syncing prod-postgresql (Cluster resource)"
+  argocd app sync prod-postgresql --core \
+    --resource postgresql.cnpg.io:Cluster:postgresql \
+    --retry-limit 5 \
+    --retry-backoff-duration 5s \
+    --retry-backoff-max-duration 1m \
+    --retry-backoff-factor 2 || log_warn "cluster-resource sync returned non-zero (continuing)"
+}
+
+wait_for_healthy() {
+  log_step "Waiting for CNPG cluster to reach healthy state"
+  local i phase
+  for i in $(seq 1 "$CLUSTER_HEALTHY_ATTEMPTS"); do
+    phase=$(kubectl get cluster -n "$NAMESPACE" postgresql \
+      -o jsonpath='{.status.phase}' 2>/dev/null || echo "unknown")
+    log_info "attempt $i/$CLUSTER_HEALTHY_ATTEMPTS: phase=$phase"
+    if [ "$phase" = "Cluster in healthy state" ]; then
+      log_ok "Cluster is healthy"
+      return 0
+    fi
+    sleep "$CLUSTER_HEALTHY_INTERVAL"
+  done
+
+  log_error "Cluster did not reach healthy state in time — dumping diagnostics"
+  kubectl get cluster -n "$NAMESPACE" postgresql -o yaml || true
+  kubectl get pods -n "$NAMESPACE" -l cnpg.io/cluster=postgresql || true
+  return 1
+}
+
+recover_postgres() {
+  override_revision
+  sync_postgres_app
+  wait_for_healthy
+}
+
+if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
+  HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+  source "$HERE/config.sh"; source "$HERE/lib/log.sh"; source "$HERE/lib/preflight.sh"
+  require_cmd kubectl argocd
+  recover_postgres
+fi
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/scripts/04-validate.sh b/disaster-recovery/dr-drill/scripts/04-validate.sh
new file mode 100644
index 0000000..eda411a
--- /dev/null
+++ b/disaster-recovery/dr-drill/scripts/04-validate.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+# scripts/04-validate.sh — verify the restored PostgreSQL data.
+set -euo pipefail
+
+_primary_pod() {
+  kubectl get pods -n "$NAMESPACE" \
+    -l cnpg.io/cluster=postgresql,role=primary \
+    -o jsonpath='{.items[0].metadata.name}' || true
+}
+
+validate_data() {
+  log_step "Validating restored data"
+  local pod
+  pod="$(_primary_pod)"
+  [ -n "$pod" ] || die "could not find primary PostgreSQL pod"
+  log_info "primary pod: $pod"
+
+  log_info "--- connectivity check ---"
+  kubectl exec -n "$NAMESPACE" "$pod" -- \
+    psql -U postgres -c "SELECT 1 AS connectivity_check;"
+
+  log_info "--- database listing ---"
+  kubectl exec -n "$NAMESPACE" "$pod" -- \
+    psql -U postgres -c "\l"
+
+  log_info "--- verifying expected databases ---"
+  local db count
+  for db in $EXPECTED_DBS; do
+    count=$(kubectl exec -n "$NAMESPACE" "$pod" -- \
+      psql -U postgres -tAc \
+      "SELECT count(*) FROM pg_database WHERE datname = '$db';")
+    if [ "$count" -eq 0 ]; then
+      die "database '$db' not found"
+    fi
+    log_ok "database '$db' exists"
+  done
+
+  log_info "--- counting user tables ---"
+  local table_count
+  for db in $EXPECTED_DBS; do
+    table_count=$(kubectl exec -n "$NAMESPACE" "$pod" -- \
+      psql -U postgres -d "$db" -tAc \
+      "SELECT count(*) FROM pg_catalog.pg_tables WHERE schemaname NOT IN ('pg_catalog','information_schema');" \
+      2>/dev/null || echo "0")
+    log_info "database '$db': $table_count user table(s)"
+  done
+
+  log_ok "All validation checks passed"
+}
+
+if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
+  HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+  source "$HERE/config.sh"; source "$HERE/lib/log.sh"; source "$HERE/lib/preflight.sh"
+  require_cmd kubectl
+  validate_data
+fi
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/scripts/05-validate-vaultwarden.sh b/disaster-recovery/dr-drill/scripts/05-validate-vaultwarden.sh
new file mode 100644
index 0000000..4adc6b1
--- /dev/null
+++ b/disaster-recovery/dr-drill/scripts/05-validate-vaultwarden.sh
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+# scripts/05-validate-vaultwarden.sh — deploy + validate Vaultwarden on the
+# read-only JuiceFS mount. Replaces the un-automatable "log in via UI" step.
+set -euo pipefail
+
+sync_vaultwarden() {
+  log_step "Syncing vaultwarden Application"
+  argocd app sync vaultwarden --core \
+    --retry-limit 5 \
+    --retry-backoff-duration 5s \
+    --retry-backoff-max-duration 1m \
+    --retry-backoff-factor 2 || log_warn "vaultwarden sync returned non-zero (continuing)"
+
+  log_step "Waiting for Vaultwarden rollout"
+  kubectl rollout status "deployment/$VAULTWARDEN_DEPLOYMENT" \
+    -n "$VAULTWARDEN_NAMESPACE" \
+    --timeout="$VAULTWARDEN_ROLLOUT_TIMEOUT"
+  log_ok "Vaultwarden is running"
+}
+
+_vw_pod() {
+  kubectl get pods -n "$VAULTWARDEN_NAMESPACE" \
+    -l "app=$VAULTWARDEN_DEPLOYMENT" \
+    -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true
+}
+
+# 1) Process health via HTTP — proves the server started and serves requests.
+check_health_endpoint() {
+  log_step "Checking Vaultwarden HTTP health"
+  kubectl port-forward "svc/$VAULTWARDEN_SERVICE" \
+    "${VAULTWARDEN_LOCAL_PORT}:80" -n "$VAULTWARDEN_NAMESPACE" >/dev/null 2>&1 &
+  local pf_pid=$!
+  # Kill the port-forward on return; the trap clears itself so it cannot re-fire in callers.
+  trap "kill $pf_pid 2>/dev/null || true; trap - RETURN" RETURN
+  sleep 5
+
+  curl --fail --silent --show-error --retry 5 --retry-delay 3 --retry-connrefused \
+    "http://localhost:${VAULTWARDEN_LOCAL_PORT}/alive" >/dev/null
+  curl --fail --silent --show-error --retry 5 --retry-delay 3 --retry-connrefused \
+    "http://localhost:${VAULTWARDEN_LOCAL_PORT}/api/config" >/dev/null
+  log_ok "Vaultwarden is alive and serving /api/config"
+}
+
+# 2) Read-only enforcement — a write into the JuiceFS-backed data dir MUST fail.
+# This is the split-brain guarantee: DR cannot mutate the prod R2 prefix.
+check_mount_is_readonly() {
+  log_step "Verifying JuiceFS mount is read-only"
+  local pod
+  pod="$(_vw_pod)"
+  [ -n "$pod" ] || die "could not find Vaultwarden pod"
+
+  local out
+  out=$(kubectl exec -n "$VAULTWARDEN_NAMESPACE" "$pod" -- \
+    sh -c "touch ${VAULTWARDEN_DATA_PATH}/.dr-write-test 2>&1" || true)
+
+  if grep -qiE 'read-only file system|permission denied' <<<"$out"; then
+    log_ok "write correctly rejected: ${out}"
+  else
+    # Clean up if the write unexpectedly succeeded, then fail hard.
+    kubectl exec -n "$VAULTWARDEN_NAMESPACE" "$pod" -- \
+      rm -f "${VAULTWARDEN_DATA_PATH}/.dr-write-test" 2>/dev/null || true
+    die "mount is NOT read-only — DR could corrupt prod R2 (output: '${out:-}')"
+  fi
+}
+
+# 3) Data is actually readable — proves restored metadata maps to real objects.
+check_data_readable() {
+  log_step "Verifying restored data is readable via JuiceFS"
+  local pod count
+  pod="$(_vw_pod)"
+  [ -n "$pod" ] || die "could not find Vaultwarden pod"
+  count=$(kubectl exec -n "$VAULTWARDEN_NAMESPACE" "$pod" -- \
+    sh -c "ls -A ${VAULTWARDEN_DATA_PATH} 2>/dev/null | wc -l" || echo 0)
+
+  if [ "${count:-0}" -gt 0 ]; then
+    log_ok "data dir has $count entries — files restored"
+  else
+    die "data dir is empty — JuiceFS restore may have failed"
+  fi
+}
+
+# 4) Log scan — catch DB connection failures / startup panics.
+check_logs_clean() { + log_step "Scanning Vaultwarden logs" + local logs + logs=$(kubectl logs "deployment/$VAULTWARDEN_DEPLOYMENT" \ + -n "$VAULTWARDEN_NAMESPACE" --tail=100 2>/dev/null || echo "") + printf '%s\n' "$logs" + + if printf '%s' "$logs" | grep -qiE 'panic|fatal|database.*(fail|error)|unable to connect'; then + die "critical errors found in Vaultwarden logs" + fi + log_ok "no critical errors in logs" +} + +# 5) Optional deep check — assert restored user count via the admin API. +check_user_data() { + [ -n "${VW_ADMIN_TOKEN:-}" ] || { log_info "VW_ADMIN_TOKEN unset — skipping user-count check"; return 0; } + + log_step "Validating restored user data via admin API" + kubectl port-forward "svc/$VAULTWARDEN_SERVICE" \ + "${VAULTWARDEN_LOCAL_PORT}:80" -n "$VAULTWARDEN_NAMESPACE" >/dev/null 2>&1 & + local pf_pid=$! + trap 'kill "$pf_pid" 2>/dev/null || true' RETURN + sleep 5 + + local users + users=$(curl --silent \ + -H "Authorization: Bearer ${VW_ADMIN_TOKEN}" \ + "http://localhost:${VAULTWARDEN_LOCAL_PORT}/admin/users/overview" \ + | jq 'length' 2>/dev/null || echo 0) + + if [ "${users:-0}" -gt 0 ]; then + log_ok "$users user(s) found — user data restored" + else + die "no users found — DB restore may be incomplete" + fi +} + +validate_vaultwarden() { + check_health_endpoint + check_mount_is_readonly + check_data_readable + check_logs_clean + check_user_data + log_ok "Vaultwarden DR validation passed" +} + +if [ "${BASH_SOURCE[0]}" = "${0}" ]; then + HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)"
+  source "$HERE/config.sh"; source "$HERE/lib/log.sh"; source "$HERE/lib/preflight.sh"
+  require_cmd kubectl argocd curl jq
+  sync_vaultwarden
+  validate_vaultwarden
+fi
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/scripts/99-destroy.sh b/disaster-recovery/dr-drill/scripts/99-destroy.sh
new file mode 100755
index 0000000..c33e469
--- /dev/null
+++ b/disaster-recovery/dr-drill/scripts/99-destroy.sh
@@ -0,0 +1,117 @@
+#!/usr/bin/env bash
+# scripts/99-destroy.sh — tear down the DOKS cluster and orphaned DO resources.
+# Safe to run repeatedly; every step is best-effort and idempotent.
+set -uo pipefail # NOTE: no -e here — cleanup must continue past failures.
+# `set -uo pipefail` does not switch errexit off: when this file is sourced by
+# a caller that already runs under `set -e`, it stays on — so every lookup
+# below carries its own `|| echo ""` / `|| true` guard.
+
+# Entry point. Looks up the cluster ID, records LoadBalancer IPs while the API
+# server is still reachable, then scales the node pools down, deletes the
+# cluster and removes the volumes and load balancers it left behind.
+# Returns 0 without doing anything when the cluster is not found.
+# NOTE(review): the Service query uses the current kubectl context — confirm it
+# always points at the drill cluster, since these IPs select LBs to delete.
+destroy_cluster() {
+  log_step "Destroy: looking up cluster $CLUSTER_NAME"
+  local cluster_id
+  cluster_id=$(doctl kubernetes cluster get "$CLUSTER_NAME" \
+    --format ID --no-header 2>/dev/null || echo "")
+
+  if [ -z "$cluster_id" ]; then
+    log_warn "cluster '$CLUSTER_NAME' not found — nothing to clean up"
+    return 0
+  fi
+  log_info "cluster ID: $cluster_id"
+
+  # Collect LoadBalancer IPs while kubectl still works — needed after deletion.
+  local lb_ips
+  lb_ips=$(kubectl get svc -A \
+    -o jsonpath='{.items[?(@.spec.type=="LoadBalancer")].status.loadBalancer.ingress[0].ip}' \
+    2>/dev/null || echo "")
+
+  _scale_down_node_pools
+  _delete_cluster
+  _delete_volumes "$cluster_id"
+  _delete_load_balancers "$cluster_id" "$lb_ips"
+
+  log_ok "Destroy complete"
+}
+
+# Scale every node pool of $CLUSTER_NAME to 0, then pause 30s if any pool was found.
+_scale_down_node_pools() {
+  log_step "Scaling node pools to 0"
+  local pool_ids pool_id
+  pool_ids=$(doctl kubernetes cluster node-pool list "$CLUSTER_NAME" \
+    --format ID --no-header 2>/dev/null || echo "")
+  for pool_id in $pool_ids; do
+    log_info "scaling pool $pool_id -> 0"
+    doctl kubernetes cluster node-pool update "$CLUSTER_NAME" "$pool_id" --count 0 || true
+  done
+  if [ -n "$pool_ids" ]; then sleep 30; fi
+}
+
+# Delete the DOKS cluster itself; a failure is tolerated.
+_delete_cluster() {
+  log_step "Deleting DOKS cluster"
+  doctl kubernetes cluster delete "$CLUSTER_NAME" --force --dangerous || true
+}
+
+# Delete the Block Storage volumes tagged for the cluster.
+#   $1  cluster ID
+_delete_volumes() {
+  # DOKS CSI tags every provisioned volume with k8s:<cluster-id>.
+  local cluster_id="$1"
+  log_step "Cleaning up Block Storage volumes"
+  local vol_ids vol_id
+  vol_ids=$(doctl compute volume list -o json \
+    | jq -r --arg tag "k8s:$cluster_id" \
+      '.[] | select(.tags? and (.tags[] == $tag)) | .id' 2>/dev/null || echo "")
+  if [ -z "$vol_ids" ]; then
+    log_info "no volumes found for cluster $cluster_id"
+    return 0
+  fi
+  for vol_id in $vol_ids; do
+    log_info "deleting volume $vol_id"
+    doctl compute volume delete "$vol_id" --force || true
+  done
+}
+
+# Delete the load balancers left behind by the cluster.
+#   $1  cluster ID
+#   $2  whitespace-separated LB IPs collected before deletion (may be empty)
+_delete_load_balancers() {
+  local cluster_id="$1" lb_ips="$2"
+  log_step "Cleaning up Load Balancers"
+
+  # Primary: match by IPs collected before the cluster was deleted.
+  local lb_ip lb_id
+  for lb_ip in $lb_ips; do
+    # Guarded like every other lookup in this file, so a doctl/jq failure
+    # skips this IP instead of aborting the rest of the cleanup.
+    lb_id=$(doctl compute load-balancer list -o json \
+      | jq -r --arg ip "$lb_ip" '.[] | select(.ip == $ip) | .id' 2>/dev/null || echo "")
+    if [ -n "$lb_id" ]; then
+      log_info "deleting LB $lb_id (IP: $lb_ip)"
+      doctl compute load-balancer delete "$lb_id" --force || true
+    fi
+  done
+
+  # Fallback: catch any LB still tagged with the cluster ID.
+  local tagged_ids
+  tagged_ids=$(doctl compute load-balancer list -o json \
+    | jq -r --arg tag "k8s:$cluster_id" \
+      '.[] | select(.tags? and (.tags[] == $tag)) | .id' 2>/dev/null || echo "")
+  for lb_id in $tagged_ids; do
+    log_info "deleting tagged LB $lb_id"
+    doctl compute load-balancer delete "$lb_id" --force 2>/dev/null || true
+  done
+}
+
+if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
+  HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+  source "$HERE/config.sh"; source "$HERE/lib/log.sh"; source "$HERE/lib/preflight.sh"
+  require_cmd doctl kubectl jq
+  ensure_doctl_auth
+  destroy_cluster
+fi
\ No newline at end of file
diff --git a/kubernetes/argocd/argocd-app/stateful/postgresql/raw-manifests/instance.yaml b/kubernetes/argocd/argocd-app/stateful/postgresql/raw-manifests/instance.yaml index 7a69ae4..9ce6c6f 100644 --- a/kubernetes/argocd/argocd-app/stateful/postgresql/raw-manifests/instance.yaml +++ b/kubernetes/argocd/argocd-app/stateful/postgresql/raw-manifests/instance.yaml @@ -75,12 +75,22 @@ spec: superuserSecret: name: postgres-admin monitoring: - enablePodMonitor: true - plugins: - - name: barman-cloud.cloudnative-pg.io - isWALArchiver: true - parameters: - barmanObjectName: cloudflare-r2 + enablePodMonitor: false + bootstrap: + recovery: + source: origin + externalClusters: + - name: origin + plugin: + name: barman-cloud.cloudnative-pg.io + parameters: + barmanObjectName: cloudflare-r2 + serverName: postgresql + # plugins: + # - name: barman-cloud.cloudnative-pg.io + # isWALArchiver: true + # parameters: + # barmanObjectName: cloudflare-r2 managed: services: additional: @@ -90,62 
+100,62 @@ spec: name: postgresql-external-rw spec: type: LoadBalancer - roles: - - name: sonarqube - ensure: present - comment: Role for Sonarqube - connectionLimit: 40 - login: true - superuser: false - passwordSecret: { name: sonarqube } - - name: vaultwarden - ensure: present - comment: Role for Vaultwarden - connectionLimit: 20 - login: true - superuser: false - passwordSecret: { name: vaultwarden } - - name: gitlab - ensure: present - comment: Role for Gitlab - connectionLimit: 80 - login: true - superuser: false - passwordSecret: { name: gitlab } - - name: nextcloud - ensure: present - comment: Role for nextcloud - connectionLimit: 30 - login: true - superuser: false - passwordSecret: { name: nextcloud } - - name: juicefs - ensure: present - comment: Role for juicefs - connectionLimit: 30 - login: true - superuser: false - passwordSecret: { name: juicefs } - - name: argus - ensure: present - comment: Role for Argus - connectionLimit: 20 - login: true - superuser: false - passwordSecret: { name: argus } - - name: crowsec - ensure: present - comment: Role for crowsec - connectionLimit: 10 - login: true - superuser: false - passwordSecret: { name: crowsec } - - name: pgadmin_monitor - ensure: present - login: true - superuser: false - passwordSecret: { name: pgadmin-monitor-credentials } - inRoles: [ pg_monitor ] + # roles: + # - name: sonarqube + # ensure: present + # comment: Role for Sonarqube + # connectionLimit: 40 + # login: true + # superuser: false + # passwordSecret: { name: sonarqube } + # - name: vaultwarden + # ensure: present + # comment: Role for Vaultwarden + # connectionLimit: 20 + # login: true + # superuser: false + # passwordSecret: { name: vaultwarden } + # - name: gitlab + # ensure: present + # comment: Role for Gitlab + # connectionLimit: 80 + # login: true + # superuser: false + # passwordSecret: { name: gitlab } + # - name: nextcloud + # ensure: present + # comment: Role for nextcloud + # connectionLimit: 30 + # login: true + # 
superuser: false + # passwordSecret: { name: nextcloud } + # - name: juicefs + # ensure: present + # comment: Role for juicefs + # connectionLimit: 30 + # login: true + # superuser: false + # passwordSecret: { name: juicefs } + # - name: argus + # ensure: present + # comment: Role for Argus + # connectionLimit: 20 + # login: true + # superuser: false + # passwordSecret: { name: argus } + # - name: crowsec + # ensure: present + # comment: Role for crowsec + # connectionLimit: 10 + # login: true + # superuser: false + # passwordSecret: { name: crowsec } + # - name: pgadmin_monitor + # ensure: present + # login: true + # superuser: false + # passwordSecret: { name: pgadmin-monitor-credentials } + # inRoles: [ pg_monitor ] instances: 2 primaryUpdateMethod: switchover storage: From d7999a00a3494840a7c4971c0ce85ab692328227 Mon Sep 17 00:00:00 2001 From: Ngo Vu Minh Dat Date: Sat, 20 Jun 2026 17:19:55 +0700 Subject: [PATCH 2/4] update tag --- disaster-recovery/dr-drill/config.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/disaster-recovery/dr-drill/config.sh b/disaster-recovery/dr-drill/config.sh index 31addf7..4c2fc2a 100755 --- a/disaster-recovery/dr-drill/config.sh +++ b/disaster-recovery/dr-drill/config.sh @@ -27,7 +27,7 @@ # Kubernetes / app config # --------------------------------------------------------------------------- : "${NAMESPACE:=prod-postgresql}" -: "${RECOVERY_TAG:=postgresql-first-recovery-test}" +: "${RECOVERY_TAG:=postgreql-recovery-sync}" : "${ARGOCD_CHART_VERSION:=9.5.22}" # Repo-relative path to the app-of-app helm chart. 
From 2db5900ed33e1fe64b53387712c1fd040af2ca90 Mon Sep 17 00:00:00 2001 From: Ngo Vu Minh Dat Date: Sat, 20 Jun 2026 18:00:57 +0700 Subject: [PATCH 3/4] wip --- .github/workflows/postgresql-backup-test.yml | 239 +----------------- .../dr-drill/scripts/02-install-app.sh | 2 +- .../argocd/app-of-app/templates/juicefs.yaml | 10 +- kubernetes/argocd/app-of-app/values.yaml | 1 + 4 files changed, 22 insertions(+), 230 deletions(-) diff --git a/.github/workflows/postgresql-backup-test.yml b/.github/workflows/postgresql-backup-test.yml index cbd4b3d..0846371 100644 --- a/.github/workflows/postgresql-backup-test.yml +++ b/.github/workflows/postgresql-backup-test.yml @@ -18,21 +18,13 @@ on: default: "2" required: false -env: - CLUSTER_NAME: pg-backup-test-${{ github.run_id }} - REGION: ${{ inputs.region || 'nyc3' }} - NODE_SIZE: ${{ inputs.node_size || 's-4vcpu-8gb' }} - NODE_COUNT: ${{ inputs.node_count || '2' }} - NAMESPACE: prod-postgresql - RECOVERY_TAG: postgresql-first-recovery-test - ARGOCD_CHART_VERSION: "9.4.15" - jobs: backup-test: runs-on: ubuntu-latest environment: test-backup timeout-minutes: 45 steps: + # --- dependency setup only; no business logic lives here --- - name: Checkout repository uses: actions/checkout@v4 @@ -43,228 +35,23 @@ jobs: - name: Install ArgoCD CLI run: | - curl -sSL -o argocd https://github.com/argoproj/argo-cd/releases/download/v3.3.4/argocd-linux-amd64 + curl -sSL -o argocd \ + https://github.com/argoproj/argo-cd/releases/download/v3.3.4/argocd-linux-amd64 chmod +x argocd sudo mv argocd /usr/local/bin/ - - name: Create DOKS cluster - run: | - doctl kubernetes cluster create "$CLUSTER_NAME" \ - --region "$REGION" \ - --size "$NODE_SIZE" \ - --count "$NODE_COUNT" \ - --wait - - name: Save kubeconfig - run: | - doctl kubernetes cluster kubeconfig save "$CLUSTER_NAME" - kubectl config set-context --current --namespace=argocd - - name: Install ArgoCD via Helm - run: | - helm repo add argo https://argoproj.github.io/argo-helm - helm repo 
update - helm install argocd argo/argo-cd \ - --version "$ARGOCD_CHART_VERSION" \ - --namespace argocd \ - --create-namespace \ - --wait \ - --timeout 5m \ - --set 'configs.params.server\.insecure=true' - - - name: Wait for ArgoCD to be ready - run: | - kubectl rollout status deployment/argocd-server -n argocd --timeout=120s - kubectl rollout status deployment/argocd-repo-server -n argocd --timeout=120s - kubectl rollout status deployment/argocd-applicationset-controller -n argocd --timeout=120s - - - name: Install app-of-app chart - run: | - helm install app-of-app ./kubernetes/argocd/app-of-app \ - --namespace argocd \ - --set metallb.enabled=false \ - --set traefik.enabled=false \ - --set openebs.enabled=false \ - --set postgresql.enabled=true \ - --set certManager.enabled=true \ - --set kubePrometheusStack.enabled=true \ - --set customManifest.enabled=false \ - --set loki.enabled=true \ - --set alloy.enabled=true \ - --set pgadmin4.enabled=true \ - --set sonarqube.enabled=false \ - --set harbor.enabled=false \ - --set velero.enabled=false \ - --set mongoOperator.enabled=false \ - --set kafkaOperator.enabled=false \ - --set juicefs.enabled=false \ - --set vaultwarden.enabled=true + # helm, kubectl, and jq are preinstalled on ubuntu-latest runners. 
- - name: Sync cert-manager - run: | - argocd app sync cert-manager --core \ - --retry-limit 5 \ - --retry-backoff-duration 10s \ - --retry-backoff-max-duration 3m \ - --retry-backoff-factor 2 - - name: Create namespace and secrets + # --- delegate all logic to the bash scripts --- + - name: Run DR drill env: + DIGITALOCEAN_TOKEN: ${{ secrets.DIGITALOCEAN_TOKEN }} R2_ACCESS_KEY: ${{ secrets.R2_ACCESS_KEY }} R2_SECRET_KEY: ${{ secrets.R2_SECRET_KEY }} + RUN_ID: ${{ github.run_id }} + REGION: ${{ inputs.region || 'nyc3' }} + NODE_SIZE: ${{ inputs.node_size || 's-4vcpu-8gb' }} + NODE_COUNT: ${{ inputs.node_count || '2' }} run: | - kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f - - kubectl create secret generic cloudflare-r2 \ - -n "$NAMESPACE" \ - --from-literal=ACCESS_KEY="$R2_ACCESS_KEY" \ - --from-literal=SECRET_KEY="$R2_SECRET_KEY" \ - --dry-run=client -o yaml | kubectl apply -f - - kubectl create secret generic postgres-admin \ - -n "$NAMESPACE" \ - --from-literal=username=postgres \ - --from-literal=password=backup-test-dummy \ - --dry-run=client -o yaml | kubectl apply -f - - - - name: Override PostgreSQL app revision to recovery tag - run: | - argocd app set prod-postgresql \ - --core \ - --source-position 2 \ - --revision "$RECOVERY_TAG" - - name: Sync PostgreSQL (App) - continue-on-error: true - run: | - argocd app sync prod-postgresql --core \ - --retry-limit 3 \ - --retry-backoff-duration 5s \ - --retry-backoff-max-duration 1m \ - --retry-backoff-factor 2 - - name: Sync PostgreSQL (Cluster Resource) - continue-on-error: true - run: | - argocd app sync prod-postgresql --core \ - --resource postgresql.cnpg.io:Cluster:postgresql \ - --retry-limit 5 \ - --retry-backoff-duration 5s \ - --retry-backoff-max-duration 1m \ - --retry-backoff-factor 2 - - - name: Wait for cluster healthy state - run: | - echo "Waiting for CloudNativePG cluster to reach healthy state..." 
- for i in $(seq 1 90); do - phase=$(kubectl get cluster -n "$NAMESPACE" postgresql \ - -o jsonpath='{.status.phase}' 2>/dev/null || echo "unknown") - echo " Attempt $i/90: phase=$phase" - if [ "$phase" = "Cluster in healthy state" ]; then - echo "Cluster is healthy." - exit 0 - fi - sleep 10 - done - echo "ERROR: Cluster did not reach healthy state within 15 minutes." - kubectl get cluster -n "$NAMESPACE" postgresql -o yaml || true - kubectl get pods -n "$NAMESPACE" -l cnpg.io/cluster=postgresql || true - exit 1 - - - name: Validate restored data - run: | - POD=$(kubectl get pods -n "$NAMESPACE" \ - -l cnpg.io/cluster=postgresql,role=primary \ - -o jsonpath='{.items[0].metadata.name}') - echo "Primary pod: $POD" - - echo "--- Connectivity check ---" - kubectl exec -n "$NAMESPACE" "$POD" -- \ - psql -U postgres -c "SELECT 1 AS connectivity_check;" - - echo "--- Database listing ---" - kubectl exec -n "$NAMESPACE" "$POD" -- \ - psql -U postgres -c "\l" - - echo "--- Verify expected databases exist ---" - EXPECTED_DBS="sonarqube" - for db in $EXPECTED_DBS; do - count=$(kubectl exec -n "$NAMESPACE" "$POD" -- \ - psql -U postgres -tAc "SELECT count(*) FROM pg_database WHERE datname = '$db';") - if [ "$count" -eq 0 ]; then - echo "FAIL: Database '$db' not found." - exit 1 - fi - echo "OK: Database '$db' exists." - done - - echo "--- Count user tables across databases ---" - for db in $EXPECTED_DBS; do - table_count=$(kubectl exec -n "$NAMESPACE" "$POD" -- \ - psql -U postgres -d "$db" -tAc \ - "SELECT count(*) FROM pg_catalog.pg_tables WHERE schemaname NOT IN ('pg_catalog','information_schema');" \ - 2>/dev/null || echo "0") - echo "Database '$db': $table_count user table(s)" - done - echo "All validation checks passed." 
- - name: Destroy DOKS cluster and associated infra - if: always() - run: | - CLUSTER_ID=$(doctl kubernetes cluster get "$CLUSTER_NAME" \ - --format ID --no-header 2>/dev/null || echo "") - - if [ -z "$CLUSTER_ID" ]; then - echo "Cluster '$CLUSTER_NAME' not found, nothing to clean up." - exit 0 - fi - - echo "Cluster ID: $CLUSTER_ID" - - # Collect LB IPs now while kubectl still works — needed after cluster is gone - LB_IPS=$(kubectl get svc -A \ - -o jsonpath='{.items[?(@.spec.type=="LoadBalancer")].status.loadBalancer.ingress[0].ip}' \ - 2>/dev/null || echo "") - - # --- Scale down all node pools then delete the cluster --- - echo "=== Scaling down node pools ===" - POOL_IDS=$(doctl kubernetes cluster node-pool list "$CLUSTER_NAME" \ - --format ID --no-header 2>/dev/null || echo "") - for pool_id in $POOL_IDS; do - echo "Scaling node pool $pool_id to 0..." - doctl kubernetes cluster node-pool update "$CLUSTER_NAME" "$pool_id" \ - --count 0 || true - done - [ -n "$POOL_IDS" ] && sleep 30 || true - - echo "=== Deleting DOKS cluster ===" - doctl kubernetes cluster delete "$CLUSTER_NAME" --force --dangerous || true - - # --- Delete DigitalOcean Block Storage Volumes --- - # The DOKS CSI driver tags every provisioned volume with k8s: - echo "=== Cleaning up DigitalOcean Block Storage Volumes ===" - VOL_IDS=$(doctl compute volume list -o json \ - | jq -r --arg tag "k8s:$CLUSTER_ID" \ - '.[] | select(.tags? and (.tags[] == $tag)) | .id' 2>/dev/null || echo "") - if [ -n "$VOL_IDS" ]; then - for vol_id in $VOL_IDS; do - echo "Deleting volume: $vol_id" - doctl compute volume delete "$vol_id" --force || true - done - else - echo "No block storage volumes found for cluster $CLUSTER_ID." 
- fi - - # --- Delete DigitalOcean Load Balancers --- - # Primary: match by IPs collected before cluster was deleted - echo "=== Cleaning up DigitalOcean Load Balancers ===" - for lb_ip in $LB_IPS; do - LB_ID=$(doctl compute load-balancer list -o json \ - | jq -r --arg ip "$lb_ip" '.[] | select(.ip == $ip) | .id') - if [ -n "$LB_ID" ]; then - echo "Deleting load balancer $LB_ID (IP: $lb_ip)" - doctl compute load-balancer delete "$LB_ID" --force || true - fi - done - # Fallback: catch any LBs still tagged with the cluster ID - TAGGED_LB_IDS=$(doctl compute load-balancer list -o json \ - | jq -r --arg tag "k8s:$CLUSTER_ID" \ - '.[] | select(.tags? and (.tags[] == $tag)) | .id' 2>/dev/null || echo "") - for lb_id in $TAGGED_LB_IDS; do - echo "Deleting tagged load balancer: $lb_id" - doctl compute load-balancer delete "$lb_id" --force 2>/dev/null || true - done - - echo "Destroy complete." + chmod +x disaster-recovery/dr-drill/run.sh disaster-recovery/dr-drill/scripts/*.sh + ./disaster-recovery/dr-drill/run.sh \ No newline at end of file diff --git a/disaster-recovery/dr-drill/scripts/02-install-app.sh b/disaster-recovery/dr-drill/scripts/02-install-app.sh index 89d51fc..f000472 100755 --- a/disaster-recovery/dr-drill/scripts/02-install-app.sh +++ b/disaster-recovery/dr-drill/scripts/02-install-app.sh @@ -40,7 +40,7 @@ install_app_of_app() { --set openebs.enabled=false \ --set postgresql.enabled=true \ --set certManager.enabled=true \ - --set kubePrometheusStack.enabled=true \ + --set kubePrometheusStack.enabled=false \ --set customManifest.enabled=false \ --set loki.enabled=false \ --set alloy.enabled=false \ diff --git a/kubernetes/argocd/app-of-app/templates/juicefs.yaml b/kubernetes/argocd/app-of-app/templates/juicefs.yaml index 76d3f91..1932753 100644 --- a/kubernetes/argocd/app-of-app/templates/juicefs.yaml +++ b/kubernetes/argocd/app-of-app/templates/juicefs.yaml @@ -14,8 +14,6 @@ spec: syncPolicy: automated: enabled: false - selfHeal: false - prune: true retry: limit: 1 backoff: @@ -52,7 +50,7 @@ spec: memory: 
128Mi limits: cpu: 1 - memory: 256Mi + memory: 256Mi mountMode: mountpod metrics: enabled: true @@ -79,6 +77,12 @@ spec: reclaimPolicy: Retain allowVolumeExpansion: true mountOptions: + {{- if .Values.juicefs.readOnly }} + # DR read-only: blocks writes to the prod R2 prefix AND + # disables background GC/trash cleanup, so a DR drill can + # never delete or mutate production objects. + - ro + {{- end }} - cache-size=10240 - free-space-ratio=0.1 - prefetch=3 diff --git a/kubernetes/argocd/app-of-app/values.yaml b/kubernetes/argocd/app-of-app/values.yaml index 06847bf..38f314a 100644 --- a/kubernetes/argocd/app-of-app/values.yaml +++ b/kubernetes/argocd/app-of-app/values.yaml @@ -28,6 +28,7 @@ sonarqube: enabled: true juicefs: enabled: true + readOnly: false vaultwarden: enabled: true certManager: From 7edef0641d61c2f3b8dbfef03979a7dc7b790676 Mon Sep 17 00:00:00 2001 From: Ngo Vu Minh Dat Date: Sun, 21 Jun 2026 11:59:21 +0700 Subject: [PATCH 4/4] Commit to save work, stil error due to read-only mode in juicefs, Note: Seperate dev, and prodcution in juicefs bucket to avoid data conflicting --- disaster-recovery/dr-drill/config.sh | 3 +- disaster-recovery/dr-drill/lib/log.sh | 0 disaster-recovery/dr-drill/lib/preflight.sh | 0 disaster-recovery/dr-drill/run.sh | 7 ++-- .../dr-drill/scripts/02-install-app.sh | 7 ++++ .../dr-drill/scripts/04-sync-juicefs.sh | 37 +++++++++++++++++++ ...04-validate.sh => 05-validate-database.sh} | 0 ...ltwarden.sh => 06-validate-vaultwarden.sh} | 0 .../argocd/app-of-app/templates/juicefs.yaml | 11 +++++- kubernetes/argocd/app-of-app/values.yaml | 1 + 10 files changed, 60 insertions(+), 6 deletions(-) mode change 100644 => 100755 disaster-recovery/dr-drill/lib/log.sh mode change 100644 => 100755 disaster-recovery/dr-drill/lib/preflight.sh create mode 100755 disaster-recovery/dr-drill/scripts/04-sync-juicefs.sh rename disaster-recovery/dr-drill/scripts/{04-validate.sh => 05-validate-database.sh} (100%) mode change 100644 => 100755 rename 
disaster-recovery/dr-drill/scripts/{05-validate-vaultwarden.sh => 06-validate-vaultwarden.sh} (100%) mode change 100644 => 100755 diff --git a/disaster-recovery/dr-drill/config.sh b/disaster-recovery/dr-drill/config.sh index 4c2fc2a..9d8095b 100755 --- a/disaster-recovery/dr-drill/config.sh +++ b/disaster-recovery/dr-drill/config.sh @@ -47,6 +47,7 @@ # --------------------------------------------------------------------------- : "${JUICEFS_ENABLED:=true}" : "${JUICEFS_READONLY:=true}" # injects the `ro` mount option +: "${JUICEFS_MONITORING:=false}" : "${JUICEFS_NAMESPACE:=juicefs}" : "${JUICEFS_SECRET_NAME:=cloudflare-r2}" : "${JUICEFS_VOLUME_NAME:=cloudflare-r2-prod}" @@ -60,7 +61,7 @@ fi # The rw service for a CNPG cluster named "postgresql" is "postgresql-rw". : "${JUICEFS_META_USER:=juicefs}" : "${JUICEFS_META_HOST:=postgresql-rw.${NAMESPACE}.svc}" -: "${JUICEFS_META_DB:=juicefs_prod}" +: "${JUICEFS_META_DB:=juicefs}" # JUICEFS_META_PASSWORD must be exported (the juicefs DB role password from the # restored cluster). If you'd rather supply the whole URL, set JUICEFS_METAURL. 
: "${JUICEFS_METAURL:=postgres://${JUICEFS_META_USER}:${JUICEFS_META_PASSWORD:-}@${JUICEFS_META_HOST}:5432/${JUICEFS_META_DB}?sslmode=disable}" diff --git a/disaster-recovery/dr-drill/lib/log.sh b/disaster-recovery/dr-drill/lib/log.sh old mode 100644 new mode 100755 diff --git a/disaster-recovery/dr-drill/lib/preflight.sh b/disaster-recovery/dr-drill/lib/preflight.sh old mode 100644 new mode 100755 diff --git a/disaster-recovery/dr-drill/run.sh b/disaster-recovery/dr-drill/run.sh index 882f57e..ed33dff 100644 --- a/disaster-recovery/dr-drill/run.sh +++ b/disaster-recovery/dr-drill/run.sh @@ -25,8 +25,9 @@ source "$REPO_ROOT/scripts/00-create-cluster.sh" source "$REPO_ROOT/scripts/01-install-argocd.sh" source "$REPO_ROOT/scripts/02-install-apps.sh" source "$REPO_ROOT/scripts/03-recover-postgres.sh" -source "$REPO_ROOT/scripts/04-validate.sh" -source "$REPO_ROOT/scripts/05-validate-vaultwarden.sh" +source "$REPO_ROOT/scripts/04-sync-juicefs.sh" +source "$REPO_ROOT/scripts/05-validate-database.sh" +source "$REPO_ROOT/scripts/06-validate-vaultwarden.sh" source "$REPO_ROOT/scripts/99-destroy.sh" # --- argument parsing ------------------------------------------------------ @@ -80,7 +81,7 @@ main() { recover_postgres validate_data - # Data layer: JuiceFS (read-only) then Vaultwarden on top of it. + # Data layer: JuiceFS sync (needs the metaurl DB up) → Vaultwarden on top. 
sync_juicefs sync_vaultwarden validate_vaultwarden diff --git a/disaster-recovery/dr-drill/scripts/02-install-app.sh b/disaster-recovery/dr-drill/scripts/02-install-app.sh index f000472..e01f540 100755 --- a/disaster-recovery/dr-drill/scripts/02-install-app.sh +++ b/disaster-recovery/dr-drill/scripts/02-install-app.sh @@ -36,6 +36,12 @@ install_app_of_app() { helm upgrade --install app-of-app "$REPO_ROOT/$APP_OF_APP_CHART" \ --namespace argocd \ --set metallb.enabled=false \ + --set argus.enabled=false \ + --set chaosMesh.enabled=false \ + --set nextcloud.enabled=false \ + --set nfsCsiDriver.enabled=false \ + --set jellyfin.enabled=false \ + --set qbittorrent.enabled=false \ --set traefik.enabled=true \ --set openebs.enabled=false \ --set postgresql.enabled=true \ @@ -52,6 +58,7 @@ install_app_of_app() { --set kafkaOperator.enabled=false \ --set juicefs.enabled="$JUICEFS_ENABLED" \ --set juicefs.readOnly="$JUICEFS_READONLY" \ + --set juicefs.monitoring="$JUICEFS_MONITORING" \ --set vaultwarden.enabled=true log_ok "app-of-app installed" } diff --git a/disaster-recovery/dr-drill/scripts/04-sync-juicefs.sh b/disaster-recovery/dr-drill/scripts/04-sync-juicefs.sh new file mode 100755 index 0000000..7257ffc --- /dev/null +++ b/disaster-recovery/dr-drill/scripts/04-sync-juicefs.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +# scripts/04-sync-juicefs.sh — sync the JuiceFS Application and wait for the CSI +# driver to be ready. +# +# ORDERING: must run AFTER PostgreSQL recovery (03). The JuiceFS mount pod +# connects to the metaurl (the restored CNPG database named by JUICEFS_META_DB) when a volume is +# mounted, so the metadata engine has to be up first. The credentials secret +# itself is created earlier in 02 (before the StorageClass references it). 
+set -euo pipefail
+
+# Sync the `juicefs` Argo CD Application, then wait (best effort) for the CSI
+# StatefulSet and DaemonSet selected by JUICEFS_CSI_SELECTOR to roll out.
+# Does nothing and returns 0 unless JUICEFS_ENABLED is "true".
+sync_juicefs() {
+  if [ "$JUICEFS_ENABLED" != "true" ]; then
+    log_info "JuiceFS disabled — skipping sync"
+    return 0
+  fi
+
+  log_step "Syncing juicefs Application"
+  local -a retry_flags=(
+    --retry-limit 5
+    --retry-backoff-duration 10s
+    --retry-backoff-max-duration 3m
+    --retry-backoff-factor 2
+  )
+  argocd app sync juicefs --core "${retry_flags[@]}"
+
+  log_step "Waiting for JuiceFS CSI components"
+  # The controller is the StatefulSet, the node service the DaemonSet. A rollout
+  # that cannot be confirmed only produces a warning; the sync still succeeds.
+  local workload component
+  for workload in statefulset daemonset; do
+    case "$workload" in
+      statefulset) component="controller" ;;
+      daemonset) component="node" ;;
+    esac
+    kubectl rollout status "$workload" \
+      -n "$JUICEFS_NAMESPACE" -l "$JUICEFS_CSI_SELECTOR" --timeout=180s 2>/dev/null \
+      || log_warn "could not confirm CSI $component rollout (continuing)"
+  done
+  log_ok "JuiceFS synced"
+}
+
+# Run directly (not sourced): load config and helpers, then sync.
+if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
+  HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+  source "$HERE/config.sh"; source "$HERE/lib/log.sh"; source "$HERE/lib/preflight.sh"
+  require_cmd kubectl argocd
+  sync_juicefs
+fi
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/scripts/04-validate.sh b/disaster-recovery/dr-drill/scripts/05-validate-database.sh old mode 100644 new mode 100755 similarity index 100% rename from disaster-recovery/dr-drill/scripts/04-validate.sh rename to disaster-recovery/dr-drill/scripts/05-validate-database.sh diff --git a/disaster-recovery/dr-drill/scripts/05-validate-vaultwarden.sh b/disaster-recovery/dr-drill/scripts/06-validate-vaultwarden.sh old mode 100644 new mode 100755 similarity index 100% rename from disaster-recovery/dr-drill/scripts/05-validate-vaultwarden.sh rename to disaster-recovery/dr-drill/scripts/06-validate-vaultwarden.sh diff --git a/kubernetes/argocd/app-of-app/templates/juicefs.yaml b/kubernetes/argocd/app-of-app/templates/juicefs.yaml index 1932753..cbcd0a2 100644 --- a/kubernetes/argocd/app-of-app/templates/juicefs.yaml +++ 
b/kubernetes/argocd/app-of-app/templates/juicefs.yaml @@ -53,7 +53,10 @@ spec: memory: 256Mi mountMode: mountpod metrics: - enabled: true + # metrics.enabled=true makes the chart emit a ServiceMonitor, which + # needs the monitoring.coreos.com CRDs. Tie it to juicefs.monitoring + # so DR drills (no kube-prometheus-stack) don't fail on the missing CRD. + enabled: {{ .Values.juicefs.monitoring }} port: 9567 service: servicePort: 9567 @@ -111,8 +114,12 @@ spec: paths: - path: / pathType: ImplementationSpecific - # Raw manifests (ServiceMonitor, etc.) tracked in Git + {{- if .Values.juicefs.monitoring }} + # Raw manifests (ServiceMonitor, etc.) tracked in Git. They need the monitoring.coreos.com + # CRDs (installed by kube-prometheus-stack). Disabled for DR drills where + # that stack isn't deployed, otherwise ArgoCD fails on the missing CRD. - repoURL: https://github.com/ngodat0103/dev-oops.git targetRevision: master path: kubernetes/argocd/argocd-app/daemon/juicefs + {{- end }} {{- end -}} \ No newline at end of file diff --git a/kubernetes/argocd/app-of-app/values.yaml b/kubernetes/argocd/app-of-app/values.yaml index 38f314a..8458093 100644 --- a/kubernetes/argocd/app-of-app/values.yaml +++ b/kubernetes/argocd/app-of-app/values.yaml @@ -29,6 +29,7 @@ sonarqube: juicefs: enabled: true readOnly: false + monitoring: true vaultwarden: enabled: true certManager: