From 310c8199c40c9e3e5c924eea0a6fd27e7ad2616c Mon Sep 17 00:00:00 2001
From: Ngo Vu Minh Dat <ngovuminhdat@gmail.com>
Date: Sat, 20 Jun 2026 17:11:59 +0700
Subject: [PATCH 1/4] targetRevision for postgresql recovery

---
 disaster-recovery/dr-drill/config.sh          |  98 +++++++++++++
 disaster-recovery/dr-drill/lib/log.sh         |  21 +++
 disaster-recovery/dr-drill/lib/preflight.sh   |  39 +++++
 disaster-recovery/dr-drill/readme.md          |  93 ++++++++++++
 disaster-recovery/dr-drill/run.sh             |  91 ++++++++++++
 .../scripts/00-create-doks-cluster.sh         |  32 +++++
 .../dr-drill/scripts/01-install-argocd.sh     |  35 +++++
 .../dr-drill/scripts/02-install-app.sh        | 122 ++++++++++++++++
 .../dr-drill/scripts/03-recover-pg.sh         |  63 ++++++++
 .../dr-drill/scripts/04-validate.sh           |  56 ++++++++
 .../scripts/05-validate-vaultwarden.sh        | 134 ++++++++++++++++++
 .../dr-drill/scripts/99-destroy.sh            |  99 +++++++++++++
 .../postgresql/raw-manifests/instance.yaml    | 134 ++++++++++--------
 13 files changed, 955 insertions(+), 62 deletions(-)
 create mode 100755 disaster-recovery/dr-drill/config.sh
 create mode 100644 disaster-recovery/dr-drill/lib/log.sh
 create mode 100644 disaster-recovery/dr-drill/lib/preflight.sh
 create mode 100644 disaster-recovery/dr-drill/readme.md
 create mode 100644 disaster-recovery/dr-drill/run.sh
 create mode 100755 disaster-recovery/dr-drill/scripts/00-create-doks-cluster.sh
 create mode 100755 disaster-recovery/dr-drill/scripts/01-install-argocd.sh
 create mode 100755 disaster-recovery/dr-drill/scripts/02-install-app.sh
 create mode 100755 disaster-recovery/dr-drill/scripts/03-recover-pg.sh
 create mode 100644 disaster-recovery/dr-drill/scripts/04-validate.sh
 create mode 100644 disaster-recovery/dr-drill/scripts/05-validate-vaultwarden.sh
 create mode 100755 disaster-recovery/dr-drill/scripts/99-destroy.sh

diff --git a/disaster-recovery/dr-drill/config.sh b/disaster-recovery/dr-drill/config.sh
new file mode 100755
index 0000000..31addf7
--- /dev/null
+++ b/disaster-recovery/dr-drill/config.sh
@@ -0,0 +1,98 @@
+#!/usr/bin/env bash
+# config.sh — single source of defaults for the PostgreSQL DR drill.
+# run.sh and every script under scripts/ source this file.
+# A non-empty value already in the environment always wins over the default.
+
+# ---------------------------------------------------------------------------
+# Run identity
+# ---------------------------------------------------------------------------
+# RUN_ID keeps the cluster name unique per run: pass github.run_id in GHA;
+# without it a UTC timestamp is used.
+RUN_ID="${RUN_ID:-$(date -u +%Y%m%d%H%M%S)}"
+
+CLUSTER_NAME="${CLUSTER_NAME:-pg-backup-test-${RUN_ID}}"
+
+# ---------------------------------------------------------------------------
+# DigitalOcean / DOKS
+# ---------------------------------------------------------------------------
+REGION="${REGION:-nyc3}"
+NODE_SIZE="${NODE_SIZE:-s-4vcpu-8gb}"
+NODE_COUNT="${NODE_COUNT:-2}"
+
+# The caller exports DIGITALOCEAN_TOKEN (GHA secret or local env).
+# The CI dependency step is expected to have initialised doctl auth already;
+# lib/preflight.sh re-initialises it from the token when that is not the case.
+
+# ---------------------------------------------------------------------------
+# Kubernetes / app config
+# ---------------------------------------------------------------------------
+NAMESPACE="${NAMESPACE:-prod-postgresql}"
+RECOVERY_TAG="${RECOVERY_TAG:-postgresql-first-recovery-test}"
+ARGOCD_CHART_VERSION="${ARGOCD_CHART_VERSION:-9.5.22}"
+
+# Repo-relative path to the app-of-app helm chart.
+# Resolved against REPO_ROOT (computed in run.sh).
+APP_OF_APP_CHART="${APP_OF_APP_CHART:-kubernetes/argocd/app-of-app}"
+
+# ---------------------------------------------------------------------------
+# Timeouts (duration strings, plus a poll count and an interval in seconds)
+# ---------------------------------------------------------------------------
+ARGOCD_ROLLOUT_TIMEOUT="${ARGOCD_ROLLOUT_TIMEOUT:-120s}"
+HELM_INSTALL_TIMEOUT="${HELM_INSTALL_TIMEOUT:-3m}"
+CLUSTER_HEALTHY_ATTEMPTS="${CLUSTER_HEALTHY_ATTEMPTS:-90}"   # x10s sleep => 15 min
+CLUSTER_HEALTHY_INTERVAL="${CLUSTER_HEALTHY_INTERVAL:-10}"
+
+# ---------------------------------------------------------------------------
+# JuiceFS (read-only DR mount)
+# ---------------------------------------------------------------------------
+JUICEFS_ENABLED="${JUICEFS_ENABLED:-true}"
+JUICEFS_READONLY="${JUICEFS_READONLY:-true}"          # injects the `ro` mount option
+JUICEFS_NAMESPACE="${JUICEFS_NAMESPACE:-juicefs}"
+JUICEFS_SECRET_NAME="${JUICEFS_SECRET_NAME:-cloudflare-r2}"
+JUICEFS_VOLUME_NAME="${JUICEFS_VOLUME_NAME:-cloudflare-r2-prod}"
+JUICEFS_BUCKET="${JUICEFS_BUCKET:-https://4c8ad4e9fa8213af3fd284bb97b68b5e.r2.cloudflarestorage.com/juicefs-prod}"
+# JSON default: its closing brace would end a ${VAR:-...} expansion early,
+# so the fallback is applied with an explicit emptiness test instead.
+[ -n "${JUICEFS_ENVS:-}" ] \
+  || JUICEFS_ENVS='{"JFS_MOUNT_TIMEOUT": 300}'
+
+# Metadata engine (the restored CNPG cluster holding the juicefs_prod DB).
+# A CNPG cluster named "postgresql" exposes its rw service as "postgresql-rw".
+JUICEFS_META_USER="${JUICEFS_META_USER:-juicefs}"
+JUICEFS_META_HOST="${JUICEFS_META_HOST:-postgresql-rw.${NAMESPACE}.svc}"
+JUICEFS_META_DB="${JUICEFS_META_DB:-juicefs_prod}"
+# Export JUICEFS_META_PASSWORD (the juicefs DB role password from the restored
+# cluster), or set JUICEFS_METAURL to supply the complete URL yourself.
+JUICEFS_METAURL="${JUICEFS_METAURL:-postgres://${JUICEFS_META_USER}:${JUICEFS_META_PASSWORD:-}@${JUICEFS_META_HOST}:5432/${JUICEFS_META_DB}?sslmode=disable}"
+
+# Label selector used when waiting for the CSI node/controller workloads.
+JUICEFS_CSI_SELECTOR="${JUICEFS_CSI_SELECTOR:-app.kubernetes.io/name=juicefs-csi-driver}"
+
+# ---------------------------------------------------------------------------
+# Vaultwarden (validated against the read-only JuiceFS mount)
+# ---------------------------------------------------------------------------
+VAULTWARDEN_NAMESPACE="${VAULTWARDEN_NAMESPACE:-vaultwarden}"
+VAULTWARDEN_DEPLOYMENT="${VAULTWARDEN_DEPLOYMENT:-vaultwarden}"
+VAULTWARDEN_SERVICE="${VAULTWARDEN_SERVICE:-vaultwarden}"
+VAULTWARDEN_DATA_PATH="${VAULTWARDEN_DATA_PATH:-/data}"        # JuiceFS-backed data dir in the pod
+VAULTWARDEN_LOCAL_PORT="${VAULTWARDEN_LOCAL_PORT:-8080}"
+VAULTWARDEN_ROLLOUT_TIMEOUT="${VAULTWARDEN_ROLLOUT_TIMEOUT:-300s}"
+# Optional: export VW_ADMIN_TOKEN to also assert the restored user count.
+
+# ---------------------------------------------------------------------------
+# Validation expectations
+# ---------------------------------------------------------------------------
+# Databases that must exist after recovery, separated by spaces.
+EXPECTED_DBS="${EXPECTED_DBS:-sonarqube}"
+
+# ---------------------------------------------------------------------------
+# Secrets (required for the secrets step). Exported by caller.
+# ---------------------------------------------------------------------------
+#   R2_ACCESS_KEY
+#   R2_SECRET_KEY
+
+# ---------------------------------------------------------------------------
+# Behaviour flags
+# ---------------------------------------------------------------------------
+# SKIP_DESTROY=1 leaves the cluster running for inspection after a run.
+SKIP_DESTROY="${SKIP_DESTROY:-0}"
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/lib/log.sh b/disaster-recovery/dr-drill/lib/log.sh
new file mode 100644
index 0000000..a31af38
--- /dev/null
+++ b/disaster-recovery/dr-drill/lib/log.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# lib/log.sh — minimal structured logging helpers.
+
+# ANSI colour codes default to empty and are only filled in when stdout is a
+# terminal, so redirected/CI output stays free of escape sequences.
+_C_RESET=''; _C_BLUE=''; _C_GREEN=''; _C_YELLOW=''; _C_RED=''; _C_DIM=''
+if [[ -t 1 ]]; then
+  _C_RESET=$'\033[0m'; _C_BLUE=$'\033[34m'; _C_GREEN=$'\033[32m'
+  _C_YELLOW=$'\033[33m'; _C_RED=$'\033[31m'; _C_DIM=$'\033[2m'
+fi
+
+_ts() { date -u +'%H:%M:%S'; }   # current UTC time as HH:MM:SS
+
+log_info()  { printf '%s%s%s %s\n'  "${_C_DIM}" "$(_ts)" "${_C_RESET}" "$*"; }
+log_step()  { printf '\n%s==> %s%s\n' "${_C_BLUE}" "$*" "${_C_RESET}"; }
+log_ok()    { printf '%s[OK]%s %s\n'   "${_C_GREEN}" "${_C_RESET}" "$*"; }
+log_warn()  { printf '%s[WARN]%s %s\n' "${_C_YELLOW}" "${_C_RESET}" "$*" >&2; }
+log_error() { printf '%s[ERROR]%s %s\n' "${_C_RED}" "${_C_RESET}" "$*" >&2; }
+
+# die <message> — print the message through log_error, then exit with status 1.
+die() { log_error "$*"; exit 1; }
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/lib/preflight.sh b/disaster-recovery/dr-drill/lib/preflight.sh
new file mode 100644
index 0000000..d1d7579
--- /dev/null
+++ b/disaster-recovery/dr-drill/lib/preflight.sh
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+# lib/preflight.sh — verify required tooling and environment before doing work.
+
+# require_cmd <name>... — die unless every named command resolves through
+# `command -v`; each missing one is reported before the final error.
+require_cmd() {
+  local tool absent=0
+  for tool in "$@"; do
+    command -v "$tool" >/dev/null 2>&1 && continue
+    log_error "required command not found: $tool"
+    absent=1
+  done
+  if [ "$absent" -ne 0 ]; then die "install missing CLIs (these are provided by the CI dependency step)"; fi
+}
+
+# require_env <name>... — die unless every named variable holds a non-empty
+# value; each offending name is reported before the final error.
+require_env() {
+  local var_name unset_seen=0
+  for var_name in "$@"; do
+    [ -n "${!var_name:-}" ] && continue
+    log_error "required environment variable not set: $var_name"
+    unset_seen=1
+  done
+  if [ "$unset_seen" -ne 0 ]; then die "export the missing variables (pass GHA secrets through env)"; fi
+}
+
+# ensure_doctl_auth — no-op when `doctl account get` already succeeds;
+# otherwise runs `doctl auth init` with DIGITALOCEAN_TOKEN, or dies when the
+# token is empty.
+ensure_doctl_auth() {
+  # Already authenticated: nothing to do.
+  doctl account get >/dev/null 2>&1 && return 0
+  # Without a token there is no way to authenticate.
+  [ -n "${DIGITALOCEAN_TOKEN:-}" ] \
+    || die "doctl is not authenticated and DIGITALOCEAN_TOKEN is unset"
+  log_info "initialising doctl auth from DIGITALOCEAN_TOKEN"
+  doctl auth init --access-token "$DIGITALOCEAN_TOKEN" >/dev/null
+}
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/readme.md b/disaster-recovery/dr-drill/readme.md
new file mode 100644
index 0000000..dfbfd85
--- /dev/null
+++ b/disaster-recovery/dr-drill/readme.md
@@ -0,0 +1,93 @@
+# PostgreSQL DR Drill
+
+Automated disaster-recovery test for the homelab PostgreSQL (CNPG) cluster.
+Provisions a throwaway DOKS cluster, restores PostgreSQL from R2 WAL archives
+via ArgoCD + CNPG bootstrap, validates the restored data, then tears everything
+down.
+
+All logic lives in bash so it runs **identically locally and in CI**. GitHub
+Actions only installs the CLIs and calls `run.sh`.
+
+## Layout
+
+```
+dr-drill/
+├── config.sh                 # all env vars + defaults (override by exporting)
+├── run.sh                    # orchestrator; guaranteed cleanup via EXIT trap
+├── lib/
+│   ├── log.sh                # logging helpers
+│   └── preflight.sh          # CLI/env checks, doctl auth
+├── scripts/
+│   ├── 00-create-doks-cluster.sh # provision DOKS + kubeconfig
+│   ├── 01-install-argocd.sh   # helm install ArgoCD, wait ready
+│   ├── 02-install-app.sh      # juicefs secret (pre-install), app-of-app,
+│   │                          #   cert-manager, secrets, juicefs sync
+│   ├── 03-recover-pg.sh       # set recovery tag, sync, wait healthy
+│   ├── 04-validate.sh         # verify restored databases/tables
+│   ├── 05-validate-vaultwarden.sh # deploy + validate VW on read-only JuiceFS
+│   └── 99-destroy.sh          # cluster + orphaned DO volumes/LBs
+├── juicefs-application.yaml   # patched ArgoCD Application (conditional `ro`)
+└── .github/workflows/dr-drill.yml
+```
+
+Each `scripts/*.sh` is independently runnable (`./scripts/05-validate-vaultwarden.sh`)
+**and** sourceable by `run.sh`. The functions don't execute on source — only
+when the file is run directly (the `BASH_SOURCE` guard at the bottom).
+
+## Drill flow (run.sh)
+
+```
+create_cluster → install_argocd
+create_juicefs_secret      # BEFORE install: StorageClass references it
+install_app_of_app         # juicefs.enabled=true, juicefs.readOnly=true
+sync_cert_manager → create_secrets
+recover_postgres → validate_data
+sync_juicefs               # AFTER postgres: mount pod needs the metaurl DB
+sync_vaultwarden → validate_vaultwarden
+(EXIT trap) → destroy
+```
+
+## Usage
+
+```bash
+# full drill
+export DIGITALOCEAN_TOKEN=... R2_ACCESS_KEY=... R2_SECRET_KEY=...
+export JUICEFS_META_PASSWORD=...          # juicefs DB role pw in the restored cluster
+export VW_ADMIN_TOKEN=...                  # optional: enables user-count assertion
+./run.sh
+
+# keep the cluster up for inspection (skips teardown)
+./run.sh --skip-destroy
+
+# clean up a leaked run later
+CLUSTER_NAME=pg-backup-test-123 ./run.sh destroy
+```
+
+## JuiceFS read-only protection
+
+The drill mounts JuiceFS **read-only** (`ro` injected into StorageClass
+`mountOptions`). This does two things at once:
+
+1. Writes to the prod R2 prefix are rejected — no split-brain corruption.
+2. Background GC / trash cleanup is disabled — the DR mount can never delete
+   production objects.
+
+`05-validate-vaultwarden.sh` proves this by exec-ing into the Vaultwarden pod
+and asserting a write into the data dir fails with `Read-only file system`.
+
+**Apply the patched Application:** replace your existing JuiceFS Application
+template with `juicefs-application.yaml` (it adds the conditional `ro` block
+gated on `.Values.juicefs.readOnly`). For normal homelab/prod sync, leave
+`juicefs.readOnly` unset/false so the mount stays writable.
+
+## Notes
+
+- `run.sh` registers an `EXIT INT TERM` trap, so the cluster is destroyed even
+  on failure or Ctrl-C — the equivalent of GHA's `if: always()`.
+- `99-destroy.sh` deliberately omits `set -e`: cleanup must continue past
+  individual failures.
+- The PG app revision override targets `--source-position 1` (the Git source).
+  Position 2 is the Helm chart source and requires a SemVer constraint, not a
+  Git tag.
+- Required CLIs: `doctl`, `kubectl`, `helm`, `argocd`, `jq`,
+  `curl`.
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/run.sh b/disaster-recovery/dr-drill/run.sh
new file mode 100644
index 0000000..882f57e
--- /dev/null
+++ b/disaster-recovery/dr-drill/run.sh
@@ -0,0 +1,91 @@
+#!/usr/bin/env bash
+# run.sh — orchestrate the full PostgreSQL DR drill.
+#
+# Usage:
+#   ./run.sh                  # full drill: create -> recover -> validate -> destroy
+#   ./run.sh --skip-destroy   # leave the cluster up for inspection
+#   ./run.sh destroy          # destroy only (e.g. to clean up a leaked run)
+#
+# Required env (typically GHA secrets passed through):
+#   DIGITALOCEAN_TOKEN, R2_ACCESS_KEY, R2_SECRET_KEY
+#
+# Optional overrides: see config.sh (REGION, NODE_SIZE, RUN_ID, etc.)
+set -euo pipefail
+
+# DRILL_DIR holds this script; REPO_ROOT is the repo root two levels above it (disaster-recovery/dr-drill/).
+DRILL_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="${REPO_ROOT:-$(cd "$DRILL_DIR/../.." && pwd)}"; export REPO_ROOT
+# --- load config + libs ----------------------------------------------------
+source "$DRILL_DIR/config.sh"
+source "$DRILL_DIR/lib/log.sh"
+source "$DRILL_DIR/lib/preflight.sh"
+
+# --- load step functions (names must match the files under scripts/) -------
+source "$DRILL_DIR/scripts/00-create-doks-cluster.sh"
+source "$DRILL_DIR/scripts/01-install-argocd.sh"
+source "$DRILL_DIR/scripts/02-install-app.sh"
+source "$DRILL_DIR/scripts/03-recover-pg.sh"
+source "$DRILL_DIR/scripts/04-validate.sh"
+source "$DRILL_DIR/scripts/05-validate-vaultwarden.sh"
+source "$DRILL_DIR/scripts/99-destroy.sh"
+
+# --- argument parsing ------------------------------------------------------
+DESTROY_ONLY=0
+for arg in "$@"; do
+  case "$arg" in
+    --skip-destroy) SKIP_DESTROY=1 ;;
+    destroy)        DESTROY_ONLY=1 ;;
+    *) die "unknown argument: $arg" ;;
+  esac
+done
+
+# --- cleanup trap: the `if: always()` equivalent ---------------------------
+# Runs destroy on ANY exit (success, failure, or interrupt) unless skipped, then exits with the original status.
+cleanup() {
+  local rc=$?; trap - EXIT INT TERM  # disarm: `exit` below must not re-run cleanup after an INT/TERM
+  if [ "$SKIP_DESTROY" = "1" ]; then
+    log_warn "SKIP_DESTROY set — leaving cluster '$CLUSTER_NAME' running"
+    log_warn "clean up later with: ./run.sh destroy   (CLUSTER_NAME=$CLUSTER_NAME)"
+  else
+    log_step "Cleanup (exit code: $rc)"
+    destroy_cluster || log_warn "destroy encountered errors (best-effort)"
+  fi
+  exit "$rc"
+}
+
+# main — full drill, or teardown only for `destroy`; `set -e` stops it at the first failing step.
+main() {
+  require_cmd doctl kubectl helm argocd jq curl
+  require_env DIGITALOCEAN_TOKEN
+  ensure_doctl_auth
+  if [ "$DESTROY_ONLY" = "1" ]; then
+    destroy_cluster
+    return 0
+  fi
+  require_env R2_ACCESS_KEY R2_SECRET_KEY  # fail fast: create_secrets needs both, so check before a cluster is created
+  # Register cleanup only for the full drill path.
+  trap cleanup EXIT INT TERM
+
+  create_cluster
+  install_argocd
+  wait_for_argocd
+
+  # JuiceFS secret must exist before the StorageClass (existingSecret) is created.
+  create_juicefs_secret
+  install_app_of_app
+  sync_cert_manager
+  create_secrets
+
+  # PostgreSQL first — JuiceFS metadata + Vaultwarden DB both live here.
+  recover_postgres
+  validate_data
+
+  # Data layer: JuiceFS (read-only) then Vaultwarden on top of it.
+  sync_juicefs
+  sync_vaultwarden
+  validate_vaultwarden
+
+  log_ok "DR drill succeeded for cluster $CLUSTER_NAME"
+}
+
+main
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/scripts/00-create-doks-cluster.sh b/disaster-recovery/dr-drill/scripts/00-create-doks-cluster.sh
new file mode 100755
index 0000000..d3f5af4
--- /dev/null
+++ b/disaster-recovery/dr-drill/scripts/00-create-doks-cluster.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+# scripts/00-create-doks-cluster.sh — provision the DOKS cluster and load kubeconfig.
+set -euo pipefail
+
+# create_cluster — create the DOKS cluster via doctl (blocking on --wait), save
+# its kubeconfig, and make argocd the current context's default namespace.
+create_cluster() {
+  log_step "Creating DOKS cluster: $CLUSTER_NAME ($REGION, ${NODE_COUNT}x $NODE_SIZE)"
+  local -a create_flags=(--region "$REGION" --size "$NODE_SIZE" --count "$NODE_COUNT" --wait)
+  doctl kubernetes cluster create "$CLUSTER_NAME" "${create_flags[@]}"
+  log_ok "Cluster created"
+
+  log_step "Saving kubeconfig"
+  doctl kubernetes cluster kubeconfig save "$CLUSTER_NAME"
+  # Later kubectl calls without -n then target the argocd namespace.
+  kubectl config set-context --current --namespace=argocd
+  log_ok "kubeconfig loaded, default namespace set to argocd"
+}
+
+# Standalone entry point (./scripts/00-create-doks-cluster.sh); skipped when sourced.
+if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
+  HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+  # shellcheck source=../config.sh
+  source "$HERE/config.sh"
+  # shellcheck source=../lib/log.sh
+  source "$HERE/lib/log.sh"
+  # shellcheck source=../lib/preflight.sh
+  source "$HERE/lib/preflight.sh"
+  require_cmd doctl kubectl
+  ensure_doctl_auth
+  create_cluster
+fi
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/scripts/01-install-argocd.sh b/disaster-recovery/dr-drill/scripts/01-install-argocd.sh
new file mode 100755
index 0000000..42150f5
--- /dev/null
+++ b/disaster-recovery/dr-drill/scripts/01-install-argocd.sh
@@ -0,0 +1,35 @@
+#!/usr/bin/env bash
+# scripts/01-install-argocd.sh — install ArgoCD via Helm and wait for it to be ready.
+set -euo pipefail
+
+# install_argocd — register/refresh the argo Helm repo, then install or upgrade
+# the argo-cd chart (pinned to $ARGOCD_CHART_VERSION) in the argocd namespace.
+install_argocd() {
+  log_step "Installing ArgoCD (chart $ARGOCD_CHART_VERSION)"
+  # Best-effort: a failing `repo add` is ignored; `repo update` must succeed.
+  helm repo add argo https://argoproj.github.io/argo-helm >/dev/null 2>&1 || true
+  helm repo update argo >/dev/null
+
+  local -a chart_flags=(--version "$ARGOCD_CHART_VERSION" --namespace argocd --create-namespace --debug)
+  helm upgrade --install argocd argo/argo-cd "${chart_flags[@]}" \
+    --set 'configs.params.server\.insecure=true'
+  log_ok "ArgoCD installed"
+}
+
+wait_for_argocd() {  # block until each listed deployment has rolled out ($ARGOCD_ROLLOUT_TIMEOUT each)
+  log_step "Waiting for ArgoCD core deployments"
+  local deploy
+  for deploy in argocd-server argocd-repo-server argocd-applicationset-controller; do
+    log_info "rollout: $deploy"
+    kubectl -n argocd rollout status "deployment/$deploy" --timeout="$ARGOCD_ROLLOUT_TIMEOUT"
+  done
+  log_ok "ArgoCD is ready"
+}
+
+if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then  # executed directly, not sourced
+  HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+  source "$HERE/config.sh"
+  source "$HERE/lib/log.sh"; source "$HERE/lib/preflight.sh"
+  require_cmd helm kubectl
+  install_argocd; wait_for_argocd
+fi
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/scripts/02-install-app.sh b/disaster-recovery/dr-drill/scripts/02-install-app.sh
new file mode 100755
index 0000000..89d51fc
--- /dev/null
+++ b/disaster-recovery/dr-drill/scripts/02-install-app.sh
@@ -0,0 +1,122 @@
+#!/usr/bin/env bash
+# scripts/02-install-app.sh — JuiceFS secret, app-of-app, cert-manager, secrets.
+set -euo pipefail
+
+# create_juicefs_secret — apply the JuiceFS namespace and credentials secret.
+# Run it BEFORE install_app_of_app: the JuiceFS StorageClass references
+# existingSecret: <JUICEFS_SECRET_NAME>. Field names follow the
+# wener/juicefs-csi-driver existingSecret schema.
+create_juicefs_secret() {
+  if [ "$JUICEFS_ENABLED" != "true" ]; then
+    log_info "JuiceFS disabled — skipping secret"
+    return 0
+  fi
+  log_step "Creating JuiceFS namespace and credentials secret"
+  require_env R2_ACCESS_KEY R2_SECRET_KEY
+
+  # Warn (do not fail) when the metaurl was built without a password.
+  [ -n "${JUICEFS_META_PASSWORD:-}" ] || [[ "$JUICEFS_METAURL" != *"//${JUICEFS_META_USER}:@"* ]] \
+    || log_warn "JUICEFS_META_PASSWORD is empty — metaurl will have no password"
+
+  kubectl create namespace "$JUICEFS_NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -
+
+  local -a fields=(
+    --from-literal=name="$JUICEFS_VOLUME_NAME" --from-literal=metaurl="$JUICEFS_METAURL"
+    --from-literal=storage="s3" --from-literal=bucket="$JUICEFS_BUCKET"
+    --from-literal=accessKey="$R2_ACCESS_KEY" --from-literal=secretKey="$R2_SECRET_KEY"
+    --from-literal=envs="$JUICEFS_ENVS"
+  )
+  kubectl create secret generic "$JUICEFS_SECRET_NAME" -n "$JUICEFS_NAMESPACE" "${fields[@]}" \
+    --dry-run=client -o yaml | kubectl apply -f -
+  log_ok "JuiceFS secret '$JUICEFS_SECRET_NAME' applied in '$JUICEFS_NAMESPACE'"
+}
+
+# install_app_of_app — helm-install the app-of-app chart found at
+# $REPO_ROOT/$APP_OF_APP_CHART into the argocd namespace. Component toggles
+# are fixed here except juicefs.enabled / juicefs.readOnly.
+install_app_of_app() {
+  log_step "Installing app-of-app chart (juicefs.enabled=$JUICEFS_ENABLED, readOnly=$JUICEFS_READONLY)"
+  local kv
+  local -a set_flags=()
+  # Each key=value below becomes one `--set` flag, in this order.
+  for kv in \
+    metallb.enabled=false traefik.enabled=true openebs.enabled=false \
+    postgresql.enabled=true certManager.enabled=true kubePrometheusStack.enabled=true \
+    customManifest.enabled=false loki.enabled=false alloy.enabled=false \
+    pgadmin4.enabled=false sonarqube.enabled=false harbor.enabled=false \
+    velero.enabled=false mongoOperator.enabled=false kafkaOperator.enabled=false \
+    "juicefs.enabled=$JUICEFS_ENABLED" "juicefs.readOnly=$JUICEFS_READONLY" \
+    vaultwarden.enabled=true
+  do
+    set_flags+=(--set "$kv")
+  done
+
+  helm upgrade --install app-of-app "$REPO_ROOT/$APP_OF_APP_CHART" \
+    --namespace argocd "${set_flags[@]}"
+  log_ok "app-of-app installed"
+}
+
+sync_cert_manager() {  # sync the cert-manager Application, retrying with backoff
+  log_step "Syncing cert-manager"
+  local -a retry=(
+    --retry-limit 5 --retry-backoff-duration 10s
+    --retry-backoff-max-duration 3m --retry-backoff-factor 2
+  )
+  argocd app sync cert-manager --core "${retry[@]}"
+  log_ok "cert-manager synced"
+}
+
+# create_secrets — apply the $NAMESPACE namespace plus the two secrets the
+# drill creates there: cloudflare-r2 (R2 keys) and postgres-admin (dummy login).
+create_secrets() {
+  log_step "Creating namespace and secrets in $NAMESPACE"
+  require_env R2_ACCESS_KEY R2_SECRET_KEY
+  local -a render=(--dry-run=client -o yaml)   # rendered client-side, then piped to `kubectl apply`
+
+  kubectl create namespace "$NAMESPACE" "${render[@]}" | kubectl apply -f -
+
+  kubectl create secret generic cloudflare-r2 -n "$NAMESPACE" \
+    --from-literal=ACCESS_KEY="$R2_ACCESS_KEY" \
+    --from-literal=SECRET_KEY="$R2_SECRET_KEY" \
+    "${render[@]}" | kubectl apply -f -
+
+  kubectl create secret generic postgres-admin -n "$NAMESPACE" \
+    --from-literal=username=postgres --from-literal=password=backup-test-dummy \
+    "${render[@]}" | kubectl apply -f -
+
+  log_ok "Secrets applied"
+}
+
+# sync_juicefs — sync the juicefs Application, then wait (best-effort) for the
+# CSI workloads. Call it AFTER PostgreSQL is healthy: the mount pod needs the
+# metaurl DB reachable.
+sync_juicefs() {
+  if [ "$JUICEFS_ENABLED" != "true" ]; then
+    log_info "JuiceFS disabled — skipping sync"
+    return 0
+  fi
+
+  log_step "Syncing juicefs Application"
+  argocd app sync juicefs --core --retry-limit 5 --retry-backoff-duration 10s \
+    --retry-backoff-max-duration 3m --retry-backoff-factor 2
+
+  log_step "Waiting for JuiceFS CSI components"
+  # Controller is a StatefulSet, node service a DaemonSet; a failed or
+  # timed-out wait only produces a warning.
+  kubectl -n "$JUICEFS_NAMESPACE" rollout status statefulset -l "$JUICEFS_CSI_SELECTOR" --timeout=180s 2>/dev/null \
+    || log_warn "could not confirm CSI controller rollout (continuing)"
+  kubectl -n "$JUICEFS_NAMESPACE" rollout status daemonset -l "$JUICEFS_CSI_SELECTOR" --timeout=180s 2>/dev/null \
+    || log_warn "could not confirm CSI node rollout (continuing)"
+  log_ok "JuiceFS synced"
+}
+
+if [ "${BASH_SOURCE[0]}" = "${0}" ]; then  # executed directly, not sourced
+  HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+  REPO_ROOT="${REPO_ROOT:-$(cd "$HERE/../.." && pwd)}"  # repo root is two levels above dr-drill/; APP_OF_APP_CHART is repo-relative
+  source "$HERE/config.sh"; source "$HERE/lib/log.sh"; source "$HERE/lib/preflight.sh"
+  require_cmd helm kubectl argocd
+  create_juicefs_secret
+  install_app_of_app
+  sync_cert_manager
+  create_secrets
+fi
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/scripts/03-recover-pg.sh b/disaster-recovery/dr-drill/scripts/03-recover-pg.sh
new file mode 100755
index 0000000..249b197
--- /dev/null
+++ b/disaster-recovery/dr-drill/scripts/03-recover-pg.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# scripts/03-recover-pg.sh — point PG app at the recovery tag, sync, wait healthy.
+set -euo pipefail
+
+# override_revision — pin one source of the prod-postgresql app to $RECOVERY_TAG.
+# PG_SOURCE_POSITION (1-based, default 2) picks the source. NOTE(review): readme.md says the Git source is position 1 — confirm.
+override_revision() {
+  log_step "Overriding prod-postgresql revision -> $RECOVERY_TAG"
+  argocd app set prod-postgresql --core \
+    --source-position "${PG_SOURCE_POSITION:-2}" --revision "$RECOVERY_TAG"
+  log_ok "Revision set"
+}
+
+# sync_postgres_app — best-effort sync of the whole app, then of the CNPG
+# Cluster resource only. A failure of either sync is logged and ignored, so
+# this function itself never fails the drill.
+sync_postgres_app() {
+  local -a backoff=(--retry-backoff-duration 5s --retry-backoff-max-duration 1m --retry-backoff-factor 2)
+
+  log_step "Syncing prod-postgresql (app)"
+  if ! argocd app sync prod-postgresql --core --retry-limit 3 "${backoff[@]}"; then
+    log_warn "app sync returned non-zero (continuing)"
+  fi
+
+  log_step "Syncing prod-postgresql (Cluster resource)"
+  if ! argocd app sync prod-postgresql --core \
+    --resource postgresql.cnpg.io:Cluster:postgresql --retry-limit 5 "${backoff[@]}"; then
+    log_warn "cluster-resource sync returned non-zero (continuing)"
+  fi
+}
+
+# wait_for_healthy — poll the CNPG Cluster's .status.phase up to
+# $CLUSTER_HEALTHY_ATTEMPTS times, $CLUSTER_HEALTHY_INTERVAL seconds apart.
+# Returns 0 on "Cluster in healthy state"; else dumps diagnostics, returns 1.
+wait_for_healthy() {
+  log_step "Waiting for CNPG cluster to reach healthy state"
+  local attempt status
+  for attempt in $(seq 1 "$CLUSTER_HEALTHY_ATTEMPTS"); do
+    status=$(kubectl get cluster postgresql -n "$NAMESPACE" \
+      -o jsonpath='{.status.phase}' 2>/dev/null || echo "unknown")
+    log_info "attempt $attempt/$CLUSTER_HEALTHY_ATTEMPTS: phase=$status"
+    [ "$status" != "Cluster in healthy state" ] || { log_ok "Cluster is healthy"; return 0; }
+    sleep "$CLUSTER_HEALTHY_INTERVAL"
+  done
+
+  log_error "Cluster did not reach healthy state in time — dumping diagnostics"
+  kubectl get cluster postgresql -n "$NAMESPACE" -o yaml || true
+  kubectl get pods -n "$NAMESPACE" -l cnpg.io/cluster=postgresql || true
+  return 1
+}
+
+# recover_postgres — run the recovery steps in order: pin revision, sync, wait.
+recover_postgres() {
+  local step
+  for step in override_revision sync_postgres_app wait_for_healthy; do "$step"; done
+}
+
+if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then  # executed directly, not sourced
+  HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+  source "$HERE/config.sh"; source "$HERE/lib/log.sh"; source "$HERE/lib/preflight.sh"
+  require_cmd kubectl argocd
+  recover_postgres
+fi
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/scripts/04-validate.sh b/disaster-recovery/dr-drill/scripts/04-validate.sh
new file mode 100644
index 0000000..eda411a
--- /dev/null
+++ b/disaster-recovery/dr-drill/scripts/04-validate.sh
@@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+# scripts/04-validate.sh — verify the restored PostgreSQL data.
+set -euo pipefail
+
+_primary_pod() {  # prints the name of the first pod matching the primary-role selector
+  local selector='cnpg.io/cluster=postgresql,role=primary'
+  kubectl get pods -n "$NAMESPACE" -l "$selector" \
+    -o jsonpath='{.items[0].metadata.name}'
+}
+
+# validate_data — check the restored cluster via psql on the primary pod: connectivity, a listing,
+# and presence of every $EXPECTED_DBS entry (dies if one is missing). Table counts are only logged.
+validate_data() {
+  log_step "Validating restored data"
+  # `|| pod=""`: with no matching pod the lookup exits non-zero; under `set -e` that would abort before `die`.
+  local pod db count table_count
+  pod="$(_primary_pod)" || pod=""
+  [ -n "$pod" ] || die "could not find primary PostgreSQL pod"
+  log_info "primary pod: $pod"
+
+  log_info "--- connectivity check ---"
+  kubectl exec -n "$NAMESPACE" "$pod" -- \
+    psql -U postgres -c "SELECT 1 AS connectivity_check;"
+
+  log_info "--- database listing ---"
+  kubectl exec -n "$NAMESPACE" "$pod" -- \
+    psql -U postgres -c "\l"
+
+  log_info "--- verifying expected databases ---"
+  for db in $EXPECTED_DBS; do
+    count=$(kubectl exec -n "$NAMESPACE" "$pod" -- \
+      psql -U postgres -tAc \
+      "SELECT count(*) FROM pg_database WHERE datname = '$db';")
+    [ "${count:-0}" -gt 0 ] || die "database '$db' not found"  # empty/non-numeric output also fails
+    log_ok "database '$db' exists"
+  done
+
+  log_info "--- counting user tables ---"
+  for db in $EXPECTED_DBS; do
+    table_count=$(kubectl exec -n "$NAMESPACE" "$pod" -- \
+      psql -U postgres -d "$db" -tAc \
+      "SELECT count(*) FROM pg_catalog.pg_tables WHERE schemaname NOT IN ('pg_catalog','information_schema');" \
+      2>/dev/null || echo "0")
+    log_info "database '$db': $table_count user table(s)"
+  done
+
+  log_ok "All validation checks passed"
+}
+
+
+if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then  # executed directly, not sourced
+  HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+  source "$HERE/config.sh"; source "$HERE/lib/log.sh"; source "$HERE/lib/preflight.sh"
+  require_cmd kubectl
+  validate_data
+fi
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/scripts/05-validate-vaultwarden.sh b/disaster-recovery/dr-drill/scripts/05-validate-vaultwarden.sh
new file mode 100644
index 0000000..4adc6b1
--- /dev/null
+++ b/disaster-recovery/dr-drill/scripts/05-validate-vaultwarden.sh
@@ -0,0 +1,134 @@
+#!/usr/bin/env bash
+# scripts/05-validate-vaultwarden.sh — deploy + validate Vaultwarden on the
+# read-only JuiceFS mount. Replaces the un-automatable "log in via UI" step.
+set -euo pipefail
+
+sync_vaultwarden() {
+  log_step "Syncing vaultwarden Application"
+  argocd app sync vaultwarden --core \
+    --retry-limit 5 \
+    --retry-backoff-duration 5s \
+    --retry-backoff-max-duration 1m \
+    --retry-backoff-factor 2 || log_warn "vaultwarden sync returned non-zero (continuing)"
+
+  log_step "Waiting for Vaultwarden rollout"
+  kubectl rollout status "deployment/$VAULTWARDEN_DEPLOYMENT" \
+    -n "$VAULTWARDEN_NAMESPACE" \
+    --timeout="$VAULTWARDEN_ROLLOUT_TIMEOUT"
+  log_ok "Vaultwarden is running"
+}
+
+_vw_pod() {  # Print the name of the first Vaultwarden pod; print nothing if none matches.
+  kubectl get pods -n "$VAULTWARDEN_NAMESPACE" \
+    -l "app=$VAULTWARDEN_DEPLOYMENT" \
+    -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true  # stay 0 under set -e so callers can die() with a clear message
+}
+
+# 1) Process health via HTTP — proves the server started and serves requests.
+check_health_endpoint() {
+  log_step "Checking Vaultwarden HTTP health"
+  kubectl port-forward "svc/$VAULTWARDEN_SERVICE" \
+    "${VAULTWARDEN_LOCAL_PORT}:80" -n "$VAULTWARDEN_NAMESPACE" >/dev/null 2>&1 &
+  local pf_pid=$!
+  # Kill the port-forward on return; the trap then clears itself so it cannot re-fire in a caller (pf_pid is local, set -u).
+  trap 'kill "$pf_pid" 2>/dev/null || true; trap - RETURN' RETURN
+  sleep 5  # give the background port-forward time to start listening
+
+  curl --fail --silent --show-error --retry 5 --retry-delay 3 \
+    "http://localhost:${VAULTWARDEN_LOCAL_PORT}/alive" >/dev/null
+  curl --fail --silent --show-error --retry 5 --retry-delay 3 \
+    "http://localhost:${VAULTWARDEN_LOCAL_PORT}/api/config" >/dev/null
+  log_ok "Vaultwarden is alive and serving /api/config"
+}
+
+# 2) Read-only enforcement — a write into the JuiceFS-backed data dir MUST fail.
+#    This is the split-brain guarantee: DR cannot mutate the prod R2 prefix.
+check_mount_is_readonly() {
+  log_step "Verifying JuiceFS mount is read-only"
+  local pod
+  pod="$(_vw_pod)"
+  [ -n "$pod" ] || die "could not find Vaultwarden pod"
+
+  local out
+  out=$(kubectl exec -n "$VAULTWARDEN_NAMESPACE" "$pod" -- \
+    sh -c "touch ${VAULTWARDEN_DATA_PATH}/.dr-write-test 2>&1" || true)
+
+  if printf '%s' "$out" | grep -qiE 'read-only file system|permission denied'; then
+    log_ok "write correctly rejected: ${out}"
+  else
+    # Clean up if the write unexpectedly succeeded, then fail hard.
+    kubectl exec -n "$VAULTWARDEN_NAMESPACE" "$pod" -- \
+      rm -f "${VAULTWARDEN_DATA_PATH}/.dr-write-test" 2>/dev/null || true
+    die "mount is NOT read-only — DR could corrupt prod R2 (output: '${out:-<empty, write succeeded>}')"
+  fi
+}
+
+# 3) Data is actually readable — proves restored metadata maps to real objects.
+check_data_readable() {
+  log_step "Verifying restored data is readable via JuiceFS"
+  local pod count
+  pod="$(_vw_pod)"; [ -n "$pod" ] || die "could not find Vaultwarden pod"  # same guard as check_mount_is_readonly
+  count=$(kubectl exec -n "$VAULTWARDEN_NAMESPACE" "$pod" -- \
+    sh -c "ls -A ${VAULTWARDEN_DATA_PATH} 2>/dev/null | wc -l" || echo 0)  # exec failure counts as 0 entries
+
+  if [ "${count:-0}" -gt 0 ]; then
+    log_ok "data dir has $count entries — files restored"
+  else
+    die "data dir is empty — JuiceFS restore may have failed"
+  fi
+}
+
+# 4) Log scan — catch DB connection failures / startup panics.
+check_logs_clean() {
+  log_step "Scanning Vaultwarden logs"
+  local logs
+  logs=$(kubectl logs "deployment/$VAULTWARDEN_DEPLOYMENT" \
+    -n "$VAULTWARDEN_NAMESPACE" --tail=100 2>/dev/null || echo "")
+  printf '%s\n' "$logs"
+
+  if printf '%s' "$logs" | grep -qiE 'panic|fatal|database.*(fail|error)|unable to connect'; then
+    die "critical errors found in Vaultwarden logs"
+  fi
+  log_ok "no critical errors in logs"
+}
+
+# 5) Optional deep check — assert restored user count via the admin API.
+check_user_data() {
+  [ -n "${VW_ADMIN_TOKEN:-}" ] || { log_info "VW_ADMIN_TOKEN unset — skipping user-count check"; return 0; }
+
+  log_step "Validating restored user data via admin API"
+  kubectl port-forward "svc/$VAULTWARDEN_SERVICE" \
+    "${VAULTWARDEN_LOCAL_PORT}:80" -n "$VAULTWARDEN_NAMESPACE" >/dev/null 2>&1 &
+  local pf_pid=$!
+  trap 'kill "$pf_pid" 2>/dev/null || true; trap - RETURN' RETURN  # self-clearing: must not re-fire in a caller where pf_pid is unset
+  sleep 5  # give the background port-forward time to start listening
+
+  local users
+  users=$(curl --silent \
+    -H "Authorization: Bearer ${VW_ADMIN_TOKEN}" \
+    "http://localhost:${VAULTWARDEN_LOCAL_PORT}/admin/users/overview" \
+    | jq 'length' 2>/dev/null || echo 0)  # any curl/jq failure is treated as zero users
+
+  if [ "${users:-0}" -gt 0 ]; then
+    log_ok "$users user(s) found — user data restored"
+  else
+    die "no users found — DB restore may be incomplete"
+  fi
+}
+
+validate_vaultwarden() {
+  check_health_endpoint
+  check_mount_is_readonly
+  check_data_readable
+  check_logs_clean
+  check_user_data
+  log_ok "Vaultwarden DR validation passed"
+}
+
+if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
+  HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+  source "$HERE/config.sh"; source "$HERE/lib/log.sh"; source "$HERE/lib/preflight.sh"
+  require_cmd kubectl argocd curl jq
+  sync_vaultwarden
+  validate_vaultwarden
+fi
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/scripts/99-destroy.sh b/disaster-recovery/dr-drill/scripts/99-destroy.sh
new file mode 100755
index 0000000..c33e469
--- /dev/null
+++ b/disaster-recovery/dr-drill/scripts/99-destroy.sh
@@ -0,0 +1,99 @@
+#!/usr/bin/env bash
+# scripts/99-destroy.sh — tear down the DOKS cluster and orphaned DO resources.
+# Safe to run repeatedly; every step is best-effort and idempotent.
+set -uo pipefail   # NOTE: no -e here — cleanup must continue past failures.
+
+destroy_cluster() {
+  log_step "Destroy: looking up cluster $CLUSTER_NAME"
+  local cluster_id
+  cluster_id=$(doctl kubernetes cluster get "$CLUSTER_NAME" \
+    --format ID --no-header 2>/dev/null || echo "")
+
+  if [ -z "$cluster_id" ]; then
+    log_warn "cluster '$CLUSTER_NAME' not found — nothing to clean up"
+    return 0
+  fi
+  log_info "cluster ID: $cluster_id"
+
+  # Collect LoadBalancer IPs while kubectl still works — needed after deletion.
+  local lb_ips
+  lb_ips=$(kubectl get svc -A \
+    -o jsonpath='{.items[?(@.spec.type=="LoadBalancer")].status.loadBalancer.ingress[0].ip}' \
+    2>/dev/null || echo "")
+
+  _scale_down_node_pools
+  _delete_cluster
+  _delete_volumes "$cluster_id"
+  _delete_load_balancers "$cluster_id" "$lb_ips"
+
+  log_ok "Destroy complete"
+}
+
+_scale_down_node_pools() {
+  log_step "Scaling node pools to 0"
+  local pool_ids pool_id
+  pool_ids=$(doctl kubernetes cluster node-pool list "$CLUSTER_NAME" \
+    --format ID --no-header 2>/dev/null || echo "")
+  for pool_id in $pool_ids; do
+    log_info "scaling pool $pool_id -> 0"
+    doctl kubernetes cluster node-pool update "$CLUSTER_NAME" "$pool_id" --count 0 || true
+  done
+  [ -n "$pool_ids" ] && sleep 30 || true
+}
+
+_delete_cluster() {
+  log_step "Deleting DOKS cluster"
+  doctl kubernetes cluster delete "$CLUSTER_NAME" --force --dangerous || true
+}
+
+_delete_volumes() {
+  # DOKS CSI tags every provisioned volume with k8s:<cluster-id>.
+  local cluster_id="$1"
+  log_step "Cleaning up Block Storage volumes"
+  local vol_ids vol_id
+  vol_ids=$(doctl compute volume list -o json \
+    | jq -r --arg tag "k8s:$cluster_id" \
+      '.[] | select(.tags? and (.tags[] == $tag)) | .id' 2>/dev/null || echo "")
+  if [ -z "$vol_ids" ]; then
+    log_info "no volumes found for cluster $cluster_id"
+    return 0
+  fi
+  for vol_id in $vol_ids; do
+    log_info "deleting volume $vol_id"
+    doctl compute volume delete "$vol_id" --force || true
+  done
+}
+
+_delete_load_balancers() {
+  local cluster_id="$1" lb_ips="$2"  # $1: DOKS cluster ID, $2: whitespace-separated LB IPs collected before deletion
+  log_step "Cleaning up Load Balancers"
+
+  # Primary: match by IPs collected before the cluster was deleted.
+  local lb_ip lb_id
+  for lb_ip in $lb_ips; do
+    lb_id=$(doctl compute load-balancer list -o json \
+      | jq -r --arg ip "$lb_ip" '.[] | select(.ip == $ip) | .id' 2>/dev/null || echo "")  # best-effort, like the other lookups
+    if [ -n "$lb_id" ]; then
+      log_info "deleting LB $lb_id (IP: $lb_ip)"
+      doctl compute load-balancer delete "$lb_id" --force || true
+    fi
+  done
+
+  # Fallback: catch any LB still tagged with the cluster ID.
+  local tagged_ids
+  tagged_ids=$(doctl compute load-balancer list -o json \
+    | jq -r --arg tag "k8s:$cluster_id" \
+      '.[] | select(.tags? and (.tags[] == $tag)) | .id' 2>/dev/null || echo "")
+  for lb_id in $tagged_ids; do
+    log_info "deleting tagged LB $lb_id"
+    doctl compute load-balancer delete "$lb_id" --force 2>/dev/null || true
+  done
+}
+
+if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
+  HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+  source "$HERE/config.sh"; source "$HERE/lib/log.sh"; source "$HERE/lib/preflight.sh"
+  require_cmd doctl kubectl jq
+  ensure_doctl_auth
+  destroy_cluster
+fi
\ No newline at end of file
diff --git a/kubernetes/argocd/argocd-app/stateful/postgresql/raw-manifests/instance.yaml b/kubernetes/argocd/argocd-app/stateful/postgresql/raw-manifests/instance.yaml
index 7a69ae4..9ce6c6f 100644
--- a/kubernetes/argocd/argocd-app/stateful/postgresql/raw-manifests/instance.yaml
+++ b/kubernetes/argocd/argocd-app/stateful/postgresql/raw-manifests/instance.yaml
@@ -75,12 +75,22 @@ spec:
   superuserSecret:
     name: postgres-admin
   monitoring:
-    enablePodMonitor: true
-  plugins:
-  - name: barman-cloud.cloudnative-pg.io
-    isWALArchiver: true
-    parameters:
-      barmanObjectName: cloudflare-r2
+    enablePodMonitor: false
+  bootstrap:
+    recovery:
+      source: origin
+  externalClusters:
+  - name: origin
+    plugin:
+      name: barman-cloud.cloudnative-pg.io
+      parameters:
+        barmanObjectName: cloudflare-r2
+        serverName: postgresql
+  # plugins:
+  # - name: barman-cloud.cloudnative-pg.io
+  #   isWALArchiver: true
+  #   parameters:
+  #     barmanObjectName: cloudflare-r2
   managed:
     services:
       additional:
@@ -90,62 +100,62 @@ spec:
             name: postgresql-external-rw
           spec:
             type: LoadBalancer
-    roles:
-    - name: sonarqube
-      ensure: present
-      comment: Role for Sonarqube
-      connectionLimit: 40
-      login: true
-      superuser: false
-      passwordSecret: { name: sonarqube }
-    - name: vaultwarden
-      ensure: present
-      comment: Role for Vaultwarden
-      connectionLimit: 20
-      login: true
-      superuser: false
-      passwordSecret: { name: vaultwarden }
-    - name: gitlab
-      ensure: present
-      comment: Role for Gitlab
-      connectionLimit: 80
-      login: true
-      superuser: false
-      passwordSecret: { name: gitlab }
-    - name: nextcloud
-      ensure: present
-      comment: Role for nextcloud
-      connectionLimit: 30
-      login: true
-      superuser: false
-      passwordSecret: { name: nextcloud }
-    - name: juicefs
-      ensure: present
-      comment: Role for juicefs
-      connectionLimit: 30
-      login: true
-      superuser: false
-      passwordSecret: { name: juicefs }
-    - name: argus
-      ensure: present
-      comment: Role for Argus
-      connectionLimit: 20
-      login: true
-      superuser: false
-      passwordSecret: { name: argus }
-    - name: crowsec
-      ensure: present
-      comment: Role for crowsec
-      connectionLimit: 10
-      login: true
-      superuser: false
-      passwordSecret: { name: crowsec }
-    - name: pgadmin_monitor
-      ensure: present
-      login: true
-      superuser: false
-      passwordSecret: { name: pgadmin-monitor-credentials }
-      inRoles: [ pg_monitor ]
+    # roles:
+    # - name: sonarqube
+    #   ensure: present
+    #   comment: Role for Sonarqube
+    #   connectionLimit: 40
+    #   login: true
+    #   superuser: false
+    #   passwordSecret: { name: sonarqube }
+    # - name: vaultwarden
+    #   ensure: present
+    #   comment: Role for Vaultwarden
+    #   connectionLimit: 20
+    #   login: true
+    #   superuser: false
+    #   passwordSecret: { name: vaultwarden }
+    # - name: gitlab
+    #   ensure: present
+    #   comment: Role for Gitlab
+    #   connectionLimit: 80
+    #   login: true
+    #   superuser: false
+    #   passwordSecret: { name: gitlab }
+    # - name: nextcloud
+    #   ensure: present
+    #   comment: Role for nextcloud
+    #   connectionLimit: 30
+    #   login: true
+    #   superuser: false
+    #   passwordSecret: { name: nextcloud }
+    # - name: juicefs
+    #   ensure: present
+    #   comment: Role for juicefs
+    #   connectionLimit: 30
+    #   login: true
+    #   superuser: false
+    #   passwordSecret: { name: juicefs }
+    # - name: argus
+    #   ensure: present
+    #   comment: Role for Argus
+    #   connectionLimit: 20
+    #   login: true
+    #   superuser: false
+    #   passwordSecret: { name: argus }
+    # - name: crowsec
+    #   ensure: present
+    #   comment: Role for crowsec
+    #   connectionLimit: 10
+    #   login: true
+    #   superuser: false
+    #   passwordSecret: { name: crowsec }
+    # - name: pgadmin_monitor
+    #   ensure: present
+    #   login: true
+    #   superuser: false
+    #   passwordSecret: { name: pgadmin-monitor-credentials }
+    #   inRoles: [ pg_monitor ]
   instances: 2
   primaryUpdateMethod: switchover
   storage:

From d7999a00a3494840a7c4971c0ce85ab692328227 Mon Sep 17 00:00:00 2001
From: Ngo Vu Minh Dat <ngovuminhdat@gmail.com>
Date: Sat, 20 Jun 2026 17:19:55 +0700
Subject: [PATCH 2/4] update tag

---
 disaster-recovery/dr-drill/config.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/disaster-recovery/dr-drill/config.sh b/disaster-recovery/dr-drill/config.sh
index 31addf7..4c2fc2a 100755
--- a/disaster-recovery/dr-drill/config.sh
+++ b/disaster-recovery/dr-drill/config.sh
@@ -27,7 +27,7 @@
 # Kubernetes / app config
 # ---------------------------------------------------------------------------
 : "${NAMESPACE:=prod-postgresql}"
-: "${RECOVERY_TAG:=postgresql-first-recovery-test}"
+: "${RECOVERY_TAG:=postgreql-recovery-sync}"
 : "${ARGOCD_CHART_VERSION:=9.5.22}"
 
 # Repo-relative path to the app-of-app helm chart.

From 2db5900ed33e1fe64b53387712c1fd040af2ca90 Mon Sep 17 00:00:00 2001
From: Ngo Vu Minh Dat <ngovuminhdat@gmail.com>
Date: Sat, 20 Jun 2026 18:00:57 +0700
Subject: [PATCH 3/4] wip

---
 .github/workflows/postgresql-backup-test.yml  | 239 +-----------------
 .../dr-drill/scripts/02-install-app.sh        |   2 +-
 .../argocd/app-of-app/templates/juicefs.yaml  |  10 +-
 kubernetes/argocd/app-of-app/values.yaml      |   1 +
 4 files changed, 22 insertions(+), 230 deletions(-)

diff --git a/.github/workflows/postgresql-backup-test.yml b/.github/workflows/postgresql-backup-test.yml
index cbd4b3d..0846371 100644
--- a/.github/workflows/postgresql-backup-test.yml
+++ b/.github/workflows/postgresql-backup-test.yml
@@ -18,21 +18,13 @@ on:
         default: "2"
         required: false
 
-env:
-  CLUSTER_NAME: pg-backup-test-${{ github.run_id }}
-  REGION: ${{ inputs.region || 'nyc3' }}
-  NODE_SIZE: ${{ inputs.node_size || 's-4vcpu-8gb' }}
-  NODE_COUNT: ${{ inputs.node_count || '2' }}
-  NAMESPACE: prod-postgresql
-  RECOVERY_TAG: postgresql-first-recovery-test
-  ARGOCD_CHART_VERSION: "9.4.15"
-
 jobs:
   backup-test:
     runs-on: ubuntu-latest
     environment: test-backup
     timeout-minutes: 45
     steps:
+      # --- dependency setup only; no business logic lives here ---
       - name: Checkout repository
         uses: actions/checkout@v4
 
@@ -43,228 +35,23 @@ jobs:
 
       - name: Install ArgoCD CLI
         run: |
-          curl -sSL -o argocd https://github.com/argoproj/argo-cd/releases/download/v3.3.4/argocd-linux-amd64
+          curl -sSL -o argocd \
+            https://github.com/argoproj/argo-cd/releases/download/v3.3.4/argocd-linux-amd64
           chmod +x argocd
           sudo mv argocd /usr/local/bin/
 
-      - name: Create DOKS cluster
-        run: |
-          doctl kubernetes cluster create "$CLUSTER_NAME" \
-            --region "$REGION" \
-            --size "$NODE_SIZE" \
-            --count "$NODE_COUNT" \
-            --wait
-      - name: Save kubeconfig
-        run: |
-          doctl kubernetes cluster kubeconfig save "$CLUSTER_NAME"
-          kubectl config set-context --current --namespace=argocd
-      - name: Install ArgoCD via Helm
-        run: |
-          helm repo add argo https://argoproj.github.io/argo-helm
-          helm repo update
-          helm install argocd argo/argo-cd \
-            --version "$ARGOCD_CHART_VERSION" \
-            --namespace argocd \
-            --create-namespace \
-            --wait \
-            --timeout 5m \
-            --set 'configs.params.server\.insecure=true'
-
-      - name: Wait for ArgoCD to be ready
-        run: |
-          kubectl rollout status deployment/argocd-server -n argocd --timeout=120s
-          kubectl rollout status deployment/argocd-repo-server -n argocd --timeout=120s
-          kubectl rollout status deployment/argocd-applicationset-controller -n argocd --timeout=120s
-
-      - name: Install app-of-app chart
-        run: |
-          helm install app-of-app ./kubernetes/argocd/app-of-app \
-            --namespace argocd \
-            --set metallb.enabled=false \
-            --set traefik.enabled=false \
-            --set openebs.enabled=false \
-            --set postgresql.enabled=true \
-            --set certManager.enabled=true \
-            --set kubePrometheusStack.enabled=true \
-            --set customManifest.enabled=false \
-            --set loki.enabled=true \
-            --set alloy.enabled=true \
-            --set pgadmin4.enabled=true \
-            --set sonarqube.enabled=false \
-            --set harbor.enabled=false \
-            --set velero.enabled=false \
-            --set mongoOperator.enabled=false \
-            --set kafkaOperator.enabled=false \
-            --set juicefs.enabled=false \
-            --set vaultwarden.enabled=true
+      # helm, kubectl, and jq are preinstalled on ubuntu-latest runners.
 
-      - name: Sync cert-manager
-        run: |
-          argocd app sync cert-manager --core \
-          --retry-limit 5 \
-          --retry-backoff-duration 10s \
-          --retry-backoff-max-duration 3m \
-          --retry-backoff-factor 2
-      - name: Create namespace and secrets
+      # --- delegate all logic to the bash scripts ---
+      - name: Run DR drill
         env:
+          DIGITALOCEAN_TOKEN: ${{ secrets.DIGITALOCEAN_TOKEN }}
           R2_ACCESS_KEY: ${{ secrets.R2_ACCESS_KEY }}
           R2_SECRET_KEY: ${{ secrets.R2_SECRET_KEY }}
+          RUN_ID: ${{ github.run_id }}
+          REGION: ${{ inputs.region || 'nyc3' }}
+          NODE_SIZE: ${{ inputs.node_size || 's-4vcpu-8gb' }}
+          NODE_COUNT: ${{ inputs.node_count || '2' }}
         run: |
-          kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -
-          kubectl create secret generic cloudflare-r2 \
-            -n "$NAMESPACE" \
-            --from-literal=ACCESS_KEY="$R2_ACCESS_KEY" \
-            --from-literal=SECRET_KEY="$R2_SECRET_KEY" \
-            --dry-run=client -o yaml | kubectl apply -f -
-          kubectl create secret generic postgres-admin \
-            -n "$NAMESPACE" \
-            --from-literal=username=postgres \
-            --from-literal=password=backup-test-dummy \
-            --dry-run=client -o yaml | kubectl apply -f -
-
-      - name: Override PostgreSQL app revision to recovery tag
-        run: |
-          argocd app set prod-postgresql \
-            --core \
-            --source-position 2 \
-            --revision "$RECOVERY_TAG"
-      - name: Sync PostgreSQL (App)
-        continue-on-error: true
-        run: |
-          argocd app sync prod-postgresql --core \
-            --retry-limit 3 \
-            --retry-backoff-duration 5s \
-            --retry-backoff-max-duration 1m \
-            --retry-backoff-factor 2
-      - name: Sync PostgreSQL (Cluster Resource)
-        continue-on-error: true
-        run: |
-          argocd app sync prod-postgresql --core \
-            --resource postgresql.cnpg.io:Cluster:postgresql \
-            --retry-limit 5 \
-            --retry-backoff-duration 5s \
-            --retry-backoff-max-duration 1m \
-            --retry-backoff-factor 2
-    
-      - name: Wait for cluster healthy state
-        run: |
-          echo "Waiting for CloudNativePG cluster to reach healthy state..."
-          for i in $(seq 1 90); do
-            phase=$(kubectl get cluster -n "$NAMESPACE" postgresql \
-              -o jsonpath='{.status.phase}' 2>/dev/null || echo "unknown")
-            echo "  Attempt $i/90: phase=$phase"
-            if [ "$phase" = "Cluster in healthy state" ]; then
-              echo "Cluster is healthy."
-              exit 0
-            fi
-            sleep 10
-          done
-          echo "ERROR: Cluster did not reach healthy state within 15 minutes."
-          kubectl get cluster -n "$NAMESPACE" postgresql -o yaml || true
-          kubectl get pods -n "$NAMESPACE" -l cnpg.io/cluster=postgresql || true
-          exit 1
-
-      - name: Validate restored data
-        run: |
-          POD=$(kubectl get pods -n "$NAMESPACE" \
-            -l cnpg.io/cluster=postgresql,role=primary \
-            -o jsonpath='{.items[0].metadata.name}')
-          echo "Primary pod: $POD"
-
-          echo "--- Connectivity check ---"
-          kubectl exec -n "$NAMESPACE" "$POD" -- \
-            psql -U postgres -c "SELECT 1 AS connectivity_check;"
-
-          echo "--- Database listing ---"
-          kubectl exec -n "$NAMESPACE" "$POD" -- \
-            psql -U postgres -c "\l"
-
-          echo "--- Verify expected databases exist ---"
-          EXPECTED_DBS="sonarqube"
-          for db in $EXPECTED_DBS; do
-            count=$(kubectl exec -n "$NAMESPACE" "$POD" -- \
-              psql -U postgres -tAc "SELECT count(*) FROM pg_database WHERE datname = '$db';")
-            if [ "$count" -eq 0 ]; then
-              echo "FAIL: Database '$db' not found."
-              exit 1
-            fi
-            echo "OK: Database '$db' exists."
-          done
-
-          echo "--- Count user tables across databases ---"
-          for db in $EXPECTED_DBS; do
-            table_count=$(kubectl exec -n "$NAMESPACE" "$POD" -- \
-              psql -U postgres -d "$db" -tAc \
-              "SELECT count(*) FROM pg_catalog.pg_tables WHERE schemaname NOT IN ('pg_catalog','information_schema');" \
-              2>/dev/null || echo "0")
-            echo "Database '$db': $table_count user table(s)"
-          done
-          echo "All validation checks passed."
-      - name: Destroy DOKS cluster and associated infra
-        if: always()
-        run: |
-          CLUSTER_ID=$(doctl kubernetes cluster get "$CLUSTER_NAME" \
-            --format ID --no-header 2>/dev/null || echo "")
-
-          if [ -z "$CLUSTER_ID" ]; then
-            echo "Cluster '$CLUSTER_NAME' not found, nothing to clean up."
-            exit 0
-          fi
-
-          echo "Cluster ID: $CLUSTER_ID"
-
-          # Collect LB IPs now while kubectl still works — needed after cluster is gone
-          LB_IPS=$(kubectl get svc -A \
-            -o jsonpath='{.items[?(@.spec.type=="LoadBalancer")].status.loadBalancer.ingress[0].ip}' \
-            2>/dev/null || echo "")
-
-          # --- Scale down all node pools then delete the cluster ---
-          echo "=== Scaling down node pools ==="
-          POOL_IDS=$(doctl kubernetes cluster node-pool list "$CLUSTER_NAME" \
-            --format ID --no-header 2>/dev/null || echo "")
-          for pool_id in $POOL_IDS; do
-            echo "Scaling node pool $pool_id to 0..."
-            doctl kubernetes cluster node-pool update "$CLUSTER_NAME" "$pool_id" \
-              --count 0 || true
-          done
-          [ -n "$POOL_IDS" ] && sleep 30 || true
-
-          echo "=== Deleting DOKS cluster ==="
-          doctl kubernetes cluster delete "$CLUSTER_NAME" --force --dangerous || true
-
-          # --- Delete DigitalOcean Block Storage Volumes ---
-          # The DOKS CSI driver tags every provisioned volume with k8s:<cluster-id>
-          echo "=== Cleaning up DigitalOcean Block Storage Volumes ==="
-          VOL_IDS=$(doctl compute volume list -o json \
-            | jq -r --arg tag "k8s:$CLUSTER_ID" \
-              '.[] | select(.tags? and (.tags[] == $tag)) | .id' 2>/dev/null || echo "")
-          if [ -n "$VOL_IDS" ]; then
-            for vol_id in $VOL_IDS; do
-              echo "Deleting volume: $vol_id"
-              doctl compute volume delete "$vol_id" --force || true
-            done
-          else
-            echo "No block storage volumes found for cluster $CLUSTER_ID."
-          fi
-
-          # --- Delete DigitalOcean Load Balancers ---
-          # Primary: match by IPs collected before cluster was deleted
-          echo "=== Cleaning up DigitalOcean Load Balancers ==="
-          for lb_ip in $LB_IPS; do
-            LB_ID=$(doctl compute load-balancer list -o json \
-              | jq -r --arg ip "$lb_ip" '.[] | select(.ip == $ip) | .id')
-            if [ -n "$LB_ID" ]; then
-              echo "Deleting load balancer $LB_ID (IP: $lb_ip)"
-              doctl compute load-balancer delete "$LB_ID" --force || true
-            fi
-          done
-          # Fallback: catch any LBs still tagged with the cluster ID
-          TAGGED_LB_IDS=$(doctl compute load-balancer list -o json \
-            | jq -r --arg tag "k8s:$CLUSTER_ID" \
-              '.[] | select(.tags? and (.tags[] == $tag)) | .id' 2>/dev/null || echo "")
-          for lb_id in $TAGGED_LB_IDS; do
-            echo "Deleting tagged load balancer: $lb_id"
-            doctl compute load-balancer delete "$lb_id" --force 2>/dev/null || true
-          done
-
-          echo "Destroy complete."
+          chmod +x disaster-recovery/dr-drill/run.sh disaster-recovery/dr-drill/scripts/*.sh
+          ./disaster-recovery/dr-drill/run.sh
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/scripts/02-install-app.sh b/disaster-recovery/dr-drill/scripts/02-install-app.sh
index 89d51fc..f000472 100755
--- a/disaster-recovery/dr-drill/scripts/02-install-app.sh
+++ b/disaster-recovery/dr-drill/scripts/02-install-app.sh
@@ -40,7 +40,7 @@ install_app_of_app() {
     --set openebs.enabled=false \
     --set postgresql.enabled=true \
     --set certManager.enabled=true \
-    --set kubePrometheusStack.enabled=true \
+    --set kubePrometheusStack.enabled=false \
     --set customManifest.enabled=false \
     --set loki.enabled=false \
     --set alloy.enabled=false \
diff --git a/kubernetes/argocd/app-of-app/templates/juicefs.yaml b/kubernetes/argocd/app-of-app/templates/juicefs.yaml
index 76d3f91..1932753 100644
--- a/kubernetes/argocd/app-of-app/templates/juicefs.yaml
+++ b/kubernetes/argocd/app-of-app/templates/juicefs.yaml
@@ -14,8 +14,6 @@ spec:
   syncPolicy:
     automated:
       enabled: false
-      selfHeal: false
-      prune: true
     retry:
       limit: 1
       backoff:
@@ -52,7 +50,7 @@ spec:
                 memory: 128Mi
               limits:
                 cpu: 1
-                memory: 256Mi            
+                memory: 256Mi
           mountMode: mountpod
           metrics:
             enabled: true
@@ -79,6 +77,12 @@ spec:
               reclaimPolicy: Retain
               allowVolumeExpansion: true
               mountOptions:
+                {{- if .Values.juicefs.readOnly }}
+                # DR read-only: blocks writes to the prod R2 prefix AND
+                # disables background GC/trash cleanup, so a DR drill can
+                # never delete or mutate production objects.
+                - ro
+                {{- end }}
                 - cache-size=10240
                 - free-space-ratio=0.1
                 - prefetch=3
diff --git a/kubernetes/argocd/app-of-app/values.yaml b/kubernetes/argocd/app-of-app/values.yaml
index 06847bf..38f314a 100644
--- a/kubernetes/argocd/app-of-app/values.yaml
+++ b/kubernetes/argocd/app-of-app/values.yaml
@@ -28,6 +28,7 @@ sonarqube:
   enabled: true
 juicefs:
   enabled: true
+  readOnly: false
 vaultwarden:
   enabled: true
 certManager:

From 7edef0641d61c2f3b8dbfef03979a7dc7b790676 Mon Sep 17 00:00:00 2001
From: Ngo Vu Minh Dat <ngovuminhdat@gmail.com>
Date: Sun, 21 Jun 2026 11:59:21 +0700
Subject: [PATCH 4/4] Commit to save work; still failing due to read-only mode
 in JuiceFS. Note: separate dev and production in the JuiceFS bucket to avoid
 data conflicts

---
 disaster-recovery/dr-drill/config.sh          |  3 +-
 disaster-recovery/dr-drill/lib/log.sh         |  0
 disaster-recovery/dr-drill/lib/preflight.sh   |  0
 disaster-recovery/dr-drill/run.sh             |  7 ++--
 .../dr-drill/scripts/02-install-app.sh        |  7 ++++
 .../dr-drill/scripts/04-sync-juicefs.sh       | 37 +++++++++++++++++++
 ...04-validate.sh => 05-validate-database.sh} |  0
 ...ltwarden.sh => 06-validate-vaultwarden.sh} |  0
 .../argocd/app-of-app/templates/juicefs.yaml  | 11 +++++-
 kubernetes/argocd/app-of-app/values.yaml      |  1 +
 10 files changed, 60 insertions(+), 6 deletions(-)
 mode change 100644 => 100755 disaster-recovery/dr-drill/lib/log.sh
 mode change 100644 => 100755 disaster-recovery/dr-drill/lib/preflight.sh
 create mode 100755 disaster-recovery/dr-drill/scripts/04-sync-juicefs.sh
 rename disaster-recovery/dr-drill/scripts/{04-validate.sh => 05-validate-database.sh} (100%)
 mode change 100644 => 100755
 rename disaster-recovery/dr-drill/scripts/{05-validate-vaultwarden.sh => 06-validate-vaultwarden.sh} (100%)
 mode change 100644 => 100755

diff --git a/disaster-recovery/dr-drill/config.sh b/disaster-recovery/dr-drill/config.sh
index 4c2fc2a..9d8095b 100755
--- a/disaster-recovery/dr-drill/config.sh
+++ b/disaster-recovery/dr-drill/config.sh
@@ -47,6 +47,7 @@
 # ---------------------------------------------------------------------------
 : "${JUICEFS_ENABLED:=true}"
 : "${JUICEFS_READONLY:=true}"          # injects the `ro` mount option
+: "${JUICEFS_MONITORING:=false}"
 : "${JUICEFS_NAMESPACE:=juicefs}"
 : "${JUICEFS_SECRET_NAME:=cloudflare-r2}"
 : "${JUICEFS_VOLUME_NAME:=cloudflare-r2-prod}"
@@ -60,7 +61,7 @@ fi
 # The rw service for a CNPG cluster named "postgresql" is "postgresql-rw".
 : "${JUICEFS_META_USER:=juicefs}"
 : "${JUICEFS_META_HOST:=postgresql-rw.${NAMESPACE}.svc}"
-: "${JUICEFS_META_DB:=juicefs_prod}"
+: "${JUICEFS_META_DB:=juicefs}"
 # JUICEFS_META_PASSWORD must be exported (the juicefs DB role password from the
 # restored cluster). If you'd rather supply the whole URL, set JUICEFS_METAURL.
 : "${JUICEFS_METAURL:=postgres://${JUICEFS_META_USER}:${JUICEFS_META_PASSWORD:-}@${JUICEFS_META_HOST}:5432/${JUICEFS_META_DB}?sslmode=disable}"
diff --git a/disaster-recovery/dr-drill/lib/log.sh b/disaster-recovery/dr-drill/lib/log.sh
old mode 100644
new mode 100755
diff --git a/disaster-recovery/dr-drill/lib/preflight.sh b/disaster-recovery/dr-drill/lib/preflight.sh
old mode 100644
new mode 100755
diff --git a/disaster-recovery/dr-drill/run.sh b/disaster-recovery/dr-drill/run.sh
index 882f57e..ed33dff 100644
--- a/disaster-recovery/dr-drill/run.sh
+++ b/disaster-recovery/dr-drill/run.sh
@@ -25,8 +25,9 @@ source "$REPO_ROOT/scripts/00-create-cluster.sh"
 source "$REPO_ROOT/scripts/01-install-argocd.sh"
 source "$REPO_ROOT/scripts/02-install-apps.sh"
 source "$REPO_ROOT/scripts/03-recover-postgres.sh"
-source "$REPO_ROOT/scripts/04-validate.sh"
-source "$REPO_ROOT/scripts/05-validate-vaultwarden.sh"
+source "$REPO_ROOT/scripts/04-sync-juicefs.sh"
+source "$REPO_ROOT/scripts/05-validate-database.sh"  # 04-validate.sh is renamed to this in the same commit
+source "$REPO_ROOT/scripts/06-validate-vaultwarden.sh"
 source "$REPO_ROOT/scripts/99-destroy.sh"
 
 # --- argument parsing ------------------------------------------------------
@@ -80,7 +81,7 @@ main() {
   recover_postgres
   validate_data
 
-  # Data layer: JuiceFS (read-only) then Vaultwarden on top of it.
+  # Data layer: JuiceFS sync (needs the metaurl DB up) → Vaultwarden on top.
   sync_juicefs
   sync_vaultwarden
   validate_vaultwarden
diff --git a/disaster-recovery/dr-drill/scripts/02-install-app.sh b/disaster-recovery/dr-drill/scripts/02-install-app.sh
index f000472..e01f540 100755
--- a/disaster-recovery/dr-drill/scripts/02-install-app.sh
+++ b/disaster-recovery/dr-drill/scripts/02-install-app.sh
@@ -36,6 +36,12 @@ install_app_of_app() {
   helm upgrade --install app-of-app "$REPO_ROOT/$APP_OF_APP_CHART" \
     --namespace argocd \
     --set metallb.enabled=false \
+    --set argus.enabled=false \
+    --set chaosMesh.enabled=false \
+    --set nextcloud.enabled=false \
+    --set nfsCsiDriver.enabled=false \
+    --set jellyfin.enabled=false \
+    --set qbittorrent.enabled=false \
     --set traefik.enabled=true \
     --set openebs.enabled=false \
     --set postgresql.enabled=true \
@@ -52,6 +58,7 @@ install_app_of_app() {
     --set kafkaOperator.enabled=false \
     --set juicefs.enabled="$JUICEFS_ENABLED" \
     --set juicefs.readOnly="$JUICEFS_READONLY" \
+    --set juicefs.monitoring="$JUICEFS_MONITORING" \
     --set vaultwarden.enabled=true
   log_ok "app-of-app installed"
 }
diff --git a/disaster-recovery/dr-drill/scripts/04-sync-juicefs.sh b/disaster-recovery/dr-drill/scripts/04-sync-juicefs.sh
new file mode 100755
index 0000000..7257ffc
--- /dev/null
+++ b/disaster-recovery/dr-drill/scripts/04-sync-juicefs.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+# scripts/04-sync-juicefs.sh — sync the JuiceFS Application and wait for the CSI
+# driver to be ready.
+#
+# ORDERING: must run AFTER PostgreSQL recovery (03). The JuiceFS mount pod
+# connects to the metaurl (the restored CNPG $JUICEFS_META_DB DB) when a volume is
+# mounted, so the metadata engine has to be up first. The credentials secret
+# itself is created earlier in 02 (before the StorageClass references it).
+set -euo pipefail
+
+sync_juicefs() {
+  [ "$JUICEFS_ENABLED" = "true" ] || { log_info "JuiceFS disabled — skipping sync"; return 0; }
+  # Sync the juicefs Application; retry up to 5 times, backoff 10s doubling up to 3m.
+  log_step "Syncing juicefs Application"
+  argocd app sync juicefs --core \
+    --retry-limit 5 \
+    --retry-backoff-duration 10s \
+    --retry-backoff-max-duration 3m \
+    --retry-backoff-factor 2
+
+  log_step "Waiting for JuiceFS CSI components"
+  # Best-effort: failures only warn. Controller is a StatefulSet, node service a DaemonSet.
+  kubectl rollout status statefulset \
+    -n "$JUICEFS_NAMESPACE" -l "$JUICEFS_CSI_SELECTOR" --timeout=180s 2>/dev/null \
+    || log_warn "could not confirm CSI controller rollout (continuing)"
+  kubectl rollout status daemonset \
+    -n "$JUICEFS_NAMESPACE" -l "$JUICEFS_CSI_SELECTOR" --timeout=180s 2>/dev/null \
+    || log_warn "could not confirm CSI node rollout (continuing)"
+  log_ok "JuiceFS synced"
+}
+
+if [ "${BASH_SOURCE[0]}" = "${0}" ]; then
+  HERE="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+  source "$HERE/config.sh"; source "$HERE/lib/log.sh"; source "$HERE/lib/preflight.sh"
+  require_cmd kubectl argocd
+  sync_juicefs
+fi
\ No newline at end of file
diff --git a/disaster-recovery/dr-drill/scripts/04-validate.sh b/disaster-recovery/dr-drill/scripts/05-validate-database.sh
old mode 100644
new mode 100755
similarity index 100%
rename from disaster-recovery/dr-drill/scripts/04-validate.sh
rename to disaster-recovery/dr-drill/scripts/05-validate-database.sh
diff --git a/disaster-recovery/dr-drill/scripts/05-validate-vaultwarden.sh b/disaster-recovery/dr-drill/scripts/06-validate-vaultwarden.sh
old mode 100644
new mode 100755
similarity index 100%
rename from disaster-recovery/dr-drill/scripts/05-validate-vaultwarden.sh
rename to disaster-recovery/dr-drill/scripts/06-validate-vaultwarden.sh
diff --git a/kubernetes/argocd/app-of-app/templates/juicefs.yaml b/kubernetes/argocd/app-of-app/templates/juicefs.yaml
index 1932753..cbcd0a2 100644
--- a/kubernetes/argocd/app-of-app/templates/juicefs.yaml
+++ b/kubernetes/argocd/app-of-app/templates/juicefs.yaml
@@ -53,7 +53,10 @@ spec:
                 memory: 256Mi
           mountMode: mountpod
           metrics:
-            enabled: true
+            # metrics.enabled=true makes the chart emit a ServiceMonitor, which
+            # needs the monitoring.coreos.com CRDs. Tie it to juicefs.monitoring
+            # so DR drills (no kube-prometheus-stack) don't fail on the missing CRD.
+            enabled: {{ .Values.juicefs.monitoring }}
             port: 9567
             service:
               servicePort: 9567
@@ -111,8 +114,12 @@ spec:
                   paths:
                     - path: /
                       pathType: ImplementationSpecific
-    # Raw manifests (ServiceMonitor, etc.) tracked in Git
+    {{- if .Values.juicefs.monitoring }}
+    # Raw manifests (ServiceMonitor, etc.) tracked in Git. They require the
+    # monitoring.coreos.com CRDs (installed by kube-prometheus-stack). Disabled for DR
+    # drills where that stack isn't deployed, otherwise ArgoCD fails on the missing CRD.
     - repoURL: https://github.com/ngodat0103/dev-oops.git
       targetRevision: master
       path: kubernetes/argocd/argocd-app/daemon/juicefs
+    {{- end }}
 {{- end -}}
\ No newline at end of file
diff --git a/kubernetes/argocd/app-of-app/values.yaml b/kubernetes/argocd/app-of-app/values.yaml
index 38f314a..8458093 100644
--- a/kubernetes/argocd/app-of-app/values.yaml
+++ b/kubernetes/argocd/app-of-app/values.yaml
@@ -29,6 +29,7 @@ sonarqube:
 juicefs:
   enabled: true
   readOnly: false
+  monitoring: true
 vaultwarden:
   enabled: true
 certManager: