Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
239 changes: 13 additions & 226 deletions .github/workflows/postgresql-backup-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,13 @@ on:
default: "2"
required: false

env:
CLUSTER_NAME: pg-backup-test-${{ github.run_id }}
REGION: ${{ inputs.region || 'nyc3' }}
NODE_SIZE: ${{ inputs.node_size || 's-4vcpu-8gb' }}
NODE_COUNT: ${{ inputs.node_count || '2' }}
NAMESPACE: prod-postgresql
RECOVERY_TAG: postgresql-first-recovery-test
ARGOCD_CHART_VERSION: "9.4.15"

jobs:
backup-test:
runs-on: ubuntu-latest
environment: test-backup
timeout-minutes: 45
steps:
# --- dependency setup only; no business logic lives here ---
- name: Checkout repository
uses: actions/checkout@v4

Expand All @@ -43,228 +35,23 @@ jobs:

- name: Install ArgoCD CLI
run: |
curl -sSL -o argocd https://github.com/argoproj/argo-cd/releases/download/v3.3.4/argocd-linux-amd64
curl -sSL -o argocd \
https://github.com/argoproj/argo-cd/releases/download/v3.3.4/argocd-linux-amd64
chmod +x argocd
sudo mv argocd /usr/local/bin/

- name: Create DOKS cluster
run: |
doctl kubernetes cluster create "$CLUSTER_NAME" \
--region "$REGION" \
--size "$NODE_SIZE" \
--count "$NODE_COUNT" \
--wait
- name: Save kubeconfig
run: |
doctl kubernetes cluster kubeconfig save "$CLUSTER_NAME"
kubectl config set-context --current --namespace=argocd
- name: Install ArgoCD via Helm
run: |
helm repo add argo https://argoproj.github.io/argo-helm
helm repo update
helm install argocd argo/argo-cd \
--version "$ARGOCD_CHART_VERSION" \
--namespace argocd \
--create-namespace \
--wait \
--timeout 5m \
--set 'configs.params.server\.insecure=true'

- name: Wait for ArgoCD to be ready
run: |
kubectl rollout status deployment/argocd-server -n argocd --timeout=120s
kubectl rollout status deployment/argocd-repo-server -n argocd --timeout=120s
kubectl rollout status deployment/argocd-applicationset-controller -n argocd --timeout=120s

- name: Install app-of-app chart
run: |
helm install app-of-app ./kubernetes/argocd/app-of-app \
--namespace argocd \
--set metallb.enabled=false \
--set traefik.enabled=false \
--set openebs.enabled=false \
--set postgresql.enabled=true \
--set certManager.enabled=true \
--set kubePrometheusStack.enabled=true \
--set customManifest.enabled=false \
--set loki.enabled=true \
--set alloy.enabled=true \
--set pgadmin4.enabled=true \
--set sonarqube.enabled=false \
--set harbor.enabled=false \
--set velero.enabled=false \
--set mongoOperator.enabled=false \
--set kafkaOperator.enabled=false \
--set juicefs.enabled=false \
--set vaultwarden.enabled=true
# helm, kubectl, and jq are preinstalled on ubuntu-latest runners.

- name: Sync cert-manager
run: |
argocd app sync cert-manager --core \
--retry-limit 5 \
--retry-backoff-duration 10s \
--retry-backoff-max-duration 3m \
--retry-backoff-factor 2
- name: Create namespace and secrets
# --- delegate all logic to the bash scripts ---
- name: Run DR drill
env:
DIGITALOCEAN_TOKEN: ${{ secrets.DIGITALOCEAN_TOKEN }}
R2_ACCESS_KEY: ${{ secrets.R2_ACCESS_KEY }}
R2_SECRET_KEY: ${{ secrets.R2_SECRET_KEY }}
RUN_ID: ${{ github.run_id }}
REGION: ${{ inputs.region || 'nyc3' }}
NODE_SIZE: ${{ inputs.node_size || 's-4vcpu-8gb' }}
NODE_COUNT: ${{ inputs.node_count || '2' }}
run: |
kubectl create namespace "$NAMESPACE" --dry-run=client -o yaml | kubectl apply -f -
kubectl create secret generic cloudflare-r2 \
-n "$NAMESPACE" \
--from-literal=ACCESS_KEY="$R2_ACCESS_KEY" \
--from-literal=SECRET_KEY="$R2_SECRET_KEY" \
--dry-run=client -o yaml | kubectl apply -f -
kubectl create secret generic postgres-admin \
-n "$NAMESPACE" \
--from-literal=username=postgres \
--from-literal=password=backup-test-dummy \
--dry-run=client -o yaml | kubectl apply -f -

- name: Override PostgreSQL app revision to recovery tag
run: |
argocd app set prod-postgresql \
--core \
--source-position 2 \
--revision "$RECOVERY_TAG"
- name: Sync PostgreSQL (App)
continue-on-error: true
run: |
argocd app sync prod-postgresql --core \
--retry-limit 3 \
--retry-backoff-duration 5s \
--retry-backoff-max-duration 1m \
--retry-backoff-factor 2
- name: Sync PostgreSQL (Cluster Resource)
continue-on-error: true
run: |
argocd app sync prod-postgresql --core \
--resource postgresql.cnpg.io:Cluster:postgresql \
--retry-limit 5 \
--retry-backoff-duration 5s \
--retry-backoff-max-duration 1m \
--retry-backoff-factor 2

- name: Wait for cluster healthy state
run: |
echo "Waiting for CloudNativePG cluster to reach healthy state..."
for i in $(seq 1 90); do
phase=$(kubectl get cluster -n "$NAMESPACE" postgresql \
-o jsonpath='{.status.phase}' 2>/dev/null || echo "unknown")
echo " Attempt $i/90: phase=$phase"
if [ "$phase" = "Cluster in healthy state" ]; then
echo "Cluster is healthy."
exit 0
fi
sleep 10
done
echo "ERROR: Cluster did not reach healthy state within 15 minutes."
kubectl get cluster -n "$NAMESPACE" postgresql -o yaml || true
kubectl get pods -n "$NAMESPACE" -l cnpg.io/cluster=postgresql || true
exit 1

- name: Validate restored data
run: |
POD=$(kubectl get pods -n "$NAMESPACE" \
-l cnpg.io/cluster=postgresql,role=primary \
-o jsonpath='{.items[0].metadata.name}')
echo "Primary pod: $POD"

echo "--- Connectivity check ---"
kubectl exec -n "$NAMESPACE" "$POD" -- \
psql -U postgres -c "SELECT 1 AS connectivity_check;"

echo "--- Database listing ---"
kubectl exec -n "$NAMESPACE" "$POD" -- \
psql -U postgres -c "\l"

echo "--- Verify expected databases exist ---"
EXPECTED_DBS="sonarqube"
for db in $EXPECTED_DBS; do
count=$(kubectl exec -n "$NAMESPACE" "$POD" -- \
psql -U postgres -tAc "SELECT count(*) FROM pg_database WHERE datname = '$db';")
if [ "$count" -eq 0 ]; then
echo "FAIL: Database '$db' not found."
exit 1
fi
echo "OK: Database '$db' exists."
done

echo "--- Count user tables across databases ---"
for db in $EXPECTED_DBS; do
table_count=$(kubectl exec -n "$NAMESPACE" "$POD" -- \
psql -U postgres -d "$db" -tAc \
"SELECT count(*) FROM pg_catalog.pg_tables WHERE schemaname NOT IN ('pg_catalog','information_schema');" \
2>/dev/null || echo "0")
echo "Database '$db': $table_count user table(s)"
done
echo "All validation checks passed."
- name: Destroy DOKS cluster and associated infra
if: always()
run: |
CLUSTER_ID=$(doctl kubernetes cluster get "$CLUSTER_NAME" \
--format ID --no-header 2>/dev/null || echo "")

if [ -z "$CLUSTER_ID" ]; then
echo "Cluster '$CLUSTER_NAME' not found, nothing to clean up."
exit 0
fi

echo "Cluster ID: $CLUSTER_ID"

# Collect LB IPs now while kubectl still works — needed after cluster is gone
LB_IPS=$(kubectl get svc -A \
-o jsonpath='{.items[?(@.spec.type=="LoadBalancer")].status.loadBalancer.ingress[0].ip}' \
2>/dev/null || echo "")

# --- Scale down all node pools then delete the cluster ---
echo "=== Scaling down node pools ==="
POOL_IDS=$(doctl kubernetes cluster node-pool list "$CLUSTER_NAME" \
--format ID --no-header 2>/dev/null || echo "")
for pool_id in $POOL_IDS; do
echo "Scaling node pool $pool_id to 0..."
doctl kubernetes cluster node-pool update "$CLUSTER_NAME" "$pool_id" \
--count 0 || true
done
[ -n "$POOL_IDS" ] && sleep 30 || true

echo "=== Deleting DOKS cluster ==="
doctl kubernetes cluster delete "$CLUSTER_NAME" --force --dangerous || true

# --- Delete DigitalOcean Block Storage Volumes ---
# The DOKS CSI driver tags every provisioned volume with k8s:<cluster-id>
echo "=== Cleaning up DigitalOcean Block Storage Volumes ==="
VOL_IDS=$(doctl compute volume list -o json \
| jq -r --arg tag "k8s:$CLUSTER_ID" \
'.[] | select(.tags? and (.tags[] == $tag)) | .id' 2>/dev/null || echo "")
if [ -n "$VOL_IDS" ]; then
for vol_id in $VOL_IDS; do
echo "Deleting volume: $vol_id"
doctl compute volume delete "$vol_id" --force || true
done
else
echo "No block storage volumes found for cluster $CLUSTER_ID."
fi

# --- Delete DigitalOcean Load Balancers ---
# Primary: match by IPs collected before cluster was deleted
echo "=== Cleaning up DigitalOcean Load Balancers ==="
for lb_ip in $LB_IPS; do
LB_ID=$(doctl compute load-balancer list -o json \
| jq -r --arg ip "$lb_ip" '.[] | select(.ip == $ip) | .id')
if [ -n "$LB_ID" ]; then
echo "Deleting load balancer $LB_ID (IP: $lb_ip)"
doctl compute load-balancer delete "$LB_ID" --force || true
fi
done
# Fallback: catch any LBs still tagged with the cluster ID
TAGGED_LB_IDS=$(doctl compute load-balancer list -o json \
| jq -r --arg tag "k8s:$CLUSTER_ID" \
'.[] | select(.tags? and (.tags[] == $tag)) | .id' 2>/dev/null || echo "")
for lb_id in $TAGGED_LB_IDS; do
echo "Deleting tagged load balancer: $lb_id"
doctl compute load-balancer delete "$lb_id" --force 2>/dev/null || true
done

echo "Destroy complete."
# The drill scripts are committed under disaster-recovery/dr-drill/ and this
# step sets no working-directory, so the paths are given relative to the
# checkout root. (Adjust if a workflow-level defaults.run.working-directory
# is configured outside this hunk.)
chmod +x disaster-recovery/dr-drill/run.sh disaster-recovery/dr-drill/scripts/*.sh
./disaster-recovery/dr-drill/run.sh
99 changes: 99 additions & 0 deletions disaster-recovery/dr-drill/config.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#!/usr/bin/env bash
# config.sh — central configuration for the PostgreSQL DR drill.
# Sourced by run.sh and every script under scripts/.
# All values can be overridden by exporting them before invoking run.sh.
#
# Every setting uses `: "${VAR:=default}"`, which assigns the default only
# when VAR is unset or empty, so a value provided by the caller always wins.

# ---------------------------------------------------------------------------
# Run identity
# ---------------------------------------------------------------------------
# RUN_ID makes the cluster name unique per run. In GHA pass github.run_id;
# locally it falls back to a UTC timestamp.
: "${RUN_ID:=$(date -u +%Y%m%d%H%M%S)}"

: "${CLUSTER_NAME:=pg-backup-test-${RUN_ID}}"

# The timestamp fallback is evaluated every time this file is sourced. Export
# the resolved identity so a script started as a child process (which sources
# this file again) inherits the same RUN_ID / CLUSTER_NAME instead of
# generating a new timestamp and pointing at a different cluster.
export RUN_ID CLUSTER_NAME

# ---------------------------------------------------------------------------
# DigitalOcean / DOKS
# ---------------------------------------------------------------------------
: "${REGION:=nyc3}"
: "${NODE_SIZE:=s-4vcpu-8gb}"
: "${NODE_COUNT:=2}"

# DIGITALOCEAN_TOKEN must be exported by the caller (GHA secret or local env).
# doctl auth is assumed to be already initialised by the CI dependency step,
# but we re-init defensively if a token is present (see lib/preflight.sh).

# ---------------------------------------------------------------------------
# Kubernetes / app config
# ---------------------------------------------------------------------------
: "${NAMESPACE:=prod-postgresql}"
# NOTE(review): the default below is spelled "postgreql" (no "s") — confirm it
# matches the real git tag before relying on it.
: "${RECOVERY_TAG:=postgreql-recovery-sync}"
: "${ARGOCD_CHART_VERSION:=9.5.22}"

# Repo-relative path to the app-of-app helm chart.
# Resolved against REPO_ROOT (computed in run.sh).
: "${APP_OF_APP_CHART:=kubernetes/argocd/app-of-app}"

# ---------------------------------------------------------------------------
# Timeouts
# ---------------------------------------------------------------------------
# The *_TIMEOUT values carry their own unit suffix (e.g. 120s, 3m).
# CLUSTER_HEALTHY_INTERVAL is a bare number of seconds; the total wait is
# CLUSTER_HEALTHY_ATTEMPTS x CLUSTER_HEALTHY_INTERVAL.
: "${ARGOCD_ROLLOUT_TIMEOUT:=120s}"
: "${HELM_INSTALL_TIMEOUT:=3m}"
: "${CLUSTER_HEALTHY_ATTEMPTS:=90}"   # x10s sleep => 15 min
: "${CLUSTER_HEALTHY_INTERVAL:=10}"

# ---------------------------------------------------------------------------
# JuiceFS (read-only DR mount)
# ---------------------------------------------------------------------------
: "${JUICEFS_ENABLED:=true}"
: "${JUICEFS_READONLY:=true}"         # injects the `ro` mount option
: "${JUICEFS_MONITORING:=false}"
: "${JUICEFS_NAMESPACE:=juicefs}"
: "${JUICEFS_SECRET_NAME:=cloudflare-r2}"
: "${JUICEFS_VOLUME_NAME:=cloudflare-r2-prod}"
: "${JUICEFS_BUCKET:=https://4c8ad4e9fa8213af3fd284bb97b68b5e.r2.cloudflarestorage.com/juicefs-prod}"
# Assigned with a plain conditional: the JSON braces collide with ${VAR:=...}.
if [ -z "${JUICEFS_ENVS:-}" ]; then
  JUICEFS_ENVS='{"JFS_MOUNT_TIMEOUT": 300}'
fi

# Metadata engine (the restored CNPG cluster holding the juicefs_prod DB).
# The rw service for a CNPG cluster named "postgresql" is "postgresql-rw".
# NOTE(review): the comment above names the database "juicefs_prod" while the
# JUICEFS_META_DB default below is "juicefs" — confirm which name the restored
# cluster actually uses.
: "${JUICEFS_META_USER:=juicefs}"
: "${JUICEFS_META_HOST:=postgresql-rw.${NAMESPACE}.svc}"
: "${JUICEFS_META_DB:=juicefs}"
# JUICEFS_META_PASSWORD must be exported (the juicefs DB role password from the
# restored cluster). If you'd rather supply the whole URL, set JUICEFS_METAURL.
# NOTE(review): the password is interpolated verbatim; if it can contain
# URL-reserved characters it presumably needs percent-encoding — confirm
# against the JuiceFS metadata-engine documentation.
: "${JUICEFS_METAURL:=postgres://${JUICEFS_META_USER}:${JUICEFS_META_PASSWORD:-}@${JUICEFS_META_HOST}:5432/${JUICEFS_META_DB}?sslmode=disable}"

# Label selector for CSI node/controller readiness.
: "${JUICEFS_CSI_SELECTOR:=app.kubernetes.io/name=juicefs-csi-driver}"

# ---------------------------------------------------------------------------
# Vaultwarden (validated against the read-only JuiceFS mount)
# ---------------------------------------------------------------------------
: "${VAULTWARDEN_NAMESPACE:=vaultwarden}"
: "${VAULTWARDEN_DEPLOYMENT:=vaultwarden}"
: "${VAULTWARDEN_SERVICE:=vaultwarden}"
: "${VAULTWARDEN_DATA_PATH:=/data}"   # JuiceFS-backed data dir in the pod
: "${VAULTWARDEN_LOCAL_PORT:=8080}"
: "${VAULTWARDEN_ROLLOUT_TIMEOUT:=300s}"
# Optional: set VW_ADMIN_TOKEN to additionally assert restored user count.

# ---------------------------------------------------------------------------
# Validation expectations
# ---------------------------------------------------------------------------
# Space-separated list of databases that must exist after recovery.
: "${EXPECTED_DBS:=sonarqube}"

# ---------------------------------------------------------------------------
# Secrets (required for the secrets step). Exported by caller.
# ---------------------------------------------------------------------------
#   R2_ACCESS_KEY
#   R2_SECRET_KEY

# ---------------------------------------------------------------------------
# Behaviour flags
# ---------------------------------------------------------------------------
# Set SKIP_DESTROY=1 to leave the cluster running for inspection after a run.
: "${SKIP_DESTROY:=0}"
21 changes: 21 additions & 0 deletions disaster-recovery/dr-drill/lib/log.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/env bash
# lib/log.sh — minimal structured logging helpers.
#
# log_info, log_step and log_ok write to stdout; log_warn and log_error write
# to stderr. die logs an error and exits with status 1.

# Colours only when the target stream is a TTY (keeps CI logs clean).
# stdout and stderr are checked separately because they can be redirected
# independently (e.g. `run.sh 2>errors.log`); otherwise escape codes would be
# written into a redirected stderr whenever stdout is a terminal.
if [ -t 1 ]; then
  _C_RESET=$'\033[0m'; _C_BLUE=$'\033[34m'; _C_GREEN=$'\033[32m'
  _C_YELLOW=$'\033[33m'; _C_RED=$'\033[31m'; _C_DIM=$'\033[2m'
else
  _C_RESET=''; _C_BLUE=''; _C_GREEN=''; _C_YELLOW=''; _C_RED=''; _C_DIM=''
fi

# Colour set for the helpers that print to stderr. _C_YELLOW / _C_RED above
# stay defined in case a sourcing script references them directly.
if [ -t 2 ]; then
  _C_ERR_RESET=$'\033[0m'; _C_ERR_YELLOW=$'\033[33m'; _C_ERR_RED=$'\033[31m'
else
  _C_ERR_RESET=''; _C_ERR_YELLOW=''; _C_ERR_RED=''
fi

# _ts — print the current UTC time as HH:MM:SS.
_ts() { date -u +'%H:%M:%S'; }

# log_info <message...> — timestamped line on stdout.
log_info()  { printf '%s%s%s %s\n' "$_C_DIM" "$(_ts)" "$_C_RESET" "$*"; }
# log_step <message...> — blank line followed by a "==> " heading on stdout.
log_step()  { printf '\n%s==> %s%s\n' "$_C_BLUE" "$*" "$_C_RESET"; }
# log_ok <message...> — "[OK]"-prefixed line on stdout.
log_ok()    { printf '%s[OK]%s %s\n' "$_C_GREEN" "$_C_RESET" "$*"; }
# log_warn <message...> — "[WARN]"-prefixed line on stderr.
log_warn()  { printf '%s[WARN]%s %s\n' "$_C_ERR_YELLOW" "$_C_ERR_RESET" "$*" >&2; }
# log_error <message...> — "[ERROR]"-prefixed line on stderr.
log_error() { printf '%s[ERROR]%s %s\n' "$_C_ERR_RED" "$_C_ERR_RESET" "$*" >&2; }

# die <message> — log and exit non-zero.
die() { log_error "$*"; exit 1; }
Loading