From 2462ee95a7b80d8a0fe0fd5ce2f61c83ae0137f1 Mon Sep 17 00:00:00 2001
From: ali <aliebrahimi2079@gmail.com>
Date: Thu, 11 Jun 2026 19:22:41 +0330
Subject: [PATCH 1/2] fix: make API resilient to DB/pooler connection loss

Production and staging went down during a DigitalOcean managed Postgres /
PgBouncer connectivity blip ("server login has been failing ...
(server_login_retry)"). Several issues combined to turn a transient DB
problem into a hard outage that needed a manual redeploy:

- No process-level error handlers, so a rejected DB query during a blip
  crashed the whole Node process (unhandledRejection, Node >= 15 exits).
- orm.ts used idleTimeoutMillis: 500, closing any pooled connection that sat
  idle for half a second and hammering the pooler with reconnect/login churn.
- pool.connect() had no connectionTimeoutMillis, so during a pooler stall
  requests hung indefinitely instead of failing fast.
- bootstrap()'s catch only logged; if the DB was unreachable at startup the
  process stayed up with no HTTP listener (a zombie that `restart: always`
  never recovers, since the policy only fires on process exit).
- Sentry's built-in OnUncaughtException/OnUnhandledRejection integrations
  would double-handle errors alongside any new handlers (double capture +
  exit race).

Changes:
- Add src/utils/globalErrorHandlers.ts: keep the process alive on
  unhandledRejection (log + Sentry), exit cleanly on uncaughtException so
  Docker (restart: always) recreates a fresh process. Registered first in
  index.ts.
- orm.ts: idleTimeoutMillis 500 -> 30000 and add connectionTimeoutMillis:
  10000 on both AppDataSource and CronDataSource; drop the no-op
  maxWaitingClients/evictionRunIntervalMillis keys (node-postgres ignores
  them).
- bootstrap(): exit on startup failure (skipped under tests) so
  restart: always self-heals once the DB is reachable again.
- sentryLogger.ts: disable Sentry's global handlers so ours are the single
  source of truth.
- example.env: document pool-size sizing to prevent recurrence.

NOTE: the over-sized production pool (TYPEORM_DATABASE_POOL_SIZE=97 per
process x 5 processes) lives in the gitignored config/production.env and
must be reduced on the server separately.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 config/example.env               |  8 ++++
 src/index.ts                     |  5 +++
 src/orm.ts                       | 30 +++++++++++---
 src/sentryLogger.ts              | 11 ++++++
 src/server/bootstrap.ts          | 12 ++++++
 src/utils/globalErrorHandlers.ts | 67 ++++++++++++++++++++++++++++++++
 6 files changed, 127 insertions(+), 6 deletions(-)
 create mode 100644 src/utils/globalErrorHandlers.ts

diff --git a/config/example.env b/config/example.env
index 888402581..d0af76289 100644
--- a/config/example.env
+++ b/config/example.env
@@ -7,6 +7,14 @@ TYPEORM_DATABASE_PASSWORD=
 TYPEORM_DATABASE_HOST=
 TYPEORM_DATABASE_PORT=
 TYPEORM_LOGGING=
+# Per-process node-postgres pool size (default 10 if empty). The effective DB
+# connection count is this value x number of app processes (each graphql
+# instance + the jobs process). Keep the total well under the database/pooler
+# connection limit. Behind DigitalOcean managed Postgres / PgBouncer,
+# oversizing this (e.g. 97 per process across 5 processes) exhausts the pooler
+# and triggers "server login has been failing ... (server_login_retry)" errors.
+# (The jobs process also opens a small separate CronDataSource pool of up to
+# ~10 connections on top of this value.)
 TYPEORM_DATABASE_POOL_SIZE=
 DROP_DATABASE=
 APOLLO_KEY=
diff --git a/src/index.ts b/src/index.ts
index 8eb1124f8..a7fb4975c 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -1,4 +1,9 @@
 import 'reflect-metadata';
+import { registerGlobalErrorHandlers } from './utils/globalErrorHandlers';
 import { bootstrap } from './server/bootstrap';
 
+// Register process-level error handlers before anything async runs so a
+// transient DB/pooler error rejecting a promise cannot crash the whole API.
+registerGlobalErrorHandlers();
+
 bootstrap();
diff --git a/src/orm.ts b/src/orm.ts
index 46ae56c7c..235c1d7f8 100644
--- a/src/orm.ts
+++ b/src/orm.ts
@@ -53,9 +53,18 @@ export class AppDataSource {
         },
         poolSize,
         extra: {
-          maxWaitingClients: 10,
-          evictionRunIntervalMillis: 500,
-          idleTimeoutMillis: 500,
+          // The service runs behind a Postgres connection pooler (DigitalOcean
+          // managed Postgres / PgBouncer). Recycling idle connections every
+          // 500ms (the previous idleTimeoutMillis) caused constant reconnect +
+          // login churn against the pooler, surfacing in production as
+          // "server login has been failing ... (server_login_retry)" errors.
+          idleTimeoutMillis: 30000,
+          // Fail fast instead of hanging forever when a connection cannot be
+          // acquired during a pooler stall, so requests error out quickly and
+          // the pool can recover.
+          connectionTimeoutMillis: 10000,
+          // (maxWaitingClients / evictionRunIntervalMillis were generic-pool
+          // options that node-postgres ignores, so they were removed.)
         },
       });
       await AppDataSource.datasource.initialize();
@@ -82,9 +91,18 @@ export class CronDataSource {
         synchronize: false,
         dropSchema: false,
         extra: {
-          maxWaitingClients: 10,
-          evictionRunIntervalMillis: 500,
-          idleTimeoutMillis: 500,
+          // The service runs behind a Postgres connection pooler (DigitalOcean
+          // managed Postgres / PgBouncer). Recycling idle connections every
+          // 500ms (the previous idleTimeoutMillis) caused constant reconnect +
+          // login churn against the pooler, surfacing in production as
+          // "server login has been failing ... (server_login_retry)" errors.
+          idleTimeoutMillis: 30000,
+          // Fail fast instead of hanging forever when a connection cannot be
+          // acquired during a pooler stall, so requests error out quickly and
+          // the pool can recover.
+          connectionTimeoutMillis: 10000,
+          // (maxWaitingClients / evictionRunIntervalMillis were generic-pool
+          // options that node-postgres ignores, so they were removed.)
         },
       });
       await CronDataSource.datasource.initialize();
diff --git a/src/sentryLogger.ts b/src/sentryLogger.ts
index 14f1ebb38..e3aa3bc6f 100644
--- a/src/sentryLogger.ts
+++ b/src/sentryLogger.ts
@@ -10,6 +10,17 @@ Sentry.init({
   // We recommend adjusting this value in production, or using tracesSampler
   // for finer control
   tracesSampleRate: 1.0,
+
+  // Crash/rejection handling lives in src/utils/globalErrorHandlers.ts, which
+  // registers our own process-level handlers (keep-alive on unhandledRejection,
+  // clean exit on uncaughtException). Disable Sentry's built-in global handlers
+  // so errors aren't captured twice and the handlers don't race to exit.
+  integrations: defaults =>
+    defaults.filter(
+      integration =>
+        integration.name !== 'OnUncaughtException' &&
+        integration.name !== 'OnUnhandledRejection',
+    ),
 });
 
 export default Sentry;
diff --git a/src/server/bootstrap.ts b/src/server/bootstrap.ts
index 5e41ef02e..ff66de905 100644
--- a/src/server/bootstrap.ts
+++ b/src/server/bootstrap.ts
@@ -35,6 +35,7 @@ import {
   translationErrorMessagesKeys,
 } from '../utils/errorMessages';
 import { logger } from '../utils/logger';
+import { flushSentryAndExit } from '../utils/globalErrorHandlers';
 import { isTrustedVercelRequest } from '../utils/ipWhitelist';
 import { adminJsRootPath, getAdminJsRouter } from './adminJs/adminJs';
 // import { apiGivRouter } from '../routers/apiGivRoutes';
@@ -426,6 +427,17 @@ export async function bootstrap() {
     });
   } catch (err) {
     logger.fatal('bootstrap() error', err);
+    SentryLogger.captureException(err as Error);
+    // A failure during startup (e.g. the database/pooler being unreachable when
+    // AppDataSource.initialize() runs) leaves the process with no HTTP listener
+    // on port 4000 — a zombie that `restart: always` never recovers, because the
+    // restart policy only fires on process exit. Exit so the container is
+    // recreated cleanly and self-heals once the dependency is reachable again.
+    // Skipped under tests so a failed bootstrap is reported by the test runner
+    // instead of abruptly terminating it.
+    if (!isTestEnv) {
+      flushSentryAndExit();
+    }
   }
 
   async function continueDbSetup() {
diff --git a/src/utils/globalErrorHandlers.ts b/src/utils/globalErrorHandlers.ts
new file mode 100644
index 000000000..876926f07
--- /dev/null
+++ b/src/utils/globalErrorHandlers.ts
@@ -0,0 +1,67 @@
+import { logger } from './logger';
+import SentryLogger from '../sentryLogger';
+
+// Set once the listeners are attached, so repeat calls become no-ops.
+let handlersRegistered = false;
+
+/**
+ * Registers process-level handlers for errors that escape application code.
+ *
+ * Why this exists: the API runs behind a Postgres connection pooler
+ * (DigitalOcean managed Postgres / PgBouncer). When the pooler or database has
+ * a transient problem, in-flight DB queries reject. Without an
+ * `unhandledRejection` listener, Node (>= 15) terminates the whole process on
+ * the first unhandled rejection, taking the entire API down for a DB blip.
+ *
+ * Behaviour:
+ *  - unhandledRejection: log + report to Sentry, then KEEP the process alive.
+ *    A single rejected promise (often a transient DB error) must not tear down
+ *    the server; it should keep serving and recover once the DB is reachable.
+ *  - uncaughtException: log + report to Sentry, then EXIT. Process state is
+ *    undefined afterwards, so `restart: always` recreates a clean process.
+ */
+export function registerGlobalErrorHandlers(): void {
+  // Idempotent: never attach the listeners more than once.
+  if (handlersRegistered) {
+    return;
+  }
+  handlersRegistered = true;
+
+  process.on('unhandledRejection', (reason: unknown) => {
+    logger.error('unhandledRejection - process kept alive', reason);
+    captureToSentry(reason);
+  });
+
+  process.on('uncaughtException', (error: Error) => {
+    logger.fatal('uncaughtException - exiting for a clean restart', error);
+    captureToSentry(error);
+    flushSentryAndExit();
+  });
+}
+
+function captureToSentry(error: unknown): void {
+  try {
+    // Non-Error values (e.g. a string rejection reason) are wrapped in an Error.
+    const wrapped = error instanceof Error ? error : new Error(String(error));
+    SentryLogger.captureException(wrapped);
+  } catch {
+    // Best-effort only: reporting must never throw inside an error handler.
+  }
+}
+
+/**
+ * Best-effort shutdown: gives Sentry up to 2s to send queued events, then exits
+ * with code 1 so the restart policy (`restart: always`) recreates a clean
+ * process. Called by the uncaughtException handler and bootstrap()'s catch.
+ */
+export function flushSentryAndExit(): void {
+  const exitWithFailure = (): never => process.exit(1);
+  // Safety net: terminate after 3s even if the Sentry flush never settles.
+  const forceExit = setTimeout(exitWithFailure, 3000);
+  void SentryLogger.close(2000)
+    .catch(() => undefined)
+    .then(() => {
+      clearTimeout(forceExit);
+      exitWithFailure();
+    });
+}

From e65c563b3545f6c987a17a74bcbc62150284d9d4 Mon Sep 17 00:00:00 2001
From: ali <aliebrahimi2079@gmail.com>
Date: Thu, 11 Jun 2026 22:35:52 +0330
Subject: [PATCH 2/2] refactor: address PR review (extract shared pool config,
 clarify cron pool docs)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- orm.ts: extract the duplicated `extra` pool config (idleTimeoutMillis,
  connectionTimeoutMillis) into a shared `poolerExtraConfig` constant used by
  both AppDataSource and CronDataSource (CodeRabbit nitpick).
- example.env: clarify that the jobs process's CronDataSource pool does NOT
  honor TYPEORM_DATABASE_POOL_SIZE — it uses node-postgres' default of ~10.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 config/example.env |  5 +++--
 src/orm.ts         | 44 ++++++++++++++++----------------------------
 2 files changed, 19 insertions(+), 30 deletions(-)

diff --git a/config/example.env b/config/example.env
index d0af76289..4993cc3d3 100644
--- a/config/example.env
+++ b/config/example.env
@@ -13,8 +13,9 @@ TYPEORM_LOGGING=
 # connection limit. Behind DigitalOcean managed Postgres / PgBouncer,
 # oversizing this (e.g. 97 per process across 5 processes) exhausts the pooler
 # and triggers "server login has been failing ... (server_login_retry)" errors.
-# (The jobs process also opens a small separate CronDataSource pool of up to
-# ~10 connections on top of this value.)
+# (The jobs process also opens a separate CronDataSource pool that does NOT
+# honor this setting — it uses node-postgres' default of ~10 connections — so
+# count ~10 extra for that process on top of this value.)
 TYPEORM_DATABASE_POOL_SIZE=
 DROP_DATABASE=
 APOLLO_KEY=
diff --git a/src/orm.ts b/src/orm.ts
index 235c1d7f8..6054779de 100644
--- a/src/orm.ts
+++ b/src/orm.ts
@@ -5,6 +5,20 @@ import { CronJob } from './entities/CronJob';
 import { getEntities } from './entities/entities';
 import { redisConfig } from './redis';
 
+// Shared connection-pool tuning for DataSources that run behind a Postgres
+// connection pooler (DigitalOcean managed Postgres / PgBouncer).
+const poolerExtraConfig = {
+  // Close a pooled connection once it has sat idle for 30s. The previous value
+  // (500ms) caused constant reconnect + login churn against the pooler, seen in
+  // production as "server login has been failing ... (server_login_retry)" errors.
+  idleTimeoutMillis: 30000,
+  // Give up after 10s instead of hanging forever when a connection cannot be
+  // acquired during a pooler stall, so requests error out and the pool can recover.
+  connectionTimeoutMillis: 10000,
+  // (maxWaitingClients / evictionRunIntervalMillis were generic-pool options
+  // that node-postgres ignores, so they were removed.)
+};
+
 export class AppDataSource {
   private static datasource: DataSource;
 
@@ -52,20 +66,7 @@ export class AppDataSource {
           },
         },
         poolSize,
-        extra: {
-          // The service runs behind a Postgres connection pooler (DigitalOcean
-          // managed Postgres / PgBouncer). Recycling idle connections every
-          // 500ms (the previous idleTimeoutMillis) caused constant reconnect +
-          // login churn against the pooler, surfacing in production as
-          // "server login has been failing ... (server_login_retry)" errors.
-          idleTimeoutMillis: 30000,
-          // Fail fast instead of hanging forever when a connection cannot be
-          // acquired during a pooler stall, so requests error out quickly and
-          // the pool can recover.
-          connectionTimeoutMillis: 10000,
-          // (maxWaitingClients / evictionRunIntervalMillis were generic-pool
-          // options that node-postgres ignores, so they were removed.)
-        },
+        extra: poolerExtraConfig,
       });
       await AppDataSource.datasource.initialize();
     }
@@ -90,20 +91,7 @@ export class CronDataSource {
         entities: [CronJob],
         synchronize: false,
         dropSchema: false,
-        extra: {
-          // The service runs behind a Postgres connection pooler (DigitalOcean
-          // managed Postgres / PgBouncer). Recycling idle connections every
-          // 500ms (the previous idleTimeoutMillis) caused constant reconnect +
-          // login churn against the pooler, surfacing in production as
-          // "server login has been failing ... (server_login_retry)" errors.
-          idleTimeoutMillis: 30000,
-          // Fail fast instead of hanging forever when a connection cannot be
-          // acquired during a pooler stall, so requests error out quickly and
-          // the pool can recover.
-          connectionTimeoutMillis: 10000,
-          // (maxWaitingClients / evictionRunIntervalMillis were generic-pool
-          // options that node-postgres ignores, so they were removed.)
-        },
+        extra: poolerExtraConfig,
       });
       await CronDataSource.datasource.initialize();
     }