From 2462ee95a7b80d8a0fe0fd5ce2f61c83ae0137f1 Mon Sep 17 00:00:00 2001 From: ali Date: Thu, 11 Jun 2026 19:22:41 +0330 Subject: [PATCH 1/2] fix: make API resilient to DB/pooler connection loss Production and staging went down during a DigitalOcean managed Postgres / PgBouncer connectivity blip ("server login has been failing ... (server_login_retry)"). Several issues combined to turn a transient DB problem into a hard outage that needed a manual redeploy: - No process-level error handlers, so a rejected DB query during a blip crashed the whole Node process (unhandledRejection, Node >= 15 exits). - orm.ts used idleTimeoutMillis: 500, recycling idle connections twice a second and hammering the pooler with reconnect/login churn. - pool.connect() had no connectionTimeoutMillis, so during a pooler stall requests hung indefinitely instead of failing fast. - bootstrap()'s catch only logged; if the DB was unreachable at startup the process stayed up with no HTTP listener (a zombie that `restart: always` never recovers, since the policy only fires on process exit). - Sentry's built-in OnUncaughtException/OnUnhandledRejection integrations double-handled with any new handlers (double capture + exit race). Changes: - Add src/utils/globalErrorHandlers.ts: keep the process alive on unhandledRejection (log + Sentry), exit cleanly on uncaughtException so Docker (restart: always) recreates a fresh process. Registered first in index.ts. - orm.ts: idleTimeoutMillis 500 -> 30000 and add connectionTimeoutMillis: 10000 on both AppDataSource and CronDataSource; drop the no-op maxWaitingClients/evictionRunIntervalMillis keys (node-postgres ignores them). - bootstrap(): exit on startup failure (skipped under tests) so restart: always self-heals once the DB is reachable again. - sentryLogger.ts: disable Sentry's global handlers so ours are the single source of truth. - example.env: document pool-size sizing to prevent recurrence. 
NOTE: the over-sized production pool (TYPEORM_DATABASE_POOL_SIZE=97 per process x 5 processes) lives in the gitignored config/production.env and must be reduced on the server separately. Co-Authored-By: Claude Opus 4.8 (1M context) --- config/example.env | 8 ++++ src/index.ts | 5 +++ src/orm.ts | 30 +++++++++++--- src/sentryLogger.ts | 11 ++++++ src/server/bootstrap.ts | 12 ++++++ src/utils/globalErrorHandlers.ts | 67 ++++++++++++++++++++++++++++++++ 6 files changed, 127 insertions(+), 6 deletions(-) create mode 100644 src/utils/globalErrorHandlers.ts diff --git a/config/example.env b/config/example.env index 888402581..d0af76289 100644 --- a/config/example.env +++ b/config/example.env @@ -7,6 +7,14 @@ TYPEORM_DATABASE_PASSWORD= TYPEORM_DATABASE_HOST= TYPEORM_DATABASE_PORT= TYPEORM_LOGGING= +# Per-process node-postgres pool size (default 10 if empty). The effective DB +# connection count is this value x number of app processes (each graphql +# instance + the jobs process). Keep the total well under the database/pooler +# connection limit. Behind DigitalOcean managed Postgres / PgBouncer, +# oversizing this (e.g. 97 per process across 5 processes) exhausts the pooler +# and triggers "server login has been failing ... (server_login_retry)" errors. +# (The jobs process also opens a small separate CronDataSource pool of up to +# ~10 connections on top of this value.) TYPEORM_DATABASE_POOL_SIZE= DROP_DATABASE= APOLLO_KEY= diff --git a/src/index.ts b/src/index.ts index 8eb1124f8..a7fb4975c 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,4 +1,9 @@ import 'reflect-metadata'; +import { registerGlobalErrorHandlers } from './utils/globalErrorHandlers'; import { bootstrap } from './server/bootstrap'; +// Register process-level error handlers before bootstrap() is called, so a +// transient DB/pooler error rejecting a promise cannot crash the whole API.
+registerGlobalErrorHandlers(); + bootstrap(); diff --git a/src/orm.ts b/src/orm.ts index 46ae56c7c..235c1d7f8 100644 --- a/src/orm.ts +++ b/src/orm.ts @@ -53,9 +53,18 @@ export class AppDataSource { }, poolSize, extra: { - maxWaitingClients: 10, - evictionRunIntervalMillis: 500, - idleTimeoutMillis: 500, + // The service runs behind a Postgres connection pooler (DigitalOcean + // managed Postgres / PgBouncer). Recycling idle connections every + // 500ms (the previous idleTimeoutMillis) caused constant reconnect + + // login churn against the pooler, surfacing in production as + // "server login has been failing ... (server_login_retry)" errors. + idleTimeoutMillis: 30000, + // Fail fast instead of hanging forever when a connection cannot be + // acquired during a pooler stall, so requests error out quickly and + // the pool can recover. + connectionTimeoutMillis: 10000, + // (maxWaitingClients / evictionRunIntervalMillis were generic-pool + // options that node-postgres ignores, so they were removed.) }, }); await AppDataSource.datasource.initialize(); @@ -82,9 +91,18 @@ export class CronDataSource { synchronize: false, dropSchema: false, extra: { - maxWaitingClients: 10, - evictionRunIntervalMillis: 500, - idleTimeoutMillis: 500, + // The service runs behind a Postgres connection pooler (DigitalOcean + // managed Postgres / PgBouncer). Recycling idle connections every + // 500ms (the previous idleTimeoutMillis) caused constant reconnect + + // login churn against the pooler, surfacing in production as + // "server login has been failing ... (server_login_retry)" errors. + idleTimeoutMillis: 30000, + // Fail fast instead of hanging forever when a connection cannot be + // acquired during a pooler stall, so requests error out quickly and + // the pool can recover. + connectionTimeoutMillis: 10000, + // (maxWaitingClients / evictionRunIntervalMillis were generic-pool + // options that node-postgres ignores, so they were removed.) 
}, }); await CronDataSource.datasource.initialize(); diff --git a/src/sentryLogger.ts b/src/sentryLogger.ts index 14f1ebb38..e3aa3bc6f 100644 --- a/src/sentryLogger.ts +++ b/src/sentryLogger.ts @@ -10,6 +10,17 @@ Sentry.init({ // We recommend adjusting this value in production, or using tracesSampler // for finer control tracesSampleRate: 1.0, + + // Crash/rejection handling lives in src/utils/globalErrorHandlers.ts, which + // registers our own process-level handlers (keep-alive on unhandledRejection, + // clean exit on uncaughtException). Disable Sentry's built-in global handlers + // so errors aren't captured twice and the handlers don't race to exit. + integrations: defaults => + defaults.filter( + integration => + integration.name !== 'OnUncaughtException' && + integration.name !== 'OnUnhandledRejection', + ), }); export default Sentry; diff --git a/src/server/bootstrap.ts b/src/server/bootstrap.ts index 5e41ef02e..ff66de905 100644 --- a/src/server/bootstrap.ts +++ b/src/server/bootstrap.ts @@ -35,6 +35,7 @@ import { translationErrorMessagesKeys, } from '../utils/errorMessages'; import { logger } from '../utils/logger'; +import { flushSentryAndExit } from '../utils/globalErrorHandlers'; import { isTrustedVercelRequest } from '../utils/ipWhitelist'; import { adminJsRootPath, getAdminJsRouter } from './adminJs/adminJs'; // import { apiGivRouter } from '../routers/apiGivRoutes'; @@ -426,6 +427,17 @@ export async function bootstrap() { }); } catch (err) { logger.fatal('bootstrap() error', err); + SentryLogger.captureException(err as Error); + // A failure during startup (e.g. the database/pooler being unreachable when + // AppDataSource.initialize() runs) leaves the process with no HTTP listener + // on port 4000 — a zombie that `restart: always` never recovers, because the + // restart policy only fires on process exit. Exit so the container is + // recreated cleanly and self-heals once the dependency is reachable again. 
+ // Skipped under tests so a failed bootstrap is reported by the test runner + // instead of abruptly terminating it. + if (!isTestEnv) { + flushSentryAndExit(); + } } async function continueDbSetup() { diff --git a/src/utils/globalErrorHandlers.ts b/src/utils/globalErrorHandlers.ts new file mode 100644 index 000000000..876926f07 --- /dev/null +++ b/src/utils/globalErrorHandlers.ts @@ -0,0 +1,67 @@ +import { logger } from './logger'; +import SentryLogger from '../sentryLogger'; + +/** + * Registers process-level handlers for errors that escape application code. + * + * Why this exists: the API runs behind a Postgres connection pooler + * (DigitalOcean managed Postgres / PgBouncer). When the pooler or database has + * a transient problem, in-flight DB queries reject. Without an + * `unhandledRejection` listener, Node (>= 15) terminates the whole process on + * the first such rejection, taking the entire API down for a momentary DB blip. + * + * Behaviour: + * - unhandledRejection: log + report to Sentry, then KEEP the process alive. + * A single rejected promise (often a transient DB error) must not tear down + * the server; it should keep serving and recover once the DB is reachable. + * - uncaughtException: log + report to Sentry, then EXIT. The process state is + * undefined after an uncaught exception, so we let the container restart + * policy (`restart: always`) recreate a clean process. + */ +let handlersRegistered = false; + +export function registerGlobalErrorHandlers(): void { + // Idempotent: never attach the listeners more than once. 
+ if (handlersRegistered) { + return; + } + handlersRegistered = true; + + process.on('unhandledRejection', (reason: unknown) => { + logger.error('unhandledRejection - process kept alive', reason); + captureToSentry(reason); + }); + + process.on('uncaughtException', (error: Error) => { + logger.fatal('uncaughtException - exiting for a clean restart', error); + captureToSentry(error); + flushSentryAndExit(); + }); +} + +function captureToSentry(error: unknown): void { + try { + SentryLogger.captureException( + error instanceof Error ? error : new Error(String(error)), + ); + } catch { + // Never let error reporting throw from inside an error handler. + } +} + +/** + * Flushes pending Sentry events (best-effort, max 2s) and then exits with a + * non-zero code so the container restart policy (`restart: always`) recreates a + * clean process. Used by the uncaughtException handler above and by the + * bootstrap() startup-failure path. + */ +export function flushSentryAndExit(): void { + // A hard fallback guarantees the process exits even if flushing stalls. + const forceExit = setTimeout(() => process.exit(1), 3000); + void SentryLogger.close(2000) + .catch(() => undefined) + .then(() => { + clearTimeout(forceExit); + process.exit(1); + }); +} From e65c563b3545f6c987a17a74bcbc62150284d9d4 Mon Sep 17 00:00:00 2001 From: ali Date: Thu, 11 Jun 2026 22:35:52 +0330 Subject: [PATCH 2/2] refactor: address PR review (extract shared pool config, clarify cron pool docs) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - orm.ts: extract the duplicated `extra` pool config (idleTimeoutMillis, connectionTimeoutMillis) into a shared `poolerExtraConfig` constant used by both AppDataSource and CronDataSource (CodeRabbit nitpick). - example.env: clarify that the jobs process's CronDataSource pool does NOT honor TYPEORM_DATABASE_POOL_SIZE — it uses node-postgres' default of ~10. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- config/example.env | 5 +++-- src/orm.ts | 44 ++++++++++++++++---------------------------- 2 files changed, 19 insertions(+), 30 deletions(-) diff --git a/config/example.env b/config/example.env index d0af76289..4993cc3d3 100644 --- a/config/example.env +++ b/config/example.env @@ -13,8 +13,9 @@ TYPEORM_LOGGING= # connection limit. Behind DigitalOcean managed Postgres / PgBouncer, # oversizing this (e.g. 97 per process across 5 processes) exhausts the pooler # and triggers "server login has been failing ... (server_login_retry)" errors. -# (The jobs process also opens a small separate CronDataSource pool of up to -# ~10 connections on top of this value.) +# (The jobs process also opens a separate CronDataSource pool that does NOT +# honor this setting — it uses node-postgres' default pool `max` of 10 connections — so +# count up to 10 extra for that process on top of this value.) TYPEORM_DATABASE_POOL_SIZE= DROP_DATABASE= APOLLO_KEY= diff --git a/src/orm.ts b/src/orm.ts index 235c1d7f8..6054779de 100644 --- a/src/orm.ts +++ b/src/orm.ts @@ -5,6 +5,20 @@ import { CronJob } from './entities/CronJob'; import { getEntities } from './entities/entities'; import { redisConfig } from './redis'; +// Shared connection-pool tuning for DataSources that run behind a Postgres +// connection pooler (DigitalOcean managed Postgres / PgBouncer). +const poolerExtraConfig = { + // Recycling idle connections every 500ms (the previous idleTimeoutMillis) + // caused constant reconnect + login churn against the pooler, surfacing in + // production as "server login has been failing ... (server_login_retry)" errors. + idleTimeoutMillis: 30000, + // Fail fast instead of hanging forever when a connection cannot be acquired + // during a pooler stall, so requests error out quickly and the pool can recover.
+ connectionTimeoutMillis: 10000, + // (maxWaitingClients / evictionRunIntervalMillis were generic-pool options + // that node-postgres ignores, so they were removed.) +}; + export class AppDataSource { private static datasource: DataSource; @@ -52,20 +66,7 @@ export class AppDataSource { }, }, poolSize, - extra: { - // The service runs behind a Postgres connection pooler (DigitalOcean - // managed Postgres / PgBouncer). Recycling idle connections every - // 500ms (the previous idleTimeoutMillis) caused constant reconnect + - // login churn against the pooler, surfacing in production as - // "server login has been failing ... (server_login_retry)" errors. - idleTimeoutMillis: 30000, - // Fail fast instead of hanging forever when a connection cannot be - // acquired during a pooler stall, so requests error out quickly and - // the pool can recover. - connectionTimeoutMillis: 10000, - // (maxWaitingClients / evictionRunIntervalMillis were generic-pool - // options that node-postgres ignores, so they were removed.) - }, + extra: poolerExtraConfig, }); await AppDataSource.datasource.initialize(); } @@ -90,20 +91,7 @@ export class CronDataSource { entities: [CronJob], synchronize: false, dropSchema: false, - extra: { - // The service runs behind a Postgres connection pooler (DigitalOcean - // managed Postgres / PgBouncer). Recycling idle connections every - // 500ms (the previous idleTimeoutMillis) caused constant reconnect + - // login churn against the pooler, surfacing in production as - // "server login has been failing ... (server_login_retry)" errors. - idleTimeoutMillis: 30000, - // Fail fast instead of hanging forever when a connection cannot be - // acquired during a pooler stall, so requests error out quickly and - // the pool can recover. - connectionTimeoutMillis: 10000, - // (maxWaitingClients / evictionRunIntervalMillis were generic-pool - // options that node-postgres ignores, so they were removed.) 
- }, + extra: poolerExtraConfig, }); await CronDataSource.datasource.initialize(); }