diff --git a/config/example.env b/config/example.env index 888402581..4993cc3d3 100644 --- a/config/example.env +++ b/config/example.env @@ -7,6 +7,15 @@ TYPEORM_DATABASE_PASSWORD= TYPEORM_DATABASE_HOST= TYPEORM_DATABASE_PORT= TYPEORM_LOGGING= +# Per-process node-postgres pool size (default 10 if empty). The effective DB +# connection count is this value x number of app processes (each graphql +# instance + the jobs process). Keep the total well under the database/pooler +# connection limit. Behind DigitalOcean managed Postgres / PgBouncer, +# oversizing this (e.g. 97 per process across 5 processes) exhausts the pooler +# and triggers "server login has been failing ... (server_login_retry)" errors. +# (The jobs process also opens a separate CronDataSource pool that does NOT +# honor this setting — it uses node-postgres' default of ~10 connections — so +# count ~10 extra for that process on top of this value.) TYPEORM_DATABASE_POOL_SIZE= DROP_DATABASE= APOLLO_KEY= diff --git a/src/index.ts b/src/index.ts index 8eb1124f8..a7fb4975c 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,4 +1,9 @@ import 'reflect-metadata'; +import { registerGlobalErrorHandlers } from './utils/globalErrorHandlers'; import { bootstrap } from './server/bootstrap'; +// Register process-level error handlers before anything async runs so a +// transient DB/pooler error rejecting a promise cannot crash the whole API. +registerGlobalErrorHandlers(); + bootstrap(); diff --git a/src/orm.ts b/src/orm.ts index 46ae56c7c..6054779de 100644 --- a/src/orm.ts +++ b/src/orm.ts @@ -5,6 +5,20 @@ import { CronJob } from './entities/CronJob'; import { getEntities } from './entities/entities'; import { redisConfig } from './redis'; +// Shared connection-pool tuning for DataSources that run behind a Postgres +// connection pooler (DigitalOcean managed Postgres / PgBouncer). 
+const poolerExtraConfig = { + // Recycling idle connections every 500ms (the previous idleTimeoutMillis) + // caused constant reconnect + login churn against the pooler, surfacing in + // production as "server login has been failing ... (server_login_retry)" errors. + idleTimeoutMillis: 30000, + // Fail fast instead of hanging forever when a connection cannot be acquired + // during a pooler stall, so requests error out quickly and the pool can recover. + connectionTimeoutMillis: 10000, + // (maxWaitingClients / evictionRunIntervalMillis were generic-pool options + // that node-postgres ignores, so they were removed.) +}; + export class AppDataSource { private static datasource: DataSource; @@ -52,11 +66,7 @@ export class AppDataSource { }, }, poolSize, - extra: { - maxWaitingClients: 10, - evictionRunIntervalMillis: 500, - idleTimeoutMillis: 500, - }, + extra: poolerExtraConfig, }); await AppDataSource.datasource.initialize(); } @@ -81,11 +91,7 @@ export class CronDataSource { entities: [CronJob], synchronize: false, dropSchema: false, - extra: { - maxWaitingClients: 10, - evictionRunIntervalMillis: 500, - idleTimeoutMillis: 500, - }, + extra: poolerExtraConfig, }); await CronDataSource.datasource.initialize(); } diff --git a/src/sentryLogger.ts b/src/sentryLogger.ts index 14f1ebb38..e3aa3bc6f 100644 --- a/src/sentryLogger.ts +++ b/src/sentryLogger.ts @@ -10,6 +10,17 @@ Sentry.init({ // We recommend adjusting this value in production, or using tracesSampler // for finer control tracesSampleRate: 1.0, + + // Crash/rejection handling lives in src/utils/globalErrorHandlers.ts, which + // registers our own process-level handlers (keep-alive on unhandledRejection, + // clean exit on uncaughtException). Disable Sentry's built-in global handlers + // so errors aren't captured twice and the handlers don't race to exit. 
+ integrations: defaults => + defaults.filter( + integration => + integration.name !== 'OnUncaughtException' && + integration.name !== 'OnUnhandledRejection', + ), }); export default Sentry; diff --git a/src/server/bootstrap.ts b/src/server/bootstrap.ts index 5e41ef02e..ff66de905 100644 --- a/src/server/bootstrap.ts +++ b/src/server/bootstrap.ts @@ -35,6 +35,7 @@ import { translationErrorMessagesKeys, } from '../utils/errorMessages'; import { logger } from '../utils/logger'; +import { flushSentryAndExit } from '../utils/globalErrorHandlers'; import { isTrustedVercelRequest } from '../utils/ipWhitelist'; import { adminJsRootPath, getAdminJsRouter } from './adminJs/adminJs'; // import { apiGivRouter } from '../routers/apiGivRoutes'; @@ -426,6 +427,17 @@ export async function bootstrap() { }); } catch (err) { logger.fatal('bootstrap() error', err); + SentryLogger.captureException(err as Error); + // A failure during startup (e.g. the database/pooler being unreachable when + // AppDataSource.initialize() runs) leaves the process with no HTTP listener + // on port 4000 — a zombie that `restart: always` never recovers, because the + // restart policy only fires on process exit. Exit so the container is + // recreated cleanly and self-heals once the dependency is reachable again. + // Skipped under tests so a failed bootstrap is reported by the test runner + // instead of abruptly terminating it. + if (!isTestEnv) { + flushSentryAndExit(); + } } async function continueDbSetup() { diff --git a/src/utils/globalErrorHandlers.ts b/src/utils/globalErrorHandlers.ts new file mode 100644 index 000000000..876926f07 --- /dev/null +++ b/src/utils/globalErrorHandlers.ts @@ -0,0 +1,67 @@ +import { logger } from './logger'; +import SentryLogger from '../sentryLogger'; + +/** + * Registers process-level handlers for errors that escape application code. + * + * Why this exists: the API runs behind a Postgres connection pooler + * (DigitalOcean managed Postgres / PgBouncer). 
When the pooler or database has
+ * a transient problem, in-flight DB queries reject. Without an
+ * `unhandledRejection` listener, Node (>= 15) terminates the whole process on
+ * the first such rejection, taking the entire API down for a momentary DB blip.
+ *
+ * Behaviour:
+ * - unhandledRejection: log + report to Sentry, then KEEP the process alive.
+ *   A single rejected promise (often a transient DB error) must not tear down
+ *   the server; it should keep serving and recover once the DB is reachable.
+ * - uncaughtException: log + report to Sentry, then EXIT. The process state is
+ *   undefined after an uncaught exception, so we let the container restart
+ *   policy (`restart: always`) recreate a clean process.
+ */
+let handlersRegistered = false;
+
+export function registerGlobalErrorHandlers(): void {
+  // Idempotent: never attach the listeners more than once.
+  if (handlersRegistered) {
+    return;
+  }
+  handlersRegistered = true;
+
+  process.on('unhandledRejection', (reason: unknown) => {
+    logger.error('unhandledRejection - process kept alive', reason);
+    captureToSentry(reason);
+  });
+
+  process.on('uncaughtException', (error: Error) => {
+    logger.fatal('uncaughtException - exiting for a clean restart', error);
+    captureToSentry(error);
+    flushSentryAndExit();
+  });
+}
+
+/** Reports `error` to Sentry; safe to call from inside an error handler. */
+function captureToSentry(error: unknown): void {
+  try {
+    // Hand over the raw value; String(obj) would report "[object Object]".
+    SentryLogger.captureException(error);
+  } catch {
+    // Never let error reporting throw from inside an error handler.
+  }
+}
+
+/**
+ * Flushes pending Sentry events (best-effort, max 2s) and then exits with a
+ * non-zero code so the container restart policy (`restart: always`) recreates a
+ * clean process. Used by the uncaughtException handler above and by the
+ * bootstrap() startup-failure path.
+ */
+export function flushSentryAndExit(): void {
+  // A hard fallback guarantees the process exits even if flushing stalls.
+  const forceExit = setTimeout(() => process.exit(1), 3000);
+  void SentryLogger.close(2000)
+    .catch(() => undefined)
+    .then(() => {
+      clearTimeout(forceExit);
+      process.exit(1);
+    });
+}