Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions config/example.env
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,14 @@ TYPEORM_DATABASE_PASSWORD=
TYPEORM_DATABASE_HOST=
TYPEORM_DATABASE_PORT=
TYPEORM_LOGGING=
# Per-process node-postgres pool size (default 10 if empty). The effective DB
# connection count is this value x number of app processes (each graphql
# instance + the jobs process). Keep the total well under the database/pooler
# connection limit. Behind DigitalOcean managed Postgres / PgBouncer,
# oversizing this (e.g. 97 per process across 5 processes) exhausts the pooler
# and triggers "server login has been failing ... (server_login_retry)" errors.
# (The jobs process also opens a small separate CronDataSource pool of up to
# ~10 connections on top of this value.)
TYPEORM_DATABASE_POOL_SIZE=
Comment thread
ae2079 marked this conversation as resolved.
DROP_DATABASE=
APOLLO_KEY=
Expand Down
5 changes: 5 additions & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
import 'reflect-metadata';
import { registerGlobalErrorHandlers } from './utils/globalErrorHandlers';
import { bootstrap } from './server/bootstrap';

// Register process-level error handlers before bootstrap() starts any async
// work, so a transient DB/pooler error rejecting a promise cannot crash the
// whole API. The imports above are evaluated first, so code that runs at
// module load time executes before these handlers are attached.
registerGlobalErrorHandlers();

// Kick off application startup. The returned promise is intentionally not
// awaited here.
bootstrap();
30 changes: 24 additions & 6 deletions src/orm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,18 @@ export class AppDataSource {
},
poolSize,
extra: {
maxWaitingClients: 10,
evictionRunIntervalMillis: 500,
idleTimeoutMillis: 500,
// The service runs behind a Postgres connection pooler (DigitalOcean
// managed Postgres / PgBouncer). Recycling idle connections every
// 500ms (the previous idleTimeoutMillis) caused constant reconnect +
// login churn against the pooler, surfacing in production as
// "server login has been failing ... (server_login_retry)" errors.
idleTimeoutMillis: 30000,
// Fail fast instead of hanging forever when a connection cannot be
// acquired during a pooler stall, so requests error out quickly and
// the pool can recover.
connectionTimeoutMillis: 10000,
// (maxWaitingClients / evictionRunIntervalMillis were generic-pool
// options that node-postgres ignores, so they were removed.)
},
});
await AppDataSource.datasource.initialize();
Expand All @@ -82,9 +91,18 @@ export class CronDataSource {
synchronize: false,
dropSchema: false,
extra: {
maxWaitingClients: 10,
evictionRunIntervalMillis: 500,
idleTimeoutMillis: 500,
// The service runs behind a Postgres connection pooler (DigitalOcean
// managed Postgres / PgBouncer). Recycling idle connections every
// 500ms (the previous idleTimeoutMillis) caused constant reconnect +
// login churn against the pooler, surfacing in production as
// "server login has been failing ... (server_login_retry)" errors.
idleTimeoutMillis: 30000,
// Fail fast instead of hanging forever when a connection cannot be
// acquired during a pooler stall, so requests error out quickly and
// the pool can recover.
connectionTimeoutMillis: 10000,
// (maxWaitingClients / evictionRunIntervalMillis were generic-pool
// options that node-postgres ignores, so they were removed.)
},
});
await CronDataSource.datasource.initialize();
Expand Down
11 changes: 11 additions & 0 deletions src/sentryLogger.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,17 @@ Sentry.init({
// We recommend adjusting this value in production, or using tracesSampler
// for finer control
tracesSampleRate: 1.0,

// Crash/rejection handling lives in src/utils/globalErrorHandlers.ts, which
// registers our own process-level handlers (keep-alive on unhandledRejection,
// clean exit on uncaughtException). Disable Sentry's built-in global handlers
// so errors aren't captured twice and the handlers don't race to exit.
integrations: defaults =>
defaults.filter(
integration =>
integration.name !== 'OnUncaughtException' &&
integration.name !== 'OnUnhandledRejection',
),
});

export default Sentry;
12 changes: 12 additions & 0 deletions src/server/bootstrap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ import {
translationErrorMessagesKeys,
} from '../utils/errorMessages';
import { logger } from '../utils/logger';
import { flushSentryAndExit } from '../utils/globalErrorHandlers';
import { isTrustedVercelRequest } from '../utils/ipWhitelist';
import { adminJsRootPath, getAdminJsRouter } from './adminJs/adminJs';
// import { apiGivRouter } from '../routers/apiGivRoutes';
Expand Down Expand Up @@ -426,6 +427,17 @@ export async function bootstrap() {
});
} catch (err) {
logger.fatal('bootstrap() error', err);
SentryLogger.captureException(err as Error);
// A failure during startup (e.g. the database/pooler being unreachable when
// AppDataSource.initialize() runs) leaves the process with no HTTP listener
// on port 4000 — a zombie that `restart: always` never recovers, because the
// restart policy only fires on process exit. Exit so the container is
// recreated cleanly and self-heals once the dependency is reachable again.
// Skipped under tests so a failed bootstrap is reported by the test runner
// instead of abruptly terminating it.
if (!isTestEnv) {
flushSentryAndExit();
}
Comment thread
ae2079 marked this conversation as resolved.
}

async function continueDbSetup() {
Expand Down
67 changes: 67 additions & 0 deletions src/utils/globalErrorHandlers.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import { logger } from './logger';
import SentryLogger from '../sentryLogger';

// Module-level latch: flips to true the first time the listeners are attached,
// so repeated calls to registerGlobalErrorHandlers() are no-ops.
let handlersRegistered = false;

/**
 * Installs the process-wide listeners for errors that escape application code.
 *
 * Background: the API sits behind a Postgres connection pooler (DigitalOcean
 * managed Postgres / PgBouncer). A transient pooler/database problem makes
 * in-flight queries reject, and without an `unhandledRejection` listener
 * Node (>= 15) kills the process on the first such rejection — a momentary DB
 * blip would take the whole API down.
 *
 * Policy per event:
 * - `unhandledRejection`: log, report to Sentry, and KEEP running. One rejected
 *   promise (frequently a transient DB error) must not tear the server down.
 * - `uncaughtException`: log, report to Sentry, then EXIT. Process state is
 *   undefined after an uncaught exception, so the container restart policy
 *   (`restart: always`) is relied on to bring up a clean process.
 *
 * Safe to call more than once; listeners are only ever attached a single time.
 */
export function registerGlobalErrorHandlers(): void {
  if (!handlersRegistered) {
    handlersRegistered = true;

    // Keep-alive path: record the rejection and carry on serving.
    const onUnhandledRejection = (reason: unknown): void => {
      logger.error('unhandledRejection - process kept alive', reason);
      captureToSentry(reason);
    };

    // Fatal path: record the exception, flush Sentry, then exit non-zero.
    const onUncaughtException = (error: Error): void => {
      logger.fatal('uncaughtException - exiting for a clean restart', error);
      captureToSentry(error);
      flushSentryAndExit();
    };

    process.on('unhandledRejection', onUnhandledRejection);
    process.on('uncaughtException', onUncaughtException);
  }
}

/**
 * Reports an escaped error to Sentry without ever throwing.
 *
 * `Error` instances are forwarded untouched. Anything else (promises can be
 * rejected with arbitrary values) is wrapped in a new `Error` whose message
 * describes the value, so the report stays actionable instead of collapsing
 * every object reason into "[object Object]".
 *
 * @param error - The rejection reason or thrown value to report.
 */
function captureToSentry(error: unknown): void {
  try {
    SentryLogger.captureException(
      error instanceof Error ? error : new Error(describeNonErrorReason(error)),
    );
  } catch {
    // Never let error reporting throw from inside an error handler.
  }
}

/**
 * Builds a human-readable message for a non-Error rejection reason.
 *
 * Primitives (and null/undefined) keep their `String(...)` form. Objects are
 * JSON-serialised when that yields something informative; otherwise the
 * function falls back to `String(...)`, and finally to
 * `Object.prototype.toString`, which cannot throw.
 */
function describeNonErrorReason(reason: unknown): string {
  if (typeof reason !== 'object' || reason === null) {
    return String(reason);
  }

  try {
    const serialised = JSON.stringify(reason);
    // `{}` carries no more information than the String() form, so skip it.
    if (serialised !== undefined && serialised !== '{}') {
      return serialised;
    }
  } catch {
    // Circular references or BigInt fields: fall through to the String() form.
  }

  try {
    return String(reason);
  } catch {
    // e.g. null-prototype objects have no toString to call.
    return Object.prototype.toString.call(reason);
  }
}

/**
 * Best-effort flush of queued Sentry events, followed by a non-zero exit.
 *
 * Sentry is given up to 2s to drain. Whether the flush resolves or rejects,
 * the process then exits with code 1 so the container restart policy
 * (`restart: always`) brings up a clean process. A 3s watchdog timer forces
 * the same exit if the flush never settles.
 *
 * Called from the uncaughtException handler and from the bootstrap()
 * startup-failure path.
 */
export function flushSentryAndExit(): void {
  // Watchdog: guarantees termination even when Sentry's close() stalls.
  const watchdog = setTimeout(() => process.exit(1), 3000);

  // Normal path: cancel the watchdog and exit straight away.
  const finish = (): void => {
    clearTimeout(watchdog);
    process.exit(1);
  };

  const pendingFlush = SentryLogger.close(2000);
  // A rejected flush is swallowed on purpose; exiting matters more than the report.
  void pendingFlush.catch(() => undefined).then(finish);
}
Loading