Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions config/example.env
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,15 @@ TYPEORM_DATABASE_PASSWORD=
TYPEORM_DATABASE_HOST=
TYPEORM_DATABASE_PORT=
TYPEORM_LOGGING=
# Per-process node-postgres pool size (default 10 if empty). The effective DB
# connection count is this value x number of app processes (each graphql
# instance + the jobs process). Keep the total well under the database/pooler
# connection limit. Behind DigitalOcean managed Postgres / PgBouncer,
# oversizing this (e.g. 97 per process across 5 processes) exhausts the pooler
# and triggers "server login has been failing ... (server_login_retry)" errors.
# (The jobs process also opens a separate CronDataSource pool that does NOT
# honor this setting — it falls back to node-postgres's default pool size of
# 10 connections — so budget 10 extra for that process on top of this value.)
TYPEORM_DATABASE_POOL_SIZE=
Comment thread
ae2079 marked this conversation as resolved.
DROP_DATABASE=
APOLLO_KEY=
Expand Down
5 changes: 5 additions & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
import 'reflect-metadata';
import { registerGlobalErrorHandlers } from './utils/globalErrorHandlers';
import { bootstrap } from './server/bootstrap';

// Install the process-level error handlers first, ahead of bootstrap(), so a
// transient DB/pooler error rejecting a promise cannot crash the whole API.
// NOTE(review): the import statements above are still evaluated before this
// call runs, so work that a module starts at import time is not covered by
// these handlers — confirm none of the imported modules does that.
registerGlobalErrorHandlers();

// Kick off application startup. bootstrap() is async and its promise is not
// awaited here; it logs its own startup failures in its catch block.
bootstrap();
26 changes: 16 additions & 10 deletions src/orm.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,20 @@ import { CronJob } from './entities/CronJob';
import { getEntities } from './entities/entities';
import { redisConfig } from './redis';

// How long a pooled connection may sit idle before it is closed. The previous
// value (500ms) recycled connections constantly, and the resulting
// reconnect + login churn against the pooler showed up in production as
// "server login has been failing ... (server_login_retry)" errors.
const POOL_IDLE_TIMEOUT_MS = 30 * 1000;

// Upper bound on waiting for a connection. Without it an acquire can hang
// forever during a pooler stall; with it requests fail fast and the pool gets
// a chance to recover.
const POOL_CONNECTION_TIMEOUT_MS = 10 * 1000;

// Driver-level pool options shared by every DataSource that sits behind a
// Postgres connection pooler (DigitalOcean managed Postgres / PgBouncer).
// maxWaitingClients and evictionRunIntervalMillis are intentionally absent:
// they were generic-pool options that node-postgres ignores.
const poolerExtraConfig = {
  idleTimeoutMillis: POOL_IDLE_TIMEOUT_MS,
  connectionTimeoutMillis: POOL_CONNECTION_TIMEOUT_MS,
};

export class AppDataSource {
private static datasource: DataSource;

Expand Down Expand Up @@ -52,11 +66,7 @@ export class AppDataSource {
},
},
poolSize,
extra: {
maxWaitingClients: 10,
evictionRunIntervalMillis: 500,
idleTimeoutMillis: 500,
},
extra: poolerExtraConfig,
});
await AppDataSource.datasource.initialize();
}
Expand All @@ -81,11 +91,7 @@ export class CronDataSource {
entities: [CronJob],
synchronize: false,
dropSchema: false,
extra: {
maxWaitingClients: 10,
evictionRunIntervalMillis: 500,
idleTimeoutMillis: 500,
},
extra: poolerExtraConfig,
});
await CronDataSource.datasource.initialize();
}
Expand Down
11 changes: 11 additions & 0 deletions src/sentryLogger.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,17 @@ Sentry.init({
// We recommend adjusting this value in production, or using tracesSampler
// for finer control
tracesSampleRate: 1.0,

// Crash/rejection handling lives in src/utils/globalErrorHandlers.ts, which
// registers our own process-level handlers (keep-alive on unhandledRejection,
// clean exit on uncaughtException). Disable Sentry's built-in global handlers
// so errors aren't captured twice and the handlers don't race to exit.
integrations: defaults =>
defaults.filter(
integration =>
integration.name !== 'OnUncaughtException' &&
integration.name !== 'OnUnhandledRejection',
),
});

export default Sentry;
12 changes: 12 additions & 0 deletions src/server/bootstrap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ import {
translationErrorMessagesKeys,
} from '../utils/errorMessages';
import { logger } from '../utils/logger';
import { flushSentryAndExit } from '../utils/globalErrorHandlers';
import { isTrustedVercelRequest } from '../utils/ipWhitelist';
import { adminJsRootPath, getAdminJsRouter } from './adminJs/adminJs';
// import { apiGivRouter } from '../routers/apiGivRoutes';
Expand Down Expand Up @@ -426,6 +427,17 @@ export async function bootstrap() {
});
} catch (err) {
logger.fatal('bootstrap() error', err);
SentryLogger.captureException(err as Error);
// A failure during startup (e.g. the database/pooler being unreachable when
// AppDataSource.initialize() runs) leaves the process with no HTTP listener
// on port 4000 — a zombie that `restart: always` never recovers, because the
// restart policy only fires on process exit. Exit so the container is
// recreated cleanly and self-heals once the dependency is reachable again.
// Skipped under tests so a failed bootstrap is reported by the test runner
// instead of abruptly terminating it.
if (!isTestEnv) {
flushSentryAndExit();
}
Comment thread
ae2079 marked this conversation as resolved.
}

async function continueDbSetup() {
Expand Down
67 changes: 67 additions & 0 deletions src/utils/globalErrorHandlers.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import { logger } from './logger';
import SentryLogger from '../sentryLogger';

// Set once the listeners are attached so repeated calls are no-ops.
let handlersRegistered = false;

/**
 * Registers process-level handlers for errors that escape application code.
 *
 * Why this exists: the API runs behind a Postgres connection pooler
 * (DigitalOcean managed Postgres / PgBouncer). When the pooler or database has
 * a transient problem, in-flight DB queries reject. Without an
 * `unhandledRejection` listener, Node (>= 15) terminates the whole process on
 * the first such rejection, taking the entire API down for a momentary DB blip.
 *
 * Behaviour:
 *  - unhandledRejection: log + report to Sentry, then KEEP the process alive.
 *    A single rejected promise (often a transient DB error) must not tear down
 *    the server; it should keep serving and recover once the DB is reachable.
 *  - uncaughtException: log + report to Sentry, then EXIT. The process state is
 *    undefined after an uncaught exception, so we let the container restart
 *    policy (`restart: always`) recreate a clean process.
 *
 * Logging inside both handlers is best-effort: a failure while logging is
 * swallowed so it can neither escalate an unhandled rejection into a
 * process-ending uncaught exception nor prevent the Sentry report / exit.
 *
 * Idempotent: calling it more than once attaches the listeners only once.
 */
export function registerGlobalErrorHandlers(): void {
  if (handlersRegistered) {
    return;
  }
  handlersRegistered = true;

  process.on('unhandledRejection', (reason: unknown) => {
    try {
      logger.error('unhandledRejection - process kept alive', reason);
    } catch {
      // A throw here would surface as an uncaughtException and exit the
      // process, defeating the keep-alive guarantee of this handler.
    }
    captureToSentry(reason);
  });

  process.on('uncaughtException', (error: Error) => {
    try {
      logger.fatal('uncaughtException - exiting for a clean restart', error);
    } catch {
      // Still report to Sentry and exit even if the logger itself is broken.
    }
    captureToSentry(error);
    flushSentryAndExit();
  });
}

/**
 * Reports a thrown value / rejection reason to Sentry without ever throwing.
 *
 * Anything that is not already an `Error` is wrapped in one first (see
 * `toReportableError`) so the event carries a readable message.
 *
 * @param error The value that was thrown or used as a rejection reason.
 */
function captureToSentry(error: unknown): void {
  try {
    // The conversion lives inside the try as well: stringifying exotic values
    // can itself throw, and that must not escape an error handler.
    SentryLogger.captureException(toReportableError(error));
  } catch {
    // Never let error reporting throw from inside an error handler.
  }
}

/**
 * Converts an arbitrary thrown value into an `Error`.
 *
 * `Error` instances are returned untouched and strings become the message
 * as-is. Other values are serialized with JSON so that an object reason such
 * as `{ code: 'ECONNRESET' }` does not collapse into "[object Object]";
 * values JSON cannot represent (undefined, functions, symbols, bigint,
 * circular structures) fall back to `String(value)`.
 */
function toReportableError(reason: unknown): Error {
  if (reason instanceof Error) {
    return reason;
  }
  if (typeof reason === 'string') {
    return new Error(reason);
  }

  let description: string | undefined;
  try {
    // Yields undefined (not a throw) for undefined / functions / symbols.
    description = JSON.stringify(reason);
  } catch {
    // Circular structures and bigint values make JSON.stringify throw.
    description = undefined;
  }
  return new Error(description ?? String(reason));
}

/**
 * Terminates the process with exit code 1 after giving Sentry a short window
 * to deliver queued events, so the container restart policy
 * (`restart: always`) brings up a fresh process.
 *
 * Sentry gets up to 2s to flush; a 3s timer is armed first as a hard fallback
 * so the process still exits if the flush never settles. Called from the
 * uncaughtException handler and from the bootstrap() startup-failure path.
 */
export function flushSentryAndExit(): void {
  // Armed before the flush starts: guarantees an exit even if close() stalls.
  const hardStop = setTimeout(() => process.exit(1), 3000);

  const exitNow = (): void => {
    clearTimeout(hardStop);
    process.exit(1);
  };

  // Whether the flush succeeds or fails, the outcome is the same: exit.
  void SentryLogger.close(2000).then(exitNow, exitNow);
}
Loading