diff --git a/blocks/loader.ts b/blocks/loader.ts index b44e36bf2..84a088f0f 100644 --- a/blocks/loader.ts +++ b/blocks/loader.ts @@ -10,10 +10,7 @@ import { HttpError } from "../engine/errors.ts"; import type { ResolverMiddlewareContext } from "../engine/middleware.ts"; import type { State } from "../mod.ts"; import { logger } from "../observability/otel/config.ts"; -import { - meter, - OTEL_ENABLE_EXTRA_METRICS, -} from "../observability/otel/metrics.ts"; +import { meter } from "../observability/otel/metrics.ts"; import { caches, ENABLE_LOADER_CACHE } from "../runtime/caches/mod.ts"; import { inFuture } from "../runtime/caches/utils.ts"; import type { DebugProperties } from "../utils/vary.ts"; @@ -184,6 +181,7 @@ const wrapLoader = ( }: LoaderModule, resolveChain: FieldResolver[], release: DecofileProvider, + loaderKey?: string, ) => { const [cacheMaxAge, mode] = typeof cache === "string" ? [MAX_AGE_S, cache] @@ -204,7 +202,9 @@ const wrapLoader = ( req: Request, ctx: FnContext, ): Promise> => { - const loader = ctx.resolverId || "unknown"; + const loader = (ctx.resolverId && ctx.resolverId !== "obj") + ? ctx.resolverId + : (loaderKey ?? ctx.resolverId ?? "unknown"); const start = performance.now(); let status: "bypass" | "miss" | "stale" | "hit" | undefined; @@ -267,8 +267,9 @@ const wrapLoader = ( (await release?.revision() ?? undefined); if (!revisionID) { - logger.warn(`Could not get K_REVISION`); timing?.end(); + status = "bypass"; + stats.cache.add(1, { status, loader }); return await handler(props, req, ctx); } @@ -339,9 +340,7 @@ const wrapLoader = ( return await flights.do(request.url, staleWhileRevalidate); } finally { const dimension = { loader, status }; - if (OTEL_ENABLE_EXTRA_METRICS) { - stats.latency.record(performance.now() - start, dimension); - } + stats.latency.record(performance.now() - start, dimension); ctx.monitoring?.currentSpan?.setDesc(status); } }, @@ -356,7 +355,7 @@ const loaderBlock: Block = { wrapCaughtErrors, (props: TProps, ctx: HttpContext<{ global: any } & RequestState>) => applyProps( - wrapLoader(mod, ctx.resolveChain, ctx.context.state.release), + wrapLoader(mod, ctx.resolveChain, ctx.context.state.release, key), )( props, ctx, diff --git a/deps.ts b/deps.ts index 4ec50a83e..861359f70 100644 --- a/deps.ts +++ b/deps.ts @@ -59,7 +59,7 @@ export type { export * as weakcache from "npm:weak-lru-cache@1.0.0"; export type Handler = Deno.ServeHandler; -export { OTLPTraceExporter } from "npm:@opentelemetry/exporter-trace-otlp-proto@0.52.1"; +export { OTLPTraceExporter } from "npm:@opentelemetry/exporter-trace-otlp-http@0.52.1"; export { Resource } from "npm:@opentelemetry/resources@1.25.1"; export { BatchSpanProcessor, @@ -69,8 +69,11 @@ export { } from "npm:@opentelemetry/sdk-trace-base@1.25.1"; export type { + ReadableSpan, Sampler, SamplingResult, + SpanExporter, + SpanProcessor, } from "npm:@opentelemetry/sdk-trace-base@1.25.1"; export { NodeTracerProvider } from "npm:@opentelemetry/sdk-trace-node@1.25.1"; export { diff --git a/engine/manifest/manifest.ts b/engine/manifest/manifest.ts index 93c722c65..3a0bbe197 100644 --- a/engine/manifest/manifest.ts +++ b/engine/manifest/manifest.ts @@ -1,5 +1,7 @@ // deno-lint-ignore-file no-explicit-any import { parse } from "@std/flags"; +import { ValueType } from "../../deps.ts"; +import { meter } from "../../observability/otel/metrics.ts"; import { gray, green, red } from "@std/fmt/colors"; import { type AppManifest, @@ -284,8 +286,15 @@ export const fulfillContext = async < ctx.site = currentSite!; const provider = release ?? await getProvider(); const runtimePromise = deferred>(); + const startedAt = ctx.instance.startedAt; ctx.runtime = runtimePromise.finally(() => { - ctx.instance.readyAt = new Date(); + const readyAt = new Date(); + ctx.instance.readyAt = readyAt; + meter.createHistogram("instance_startup_duration_ms", { + description: "Time from instance start to first request readiness.", + unit: "ms", + valueType: ValueType.DOUBLE, + }).record(readyAt.getTime() - startedAt.getTime()); }); ctx.release = provider; diff --git a/observability/otel/config.ts b/observability/otel/config.ts index 161921563..0629f1912 100644 --- a/observability/otel/config.ts +++ b/observability/otel/config.ts @@ -1,68 +1,36 @@ import * as log from "@std/log"; import { Logger } from "@std/log/logger"; -import { Context, context } from "../../deco.ts"; -import denoJSON from "../../deno.json" with { type: "json" }; import { BatchSpanProcessor, + diag, + DiagConsoleLogger, + DiagLogLevel, FetchInstrumentation, NodeTracerProvider, opentelemetry, OTLPTraceExporter, ParentBasedSampler, registerInstrumentations, - Resource, - SemanticResourceAttributes, } from "../../deps.ts"; -import { DenoRuntimeInstrumentation } from "./instrumentation/deno-runtime.ts"; + +if (Deno.env.has("OTEL_DIAG")) { + diag.setLogger(new DiagConsoleLogger(), DiagLogLevel.DEBUG); +} +import "./instrumentation/deno-runtime.ts"; import { DebugSampler } from "./samplers/debug.ts"; import { type SamplingOptions, URLBasedSampler } from "./samplers/urlBased.ts"; - -import { ENV_SITE_NAME } from "../../engine/decofile/constants.ts"; -import { safeImportResolve } from "../../engine/importmap/builder.ts"; +import { FilteringSpanProcessor } from "./processors/filtering.ts"; +import { OTEL_IS_ENABLED, resource } from "./resource.ts"; import { OpenTelemetryHandler } from "./logger.ts"; -const tryGetVersionOf = (pkg: string) => { - try { - const [_, ver] = safeImportResolve(pkg).split("@"); - return ver.substring(0, ver.length - 1); - } catch { - return undefined; - } -}; -const apps_ver = tryGetVersionOf("apps/") ?? - tryGetVersionOf("deco-sites/std/") ?? "_"; - -export const resource = Resource.default().merge( - new Resource({ - [SemanticResourceAttributes.SERVICE_NAME]: Deno.env.get(ENV_SITE_NAME) ?? - "deco", - [SemanticResourceAttributes.SERVICE_VERSION]: - Context.active().deploymentId ?? - Deno.hostname(), - [SemanticResourceAttributes.SERVICE_INSTANCE_ID]: crypto.randomUUID(), - [SemanticResourceAttributes.CLOUD_PROVIDER]: context.platform, - "deco.runtime.version": denoJSON.version, - "deco.apps.version": apps_ver, - [SemanticResourceAttributes.CLOUD_REGION]: Deno.env.get("DENO_REGION") ?? - "unknown", - [SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: Deno.env.get( - "DECO_ENV_NAME", - ) - ? `env-${Deno.env.get("DECO_ENV_NAME")}` - : "production", - }), -); - const loggerName = "deco-logger"; -export const OTEL_IS_ENABLED: boolean = Deno.env.has( - "OTEL_EXPORTER_OTLP_ENDPOINT", -); export const logger: Logger = new Logger(loggerName, "INFO", { handlers: [ ...OTEL_IS_ENABLED ? [ new OpenTelemetryHandler("INFO", { resourceAttributes: resource.attributes, + detectResources: false, }), ] : [new log.ConsoleHandler("INFO")], @@ -82,6 +50,7 @@ registerInstrumentations({ // @ts-ignore: no idea why this is failing, but it should work new FetchInstrumentation( { + ignoreUrls: [/127\.0\.0\.1/, /localhost/], applyCustomAttributesOnSpan: ( span, _req, @@ -101,7 +70,6 @@ registerInstrumentations({ }, }, ), - new DenoRuntimeInstrumentation(), ], }); @@ -125,8 +93,9 @@ const parseSamplingOptions = (): SamplingOptions | undefined => { } }; +const samplingOptions = parseSamplingOptions(); const debugSampler = new DebugSampler( - new URLBasedSampler(parseSamplingOptions()), + new URLBasedSampler(samplingOptions), ); const provider = new NodeTracerProvider({ resource: resource, @@ -137,17 +106,36 @@ const provider = new NodeTracerProvider({ ), }); -if (OTEL_IS_ENABLED) { +// Deno 2.2+ has built-in OTel support via OTEL_DENO=true. +// If enabled, it sets its own global TracerProvider and instruments fetch/console, +// which would conflict with our manual setup (double spans, double logs). +// Skip provider.register() when Deno's native OTel is active. +const DENO_OTEL_ACTIVE = Deno.env.get("OTEL_DENO") === "true"; + + +if (OTEL_IS_ENABLED && !DENO_OTEL_ACTIVE) { const traceExporter = new OTLPTraceExporter(); - // @ts-ignore: no idea why this is failing, but it should work - provider.addSpanProcessor(new BatchSpanProcessor(traceExporter)); + provider.addSpanProcessor( + new FilteringSpanProcessor( + // @ts-ignore: no idea why this is failing, but it should work + new BatchSpanProcessor(traceExporter), + samplingOptions, + ), + ); provider.register(); + + addEventListener("unload", () => { + provider.shutdown().catch(() => {}); + }); } -export const tracer = opentelemetry.trace.getTracer( - "deco-tracer", -); +// Use provider.getTracer directly (not via global API) to ensure spans +// always go through our FilteringSpanProcessor, even if another TracerProvider +// overrides the global after our provider.register() call. +export const tracer = OTEL_IS_ENABLED && !DENO_OTEL_ACTIVE + ? provider.getTracer("deco-tracer") + : opentelemetry.trace.getTracer("deco-tracer"); export const tracerIsRecording = () => opentelemetry.trace.getActiveSpan()?.isRecording() ?? false; diff --git a/observability/otel/instrumentation/deno-runtime.ts b/observability/otel/instrumentation/deno-runtime.ts index 3d81e423d..3d720566b 100644 --- a/observability/otel/instrumentation/deno-runtime.ts +++ b/observability/otel/instrumentation/deno-runtime.ts @@ -1,69 +1,54 @@ -/** - * Heavily inspired from unlicensed code: https://github.com/cloudydeno/deno-observability/blob/main/instrumentation/deno-runtime.ts - */ import { type Attributes, - InstrumentationBase, - type InstrumentationConfig, - type ObservableCounter, type ObservableGauge, type ObservableResult, type ObservableUpDownCounter, ValueType, } from "../../../deps.ts"; - -export class DenoRuntimeInstrumentation extends InstrumentationBase { - readonly component: string = "deno-runtime"; - moduleName = this.component; - - constructor(_config?: InstrumentationConfig) { - super("deno-runtime", "0.1.0", { enabled: false }); +import { meter } from "../metrics.ts"; + +const memoryUsage: ObservableGauge = meter + .createObservableGauge("deno.memory_usage", { + unit: "By", + valueType: ValueType.DOUBLE, + description: "Deno process memory usage in bytes.", + }); + +const openResources: ObservableUpDownCounter = meter + .createObservableUpDownCounter("deno.open_resources", { + valueType: ValueType.DOUBLE, + description: "Number of open resources of a particular type.", + }); + +const gatherMemoryUsage = (x: ObservableResult) => { + const usage = Deno.memoryUsage(); + x.observe(usage.rss, { "deno.memory.type": "rss" }); + x.observe(usage.heapTotal, { "deno.memory.type": "heap_total" }); + x.observe(usage.heapUsed, { "deno.memory.type": "heap_used" }); + x.observe(usage.external, { "deno.memory.type": "external" }); +}; + +const gatherOpenResources = (x: ObservableResult) => { + try { + // deno-lint-ignore no-explicit-any + const resources = (Deno as any).resources?.() as + | Record + | undefined; + if (!resources) return; + const counts: Record = {}; + for (const type of Object.values(resources)) { + counts[type] = (counts[type] ?? 0) + 1; + } + for (const [type, count] of Object.entries(counts)) { + x.observe(count, { "deno.resource.type": type }); + } + } catch { + // Deno.resources() may not be available in all environments } +}; - metrics!: { - openResources: ObservableUpDownCounter; - memoryUsage: ObservableGauge; - dispatchedCtr: ObservableCounter; - inflightCtr: ObservableUpDownCounter; - }; - - protected init() {} - - private gatherMemoryUsage = (x: ObservableResult) => { - const usage = Deno.memoryUsage(); - x.observe(usage.rss, { "deno.memory.type": "rss" }); - x.observe(usage.heapTotal, { "deno.memory.type": "heap_total" }); - x.observe(usage.heapUsed, { "deno.memory.type": "heap_used" }); - x.observe(usage.external, { "deno.memory.type": "external" }); - }; +memoryUsage.addCallback(gatherMemoryUsage); +openResources.addCallback(gatherOpenResources); - override enable() { - this.metrics ??= { - openResources: this.meter - .createObservableUpDownCounter("deno.open_resources", { - valueType: ValueType.DOUBLE, - description: "Number of open resources of a particular type.", - }), - memoryUsage: this.meter - .createObservableGauge("deno.memory_usage", { - valueType: ValueType.DOUBLE, - }), - dispatchedCtr: this.meter - .createObservableCounter("deno.ops_dispatched", { - valueType: ValueType.DOUBLE, - description: "Total number of Deno op invocations.", - }), - inflightCtr: this.meter - .createObservableUpDownCounter("deno.ops_inflight", { - valueType: ValueType.DOUBLE, - description: "Number of currently-inflight Deno ops.", - }), - }; - - this.metrics.memoryUsage.addCallback(this.gatherMemoryUsage); - } - - override disable() { - this.metrics.memoryUsage.removeCallback(this.gatherMemoryUsage); - } -} +// Kept for backward compatibility — no longer needed but exported to avoid import errors +export class DenoRuntimeInstrumentation {} diff --git a/observability/otel/metrics.ts b/observability/otel/metrics.ts index b5ea77d7d..2c043d718 100644 --- a/observability/otel/metrics.ts +++ b/observability/otel/metrics.ts @@ -5,13 +5,12 @@ import { PeriodicExportingMetricReader, View, } from "../../deps.ts"; -import { OTEL_IS_ENABLED, resource } from "./config.ts"; +import { OTEL_IS_ENABLED, resource } from "./resource.ts"; export const OTEL_ENABLE_EXTRA_METRICS: boolean = Deno.env.has( "OTEL_ENABLE_EXTRA_METRICS", ); -// 2 minutes. We don't need frequent updates here. export const OTEL_EXPORT_INTERVAL: number = parseInt( Deno.env.get("OTEL_EXPORT_INTERVAL") ?? "60000", 10, @@ -33,7 +32,7 @@ const msBoundaries = [10, 100, 500, 1000, 5000, 10000, 15000]; const sBoundaries = [1, 5, 10, 50]; type IMeter = ReturnType; -const meterProvider: MeterProvider = new MeterProvider({ +export const meterProvider: MeterProvider = new MeterProvider({ resource, views: [ new View({ @@ -60,6 +59,10 @@ if (OTEL_IS_ENABLED) { exportIntervalMillis: OTEL_EXPORT_INTERVAL, }), ); + + addEventListener("unload", () => { + meterProvider.shutdown().catch(() => {}); + }); } export const meter: IMeter = meterProvider.getMeter("deco"); diff --git a/observability/otel/processors/filtering.ts b/observability/otel/processors/filtering.ts new file mode 100644 index 000000000..c498a7e51 --- /dev/null +++ b/observability/otel/processors/filtering.ts @@ -0,0 +1,119 @@ +import { SpanStatusCode } from "../../../deps.ts"; +import type { ReadableSpan, SpanProcessor } from "../../../deps.ts"; +import type { Context } from "../../../deps.ts"; +import type { SamplingOptions } from "../samplers/urlBased.ts"; + +// Span names that are framework internals with no actionable signal. +const ALWAYS_DROP_NAMES = new Set([ + "cache-match", + "website/functions/requestToParam.ts", + "website/handlers/router.ts", + "website/handlers/fresh.ts", + "website/pages/Page.tsx", + "htmx/sections/htmx.tsx", +]); + +// Drop internal loopback fetch spans (FetchInstrumentation for /deco/render calls). +// These are server-to-self requests that create high-cardinality span names. +const isLoopbackFetch = (name: string): boolean => + /^(GET|POST|PUT|DELETE|PATCH)\s+https?:\/\/(127\.0\.0\.1|localhost)/.test( + name, + ); + +const isRoot = (span: ReadableSpan): boolean => !span.parentSpanId; +const hasError = (span: ReadableSpan): boolean => + span.status.code === SpanStatusCode.ERROR; +const durationMs = (span: ReadableSpan): number => + span.duration[0] * 1000 + span.duration[1] / 1_000_000; +const isSlow = (span: ReadableSpan, thresholdMs: number): boolean => + durationMs(span) >= thresholdMs; + +/** + * Per-span filtering processor. + * + * Rules (in order, applied to every span individually): + * 1. Error spans → always keep + * 2. Root span + rate limit token available → keep + * 3. Root span + no token → drop (prevents export explosion under traffic spikes) + * 4. Known framework noise names → drop + * 5. Slow spans (> slowThresholdMs) → keep + * 6. Fast spans (< 5ms) → drop + * 7. Everything else → keep + * + * Zero memory overhead — no buffering, O(1) per span. + */ +export class FilteringSpanProcessor implements SpanProcessor { + private readonly slowThresholdMs: number; + private readonly maxExportPerSecond: number; + private tokens: number; + private lastRefill: number = Date.now(); + + constructor( + private readonly inner: SpanProcessor, + options: Pick< + SamplingOptions, + "slowThresholdMs" | "maxExportPerSecond" + > = {}, + ) { + this.slowThresholdMs = options.slowThresholdMs ?? 2000; + this.maxExportPerSecond = options.maxExportPerSecond ?? Infinity; + this.tokens = this.maxExportPerSecond; + } + + // @ts-ignore: onStart signature varies across SDK versions + onStart(span: ReadableSpan, ctx: Context): void { + // @ts-ignore + this.inner.onStart(span, ctx); + } + + onEnd(span: ReadableSpan): void { + if (this.shouldKeep(span)) { + this.inner.onEnd(span); + } + } + + private shouldKeep(span: ReadableSpan): boolean { + // Always keep error spans — critical signal regardless of anything else + if (hasError(span)) return true; + + // Root spans: drop known noise even if appearing as root (context propagation issues) + if (isRoot(span)) { + if (ALWAYS_DROP_NAMES.has(span.name) || isLoopbackFetch(span.name)) { + return false; + } + if (isFinite(this.maxExportPerSecond) && !this.consumeToken()) { + return false; + } + return true; + } + + // Child spans: filter noise + if (ALWAYS_DROP_NAMES.has(span.name)) return false; + if (isSlow(span, this.slowThresholdMs)) return true; + if (durationMs(span) < 5) return false; + return true; + } + + private consumeToken(): boolean { + const now = Date.now(); + const elapsed = (now - this.lastRefill) / 1000; + this.tokens = Math.min( + this.maxExportPerSecond, + this.tokens + elapsed * this.maxExportPerSecond, + ); + this.lastRefill = now; + if (this.tokens >= 1) { + this.tokens--; + return true; + } + return false; + } + + shutdown(): Promise { + return this.inner.shutdown(); + } + + forceFlush(): Promise { + return this.inner.forceFlush(); + } +} diff --git a/observability/otel/resource.ts b/observability/otel/resource.ts new file mode 100644 index 000000000..e9f9a69d1 --- /dev/null +++ b/observability/otel/resource.ts @@ -0,0 +1,41 @@ +import { Context, context } from "../../deco.ts"; +import denoJSON from "../../deno.json" with { type: "json" }; +import { Resource, SemanticResourceAttributes } from "../../deps.ts"; +import { ENV_SITE_NAME } from "../../engine/decofile/constants.ts"; +import { safeImportResolve } from "../../engine/importmap/builder.ts"; + +const tryGetVersionOf = (pkg: string) => { + try { + const [_, ver] = safeImportResolve(pkg).split("@"); + return ver.substring(0, ver.length - 1); + } catch { + return undefined; + } +}; +const apps_ver = tryGetVersionOf("apps/") ?? + tryGetVersionOf("deco-sites/std/") ?? "_"; + +export const OTEL_IS_ENABLED: boolean = Deno.env.has( + "OTEL_EXPORTER_OTLP_ENDPOINT", +); + +export const resource = Resource.default().merge( + new Resource({ + [SemanticResourceAttributes.SERVICE_NAME]: Deno.env.get(ENV_SITE_NAME) ?? + "deco", + [SemanticResourceAttributes.SERVICE_VERSION]: + Context.active().deploymentId ?? + Deno.hostname(), + [SemanticResourceAttributes.SERVICE_INSTANCE_ID]: crypto.randomUUID(), + [SemanticResourceAttributes.CLOUD_PROVIDER]: context.platform, + "deco.runtime.version": denoJSON.version, + "deco.apps.version": apps_ver, + [SemanticResourceAttributes.CLOUD_REGION]: Deno.env.get("DENO_REGION") ?? + "unknown", + [SemanticResourceAttributes.DEPLOYMENT_ENVIRONMENT]: Deno.env.get( + "DECO_ENV_NAME", + ) + ? `env-${Deno.env.get("DECO_ENV_NAME")}` + : "production", + }), +); diff --git a/observability/otel/samplers/debug.ts b/observability/otel/samplers/debug.ts index 52446a69b..68cf46de4 100644 --- a/observability/otel/samplers/debug.ts +++ b/observability/otel/samplers/debug.ts @@ -23,8 +23,14 @@ export class DebugSampler implements Sampler { const req = context.getValue(REQUEST_CONTEXT_KEY) as Request; const state = context.getValue(STATE_CONTEXT_KEY) as DecoState; + // Only force sampling when explicitly requested: + // 1. via x-trace-debug-id header (programmatic debug) + // 2. via ?__d= query param or debug cookie (debugEnabled = true) + // Normal requests always have a correlationId (UUID) — do NOT use it here, + // otherwise every request would be forced to RECORD_AND_SAMPLED, + // nullifying the defaultRatio in URLBasedSampler. const correlationId = req?.headers?.get?.("x-trace-debug-id") ?? - state?.correlationId; + (state?.debugEnabled ? state?.correlationId : undefined); if (correlationId) { return { decision: SamplingDecision.RECORD_AND_SAMPLED, diff --git a/observability/otel/samplers/urlBased.ts b/observability/otel/samplers/urlBased.ts index 4dc2e2162..1320f7c41 100644 --- a/observability/otel/samplers/urlBased.ts +++ b/observability/otel/samplers/urlBased.ts @@ -14,6 +14,11 @@ export interface URLBasedSampling { export interface SamplingOptions { defaultRatio?: number; byURLPattern?: URLBasedSampling[]; + // Tail sampling options — applied after the trace completes + alwaysExportErrors?: boolean; // default: true + slowThresholdMs?: number; // always export traces slower than this (default: 2000) + exportRatio?: number; // fraction of non-error/non-slow traces to export (default: 1) + maxExportPerSecond?: number; // hard cap on normal traces/s — errors/slow bypass this } interface CompiledURLBasedSampling { @@ -21,6 +26,30 @@ interface CompiledURLBasedSampling { matches(urlSegments: string[]): boolean; } +// Paths that are always excluded regardless of sampling config. +// These are noise: assets, health checks, and framework internals. +const ALWAYS_EXCLUDE = [ + /^\/_frsh\//, + /^\/deco\/_liveness$/, + /^\/favicon\./, + /\.(js|css|map|woff2?|ttf|eot|ico|png|jpg|jpeg|svg|webp|gif)$/, +]; + +const isExcluded = (pathname: string): boolean => + ALWAYS_EXCLUDE.some((re) => re.test(pathname)); + +// Bot detection — zero spans for bot traffic. +// Bots pollute cache with unique keys and inflate trace volume without signal. +const isBot = (req: Request): boolean => { + // Cloudflare verified bot header — most reliable + if (req.headers.get("cf-verified-bot") === "true") return true; + const ua = req.headers.get("user-agent") ?? ""; + if (!ua) return false; + // Known monitoring robots and crawlers + return /bot|crawl|spider|slurp|monitoring|uptimerobot|pingdom|datadog|newrelic|googlebot|bingbot|yandex/i + .test(ua); +}; + export class URLBasedSampler implements Sampler { protected compiledOptions: CompiledURLBasedSampling[]; constructor(protected options?: SamplingOptions) { @@ -44,9 +73,17 @@ export class URLBasedSampler implements Sampler { context: Context, ): SamplingResult { const req = context.getValue(REQUEST_CONTEXT_KEY) as Request; - const pathnameSegments = req?.url - ? new URL(req.url).pathname.split("/") - : undefined; + const url = req?.url ? new URL(req.url) : undefined; + + if (url && isExcluded(url.pathname)) { + return { decision: SamplingDecision.NOT_RECORD }; + } + + if (req && isBot(req)) { + return { decision: SamplingDecision.NOT_RECORD }; + } + + const pathnameSegments = url?.pathname.split("/"); const defaultRatio = this.options?.defaultRatio ?? 0; const ratio = pathnameSegments ? this.compiledOptions.find((opt) => opt.matches(pathnameSegments)) diff --git a/runtime/caches/common.ts b/runtime/caches/common.ts index 915037f8d..d4e46985c 100644 --- a/runtime/caches/common.ts +++ b/runtime/caches/common.ts @@ -1,4 +1,4 @@ -import { type Exception, ValueType } from "../../deps.ts"; +import { context, type Exception, ValueType } from "../../deps.ts"; import { tracer } from "../../observability/otel/config.ts"; import { meter } from "../../observability/otel/metrics.ts"; import { inFuture } from "./utils.ts"; @@ -39,7 +39,7 @@ export const withInstrumentation = ( match: async (req, opts) => { const span = tracer.startSpan("cache-match", { attributes: { engine }, - }); + }, context.active()); try { const isMatch = await cacheImpl.match(req, opts); //there is an edge case where there is no expires header, but technically our loader always sets it diff --git a/runtime/caches/lrucache.ts b/runtime/caches/lrucache.ts index f564cb84c..d7d305cbd 100644 --- a/runtime/caches/lrucache.ts +++ b/runtime/caches/lrucache.ts @@ -1,4 +1,7 @@ import { LRUCache } from "npm:lru-cache@10.2.0"; +import { type ObservableResult, ValueType } from "../../deps.ts"; +import { logger } from "../../observability/otel/config.ts"; +import { meter } from "../../observability/otel/metrics.ts"; import { assertCanBeCached, assertNoOptions, @@ -24,14 +27,82 @@ const STALE_TTL_PERIOD = parseInt( Deno.env.get("STALE_TTL_PERIOD") ?? "30000", ); -const cacheOptions = (cache: Cache) => ( +const lruEvictions = meter.createCounter("lru.evictions_total", { + unit: "1", + valueType: ValueType.DOUBLE, + description: "Number of LRU cache evictions.", +}); + +const lruSizeGauge = meter.createObservableGauge("lru.size_bytes", { + unit: "By", + valueType: ValueType.DOUBLE, + description: "Current LRU cache size in bytes.", +}); + +const lruItemsGauge = meter.createObservableGauge("lru.item_count", { + unit: "1", + valueType: ValueType.DOUBLE, + description: "Current number of items in the LRU cache.", +}); + +const lruFillRatioGauge = meter.createObservableGauge("lru.fill_ratio", { + valueType: ValueType.DOUBLE, + description: + "LRU cache fill ratio (0-1). Above 0.9 eviction pressure is high.", +}); + +const lruHits = meter.createCounter("lru.hits_total", { + unit: "1", + valueType: ValueType.DOUBLE, + description: "Number of LRU cache hits.", +}); + +const lruMisses = meter.createCounter("lru.misses_total", { + unit: "1", + valueType: ValueType.DOUBLE, + description: "Number of LRU cache misses.", +}); + +// Track active LRU instances per cache name for observable gauges +const lruInstances = new Map>(); + +lruSizeGauge.addCallback((result: ObservableResult) => { + for (const [cacheName, cache] of lruInstances) { + result.observe(cache.calculatedSize ?? 0, { cache: cacheName }); + } +}); + +lruItemsGauge.addCallback((result: ObservableResult) => { + for (const [cacheName, cache] of lruInstances) { + result.observe(cache.size, { cache: cacheName }); + } +}); + +lruFillRatioGauge.addCallback((result: ObservableResult) => { + for (const [cacheName, cache] of lruInstances) { + const ratio = CACHE_MAX_SIZE > 0 + ? (cache.calculatedSize ?? 0) / CACHE_MAX_SIZE + : 0; + result.observe(ratio, { cache: cacheName }); + } +}); + +const cacheOptions = (cache: Cache, cacheName: string) => ( { max: CACHE_MAX_ITEMS, maxSize: CACHE_MAX_SIZE, ttlAutopurge: CACHE_TTL_AUTOPURGE, ttlResolution: CACHE_TTL_RESOLUTION, - dispose: async (_value: boolean, key: string) => { - await cache.delete(key); + dispose: (_value: boolean, key: string, reason: string) => { + if (reason === "evict") { + lruEvictions.add(1, { cache: cacheName }); + } + cache.delete(key).catch((err) => { + logger.warn( + `lru dispose failed to delete key from backing cache: ${err}`, + { cache: cacheName }, + ); + }); }, } ); @@ -39,8 +110,9 @@ const cacheOptions = (cache: Cache) => ( function createLruCacheStorage(cacheStorageInner: CacheStorage): CacheStorage { const caches = createBaseCacheStorage( cacheStorageInner, - (_cacheName, cacheInner, requestURLSHA1) => { - const fileCache = new LRUCache(cacheOptions(cacheInner)); + (cacheName, cacheInner, requestURLSHA1) => { + const fileCache = new LRUCache(cacheOptions(cacheInner, cacheName)); + lruInstances.set(cacheName, fileCache); return Promise.resolve({ ...baseCache, delete: async ( @@ -58,9 +130,11 @@ function createLruCacheStorage(cacheStorageInner: CacheStorage): CacheStorage { assertNoOptions(options); const cacheKey = await requestURLSHA1(request); if (fileCache.has(cacheKey)) { + lruHits.add(1, { cache: cacheName }); const result = cacheInner.match(cacheKey); return result; } + lruMisses.add(1, { cache: cacheName }); return undefined; }, put: async ( diff --git a/runtime/caches/redis.ts b/runtime/caches/redis.ts index ffb816e0c..9fefaab93 100644 --- a/runtime/caches/redis.ts +++ b/runtime/caches/redis.ts @@ -1,3 +1,6 @@ +import { logger } from "../../observability/otel/config.ts"; +import { ValueType } from "../../deps.ts"; +import { meter } from "../../observability/otel/metrics.ts"; import { assertCanBeCached, assertNoOptions, @@ -7,6 +10,29 @@ import { } from "./utils.ts"; import { Redis } from "npm:ioredis@^5.10.1"; +const redisErrors = meter.createCounter("redis.errors_total", { + unit: "1", + valueType: ValueType.DOUBLE, + description: "Number of Redis connection or command errors.", +}); + +const redisReconnections = meter.createCounter("redis.reconnections_total", { + unit: "1", + valueType: ValueType.DOUBLE, + description: "Number of Redis reconnection attempts.", +}); + +const redisConnected = meter.createObservableGauge("redis.connected", { + valueType: ValueType.DOUBLE, + description: "Number of Redis clients currently connected.", +}); + +const connectedClientSet = new Set(); +// deno-lint-ignore no-explicit-any +redisConnected.addCallback((result: any) => { + result.observe(connectedClientSet.size); +}); + const CONNECTION_TIMEOUT = 500; const COMMAND_TIMEOUT = 500; const TTL = parseInt(Deno.env.get("LOADER_CACHE_REDIS_TTL_SECONDS") || "180"); // 3 minutes @@ -105,7 +131,11 @@ export function create(redis: RedisConnection | null, namespace: string) { COMMAND_TIMEOUT, ) ) - .catch(() => 0); + .catch((err) => { + logger.warn(`redis cache delete error: ${err}`); + redisErrors.add(1); + return 0; + }); return result > 0; }, @@ -129,7 +159,11 @@ export function create(redis: RedisConnection | null, namespace: string) { return deserialize(result); }) - .catch(() => undefined); + .catch((err) => { + logger.warn(`redis cache match error: ${err}`); + redisErrors.add(1); + return undefined; + }); return result; }, @@ -153,7 +187,10 @@ export function create(redis: RedisConnection | null, namespace: string) { COMMAND_TIMEOUT, ) ) - .catch(() => {}); + .catch((err) => { + logger.warn(`redis cache put error: ${err}`); + redisErrors.add(1); + }); }, }; } @@ -164,7 +201,17 @@ export const caches: CacheStorage = { if (isAvailable) { redis = createRedisClient(); - redis.on("error", () => {}); + redis.on("error", () => { + connectedClientSet.delete(redis!); + redisErrors.add(1); + }); + redis.on("connect", () => { + connectedClientSet.add(redis!); + }); + redis.on("reconnecting", () => { + connectedClientSet.delete(redis!); + redisReconnections.add(1); + }); await wait(CONNECTION_TIMEOUT); }