Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions examples/node22-bookworm-computer-use-image.ts
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,16 @@ const CUA_DRIVER_INSTALL_COMMAND = [
'"$(curl -fsSL https://raw.githubusercontent.com/trycua/cua/main/libs/cua-driver/scripts/install.sh)"',
].join(' ');

/**
 * Default sandbox memory allocation, in GiB, used as `resources.memory` when this example creates
 * its sandbox.
 *
 * Why 4: Daytona's platform default (no `resources` override) is a 1GiB cgroup limit -- confirmed
 * by inspecting `/sys/fs/cgroup/memory.max` inside a real sandbox -- and a full `npm install` of
 * AutoBrin-flue's dependency tree reliably gets OOM-killed under that limit
 * (see superagent-ai/benchpress#40).
 *
 * Empirical check:
 * - 1GiB default: 2/2 sandboxes were OOM-killed within ~35s (`bash`'s own "Killed" job-control
 *   message, the exact signature from #40).
 * - 4GiB: 3/3 sandboxes completed the same `npm install` successfully in ~35-37s.
 */
const DEFAULT_SANDBOX_MEMORY_GIB = 4;

export function buildNode22BookwormComputerUseImage(): Image {
return Image.base('node:22-bookworm')
.runCommands(
Expand Down Expand Up @@ -95,6 +105,7 @@ async function main(): Promise<void> {
kind: 'image',
image: buildNode22BookwormComputerUseImage(),
autoStopInterval: AUTO_STOP_SAFETY_NET_MINUTES,
resources: { memory: DEFAULT_SANDBOX_MEMORY_GIB },
});

try {
Expand Down
17 changes: 16 additions & 1 deletion src/contenders/autobrin.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { tmpdir } from 'node:os';
import path from 'node:path';
import { pathToFileURL } from 'node:url';
import { spawn } from 'node:child_process';
import type { Image, Sandbox } from '@daytona/sdk';
import type { Image, Resources, Sandbox } from '@daytona/sdk';
import type { AgentRunner, BenchmarkTask, ContenderClaim, ConfirmedFinding, NormalizedResult, ProposedPatch, RunContext, RunControls, TargetHandle } from './types.js';
import { webappTargetMetadata } from './types.js';
import { ensureAutobrinCheckout } from '../lib/checkout.js';
Expand Down Expand Up @@ -38,6 +38,15 @@ export type AutobrinContenderConfig = {
visionModel?: string;
/** Keep the sandbox running after the engagement instead of deleting it (debugging only). `transport: 'daytona'` only. */
keepSandbox?: boolean;
/**
* Sandbox CPU/memory/disk allocation. `transport: 'daytona'` only, and only takes effect when
* creating from `image` -- Daytona snapshots fix their resources at snapshot-build time
* (`CreateSnapshotParams.resources`), so there is nothing to override at sandbox-creation time
* for `snapshot`. Omitting this uses the Daytona platform default, which is not large enough for
* a full `npm install` of AutoBrin-flue's dependency tree under load and can get the process
* OOM-killed -- see superagent-ai/benchpress#40.
*/
resources?: Pick<Resources, 'cpu' | 'memory' | 'disk'>;
};

export type AutobrinRunOptions = {
Expand Down Expand Up @@ -370,6 +379,11 @@ export function createAutobrinRunner(options: AutobrinRunOptions): AgentRunner {
if (!options.config.image && !options.config.snapshot) {
throw new Error(`autobrin contender "${contenderId}": transport "daytona" requires "image" or "snapshot"`);
}
if (options.config.resources && options.config.snapshot) {
throw new Error(
`autobrin contender "${contenderId}": "resources" is only supported when creating a sandbox from "image" (Daytona snapshots fix resources at build time)`,
);
}
}

return {
Expand Down Expand Up @@ -499,6 +513,7 @@ async function runViaDaytona(input: RunInput): Promise<NormalizedResult> {
ref: config.ref,
image: config.image,
snapshot: config.snapshot,
resources: config.resources,
visionModel: config.visionModel,
payload,
keepSandbox: config.keepSandbox,
Expand Down
10 changes: 9 additions & 1 deletion src/daytona/launcher.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import type { Image, Sandbox } from '@daytona/sdk';
import type { Image, Resources, Sandbox } from '@daytona/sdk';
import { ensureComputerUseAssets, ensureComputerUseStarted } from './assets.js';
import { bootstrapAutobrinFlue, prepareRepoTarget, prepareWebappTarget } from './bootstrap.js';
import {
Expand All @@ -18,6 +18,13 @@ export type DaytonaRunOptions = {
/** String image ref (registry tag) or a declarative `Image` built with `Image.base(...)`. */
image?: string | Image;
snapshot?: string;
/**
* CPU/memory/disk allocation for the sandbox. Only takes effect when creating from `image`:
* `createSandbox()`'s snapshot params (`CreateSandboxFromSnapshotParams`) have no `resources`
* field, since Daytona snapshots fix resources at snapshot-build time. See
* `AutobrinContenderConfig.resources` in `src/contenders/autobrin.ts` (superagent-ai/benchpress#40).
*/
resources?: Resources;
visionModel?: string;
payload: EngagementPayload | unknown;
keepSandbox?: boolean;
Expand Down Expand Up @@ -73,6 +80,7 @@ export async function runDaytonaEngagement(options: DaytonaRunOptions): Promise<
image: options.image!,
envVars: sandboxEnv,
autoStopInterval: AUTO_STOP_SAFETY_NET_MINUTES,
resources: options.resources,
},
);

Expand Down
20 changes: 20 additions & 0 deletions tests/autobrin-contender.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,26 @@ describe('createAutobrinRunner transport validation', () => {
const runner = createAutobrinRunner({ config: { id: 'x', type: 'autobrin', transport: 'daytona', snapshot: 'daytona-large' } });
expect(runner.id).toBe('x');
});

// Regression (superagent-ai/benchpress#40): Daytona sandboxes had no way to request more than
// the platform default resources, so a full `npm install` of AutoBrin-flue's dependency tree
// could get OOM-killed. `resources` only has an effect when creating from `image` -- Daytona
// snapshots fix their resources at snapshot-build time, so `createSandbox()`'s snapshot params
// have no `resources` field to set (see `CreateSandboxFromSnapshotParams` in `@daytona/sdk`).
it('accepts transport "daytona" with "resources" combined with "image"', () => {
  // Image-based sandboxes may carry a resources override, so constructing the runner must not
  // throw; reading `id` back confirms a runner was actually returned.
  const { id } = createAutobrinRunner({
    config: {
      id: 'x',
      type: 'autobrin',
      transport: 'daytona',
      image: 'some-image',
      resources: { memory: 4 },
    },
  });

  expect(id).toBe('x');
});

it('rejects "resources" combined with "snapshot" (snapshots fix resources at build time)', () => {
  // Deferred construction: `toThrow` needs a callable so it can observe the validation error.
  const buildRunner = () =>
    createAutobrinRunner({
      config: {
        id: 'x',
        type: 'autobrin',
        transport: 'daytona',
        snapshot: 'daytona-large',
        resources: { memory: 4 },
      },
    });

  expect(buildRunner).toThrow(/"resources" is only supported when creating a sandbox from "image"/);
});
});

describe('materializeTarget', () => {
Expand Down
62 changes: 62 additions & 0 deletions tests/autobrin-daytona-sequencing.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -162,3 +162,65 @@ describe('autobrin daytona transport: webapp modality (superagent-ai/benchpress#
expect(launcherMocks.runDaytonaEngagement).not.toHaveBeenCalled();
});
});

// Regression (superagent-ai/benchpress#40): AutobrinContenderConfig had no way to request more
// than the Daytona platform default sandbox resources, so a full `npm install` of AutoBrin-flue's
// dependency tree could get OOM-killed. runDaytonaEngagement's own resources handling (only applied
// for the "image" creation branch) is covered in tests/daytona-launcher.test.ts; these tests only
// cover that runViaDaytona actually forwards config.resources into the options it builds.
/**
 * Checks that the daytona transport forwards `config.resources` from the contender config into
 * the options object handed to `runDaytonaEngagement` (superagent-ai/benchpress#40).
 *
 * The checkout and launcher mocks are reset to canned successful results before each run, and
 * every test inspects the first argument of the first recorded `runDaytonaEngagement` call.
 */
describe('autobrin daytona transport: resources config threading (superagent-ai/benchpress#40)', () => {
  const baseTask = { id: 't1', benchmarkId: 'repo-cve-smoke' };
  const baseTarget = {
    benchmarkId: 'repo-cve-smoke',
    taskId: 't1',
    modality: 'repo' as const,
    repo: 'owner/repo',
    sha: 'abc123',
  };
  const baseControls = { model: 'kimi-azure/kimi-k2.6' };

  // Temp roots created by makeContext(); drained and removed after every test.
  const tmpDirs: string[] = [];

  afterEach(() => {
    vi.restoreAllMocks();
    for (const dir of tmpDirs.splice(0)) rmSync(dir, { recursive: true, force: true });
  });

  /** Builds a run context whose result/engagement directories live under a fresh temp root. */
  function makeContext() {
    const root = mkdtempSync(path.join(tmpdir(), 'benchpress-daytona-resources-'));
    tmpDirs.push(root);
    return { runId: 'run1', resultsDir: path.join(root, 'results'), engagementsDir: path.join(root, 'engagements') };
  }

  /** Resets the checkout and launcher mocks and makes both resolve with a successful result. */
  function mockSuccessfulEngagement() {
    checkoutMocks.ensureAutobrinCheckout.mockReset().mockResolvedValue({ root: '/cache/x', ref: 'staging', commitSha: 'deadbeef' });
    launcherMocks.runDaytonaEngagement.mockReset().mockResolvedValue({
      sandboxId: 'sandbox-1',
      engagement: { exitCode: 0, streamLogPath: 'x', resultPath: 'y', resultJson: {} },
      computerUse: {},
      keptSandbox: false,
    });
  }

  /**
   * Returns the options object of the first recorded launcher call.
   *
   * Asserts that the launcher was called before reading `mock.calls`, so a run that never reaches
   * the launcher fails with a readable expectation instead of a TypeError from dereferencing
   * `undefined.resources`.
   */
  function firstLauncherOptions(): { resources?: unknown } {
    expect(launcherMocks.runDaytonaEngagement).toHaveBeenCalled();
    return launcherMocks.runDaytonaEngagement.mock.calls[0]?.[0] as { resources?: unknown };
  }

  it('forwards config.resources into the options passed to runDaytonaEngagement', async () => {
    mockSuccessfulEngagement();
    const runner = createAutobrinRunner({
      config: { id: 'x', type: 'autobrin', transport: 'daytona', image: 'test-image', resources: { cpu: 2, memory: 4, disk: 20 } },
    });

    await runner.run({ task: baseTask, target: baseTarget, controls: baseControls, context: makeContext() });

    expect(firstLauncherOptions().resources).toEqual({ cpu: 2, memory: 4, disk: 20 });
  });

  it('omits resources entirely when not configured (unchanged default behavior)', async () => {
    mockSuccessfulEngagement();
    const runner = createAutobrinRunner({ config: { id: 'x', type: 'autobrin', transport: 'daytona', image: 'test-image' } });

    await runner.run({ task: baseTask, target: baseTarget, controls: baseControls, context: makeContext() });

    expect(firstLauncherOptions().resources).toBeUndefined();
  });
});
51 changes: 51 additions & 0 deletions tests/daytona-launcher.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -221,3 +221,54 @@ describe('runDaytonaEngagement computer-use start gating (superagent-ai/benchpre
expect(clientMocks.deleteDaytonaSandbox).toHaveBeenCalledWith('sandbox-1', baseOptions.env);
});
});

// Regression (superagent-ai/benchpress#40): sandboxes had no way to request more than the Daytona
// platform default resources, so a full `npm install` of AutoBrin-flue's dependency tree could get
// OOM-killed (confirmed live: the platform default is a 1GiB cgroup memory limit that reliably
// OOM-kills that install; see the PR description for the empirical repro).
/**
 * Checks how `runDaytonaEngagement` threads `options.resources` into the params it passes to
 * `createSandbox` (superagent-ai/benchpress#40): forwarded on the "image" branch, left undefined
 * when not provided, and absent on the "snapshot" branch.
 *
 * Client, assets and engagement collaborators are mocked; each test inspects the second argument
 * of the first recorded `createSandbox` call.
 */
describe('runDaytonaEngagement resources threading (superagent-ai/benchpress#40)', () => {
  beforeEach(() => {
    // Replace console.error with a no-op for the duration of each test.
    vi.spyOn(console, 'error').mockImplementation(() => undefined);
    clientMocks.createSandbox.mockReset().mockResolvedValue({ id: 'sandbox-1' });
    clientMocks.deleteDaytonaSandbox.mockReset().mockResolvedValue(undefined);
    assetsMocks.ensureComputerUseStarted.mockReset().mockResolvedValue(true);
    engagementMocks.runEngagementViaHttp.mockReset().mockResolvedValue({
      exitCode: 0,
      streamLogPath: '/logs/stream.jsonl',
      resultPath: '/result.json',
      resultJson: { status: 'ok' },
    });
  });

  afterEach(() => {
    vi.restoreAllMocks();
  });

  const payload = { modality: 'repo' as const, repo: 'owner/repo' };
  const env = { DAYTONA_API_KEY: 'test' };

  it('passes resources through to createSandbox when creating from "image"', async () => {
    await runDaytonaEngagement({ image: 'test-image', resources: { cpu: 2, memory: 4, disk: 20 }, payload, env });

    expect(clientMocks.createSandbox).toHaveBeenCalledTimes(1);
    const params = clientMocks.createSandbox.mock.calls[0]?.[1] as Record<string, unknown>;
    expect(params.kind).toBe('image');
    expect(params.resources).toEqual({ cpu: 2, memory: 4, disk: 20 });
  });

  it('omits resources when not provided (unchanged default behavior)', async () => {
    await runDaytonaEngagement({ image: 'test-image', payload, env });

    // Same guard as the sibling tests: without it, a missing createSandbox call would surface as a
    // TypeError on `undefined.resources` rather than as a failed expectation.
    expect(clientMocks.createSandbox).toHaveBeenCalledTimes(1);
    const params = clientMocks.createSandbox.mock.calls[0]?.[1] as Record<string, unknown>;
    expect(params.resources).toBeUndefined();
  });

  it('never applies resources on the "snapshot" branch (Daytona snapshots fix resources at build time)', async () => {
    await runDaytonaEngagement({ snapshot: 'daytona-large', resources: { memory: 4 }, payload, env });

    expect(clientMocks.createSandbox).toHaveBeenCalledTimes(1);
    const params = clientMocks.createSandbox.mock.calls[0]?.[1] as Record<string, unknown>;
    expect(params.kind).toBe('snapshot');
    expect(params).not.toHaveProperty('resources');
  });
});
28 changes: 26 additions & 2 deletions tests/daytona.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@ import { mkdtempSync, rmSync, writeFileSync } from 'node:fs';
import { connect, createServer } from 'node:net';
import { tmpdir } from 'node:os';
import path from 'node:path';
import { afterEach, describe, expect, it } from 'vitest';
import { getDaytonaClientConfig } from '../src/daytona/client.js';
import type { Daytona } from '@daytona/sdk';
import { afterEach, describe, expect, it, vi } from 'vitest';
import { createSandbox, getDaytonaClientConfig } from '../src/daytona/client.js';
import { buildEngagementRunScript } from '../src/daytona/engagement.js';
import { assertAllowedFlueRef, buildSandboxEnv } from '../src/daytona/env.js';
import {
Expand Down Expand Up @@ -32,6 +33,29 @@ describe('daytona client config', () => {
});
});

// Regression (superagent-ai/benchpress#40): confirms createSandbox() generically forwards a
// "resources" field (cpu/memory/disk) through to daytona.create() -- it never had a special case
// blocking it, but nothing upstream ever set it either. See src/daytona/launcher.ts for where
// "resources" actually gets set on the SandboxCreateInput passed here.
/**
 * Exercises `createSandbox()` against a stubbed Daytona client and checks what reaches
 * `daytona.create()`: the input fields minus the `kind` discriminator, plus a create-options
 * object.
 */
describe('createSandbox (superagent-ai/benchpress#40: resources threading)', () => {
  it('forwards "resources" (and other fields) to daytona.create(), stripping only "kind"', async () => {
    // Only `create` is stubbed; the spy records the arguments createSandbox() passes to it.
    const createSpy = vi.fn(async (_params: Record<string, unknown>, _options: Record<string, unknown>) => ({
      id: 'fake-sandbox',
    }));
    const daytonaStub = { create: createSpy } as unknown as Daytona;

    await createSandbox(daytonaStub, {
      kind: 'image',
      image: 'test-image',
      resources: { cpu: 2, memory: 4, disk: 20 },
      envVars: { FOO: 'bar' },
    });

    expect(createSpy).toHaveBeenCalledTimes(1);

    const recordedCall = createSpy.mock.calls[0]!;
    const forwardedParams = recordedCall[0];
    const createOptions = recordedCall[1];

    expect(forwardedParams).toEqual({
      image: 'test-image',
      resources: { cpu: 2, memory: 4, disk: 20 },
      envVars: { FOO: 'bar' },
    });
    expect(createOptions).toEqual({ timeout: 120 });
  });
});

describe('daytona env', () => {
it('enforces branch pins for AUTOBRIN_FLUE_REF', () => {
expect(() => assertAllowedFlueRef('feature/foo')).toThrow('branch pin');
Expand Down
Loading