RL Integration Tests #33

Workflow file for this run

.github/workflows/integration_test_4gpu_rl.yaml at 80015e3

	name: RL Integration Tests

	on:
	push:
	branches: [ main ]
	paths:
	- 'torchtitan/experiments/rl/**'
	- '.github/workflows/integration_test_4gpu_rl.yaml'
	pull_request:
	types: [opened, synchronize, reopened, ready_for_review]
	paths:
	- 'torchtitan/experiments/rl/**'
	- '.github/workflows/integration_test_4gpu_rl.yaml'
	schedule:
	# Runs every 12 hours
	- cron: '0 /12 * *'

	concurrency:
	group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number \|\| github.ref }}
	cancel-in-progress: true

	defaults:
	run:
	shell: bash -l -eo pipefail {0}

	permissions:
	id-token: write
	contents: read

	jobs:
	# Step 1: Dynamically compute the matrix based on conditions
	set-matrix:
	uses: ./.github/workflows/set-matrix.yaml

	# Step 2: Use the dynamic matrix in the build-test job
	build-test:
	needs: set-matrix
	uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
	strategy:
	fail-fast: false
	matrix: ${{ fromJSON(needs.set-matrix.outputs.matrix) }}
	with:
	runner: ${{ matrix.runner }}
	gpu-arch-type: ${{ matrix.gpu-arch-type }}
	gpu-arch-version: ${{ matrix.gpu-arch-version }}
	docker-image: ${{ matrix.docker-image }}
	repository: pytorch/torchtitan
	upload-artifact: outputs
	timeout: 45
	script: \|
	set -eux

	# The generic Linux job chooses to use base env, not the one setup by the image
	CONDA_ENV=$(conda env list --json \| jq -r ".envs \| .[-1]")
	conda activate "${CONDA_ENV}"

	# Log CUDA driver version for debugging.
	DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader \| head -n 1 \|\| true)
	echo "CUDA driver version: ${DRIVER_VERSION}"

	pip config --user set global.progress_bar off

	# Install uv for faster dependency resolution
	pip install uv

	# 1. Install Monarch and TorchStore
	uv pip install torchmonarch==0.3.0
	uv pip install --no-deps "git+https://github.com/meta-pytorch/torchstore.git@main"
	uv pip install pygtrie portpicker

	# 2. Install batch-invariant ops
	uv pip install --no-deps "git+https://github.com/thinking-machines-lab/batch_invariant_ops.git@main"

	# 3. Install PyTorch nightly, vllm, and xformers
	uv pip install torch vllm xformers --pre \
	--extra-index-url https://download.pytorch.org/whl/nightly/cu128 \
	--index-strategy unsafe-best-match
	uv pip install torchdata==0.12.0.dev20260327 --extra-index-url https://download.pytorch.org/whl/nightly/cpu

	# 4. Install torchtitan in editable mode because integration test spawns
	# python simple_grpo_sum_digits.py as a subprocess
	uv pip install -e .

	# 5. Download HF model checkpoint for tests
	MODEL_PATH=$(python -c "from huggingface_hub import snapshot_download; print(snapshot_download('Qwen/Qwen3-0.6B'))")

	sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded"
	sudo chown -R $(id -u):$(id -g) "$RUNNER_TEMP/artifacts-to-be-uploaded"

	# Run E2E RL integration tests (TP=2 on 4 GPUs)
	python -m torchtitan.experiments.rl.tests.integration_tests \
	$RUNNER_TEMP/artifacts-to-be-uploaded --ngpu 4 \
	--hf_assets_path "$MODEL_PATH"

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

RL Integration Tests #33

Workflow file

RL Integration Tests #33

Uh oh!

Workflow file for this run