name: RL Integration Tests

on:
  push:
    branches: [ main ]
    paths:
      - 'torchtitan/experiments/rl/**'
      - '.github/workflows/integration_test_4gpu_rl.yaml'
  pull_request:
    types: [opened, synchronize, reopened, ready_for_review]
    paths:
      - 'torchtitan/experiments/rl/**'
      - '.github/workflows/integration_test_4gpu_rl.yaml'
  schedule:
    # Runs every 12 hours
    - cron: '0 */12 * * *'
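
# Cancel in-flight runs superseded by a newer trigger on the same ref. On main the
# group is keyed on run_number, so every run gets a unique group and pushes to main
# never cancel each other.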
concurrency:
  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
  cancel-in-progress: true
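
# bash -l gives a login shell (so conda initialization applies); -e and -o pipefail
# abort the step on the first failing command, including failures inside pipelines.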
defaults:
  run:
    shell: bash -l -eo pipefail {0}
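
# id-token: write lets the reusable workflow request an OIDC token (e.g. for cloud
# credentials); the GITHUB_TOKEN otherwise stays read-only on repo contents.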
permissions:
  id-token: write
  contents: read

jobs:
  # Step 1: Dynamically compute the matrix based on conditions
  set-matrix:
    uses: ./.github/workflows/set-matrix.yaml
  # Step 2: Use the dynamic matrix in the build-test job
  build-test:
    needs: set-matrix
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    strategy:
      fail-fast: false
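      # The matrix arrives from set-matrix as a JSON string; fromJSON parses it into an object.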
      matrix: ${{ fromJSON(needs.set-matrix.outputs.matrix) }}
    with:
      runner: ${{ matrix.runner }}
      gpu-arch-type: ${{ matrix.gpu-arch-type }}
      gpu-arch-version: ${{ matrix.gpu-arch-version }}
      docker-image: ${{ matrix.docker-image }}
      repository: pytorch/torchtitan
      upload-artifact: outputs
      timeout: 45
      script: |
        set -eux
        # The generic Linux job uses the base conda env, not the one set up by the image
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"
        # Log the CUDA driver version for debugging.
        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
        echo "CUDA driver version: ${DRIVER_VERSION}"
        pip config --user set global.progress_bar off
        # Install uv for faster dependency resolution
        pip install uv
        # 1. Install Monarch and TorchStore
        uv pip install torchmonarch==0.3.0
        uv pip install --no-deps "git+https://github.com/meta-pytorch/torchstore.git@main"
        uv pip install pygtrie portpicker
        # 2. Install batch-invariant ops
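        # --no-deps here presumably avoids pulling in a stable torch that would shadow
        # the nightly installed in step 3 (assumed intent).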
uv pip install --no-deps "git+https://github.com/thinking-machines-lab/batch_invariant_ops.git@main"
# 3. Install PyTorch nightly, vllm, and xformers
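        # unsafe-best-match lets uv pick the best candidate version across PyPI and the
        # nightly index, instead of stopping at the first index that carries a package.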
        uv pip install torch vllm xformers --pre \
          --extra-index-url https://download.pytorch.org/whl/nightly/cu128 \
          --index-strategy unsafe-best-match
        uv pip install torchdata==0.12.0.dev20260327 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
        # 4. Install torchtitan in editable mode because the integration test spawns
        # python simple_grpo_sum_digits.py as a subprocess
        uv pip install -e .
        # 5. Download the HF model checkpoint for tests
        MODEL_PATH=$(python -c "from huggingface_hub import snapshot_download; print(snapshot_download('Qwen/Qwen3-0.6B'))")
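        # The artifacts dir is created as root, then chowned to the runner user so the
        # test process can write to it.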
        sudo mkdir -p "$RUNNER_TEMP/artifacts-to-be-uploaded"
        sudo chown -R "$(id -u):$(id -g)" "$RUNNER_TEMP/artifacts-to-be-uploaded"
        # Run E2E RL integration tests (TP=2 on 4 GPUs)
        python -m torchtitan.experiments.rl.tests.integration_tests \
          "$RUNNER_TEMP/artifacts-to-be-uploaded" --ngpu 4 \
          --hf_assets_path "$MODEL_PATH"