# Source: torchtitan/.github/workflows/integration_test_8gpu_graph_trainer_h100.yaml at 80015e36a459ae4196d4203011e4c8195363265b (ACharacterInASimulation/torchtitan on GitHub)
# Workflow display name; also interpolated into the concurrency group below
# through `github.workflow`.
name: GraphTrainer 8 GPU H100 Integration Tests

on:
  # Push events, filtered by branch, tag pattern and changed paths.
  push:
    branches:
      - main
    tags:
      - "ciflow/8gpu/*"
    paths:
      - "torchtitan/experiments/graph_trainer/**"
      - ".github/workflows/integration_test_8gpu_graph_trainer_h100.yaml"
  # Pull requests that touch graph_trainer code or this workflow file.
  # `ready_for_review` is listed so a PR leaving draft state triggers a run.
  pull_request:
    types:
      - opened
      - synchronize
      - reopened
      - ready_for_review
    paths:
      - "torchtitan/experiments/graph_trainer/**"
      - ".github/workflows/integration_test_8gpu_graph_trainer_h100.yaml"
  schedule:
    # Minute 0 of every 12th hour, i.e. twice a day.
    - cron: "0 */12 * * *"

concurrency:
  # On main the group is keyed by the run number, so every run gets its own
  # group. On any other ref the group is keyed by the ref itself, so a newer
  # run for that ref cancels the one still in progress.
  # NOTE(review): the "unit-test" prefix looks inherited from another
  # workflow - confirm whether it is intentional.
  group: "unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}"
  cancel-in-progress: true

defaults:
  run:
    # Login shell (-l) that aborts on the first failing command (-e) and
    # treats a failure anywhere in a pipeline as a failure (-o pipefail).
    shell: 'bash -l -eo pipefail {0}'

permissions:
  # Allows the job to request an OIDC token.
  id-token: write
  # Read-only access to the repository contents.
  contents: read

jobs:
  # Single job that delegates to the shared pytorch/test-infra Linux job; the
  # `with` inputs below (runner, image, script, ...) are passed to it.
  build-test:
    # Skip draft pull requests; every other event always runs.
    if: github.event_name != 'pull_request' || github.event.pull_request.draft == false
    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
    with:
      runner: linux.aws.h100.8
      gpu-arch-type: cuda
      # Quoted so the version stays a string instead of being read as a float.
      gpu-arch-version: "12.8"
      docker-image: torchtitan-ubuntu-20.04-clang12
      repository: pytorch/torchtitan
      upload-artifact: outputs
      timeout: 45
      script: |
        set -eux

        # The generic Linux job uses the base conda env rather than the one set
        # up by the image, so activate the last env reported by conda.
        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
        conda activate "${CONDA_ENV}"

        # Log GPU info / driver version for debugging. Best effort: `|| true`
        # keeps a failing nvidia-smi from aborting the script under -e/pipefail.
        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
        echo "CUDA driver version: ${DRIVER_VERSION}"

        pip config --user set global.progress_bar off

        # Nightly torch (CUDA 12.8 wheels) plus a pinned torchdata nightly.
        python -m pip install --force-reinstall --pre \
          torch --index-url https://download.pytorch.org/whl/nightly/cu128
        python -m pip install torchdata==0.12.0.dev20260327 \
          --extra-index-url https://download.pytorch.org/whl/nightly/cpu

        # Single definition of the artifact directory. Every use is quoted so
        # the path is never subject to word splitting or globbing.
        ARTIFACTS_DIR="${RUNNER_TEMP}/artifacts-to-be-uploaded"
        sudo mkdir -p "${ARTIFACTS_DIR}"
        sudo chown -R "$(id -u):$(id -g)" "${ARTIFACTS_DIR}"

        # Disable Nvlink Sharp. The CI machine seems to be in unstable state to support
        # NVLS according to several CI runs.
        NCCL_NVLS_ENABLE=0 python -m torchtitan.experiments.graph_trainer.tests.integration_tests \
          --test_suite graph_trainer_h100 \
          --gpu_arch_type cuda \
          "${ARTIFACTS_DIR}" \
          --ngpu 8

        # Run the MoE numerics tests
        NCCL_NVLS_ENABLE=0 pytest \
          torchtitan/experiments/graph_trainer/tests/test_numerics.py::TestGraphTrainerNumerics \
          -v -k "moe"

        # Run bitwise deterministic guardrail test (includes H100-only hardcoded-hash tests)
        pytest torchtitan/experiments/graph_trainer/tests/test_bitwise_deterministic.py -v

        # Drop checkpoint directories from the artifact tree. Only the glob is
        # left unquoted so it still expands.
        # NOTE(review): under `set -e` this line is not reached when a test
        # command above fails - confirm whether checkpoints should also be
        # pruned on failure.
        rm -rf "${ARTIFACTS_DIR}"/*/checkpoint