8 GPU Feature Tests #66

Workflow file for this run

.github/workflows/integration_test_8gpu_features.yaml at 80015e3

	name: 8 GPU Feature Tests

	# Triggers: pushes to main or to ciflow/8gpu/* tags, pull-request activity,
	# and a periodic schedule. Changes confined to torchtitan/experiments/** do
	# not trigger push or pull_request runs.
	on:
	  push:
	    branches: [main]
	    tags:
	      - ciflow/8gpu/*
	    paths-ignore:
	      - 'torchtitan/experiments/**'
	  pull_request:
	    types: [opened, synchronize, reopened, ready_for_review]
	    paths-ignore:
	      - 'torchtitan/experiments/**'
	  schedule:
	    # Runs every 6 hours (at minute 0 of every 6th hour).
	    - cron: '0 */6 * * *'

	# On refs/heads/main the group is keyed by run number, so each run gets its
	# own group and is never cancelled by a later one. On any other ref the group
	# is keyed by the ref, so a newer run cancels the one still in progress.
	concurrency:
	  group: unit-test${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }}
	  cancel-in-progress: true

	# Default shell for `run` steps: a bash login shell (-l) that exits on the
	# first failing command (-e) and treats a failure anywhere in a pipeline as
	# a failure of the whole pipeline (-o pipefail).
	defaults:
	  run:
	    shell: bash -l -eo pipefail {0}

	# Token scopes granted to this workflow: write access to the `id-token`
	# scope and read-only access to repository contents.
	permissions:
	  id-token: write
	  contents: read

	jobs:
	  # Step 1: Dynamically compute the matrix based on conditions
	  set-matrix:
	    uses: ./.github/workflows/set-matrix.yaml

	  # Step 2: Use the dynamic matrix in the build-test job
	  build-test:
	    needs: set-matrix
	    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
	    strategy:
	      # Let the remaining matrix entries finish even if one of them fails.
	      fail-fast: false
	      matrix: ${{ fromJSON(needs.set-matrix.outputs.matrix) }}
	    with:
	      runner: ${{ matrix.runner }}
	      gpu-arch-type: ${{ matrix.gpu-arch-type }}
	      gpu-arch-version: ${{ matrix.gpu-arch-version }}
	      docker-image: ${{ matrix.docker-image }}
	      repository: pytorch/torchtitan
	      upload-artifact: outputs
	      timeout: 45
	      script: |
	        set -eux

	        # The generic Linux job chooses to use base env, not the one setup by the image
	        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
	        conda activate "${CONDA_ENV}"

	        # Log CUDA driver version for debugging. `|| true` keeps the job
	        # going when the query fails.
	        DRIVER_VERSION=$(nvidia-smi --query-gpu=driver_version --format=csv,noheader | head -n 1 || true)
	        echo "CUDA driver version: ${DRIVER_VERSION}"

	        pip config --user set global.progress_bar off

	        # Install torch (pinned when the matrix provides a version), plus
	        # torchcomms on CUDA, and report how long it took.
	        start=$(date +%s)
	        TORCH_SPEC="torch"
	        if [ -n "${{ matrix.torch-version }}" ]; then
	          TORCH_SPEC="torch==${{ matrix.torch-version }}"
	        fi
	        python -m pip install --force-reinstall --pre \
	          "${TORCH_SPEC}" --index-url "${{ matrix.index-url }}"
	        if [[ "${{ matrix.gpu-arch-type }}" == "cuda" ]]; then
	          python -m pip install --pre torchcomms --index-url "${{ matrix.index-url }}"
	        fi
	        end=$(date +%s)
	        echo "pip install torch took $((end - start)) seconds"

	        if [[ "${{ matrix.gpu-arch-type }}" == "rocm" ]]; then
	          # Assign first and export afterwards so that a failing python
	          # call is not masked by the exit status of `export`.
	          HIPBLASLT_TENSILE_LIBPATH="$(python -c 'import os, torch; print(os.path.join(os.path.dirname(torch.__file__), "lib", "hipblaslt", "library"))')"
	          export HIPBLASLT_TENSILE_LIBPATH
	          echo "HIPBLASLT_TENSILE_LIBPATH=${HIPBLASLT_TENSILE_LIBPATH}"
	        fi

	        start=$(date +%s)
	        USE_CPP=0 python -m pip install --pre torchao --index-url "${{ matrix.index-url }}"
	        python -m pip install torchdata==0.12.0.dev20260327 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
	        end=$(date +%s)
	        echo "pip install torchao took $((end - start)) seconds"

	        # Create the output directories and hand them to the current user.
	        sudo mkdir -p "${RUNNER_TEMP}/artifacts-to-be-uploaded"
	        sudo chown -R "$(id -u):$(id -g)" "${RUNNER_TEMP}/artifacts-to-be-uploaded"

	        sudo mkdir -p "comm_traces"
	        sudo chown -R "$(id -u):$(id -g)" "comm_traces"

	        # Verify the accuracy first.
	        echo "Checking FSDP8 v.s. HSDP (4, 2) accuracy parity"
	        export baseline_options="--parallelism.data_parallel_replicate_degree=1"
	        export test_options="--parallelism.data_parallel_replicate_degree=4"

	        # Set architecture-specific parameters.
	        # LOSS_FILE is the stored reference loss file; STEPS is the number
	        # of steps compared in the FSDP v.s. HSDP check below.
	        if [[ "${{ matrix.gpu-arch-type }}" == "cuda" ]]; then
	          LOSS_FILE="tests/assets/losses/llama3_cuda.txt"
	          STEPS=1
	        elif [[ "${{ matrix.gpu-arch-type }}" == "rocm" ]]; then
	          # The loss results of FSDP and HSDP start to diverge after 5th
	          # step when running with ROCm, we also need to adjust this.
	          # But this is more an unknown issue that AMD people may want to
	          # figure out the root cause or confirm that this is an expected
	          # behavior.
	          LOSS_FILE="tests/assets/losses/llama3_rocm.txt"
	          STEPS=1
	        else
	          echo "Error: Unknown GPU architecture type: ${{ matrix.gpu-arch-type }}"
	          exit 1
	        fi

	        # Baseline-only run that exports its losses to result.txt.
	        python3 scripts/loss_compare.py . . --baseline-options="${baseline_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --export-result="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs/result.txt" --steps=100

	        echo "Checking FSDP8 the first step loss is the same as FSDP2HSDP4"
	        python3 scripts/loss_compare.py . . --baseline-options="${baseline_options}" --test-options="${test_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --assert-equal --steps="${STEPS}"
	        rm -rf "${RUNNER_TEMP}"/artifacts-to-be-uploaded/*

	        echo "Checking FSDP8 loss from a new run v.s. FSDP8 loss from text file parity"
	        python3 scripts/loss_compare.py . . --baseline-options="${baseline_options}" --job-dump-folder="${RUNNER_TEMP}/artifacts-to-be-uploaded/accuracy_comparison_outputs" --import-result="${LOSS_FILE}" --assert-equal --steps=100
	        rm -rf "${RUNNER_TEMP}"/artifacts-to-be-uploaded/*

	        # Run the "features" integration test suite on 8 GPUs.
	        python -m tests.integration_tests.run_tests --gpu_arch_type "${{ matrix.gpu-arch-type }}" --test_suite features "${RUNNER_TEMP}/artifacts-to-be-uploaded" --ngpu 8

	        # Cleanup the checkpoints so that we don't waste network bandwidth and time.
	        rm -rf "${RUNNER_TEMP}"/artifacts-to-be-uploaded/*/checkpoint
	        rm -rf artifacts-to-be-uploaded/*/checkpoint

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

8 GPU Feature Tests #66

Workflow file

8 GPU Feature Tests #66

Uh oh!

Workflow file for this run