Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions .github/workflows/prek.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ permissions:
contents: write

env:
CI_BASE_IMAGE: "pytorch/pytorch:2.9.0-cuda12.8-cudnn9-devel"
CI_BASE_IMAGE: "pytorch/pytorch:2.11.0-cuda13.0-cudnn9-devel"
CI_PYTHON_MM: "3.12"
CI_UV_CACHE_RELEASE_TAG: "prek-uv-cache"
CI_UV_CACHE_ASSET_PREFIX: "prek-uv-cache"
Expand Down Expand Up @@ -85,7 +85,7 @@ jobs:
if: needs.cache-status.outputs.cache-hit != 'true'
runs-on: art-cache-builder
container:
image: pytorch/pytorch:2.9.0-cuda12.8-cudnn9-devel
image: pytorch/pytorch:2.11.0-cuda13.0-cudnn9-devel
steps:
- name: Install CI dependencies
run: |
Expand Down Expand Up @@ -126,7 +126,7 @@ jobs:
if: ${{ !failure() && !cancelled() }}
runs-on: art-large-runner
container:
image: pytorch/pytorch:2.9.0-cuda12.8-cudnn9-devel
image: pytorch/pytorch:2.11.0-cuda13.0-cudnn9-devel
steps:
- name: Install CI dependencies
run: |
Expand Down Expand Up @@ -216,9 +216,12 @@ jobs:
export CUDNN_HOME="${cudnn_path}"
export CUDNN_INCLUDE_PATH="${cudnn_path}/include"
export CUDNN_LIBRARY_PATH="${cudnn_path}/lib"
export CPLUS_INCLUDE_PATH="${CUDNN_INCLUDE_PATH}${CPLUS_INCLUDE_PATH:+:${CPLUS_INCLUDE_PATH}}"
export LIBRARY_PATH="${CUDNN_LIBRARY_PATH}${LIBRARY_PATH:+:${LIBRARY_PATH}}"
export LD_LIBRARY_PATH="${CUDNN_LIBRARY_PATH}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
nccl_path="${GITHUB_WORKSPACE}/.venv/lib/python${CI_PYTHON_MM}/site-packages/nvidia/nccl"
export NCCL_INCLUDE_PATH="${nccl_path}/include"
export NCCL_LIBRARY_PATH="${nccl_path}/lib"
export CPLUS_INCLUDE_PATH="${CUDNN_INCLUDE_PATH}:${NCCL_INCLUDE_PATH}${CPLUS_INCLUDE_PATH:+:${CPLUS_INCLUDE_PATH}}"
export LIBRARY_PATH="${CUDNN_LIBRARY_PATH}:${NCCL_LIBRARY_PATH}${LIBRARY_PATH:+:${LIBRARY_PATH}}"
export LD_LIBRARY_PATH="${CUDNN_LIBRARY_PATH}:${NCCL_LIBRARY_PATH}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export UV_CONCURRENT_BUILDS="${CI_UV_BUILD_SLOTS}"
export CMAKE_BUILD_PARALLEL_LEVEL="${CI_APEX_PARALLEL_BUILD}"
export MAX_JOBS="${CI_APEX_PARALLEL_BUILD}"
Expand Down
2 changes: 1 addition & 1 deletion dev/run_qwen3_5_localbackend_yes_no_maybe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

load_dotenv()

DEFAULT_IMAGE_ID = "docker:nvidia/cuda:12.8.1-devel-ubuntu22.04"
DEFAULT_IMAGE_ID = "docker:nvidia/cuda:13.0.2-devel-ubuntu22.04"


def _format_env_bool(value: bool) -> str:
Expand Down
2 changes: 1 addition & 1 deletion dev/run_qwen3_5_megatron_yes_no_maybe.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

load_dotenv()

DEFAULT_IMAGE_ID = "docker:nvidia/cuda:12.8.1-devel-ubuntu22.04"
DEFAULT_IMAGE_ID = "docker:nvidia/cuda:13.0.2-devel-ubuntu22.04"


def _format_env_bool(value: bool) -> str:
Expand Down
82 changes: 55 additions & 27 deletions docker/art-gpu.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
ARG BASE_IMAGE=docker.io/pytorch/pytorch:2.9.0-cuda12.8-cudnn9-devel
ARG BASE_IMAGE=docker.io/pytorch/pytorch:2.11.0-cuda13.0-cudnn9-devel
ARG ART_SHA=unknown
ARG UV_VERSION=0.11.7
ARG BUILD_JOBS=2
ARG UV_CONCURRENT_BUILDS=1
ARG APEX_PARALLEL_BUILD=2
ARG APEX_NVCC_THREADS=1
ARG TORCH_CUDA_ARCH_LIST=9.0
ARG CUDNN_PACKAGE_VERSION=9.10.2.21
ARG CUDNN_PACKAGE_VERSION=9.19.0.56
ARG SKYPILOT_VERSION=0.12.0
ARG SKY_REMOTE_RAY_VERSION=2.9.3

Expand All @@ -20,8 +20,8 @@ ARG APEX_NVCC_THREADS
ARG TORCH_CUDA_ARCH_LIST
ARG CUDNN_PACKAGE_VERSION

ENV CUDA_HOME=/usr/local/cuda-12.8 \
PATH=/opt/conda/bin:${PATH} \
ENV CUDA_HOME=/usr/local/cuda-13.0 \
PATH=/usr/local/bin:${PATH} \
UV_CACHE_DIR=/opt/uv-cache \
UV_PYTHON_INSTALL_DIR=/opt/uv-python \
UV_LINK_MODE=copy \
Expand All @@ -41,33 +41,47 @@ RUN if ! getent group messagebus >/dev/null; then groupadd -r messagebus; fi \
&& apt-get update \
&& apt-get install -y --no-install-recommends git libibverbs-dev \
&& rm -rf /var/lib/apt/lists/* \
&& /opt/conda/bin/python -m pip install --no-cache-dir --upgrade "uv==${UV_VERSION}" \
&& /opt/conda/bin/conda clean -afy \
&& /opt/conda/bin/uv --version \
&& python -m pip install --break-system-packages --no-cache-dir --upgrade "uv==${UV_VERSION}" \
&& uv --version \
&& mkdir -p "${UV_CACHE_DIR}" "${UV_PYTHON_INSTALL_DIR}"

WORKDIR /opt/src/art
COPY pyproject.toml uv.lock ./
COPY vllm_runtime/pyproject.toml vllm_runtime/uv.lock ./vllm_runtime/

RUN /opt/conda/bin/python -m pip install --no-cache-dir "nvidia-cudnn-cu12==${CUDNN_PACKAGE_VERSION}" \
&& mkdir -p /usr/local/cuda-12.8/include /usr/local/cuda-12.8/lib64 \
&& : > /tmp/art-cudnn-symlinks.txt \
&& for src in /opt/conda/lib/python3.11/site-packages/nvidia/cudnn/include/*; do \
dst="/usr/local/cuda-12.8/include/$(basename "$src")"; \
if [ ! -e "$dst" ]; then ln -s "$src" "$dst" && printf '%s\n' "$dst" >> /tmp/art-cudnn-symlinks.txt; fi; \
RUN python -m pip install --break-system-packages --no-cache-dir "nvidia-cudnn-cu13==${CUDNN_PACKAGE_VERSION}" \
&& mkdir -p /usr/local/cuda-13.0/include /usr/local/cuda-13.0/lib64 \
&& for cccl_dir in cuda cub thrust; do \
src="/usr/local/cuda-13.0/targets/x86_64-linux/include/cccl/${cccl_dir}"; \
dst="/usr/local/cuda-13.0/include/${cccl_dir}"; \
if [ -e "$src" ] && [ ! -e "$dst" ]; then ln -s "$src" "$dst"; fi; \
done \
&& for src in /opt/conda/lib/python3.11/site-packages/nvidia/cudnn/lib/*; do \
dst="/usr/local/cuda-12.8/lib64/$(basename "$src")"; \
if [ ! -e "$dst" ]; then ln -s "$src" "$dst" && printf '%s\n' "$dst" >> /tmp/art-cudnn-symlinks.txt; fi; \
&& cudnn_path="$(python -c 'from pathlib import Path; import site; paths = [Path(p) / "nvidia" / "cudnn" for p in site.getsitepackages() + [site.getusersitepackages()]]; matches = [p for p in paths if p.exists()]; print(matches[0] if matches else ""); raise SystemExit(0 if matches else 1)')" \
&& : > /tmp/art-cuda-symlinks.txt \
&& for src in "${cudnn_path}"/include/*; do \
dst="/usr/local/cuda-13.0/include/$(basename "$src")"; \
if [ ! -e "$dst" ]; then ln -s "$src" "$dst" && printf '%s\n' "$dst" >> /tmp/art-cuda-symlinks.txt; fi; \
done \
&& for src in "${cudnn_path}"/lib/*; do \
dst="/usr/local/cuda-13.0/lib64/$(basename "$src")"; \
if [ ! -e "$dst" ]; then ln -s "$src" "$dst" && printf '%s\n' "$dst" >> /tmp/art-cuda-symlinks.txt; fi; \
done \
&& nccl_path="$(python -c 'from pathlib import Path; import site; paths = [Path(p) / "nvidia" / "nccl" for p in site.getsitepackages() + [site.getusersitepackages()]]; matches = [p for p in paths if p.exists()]; print(matches[0] if matches else ""); raise SystemExit(0 if matches else 1)')" \
&& for src in "${nccl_path}"/include/*; do \
dst="/usr/local/cuda-13.0/include/$(basename "$src")"; \
if [ ! -e "$dst" ]; then ln -s "$src" "$dst" && printf '%s\n' "$dst" >> /tmp/art-cuda-symlinks.txt; fi; \
done \
&& for src in "${nccl_path}"/lib/*; do \
dst="/usr/local/cuda-13.0/lib64/$(basename "$src")"; \
if [ ! -e "$dst" ]; then ln -s "$src" "$dst" && printf '%s\n' "$dst" >> /tmp/art-cuda-symlinks.txt; fi; \
done \
&& UV_LINK_MODE=hardlink uv sync --frozen --extra backend --extra megatron --extra tinker --no-install-project --python 3.12 \
&& rm -rf .venv \
&& cd vllm_runtime \
&& UV_LINK_MODE=hardlink uv sync --frozen --no-install-project --no-dev --python 3.12 \
&& rm -rf .venv \
&& if [ -f /tmp/art-cudnn-symlinks.txt ]; then while IFS= read -r link; do [ -L "$link" ] && rm "$link"; done < /tmp/art-cudnn-symlinks.txt; fi \
&& rm -f /tmp/art-cudnn-symlinks.txt
&& if [ -f /tmp/art-cuda-symlinks.txt ]; then while IFS= read -r link; do [ -L "$link" ] && rm "$link"; done < /tmp/art-cuda-symlinks.txt; fi \
&& rm -f /tmp/art-cuda-symlinks.txt

FROM ${BASE_IMAGE}

Expand All @@ -81,8 +95,8 @@ ARG TORCH_CUDA_ARCH_LIST
ARG SKYPILOT_VERSION
ARG SKY_REMOTE_RAY_VERSION

ENV CUDA_HOME=/usr/local/cuda-12.8 \
PATH=/home/sky/.local/bin:/opt/conda/bin:${PATH} \
ENV CUDA_HOME=/usr/local/cuda-13.0 \
PATH=/home/sky/.local/bin:/usr/local/bin:${PATH} \
UV_CACHE_DIR=/opt/uv-cache \
UV_PYTHON_INSTALL_DIR=/opt/uv-python \
UV_LINK_MODE=copy \
Expand Down Expand Up @@ -117,7 +131,7 @@ RUN if ! getent group messagebus >/dev/null; then groupadd -r messagebus; fi \
git \
htop \
jq \
libcudnn9-headers-cuda-12 \
libcudnn9-headers-cuda-13 \
libibverbs-dev \
nano \
netcat-openbsd \
Expand All @@ -132,18 +146,32 @@ RUN if ! getent group messagebus >/dev/null; then groupadd -r messagebus; fi \
tmux \
unzip \
wget \
&& for cccl_dir in cuda cub thrust; do \
src="/usr/local/cuda-13.0/targets/x86_64-linux/include/cccl/${cccl_dir}"; \
dst="/usr/local/cuda-13.0/include/${cccl_dir}"; \
if [ -e "$src" ] && [ ! -e "$dst" ]; then ln -s "$src" "$dst"; fi; \
done \
&& nccl_path="$(python -c 'from pathlib import Path; import site; paths = [Path(p) / "nvidia" / "nccl" for p in site.getsitepackages() + [site.getusersitepackages()]]; matches = [p for p in paths if p.exists()]; print(matches[0] if matches else ""); raise SystemExit(0 if matches else 1)')" \
&& for src in "${nccl_path}"/include/*; do \
dst="/usr/local/cuda-13.0/include/$(basename "$src")"; \
if [ ! -e "$dst" ]; then ln -s "$src" "$dst"; fi; \
done \
&& for src in "${nccl_path}"/lib/*; do \
dst="/usr/local/cuda-13.0/lib64/$(basename "$src")"; \
if [ ! -e "$dst" ]; then ln -s "$src" "$dst"; fi; \
done \
&& ldconfig \
&& mkdir -p /var/run/sshd "${UV_CACHE_DIR}" "${UV_PYTHON_INSTALL_DIR}" \
&& sed -i 's/PermitRootLogin prohibit-password/PermitRootLogin yes/' /etc/ssh/sshd_config \
&& sed -i 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' /etc/pam.d/sshd \
&& ssh-keygen -A \
&& useradd -m -s /bin/bash sky \
&& mkdir -p /home/sky/.local/bin /home/sky/.sky/sky_app \
&& /bin/bash -c 'echo "sky ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers' \
&& /bin/bash -c 'echo '\''Defaults secure_path="/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"'\'' > /etc/sudoers.d/sky' \
&& /opt/conda/bin/python -m pip install --no-cache-dir --upgrade "uv==${UV_VERSION}" \
&& /opt/conda/bin/conda clean -afy \
&& ln -sf /opt/conda/bin/uv /home/sky/.local/bin/uv \
&& /opt/conda/bin/uv --version \
&& /bin/bash -c 'echo '\''Defaults secure_path="/usr/local/bin:/usr/local/sbin:/usr/sbin:/usr/bin:/sbin:/bin"'\'' > /etc/sudoers.d/sky' \
&& python -m pip install --break-system-packages --no-cache-dir --upgrade "uv==${UV_VERSION}" \
&& ln -sf /usr/local/bin/uv /home/sky/.local/bin/uv \
&& uv --version \
&& chown -R sky:sky /home/sky "${UV_CACHE_DIR}" "${UV_PYTHON_INSTALL_DIR}"

COPY --from=builder --chown=sky:sky /opt/uv-cache /opt/uv-cache
Expand All @@ -153,7 +181,7 @@ USER sky
WORKDIR /home/sky

RUN mkdir -p "${HOME}/.local/bin" "${HOME}/.sky/sky_app" "${HOME}/sky_workdir" \
&& ln -sf /opt/conda/bin/uv "${HOME}/.local/bin/uv" \
&& ln -sf /usr/local/bin/uv "${HOME}/.local/bin/uv" \
&& uv venv --seed "${HOME}/skypilot-runtime" --python 3.10 \
&& VIRTUAL_ENV="${HOME}/skypilot-runtime" UV_LINK_MODE=copy UV_SYSTEM_PYTHON=false env -u PYTHONPATH -C "${HOME}" uv pip install \
"setuptools<70" \
Expand Down
19 changes: 11 additions & 8 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ backend = [
"unsloth==2026.3.3",
"unsloth-zoo==2026.3.1",
"torch>=2.11.0",
"torchao==0.16.0",
"torchao==0.17.0",
"accelerate==1.7.0",
"awscli>=1.38.1",
"setuptools>=78.1.0",
Expand All @@ -49,7 +49,7 @@ megatron = [
"quack-kernels==0.3.7",
"apex",
"transformer-engine==2.11.0",
"transformer-engine-cu12==2.11.0",
"transformer-engine-cu13==2.11.0",
"transformer-engine-torch==2.11.0",
"megatron-core==0.17.0",
"pybind11>=2.13.6",
Expand Down Expand Up @@ -159,9 +159,10 @@ override-dependencies = [
"transformer-engine==2.11.0",
"transformers==5.2.0",
"torch==2.11.0",
"torchvision==0.26.0",
]
exclude-dependencies = ["pynvml", "emerging-optimizers", "causal-conv1d", "mamba-ssm"]
no-build-isolation-package = ["apex", "transformer-engine", "transformer-engine-cu12", "transformer-engine-torch", "megatron-bridge", "deep-ep", "nv-grouped-gemm"]
no-build-isolation-package = ["apex", "transformer-engine", "transformer-engine-cu13", "transformer-engine-torch", "megatron-bridge", "deep-ep", "nv-grouped-gemm"]

[tool.uv.extra-build-dependencies]
apex = ["torch>=2.11.0"]
Expand Down Expand Up @@ -194,7 +195,7 @@ requires-dist = [
"packaging",
"pydantic",
"torch",
"transformer-engine-cu12",
"transformer-engine-cu13",
]

[tool.ty.environment]
Expand Down Expand Up @@ -272,15 +273,17 @@ dev = [
]

[tool.uv.sources]
torch = [{ index = "pytorch-cu128", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }]
apex = { git = "https://github.com/NVIDIA/apex.git", rev = "25.09" }
torch = [{ index = "pytorch-cu130", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }]
torchao = [{ index = "pytorch-cu130", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }]
torchvision = [{ index = "pytorch-cu130", marker = "sys_platform == 'linux' or sys_platform == 'win32'" }]
apex = { git = "https://github.com/NVIDIA/apex.git", rev = "26bba57d62553d268319b4a20cc3d8aa990249ec" }
deep-ep = { git = "https://github.com/deepseek-ai/DeepEP.git", rev = "v1.2.1" }
flash-attn-4 = { url = "https://files.pythonhosted.org/packages/24/f7/01ee2576ce41f9884d291ee21861ef194afc0b2b1ce3bd175fc7a6e1b133/flash_attn_4-4.0.0b5-py3-none-any.whl" }
megatron-bridge = { git = "https://github.com/NVIDIA-NeMo/Megatron-Bridge.git", rev = "e049cc00c24d03e2ae45d2608c7a44e2d2364e3d" }
panza = { git = "https://github.com/corbt/panza.git" }
transformer-engine-torch = { git = "https://github.com/NVIDIA/TransformerEngine.git", rev = "v2.11", subdirectory = "transformer_engine/pytorch" }

[[tool.uv.index]]
name = "pytorch-cu128"
url = "https://download.pytorch.org/whl/cu128"
name = "pytorch-cu130"
url = "https://download.pytorch.org/whl/cu130"
explicit = true
11 changes: 7 additions & 4 deletions scripts/ci/build_and_push_uv_cache.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

BASE_IMAGE="${BASE_IMAGE:-pytorch/pytorch:2.9.0-cuda12.8-cudnn9-devel}"
BASE_IMAGE="${BASE_IMAGE:-pytorch/pytorch:2.11.0-cuda13.0-cudnn9-devel}"
PYTHON_MM="${PYTHON_MM:-3.12}"
UV_CACHE_RELEASE_TAG="${UV_CACHE_RELEASE_TAG:-prek-uv-cache}"
UV_CACHE_ASSET_PREFIX="${UV_CACHE_ASSET_PREFIX:-prek-uv-cache}"
Expand Down Expand Up @@ -275,13 +275,16 @@ build_cache_archive() {
export TORCH_CUDA_ARCH_LIST

local cudnn_path="${TMP_DIR}/.venv/lib/python${PYTHON_MM}/site-packages/nvidia/cudnn"
local nccl_path="${TMP_DIR}/.venv/lib/python${PYTHON_MM}/site-packages/nvidia/nccl"
export CUDNN_PATH="${cudnn_path}"
export CUDNN_HOME="${cudnn_path}"
export CUDNN_INCLUDE_PATH="${cudnn_path}/include"
export CUDNN_LIBRARY_PATH="${cudnn_path}/lib"
export CPLUS_INCLUDE_PATH="${CUDNN_INCLUDE_PATH}${CPLUS_INCLUDE_PATH:+:${CPLUS_INCLUDE_PATH}}"
export LIBRARY_PATH="${CUDNN_LIBRARY_PATH}${LIBRARY_PATH:+:${LIBRARY_PATH}}"
export LD_LIBRARY_PATH="${CUDNN_LIBRARY_PATH}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
export NCCL_INCLUDE_PATH="${nccl_path}/include"
export NCCL_LIBRARY_PATH="${nccl_path}/lib"
export CPLUS_INCLUDE_PATH="${CUDNN_INCLUDE_PATH}:${NCCL_INCLUDE_PATH}${CPLUS_INCLUDE_PATH:+:${CPLUS_INCLUDE_PATH}}"
export LIBRARY_PATH="${CUDNN_LIBRARY_PATH}:${NCCL_LIBRARY_PATH}${LIBRARY_PATH:+:${LIBRARY_PATH}}"
export LD_LIBRARY_PATH="${CUDNN_LIBRARY_PATH}:${NCCL_LIBRARY_PATH}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

log "Building full uv cache with compile_jobs=${compile_jobs}, apex_parallel_build=${apex_parallel_build}, nvcc_threads=${CI_APEX_NVCC_THREADS}, cuda_arch_list=${TORCH_CUDA_ARCH_LIST}, and uv_concurrent_builds=${UV_BUILD_SLOTS}."
uv sync --frozen --all-extras --group dev --no-install-project --python "${PYTHON_MM}"
Expand Down
2 changes: 1 addition & 1 deletion scripts/ci/compute_uv_fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def _build_parser() -> argparse.ArgumentParser:
)
parser.add_argument(
"--base-image",
default="pytorch/pytorch:2.9.0-cuda12.8-cudnn9-devel",
default="pytorch/pytorch:2.11.0-cuda13.0-cudnn9-devel",
help="Base image reference used for CI runtime/build cache compatibility",
)
parser.add_argument(
Expand Down
4 changes: 2 additions & 2 deletions src/art/megatron/setup.sh
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
#!/usr/bin/env bash
set -euo pipefail

export CUDA_HOME="${CUDA_HOME:-/usr/local/cuda-12.8}"
export CUDA_HOME="${CUDA_HOME:-/usr/local/cuda-13.0}"
export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0}"
# Install missing cudnn headers, DeepEP RDMA headers, and ninja build tools.
missing_packages=()
for package in libcudnn9-headers-cuda-12 libibverbs-dev ninja-build; do
for package in libcudnn9-headers-cuda-13 libibverbs-dev ninja-build; do
if ! dpkg-query -W "${package}" >/dev/null 2>&1; then
missing_packages+=("${package}")
fi
Expand Down
2 changes: 1 addition & 1 deletion src/art/vllm_runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,7 +529,7 @@ def get_vllm_runtime_nccl_so_path() -> Path:
"import importlib.util\n"
"spec = importlib.util.find_spec('nvidia.nccl')\n"
"if spec is None or spec.submodule_search_locations is None:\n"
" raise SystemExit('vLLM runtime is missing nvidia-nccl-cu12')\n"
" raise SystemExit('vLLM runtime is missing nvidia-nccl-cu13')\n"
"package_dir = Path(next(iter(spec.submodule_search_locations)))\n"
"path = package_dir / 'lib' / 'libnccl.so.2'\n"
"if not path.exists():\n"
Expand Down
4 changes: 2 additions & 2 deletions src/art/weight_transfer/nccl.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,13 +370,13 @@ def _find_nccl_library() -> str:
spec = importlib.util.find_spec("nvidia.nccl")
if spec is None or spec.submodule_search_locations is None:
raise RuntimeError(
"CUDA weight transfer requires the nvidia-nccl-cu12 package."
"CUDA weight transfer requires the nvidia-nccl-cu13 package."
)
nccl_library = (
Path(next(iter(spec.submodule_search_locations))) / "lib" / "libnccl.so.2"
)
if not nccl_library.exists():
raise RuntimeError(f"nvidia-nccl-cu12 is missing {nccl_library}")
raise RuntimeError(f"nvidia-nccl-cu13 is missing {nccl_library}")
return str(nccl_library)
if torch.version.hip is not None:
return "librccl.so.1"
Expand Down
Loading
Loading