Skip to content

CUDA CI Failed on SM120 #1851

@XuehaoSun

Description

@XuehaoSun
  1. test_fp8_format:
    @functools.cache
    def _load_finegrained_fp8_kernel() -> FineGrainedFP8:
        """
        Load the finegrained-fp8 Triton kernel once and return its entry points.
    
        Raises `ImportError` if the `kernels` package is missing, or the kernel or required
        symbols cannot be found.
        """
        if not is_kernels_available():
>           raise ImportError(
                "finegrained-fp8 kernel requires the `kernels` package. Install it with `pip install -U kernels`."
            )
E           ImportError: finegrained-fp8 kernel requires the `kernels` package. Install it with `pip install -U kernels`.
  2. test_sglang:
[2026-05-25 05:29:23] Scheduler hit an exception: Traceback (most recent call last):
  File "/root/.venv/lib/python3.12/site-packages/sglang/srt/model_executor/cuda_graph_runner.py", line 736, in __init__
    self.capture()
  File "/root/.venv/lib/python3.12/site-packages/sglang/srt/model_executor/cuda_graph_runner.py", line 918, in capture
    _capture_one_stream()
  File "/root/.venv/lib/python3.12/site-packages/sglang/srt/model_executor/cuda_graph_runner.py", line 906, in _capture_one_stream
    ) = self.capture_one_batch_size(bs, forward, stream_idx)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/sglang/srt/model_executor/cuda_graph_runner.py", line 1187, in capture_one_batch_size
    run_once()
  File "/root/.venv/lib/python3.12/site-packages/sglang/srt/model_executor/cuda_graph_runner.py", line 1165, in run_once
    logits_output_or_pp_proxy_tensors = forward(
                                        ^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 124, in decorate_context
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/sglang/srt/models/qwen3.py", line 517, in forward
    hidden_states = self.model(
                    ^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/sglang/srt/models/qwen2.py", line 367, in forward
    hidden_states, residual = layer(
                              ^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/sglang/srt/models/qwen3.py", line 393, in forward
    hidden_states, residual = self.layer_communicator.prepare_attn(
                              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/sglang/srt/layers/communicator.py", line 582, in prepare_attn
    hidden_states = self.input_layernorm(hidden_states)
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/sglang/srt/layers/utils/multi_platform.py", line 83, in forward
    return self._forward_method(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/sglang/srt/layers/layernorm.py", line 261, in forward_cuda
    out = rmsnorm(x, self.weight.data, self.variance_epsilon)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/sgl_kernel/elementwise.py", line 120, in rmsnorm
    return _flashinfer_norm.rmsnorm(input, weight, eps, out, enable_pdl)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/flashinfer/api_logging.py", line 1546, in _auto_dump_wrapper
    return _inner(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/flashinfer/norm/__init__.py", line 144, in rmsnorm
    _rmsnorm_impl(out, input, weight, eps, enable_pdl)
  File "/root/.venv/lib/python3.12/site-packages/flashinfer/norm/__init__.py", line 166, in _rmsnorm_impl
    rmsnorm_cute(
  File "/root/.venv/lib/python3.12/site-packages/flashinfer/norm/kernels/rmsnorm.py", line 1302, in rmsnorm_cute
    kernel = _get_compiled_rmsnorm_kernel(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/flashinfer/norm/kernels/rmsnorm.py", line 1143, in _get_compiled_rmsnorm_kernel
    compiled_kernel = cute.compile(
                      ^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/compiler.py", line 582, in __call__
    return self._compile(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/compiler.py", line 661, in _compile
    return func._dsl_object._func(func, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/dsl.py", line 2214, in _func
    result = self.generate_mlir(
             ^^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/dsl.py", line 1835, in generate_mlir
    module, module_hash, result = self.generate_original_ir(
                                  ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/dsl.py", line 1600, in generate_original_ir
    module_hash = self.get_module_hash(module, function_name)
                  ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/dsl.py", line 1472, in get_module_hash
    hash_obj = self.get_version().copy()
               ^^^^^^^^^^^^^^^^^^
  File "/root/.venv/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/cutlass_dsl/cutlass.py", line 519, in get_version
    for lib in pkgutil.walk_packages([dsl_path], prefix="cutlass."):
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/pkgutil.py", line 93, in walk_packages
    yield from walk_packages(path, info.name+'.', onerror)
  File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/pkgutil.py", line 78, in walk_packages
    __import__(info.name)
  File "/root/.venv/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/cute/experimental/__init__.py", line 12, in <module>
    raise NotImplementedError(
NotImplementedError: CuTe Experimental module is only supported on Cuda toolkit 13.1 and above!
  3. test_vllm (only on RTX PRO 6000):
model = 'Intel/Qwen2-0.5B-Instruct-int4-sym-AutoRound'

    @pytest.mark.skipif(
        not current_platform.is_cpu() and not current_platform.is_xpu() and not current_platform.is_cuda(),
        reason="only supports CPU/XPU/CUDA backend.",
    )
    @pytest.mark.parametrize("model", MODELS)
    def test_auto_round(model):
        # offline inference loading test
        prompts = [
            "The capital of France is",
            "The future of AI is",
        ]
        # Create a sampling params object.
        sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
        # Create an LLM.
        QUANTIZATION = "auto-round"
        llm = LLM(
            model=model,
            quantization=QUANTIZATION,
            trust_remote_code=True,
            tensor_parallel_size=1,
            allow_deprecated_quantization=True,
        )
        # Generate texts from the prompts.
        # The output is a list of RequestOutput objects
        # that contain the prompt, generated text, and other information.
        outputs = llm.generate(prompts, sampling_params)
        # Print the outputs.
        for output in outputs:
            prompt = output.prompt
            generated_text = output.outputs[0].text
            if "France" in prompt:
>               assert "Paris" in generated_text
E               AssertionError: assert 'Paris' in ' located in the country of _____. A. Belgium B. Germany C. Italy'

Metadata

Metadata

Type

No fields configured for Bug.

Projects

No projects

Relationships

None yet

Development

No branches or pull requests

Issue actions