@functools.cache
def _load_finegrained_fp8_kernel() -> FineGrainedFP8:
"""
Load the finegrained-fp8 Triton kernel once and return its entry points.
Raises `ImportError` if the `kernels` package is missing, or the kernel or required
symbols cannot be found.
"""
if not is_kernels_available():
> raise ImportError(
"finegrained-fp8 kernel requires the `kernels` package. Install it with `pip install -U kernels`."
)
E ImportError: finegrained-fp8 kernel requires the `kernels` package. Install it with `pip install -U kernels`.
[2026-05-25 05:29:23] Scheduler hit an exception: Traceback (most recent call last):
File "/root/.venv/lib/python3.12/site-packages/sglang/srt/model_executor/cuda_graph_runner.py", line 736, in __init__
self.capture()
File "/root/.venv/lib/python3.12/site-packages/sglang/srt/model_executor/cuda_graph_runner.py", line 918, in capture
_capture_one_stream()
File "/root/.venv/lib/python3.12/site-packages/sglang/srt/model_executor/cuda_graph_runner.py", line 906, in _capture_one_stream
) = self.capture_one_batch_size(bs, forward, stream_idx)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/sglang/srt/model_executor/cuda_graph_runner.py", line 1187, in capture_one_batch_size
run_once()
File "/root/.venv/lib/python3.12/site-packages/sglang/srt/model_executor/cuda_graph_runner.py", line 1165, in run_once
logits_output_or_pp_proxy_tensors = forward(
^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 124, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/sglang/srt/models/qwen3.py", line 517, in forward
hidden_states = self.model(
^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/sglang/srt/models/qwen2.py", line 367, in forward
hidden_states, residual = layer(
^^^^^^
File "/root/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/sglang/srt/models/qwen3.py", line 393, in forward
hidden_states, residual = self.layer_communicator.prepare_attn(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/sglang/srt/layers/communicator.py", line 582, in prepare_attn
hidden_states = self.input_layernorm(hidden_states)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1779, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1790, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/sglang/srt/layers/utils/multi_platform.py", line 83, in forward
return self._forward_method(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/sglang/srt/layers/layernorm.py", line 261, in forward_cuda
out = rmsnorm(x, self.weight.data, self.variance_epsilon)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/sgl_kernel/elementwise.py", line 120, in rmsnorm
return _flashinfer_norm.rmsnorm(input, weight, eps, out, enable_pdl)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/flashinfer/api_logging.py", line 1546, in _auto_dump_wrapper
return _inner(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/flashinfer/norm/__init__.py", line 144, in rmsnorm
_rmsnorm_impl(out, input, weight, eps, enable_pdl)
File "/root/.venv/lib/python3.12/site-packages/flashinfer/norm/__init__.py", line 166, in _rmsnorm_impl
rmsnorm_cute(
File "/root/.venv/lib/python3.12/site-packages/flashinfer/norm/kernels/rmsnorm.py", line 1302, in rmsnorm_cute
kernel = _get_compiled_rmsnorm_kernel(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/flashinfer/norm/kernels/rmsnorm.py", line 1143, in _get_compiled_rmsnorm_kernel
compiled_kernel = cute.compile(
^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/compiler.py", line 582, in __call__
return self._compile(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/compiler.py", line 661, in _compile
return func._dsl_object._func(func, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/dsl.py", line 2214, in _func
result = self.generate_mlir(
^^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/dsl.py", line 1835, in generate_mlir
module, module_hash, result = self.generate_original_ir(
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/dsl.py", line 1600, in generate_original_ir
module_hash = self.get_module_hash(module, function_name)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/dsl.py", line 1472, in get_module_hash
hash_obj = self.get_version().copy()
^^^^^^^^^^^^^^^^^^
File "/root/.venv/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/cutlass_dsl/cutlass.py", line 519, in get_version
for lib in pkgutil.walk_packages([dsl_path], prefix="cutlass."):
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/pkgutil.py", line 93, in walk_packages
yield from walk_packages(path, info.name+'.', onerror)
File "/root/.local/share/uv/python/cpython-3.12.13-linux-x86_64-gnu/lib/python3.12/pkgutil.py", line 78, in walk_packages
__import__(info.name)
File "/root/.venv/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/cute/experimental/__init__.py", line 12, in <module>
raise NotImplementedError(
NotImplementedError: CuTe Experimental module is only supported on Cuda toolkit 13.1 and above!
model = 'Intel/Qwen2-0.5B-Instruct-int4-sym-AutoRound'
@pytest.mark.skipif(
not current_platform.is_cpu() and not current_platform.is_xpu() and not current_platform.is_cuda(),
reason="only supports CPU/XPU/CUDA backend.",
)
@pytest.mark.parametrize("model", MODELS)
def test_auto_round(model):
# offline inference loading test
prompts = [
"The capital of France is",
"The future of AI is",
]
# Create a sampling params object.
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Create an LLM.
QUANTIZATION = "auto-round"
llm = LLM(
model=model,
quantization=QUANTIZATION,
trust_remote_code=True,
tensor_parallel_size=1,
allow_deprecated_quantization=True,
)
# Generate texts from the prompts.
# The output is a list of RequestOutput objects
# that contain the prompt, generated text, and other information.
outputs = llm.generate(prompts, sampling_params)
# Print the outputs.
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
if "France" in prompt:
> assert "Paris" in generated_text
E AssertionError: assert 'Paris' in ' located in the country of _____. A. Belgium B. Germany C. Italy'