 
 import numpy as np
 import torch
-from torch._inductor.runtime.triton_helpers import libdevice
 from torch.distributed.tensor import Replicate, Shard
 from torch.distributed.tensor.experimental import register_sharding
 from torch.utils._triton import has_triton
@@ -30,6 +29,8 @@
     torch_version_at_least,
 )
 
+_is_xpu = is_XPU()
+
 logger = logging.getLogger(__name__)
 
 
@@ -58,8 +59,6 @@ def get_bits(x: torch.Tensor) -> str:
 ZERO_BITS_F32 = 0x0
 ZERO_POINT_FIVE_BITS_F32 = 0x3F000000
 
-_is_xpu = is_XPU()
-
 
 def f32_to_f4_unpacked(x):
     """
@@ -173,6 +172,9 @@ def pack_uint4(uint8_data: torch.Tensor) -> torch.Tensor:
     import triton
     import triton.language as tl
     from torch.library import triton_op, wrap_triton
+    from triton.language.extra import libdevice
+
+    IS_XPU = tl.constexpr(_is_xpu)
 
     def triton_to_mxfp8_dim1_reference(
         x_hp: torch.Tensor,
@@ -231,7 +233,6 @@ def triton_mxfp8_dequant_dim0(
             e8m0_scales.size(1),
             out_dtype=out_dtype_tl,
             SCALE_BLOCK_SIZE=scale_block_size,
-            is_xpu=_is_xpu,
         )
         return out_buffer.reshape(orig_shape)
 
@@ -275,7 +276,6 @@ def _dequant_mxfp8_kernel(
         SCALE_BLOCK_SIZE: tl.constexpr,
         ROW_TILE_SIZE: tl.constexpr,
         COL_TILE_SIZE: tl.constexpr,
-        is_xpu: tl.constexpr,
     ):
         pid_row = tl.program_id(0)
         pid_col = tl.program_id(1)
@@ -307,7 +307,7 @@ def _dequant_mxfp8_kernel(
         e8m0_scale_block_r = e8m0_scale_block.reshape(
             ROW_TILE_SIZE * SCALE_BLOCKS_PER_COL_TILE, 1
         )
-        fp32_scale = _e8m0_to_fp32(e8m0_scale_block_r, is_xpu)
+        fp32_scale = _e8m0_to_fp32(e8m0_scale_block_r)
         data_hp = e4m3_data_block_r.to(tl.float32) * fp32_scale
 
         # Write to output buffer
@@ -316,11 +316,11 @@ def _dequant_mxfp8_kernel(
         tl.store(out_buffer + block_offs, out_buffer_block, mask=mask)
 
     @triton.jit
-    def _e8m0_to_fp32(scale_e8m0, is_xpu: tl.constexpr):
+    def _e8m0_to_fp32(scale_e8m0):
         e8m0_nan_val = 255
         e8m0_exponent_bias = 127
         s_offset = scale_e8m0.to(tl.int16) - e8m0_exponent_bias
-        if is_xpu:
+        if IS_XPU:
             s_fp = libdevice.exp2(s_offset.to(tl.float32))
         else:
             s_fp = tl.exp2(s_offset.to(tl.float32))
@@ -479,12 +479,7 @@ def triton_mxfp8_dequant_dim0(
 )
 
 if _triton_kernels_available:
-    import triton
-    import triton.language as tl
-    from torch.library import triton_op, wrap_triton
-
     IS_ROCM = tl.constexpr(is_ROCM())
-    IS_XPU = tl.constexpr(_is_xpu)
 
     @triton.jit
     def _calculate_reciprocal_scale(scale_e8m0_biased):
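
The net effect of this diff is that XPU dispatch moves from a runtime kernel argument to a module-level `tl.constexpr` flag: Triton specializes the kernel at compile time, the untaken branch is folded away, and callers no longer have to thread `is_xpu` through every launch. Below is a minimal standalone sketch of that pattern; the kernel and names are hypothetical (not the torchao code), and `torch.xpu.is_available()` is an assumed stand-in for the commit's `is_XPU()` probe.

```python
import torch
import triton
import triton.language as tl
from triton.language.extra import libdevice

# Assumption: probe the device once at import time, as the commit does with is_XPU().
_is_xpu = hasattr(torch, "xpu") and torch.xpu.is_available()
IS_XPU = tl.constexpr(_is_xpu)  # captured as a compile-time constant by @triton.jit


@triton.jit
def _exp2_kernel(x_ptr, out_ptr, n, BLOCK: tl.constexpr):
    offs = tl.program_id(0) * BLOCK + tl.arange(0, BLOCK)
    mask = offs < n
    x = tl.load(x_ptr + offs, mask=mask)
    # Because IS_XPU is a constexpr, this branch is resolved at compile time,
    # mirroring how _e8m0_to_fp32 picks libdevice.exp2 vs tl.exp2 above.
    if IS_XPU:
        y = libdevice.exp2(x)
    else:
        y = tl.exp2(x)
    tl.store(out_ptr + offs, y, mask=mask)
```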
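For reference, `_e8m0_to_fp32` decodes an E8M0 scale, which stores only a biased 8-bit exponent: the fp32 value is 2^(e - 127), with 255 reserved as the NaN sentinel (`e8m0_nan_val`; its handling falls outside the shown hunk). A hedged eager-mode equivalent, omitting that NaN case:

```python
import torch

# Eager-mode sketch of the e8m0 -> fp32 decode performed by _e8m0_to_fp32
# (the 255 NaN sentinel is not handled here).
def e8m0_to_fp32_reference(scale_e8m0: torch.Tensor) -> torch.Tensor:
    e8m0_exponent_bias = 127
    s_offset = scale_e8m0.to(torch.int16) - e8m0_exponent_bias
    return torch.exp2(s_offset.to(torch.float32))


# Example: a biased exponent of 130 decodes to 2**(130 - 127) = 8.0.
print(e8m0_to_fp32_reference(torch.tensor([130], dtype=torch.uint8)))  # tensor([8.])
```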