Skip to content

Commit 52a83c8

Browse files
committed
Refactor use_triton_kernel to use quantize_kernel_preference
Summary: This is to prepare for the addition of the flashinfer quantize kernel path in the next PR. Test Plan: python test/prototype/mx_formats/test_inference_workflow.py Reviewers: Subscribers: Tasks: Tags: ghstack-source-id: 47457cf Pull Request resolved: #3911
1 parent 5e79e5e commit 52a83c8

10 files changed

Lines changed: 196 additions & 61 deletions

File tree

test/prototype/mx_formats/test_inference_workflow.py

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import torch.nn as nn
1313
from torch.profiler import ProfilerActivity, profile
1414

15+
from torchao.prototype.mx_formats.config import QuantizeToNVFP4KernelChoice
1516
from torchao.prototype.mx_formats.inference_workflow import (
1617
MXDynamicActivationMXWeightConfig,
1718
NVFP4DynamicActivationNVFP4WeightConfig,
@@ -140,7 +141,10 @@ def test_inference_workflow_mx(
140141
@pytest.mark.parametrize("compile", [True, False])
141142
@pytest.mark.parametrize("quant_type", ["dynamic", "weight_only"])
142143
@pytest.mark.parametrize("inpt_dtype", [torch.bfloat16, torch.float32])
143-
@pytest.mark.parametrize("use_triton_kernel", [True, False])
144+
@pytest.mark.parametrize(
145+
"quantize_to_nvfp4_kernel_choice",
146+
[QuantizeToNVFP4KernelChoice.TORCH, QuantizeToNVFP4KernelChoice.MSLK],
147+
)
144148
@pytest.mark.parametrize("use_dynamic_per_tensor_scale", [True, False])
145149
@pytest.mark.parametrize(
146150
"shapes",
@@ -164,7 +168,7 @@ def test_inference_workflow_nvfp4(
164168
compile: bool,
165169
quant_type: str,
166170
inpt_dtype: torch.dtype,
167-
use_triton_kernel: bool,
171+
quantize_to_nvfp4_kernel_choice: QuantizeToNVFP4KernelChoice,
168172
use_dynamic_per_tensor_scale: bool,
169173
shapes: tuple,
170174
use_inference_mode: bool,
@@ -179,17 +183,24 @@ def test_inference_workflow_nvfp4(
179183
pytest.skip("CUDA capability >= 10.0 required for DYNAMIC float4 gemm")
180184
if quant_type == "weight_only" and compile:
181185
pytest.skip("TODO: weight_only quant currently errors w/ compile")
182-
if quant_type == "weight_only" and use_triton_kernel:
186+
if (
187+
quant_type == "weight_only"
188+
and quantize_to_nvfp4_kernel_choice == QuantizeToNVFP4KernelChoice.MSLK
189+
):
183190
pytest.skip("unsupported configuration")
184-
if use_triton_kernel and not use_dynamic_per_tensor_scale:
191+
if quantize_to_nvfp4_kernel_choice == QuantizeToNVFP4KernelChoice.MSLK and not use_dynamic_per_tensor_scale:
185192
pytest.skip("unsupported configuration")
186193

187194
if use_inference_mode and (
188-
shapes != (128, 64, 256) or inpt_dtype != torch.bfloat16 or use_triton_kernel
195+
shapes != (128, 64, 256)
196+
or inpt_dtype != torch.bfloat16
197+
or quantize_to_nvfp4_kernel_choice == QuantizeToNVFP4KernelChoice.MSLK
189198
):
190199
pytest.skip("skipping unnecessary tests for inference mode")
191200
if x_rank == 3 and (
192-
shapes != (128, 64, 256) or inpt_dtype != torch.bfloat16 or use_triton_kernel
201+
shapes != (128, 64, 256)
202+
or inpt_dtype != torch.bfloat16
203+
or quantize_to_nvfp4_kernel_choice == QuantizeToNVFP4KernelChoice.MSLK
193204
):
194205
pytest.skip("skipping unnecessary tests for x_rank 3")
195206

@@ -200,7 +211,7 @@ def test_inference_workflow_nvfp4(
200211

201212
if quant_type == "dynamic":
202213
config = NVFP4DynamicActivationNVFP4WeightConfig(
203-
use_triton_kernel=use_triton_kernel,
214+
quantize_to_nvfp4_kernel_choice=quantize_to_nvfp4_kernel_choice,
204215
use_dynamic_per_tensor_scale=use_dynamic_per_tensor_scale,
205216
)
206217
else:
@@ -218,7 +229,10 @@ def test_inference_workflow_nvfp4(
218229

219230
y_ref = m(x)
220231

221-
if use_triton_kernel and quant_type == "dynamic":
232+
if (
233+
quantize_to_nvfp4_kernel_choice == QuantizeToNVFP4KernelChoice.MSLK
234+
and quant_type == "dynamic"
235+
):
222236
with cuda_kernel_profiler("triton_quantize_nvfp4_kernel") as result:
223237
y_mx = m_mx(x)
224238
assert result["found"], "Expected quantize_nvfp4 kernel to be found"
@@ -393,7 +407,7 @@ def test_nvfp4_static_vs_dynamic_quantization():
393407
quantize_(
394408
m_dynamic,
395409
NVFP4DynamicActivationNVFP4WeightConfig(
396-
use_triton_kernel=False,
410+
quantize_to_nvfp4_kernel_choice=QuantizeToNVFP4KernelChoice.TORCH,
397411
use_dynamic_per_tensor_scale=True,
398412
),
399413
)
@@ -406,7 +420,7 @@ def test_nvfp4_static_vs_dynamic_quantization():
406420
m_static,
407421
NVFP4DynamicActivationNVFP4WeightConfig(
408422
step="prepare",
409-
use_triton_kernel=False,
423+
quantize_to_nvfp4_kernel_choice=QuantizeToNVFP4KernelChoice.TORCH,
410424
),
411425
)
412426
# Calibrate with the same input used for testing
@@ -416,7 +430,7 @@ def test_nvfp4_static_vs_dynamic_quantization():
416430
m_static,
417431
NVFP4DynamicActivationNVFP4WeightConfig(
418432
step="convert",
419-
use_triton_kernel=False,
433+
quantize_to_nvfp4_kernel_choice=QuantizeToNVFP4KernelChoice.TORCH,
420434
),
421435
)
422436

test/prototype/mx_formats/test_mx_serialization.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
import torch
1313
import torch.nn as nn
1414

15+
from torchao.prototype.mx_formats.config import QuantizeToNVFP4KernelChoice
1516
from torchao.prototype.mx_formats.inference_workflow import (
1617
MXDynamicActivationMXWeightConfig,
1718
NVFP4DynamicActivationNVFP4WeightConfig,
@@ -48,7 +49,7 @@ def test_serialization(recipe_name):
4849
else:
4950
assert recipe_name == "nvfp4", "unsupported"
5051
config = NVFP4DynamicActivationNVFP4WeightConfig(
51-
use_triton_kernel=False,
52+
quantize_to_nvfp4_kernel_choice=QuantizeToNVFP4KernelChoice.TORCH,
5253
use_dynamic_per_tensor_scale=False,
5354
)
5455

test/prototype/mx_formats/test_nvfp4_tensor.py

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import torch
1010
import torch.nn.functional as F
1111

12+
from torchao.prototype.mx_formats.config import QuantizeToNVFP4KernelChoice
1213
from torchao.prototype.mx_formats.constants import (
1314
F4_E2M1_MAX,
1415
)
@@ -383,14 +384,14 @@ def test_triton_nvfp4_quantize_equivalence(M, N, use_per_tensor_scale, dtype):
383384
x.clone(),
384385
per_tensor_scale=per_tensor_scale,
385386
is_swizzled_scales=True,
386-
use_triton_kernel=False,
387+
quantize_to_nvfp4_kernel_choice=QuantizeToNVFP4KernelChoice.TORCH,
387388
)
388389

389390
nvfp4_triton = NVFP4Tensor.to_nvfp4(
390391
x.clone(),
391392
per_tensor_scale=per_tensor_scale,
392393
is_swizzled_scales=True,
393-
use_triton_kernel=True,
394+
quantize_to_nvfp4_kernel_choice=QuantizeToNVFP4KernelChoice.MSLK,
394395
)
395396

396397
torch.testing.assert_close(nvfp4_pt.scale.flatten(), nvfp4_triton.scale.flatten())
@@ -427,7 +428,10 @@ def test_triton_nvfp4_quantize_equivalence(M, N, use_per_tensor_scale, dtype):
427428
@pytest.mark.parametrize("compile", [False])
428429
@pytest.mark.parametrize("bias", [True, False])
429430
@pytest.mark.parametrize("inpt_dtype", [torch.bfloat16, torch.float32])
430-
@pytest.mark.parametrize("use_triton_kernel", [True, False])
431+
@pytest.mark.parametrize(
432+
"quantize_to_nvfp4_kernel_choice",
433+
[QuantizeToNVFP4KernelChoice.MSLK, QuantizeToNVFP4KernelChoice.TORCH],
434+
)
431435
@pytest.mark.parametrize(
432436
"shapes",
433437
[
@@ -452,7 +456,7 @@ def test_nvfp4_matmul_with_amax(
452456
compile: bool,
453457
bias: bool,
454458
inpt_dtype: torch.dtype,
455-
use_triton_kernel: bool,
459+
quantize_to_nvfp4_kernel_choice: QuantizeToNVFP4KernelChoice,
456460
shapes: tuple,
457461
):
458462
# DYNAMIC mode requires SM100+, but WEIGHT_ONLY works on older GPUs
@@ -489,13 +493,13 @@ def test_nvfp4_matmul_with_amax(
489493
A,
490494
per_tensor_scale=a_scale,
491495
is_swizzled_scales=True,
492-
use_triton_kernel=use_triton_kernel,
496+
quantize_to_nvfp4_kernel_choice=quantize_to_nvfp4_kernel_choice,
493497
)
494498
B_nvfp4 = NVFP4Tensor.to_nvfp4(
495499
B,
496500
per_tensor_scale=b_scale,
497501
is_swizzled_scales=True,
498-
use_triton_kernel=use_triton_kernel,
502+
quantize_to_nvfp4_kernel_choice=quantize_to_nvfp4_kernel_choice,
499503
act_quant_kwargs=act_quant_kwargs,
500504
)
501505

@@ -527,7 +531,7 @@ def test_nvfp4_to_copy():
527531
assert x.act_per_tensor_scale is None
528532
assert y.act_per_tensor_scale is None
529533
assert x.block_size == y.block_size
530-
assert x.use_triton_kernel == y.use_triton_kernel
534+
assert x.quantize_to_nvfp4_kernel_choice == y.quantize_to_nvfp4_kernel_choice
531535
assert x.act_quant_kwargs == y.act_quant_kwargs
532536
assert x.dtype == torch.float32
533537
assert y.dtype == torch.bfloat16
@@ -538,7 +542,10 @@ def test_nvfp4_to_copy():
538542
not torch_version_at_least("2.8.0"), reason="NVFP4 requires PyTorch 2.8+"
539543
)
540544
@pytest.mark.parametrize("transpose", [False, True])
541-
@pytest.mark.parametrize("use_triton_kernel", [False, True])
545+
@pytest.mark.parametrize(
546+
"quantize_to_nvfp4_kernel_choice",
547+
[QuantizeToNVFP4KernelChoice.TORCH, QuantizeToNVFP4KernelChoice.MSLK],
548+
)
542549
@pytest.mark.parametrize("is_swizzled_scales", [False, True])
543550
@pytest.mark.parametrize(
544551
"shape",
@@ -551,11 +558,17 @@ def test_nvfp4_to_copy():
551558
),
552559
)
553560
def test_scale_shape_matches_qdata(
554-
transpose, use_triton_kernel, is_swizzled_scales, shape
561+
transpose, quantize_to_nvfp4_kernel_choice, is_swizzled_scales, shape
555562
):
556-
if use_triton_kernel and not is_sm_at_least_100():
563+
if (
564+
quantize_to_nvfp4_kernel_choice == QuantizeToNVFP4KernelChoice.MSLK
565+
and not is_sm_at_least_100()
566+
):
557567
pytest.skip("CUDA capability >= 10.0 required for nvfp4 triton kernel")
558-
if use_triton_kernel and not is_swizzled_scales:
568+
if (
569+
quantize_to_nvfp4_kernel_choice == QuantizeToNVFP4KernelChoice.MSLK
570+
and not is_swizzled_scales
571+
):
559572
pytest.skip("triton kernel requires swizzled scales")
560573

561574
block_size = 16
@@ -568,7 +581,7 @@ def test_scale_shape_matches_qdata(
568581
x_hp,
569582
per_tensor_scale=per_tensor_scale,
570583
is_swizzled_scales=is_swizzled_scales,
571-
use_triton_kernel=use_triton_kernel,
584+
quantize_to_nvfp4_kernel_choice=quantize_to_nvfp4_kernel_choice,
572585
)
573586

574587
if len(shape) == 2:

torchao/prototype/mx_formats/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from torchao.prototype.mx_formats.config import (
2+
QuantizeToNVFP4KernelChoice,
23
ScaleCalculationMode,
34
)
45

@@ -15,5 +16,6 @@
1516
"MXDynamicActivationMXWeightConfig",
1617
"NVFP4DynamicActivationNVFP4WeightConfig",
1718
"NVFP4ObservedLinear",
19+
"QuantizeToNVFP4KernelChoice",
1820
"NVFP4WeightOnlyConfig",
1921
]

torchao/prototype/mx_formats/config.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,20 @@ class MXFP8Dim1CastKernelChoice(Enum):
3232
TORCH = "torch"
3333

3434

35+
class QuantizeToNVFP4KernelChoice(str, Enum):
36+
"""Enum for specifying the kernel used for quantizing a high precision
37+
tensor (float32/bfloat16/float16) to nvfp4 tensor with blockwise quantization
38+
"""
39+
40+
TORCH = "torch"
41+
"""Use torch native high precision to nvfp4 quantize kernel implemented with torch ops"""
42+
43+
MSLK = "mslk"
44+
"""Use MSLK triton high precision to nvfp4 quantize kernel"""
45+
46+
47+
torch.serialization.add_safe_globals([QuantizeToNVFP4KernelChoice])
48+
3549
# register as pytree constant so we can use dynamo nonstrict trace in torchao.prototype.moe_training.ep
3650
@register_as_pytree_constant
3751
class ScaleCalculationMode(Enum):

torchao/prototype/mx_formats/inference_workflow.py

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,10 @@
1414
from torch import Tensor
1515

1616
from torchao.core.config import AOBaseConfig
17-
from torchao.prototype.mx_formats.config import _validate_elem_dtype
17+
from torchao.prototype.mx_formats.config import (
18+
QuantizeToNVFP4KernelChoice,
19+
_validate_elem_dtype,
20+
)
1821
from torchao.prototype.mx_formats.mx_tensor import (
1922
MXTensor,
2023
QuantizeTensorToMXKwargs,
@@ -23,6 +26,7 @@
2326
from torchao.prototype.mx_formats.nvfp4_tensor import (
2427
NVFP4Tensor,
2528
QuantizeTensorToNVFP4Kwargs,
29+
_handle_use_triton_kernel,
2630
per_tensor_amax_to_scale,
2731
)
2832
from torchao.quantization.quant_api import _module_extra_repr, _quantization_type
@@ -204,7 +208,7 @@ class NVFP4DynamicActivationNVFP4WeightConfig(AOBaseConfig):
204208
set to False.
205209
206210
Configuration parameters:
207-
- use_triton_kernel: bool, whether to use fused triton kernel for activation scaling (default: True).
211+
- quantize_to_nvfp4_kernel_choice: QuantizeToNVFP4KernelChoice, kernel preference for quantization (default: QuantizeToNVFP4KernelChoice.MSLK)
208212
Requires `MSLK <https://github.com/pytorch/MSLK>`__ to be installed.
209213
- use_dynamic_per_tensor_scale: bool, whether to dynamically compute per tensor scale (default: True)
210214
- step: Optional[QuantizationStep], the quantization step for observer-based flow
@@ -221,11 +225,18 @@ class NVFP4DynamicActivationNVFP4WeightConfig(AOBaseConfig):
221225
:language: python
222226
"""
223227

224-
use_triton_kernel: bool = True
228+
quantize_to_nvfp4_kernel_choice: QuantizeToNVFP4KernelChoice = (
229+
QuantizeToNVFP4KernelChoice.MSLK
230+
)
225231
use_dynamic_per_tensor_scale: bool = True
226232
step: Optional["QuantizationStep"] = None
233+
use_triton_kernel: bool = True
227234

228235
def __post_init__(self):
236+
self.quantize_to_nvfp4_kernel_choice = _handle_use_triton_kernel(
237+
self.use_triton_kernel, self.quantize_to_nvfp4_kernel_choice
238+
)
239+
229240
if isinstance(self.step, str):
230241
self.step = QuantizationStep(self.step)
231242
# Validate PyTorch version
@@ -277,7 +288,7 @@ def _nvfp4_inference_linear_transform(
277288

278289
act_quant_kwargs = QuantizeTensorToNVFP4Kwargs(
279290
use_dynamic_per_tensor_scale=False,
280-
use_triton_kernel=config.use_triton_kernel,
291+
quantize_to_nvfp4_kernel_choice=config.quantize_to_nvfp4_kernel_choice,
281292
is_swizzled_scales=True,
282293
)
283294

@@ -286,10 +297,12 @@ def _nvfp4_inference_linear_transform(
286297
per_tensor_scale=weight_per_tensor_scale,
287298
act_per_tensor_scale=act_per_tensor_scale.detach(),
288299
is_swizzled_scales=True,
289-
use_triton_kernel=False, # Always use traditional construction for weights
300+
quantize_to_nvfp4_kernel_choice=QuantizeToNVFP4KernelChoice.TORCH, # Always use traditional construction for weights
290301
act_quant_kwargs=act_quant_kwargs,
291302
)
292-
quantized_weight.use_triton_kernel = config.use_triton_kernel
303+
quantized_weight.quantize_to_nvfp4_kernel_choice = (
304+
config.quantize_to_nvfp4_kernel_choice
305+
)
293306

294307
# Create new Linear (not observed) with quantized weight
295308
linear = torch.nn.Linear(
@@ -319,18 +332,20 @@ def _nvfp4_inference_linear_transform(
319332

320333
act_quant_kwargs = QuantizeTensorToNVFP4Kwargs(
321334
use_dynamic_per_tensor_scale=config.use_dynamic_per_tensor_scale,
322-
use_triton_kernel=config.use_triton_kernel,
335+
quantize_to_nvfp4_kernel_choice=config.quantize_to_nvfp4_kernel_choice,
323336
is_swizzled_scales=True,
324337
)
325338

326339
quantized_weight = NVFP4Tensor.to_nvfp4(
327340
weight,
328341
per_tensor_scale=per_tensor_scale,
329342
is_swizzled_scales=True,
330-
use_triton_kernel=False, # Always use traditional construction for weights
343+
quantize_to_nvfp4_kernel_choice=QuantizeToNVFP4KernelChoice.TORCH, # Always use traditional construction for weights
331344
act_quant_kwargs=act_quant_kwargs,
332345
)
333-
quantized_weight.use_triton_kernel = config.use_triton_kernel
346+
quantized_weight.quantize_to_nvfp4_kernel_choice = (
347+
config.quantize_to_nvfp4_kernel_choice
348+
)
334349
setattr(
335350
module,
336351
parameter_name,

0 commit comments

Comments
 (0)