Commit b8708a2

TrainingWeightWrapperTensor base class; subclasses for FP8/MXFP8 with grouped_mm and linear overrides (#3968)
* [mxfp8 training] unified tensor subclass for training
* [mxfp8 training] remove mxfp8 from MXLinear and MXLinearConfig
* [moe training] unified tensor subclass for training
* delete MXLinear and MXLinearConfig entirely
1 parent: 7bb7f06 · commit: b8708a2

33 files changed: 973 additions & 1640 deletions
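
Taken together, the diffs below amount to an API migration: MXLinear and MXLinearConfig are deleted, and training flows instead build a config from a recipe enum via the classes in torchao.prototype.moe_training.config. A minimal sketch of the new flow, assembled only from names that appear in this commit (not authoritative beyond what the diffs show):

```python
import torch

from torchao.prototype.moe_training.config import (
    MXFP8TrainingOpConfig,
    MXFP8TrainingRecipe,
)
from torchao.quantization import quantize_

# Build an op config from a recipe enum (replaces the removed
# MXLinearConfig.from_recipe_name(...) entry point).
config = MXFP8TrainingOpConfig.from_recipe(MXFP8TrainingRecipe.MXFP8_RCEIL)

# quantize_ swaps eligible weights for the training wrapper tensor subclass.
m = torch.nn.Sequential(torch.nn.Linear(32, 32)).cuda()
quantize_(m, config)
m = torch.compile(m, fullgraph=True)
```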

.github/workflows/1xH100_tests.yml

Lines changed: 2 additions & 1 deletion
```diff
@@ -57,5 +57,6 @@ jobs:
     python test/quantization/quantize_/workflows/int4/test_int4_preshuffled_tensor.py
     ./test/float8/test_everything_single_gpu.sh
     pytest test/prototype/mx_formats/ --verbose -s
-    pytest test/prototype/moe_training/test_scaled_grouped_mm.py --verbose -s
+    pytest test/prototype/moe_training/test_fp8_grouped_mm.py --verbose -s
+    pytest test/prototype/moe_training/test_mxfp8_grouped_mm.py --verbose -s
     pytest test/prototype/moe_training/test_training.py --verbose -s
```

benchmarks/float8/bench_matmul.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -42,7 +42,6 @@ def run(
     assert recipe in (
         "tensorwise",
         "rowwise",
-        "mxfp8_cublas",
         "mxfp4_cutlass",
         "nvfp4",
     ), "unsupported"
```

benchmarks/float8/float8_roofline.py

Lines changed: 14 additions & 6 deletions
```diff
@@ -61,7 +61,10 @@
     Float8LinearConfig,
     convert_to_float8_training,
 )
-from torchao.prototype.mx_formats import MXLinearConfig
+from torchao.prototype.moe_training.config import (
+    MXFP8TrainingOpConfig,
+    MXFP8TrainingRecipe,
+)
 from torchao.quantization import quantize_
 from torchao.testing.training.roofline_utils import (
     get_float8_mem_sympy,
@@ -253,10 +256,7 @@ def run(
     print(f"enable_fusion_modeling: {enable_fusion_modeling}")

     assert mx_recipe_name in (
-        # real mxfp8_cublas recipe
-        "mxfp8_cublas",
-        # real mxfp8_cublas_rceil recipe
-        "mxfp8_cublas_rceil",
+        None,
         # modeling of what mxfp8 with 32x32 block size and without gemm
         # operand layout restrictions would look like
         "mxfp8_32x32_flexible_gemm_layout",
@@ -429,7 +429,15 @@ def run(
         )
     else:
         assert mx_recipe_name is not None
-        config = MXLinearConfig.from_recipe_name(mx_recipe_name)
+        try:
+            config = MXFP8TrainingOpConfig.from_recipe(
+                MXFP8TrainingRecipe(mx_recipe_name)
+            )
+        except ValueError:
+            raise ValueError(
+                f"Unsupported mx_recipe_name: {mx_recipe_name}. "
+                f"Supported values: {[r.value for r in MXFP8TrainingRecipe]}"
+            )
     m_fp8_dyn = copy.deepcopy(m_orig)
     quantize_(m_fp8_dyn, config=config)
     m_fp8_dyn = torch.compile(m_fp8_dyn)
```

benchmarks/float8/profile_lowp_training.py

Lines changed: 13 additions & 2 deletions
```diff
@@ -45,7 +45,10 @@
 from torchao.float8.float8_linear_utils import (
     convert_to_float8_training,
 )
-from torchao.prototype.mx_formats.config import MXLinearConfig
+from torchao.prototype.moe_training.config import (
+    MXFP8TrainingOpConfig,
+    MXFP8TrainingRecipe,
+)
 from torchao.prototype.mx_formats.mx_tensor import MXTensor
 from torchao.prototype.mx_formats.utils import to_blocked
 from torchao.quantization import quantize_
@@ -320,7 +323,15 @@ def main(
     elif float8_recipe_name is not None:
         config = Float8LinearConfig.from_recipe_name(float8_recipe_name)
     elif mx_recipe_name is not None:
-        config = MXLinearConfig.from_recipe_name(mx_recipe_name)
+        try:
+            config = MXFP8TrainingOpConfig.from_recipe(
+                MXFP8TrainingRecipe(mx_recipe_name)
+            )
+        except ValueError:
+            raise ValueError(
+                f"Unsupported mx_recipe_name: {mx_recipe_name}. "
+                f"Supported values: {[r.value for r in MXFP8TrainingRecipe]}"
+            )

     print(f"Compile is set to | {compile}")
     print(f"model_type is set to | {model_type}")
```

benchmarks/prototype/moe_training/bench_moe_layer.py

Lines changed: 13 additions & 11 deletions
```diff
@@ -16,9 +16,9 @@

 from benchmarks.utils import bench_fwd_bwd_microseconds, profile_fwd_bwd
 from torchao.prototype.moe_training.config import (
-    FP8GroupedMMRecipe,
-    MXFP8GroupedMMConfig,
-    MXFP8GroupedMMRecipe,
+    Float8TrainingRecipe,
+    MXFP8TrainingOpConfig,
+    MXFP8TrainingRecipe,
 )
 from torchao.quantization.quant_api import quantize_

@@ -58,15 +58,17 @@ def bench_moe_training_fsdp(args: argparse.Namespace):

     # Map recipe name to enum
     if recipe_name == "fp8_rowwise":
-        recipe = FP8GroupedMMRecipe.FP8_ROWWISE
+        recipe = Float8TrainingRecipe.FP8_ROWWISE
     elif recipe_name == "mxfp8_rceil":
-        recipe = MXFP8GroupedMMRecipe.MXFP8_RCEIL
+        recipe = MXFP8TrainingRecipe.MXFP8_RCEIL
     elif recipe_name == "mxfp8_rceil_wgrad_with_hp":
-        recipe = MXFP8GroupedMMRecipe.MXFP8_RCEIL_WGRAD_WITH_HP
+        recipe = MXFP8TrainingRecipe.MXFP8_RCEIL_WGRAD_WITH_HP
     else:
         raise ValueError(f"Unknown recipe: {recipe_name}")
+
+    # Check hardware requirements
     if (
-        recipe == FP8GroupedMMRecipe.FP8_ROWWISE
+        recipe == Float8TrainingRecipe.FP8_ROWWISE
         and torch.cuda.get_device_capability()
         != (
             9,
@@ -78,8 +80,8 @@ def bench_moe_training_fsdp(args: argparse.Namespace):
         )
         return

-    elif (
-        recipe == MXFP8GroupedMMRecipe.MXFP8_RCEIL
+    if (
+        recipe == MXFP8TrainingRecipe.MXFP8_RCEIL
         and torch.cuda.get_device_capability()
         != (
             10,
@@ -110,7 +112,7 @@ def bench_moe_training_fsdp(args: argparse.Namespace):
     model = copy.deepcopy(ref_model)

     # Token group alignment size must be 16 for fp8 rowwise training
-    alignment_size = 32 if recipe == MXFP8GroupedMMRecipe.MXFP8_RCEIL else 16
+    alignment_size = 32 if recipe == MXFP8TrainingRecipe.MXFP8_RCEIL else 16
     set_token_group_alignment_size_m(alignment_size)

     # assert starting params are identical for both models
@@ -125,7 +127,7 @@ def moe_module_filter_fn(mod: nn.Module, cur_fqn: str) -> bool:
         return False

     # quantize test model
-    config = MXFP8GroupedMMConfig.from_recipe(recipe)
+    config = MXFP8TrainingOpConfig.from_recipe(recipe)
     quantize_(model, config=config, filter_fn=moe_module_filter_fn)

     # inputs
```

benchmarks/prototype/moe_training/benchmark_moe_layer_fsdp.py

Lines changed: 10 additions & 10 deletions
```diff
@@ -25,9 +25,9 @@

 from benchmarks.utils import bench_fwd_bwd_microseconds, profile_fwd_bwd
 from torchao.prototype.moe_training.config import (
-    FP8GroupedMMRecipe,
-    MXFP8GroupedMMConfig,
-    MXFP8GroupedMMRecipe,
+    Float8TrainingRecipe,
+    MXFP8TrainingOpConfig,
+    MXFP8TrainingRecipe,
 )
 from torchao.quantization.quant_api import quantize_

@@ -48,15 +48,15 @@ def bench_moe_training_fsdp(recipe_name: str, enable_profile: bool, use_compile:
     assert recipe_name in ["fp8_rowwise", "mxfp8_rceil", "mxfp8_rceil_wgrad_with_hp"]
     # Map recipe names to enums
     if recipe_name.upper() == "fp8_rowwise":
-        recipe = FP8GroupedMMRecipe.FP8_ROWWISE
+        recipe = Float8TrainingRecipe.FP8_ROWWISE
     elif recipe_name.upper() == "mxfp8_rceil":
-        recipe = MXFP8GroupedMMRecipe.MXFP8_RCEIL
+        recipe = MXFP8TrainingRecipe.MXFP8_RCEIL
     elif recipe_name.upper() == "mxfp8_rceil_wgrad_with_hp":
-        recipe = MXFP8GroupedMMRecipe.MXFP8_RCEIL_WGRAD_WITH_HP
+        recipe = MXFP8TrainingRecipe.MXFP8_RCEIL_WGRAD_WITH_HP
     else:
         raise ValueError(f"Unknown recipe: {recipe_name}")
     if (
-        recipe == FP8GroupedMMRecipe.FP8_ROWWISE
+        recipe == Float8TrainingRecipe.FP8_ROWWISE
         and torch.cuda.get_device_capability()
         != (
             9,
@@ -69,7 +69,7 @@ def bench_moe_training_fsdp(recipe_name: str, enable_profile: bool, use_compile:
         return

     elif (
-        recipe == MXFP8GroupedMMRecipe.MXFP8_RCEIL
+        recipe == MXFP8TrainingRecipe.MXFP8_RCEIL
         and torch.cuda.get_device_capability()
         != (
             10,
@@ -104,7 +104,7 @@ def bench_moe_training_fsdp(recipe_name: str, enable_profile: bool, use_compile:
     model = copy.deepcopy(ref_model)

     # Token group alignment size must be 16 for fp8 rowwise training
-    alignment_size = 32 if recipe == MXFP8GroupedMMRecipe.MXFP8_RCEIL else 16
+    alignment_size = 32 if recipe == MXFP8TrainingRecipe.MXFP8_RCEIL else 16
     set_token_group_alignment_size_m(alignment_size)

     # assert starting params are identical for both models
@@ -119,7 +119,7 @@ def moe_module_filter_fn(mod: nn.Module, cur_fqn: str) -> bool:
         return False

     # quantize test model
-    config = MXFP8GroupedMMConfig.from_recipe(recipe)
+    config = MXFP8TrainingOpConfig.from_recipe(recipe)
     quantize_(model, config=config, filter_fn=moe_module_filter_fn)

     # FSDP2
```
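
For reference, the hardware and alignment requirements that the two MoE benchmarks above encode, collected in one place. The helper names here are hypothetical; the capability and alignment values come straight from the diffs:

```python
import torch

from torchao.prototype.moe_training.config import (
    Float8TrainingRecipe,
    MXFP8TrainingRecipe,
)


def recipe_supported_on_this_gpu(recipe) -> bool:
    # Hypothetical helper mirroring the guards in the benchmarks above.
    cc = torch.cuda.get_device_capability()
    if recipe == Float8TrainingRecipe.FP8_ROWWISE:
        # fp8 rowwise grouped mm requires compute capability 9.0
        # (the dq benchmark below additionally allows ROCm MI300/MI350).
        return cc == (9, 0)
    # mxfp8 recipes require compute capability 10.0.
    return cc == (10, 0)


def token_group_alignment_m(recipe) -> int:
    # Token groups are padded to a multiple of 16 for fp8 rowwise
    # and 32 for mxfp8 before the grouped mm is called.
    return 16 if recipe == Float8TrainingRecipe.FP8_ROWWISE else 32
```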

benchmarks/prototype/moe_training/benchmark_scaled_grouped_mm_dq.py

Lines changed: 18 additions & 17 deletions
```diff
@@ -19,14 +19,16 @@
     bench_fwd_microseconds,
     profile_fwd_bwd,
 )
-from torchao.prototype.moe_training import _quantize_then_scaled_grouped_mm
 from torchao.prototype.moe_training.config import (
-    FP8GroupedMMConfig,
-    FP8GroupedMMRecipe,
-    MXFP8GroupedMMConfig,
-    MXFP8GroupedMMRecipe,
+    Float8TrainingOpConfig,
+    Float8TrainingRecipe,
+    MXFP8TrainingOpConfig,
+    MXFP8TrainingRecipe,
+)
+from torchao.prototype.moe_training.utils import (
+    _quantize_then_scaled_grouped_mm,
+    generate_jagged_offs,
 )
-from torchao.prototype.moe_training.utils import generate_jagged_offs
 from torchao.utils import is_MI300, is_MI350, is_ROCM

 device = torch.device("cuda")
@@ -42,7 +44,7 @@
 class ExperimentConfig:
     high_precision_dtype: torch.dtype
     MNKG: tuple[int]
-    recipe: Union[FP8GroupedMMRecipe, MXFP8GroupedMMRecipe]
+    recipe: Union[Float8TrainingRecipe, MXFP8TrainingRecipe]


 @dataclass(frozen=True)
@@ -92,9 +94,8 @@ def get_configs() -> List[ExperimentConfig]:
         (128000, 2048, 7168, 8),
     ]
     recipes = [
-        FP8GroupedMMRecipe.FP8_ROWWISE,
-        MXFP8GroupedMMRecipe.MXFP8_RCEIL,
-        MXFP8GroupedMMRecipe.MXFP8_RCEIL_WGRAD_WITH_HP,
+        MXFP8TrainingRecipe.MXFP8_RCEIL,
+        MXFP8TrainingRecipe.MXFP8_RCEIL_WGRAD_WITH_HP,
     ]
     high_precision_dtypes = [torch.bfloat16]
     configs = []
@@ -138,7 +139,7 @@ def run_experiment(
     # - the transposed tensor in col-major format with groups along the row dimension,
     #   which represents the right operand.
     token_group_alignment_size = (
-        16 if config.recipe == FP8GroupedMMRecipe.FP8_ROWWISE else 32
+        16 if config.recipe == Float8TrainingRecipe.FP8_ROWWISE else 32
     )

     offs = generate_jagged_offs(G, total_M, multiple_of=token_group_alignment_size)
@@ -170,10 +171,10 @@ def run_experiment(
     )

     # Create config object from recipe
-    if isinstance(config.recipe, FP8GroupedMMRecipe):
-        quant_config = FP8GroupedMMConfig.from_recipe(config.recipe)
+    if isinstance(config.recipe, Float8TrainingRecipe):
+        quant_config = Float8TrainingOpConfig.from_recipe(config.recipe)
     else:
-        quant_config = MXFP8GroupedMMConfig.from_recipe(config.recipe)
+        quant_config = MXFP8TrainingOpConfig.from_recipe(config.recipe)

     # fwd_bwd scaled benchmark + profiling
     scaled_fwd_bwd_us = bench_fwd_bwd_microseconds(
@@ -261,7 +262,7 @@ def main(args: argparse.Namespace):
     configs = get_configs()
     results = []
     for config in tqdm(configs):
-        if config.recipe == FP8GroupedMMRecipe.FP8_ROWWISE:
+        if config.recipe == Float8TrainingRecipe.FP8_ROWWISE:
             if is_ROCM():
                 if not (is_MI300() or is_MI350()):
                     logging.warning(
@@ -276,8 +277,8 @@ def main(args: argparse.Namespace):
             continue

         elif config.recipe in (
-            MXFP8GroupedMMRecipe.MXFP8_RCEIL,
-            MXFP8GroupedMMRecipe.MXFP8_RCEIL_WGRAD_WITH_HP,
+            MXFP8TrainingRecipe.MXFP8_RCEIL,
+            MXFP8TrainingRecipe.MXFP8_RCEIL_WGRAD_WITH_HP,
         ) and torch.cuda.get_device_capability() != (10, 0):
             logging.warning(
                 f"Skipping MXFP8 benchmarks, only supported on compute capability 10.0 and found {torch.cuda.get_device_capability()}"
```

docs/source/workflows/training.md

Lines changed: 12 additions & 18 deletions
````diff
@@ -266,7 +266,7 @@ with torch.inference_mode():
 ## mxfp8

 e2e training with mxfp8 from the [MX OCP spec](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf)
-in native PyTorch. 
+in native PyTorch.

 > :warning: We are currently in prototype. Use nightly versions of PyTorch and torchao (or build from source) for best results.

@@ -336,25 +336,19 @@ Below is a toy training loop. For an example real training loop, see our torchti
 ```python
 import torch
 from torchao.quantization import quantize_
-import torchao.prototype.mx_formats
-from torchao.prototype.mx_formats import MXLinearConfig, ScaleCalculationMode
-from torchao.quantization.quantize_.common import KernelPreference
-
-# low precision gemm, requires CUDA capability 10.0+
-kernel_preference = KernelPreference.AUTO
-# or, emulated gemm
-# kernel_preference = KernelPreference.EMULATED
-
-scale_calculation_mode = ScaleCalculationMode.FLOOR
-# other supported modes: RCEIL, CEIL, EVEN
+from torchao.prototype.moe_training.config import MXFP8TrainingOpConfig, MXFP8TrainingRecipe
+from torchao.prototype.mx_formats import ScaleCalculationMode
+
+# create config from a recipe
+config = MXFP8TrainingOpConfig.from_recipe(MXFP8TrainingRecipe.MXFP8_RCEIL)
+# or manually configure
+# config = MXFP8TrainingOpConfig(
+#     kernel_preference=KernelPreference.AUTO,  # or KernelPreference.EMULATED
+#     scale_calculation_mode=ScaleCalculationMode.RCEIL,  # or FLOOR, CEIL, EVEN
+#     wgrad_with_hp=False,  # True to compute grad_weight in high precision
+# )

 m = torch.nn.Sequential(torch.nn.Linear(32, 32)).cuda()
-config = MXLinearConfig(
-    elem_dtype=torch.float8_e4m3fn,
-    block_size=32,
-    kernel_preference=kernel_preference,
-    scale_calculation_mode=scale_calculation_mode,
-)
 quantize_(m, config)
 m = torch.compile(m, fullgraph=True)
````
