
Commit 8c24bb9

Update on "Add support for flashinfer quantize kernel option for nvfp4"
Summary: Added the flashinfer option for better performance on some of the workflows we are interested in. Also added a numerical equivalence test between the different nvfp4_quantize_kernel_choice options.

Test Plan: pytest test/prototype/mx_formats/test_nvfp4_tensor.py -k test_kernel_preference_numerical_equivalence

We'll test speedup a bit later.

Reviewers:

Subscribers:

Tasks:

Tags:

[ghstack-poisoned]
2 parents 311704c + 04782bf commit 8c24bb9
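
As a rough sketch of the numerical-equivalence idea behind the test plan (not the actual test: the module path, argument defaults, and the exact spelling of the new kernel-choice option are assumptions), the same tensor can be quantized through two NVFP4 kernel paths and the packed payloads compared:

```python
# Hedged sketch only: compare two NVFP4 quantization kernel paths for
# bit-equivalence, mirroring the idea behind
# test_kernel_preference_numerical_equivalence. Module path and keyword
# arguments are assumptions based on this diff; the flashinfer kernel
# choice itself is selected via the option added in this commit.
import torch
from torchao.prototype.mx_formats.nvfp4_tensor import NVFP4Tensor

x = torch.randn(256, 256, dtype=torch.bfloat16, device="cuda")
per_tensor_scale = torch.tensor(1.0, dtype=torch.float32, device="cuda")

ref = NVFP4Tensor.to_nvfp4(x, per_tensor_scale=per_tensor_scale)
tri = NVFP4Tensor.to_nvfp4(
    x,
    per_tensor_scale=per_tensor_scale,
    use_triton_kernel=True,
    is_swizzled_scales=True,
)

# the packed fp4 payload should be identical regardless of which kernel produced it
assert torch.equal(ref.qdata.view(torch.uint8), tri.qdata.view(torch.uint8))
```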

188 files changed

Lines changed: 14810 additions & 4256 deletions


.github/workflows/1xH100_tests.yml

Lines changed: 2 additions & 1 deletion
@@ -57,5 +57,6 @@ jobs:
       python test/quantization/quantize_/workflows/int4/test_int4_preshuffled_tensor.py
       ./test/float8/test_everything_single_gpu.sh
       pytest test/prototype/mx_formats/ --verbose -s
-      pytest test/prototype/moe_training/test_scaled_grouped_mm.py --verbose -s
+      pytest test/prototype/moe_training/test_fp8_grouped_mm.py --verbose -s
+      pytest test/prototype/moe_training/test_mxfp8_grouped_mm.py --verbose -s
       pytest test/prototype/moe_training/test_training.py --verbose -s

.github/workflows/4xH100_tests.yml

Lines changed: 0 additions & 2 deletions
@@ -47,6 +47,4 @@ jobs:
       uv pip install -r dev-requirements.txt
       pip install . --no-build-isolation
       ./test/float8/test_everything_multi_gpu.sh
-      ./test/prototype/mx_formats/test_mx_dtensor.sh
       ./test/prototype/mx_formats/test_mxfp8_allgather.sh
-      ./test/prototype/moe_training/test_distributed.sh

.github/workflows/claude-code.yml

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+name: Claude Code
+
+on:
+  issue_comment:
+    types: [created]
+  issues:
+    types: [opened]
+
+jobs:
+  claude-code:
+    uses: pytorch/test-infra/.github/workflows/_claude-code.yml@main
+    permissions:
+      contents: read
+      pull-requests: write
+      issues: write
+      id-token: write
+    secrets: inherit

CLAUDE.md

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+# TorchAO Claude Instructions
+
+Fill me in

README.md

Lines changed: 11 additions & 0 deletions
@@ -110,6 +110,17 @@ pip install torchao
 
 Please see the [torchao compability table](https://github.com/pytorch/ao/issues/2919) for version requirements for dependencies.
 
+### Optional Dependencies
+
+[MSLK](https://github.com/pytorch/MSLK) is an optional runtime dependency that provides accelerated kernels for some of the workflows in torchao. Stable MSLK should be used with stable torchao, and nightly MSLK with nightly torchao.
+```bash
+# Stable
+pip install mslk-cuda==1.0.0
+
+# Nightly
+pip install --pre mslk --index-url https://download.pytorch.org/whl/nightly/cu128
+```
+
 ## 🔎 Inference
 
 TorchAO delivers substantial performance gains with minimal code changes:

benchmarks/float8/bench_matmul.py

Lines changed: 0 additions & 1 deletion
@@ -42,7 +42,6 @@ def run(
     assert recipe in (
         "tensorwise",
         "rowwise",
-        "mxfp8_cublas",
         "mxfp4_cutlass",
         "nvfp4",
     ), "unsupported"

benchmarks/float8/float8_inference_roofline.py

Lines changed: 61 additions & 9 deletions
@@ -58,6 +58,7 @@
 )
 from torchao.quantization.quantize_.common import KernelPreference
 from torchao.testing.training.roofline_utils import (
+    get_inference_bf16_activation_mem_sympy,
     get_inference_float8_mem_sympy,
     get_inference_gemm_time_sympy,
 )
@@ -111,7 +112,7 @@ def get_gemm_times(
 
     bf16_time_s = get_gpu_kernel_gemm_time_s(torch.mm, x_bf16, w_bf16)
 
-    if recipe_name in ("mxfp4_cutlass", "nvfp4"):
+    if recipe_name in ("mxfp4_cutlass", "nvfp4", "nvfp4_static"):
         d1, d2, d3 = torch.float4_e2m1fn_x2, torch.float4_e2m1fn_x2, torch.bfloat16
         A = torch.randint(0, 255, (M, K // 2), device=device, dtype=torch.uint8).view(
             d1
@@ -150,7 +151,7 @@ def get_gemm_times(
         scale_b = torch.ones(N, K // 32, device=device, dtype=torch.float8_e8m0fnu)
         scale_a = to_blocked(scale_a)
         scale_b = to_blocked(scale_b)
-    elif recipe_name == "nvfp4":
+    elif recipe_name in ("nvfp4", "nvfp4_static"):
         scale_a = torch.ones(M, K // 16, device=device, dtype=torch.float8_e4m3fn)
         scale_b = torch.ones(N, K // 16, device=device, dtype=torch.float8_e4m3fn)
         scale_a = to_blocked(scale_a)
@@ -176,7 +177,7 @@ def do_matmul(A, B):
                 swizzle_b=SwizzleType.SWIZZLE_32_4_4,
                 output_dtype=d3,
             )
-        if recipe_name == "nvfp4":
+        if recipe_name in ("nvfp4", "nvfp4_static"):
             return torch._scaled_mm(
                 A, B, scale_a, scale_b, out_dtype=d3, use_fast_accum=False
             )
@@ -468,8 +469,8 @@ def _stack_layers_conv(
 
 
 def run(
-    outfile: str,
     recipe_name: str,
+    outfile: str | None = None,
     do_benchmarks: bool = True,
     shape_gen_name: str = "pow2",
     M: Optional[int] = None,
@@ -485,6 +486,7 @@ def run(
     kernel_size: Optional[int] = None,
     stride: int = 1,
     padding: int = 0,
+    skip_printing_detailed_metrics: bool = False,
 ):
     """
     Args:
@@ -500,6 +502,8 @@ def run(
     * `kernel_size`: kernel_size for conv3d / conv2d
     * `stride`: stride for conv ops (default: 1)
     * `padding`: padding for conv ops (default: 0)
+    * `skip_printing_detailed_metrics`: if True, prints e2e roofline
+      and observed speedups only, skipping all other intermediate metrics
     """
     _SUPPORTED_OPS = ["linear", "conv2d", "conv3d"]
     assert op_name in _SUPPORTED_OPS, (
@@ -561,6 +565,11 @@ def run(
         # TODO(future): also enable fusion modeling here
     )
     bf16_gemm_time_sympy = get_inference_gemm_time_sympy(M, K, N, torch.bfloat16, None)
+    if enable_fusion_modeling and op_name == "linear":
+        bf16_ovhd_time_sympy = get_inference_bf16_activation_mem_sympy(M, K, N)
+    else:
+        # multiply by M to ensure we get a sympy symbol
+        bf16_ovhd_time_sympy = M * 0
 
     if recipe_name and recipe_name.startswith(("nvfp4", "mxfp4")):
         fp8_gemm_time_sympy = get_inference_gemm_time_sympy(
@@ -572,6 +581,7 @@ def run(
             M, K, N, torch.float8_e4m3fn, gemm_recipe_name
         )
     print("bf16_gemm_time_sympy", bf16_gemm_time_sympy)
+    print("bf16_ovhd_time_sympy", bf16_ovhd_time_sympy)
    print("fp8_gemm_time_sympy", fp8_gemm_time_sympy)
    print("fp8_ovhd_time_sympy", fp8_ovhd_time_sympy)
    print()
@@ -587,6 +597,8 @@ def run(
        # roofline - gemm time (fwd + bwd, 3 gemms; for conv: using equivalent implicit gemm dims)
        "r_bf16_gemm_s",
        "r_fp8_gemm_s",
+       # roofline - bf16 overhead time (read-write prev activation, only if fusion modeling is on)
+       "r_bf16_ovhd_s",
        # roofline - fp8 overhead time (by counting reads/writes in the ideal case)
        "r_fp8_ovhd_s",
        # roofline - fp8 gemm + fp8 overhead time (does not include LN or sigmoid)
@@ -628,11 +640,16 @@ def run(
            )
 
            # note: cast from sympy.core.numbers.Float to float to make pandas formatting work
+           r_bf16_ovhd_time_s = float(
+               bf16_ovhd_time_sympy.subs(M, M_val).subs(K, K_val).subs(N, N_val)
+           )
            r_fp8_ovhd_time_s = float(
                fp8_ovhd_time_sympy.subs(M, M_val).subs(K, K_val).subs(N, N_val)
            )
            r_fp8_gemm_and_ovhd_s = r_fp8_gemm_time_s + r_fp8_ovhd_time_s
-           r_speedup = r_bf16_gemm_time_s / (r_fp8_gemm_time_s + r_fp8_ovhd_time_s)
+           r_speedup = (r_bf16_gemm_time_s + r_bf16_ovhd_time_s) / (
+               r_fp8_gemm_time_s + r_fp8_ovhd_time_s
+           )
 
            # if enabled, also measured observed gemm time
            b_bf16_gemm_time_s, b_fp8_gemm_time_s = 0, 0
@@ -679,11 +696,16 @@ def run(
            r_fp8_gemm_time_s = float(
                fp8_gemm_time_sympy.subs(M, gemm_M).subs(K, gemm_K).subs(N, gemm_N)
            )
+           r_bf16_ovhd_time_s = float(
+               bf16_ovhd_time_sympy.subs(M, M_val).subs(K, K_val).subs(N, N_val)
+           )
            r_fp8_ovhd_time_s = float(
                fp8_ovhd_time_sympy.subs(M, gemm_M).subs(K, gemm_K).subs(N, gemm_N)
            )
            r_fp8_gemm_and_ovhd_s = r_fp8_gemm_time_s + r_fp8_ovhd_time_s
-           r_speedup = r_bf16_gemm_time_s / (r_fp8_gemm_time_s + r_fp8_ovhd_time_s)
+           r_speedup = (r_bf16_gemm_time_s + r_bf16_ovhd_time_s) / (
+               r_fp8_gemm_time_s + r_fp8_ovhd_time_s
+           )
 
            # measure actual conv kernel times (without quant overhead)
            b_bf16_gemm_time_s, b_fp8_gemm_time_s = 0, 0
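
Note on the roofline change above: with fusion modeling on, the bf16 baseline now also pays for reading and writing the previous activation, so the modeled speedup becomes (bf16_gemm + bf16_ovhd) / (fp8_gemm + fp8_ovhd) rather than bf16_gemm / (fp8_gemm + fp8_ovhd). A tiny worked example with made-up numbers:

```python
# Illustrative numbers only (not measured), to show the effect of counting
# bf16 activation overhead on the baseline side of the roofline speedup.
r_bf16_gemm_time_s = 100e-6
r_bf16_ovhd_time_s = 20e-6  # read/write of the previous activation
r_fp8_gemm_time_s = 60e-6
r_fp8_ovhd_time_s = 30e-6

old_speedup = r_bf16_gemm_time_s / (r_fp8_gemm_time_s + r_fp8_ovhd_time_s)
new_speedup = (r_bf16_gemm_time_s + r_bf16_ovhd_time_s) / (
    r_fp8_gemm_time_s + r_fp8_ovhd_time_s
)
print(f"{old_speedup:.2f} -> {new_speedup:.2f}")  # 1.11 -> 1.33
```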
@@ -773,12 +795,29 @@ def run(
         )
     elif recipe_name == "nvfp4":
         config = NVFP4DynamicActivationNVFP4WeightConfig(
-            use_dynamic_per_tensor_scale=False,
+            use_dynamic_per_tensor_scale=True,
+        )
+    elif recipe_name == "nvfp4_static":
+        config_calib = NVFP4DynamicActivationNVFP4WeightConfig(
+            step="prepare",
+        )
+        config = NVFP4DynamicActivationNVFP4WeightConfig(
+            step="convert",
         )
     else:
         assert False, "unsupported"
 
     m_fp8_dyn = copy.deepcopy(m_orig)
+
+    if recipe_name == "nvfp4_static":
+        # calibrate with sample data
+        # this benchmark is performance-only, so a toy datum is fine
+        quantize_(m_fp8_dyn, config_calib)
+        toy_datum = torch.randn(
+            M_val, K_val, dtype=torch.bfloat16, device="cuda"
+        )
+        m_fp8_dyn(toy_datum)
+
     if op_name == "linear":
         quantize_(m_fp8_dyn, config)
     elif op_name == "conv2d":
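
Pulled out of the benchmark code above, the `nvfp4_static` path is a two-step prepare/convert flow with a calibration forward pass in between. A minimal standalone sketch (the config's import path is an assumption; the `step` values and `quantize_` calls mirror the diff):

```python
# Hedged sketch of the static NVFP4 flow used above: prepare -> calibrate -> convert.
# The config import path is an assumption; step="prepare"/"convert" and the
# quantize_ calls mirror this diff.
import copy
import torch
from torchao.quantization import quantize_
from torchao.prototype.mx_formats import NVFP4DynamicActivationNVFP4WeightConfig  # path assumed

model = torch.nn.Sequential(torch.nn.Linear(4096, 4096)).cuda().bfloat16()
m_static = copy.deepcopy(model)

# step 1: prepare - set up the model so activation statistics can be collected
quantize_(m_static, NVFP4DynamicActivationNVFP4WeightConfig(step="prepare"))

# step 2: calibrate with representative data (a single toy batch here)
m_static(torch.randn(16, 4096, dtype=torch.bfloat16, device="cuda"))

# step 3: convert to the final statically-quantized representation
quantize_(m_static, NVFP4DynamicActivationNVFP4WeightConfig(step="convert"))
```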
@@ -813,7 +852,8 @@ def run(
                 # roofline - gemm
                 r_bf16_gemm_time_s,
                 r_fp8_gemm_time_s,
-                # roofline - fp8 overhead
+                # roofline - overhead
+                r_bf16_ovhd_time_s,
                 r_fp8_ovhd_time_s,
                 # roofline - gemm + overhead, and speedup
                 r_fp8_gemm_and_ovhd_s,
@@ -833,8 +873,20 @@ def run(
 
     pd.set_option("display.precision", 2)
     df = pd.DataFrame(results, columns=headers)
+
+    if outfile is not None:
+        df.to_csv(outfile)
+
+    if op_name == "linear":
+        # drop conv-only columns to simplify linear results
+        df = df.drop(columns=["D", "H", "W", "kernel_size"])
+
+    if skip_printing_detailed_metrics:
+        df = df[
+            ["fwd_M", "fwd_K", "fwd_N", "r_fp8_gemm_and_ovhd_spdp", "b_fp8_e2e_spdp"]
+        ]
+
     print(df)
-    df.to_csv(outfile)
     print("done")
 
 
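
Relatedly, `outfile` is now optional (the CSV is only written when a path is given) and `skip_printing_detailed_metrics` trims the printed table down to shapes and speedups. A hedged usage sketch, assuming `run()` is imported directly rather than driven from the command line:

```python
# Hedged sketch: exercising the new optional arguments of run().
# Assumes float8_inference_roofline is importable from the working directory;
# normally the benchmark is launched from the command line.
from float8_inference_roofline import run

run(
    recipe_name="nvfp4",
    # outfile omitted -> no CSV is written
    skip_printing_detailed_metrics=True,  # print only shapes and speedups
)
```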

benchmarks/float8/float8_roofline.py

Lines changed: 14 additions & 6 deletions
@@ -61,7 +61,10 @@
     Float8LinearConfig,
     convert_to_float8_training,
 )
-from torchao.prototype.mx_formats import MXLinearConfig
+from torchao.prototype.moe_training.config import (
+    MXFP8TrainingOpConfig,
+    MXFP8TrainingRecipe,
+)
 from torchao.quantization import quantize_
 from torchao.testing.training.roofline_utils import (
     get_float8_mem_sympy,
@@ -253,10 +256,7 @@ def run(
     print(f"enable_fusion_modeling: {enable_fusion_modeling}")
 
     assert mx_recipe_name in (
-        # real mxfp8_cublas recipe
-        "mxfp8_cublas",
-        # real mxfp8_cublas_rceil recipe
-        "mxfp8_cublas_rceil",
+        None,
         # modeling of what mxfp8 with 32x32 block size and without gemm
         # operand layout restrictions would look like
         "mxfp8_32x32_flexible_gemm_layout",
@@ -429,7 +429,15 @@ def run(
         )
     else:
         assert mx_recipe_name is not None
-        config = MXLinearConfig.from_recipe_name(mx_recipe_name)
+        try:
+            config = MXFP8TrainingOpConfig.from_recipe(
+                MXFP8TrainingRecipe(mx_recipe_name)
+            )
+        except ValueError:
+            raise ValueError(
+                f"Unsupported mx_recipe_name: {mx_recipe_name}. "
+                f"Supported values: {[r.value for r in MXFP8TrainingRecipe]}"
+            )
     m_fp8_dyn = copy.deepcopy(m_orig)
     quantize_(m_fp8_dyn, config=config)
     m_fp8_dyn = torch.compile(m_fp8_dyn)

benchmarks/float8/profile_lowp_training.py

Lines changed: 13 additions & 2 deletions
@@ -45,7 +45,10 @@
 from torchao.float8.float8_linear_utils import (
     convert_to_float8_training,
 )
-from torchao.prototype.mx_formats.config import MXLinearConfig
+from torchao.prototype.moe_training.config import (
+    MXFP8TrainingOpConfig,
+    MXFP8TrainingRecipe,
+)
 from torchao.prototype.mx_formats.mx_tensor import MXTensor
 from torchao.prototype.mx_formats.utils import to_blocked
 from torchao.quantization import quantize_
@@ -320,7 +323,15 @@ def main(
     elif float8_recipe_name is not None:
         config = Float8LinearConfig.from_recipe_name(float8_recipe_name)
     elif mx_recipe_name is not None:
-        config = MXLinearConfig.from_recipe_name(mx_recipe_name)
+        try:
+            config = MXFP8TrainingOpConfig.from_recipe(
+                MXFP8TrainingRecipe(mx_recipe_name)
+            )
+        except ValueError:
+            raise ValueError(
+                f"Unsupported mx_recipe_name: {mx_recipe_name}. "
+                f"Supported values: {[r.value for r in MXFP8TrainingRecipe]}"
+            )
 
     print(f"Compile is set to | {compile}")
     print(f"model_type is set to | {model_type}")

benchmarks/mx_formats/cast_bench.py

Lines changed: 38 additions & 4 deletions
@@ -83,8 +83,12 @@ def to_nvfp4_reference(x_hp):
 
 
 def to_nvfp4_reference_triton_swizzle(x_hp):
+    per_tensor_scale = torch.tensor(1.0, dtype=torch.float32, device=x_hp.device)
     nvfp4_tensor = NVFP4Tensor.to_nvfp4(
-        x_hp, use_triton_kernel=True, is_swizzled_scales=True
+        x_hp,
+        per_tensor_scale=per_tensor_scale,
+        use_triton_kernel=True,
+        is_swizzled_scales=True,
     )
     return nvfp4_tensor.qdata, nvfp4_tensor.scale
 
@@ -118,6 +122,7 @@ def run(
         "dim1_mxfp8_floor",
         "dim1_mxfp8_rceil",
         "dim1_mxfp8_triton_floor",
+        "dim1_mxfp8_triton_rceil",
         "dim1_mxfp8_cuda_floor",
         "dim1_mxfp8_cuda_rceil",
     )
@@ -350,12 +355,41 @@ def run(
         bps = (bytes_r + bytes_w) / (time_us / 1e6)
 
     elif mode == "dim1_mxfp8_triton_floor":
-        y_d1, s_d1 = triton_to_mxfp8_dim1(x, inner_block_size=BLOCK_SIZE)
+        y_d1, s_d1 = triton_to_mxfp8_dim1(
+            x, inner_block_size=BLOCK_SIZE, scaling_mode="floor"
+        )
 
         for _ in range(2):
-            __ = triton_to_mxfp8_dim1(x, inner_block_size=BLOCK_SIZE)
+            __ = triton_to_mxfp8_dim1(
+                x, inner_block_size=BLOCK_SIZE, scaling_mode="floor"
+            )
         time_us = benchmark_cuda_function_in_microseconds(
-            lambda x, b: triton_to_mxfp8_dim1(x, inner_block_size=BLOCK_SIZE),
+            lambda x, b: triton_to_mxfp8_dim1(
+                x, inner_block_size=BLOCK_SIZE, scaling_mode="floor"
+            ),
+            x,
+            BLOCK_SIZE,
+        )
+
+        assert y_d1.dtype == torch.float8_e4m3fn
+        assert s_d1.dtype == torch.float8_e8m0fnu
+        bytes_r = x.numel() * bytes_per_el_bf16
+        bytes_w = (y_d1.numel() + s_d1.numel()) * bytes_per_el_fp8
+        bps = (bytes_r + bytes_w) / (time_us / 1e6)
+
+    elif mode == "dim1_mxfp8_triton_rceil":
+        y_d1, s_d1 = triton_to_mxfp8_dim1(
+            x, inner_block_size=BLOCK_SIZE, scaling_mode="rceil"
+        )
+
+        for _ in range(2):
+            __ = triton_to_mxfp8_dim1(
+                x, inner_block_size=BLOCK_SIZE, scaling_mode="rceil"
+            )
+        time_us = benchmark_cuda_function_in_microseconds(
+            lambda x, b: triton_to_mxfp8_dim1(
+                x, inner_block_size=BLOCK_SIZE, scaling_mode="rceil"
+            ),
             x,
             BLOCK_SIZE,
         )