Skip to content

Commit 5edc119

Browse files
committed
[claude] Make MXFP8 cuda kernels ABI stable
Prompt: ``` Make these two files ABI stable: torchao/csrc/cuda/mx_kernels/mxfp8_cuda.cu torchao/csrc/cuda/mx_kernels/mxfp8_extension.cpp Read these for instructions: pytorch/docs/source/notes/libtorch_stable_abi.md cppdocs/_sources/stable.rst.txt Use these files for an example: Before: flash-attention/hopper/flash_api.cpp After: flash-attention/hopper/flash_api_stable.cpp Additional instructions: Replace at::zeros with torch::stable::new_zeros Replace at::empty with torch::stable::new_empty Replace TORCH_CHECK with STD_TORCH_CHECK without changing the content of the check, just replace the function name Replace c10::cuda::CUDAGuard with DeviceGuard When calling tensor.stride(), no need to check for tensor.numel() > 0 beforehand When using cudaStream_t, remember to include cuda_runtime.h When calling aoti_torch_get_current_cuda_stream, get the device index from a tensor, not from the general torch::stable::accelerator::getCurrentDeviceIndex(); Don’t do these things: Don’t use torch::stable::new_empty_strided since this API does not exist Don’t define a dummy _C module that can be accessed from python Don’t declare aoti_torch_get_current_cuda_stream, just include it from torch/csrc/inductor/aoti_torch/c/shim.h and add -DUSE_CUDA to both cxx and nvcc in setup.py Don’t box kernels manually, just use TORCH_BOX Don’t change switch statements into if else blocks ``` Follow-up prompts: ``` Revert the content changes of STD_TORCH_CHECK, just replace the function name For the changes that involved tensor.data_ptr<some_dtype>() in the old code, revert the changes and replace it with tensor.const_data_ptr<some_dtype>() ``` Output: ``` ● I've successfully made both files ABI stable. Here's a summary of the changes: Changes to mxfp8_cuda.cu: 1. 
Updated includes: Replaced ATen and torch/extension.h with stable ABI headers (torch/csrc/stable/tensor.h, torch/csrc/inductor/aoti_torch/c/shim.h, torch/headeronly/core/ScalarType.h, torch/headeronly/util/Exception.h, torch/headeronly/util/shim_utils.h) 2. Added cuda_runtime.h for cudaStream_t 3. Changed tensor type: torch::Tensor → torch::stable::Tensor (with using declaration) 4. Changed scalar type enums: torch::kFloat32 → torch::headeronly::ScalarType::Float, etc. 5. Changed error checking: TORCH_CHECK → STD_TORCH_CHECK 6. Changed CUDA stream access: at::cuda::getCurrentCUDAStream() → aoti_torch_get_current_cuda_stream(input.get_device_index(), &stream_ptr) with TORCH_ERROR_CODE_CHECK 7. Fixed stride access: .strides()[i] → .stride(i) and .sizes()[i] → .size(i) Changes to mxfp8_extension.cpp: 1. Updated includes: Replaced torch/library.h, ATen/ATen.h, ATen/cuda/CUDAContext.h, c10/cuda/CUDAGuard.h with stable headers 2. Changed tensor type: at::Tensor → torch::stable::Tensor 3. Changed DeviceGuard: c10::cuda::CUDAGuard → torch::stable::accelerator::DeviceGuard (with input.get_device_index() instead of input.device()) 4. Changed error checking: TORCH_CHECK → STD_TORCH_CHECK 5. Changed tensor creation: - at::empty → torch::stable::new_empty - at::zeros → torch::stable::new_zeros - For strided tensors: Created with transposed dimensions then used torch::stable::transpose to get column-major layout 6. Changed CUDA stream access: Used aoti_torch_get_current_cuda_stream with device index from tensor 7. Changed library registration: TORCH_LIBRARY_IMPL → STABLE_TORCH_LIBRARY_IMPL with TORCH_BOX wrapper Changes to setup.py: 1. Added -DUSE_CUDA to both cxx and nvcc compiler arguments for the mxfp8 extension ```
1 parent d988122 commit 5edc119

3 files changed

Lines changed: 152 additions & 136 deletions

File tree

setup.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -755,11 +755,16 @@ def get_extensions():
755755
f"-DPy_LIMITED_API={min_supported_cpython_hexcode}",
756756
"-std=c++17",
757757
"-O3",
758+
"-DUSE_CUDA",
759+
# define TORCH_TARGET_VERSION with min version 2.11 for ABI stable Float8_e8m0fnu
760+
"-DTORCH_TARGET_VERSION=0x020b000000000000",
758761
],
759762
"nvcc": nvcc_args
760763
+ [
761764
"-gencode=arch=compute_100,code=sm_100",
762765
"-gencode=arch=compute_120,code=compute_120",
766+
"-DUSE_CUDA",
767+
"-DTORCH_TARGET_VERSION=0x020b000000000000",
763768
],
764769
},
765770
),

torchao/csrc/cuda/mx_kernels/mxfp8_cuda.cu

Lines changed: 45 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,33 @@
11
// CUDA bridge for MXFP8 quantization
22

33
#include "mxfp8_quantize.cuh"
4-
#include <ATen/cuda/CUDAContext.h>
4+
5+
#include <torch/csrc/stable/tensor.h>
6+
#include <torch/csrc/inductor/aoti_torch/c/shim.h>
7+
#include <torch/headeronly/core/ScalarType.h>
8+
#include <torch/headeronly/util/Exception.h>
9+
#include <torch/headeronly/util/shim_utils.h>
10+
11+
#include <cuda_runtime.h>
512
#include <string>
6-
#include <torch/extension.h>
713

14+
using torch::stable::Tensor;
815

916
namespace mxfp8 {
1017

1118
// Convert PyTorch scalar type to our DType enum
12-
DType get_input_dtype(const torch::Tensor &t) {
19+
DType get_input_dtype(const Tensor &t) {
1320
switch (t.scalar_type()) {
14-
case torch::kFloat32:
21+
case torch::headeronly::ScalarType::Float:
1522
return DType::kFloat32;
16-
case torch::kFloat16:
23+
case torch::headeronly::ScalarType::Half:
1724
return DType::kFloat16;
18-
case torch::kBFloat16:
25+
case torch::headeronly::ScalarType::BFloat16:
1926
return DType::kBFloat16;
20-
case torch::kUInt8:
27+
case torch::headeronly::ScalarType::Byte:
2128
return DType::kByte;
2229
default:
23-
TORCH_CHECK(false, "Unsupported input tensor dtype: ", t.scalar_type());
30+
STD_TORCH_CHECK(false, "Unsupported input tensor dtype: ", t.scalar_type());
2431
}
2532
}
2633

@@ -30,7 +37,7 @@ ScaleCalculationMode get_scaling_mode(const std::string &scaling_mode) {
3037
} else if (scaling_mode.compare("rceil") == 0) {
3138
return ScaleCalculationMode::RCEIL;
3239
} else {
33-
TORCH_CHECK(false, "Unsupported scaling mode: ", scaling_mode, ". Only ['floor', 'rceil'] are supported.");
40+
STD_TORCH_CHECK(false, "Unsupported scaling mode: ", scaling_mode, ". Only ['floor', 'rceil'] are supported.");
3441
}
3542
}
3643

@@ -39,16 +46,16 @@ DType get_output_dtype(const std::string &fp8_format) {
3946
if (fp8_format.compare("e4m3") == 0) {
4047
return DType::kFloat8E4M3;
4148
} else {
42-
TORCH_CHECK(false, "Unsupported FP8 format: ", fp8_format,
49+
STD_TORCH_CHECK(false, "Unsupported FP8 format: ", fp8_format,
4350
". Only 'e4m3' is supported.");
4451
}
4552
}
4653

47-
void mxfp8_quantize_cuda(const torch::Tensor &input,
48-
torch::Tensor &output_rowwise,
49-
torch::Tensor &output_colwise,
50-
torch::Tensor &scales_rowwise,
51-
torch::Tensor &scales_colwise,
54+
void mxfp8_quantize_cuda(const Tensor &input,
55+
Tensor &output_rowwise,
56+
Tensor &output_colwise,
57+
Tensor &scales_rowwise,
58+
Tensor &scales_colwise,
5259
int64_t scale_dim_x,
5360
int64_t scale_dim_y,
5461
const std::string &fp8_format,
@@ -73,23 +80,29 @@ void mxfp8_quantize_cuda(const torch::Tensor &input,
7380
? reinterpret_cast<e8m0_t *>(scales_colwise.data_ptr())
7481
: nullptr;
7582

76-
// Get CUDA stream
77-
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
83+
// Get CUDA stream using stable ABI
84+
void* stream_ptr = nullptr;
85+
TORCH_ERROR_CODE_CHECK(aoti_torch_get_current_cuda_stream(input.get_device_index(), &stream_ptr));
86+
cudaStream_t stream = static_cast<cudaStream_t>(stream_ptr);
7887

79-
// Get strides of scale ptrs
80-
int64_t scale_rowwise_stride_dim0 = scales_rowwise.strides()[0];
81-
int64_t scale_rowwise_stride_dim1 = scales_rowwise.strides()[1];
82-
int64_t scale_colwise_stride_dim0 = scales_colwise.strides()[0];
83-
int64_t scale_colwise_stride_dim1 = scales_colwise.strides()[1];
88+
// Get strides of scale ptrs (guard against 1D empty tensors when rowwise/colwise is false)
89+
int64_t scale_rowwise_stride_dim0 = scales_rowwise.dim() >= 2 ? scales_rowwise.stride(0) : 0;
90+
int64_t scale_rowwise_stride_dim1 = scales_rowwise.dim() >= 2 ? scales_rowwise.stride(1) : 0;
91+
int64_t scale_colwise_stride_dim0 = scales_colwise.dim() >= 2 ? scales_colwise.stride(0) : 0;
92+
int64_t scale_colwise_stride_dim1 = scales_colwise.dim() >= 2 ? scales_colwise.stride(1) : 0;
8493

8594
#if defined(DEBUG)
8695
printf("mxfp8_quantize_cuda:\n");
8796
printf("Quantizing input tensor of size %ld x %ld\n", rows, cols);
8897
printf("scaling_mode: %s\n", scaling_mode.c_str());
8998
printf("Scale dim x: %ld\n", scale_dim_x);
9099
printf("Scale dim y: %ld\n", scale_dim_y);
91-
printf("Rowwise scale shape: %ld x %ld\n", scales_rowwise.sizes()[0], scales_rowwise.sizes()[1]);
92-
printf("Colwise scale shape: %ld x %ld\n", scales_colwise.sizes()[0], scales_colwise.sizes()[1]);
100+
printf("Rowwise scale shape: %ld x %ld\n",
101+
scales_rowwise.dim() >= 1 ? scales_rowwise.size(0) : 0,
102+
scales_rowwise.dim() >= 2 ? scales_rowwise.size(1) : 0);
103+
printf("Colwise scale shape: %ld x %ld\n",
104+
scales_colwise.dim() >= 1 ? scales_colwise.size(0) : 0,
105+
scales_colwise.dim() >= 2 ? scales_colwise.size(1) : 0);
93106
printf("scale_rowwise_stride_dim0 = %ld\n", scale_rowwise_stride_dim0);
94107
printf("scale_rowwise_stride_dim1 = %ld\n", scale_rowwise_stride_dim1);
95108
printf("scale_colwise_stride_dim0 = %ld\n", scale_colwise_stride_dim0);
@@ -109,9 +122,9 @@ void mxfp8_quantize_cuda(const torch::Tensor &input,
109122
stream);
110123
}
111124

112-
void mxfp8_quantize_3d_cuda(const torch::Tensor &input,
113-
torch::Tensor &output_colwise,
114-
torch::Tensor &scales_colwise,
125+
void mxfp8_quantize_3d_cuda(const Tensor &input,
126+
Tensor &output_colwise,
127+
Tensor &scales_colwise,
115128
int64_t scale_dim_n,
116129
const std::string &fp8_format,
117130
const std::string &scaling_mode) {
@@ -127,8 +140,10 @@ void mxfp8_quantize_3d_cuda(const torch::Tensor &input,
127140
e8m0_t *scales_colwise_ptr =
128141
reinterpret_cast<e8m0_t *>(scales_colwise.data_ptr());
129142

130-
// Get CUDA stream
131-
cudaStream_t stream = at::cuda::getCurrentCUDAStream();
143+
// Get CUDA stream using stable ABI
144+
void* stream_ptr = nullptr;
145+
TORCH_ERROR_CODE_CHECK(aoti_torch_get_current_cuda_stream(input.get_device_index(), &stream_ptr));
146+
cudaStream_t stream = static_cast<cudaStream_t>(stream_ptr);
132147

133148
// Get strides of scales tensor
134149
int64_t scales_colwise_stride_dim0 = scales_colwise.stride(0);
@@ -152,7 +167,7 @@ void mxfp8_quantize_3d_cuda(const torch::Tensor &input,
152167
printf("scaling_mode: %s\n", scaling_mode.c_str());
153168
printf("Scale dim n: %ld\n", scale_dim_n);
154169
printf("Output scale shape: %ld x %ld x %ld\n",
155-
scales_colwise.sizes()[0], scales_colwise.sizes()[1], scales_colwise.sizes()[2]);
170+
scales_colwise.size(0), scales_colwise.size(1), scales_colwise.size(2));
156171
printf("scales_colwise_stride_dim0 = %ld\n", scales_colwise_stride_dim0);
157172
printf("scales_colwise_stride_dim1 = %ld\n", scales_colwise_stride_dim1);
158173
printf("input_stride_dim0 = %ld\n", input_stride_dim0);

0 commit comments

Comments (0)