Commit de873d1

Fix race condition in mxfp8 CUDA kernels (#4278)
A race condition was present in two kernels due to incorrect ordering between a `__syncthreads()` and an async-proxy fence. The fence is needed to make the calling thread's shared-memory writes visible to the async proxy. However, the operation we are synchronizing with is the TMA store issued by thread 0, so we must establish a causality link between _all_ the fences performed by _all_ threads and thread 0's issuing of the TMA store. Thus the `__syncthreads()` must be inserted between the fences and the TMA issue, not before the fence. The CUDA programming guide is very explicit about this, in [section 10.29.1. Using TMA to transfer one-dimensional arrays](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#using-tma-to-transfer-one-dimensional-arrays).
1 parent 00ef369 commit de873d1

1 file changed: torchao/csrc/cuda/mx_kernels/mx_block_rearrange_2d_M_groups.cu (2 additions, 3 deletions)
```diff
@@ -316,10 +316,9 @@ __global__ void mx_blocked_layout_2d_M_groups_kernel(
       ) = data;
     }
   }
-  __syncthreads();
-
   // Fence to ensure SMEM writes are visible to TMA async proxy
   ptx::fence_proxy_async_shared_cta();
+  __syncthreads();

   // Compute output tile coordinates
   const int chunk_sf_row_tile = (superblock_idx_in_group * CHUNKS_PER_TB) + chunk_idx;
@@ -785,8 +784,8 @@ __global__ void mx_blocked_layout_2d_simple_kernel(
   }

   // Ensure threads finish their smem writes and use explicit fence to ensure visibility to async proxy for TMA
-  __syncthreads();
   ptx::fence_proxy_async_shared_cta();
+  __syncthreads();

   if (is_master_thread) {
     // Issue separate 1D TMA stores for each valid SF tile
```
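The ordering the fix enforces can be sketched as a minimal standalone kernel. This is an illustration only: the kernel name and shared-memory buffer below are hypothetical, and the raw PTX `fence.proxy.async.shared::cta` instruction stands in for the repo's `ptx::fence_proxy_async_shared_cta()` wrapper.

```cuda
// Hypothetical sketch of the fence-then-sync ordering; not the actual kernel.
__global__ void tma_store_ordering_sketch() {
  __shared__ alignas(128) float smem_tile[256];

  // 1. Every thread writes its portion of the tile into shared memory.
  smem_tile[threadIdx.x] = static_cast<float>(threadIdx.x);

  // 2. Each thread fences so that its own generic-proxy smem writes become
  //    visible to the async proxy (the unit that performs the TMA copy).
  //    The fence only covers the *calling* thread's writes.
  asm volatile("fence.proxy.async.shared::cta;" ::: "memory");

  // 3. Synchronize AFTER the fence: __syncthreads() orders thread 0 after
  //    every thread's fence, completing the causality chain
  //    writes -> per-thread fences -> barrier -> TMA issue.
  __syncthreads();

  if (threadIdx.x == 0) {
    // 4. Thread 0 alone issues the bulk async (TMA) store from shared
    //    memory to global memory, e.g. via cp.async.bulk (elided here).
  }
}
```

Placing `__syncthreads()` before the fence (as the old code did) only guarantees that the writes happened, not that every thread's fence is ordered before thread 0's TMA issue, which is exactly the race the commit removes.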
