[CPU] Add check of AVX512 at runtime (#4039)

yanbing-j · web-flow · commit 3c2cb8c1d636 · 2026-03-12T22:42:04.000-07:00
* [CPU] Add check of AVX512 at runtime

* Move flag to utils.h

* Update comments
diff --git a/torchao/csrc/cpu/aten_kernels/scaled_embedding_bag.cpp b/torchao/csrc/cpu/aten_kernels/scaled_embedding_bag.cpp
@@ -5,6 +5,7 @@
 #include <c10/util/Float8_e4m3fn.h>
 #include <c10/util/Unroll.h>
 #include <torch/all.h>
+#include "utils.h"
 
 #define QTYPE_DISPATCH(TYPE, ...)                                              \
   [&]() {                                                                      \
@@ -180,7 +181,7 @@ inline void _scaled_embedding_bag_krnl(
     const index_t *offsets, const data_t *weight, const double scale,
     output_t *result, const int64_t num_batch) {
 #if defined(CPU_CAPABILITY_AVX512)
-  if (emb_dim % 128 == 0) {
+  if (kHasAVX512 && emb_dim % 128 == 0) {
     constexpr int64_t block_dim = 128;
     const int64_t num_blocks = emb_dim / block_dim;
     __m512 scale_v = _mm512_set1_ps(scale);
diff --git a/torchao/csrc/cpu/aten_kernels/utils.h b/torchao/csrc/cpu/aten_kernels/utils.h
@@ -10,7 +10,7 @@
 #include <tuple>
 #include <ATen/native/cpu/utils.h>
 
-int64_t get_m_block(int64_t M) {
+inline int64_t get_m_block(int64_t M) {
   if (M <= 48) {
     return M;
   } else if (M < 64) {
@@ -22,7 +22,7 @@ int64_t get_m_block(int64_t M) {
   }
 }
 
-std::tuple<bool, int64_t, int64_t, int64_t>
+inline std::tuple<bool, int64_t, int64_t, int64_t>
 get_m_blocking(int64_t M) {
   bool parallel_on_M = M > 128;
   int64_t block_m = get_m_block(M);
@@ -32,6 +32,10 @@ get_m_blocking(int64_t M) {
 }
 
 #if defined(CPU_CAPABILITY_AVX512)
+// Cached check for AVX-512F support in this process, for use by CPU kernels
+// that include this header and are compiled with CPU_CAPABILITY_AVX512.
+inline const bool kHasAVX512 = __builtin_cpu_supports("avx512f");
+
 template<typename T>
 void zero_buffer(T* data, int64_t size) {
   const int32_t vec_size = at::vec::Vectorized<T>::size();