cpu: simplify comments and reduce code duplication

Copilot · Copilot · Copilot · commit e9b811fc34e4 · 2026-04-13T08:32:17.000Z
- scaled_embedding_bag.cpp: remove unused &lt;c10/util/Unroll.h&gt; include;
  shorten 8-line block comment to 3 lines; shorten 2-line comments to
  1 line throughout; move PREFETCH_DIST inside the kHasAVX512 branch
  where it is used; fix preprocessor imbalance (remove orphan #endif
  left by a prior removal of #if __GNUC__ &gt;= 15).

- utils.h: compress 2-line comments to 1 line each.

- quantized_sdpa.cpp: compress 3-line forward-decl comment to 1 line;
  remove 3-line AVX512-section comment body (keep the === marker);
  compress 2-line inline comments to 1 line each.

- setup.py: extract _resolve_cxx() helper to avoid the duplicated
  'os.environ.get(CXX) or find_preferred_cxx_compiler() or g++'
  pattern in filter_sources and precompile_isa_objects; shorten
  find_preferred_cxx_compiler by flattening the two-step if-chain to
  a single return expression; shorten add_compile_flags, get_link_flags,
  and precompile_isa_objects docstrings.

Co-authored-by: Copilot &lt;223556219+Copilot@users.noreply.github.com&gt;
diff --git a/setup.py b/setup.py
@@ -419,44 +419,33 @@ def _gcc_major(exe: str) -> "int | None":
 
     @staticmethod
     def find_preferred_cxx_compiler() -> "str | None":
-        """Find a C++ compiler that meets the preferred GCC version requirement.
+        """Find a C++ compiler at or above _PREFERRED_GCC_MAJOR.
 
-        Search order:
-          1. $CXX environment variable
-          2. ``g++`` on $PATH (via ``which``)
-
-        Returns the path of the first qualifying compiler, or None if no
-        compiler at or above _PREFERRED_GCC_MAJOR is found.
+        Checks $CXX first, then ``g++`` on $PATH.
+        Returns the compiler path or None.
         """
         min_major = X86KernelBuild._PREFERRED_GCC_MAJOR
 
         def _check(exe: str) -> "str | None":
             if not exe:
                 return None
             if os.sep not in exe:
-                # Plain name — resolve via PATH.
-                resolved = shutil.which(exe)
-                if not resolved:
-                    return None
-                exe = resolved
+                exe = shutil.which(exe) or ""
             if not os.path.isfile(exe):
                 return None
             major = X86KernelBuild._gcc_major(exe)
-            if major is not None and major >= min_major:
-                return exe
-            return None
-
-        # 1. Explicit $CXX
-        result = _check(os.environ.get("CXX", ""))
-        if result:
-            return result
+            return exe if major is not None and major >= min_major else None
 
-        # 2. g++ on PATH
-        result = _check("g++")
-        if result:
-            return result
+        return _check(os.environ.get("CXX", "")) or _check("g++")
 
-        return None
+    @staticmethod
+    def _resolve_cxx() -> str:
+        """Return the effective C++ compiler path (from $CXX or preferred, falling back to g++)."""
+        return (
+            os.environ.get("CXX")
+            or X86KernelBuild.find_preferred_cxx_compiler()
+            or "g++"
+        )
 
     @staticmethod
     def get_include_flags() -> list:
@@ -477,27 +466,12 @@ def get_include_flags() -> list:
     def add_compile_flags(extra_compile_args: dict) -> None:
         """Extend *extra_compile_args* with CPU-kernel compile options.
 
-        The main kernel files are compiled with AVX512 + AMX flags so that
-        PyTorch's vec512 headers (which use Sleef f16 intrinsics under
-        CPU_CAPABILITY_AVX512) compile correctly.  -fno-tree-vectorize
-        prevents the compiler from emitting 512-bit packed instructions in
-        scalar fallback functions; AVX512 pragma regions re-enable
-        vectorization only where explicitly desired.
-
-        Runtime dispatch via __builtin_cpu_supports() selects the right path:
-          - no AVX512:      scalar fallback paths are used.
-          - AVX512 + AMX:   AVX512/AMX optimised paths are selected.
-          - AVX10.2:        AVX10.2 objects (compiled separately with
-                            -march=diamondrapids) are also linked in.
+        Enables AVX512 + AMX (for PyTorch vec512 headers) and -fno-tree-vectorize
+        (to prevent scalar paths from emitting 512-bit instructions).
+        Runtime dispatch via __builtin_cpu_supports() selects the right path.
         """
         if not X86KernelBuild.is_enabled():
             return
-        # Build with full AVX512 + AMX support so PyTorch's vec512 headers
-        # (which use Sleef f16 intrinsics under CPU_CAPABILITY_AVX512) compile
-        # correctly.  -fno-tree-vectorize prevents the compiler from emitting
-        # 512-bit packed instructions in *scalar* fallback functions; AVX512
-        # pragma regions add #pragma GCC optimize("O3,tree-vectorize") to
-        # re-enable vectorization only where explicitly desired.
         extra_compile_args["cxx"].extend(
             [
                 "-DCPU_CAPABILITY_AVX512",
@@ -528,14 +502,10 @@ def add_compile_flags(extra_compile_args: dict) -> None:
 
     @staticmethod
     def get_link_flags() -> list:
-        """Return extra link flags for the CPU kernel .so.
-
-        Adds an RPATH entry for every PyTorch library directory so that
-        libc10.so / libtorch_cpu.so are found at runtime without needing
-        LD_LIBRARY_PATH.  Also statically links libstdc++ to carry new
-        CXXABI symbols (e.g. __cxa_call_terminate from CXXABI_1.3.15)
-        that newer GCC versions generate but PyTorch's bundled libstdc++
-        may lack.
+        """Return extra link flags: PyTorch lib RPATHs + -static-libstdc++.
+
+        Static libstdc++ carries new CXXABI symbols that GCC 15 generates but
+        PyTorch's bundled libstdc++ may lack.
         """
         if not X86KernelBuild.is_enabled():
             return []
@@ -554,11 +524,9 @@ def get_link_flags() -> list:
     def filter_sources(sources: list, extensions_dir: str) -> list:
         """Remove CPU aten_kernels sources from *sources* when not building for CPU."""
         aten_kernels_dir = os.path.join(extensions_dir, "cpu", "aten_kernels")
-        cxx = os.environ.get(
-            "CXX", X86KernelBuild.find_preferred_cxx_compiler() or "g++"
-        )
+        cxx = X86KernelBuild._resolve_cxx()
         compiler_ok = (
-            cxx and X86KernelBuild._gcc_major(cxx) >= X86KernelBuild._MINIMUM_GCC_MAJOR
+            X86KernelBuild._gcc_major(cxx) >= X86KernelBuild._MINIMUM_GCC_MAJOR
         )
         if not X86KernelBuild.is_enabled() or not compiler_ok:
             excluded = set(glob.glob(os.path.join(aten_kernels_dir, "*.cpp")))
@@ -567,31 +535,19 @@ def filter_sources(sources: list, extensions_dir: str) -> list:
 
     @staticmethod
     def precompile_isa_objects(build_temp: str, extensions: list) -> None:
-        """Pre-compile ISA-specific CPU objects from kernel source files.
-
-        Instead of maintaining separate *_avx10_2.cpp files, each kernel
-        source file contains ISA-specific code guarded by
-        CPU_CAPABILITY_AVX10_2.  At build time we:
-          1. Scan kernel .cpp files for the CPU_CAPABILITY_AVX10_2 marker.
-          2. Copy each matching file to a temp path in the build dir.
-          3. Compile that temp copy with -DCPU_CAPABILITY_AVX10_2
-             -march=diamondrapids.
-          4. Attach the resulting .o as extra_objects on the main extension.
-
-        The #if defined(CPU_CAPABILITY_AVX10_2) guard in each source file
-        ensures that only the AVX10.2 variant code is compiled in the temp
-        copy (the main build compiles the #else branch).
+        """Compile AVX10.2 temp copies of kernel files that contain CPU_CAPABILITY_AVX10_2.
+
+        Each matching .cpp is copied to a temp dir and compiled with
+        -DCPU_CAPABILITY_AVX10_2 -march=diamondrapids. The resulting .o is
+        attached as an extra_object on the main torchao._C extension.
         """
         main_ext = next((e for e in extensions if e.name == "torchao._C"), None)
         if main_ext is None:
             return
 
-        cxx = os.environ.get(
-            "CXX", X86KernelBuild.find_preferred_cxx_compiler() or "g++"
-        )
+        cxx = X86KernelBuild._resolve_cxx()
         compiler_ok = (
-            cxx
-            and X86KernelBuild._gcc_major(cxx) >= X86KernelBuild._PREFERRED_GCC_MAJOR
+            X86KernelBuild._gcc_major(cxx) >= X86KernelBuild._PREFERRED_GCC_MAJOR
         )
         if not compiler_ok:
             print(
@@ -607,9 +563,8 @@ def precompile_isa_objects(build_temp: str, extensions: list) -> None:
 
         aten_kernels_dir = os.path.join("torchao", "csrc", "cpu", "aten_kernels")
 
-        # --- AVX10.2 variant: copy kernel files and compile with DMR target ---
-        # Include the kernel source dir so that relative includes like
-        # utils.h still resolve when the file is compiled from a temp copy.
+        # Copy each kernel that has AVX10.2 code to a temp dir and compile
+        # with -march=diamondrapids so hardware fp8 instructions are available.
         avx10_2_flags = (
             ["-O3", "-std=c++20", "-fPIC", "-fopenmp"]
             + include_flags
diff --git a/torchao/csrc/cpu/aten_kernels/quantized_sdpa.cpp b/torchao/csrc/cpu/aten_kernels/quantized_sdpa.cpp
@@ -40,8 +40,6 @@ inline c10::SymFloat calculate_scale(
 }
 
 // Forward declarations for AVX512-compiled kernel entry points.
-// These are defined inside the #pragma GCC target region below and are
-// only called when __builtin_cpu_supports("avx512f") is true at runtime.
 void int8_sdpa_fused_kernel(
     const at::Tensor& output, const at::Tensor& query, const at::Tensor& key,
     const at::Tensor& value, double dropout_p, bool is_causal,
@@ -58,9 +56,6 @@ void fp8_sdpa_fused_kernel(
 #endif // CPUBLAS_BRGEMM_F8F8F32
 
 // === AVX512 IMPLEMENTATION SECTION ===
-// Functions in this section are compiled with AVX512 + AVX512VNNI + AMX
-// target regardless of global compiler flags.  They are only CALLED when
-// __builtin_cpu_supports("avx512f") returns true at runtime.
 #pragma GCC push_options
 #pragma GCC target("avx512f,avx512bw,avx512vl,avx512dq,avx512vnni,amx-int8,amx-tile,amx-bf16")
 #pragma GCC optimize("O3,tree-vectorize")
@@ -2553,8 +2548,7 @@ at::Tensor _qscaled_dot_product_cpu(
   }
 
   if (dtype == at::ScalarType::Byte) {
-      // Use optimized fused int8 SDPA kernel when AVX512 + AMX are available.
-      // Falls back to reference math kernel otherwise.
+      // Use optimized AVX512+AMX fused kernel, fall back to math kernel otherwise.
       if (__builtin_cpu_supports("avx512f") && at::native::cpublas::could_pack(dtype)) {
           at::Tensor output = at::empty_like(query, query.options()).transpose(1, 2);
           int8_sdpa_fused_kernel(output, query, key, value,
@@ -2575,8 +2569,7 @@ at::Tensor _qscaled_dot_product_cpu(
               o_scale, o_zp).transpose(1, 2).contiguous().transpose(1, 2);
       }
   } else if (dtype == at::ScalarType::Float8_e4m3fn) {
-      // Use optimized fused FP8 SDPA kernel when AVX512 + AMX-FP8 are available.
-      // Falls back to reference math kernel otherwise.
+      // Use optimized AVX512+AMX-FP8 fused kernel, fall back to math kernel otherwise.
 #if defined(CPUBLAS_BRGEMM_F8F8F32)
       if (__builtin_cpu_supports("avx512f") && at::native::cpublas::could_pack(dtype)) {
           at::Tensor output = at::empty_like(query, query.options()).transpose(1, 2);
diff --git a/torchao/csrc/cpu/aten_kernels/scaled_embedding_bag.cpp b/torchao/csrc/cpu/aten_kernels/scaled_embedding_bag.cpp
@@ -3,7 +3,6 @@
 #include <ATen/native/CPUBlas.h>
 #include <ATen/native/EmbeddingBag.h>
 #include <c10/util/Float8_e4m3fn.h>
-#include <c10/util/Unroll.h>
 #include <torch/all.h>
 #include "utils.h"
 
@@ -43,18 +42,11 @@
     }                                                                          \
   }()
 
-// =============================================================================
-// The AVX10.2 variant of this file is compiled as a temp copy with:
-//   -DCPU_CAPABILITY_AVX10_2 -march=diamondrapids
-// When __AVX10_2__ is set by -march=diamondrapids, the PyTorch helpers
-// cvtfp8e4m3_fp32 / cvtfp32_fp8e4m3 (vec512_float8.h) use the native
-// hardware instructions _mm256_cvthf8_ph / _mm256_cvtph_hf8 instead of the
-// multi-step AVX512 software emulation. All other kernel logic is identical.
-// =============================================================================
+// When compiled as a temp copy with -DCPU_CAPABILITY_AVX10_2 -march=diamondrapids,
+// cvtfp8e4m3_fp32/cvtfp32_fp8e4m3 use hardware instructions instead of AVX512 emulation.
+// All other kernel logic is identical between the two variants.
 
-// Forward-declare the AVX10.2 entry point so the runtime dispatcher can call
-// it when __builtin_cpu_supports("avx10.2") is true. Only needed in the
-// default (non-AVX10.2) build; in the AVX10.2 temp copy this TU defines it.
+// Forward-declare the AVX10.2 entry point for runtime dispatch.
 #ifndef CPU_CAPABILITY_AVX10_2
 namespace torchao {
 namespace cpu_avx10_2 {
@@ -66,18 +58,16 @@ at::Tensor _scaled_embedding_bag_avx10_2(
 } // namespace torchao
 #endif
 
-// All kernel code is compiled with AVX512 enabled. When compiled with
-// -march=diamondrapids (-DCPU_CAPABILITY_AVX10_2), these flags are a subset
-// of what the target already provides, so the pragma is harmless.
+// AVX512 flags are a subset of what -march=diamondrapids provides, so this
+// pragma is safe in both the default build and the AVX10.2 temp copy.
 #pragma GCC push_options
 #pragma GCC target("avx512f,avx512bw,avx512vl,avx512dq,avx512vnni,amx-int8,amx-tile,amx-bf16")
 #pragma GCC optimize("O3,tree-vectorize")
 #include <immintrin.h>
 
 namespace torchao {
 
-// In the AVX10.2 temp copy, emit into the cpu_avx10_2 namespace so the linker
-// sees a distinct symbol from the main build.
+// AVX10.2 temp copy emits into cpu_avx10_2 namespace; default build uses anonymous.
 #ifdef CPU_CAPABILITY_AVX10_2
 namespace cpu_avx10_2 {
 #else
@@ -177,14 +167,12 @@ static void _krnl(
     int64_t bs_begin, int64_t bs_end, int64_t num_emb, int64_t emb_dim,
     index_t last_offset, const index_t *indices, const index_t *offsets,
     const data_t *weight, double scale, output_t *result, int64_t num_batch) {
-  // How many batch entries ahead to prefetch to overlap DRAM latency with compute.
-  constexpr int64_t PREFETCH_DIST = 8;
   if (kHasAVX512 && emb_dim % 128 == 0) {
+    constexpr int64_t PREFETCH_DIST = 8;
     constexpr int64_t block_dim = 128;
     const int64_t num_blocks = emb_dim / block_dim;
     __m512 scale_v = _mm512_set1_ps(scale);
     for (int64_t b = bs_begin; b < bs_end; ++b) {
-      // Software prefetch for batch entries ahead to overlap DRAM latency.
       const int64_t pref_b = b + PREFETCH_DIST;
       if (pref_b < bs_end) {
         const int64_t pref_start = offsets[pref_b];
@@ -253,9 +241,7 @@ static void _run(
   }
 }
 
-// Entry-point function. Name and namespace differ by compile variant:
-//   default build  → torchao::{anonymous}::_scaled_embedding_bag_impl
-//   AVX10.2 copy   → torchao::cpu_avx10_2::_scaled_embedding_bag_avx10_2
+// Entry point: name/namespace differ per compile variant.
 #ifdef CPU_CAPABILITY_AVX10_2
 at::Tensor _scaled_embedding_bag_avx10_2(
 #else
@@ -265,7 +251,6 @@ at::Tensor _scaled_embedding_bag_impl(
     const at::Tensor& offsets, const at::Tensor& w_scales, double o_scale,
     int64_t mode, bool include_last_offset, at::ScalarType output_dtype) {
 #ifndef CPU_CAPABILITY_AVX10_2
-  // Runtime dispatch to hardware fp8 path when running on AVX10.2 CPU.
 #if __GNUC__ >= 15
   if (__builtin_cpu_supports("avx10.2")) {
     return cpu_avx10_2::_scaled_embedding_bag_avx10_2(
diff --git a/torchao/csrc/cpu/aten_kernels/utils.h b/torchao/csrc/cpu/aten_kernels/utils.h
@@ -31,12 +31,10 @@ get_m_blocking(int64_t M) {
   return std::make_tuple(parallel_on_M, block_m, Mc, Mc_parallel);
 }
 
-// Runtime check for AVX-512F support; available regardless of compile flags.
-// Use this instead of CPU_CAPABILITY_* macros for runtime dispatch.
+// Runtime AVX-512F check for use by CPU kernels; available regardless of compile flags.
 inline const bool kHasAVX512 = __builtin_cpu_supports("avx512f");
 
-// Zero a buffer of T elements.  Uses memset for portability — the compiler
-// will auto-vectorize with the highest ISA available in the calling context.
+// Uses memset so the compiler auto-vectorizes with whatever ISA is active.
 template<typename T>
 void zero_buffer(T* data, int64_t size) {
   memset(data, 0, sizeof(T) * size);