llnl · michaelmckinsey1 · Jun 9, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 18, 2026
diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp
@@ -27,9 +27,9 @@ using namespace ltimes_idx;
 //
 // Define thread block shape for CUDA execution
 //
-#define m_block_sz (32)
-#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz))
-#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz))
+#define m_block_sz (block_size)
+#define g_block_sz (1)
+#define z_block_sz (1)
 
 #define LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \
   m_block_sz, g_block_sz, z_block_sz
@@ -39,19 +39,19 @@ using namespace ltimes_idx;
   static_assert(m_block_sz*g_block_sz*z_block_sz == block_size, "Invalid block_size");
 
 #define LTIMES_NBLOCKS_CUDA \
-  dim3 nblocks(static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(*num_m, m_block_sz)), \
-               static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(*num_g, g_block_sz)), \
-               static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(*num_z, z_block_sz)));
+  dim3 nblocks(static_cast<size_t>(*num_z), \
+               static_cast<size_t>(*num_g), \
+               1);
 
 
-template < size_t m_block_size, size_t g_block_size, size_t z_block_size >
-__launch_bounds__(m_block_size*g_block_size*z_block_size)
+template < size_t block_size >
+__launch_bounds__(block_size)
 __global__ void ltimes(PHI_VIEW phi, ELL_VIEW ell, PSI_VIEW psi,
                        ID num_d, IM num_m, IG num_g, IZ num_z)
 {
-   IM m(blockIdx.x * m_block_size + threadIdx.x);
-   IG g(blockIdx.y * g_block_size + threadIdx.y);
-   IZ z(blockIdx.z * z_block_size + threadIdx.z);
+   IM m(threadIdx.x);
+   IG g(blockIdx.y);
+   IZ z(blockIdx.x);
 
    if (m < num_m && g < num_g && z < num_z) {
      for (ID d(0); d < num_d; ++d ) {
@@ -60,14 +60,14 @@ __global__ void ltimes(PHI_VIEW phi, ELL_VIEW ell, PSI_VIEW psi,
    }
 }
 
-template < size_t m_block_size, size_t g_block_size, size_t z_block_size, typename Lambda >
-__launch_bounds__(m_block_size*g_block_size*z_block_size)
+template < size_t block_size, typename Lambda >
+__launch_bounds__(block_size)
 __global__ void ltimes_lam(IM num_m, IG num_g, IZ num_z,
                            Lambda body)
 {
-   IM m(blockIdx.x * m_block_size + threadIdx.x);
-   IG g(blockIdx.y * g_block_size + threadIdx.y);
-   IZ z(blockIdx.z * z_block_size + threadIdx.z);
+   IM m(threadIdx.x);
+   IG g(blockIdx.y);
+   IZ z(blockIdx.x);
 
    if (m < num_m && g < num_g && z < num_z) {
      body(z, g, m);
@@ -97,7 +97,7 @@ void LTIMES::runCudaVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       RPlaunchCudaKernel(
-        (ltimes<LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA>),
+        (ltimes<block_size>),
         nblocks, nthreads_per_block,
         shmem, res.get_stream(),
         phi, ell, psi,
@@ -123,8 +123,7 @@ void LTIMES::runCudaVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       RPlaunchCudaKernel(
-        (ltimes_lam<LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA,
-                    decltype(ltimes_lambda)>),
+        (ltimes_lam<block_size, decltype(ltimes_lambda)>),
         nblocks, nthreads_per_block,
         shmem, res.get_stream(),
         num_m, num_g, num_z,
@@ -139,10 +138,10 @@ void LTIMES::runCudaVariantImpl(VariantID vid)
 
       using EXEC_POL =
         RAJA::KernelPolicy<
-          RAJA::statement::CudaKernelFixedAsync<m_block_sz*g_block_sz*z_block_sz,
-            RAJA::statement::For<1, RAJA::cuda_global_size_z_direct<z_block_sz>,     //z
-              RAJA::statement::For<2, RAJA::cuda_global_size_y_direct<g_block_sz>,   //g
-                RAJA::statement::For<3, RAJA::cuda_global_size_x_direct<m_block_sz>, //m
+          RAJA::statement::CudaKernelAsync<
+            RAJA::statement::For<1, RAJA::cuda_block_x_loop, // z
+              RAJA::statement::For<2, RAJA::cuda_block_y_loop, // g
+                RAJA::statement::For<3, RAJA::cuda_thread_x_loop, // m
                   RAJA::statement::For<0, RAJA::seq_exec,           //d
                     RAJA::statement::Lambda<0>
                   >
@@ -174,29 +173,24 @@ void LTIMES::runCudaVariantImpl(VariantID vid)
 
       constexpr bool async = true;
 
-      using launch_policy = RAJA::LaunchPolicy<RAJA::cuda_launch_t<async, m_block_sz*g_block_sz*z_block_sz>>;
+      using launch_policy =
+          RAJA::LaunchPolicy<RAJA::cuda_launch_t<async, block_size>>;
 
-      using z_policy = RAJA::LoopPolicy<RAJA::cuda_global_size_z_loop<z_block_sz>>;
+      using z_policy = RAJA::LoopPolicy<RAJA::cuda_block_x_loop>;
 
-      using g_policy = RAJA::LoopPolicy<RAJA::cuda_global_size_y_loop<g_block_sz>>;
+      using g_policy = RAJA::LoopPolicy<RAJA::cuda_block_y_loop>;
 
-      using m_policy = RAJA::LoopPolicy<RAJA::cuda_global_size_x_loop<m_block_sz>>;
+      using m_policy = RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>;
 
       using d_policy = RAJA::LoopPolicy<RAJA::seq_exec>;
 
-      const size_t z_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_z, z_block_sz);
-
-      const size_t g_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_g, g_block_sz);
-
-      const size_t m_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_m, m_block_sz);
-
       startTimer();
       // Loop counter increment uses macro to quiet C++20 compiler warning
       for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) {
 
         RAJA::launch<launch_policy>( res,
-            RAJA::LaunchParams(RAJA::Teams(m_grid_sz, g_grid_sz, z_grid_sz),
-                               RAJA::Threads(m_block_sz, g_block_sz, z_block_sz)),
+            RAJA::LaunchParams(RAJA::Teams(*num_z, *num_g, 1),
+                               RAJA::Threads(block_size, 1, 1)),
             [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
 
               RAJA::loop<z_policy>(ctx, IZRange(0, *num_z),

diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp
@@ -27,9 +27,9 @@ using namespace ltimes_idx;
 //
 // Define thread block shape for Hip execution
 //
-#define m_block_sz (32)
-#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz))
-#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz))
+#define m_block_sz (block_size)
+#define g_block_sz (1)
+#define z_block_sz (1)
 
 #define LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \
   m_block_sz, g_block_sz, z_block_sz
@@ -38,19 +38,19 @@ using namespace ltimes_idx;
   dim3 nthreads_per_block(LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP);
 
 #define LTIMES_NBLOCKS_HIP \
-  dim3 nblocks(static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(*num_m, m_block_sz)), \
-               static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(*num_g, g_block_sz)), \
-               static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(*num_z, z_block_sz)));
+  dim3 nblocks(static_cast<size_t>(*num_z), \
+               static_cast<size_t>(*num_g), \
+               1);
 
 
-template < size_t m_block_size, size_t g_block_size, size_t z_block_size >
-__launch_bounds__(m_block_size*g_block_size*z_block_size)
+template < size_t block_size >
+__launch_bounds__(block_size)
 __global__ void ltimes(PHI_VIEW phi, ELL_VIEW ell, PSI_VIEW psi,
                        ID num_d, IM num_m, IG num_g, IZ num_z)
 {
-   IM m(blockIdx.x * m_block_size + threadIdx.x);
-   IG g(blockIdx.y * g_block_size + threadIdx.y);
-   IZ z(blockIdx.z * z_block_size + threadIdx.z);
+   IM m(threadIdx.x);
+   IG g(blockIdx.y);
+   IZ z(blockIdx.x);
 
    if (m < num_m && g < num_g && z < num_z) {
      for (ID d(0); d < num_d; ++d ) {
@@ -59,14 +59,14 @@ __global__ void ltimes(PHI_VIEW phi, ELL_VIEW ell, PSI_VIEW psi,
    }
 }
 
-template < size_t m_block_size, size_t g_block_size, size_t z_block_size, typename Lambda >
-__launch_bounds__(m_block_size*g_block_size*z_block_size)
+template < size_t block_size, typename Lambda >
+__launch_bounds__(block_size)
 __global__ void ltimes_lam(IM num_m, IG num_g, IZ num_z,
                            Lambda body)
 {
-   IM m(blockIdx.x * m_block_size + threadIdx.x);
-   IG g(blockIdx.y * g_block_size + threadIdx.y);
-   IZ z(blockIdx.z * z_block_size + threadIdx.z);
+   IM m(threadIdx.x);
+   IG g(blockIdx.y);
+   IZ z(blockIdx.x);
 
    if (m < num_m && g < num_g && z < num_z) {
      body(z, g, m);
@@ -96,7 +96,7 @@ void LTIMES::runHipVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       RPlaunchHipKernel(
-        (ltimes<LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP>),
+        (ltimes<block_size>),
         nblocks, nthreads_per_block,
         shmem, res.get_stream(),
         phi, ell, psi,
@@ -122,8 +122,7 @@ void LTIMES::runHipVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       RPlaunchHipKernel(
-        (ltimes_lam<LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP,
-                    decltype(ltimes_lambda)>),
+        (ltimes_lam<block_size, decltype(ltimes_lambda)>),
         nblocks, nthreads_per_block,
         shmem, res.get_stream(),
         num_m, num_g, num_z,
@@ -138,10 +137,10 @@ void LTIMES::runHipVariantImpl(VariantID vid)
 
       using EXEC_POL =
         RAJA::KernelPolicy<
-          RAJA::statement::HipKernelFixedAsync<m_block_sz*g_block_sz*z_block_sz,
-            RAJA::statement::For<1, RAJA::hip_global_size_z_direct<z_block_sz>,     //z
-              RAJA::statement::For<2, RAJA::hip_global_size_y_direct<g_block_sz>,   //g
-                RAJA::statement::For<3, RAJA::hip_global_size_x_direct<m_block_sz>, //m
+          RAJA::statement::HipKernelAsync<
+            RAJA::statement::For<1, RAJA::hip_block_x_loop, // z
+              RAJA::statement::For<2, RAJA::hip_block_y_loop, // g
+                RAJA::statement::For<3, RAJA::hip_thread_x_loop, // m
                   RAJA::statement::For<0, RAJA::seq_exec,          //d
                     RAJA::statement::Lambda<0>
                   >
@@ -173,29 +172,24 @@ void LTIMES::runHipVariantImpl(VariantID vid)
 
       constexpr bool async = true;
 
-      using launch_policy = RAJA::LaunchPolicy<RAJA::hip_launch_t<async, m_block_sz*g_block_sz*z_block_sz>>;
+      using launch_policy =
+          RAJA::LaunchPolicy<RAJA::hip_launch_t<async, block_size>>;
 
-      using z_policy = RAJA::LoopPolicy<RAJA::hip_global_size_z_loop<z_block_sz>>;
+      using z_policy = RAJA::LoopPolicy<RAJA::hip_block_x_loop>;
 
-      using g_policy = RAJA::LoopPolicy<RAJA::hip_global_size_y_loop<g_block_sz>>;
+      using g_policy = RAJA::LoopPolicy<RAJA::hip_block_y_loop>;
 
-      using m_policy = RAJA::LoopPolicy<RAJA::hip_global_size_x_loop<m_block_sz>>;
+      using m_policy = RAJA::LoopPolicy<RAJA::hip_thread_x_loop>;
 
       using d_policy = RAJA::LoopPolicy<RAJA::seq_exec>;
 
-      const size_t z_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_z, z_block_sz);
-
-      const size_t g_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_g, g_block_sz);
-
-      const size_t m_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_m, m_block_sz);
-
       startTimer();
       // Loop counter increment uses macro to quiet C++20 compiler warning
       for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) {
 
         RAJA::launch<launch_policy>( res,
-            RAJA::LaunchParams(RAJA::Teams(m_grid_sz, g_grid_sz, z_grid_sz),
-                               RAJA::Threads(m_block_sz, g_block_sz, z_block_sz)),
+            RAJA::LaunchParams(RAJA::Teams(*num_z, *num_g, 1),
+                               RAJA::Threads(block_size, 1, 1)),
             [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
 
               RAJA::loop<z_policy>(ctx, IZRange(0, *num_z),

diff --git a/src/apps/LTIMES-Sycl.cpp b/src/apps/LTIMES-Sycl.cpp
@@ -27,9 +27,9 @@ using namespace ltimes_idx;
 //
 // Define work-group shape for SYCL execution
 //
-#define m_wg_sz (32)
-#define g_wg_sz (integer::greater_of_squarest_factor_pair(work_group_size/m_wg_sz))
-#define z_wg_sz (integer::lesser_of_squarest_factor_pair(work_group_size/m_wg_sz))
+#define m_wg_sz (work_group_size)
+#define g_wg_sz (1)
+#define z_wg_sz (1)
 
 template <size_t work_group_size, size_t tune_idx >
 void LTIMES::runSyclVariantImpl(VariantID vid)
@@ -45,8 +45,8 @@ void LTIMES::runSyclVariantImpl(VariantID vid)
 
   if ( vid == Base_SYCL ) {
 
-    sycl::range<3> global_dim(z_wg_sz * RAJA_DIVIDE_CEILING_INT(*num_z, z_wg_sz),
-                              g_wg_sz * RAJA_DIVIDE_CEILING_INT(*num_g, g_wg_sz),
+    sycl::range<3> global_dim(*num_z,
+                              *num_g,
                               m_wg_sz * RAJA_DIVIDE_CEILING_INT(*num_m, m_wg_sz));
     sycl::range<3> wkgroup_dim(z_wg_sz, g_wg_sz, m_wg_sz);
 
@@ -124,9 +124,9 @@ void LTIMES::runSyclVariantImpl(VariantID vid)
 
       using d_policy = RAJA::LoopPolicy<RAJA::seq_exec>;
 
-      const size_t z_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_z, z_wg_sz);
+      const size_t z_grid_sz = *num_z;
 
-      const size_t g_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_g, g_wg_sz);
+      const size_t g_grid_sz = *num_g;
 
       const size_t m_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_m, m_wg_sz);
 

diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp
@@ -9,14 +9,16 @@
 
 ///
 /// LTIMES kernel reference implementation:
+/// Dependent on running in Kripke --layout ZGD
+/// https://github.com/llnl/Kripke/blob/develop/src/Kripke/Kernel/LTimes.cpp#L27
 ///
 /// for (Index_type z = 0; z < num_z; ++z ) {
 ///   for (Index_type g = 0; g < num_g; ++g ) {
 ///     for (Index_type m = 0; m < num_m; ++m ) {
 ///       for (Index_type d = 0; d < num_d; ++d ) {
 ///
 ///         phi[m+ (g * num_m) + (z * num_m * num_g)] +=
-///           ell[d+ (m * num_d)] * psi[d+ (g * num_d) + (z * num_d * num_g];
+///           ell[m+ (d * num_m)] * psi[d+ (g * num_d) + (z * num_d * num_g];
 ///
 ///       }
 ///     }
@@ -42,7 +44,7 @@
                      RAJA::as_array<RAJA::Perm<0, 1, 2> >::get() ) ); \
   ELL_VIEW ell(m_elldat, \
                RAJA::make_permuted_layout( {{*num_m, *num_d}}, \
-                     RAJA::as_array<RAJA::Perm<0, 1> >::get() ) ); \
+                     RAJA::as_array<RAJA::Perm<1, 0> >::get() ) ); \
   PHI_VIEW phi(m_phidat, \
                RAJA::make_permuted_layout( {{*num_z, *num_g, *num_m}}, \
                      RAJA::as_array<RAJA::Perm<0, 1, 2> >::get() ) );
@@ -76,7 +78,7 @@ namespace ltimes_idx {
                                    RAJA::Layout<3, Index_type, 2>,
                                    IZ, IG, ID>;
   using ELL_VIEW = RAJA::TypedView<Real_type,
-                                   RAJA::Layout<2, Index_type, 1>,
+                                   RAJA::Layout<2, Index_type, 0>,
                                    IM, ID>;
   using PHI_VIEW = RAJA::TypedView<Real_type,
                                    RAJA::Layout<3, Index_type, 2>,
@@ -114,17 +116,17 @@ class LTIMES : public KernelBase
   void runOpenMPVariant(VariantID vid);
   void runOpenMPTargetVariant(VariantID vid);
 
-  template < size_t block_size, size_t tune_idx >
+  template < size_t m_block_sz, size_t tune_idx >
   void runCudaVariantImpl(VariantID vid);
-  template < size_t block_size, size_t tune_idx >
+  template < size_t m_block_sz, size_t tune_idx >
   void runHipVariantImpl(VariantID vid);
   template < size_t work_group_size, size_t tune_idx >
   void runSyclVariantImpl(VariantID vid);
 
 private:
-  static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size,
-                                                         integer::MultipleOf<32>>;
+  static const size_t default_gpu_block_size = 25;
+  using gpu_block_sizes_type =
+      integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_phidat;
   Real_ptr m_elldat;