diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp
index 50692ed40..4de491bc9 100644
--- a/src/apps/LTIMES-Cuda.cpp
+++ b/src/apps/LTIMES-Cuda.cpp
@@ -27,9 +27,9 @@ using namespace ltimes_idx;
 //
 // Define thread block shape for CUDA execution
 //
-#define m_block_sz (32)
-#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz))
-#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz))
+#define m_block_sz (block_size)
+#define g_block_sz (1)
+#define z_block_sz (1)
 
 #define LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \
   m_block_sz, g_block_sz, z_block_sz
@@ -39,19 +39,19 @@ using namespace ltimes_idx;
   static_assert(m_block_sz*g_block_sz*z_block_sz == block_size, "Invalid block_size");
 
 #define LTIMES_NBLOCKS_CUDA \
-  dim3 nblocks(static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(*num_m, m_block_sz)), \
-               static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(*num_g, g_block_sz)), \
-               static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(*num_z, z_block_sz)));
+  dim3 nblocks(static_cast<size_t>(*num_z), \
+               static_cast<size_t>(*num_g), \
+               1);
 
 
-template < size_t m_block_size, size_t g_block_size, size_t z_block_size >
-__launch_bounds__(m_block_size*g_block_size*z_block_size)
+template < size_t block_size >
+__launch_bounds__(block_size)
 __global__ void ltimes(PHI_VIEW phi, ELL_VIEW ell, PSI_VIEW psi,
                        ID num_d, IM num_m, IG num_g, IZ num_z)
 {
-   IM m(blockIdx.x * m_block_size + threadIdx.x);
-   IG g(blockIdx.y * g_block_size + threadIdx.y);
-   IZ z(blockIdx.z * z_block_size + threadIdx.z);
+   IM m(threadIdx.x);
+   IG g(blockIdx.y);
+   IZ z(blockIdx.x);
 
    if (m < num_m && g < num_g && z < num_z) {
      for (ID d(0); d < num_d; ++d ) {
@@ -60,14 +60,14 @@ __global__ void ltimes(PHI_VIEW phi, ELL_VIEW ell, PSI_VIEW psi,
    }
 }
 
-template < size_t m_block_size, size_t g_block_size, size_t z_block_size, typename Lambda >
-__launch_bounds__(m_block_size*g_block_size*z_block_size)
+template < size_t block_size, typename Lambda >
+__launch_bounds__(block_size)
 __global__ void ltimes_lam(IM num_m, IG num_g, IZ num_z,
                            Lambda body)
 {
-   IM m(blockIdx.x * m_block_size + threadIdx.x);
-   IG g(blockIdx.y * g_block_size + threadIdx.y);
-   IZ z(blockIdx.z * z_block_size + threadIdx.z);
+   IM m(threadIdx.x);
+   IG g(blockIdx.y);
+   IZ z(blockIdx.x);
 
    if (m < num_m && g < num_g && z < num_z) {
      body(z, g, m);
@@ -97,7 +97,7 @@ void LTIMES::runCudaVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       RPlaunchCudaKernel(
-        (ltimes<LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA>),
+        (ltimes<block_size>),
         nblocks, nthreads_per_block,
         shmem, res.get_stream(),
         phi, ell, psi,
@@ -123,8 +123,7 @@ void LTIMES::runCudaVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       RPlaunchCudaKernel(
-        (ltimes_lam<LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA,
-                    decltype(ltimes_lambda)>),
+        (ltimes_lam<block_size, decltype(ltimes_lambda)>),
         nblocks, nthreads_per_block,
         shmem, res.get_stream(),
         num_m, num_g, num_z,
@@ -139,10 +138,10 @@ void LTIMES::runCudaVariantImpl(VariantID vid)
 
       using EXEC_POL =
         RAJA::KernelPolicy<
-          RAJA::statement::CudaKernelFixedAsync<m_block_sz*g_block_sz*z_block_sz,
-            RAJA::statement::For<1, RAJA::cuda_global_size_z_direct<z_block_sz>,     //z
-              RAJA::statement::For<2, RAJA::cuda_global_size_y_direct<g_block_sz>,   //g
-                RAJA::statement::For<3, RAJA::cuda_global_size_x_direct<m_block_sz>, //m
+          RAJA::statement::CudaKernelAsync<
+            RAJA::statement::For<1, RAJA::cuda_block_x_loop, // z
+              RAJA::statement::For<2, RAJA::cuda_block_y_loop, // g
+                RAJA::statement::For<3, RAJA::cuda_thread_x_loop, // m
                   RAJA::statement::For<0, RAJA::seq_exec,           //d
                     RAJA::statement::Lambda<0>
                   >
@@ -174,29 +173,24 @@ void LTIMES::runCudaVariantImpl(VariantID vid)
 
       constexpr bool async = true;
 
-      using launch_policy = RAJA::LaunchPolicy<RAJA::cuda_launch_t<async, m_block_sz*g_block_sz*z_block_sz>>;
+      using launch_policy =
+          RAJA::LaunchPolicy<RAJA::cuda_launch_t<async, block_size>>;
 
-      using z_policy = RAJA::LoopPolicy<RAJA::cuda_global_size_z_loop<z_block_sz>>;
+      using z_policy = RAJA::LoopPolicy<RAJA::cuda_block_x_loop>;
 
-      using g_policy = RAJA::LoopPolicy<RAJA::cuda_global_size_y_loop<g_block_sz>>;
+      using g_policy = RAJA::LoopPolicy<RAJA::cuda_block_y_loop>;
 
-      using m_policy = RAJA::LoopPolicy<RAJA::cuda_global_size_x_loop<m_block_sz>>;
+      using m_policy = RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>;
 
       using d_policy = RAJA::LoopPolicy<RAJA::seq_exec>;
 
-      const size_t z_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_z, z_block_sz);
-
-      const size_t g_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_g, g_block_sz);
-
-      const size_t m_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_m, m_block_sz);
-
       startTimer();
       // Loop counter increment uses macro to quiet C++20 compiler warning
       for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) {
 
         RAJA::launch<launch_policy>( res,
-            RAJA::LaunchParams(RAJA::Teams(m_grid_sz, g_grid_sz, z_grid_sz),
-                               RAJA::Threads(m_block_sz, g_block_sz, z_block_sz)),
+            RAJA::LaunchParams(RAJA::Teams(*num_z, *num_g, 1),
+                               RAJA::Threads(block_size, 1, 1)),
             [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
 
               RAJA::loop<z_policy>(ctx, IZRange(0, *num_z),
diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp
index e33665294..18ee89c7e 100644
--- a/src/apps/LTIMES-Hip.cpp
+++ b/src/apps/LTIMES-Hip.cpp
@@ -27,9 +27,9 @@ using namespace ltimes_idx;
 //
 // Define thread block shape for Hip execution
 //
-#define m_block_sz (32)
-#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz))
-#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz))
+#define m_block_sz (block_size)
+#define g_block_sz (1)
+#define z_block_sz (1)
 
 #define LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \
   m_block_sz, g_block_sz, z_block_sz
@@ -38,19 +38,19 @@ using namespace ltimes_idx;
   dim3 nthreads_per_block(LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP);
 
 #define LTIMES_NBLOCKS_HIP \
-  dim3 nblocks(static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(*num_m, m_block_sz)), \
-               static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(*num_g, g_block_sz)), \
-               static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(*num_z, z_block_sz)));
+  dim3 nblocks(static_cast<size_t>(*num_z), \
+               static_cast<size_t>(*num_g), \
+               1);
 
 
-template < size_t m_block_size, size_t g_block_size, size_t z_block_size >
-__launch_bounds__(m_block_size*g_block_size*z_block_size)
+template < size_t block_size >
+__launch_bounds__(block_size)
 __global__ void ltimes(PHI_VIEW phi, ELL_VIEW ell, PSI_VIEW psi,
                        ID num_d, IM num_m, IG num_g, IZ num_z)
 {
-   IM m(blockIdx.x * m_block_size + threadIdx.x);
-   IG g(blockIdx.y * g_block_size + threadIdx.y);
-   IZ z(blockIdx.z * z_block_size + threadIdx.z);
+   IM m(threadIdx.x);
+   IG g(blockIdx.y);
+   IZ z(blockIdx.x);
 
    if (m < num_m && g < num_g && z < num_z) {
      for (ID d(0); d < num_d; ++d ) {
@@ -59,14 +59,14 @@ __global__ void ltimes(PHI_VIEW phi, ELL_VIEW ell, PSI_VIEW psi,
    }
 }
 
-template < size_t m_block_size, size_t g_block_size, size_t z_block_size, typename Lambda >
-__launch_bounds__(m_block_size*g_block_size*z_block_size)
+template < size_t block_size, typename Lambda >
+__launch_bounds__(block_size)
 __global__ void ltimes_lam(IM num_m, IG num_g, IZ num_z,
                            Lambda body)
 {
-   IM m(blockIdx.x * m_block_size + threadIdx.x);
-   IG g(blockIdx.y * g_block_size + threadIdx.y);
-   IZ z(blockIdx.z * z_block_size + threadIdx.z);
+   IM m(threadIdx.x);
+   IG g(blockIdx.y);
+   IZ z(blockIdx.x);
 
    if (m < num_m && g < num_g && z < num_z) {
      body(z, g, m);
@@ -96,7 +96,7 @@ void LTIMES::runHipVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       RPlaunchHipKernel(
-        (ltimes<LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP>),
+        (ltimes<block_size>),
         nblocks, nthreads_per_block,
         shmem, res.get_stream(),
         phi, ell, psi,
@@ -122,8 +122,7 @@ void LTIMES::runHipVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       RPlaunchHipKernel(
-        (ltimes_lam<LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP,
-                    decltype(ltimes_lambda)>),
+        (ltimes_lam<block_size, decltype(ltimes_lambda)>),
         nblocks, nthreads_per_block,
         shmem, res.get_stream(),
         num_m, num_g, num_z,
@@ -138,10 +137,10 @@ void LTIMES::runHipVariantImpl(VariantID vid)
 
       using EXEC_POL =
         RAJA::KernelPolicy<
-          RAJA::statement::HipKernelFixedAsync<m_block_sz*g_block_sz*z_block_sz,
-            RAJA::statement::For<1, RAJA::hip_global_size_z_direct<z_block_sz>,     //z
-              RAJA::statement::For<2, RAJA::hip_global_size_y_direct<g_block_sz>,   //g
-                RAJA::statement::For<3, RAJA::hip_global_size_x_direct<m_block_sz>, //m
+          RAJA::statement::HipKernelAsync<
+            RAJA::statement::For<1, RAJA::hip_block_x_loop, // z
+              RAJA::statement::For<2, RAJA::hip_block_y_loop, // g
+                RAJA::statement::For<3, RAJA::hip_thread_x_loop, // m
                   RAJA::statement::For<0, RAJA::seq_exec,          //d
                     RAJA::statement::Lambda<0>
                   >
@@ -173,29 +172,24 @@ void LTIMES::runHipVariantImpl(VariantID vid)
 
       constexpr bool async = true;
 
-      using launch_policy = RAJA::LaunchPolicy<RAJA::hip_launch_t<async, m_block_sz*g_block_sz*z_block_sz>>;
+      using launch_policy =
+          RAJA::LaunchPolicy<RAJA::hip_launch_t<async, block_size>>;
 
-      using z_policy = RAJA::LoopPolicy<RAJA::hip_global_size_z_loop<z_block_sz>>;
+      using z_policy = RAJA::LoopPolicy<RAJA::hip_block_x_loop>;
 
-      using g_policy = RAJA::LoopPolicy<RAJA::hip_global_size_y_loop<g_block_sz>>;
+      using g_policy = RAJA::LoopPolicy<RAJA::hip_block_y_loop>;
 
-      using m_policy = RAJA::LoopPolicy<RAJA::hip_global_size_x_loop<m_block_sz>>;
+      using m_policy = RAJA::LoopPolicy<RAJA::hip_thread_x_loop>;
 
       using d_policy = RAJA::LoopPolicy<RAJA::seq_exec>;
 
-      const size_t z_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_z, z_block_sz);
-
-      const size_t g_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_g, g_block_sz);
-
-      const size_t m_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_m, m_block_sz);
-
       startTimer();
       // Loop counter increment uses macro to quiet C++20 compiler warning
       for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) {
 
         RAJA::launch<launch_policy>( res,
-            RAJA::LaunchParams(RAJA::Teams(m_grid_sz, g_grid_sz, z_grid_sz),
-                               RAJA::Threads(m_block_sz, g_block_sz, z_block_sz)),
+            RAJA::LaunchParams(RAJA::Teams(*num_z, *num_g, 1),
+                               RAJA::Threads(block_size, 1, 1)),
             [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
 
               RAJA::loop<z_policy>(ctx, IZRange(0, *num_z),
diff --git a/src/apps/LTIMES-Sycl.cpp b/src/apps/LTIMES-Sycl.cpp
index 81e5c7117..4c167858a 100644
--- a/src/apps/LTIMES-Sycl.cpp
+++ b/src/apps/LTIMES-Sycl.cpp
@@ -27,9 +27,9 @@ using namespace ltimes_idx;
 //
 // Define work-group shape for SYCL execution
 //
-#define m_wg_sz (32)
-#define g_wg_sz (integer::greater_of_squarest_factor_pair(work_group_size/m_wg_sz))
-#define z_wg_sz (integer::lesser_of_squarest_factor_pair(work_group_size/m_wg_sz))
+#define m_wg_sz (work_group_size)
+#define g_wg_sz (1)
+#define z_wg_sz (1)
 
 template <size_t work_group_size, size_t tune_idx >
 void LTIMES::runSyclVariantImpl(VariantID vid)
@@ -45,8 +45,8 @@ void LTIMES::runSyclVariantImpl(VariantID vid)
 
   if ( vid == Base_SYCL ) {
 
-    sycl::range<3> global_dim(z_wg_sz * RAJA_DIVIDE_CEILING_INT(*num_z, z_wg_sz),
-                              g_wg_sz * RAJA_DIVIDE_CEILING_INT(*num_g, g_wg_sz),
+    sycl::range<3> global_dim(*num_z,
+                              *num_g,
                               m_wg_sz * RAJA_DIVIDE_CEILING_INT(*num_m, m_wg_sz));
     sycl::range<3> wkgroup_dim(z_wg_sz, g_wg_sz, m_wg_sz);
 
@@ -124,9 +124,9 @@ void LTIMES::runSyclVariantImpl(VariantID vid)
 
       using d_policy = RAJA::LoopPolicy<RAJA::seq_exec>;
 
-      const size_t z_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_z, z_wg_sz);
+      const size_t z_grid_sz = *num_z;
 
-      const size_t g_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_g, g_wg_sz);
+      const size_t g_grid_sz = *num_g;
 
       const size_t m_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_m, m_wg_sz);
 
diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp
index 8c6ebed7c..df4f901d7 100644
--- a/src/apps/LTIMES.hpp
+++ b/src/apps/LTIMES.hpp
@@ -9,6 +9,8 @@
 
 ///
 /// LTIMES kernel reference implementation:
+/// Dependent on running in Kripke --layout ZGD
+/// https://github.com/llnl/Kripke/blob/develop/src/Kripke/Kernel/LTimes.cpp#L27
 ///
 /// for (Index_type z = 0; z < num_z; ++z ) {
 ///   for (Index_type g = 0; g < num_g; ++g ) {
@@ -16,7 +18,7 @@
 ///       for (Index_type d = 0; d < num_d; ++d ) {
 ///
 ///         phi[m+ (g * num_m) + (z * num_m * num_g)] +=
-///           ell[d+ (m * num_d)] * psi[d+ (g * num_d) + (z * num_d * num_g];
+///           ell[m+ (d * num_m)] * psi[d+ (g * num_d) + (z * num_d * num_g];
 ///
 ///       }
 ///     }
@@ -42,7 +44,7 @@
                      RAJA::as_array<RAJA::Perm<0, 1, 2> >::get() ) ); \
   ELL_VIEW ell(m_elldat, \
                RAJA::make_permuted_layout( {{*num_m, *num_d}}, \
-                     RAJA::as_array<RAJA::Perm<0, 1> >::get() ) ); \
+                     RAJA::as_array<RAJA::Perm<1, 0> >::get() ) ); \
   PHI_VIEW phi(m_phidat, \
                RAJA::make_permuted_layout( {{*num_z, *num_g, *num_m}}, \
                      RAJA::as_array<RAJA::Perm<0, 1, 2> >::get() ) );
@@ -76,7 +78,7 @@ namespace ltimes_idx {
                                    RAJA::Layout<3, Index_type, 2>,
                                    IZ, IG, ID>;
   using ELL_VIEW = RAJA::TypedView<Real_type,
-                                   RAJA::Layout<2, Index_type, 1>,
+                                   RAJA::Layout<2, Index_type, 0>,
                                    IM, ID>;
   using PHI_VIEW = RAJA::TypedView<Real_type,
                                    RAJA::Layout<3, Index_type, 2>,
@@ -114,17 +116,17 @@ class LTIMES : public KernelBase
   void runOpenMPVariant(VariantID vid);
   void runOpenMPTargetVariant(VariantID vid);
 
-  template < size_t block_size, size_t tune_idx >
+  template < size_t m_block_sz, size_t tune_idx >
   void runCudaVariantImpl(VariantID vid);
-  template < size_t block_size, size_t tune_idx >
+  template < size_t m_block_sz, size_t tune_idx >
   void runHipVariantImpl(VariantID vid);
   template < size_t work_group_size, size_t tune_idx >
   void runSyclVariantImpl(VariantID vid);
 
 private:
-  static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size,
-                                                         integer::MultipleOf<32>>;
+  static const size_t default_gpu_block_size = 25;
+  using gpu_block_sizes_type =
+      integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_phidat;
   Real_ptr m_elldat;
diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp
index 7c8ab94b8..bca1b9227 100644
--- a/src/apps/LTIMES_NOVIEW-Cuda.cpp
+++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp
@@ -25,31 +25,32 @@ namespace apps
 //
 // Define thread block shape for CUDA execution
 //
-#define m_block_sz (32)
-#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz))
-#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz))
+#define m_block_sz (block_size)
+#define g_block_sz (1)
+#define z_block_sz (1)
 
 #define LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \
   m_block_sz, g_block_sz, z_block_sz
 
 #define LTIMES_NOVIEW_THREADS_PER_BLOCK_CUDA \
-  dim3 nthreads_per_block(LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA);
+  dim3 nthreads_per_block(LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA); \
+  static_assert(m_block_sz*g_block_sz*z_block_sz == block_size, "Invalid block_size");
 
 #define LTIMES_NOVIEW_NBLOCKS_CUDA \
-  dim3 nblocks(static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(num_m, m_block_sz)), \
-               static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(num_g, g_block_sz)), \
-               static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(num_z, z_block_sz)));
+  dim3 nblocks(static_cast<size_t>(num_z), \
+               static_cast<size_t>(num_g), \
+               1);
 
 
-template < size_t m_block_size, size_t g_block_size, size_t z_block_size >
-__launch_bounds__(m_block_size*g_block_size*z_block_size)
+template < size_t block_size >
+__launch_bounds__(block_size)
 __global__ void ltimes_noview(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat,
                               Index_type num_d,
                               Index_type num_m, Index_type num_g, Index_type num_z)
 {
-   Index_type m = blockIdx.x * m_block_size + threadIdx.x;
-   Index_type g = blockIdx.y * g_block_size + threadIdx.y;
-   Index_type z = blockIdx.z * z_block_size + threadIdx.z;
+   Index_type m = threadIdx.x;
+   Index_type g = blockIdx.y;
+   Index_type z = blockIdx.x;
 
    if (m < num_m && g < num_g && z < num_z) {
      for (Index_type d = 0; d < num_d; ++d ) {
@@ -58,14 +59,14 @@ __global__ void ltimes_noview(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat,
    }
 }
 
-template < size_t m_block_size, size_t g_block_size, size_t z_block_size, typename Lambda >
-__launch_bounds__(m_block_size*g_block_size*z_block_size)
+template < size_t block_size, typename Lambda >
+__launch_bounds__(block_size)
 __global__ void ltimes_noview_lam(Index_type num_m, Index_type num_g, Index_type num_z,
                                   Lambda body)
 {
-   Index_type m = blockIdx.x * m_block_size + threadIdx.x;
-   Index_type g = blockIdx.y * g_block_size + threadIdx.y;
-   Index_type z = blockIdx.z * z_block_size + threadIdx.z;
+   Index_type m = threadIdx.x;
+   Index_type g = blockIdx.y;
+   Index_type z = blockIdx.x;
 
    if (m < num_m && g < num_g && z < num_z) {
      body(z, g, m);
@@ -95,7 +96,7 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       RPlaunchCudaKernel( 
-        (ltimes_noview<LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA>),
+        (ltimes_noview<block_size>),
         nblocks, nthreads_per_block,
         shmem, res.get_stream(),
         phidat, elldat, psidat,
@@ -122,8 +123,7 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       RPlaunchCudaKernel( 
-        (ltimes_noview_lam<LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA,
-                           decltype(ltimes_noview_lambda)>),
+        (ltimes_noview_lam<block_size, decltype(ltimes_noview_lambda)>),
         nblocks, nthreads_per_block,
         shmem, res.get_stream(),
         num_m, num_g, num_z,
@@ -138,10 +138,10 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid)
 
       using EXEC_POL =
         RAJA::KernelPolicy<
-          RAJA::statement::CudaKernelFixedAsync<m_block_sz*g_block_sz*z_block_sz,
-            RAJA::statement::For<1, RAJA::cuda_global_size_z_direct<z_block_sz>,     //z
-              RAJA::statement::For<2, RAJA::cuda_global_size_y_direct<g_block_sz>,   //g
-                RAJA::statement::For<3, RAJA::cuda_global_size_x_direct<m_block_sz>, //m
+          RAJA::statement::CudaKernelAsync<
+            RAJA::statement::For<1, RAJA::cuda_block_x_loop, // z
+              RAJA::statement::For<2, RAJA::cuda_block_y_loop, // g
+                RAJA::statement::For<3, RAJA::cuda_thread_x_loop, // m
                   RAJA::statement::For<0, RAJA::seq_exec,           //d
                     RAJA::statement::Lambda<0>
                   >
@@ -174,29 +174,24 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid)
 
       constexpr bool async = true;
 
-      using launch_policy = RAJA::LaunchPolicy<RAJA::cuda_launch_t<async, m_block_sz*g_block_sz*z_block_sz>>;
+      using launch_policy =
+          RAJA::LaunchPolicy<RAJA::cuda_launch_t<async, block_size>>;
 
-      using z_policy = RAJA::LoopPolicy<RAJA::cuda_global_size_z_loop<z_block_sz>>;
+      using z_policy = RAJA::LoopPolicy<RAJA::cuda_block_x_loop>;
 
-      using g_policy = RAJA::LoopPolicy<RAJA::cuda_global_size_y_loop<g_block_sz>>;
+      using g_policy = RAJA::LoopPolicy<RAJA::cuda_block_y_loop>;
 
-      using m_policy = RAJA::LoopPolicy<RAJA::cuda_global_size_x_loop<m_block_sz>>;
+      using m_policy = RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>;
 
       using d_policy = RAJA::LoopPolicy<RAJA::seq_exec>;
 
-      const size_t z_grid_sz = RAJA_DIVIDE_CEILING_INT(num_z, z_block_sz);
-
-      const size_t g_grid_sz = RAJA_DIVIDE_CEILING_INT(num_g, g_block_sz);
-
-      const size_t m_grid_sz = RAJA_DIVIDE_CEILING_INT(num_m, m_block_sz);
-
       startTimer();
       // Loop counter increment uses macro to quiet C++20 compiler warning
       for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) {
 
         RAJA::launch<launch_policy>( res,
-            RAJA::LaunchParams(RAJA::Teams(m_grid_sz, g_grid_sz, z_grid_sz),
-                               RAJA::Threads(m_block_sz, g_block_sz, z_block_sz)),
+            RAJA::LaunchParams(RAJA::Teams(num_z, num_g, 1),
+                               RAJA::Threads(block_size, 1, 1)),
             [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
 
               RAJA::loop<z_policy>(ctx, RAJA::RangeSegment(0, num_z),
diff --git a/src/apps/LTIMES_NOVIEW-Hip.cpp b/src/apps/LTIMES_NOVIEW-Hip.cpp
index e222dce50..f530c6798 100644
--- a/src/apps/LTIMES_NOVIEW-Hip.cpp
+++ b/src/apps/LTIMES_NOVIEW-Hip.cpp
@@ -25,31 +25,32 @@ namespace apps
 //
 // Define thread block shape for Hip execution
 //
-#define m_block_sz (32)
-#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz))
-#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz))
+#define m_block_sz (block_size)
+#define g_block_sz (1)
+#define z_block_sz (1)
 
 #define LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \
   m_block_sz, g_block_sz, z_block_sz
 
 #define LTIMES_NOVIEW_THREADS_PER_BLOCK_HIP \
-  dim3 nthreads_per_block(LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP);
+  dim3 nthreads_per_block(LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP); \
+  static_assert(m_block_sz*g_block_sz*z_block_sz == block_size, "Invalid block_size");
 
 #define LTIMES_NOVIEW_NBLOCKS_HIP \
-  dim3 nblocks(static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(num_m, m_block_sz)), \
-               static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(num_g, g_block_sz)), \
-               static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(num_z, z_block_sz)));
+  dim3 nblocks(static_cast<size_t>(num_z), \
+               static_cast<size_t>(num_g), \
+               1);
 
 
-template < size_t m_block_size, size_t g_block_size, size_t z_block_size >
-__launch_bounds__(m_block_size*g_block_size*z_block_size)
+template < size_t block_size >
+__launch_bounds__(block_size)
 __global__ void ltimes_noview(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat,
                               Index_type num_d,
                               Index_type num_m, Index_type num_g, Index_type num_z)
 {
-   Index_type m = blockIdx.x * m_block_size + threadIdx.x;
-   Index_type g = blockIdx.y * g_block_size + threadIdx.y;
-   Index_type z = blockIdx.z * z_block_size + threadIdx.z;
+   Index_type m = threadIdx.x;
+   Index_type g = blockIdx.y;
+   Index_type z = blockIdx.x;
 
    if (m < num_m && g < num_g && z < num_z) {
      for (Index_type d = 0; d < num_d; ++d ) {
@@ -58,14 +59,14 @@ __global__ void ltimes_noview(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat,
    }
 }
 
-template < size_t m_block_size, size_t g_block_size, size_t z_block_size, typename Lambda >
-__launch_bounds__(m_block_size*g_block_size*z_block_size)
+template < size_t block_size, typename Lambda >
+__launch_bounds__(block_size)
 __global__ void ltimes_noview_lam(Index_type num_m, Index_type num_g, Index_type num_z,
                                   Lambda body)
 {
-   Index_type m = blockIdx.x * m_block_size + threadIdx.x;
-   Index_type g = blockIdx.y * g_block_size + threadIdx.y;
-   Index_type z = blockIdx.z * z_block_size + threadIdx.z;
+   Index_type m = threadIdx.x;
+   Index_type g = blockIdx.y;
+   Index_type z = blockIdx.x;
 
    if (m < num_m && g < num_g && z < num_z) {
      body(z, g, m);
@@ -95,7 +96,7 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       RPlaunchHipKernel(
-        (ltimes_noview<LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP>),
+        (ltimes_noview<block_size>),
         nblocks, nthreads_per_block,
         shmem, res.get_stream(),
         phidat, elldat, psidat,
@@ -122,8 +123,7 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid)
       constexpr size_t shmem = 0;
 
       RPlaunchHipKernel(
-        (ltimes_noview_lam<LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP,
-                           decltype(ltimes_noview_lambda)>),
+        (ltimes_noview_lam<block_size, decltype(ltimes_noview_lambda)>),
         nblocks, nthreads_per_block,
         shmem, res.get_stream(),
         num_m, num_g, num_z,
@@ -138,10 +138,10 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid)
 
       using EXEC_POL =
         RAJA::KernelPolicy<
-          RAJA::statement::HipKernelFixedAsync<m_block_sz*g_block_sz*z_block_sz,
-            RAJA::statement::For<1, RAJA::hip_global_size_z_direct<z_block_sz>,     //z
-              RAJA::statement::For<2, RAJA::hip_global_size_y_direct<g_block_sz>,   //g
-                RAJA::statement::For<3, RAJA::hip_global_size_x_direct<m_block_sz>, //m
+          RAJA::statement::HipKernelAsync<
+            RAJA::statement::For<1, RAJA::hip_block_x_loop, // z
+              RAJA::statement::For<2, RAJA::hip_block_y_loop, // g
+                RAJA::statement::For<3, RAJA::hip_thread_x_loop, // m
                   RAJA::statement::For<0, RAJA::seq_exec,          //d
                     RAJA::statement::Lambda<0>
                   >
@@ -174,29 +174,24 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid)
 
       constexpr bool async = true;
 
-      using launch_policy = RAJA::LaunchPolicy<RAJA::hip_launch_t<async, m_block_sz*g_block_sz*z_block_sz>>;
+      using launch_policy =
+          RAJA::LaunchPolicy<RAJA::hip_launch_t<async, block_size>>;
 
-      using z_policy = RAJA::LoopPolicy<RAJA::hip_global_size_z_loop<z_block_sz>>;
+      using z_policy = RAJA::LoopPolicy<RAJA::hip_block_x_loop>;
 
-      using g_policy = RAJA::LoopPolicy<RAJA::hip_global_size_y_loop<g_block_sz>>;
+      using g_policy = RAJA::LoopPolicy<RAJA::hip_block_y_loop>;
 
-      using m_policy = RAJA::LoopPolicy<RAJA::hip_global_size_x_loop<m_block_sz>>;
+      using m_policy = RAJA::LoopPolicy<RAJA::hip_thread_x_loop>;
 
       using d_policy = RAJA::LoopPolicy<RAJA::seq_exec>;
 
-      const size_t z_grid_sz = RAJA_DIVIDE_CEILING_INT(num_z, z_block_sz);
-
-      const size_t g_grid_sz = RAJA_DIVIDE_CEILING_INT(num_g, g_block_sz);
-
-      const size_t m_grid_sz = RAJA_DIVIDE_CEILING_INT(num_m, m_block_sz);
-
       startTimer();
       // Loop counter increment uses macro to quiet C++20 compiler warning
       for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) {
 
         RAJA::launch<launch_policy>( res,
-            RAJA::LaunchParams(RAJA::Teams(m_grid_sz, g_grid_sz, z_grid_sz),
-                               RAJA::Threads(m_block_sz, g_block_sz, z_block_sz)),
+            RAJA::LaunchParams(RAJA::Teams(num_z, num_g, 1),
+                               RAJA::Threads(block_size, 1, 1)),
             [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
 
               RAJA::loop<z_policy>(ctx, RAJA::RangeSegment(0, num_z),
diff --git a/src/apps/LTIMES_NOVIEW-Sycl.cpp b/src/apps/LTIMES_NOVIEW-Sycl.cpp
index 921790d7a..10548ad15 100644
--- a/src/apps/LTIMES_NOVIEW-Sycl.cpp
+++ b/src/apps/LTIMES_NOVIEW-Sycl.cpp
@@ -25,9 +25,9 @@ namespace apps
 //
 // Define work-group shape for SYCL execution
 //
-#define m_wg_sz (32)
-#define g_wg_sz (integer::greater_of_squarest_factor_pair(work_group_size/m_wg_sz))
-#define z_wg_sz (integer::lesser_of_squarest_factor_pair(work_group_size/m_wg_sz))
+#define m_wg_sz (work_group_size)
+#define g_wg_sz (1)
+#define z_wg_sz (1)
 
 template <size_t work_group_size, size_t tune_idx >
 void LTIMES_NOVIEW::runSyclVariantImpl(VariantID vid)
@@ -43,8 +43,8 @@ void LTIMES_NOVIEW::runSyclVariantImpl(VariantID vid)
 
   if ( vid == Base_SYCL ) {
 
-    sycl::range<3> global_dim(z_wg_sz * RAJA_DIVIDE_CEILING_INT(num_z, z_wg_sz),
-                              g_wg_sz * RAJA_DIVIDE_CEILING_INT(num_g, g_wg_sz),
+    sycl::range<3> global_dim(num_z,
+                              num_g,
                               m_wg_sz * RAJA_DIVIDE_CEILING_INT(num_m, m_wg_sz));
     sycl::range<3> wkgroup_dim(z_wg_sz, g_wg_sz, m_wg_sz);
 
@@ -122,9 +122,9 @@ void LTIMES_NOVIEW::runSyclVariantImpl(VariantID vid)
 
       using d_policy = RAJA::LoopPolicy<RAJA::seq_exec>;
 
-      const size_t z_grid_sz = RAJA_DIVIDE_CEILING_INT(num_z, z_wg_sz);
+      const size_t z_grid_sz = num_z;
 
-      const size_t g_grid_sz = RAJA_DIVIDE_CEILING_INT(num_g, g_wg_sz);
+      const size_t g_grid_sz = num_g;
 
       const size_t m_grid_sz = RAJA_DIVIDE_CEILING_INT(num_m, m_wg_sz);
 
diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp
index 99b75c9de..2b064d97a 100644
--- a/src/apps/LTIMES_NOVIEW.hpp
+++ b/src/apps/LTIMES_NOVIEW.hpp
@@ -10,7 +10,7 @@
 ///
 /// LTIMES_NOVIEW kernel reference implementation:
 /// actual order of phi is gmz
-/// actual order of ell is md
+/// actual order of ell is dm
 /// actual order of psi is gdz
 /// for (Index_type z = 0; z < num_z; ++z ) {
 ///   for (Index_type g = 0; g < num_g; ++g ) {
@@ -18,7 +18,7 @@
 ///       for (Index_type d = 0; d < num_d; ++d ) {
 ///
 ///         phi[m+ (g * num_m) + (z * num_m * num_g)] +=
-///           ell[d+ (m * num_d)] * psi[d+ (g * num_d) + (z * num_d * num_g];
+///           ell[m+ (d * num_m)] * psi[d+ (g * num_d) + (z * num_d * num_g];
 ///
 ///       }
 ///     }
@@ -41,7 +41,7 @@
 
 #define LTIMES_NOVIEW_BODY \
   phidat[m+ (g * num_m) + (z * num_m * num_g)] += \
-    elldat[d+ (m * num_d)] * psidat[d+ (g * num_d) + (z * num_d * num_g)];
+    elldat[m+ (d * num_m)] * psidat[d+ (g * num_d) + (z * num_d * num_g)];
 
 #include "common/KernelBase.hpp"
 
@@ -86,9 +86,9 @@ class LTIMES_NOVIEW : public KernelBase
   void runSyclVariantImpl(VariantID vid);
 
 private:
-  static const size_t default_gpu_block_size = 256;
-  using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size,
-                                                         integer::MultipleOf<32>>;
+  static const size_t default_gpu_block_size = 25;
+  using gpu_block_sizes_type =
+      integer::make_gpu_block_size_list_type<default_gpu_block_size>;
 
   Real_ptr m_phidat;
   Real_ptr m_elldat;