diff --git a/src/apps/LTIMES-Cuda.cpp b/src/apps/LTIMES-Cuda.cpp index 50692ed40..4de491bc9 100644 --- a/src/apps/LTIMES-Cuda.cpp +++ b/src/apps/LTIMES-Cuda.cpp @@ -27,9 +27,9 @@ using namespace ltimes_idx; // // Define thread block shape for CUDA execution // -#define m_block_sz (32) -#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz)) -#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz)) +#define m_block_sz (block_size) +#define g_block_sz (1) +#define z_block_sz (1) #define LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ m_block_sz, g_block_sz, z_block_sz @@ -39,19 +39,19 @@ using namespace ltimes_idx; static_assert(m_block_sz*g_block_sz*z_block_sz == block_size, "Invalid block_size"); #define LTIMES_NBLOCKS_CUDA \ - dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(*num_m, m_block_sz)), \ - static_cast(RAJA_DIVIDE_CEILING_INT(*num_g, g_block_sz)), \ - static_cast(RAJA_DIVIDE_CEILING_INT(*num_z, z_block_sz))); + dim3 nblocks(static_cast(*num_z), \ + static_cast(*num_g), \ + 1); -template < size_t m_block_size, size_t g_block_size, size_t z_block_size > -__launch_bounds__(m_block_size*g_block_size*z_block_size) +template < size_t block_size > +__launch_bounds__(block_size) __global__ void ltimes(PHI_VIEW phi, ELL_VIEW ell, PSI_VIEW psi, ID num_d, IM num_m, IG num_g, IZ num_z) { - IM m(blockIdx.x * m_block_size + threadIdx.x); - IG g(blockIdx.y * g_block_size + threadIdx.y); - IZ z(blockIdx.z * z_block_size + threadIdx.z); + IM m(threadIdx.x); + IG g(blockIdx.y); + IZ z(blockIdx.x); if (m < num_m && g < num_g && z < num_z) { for (ID d(0); d < num_d; ++d ) { @@ -60,14 +60,14 @@ __global__ void ltimes(PHI_VIEW phi, ELL_VIEW ell, PSI_VIEW psi, } } -template < size_t m_block_size, size_t g_block_size, size_t z_block_size, typename Lambda > -__launch_bounds__(m_block_size*g_block_size*z_block_size) +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) __global__ void ltimes_lam(IM num_m, IG num_g, IZ num_z, Lambda body) { - IM m(blockIdx.x * m_block_size + threadIdx.x); - IG g(blockIdx.y * g_block_size + threadIdx.y); - IZ z(blockIdx.z * z_block_size + threadIdx.z); + IM m(threadIdx.x); + IG g(blockIdx.y); + IZ z(blockIdx.x); if (m < num_m && g < num_g && z < num_z) { body(z, g, m); @@ -97,7 +97,7 @@ void LTIMES::runCudaVariantImpl(VariantID vid) constexpr size_t shmem = 0; RPlaunchCudaKernel( - (ltimes), + (ltimes), nblocks, nthreads_per_block, shmem, res.get_stream(), phi, ell, psi, @@ -123,8 +123,7 @@ void LTIMES::runCudaVariantImpl(VariantID vid) constexpr size_t shmem = 0; RPlaunchCudaKernel( - (ltimes_lam), + (ltimes_lam), nblocks, nthreads_per_block, shmem, res.get_stream(), num_m, num_g, num_z, @@ -139,10 +138,10 @@ void LTIMES::runCudaVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< - RAJA::statement::CudaKernelFixedAsync, //z - RAJA::statement::For<2, RAJA::cuda_global_size_y_direct, //g - RAJA::statement::For<3, RAJA::cuda_global_size_x_direct, //m + RAJA::statement::CudaKernelAsync< + RAJA::statement::For<1, RAJA::cuda_block_x_loop, // z + RAJA::statement::For<2, RAJA::cuda_block_y_loop, // g + RAJA::statement::For<3, RAJA::cuda_thread_x_loop, // m RAJA::statement::For<0, RAJA::seq_exec, //d RAJA::statement::Lambda<0> > @@ -174,29 +173,24 @@ void LTIMES::runCudaVariantImpl(VariantID vid) constexpr bool async = true; - using launch_policy = RAJA::LaunchPolicy>; + using launch_policy = + RAJA::LaunchPolicy>; - using z_policy = RAJA::LoopPolicy>; + using z_policy = RAJA::LoopPolicy; - using g_policy = RAJA::LoopPolicy>; + using g_policy = RAJA::LoopPolicy; - using m_policy = RAJA::LoopPolicy>; + using m_policy = RAJA::LoopPolicy; using d_policy = RAJA::LoopPolicy; - const size_t z_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_z, z_block_sz); - - const size_t g_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_g, g_block_sz); - - const size_t m_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_m, m_block_sz); - startTimer(); // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { RAJA::launch( res, - RAJA::LaunchParams(RAJA::Teams(m_grid_sz, g_grid_sz, z_grid_sz), - RAJA::Threads(m_block_sz, g_block_sz, z_block_sz)), + RAJA::LaunchParams(RAJA::Teams(*num_z, *num_g, 1), + RAJA::Threads(block_size, 1, 1)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, IZRange(0, *num_z), diff --git a/src/apps/LTIMES-Hip.cpp b/src/apps/LTIMES-Hip.cpp index e33665294..18ee89c7e 100644 --- a/src/apps/LTIMES-Hip.cpp +++ b/src/apps/LTIMES-Hip.cpp @@ -27,9 +27,9 @@ using namespace ltimes_idx; // // Define thread block shape for Hip execution // -#define m_block_sz (32) -#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz)) -#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz)) +#define m_block_sz (block_size) +#define g_block_sz (1) +#define z_block_sz (1) #define LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ m_block_sz, g_block_sz, z_block_sz @@ -38,19 +38,19 @@ using namespace ltimes_idx; dim3 nthreads_per_block(LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP); #define LTIMES_NBLOCKS_HIP \ - dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(*num_m, m_block_sz)), \ - static_cast(RAJA_DIVIDE_CEILING_INT(*num_g, g_block_sz)), \ - static_cast(RAJA_DIVIDE_CEILING_INT(*num_z, z_block_sz))); + dim3 nblocks(static_cast(*num_z), \ + static_cast(*num_g), \ + 1); -template < size_t m_block_size, size_t g_block_size, size_t z_block_size > -__launch_bounds__(m_block_size*g_block_size*z_block_size) +template < size_t block_size > +__launch_bounds__(block_size) __global__ void ltimes(PHI_VIEW phi, ELL_VIEW ell, PSI_VIEW psi, ID num_d, IM num_m, IG num_g, IZ num_z) { - IM m(blockIdx.x * m_block_size + threadIdx.x); - IG g(blockIdx.y * g_block_size + threadIdx.y); - IZ z(blockIdx.z * z_block_size + threadIdx.z); + IM m(threadIdx.x); + IG g(blockIdx.y); + IZ z(blockIdx.x); if (m < num_m && g < num_g && z < num_z) { for (ID d(0); d < num_d; ++d ) { @@ -59,14 +59,14 @@ __global__ void ltimes(PHI_VIEW phi, ELL_VIEW ell, PSI_VIEW psi, } } -template < size_t m_block_size, size_t g_block_size, size_t z_block_size, typename Lambda > -__launch_bounds__(m_block_size*g_block_size*z_block_size) +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) __global__ void ltimes_lam(IM num_m, IG num_g, IZ num_z, Lambda body) { - IM m(blockIdx.x * m_block_size + threadIdx.x); - IG g(blockIdx.y * g_block_size + threadIdx.y); - IZ z(blockIdx.z * z_block_size + threadIdx.z); + IM m(threadIdx.x); + IG g(blockIdx.y); + IZ z(blockIdx.x); if (m < num_m && g < num_g && z < num_z) { body(z, g, m); @@ -96,7 +96,7 @@ void LTIMES::runHipVariantImpl(VariantID vid) constexpr size_t shmem = 0; RPlaunchHipKernel( - (ltimes), + (ltimes), nblocks, nthreads_per_block, shmem, res.get_stream(), phi, ell, psi, @@ -122,8 +122,7 @@ void LTIMES::runHipVariantImpl(VariantID vid) constexpr size_t shmem = 0; RPlaunchHipKernel( - (ltimes_lam), + (ltimes_lam), nblocks, nthreads_per_block, shmem, res.get_stream(), num_m, num_g, num_z, @@ -138,10 +137,10 @@ void LTIMES::runHipVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< - RAJA::statement::HipKernelFixedAsync, //z - RAJA::statement::For<2, RAJA::hip_global_size_y_direct, //g - RAJA::statement::For<3, RAJA::hip_global_size_x_direct, //m + RAJA::statement::HipKernelAsync< + RAJA::statement::For<1, RAJA::hip_block_x_loop, // z + RAJA::statement::For<2, RAJA::hip_block_y_loop, // g + RAJA::statement::For<3, RAJA::hip_thread_x_loop, // m RAJA::statement::For<0, RAJA::seq_exec, //d RAJA::statement::Lambda<0> > @@ -173,29 +172,24 @@ void LTIMES::runHipVariantImpl(VariantID vid) constexpr bool async = true; - using launch_policy = RAJA::LaunchPolicy>; + using launch_policy = + RAJA::LaunchPolicy>; - using z_policy = RAJA::LoopPolicy>; + using z_policy = RAJA::LoopPolicy; - using g_policy = RAJA::LoopPolicy>; + using g_policy = RAJA::LoopPolicy; - using m_policy = RAJA::LoopPolicy>; + using m_policy = RAJA::LoopPolicy; using d_policy = RAJA::LoopPolicy; - const size_t z_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_z, z_block_sz); - - const size_t g_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_g, g_block_sz); - - const size_t m_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_m, m_block_sz); - startTimer(); // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { RAJA::launch( res, - RAJA::LaunchParams(RAJA::Teams(m_grid_sz, g_grid_sz, z_grid_sz), - RAJA::Threads(m_block_sz, g_block_sz, z_block_sz)), + RAJA::LaunchParams(RAJA::Teams(*num_z, *num_g, 1), + RAJA::Threads(block_size, 1, 1)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, IZRange(0, *num_z), diff --git a/src/apps/LTIMES-Sycl.cpp b/src/apps/LTIMES-Sycl.cpp index 81e5c7117..4c167858a 100644 --- a/src/apps/LTIMES-Sycl.cpp +++ b/src/apps/LTIMES-Sycl.cpp @@ -27,9 +27,9 @@ using namespace ltimes_idx; // // Define work-group shape for SYCL execution // -#define m_wg_sz (32) -#define g_wg_sz (integer::greater_of_squarest_factor_pair(work_group_size/m_wg_sz)) -#define z_wg_sz (integer::lesser_of_squarest_factor_pair(work_group_size/m_wg_sz)) +#define m_wg_sz (work_group_size) +#define g_wg_sz (1) +#define z_wg_sz (1) template void LTIMES::runSyclVariantImpl(VariantID vid) @@ -45,8 +45,8 @@ void LTIMES::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { - sycl::range<3> global_dim(z_wg_sz * RAJA_DIVIDE_CEILING_INT(*num_z, z_wg_sz), - g_wg_sz * RAJA_DIVIDE_CEILING_INT(*num_g, g_wg_sz), + sycl::range<3> global_dim(*num_z, + *num_g, m_wg_sz * RAJA_DIVIDE_CEILING_INT(*num_m, m_wg_sz)); sycl::range<3> wkgroup_dim(z_wg_sz, g_wg_sz, m_wg_sz); @@ -124,9 +124,9 @@ void LTIMES::runSyclVariantImpl(VariantID vid) using d_policy = RAJA::LoopPolicy; - const size_t z_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_z, z_wg_sz); + const size_t z_grid_sz = *num_z; - const size_t g_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_g, g_wg_sz); + const size_t g_grid_sz = *num_g; const size_t m_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_m, m_wg_sz); diff --git a/src/apps/LTIMES.hpp b/src/apps/LTIMES.hpp index 8c6ebed7c..df4f901d7 100644 --- a/src/apps/LTIMES.hpp +++ b/src/apps/LTIMES.hpp @@ -9,6 +9,8 @@ /// /// LTIMES kernel reference implementation: +/// Dependent on running in Kripke --layout ZGD +/// https://github.com/llnl/Kripke/blob/develop/src/Kripke/Kernel/LTimes.cpp#L27 /// /// for (Index_type z = 0; z < num_z; ++z ) { /// for (Index_type g = 0; g < num_g; ++g ) { @@ -16,7 +18,7 @@ /// for (Index_type d = 0; d < num_d; ++d ) { /// /// phi[m+ (g * num_m) + (z * num_m * num_g)] += -/// ell[d+ (m * num_d)] * psi[d+ (g * num_d) + (z * num_d * num_g]; +/// ell[m+ (d * num_m)] * psi[d+ (g * num_d) + (z * num_d * num_g]; /// /// } /// } @@ -42,7 +44,7 @@ RAJA::as_array >::get() ) ); \ ELL_VIEW ell(m_elldat, \ RAJA::make_permuted_layout( {{*num_m, *num_d}}, \ - RAJA::as_array >::get() ) ); \ + RAJA::as_array >::get() ) ); \ PHI_VIEW phi(m_phidat, \ RAJA::make_permuted_layout( {{*num_z, *num_g, *num_m}}, \ RAJA::as_array >::get() ) ); @@ -76,7 +78,7 @@ namespace ltimes_idx { RAJA::Layout<3, Index_type, 2>, IZ, IG, ID>; using ELL_VIEW = RAJA::TypedView, + RAJA::Layout<2, Index_type, 0>, IM, ID>; using PHI_VIEW = RAJA::TypedView, @@ -114,17 +116,17 @@ class LTIMES : public KernelBase void runOpenMPVariant(VariantID vid); void runOpenMPTargetVariant(VariantID vid); - template < size_t block_size, size_t tune_idx > + template < size_t m_block_sz, size_t tune_idx > void runCudaVariantImpl(VariantID vid); - template < size_t block_size, size_t tune_idx > + template < size_t m_block_sz, size_t tune_idx > void runHipVariantImpl(VariantID vid); template < size_t work_group_size, size_t tune_idx > void runSyclVariantImpl(VariantID vid); private: - static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; + static const size_t default_gpu_block_size = 25; + using gpu_block_sizes_type = + integer::make_gpu_block_size_list_type; Real_ptr m_phidat; Real_ptr m_elldat; diff --git a/src/apps/LTIMES_NOVIEW-Cuda.cpp b/src/apps/LTIMES_NOVIEW-Cuda.cpp index 7c8ab94b8..bca1b9227 100644 --- a/src/apps/LTIMES_NOVIEW-Cuda.cpp +++ b/src/apps/LTIMES_NOVIEW-Cuda.cpp @@ -25,31 +25,32 @@ namespace apps // // Define thread block shape for CUDA execution // -#define m_block_sz (32) -#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz)) -#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz)) +#define m_block_sz (block_size) +#define g_block_sz (1) +#define z_block_sz (1) #define LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \ m_block_sz, g_block_sz, z_block_sz #define LTIMES_NOVIEW_THREADS_PER_BLOCK_CUDA \ - dim3 nthreads_per_block(LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA); + dim3 nthreads_per_block(LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA); \ + static_assert(m_block_sz*g_block_sz*z_block_sz == block_size, "Invalid block_size"); #define LTIMES_NOVIEW_NBLOCKS_CUDA \ - dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(num_m, m_block_sz)), \ - static_cast(RAJA_DIVIDE_CEILING_INT(num_g, g_block_sz)), \ - static_cast(RAJA_DIVIDE_CEILING_INT(num_z, z_block_sz))); + dim3 nblocks(static_cast(num_z), \ + static_cast(num_g), \ + 1); -template < size_t m_block_size, size_t g_block_size, size_t z_block_size > -__launch_bounds__(m_block_size*g_block_size*z_block_size) +template < size_t block_size > +__launch_bounds__(block_size) __global__ void ltimes_noview(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, Index_type num_d, Index_type num_m, Index_type num_g, Index_type num_z) { - Index_type m = blockIdx.x * m_block_size + threadIdx.x; - Index_type g = blockIdx.y * g_block_size + threadIdx.y; - Index_type z = blockIdx.z * z_block_size + threadIdx.z; + Index_type m = threadIdx.x; + Index_type g = blockIdx.y; + Index_type z = blockIdx.x; if (m < num_m && g < num_g && z < num_z) { for (Index_type d = 0; d < num_d; ++d ) { @@ -58,14 +59,14 @@ __global__ void ltimes_noview(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, } } -template < size_t m_block_size, size_t g_block_size, size_t z_block_size, typename Lambda > -__launch_bounds__(m_block_size*g_block_size*z_block_size) +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) __global__ void ltimes_noview_lam(Index_type num_m, Index_type num_g, Index_type num_z, Lambda body) { - Index_type m = blockIdx.x * m_block_size + threadIdx.x; - Index_type g = blockIdx.y * g_block_size + threadIdx.y; - Index_type z = blockIdx.z * z_block_size + threadIdx.z; + Index_type m = threadIdx.x; + Index_type g = blockIdx.y; + Index_type z = blockIdx.x; if (m < num_m && g < num_g && z < num_z) { body(z, g, m); @@ -95,7 +96,7 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) constexpr size_t shmem = 0; RPlaunchCudaKernel( - (ltimes_noview), + (ltimes_noview), nblocks, nthreads_per_block, shmem, res.get_stream(), phidat, elldat, psidat, @@ -122,8 +123,7 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) constexpr size_t shmem = 0; RPlaunchCudaKernel( - (ltimes_noview_lam), + (ltimes_noview_lam), nblocks, nthreads_per_block, shmem, res.get_stream(), num_m, num_g, num_z, @@ -138,10 +138,10 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< - RAJA::statement::CudaKernelFixedAsync, //z - RAJA::statement::For<2, RAJA::cuda_global_size_y_direct, //g - RAJA::statement::For<3, RAJA::cuda_global_size_x_direct, //m + RAJA::statement::CudaKernelAsync< + RAJA::statement::For<1, RAJA::cuda_block_x_loop, // z + RAJA::statement::For<2, RAJA::cuda_block_y_loop, // g + RAJA::statement::For<3, RAJA::cuda_thread_x_loop, // m RAJA::statement::For<0, RAJA::seq_exec, //d RAJA::statement::Lambda<0> > @@ -174,29 +174,24 @@ void LTIMES_NOVIEW::runCudaVariantImpl(VariantID vid) constexpr bool async = true; - using launch_policy = RAJA::LaunchPolicy>; + using launch_policy = + RAJA::LaunchPolicy>; - using z_policy = RAJA::LoopPolicy>; + using z_policy = RAJA::LoopPolicy; - using g_policy = RAJA::LoopPolicy>; + using g_policy = RAJA::LoopPolicy; - using m_policy = RAJA::LoopPolicy>; + using m_policy = RAJA::LoopPolicy; using d_policy = RAJA::LoopPolicy; - const size_t z_grid_sz = RAJA_DIVIDE_CEILING_INT(num_z, z_block_sz); - - const size_t g_grid_sz = RAJA_DIVIDE_CEILING_INT(num_g, g_block_sz); - - const size_t m_grid_sz = RAJA_DIVIDE_CEILING_INT(num_m, m_block_sz); - startTimer(); // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { RAJA::launch( res, - RAJA::LaunchParams(RAJA::Teams(m_grid_sz, g_grid_sz, z_grid_sz), - RAJA::Threads(m_block_sz, g_block_sz, z_block_sz)), + RAJA::LaunchParams(RAJA::Teams(num_z, num_g, 1), + RAJA::Threads(block_size, 1, 1)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, num_z), diff --git a/src/apps/LTIMES_NOVIEW-Hip.cpp b/src/apps/LTIMES_NOVIEW-Hip.cpp index e222dce50..f530c6798 100644 --- a/src/apps/LTIMES_NOVIEW-Hip.cpp +++ b/src/apps/LTIMES_NOVIEW-Hip.cpp @@ -25,31 +25,32 @@ namespace apps // // Define thread block shape for Hip execution // -#define m_block_sz (32) -#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz)) -#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz)) +#define m_block_sz (block_size) +#define g_block_sz (1) +#define z_block_sz (1) #define LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \ m_block_sz, g_block_sz, z_block_sz #define LTIMES_NOVIEW_THREADS_PER_BLOCK_HIP \ - dim3 nthreads_per_block(LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP); + dim3 nthreads_per_block(LTIMES_NOVIEW_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP); \ + static_assert(m_block_sz*g_block_sz*z_block_sz == block_size, "Invalid block_size"); #define LTIMES_NOVIEW_NBLOCKS_HIP \ - dim3 nblocks(static_cast(RAJA_DIVIDE_CEILING_INT(num_m, m_block_sz)), \ - static_cast(RAJA_DIVIDE_CEILING_INT(num_g, g_block_sz)), \ - static_cast(RAJA_DIVIDE_CEILING_INT(num_z, z_block_sz))); + dim3 nblocks(static_cast(num_z), \ + static_cast(num_g), \ + 1); -template < size_t m_block_size, size_t g_block_size, size_t z_block_size > -__launch_bounds__(m_block_size*g_block_size*z_block_size) +template < size_t block_size > +__launch_bounds__(block_size) __global__ void ltimes_noview(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, Index_type num_d, Index_type num_m, Index_type num_g, Index_type num_z) { - Index_type m = blockIdx.x * m_block_size + threadIdx.x; - Index_type g = blockIdx.y * g_block_size + threadIdx.y; - Index_type z = blockIdx.z * z_block_size + threadIdx.z; + Index_type m = threadIdx.x; + Index_type g = blockIdx.y; + Index_type z = blockIdx.x; if (m < num_m && g < num_g && z < num_z) { for (Index_type d = 0; d < num_d; ++d ) { @@ -58,14 +59,14 @@ __global__ void ltimes_noview(Real_ptr phidat, Real_ptr elldat, Real_ptr psidat, } } -template < size_t m_block_size, size_t g_block_size, size_t z_block_size, typename Lambda > -__launch_bounds__(m_block_size*g_block_size*z_block_size) +template < size_t block_size, typename Lambda > +__launch_bounds__(block_size) __global__ void ltimes_noview_lam(Index_type num_m, Index_type num_g, Index_type num_z, Lambda body) { - Index_type m = blockIdx.x * m_block_size + threadIdx.x; - Index_type g = blockIdx.y * g_block_size + threadIdx.y; - Index_type z = blockIdx.z * z_block_size + threadIdx.z; + Index_type m = threadIdx.x; + Index_type g = blockIdx.y; + Index_type z = blockIdx.x; if (m < num_m && g < num_g && z < num_z) { body(z, g, m); @@ -95,7 +96,7 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) constexpr size_t shmem = 0; RPlaunchHipKernel( - (ltimes_noview), + (ltimes_noview), nblocks, nthreads_per_block, shmem, res.get_stream(), phidat, elldat, psidat, @@ -122,8 +123,7 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) constexpr size_t shmem = 0; RPlaunchHipKernel( - (ltimes_noview_lam), + (ltimes_noview_lam), nblocks, nthreads_per_block, shmem, res.get_stream(), num_m, num_g, num_z, @@ -138,10 +138,10 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) using EXEC_POL = RAJA::KernelPolicy< - RAJA::statement::HipKernelFixedAsync, //z - RAJA::statement::For<2, RAJA::hip_global_size_y_direct, //g - RAJA::statement::For<3, RAJA::hip_global_size_x_direct, //m + RAJA::statement::HipKernelAsync< + RAJA::statement::For<1, RAJA::hip_block_x_loop, // z + RAJA::statement::For<2, RAJA::hip_block_y_loop, // g + RAJA::statement::For<3, RAJA::hip_thread_x_loop, // m RAJA::statement::For<0, RAJA::seq_exec, //d RAJA::statement::Lambda<0> > @@ -174,29 +174,24 @@ void LTIMES_NOVIEW::runHipVariantImpl(VariantID vid) constexpr bool async = true; - using launch_policy = RAJA::LaunchPolicy>; + using launch_policy = + RAJA::LaunchPolicy>; - using z_policy = RAJA::LoopPolicy>; + using z_policy = RAJA::LoopPolicy; - using g_policy = RAJA::LoopPolicy>; + using g_policy = RAJA::LoopPolicy; - using m_policy = RAJA::LoopPolicy>; + using m_policy = RAJA::LoopPolicy; using d_policy = RAJA::LoopPolicy; - const size_t z_grid_sz = RAJA_DIVIDE_CEILING_INT(num_z, z_block_sz); - - const size_t g_grid_sz = RAJA_DIVIDE_CEILING_INT(num_g, g_block_sz); - - const size_t m_grid_sz = RAJA_DIVIDE_CEILING_INT(num_m, m_block_sz); - startTimer(); // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { RAJA::launch( res, - RAJA::LaunchParams(RAJA::Teams(m_grid_sz, g_grid_sz, z_grid_sz), - RAJA::Threads(m_block_sz, g_block_sz, z_block_sz)), + RAJA::LaunchParams(RAJA::Teams(num_z, num_g, 1), + RAJA::Threads(block_size, 1, 1)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { RAJA::loop(ctx, RAJA::RangeSegment(0, num_z), diff --git a/src/apps/LTIMES_NOVIEW-Sycl.cpp b/src/apps/LTIMES_NOVIEW-Sycl.cpp index 921790d7a..10548ad15 100644 --- a/src/apps/LTIMES_NOVIEW-Sycl.cpp +++ b/src/apps/LTIMES_NOVIEW-Sycl.cpp @@ -25,9 +25,9 @@ namespace apps // // Define work-group shape for SYCL execution // -#define m_wg_sz (32) -#define g_wg_sz (integer::greater_of_squarest_factor_pair(work_group_size/m_wg_sz)) -#define z_wg_sz (integer::lesser_of_squarest_factor_pair(work_group_size/m_wg_sz)) +#define m_wg_sz (work_group_size) +#define g_wg_sz (1) +#define z_wg_sz (1) template void LTIMES_NOVIEW::runSyclVariantImpl(VariantID vid) @@ -43,8 +43,8 @@ void LTIMES_NOVIEW::runSyclVariantImpl(VariantID vid) if ( vid == Base_SYCL ) { - sycl::range<3> global_dim(z_wg_sz * RAJA_DIVIDE_CEILING_INT(num_z, z_wg_sz), - g_wg_sz * RAJA_DIVIDE_CEILING_INT(num_g, g_wg_sz), + sycl::range<3> global_dim(num_z, + num_g, m_wg_sz * RAJA_DIVIDE_CEILING_INT(num_m, m_wg_sz)); sycl::range<3> wkgroup_dim(z_wg_sz, g_wg_sz, m_wg_sz); @@ -122,9 +122,9 @@ void LTIMES_NOVIEW::runSyclVariantImpl(VariantID vid) using d_policy = RAJA::LoopPolicy; - const size_t z_grid_sz = RAJA_DIVIDE_CEILING_INT(num_z, z_wg_sz); + const size_t z_grid_sz = num_z; - const size_t g_grid_sz = RAJA_DIVIDE_CEILING_INT(num_g, g_wg_sz); + const size_t g_grid_sz = num_g; const size_t m_grid_sz = RAJA_DIVIDE_CEILING_INT(num_m, m_wg_sz); diff --git a/src/apps/LTIMES_NOVIEW.hpp b/src/apps/LTIMES_NOVIEW.hpp index 99b75c9de..2b064d97a 100644 --- a/src/apps/LTIMES_NOVIEW.hpp +++ b/src/apps/LTIMES_NOVIEW.hpp @@ -10,7 +10,7 @@ /// /// LTIMES_NOVIEW kernel reference implementation: /// actual order of phi is gmz -/// actual order of ell is md +/// actual order of ell is dm /// actual order of psi is gdz /// for (Index_type z = 0; z < num_z; ++z ) { /// for (Index_type g = 0; g < num_g; ++g ) { @@ -18,7 +18,7 @@ /// for (Index_type d = 0; d < num_d; ++d ) { /// /// phi[m+ (g * num_m) + (z * num_m * num_g)] += -/// ell[d+ (m * num_d)] * psi[d+ (g * num_d) + (z * num_d * num_g]; +/// ell[m+ (d * num_m)] * psi[d+ (g * num_d) + (z * num_d * num_g]; /// /// } /// } @@ -41,7 +41,7 @@ #define LTIMES_NOVIEW_BODY \ phidat[m+ (g * num_m) + (z * num_m * num_g)] += \ - elldat[d+ (m * num_d)] * psidat[d+ (g * num_d) + (z * num_d * num_g)]; + elldat[m+ (d * num_m)] * psidat[d+ (g * num_d) + (z * num_d * num_g)]; #include "common/KernelBase.hpp" @@ -86,9 +86,9 @@ class LTIMES_NOVIEW : public KernelBase void runSyclVariantImpl(VariantID vid); private: - static const size_t default_gpu_block_size = 256; - using gpu_block_sizes_type = integer::make_gpu_block_size_list_type>; + static const size_t default_gpu_block_size = 25; + using gpu_block_sizes_type = + integer::make_gpu_block_size_list_type; Real_ptr m_phidat; Real_ptr m_elldat;