Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 29 additions & 35 deletions src/apps/LTIMES-Cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ using namespace ltimes_idx;
//
// Define thread block shape for CUDA execution
//
#define m_block_sz (32)
#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz))
#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz))
#define m_block_sz (block_size)
#define g_block_sz (1)
#define z_block_sz (1)

#define LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA \
m_block_sz, g_block_sz, z_block_sz
Expand All @@ -39,19 +39,19 @@ using namespace ltimes_idx;
static_assert(m_block_sz*g_block_sz*z_block_sz == block_size, "Invalid block_size");

#define LTIMES_NBLOCKS_CUDA \
dim3 nblocks(static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(*num_m, m_block_sz)), \
static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(*num_g, g_block_sz)), \
static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(*num_z, z_block_sz)));
dim3 nblocks(static_cast<size_t>(*num_z), \
static_cast<size_t>(*num_g), \
1);


template < size_t m_block_size, size_t g_block_size, size_t z_block_size >
__launch_bounds__(m_block_size*g_block_size*z_block_size)
template < size_t block_size >
__launch_bounds__(block_size)
__global__ void ltimes(PHI_VIEW phi, ELL_VIEW ell, PSI_VIEW psi,
ID num_d, IM num_m, IG num_g, IZ num_z)
{
IM m(blockIdx.x * m_block_size + threadIdx.x);
IG g(blockIdx.y * g_block_size + threadIdx.y);
IZ z(blockIdx.z * z_block_size + threadIdx.z);
IM m(threadIdx.x);
IG g(blockIdx.y);
IZ z(blockIdx.x);

if (m < num_m && g < num_g && z < num_z) {
for (ID d(0); d < num_d; ++d ) {
Expand All @@ -60,14 +60,14 @@ __global__ void ltimes(PHI_VIEW phi, ELL_VIEW ell, PSI_VIEW psi,
}
}

template < size_t m_block_size, size_t g_block_size, size_t z_block_size, typename Lambda >
__launch_bounds__(m_block_size*g_block_size*z_block_size)
template < size_t block_size, typename Lambda >
__launch_bounds__(block_size)
__global__ void ltimes_lam(IM num_m, IG num_g, IZ num_z,
Lambda body)
{
IM m(blockIdx.x * m_block_size + threadIdx.x);
IG g(blockIdx.y * g_block_size + threadIdx.y);
IZ z(blockIdx.z * z_block_size + threadIdx.z);
IM m(threadIdx.x);
IG g(blockIdx.y);
IZ z(blockIdx.x);

if (m < num_m && g < num_g && z < num_z) {
body(z, g, m);
Expand Down Expand Up @@ -97,7 +97,7 @@ void LTIMES::runCudaVariantImpl(VariantID vid)
constexpr size_t shmem = 0;

RPlaunchCudaKernel(
(ltimes<LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA>),
(ltimes<block_size>),
nblocks, nthreads_per_block,
shmem, res.get_stream(),
phi, ell, psi,
Expand All @@ -123,8 +123,7 @@ void LTIMES::runCudaVariantImpl(VariantID vid)
constexpr size_t shmem = 0;

RPlaunchCudaKernel(
(ltimes_lam<LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_CUDA,
decltype(ltimes_lambda)>),
(ltimes_lam<block_size, decltype(ltimes_lambda)>),
nblocks, nthreads_per_block,
shmem, res.get_stream(),
num_m, num_g, num_z,
Expand All @@ -139,10 +138,10 @@ void LTIMES::runCudaVariantImpl(VariantID vid)

using EXEC_POL =
RAJA::KernelPolicy<
RAJA::statement::CudaKernelFixedAsync<m_block_sz*g_block_sz*z_block_sz,
RAJA::statement::For<1, RAJA::cuda_global_size_z_direct<z_block_sz>, //z
RAJA::statement::For<2, RAJA::cuda_global_size_y_direct<g_block_sz>, //g
RAJA::statement::For<3, RAJA::cuda_global_size_x_direct<m_block_sz>, //m
RAJA::statement::CudaKernelAsync<

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that I've reverted LTimes to launch synchronously in Kripke, for correctness. This is fine though because the direction loop is inner-most, which should avoid race conditions.

@michaelmckinsey1 michaelmckinsey1 Jun 24, 2026

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It has been async in RAJAPerf, I just changed it from CudaKernelFixedAsync to CudaKernelAsync, but for completeness I can make it CudaKernel. I don't this would matter for performance in RAJAPerf.

RAJA::statement::For<1, RAJA::cuda_block_x_loop, // z
RAJA::statement::For<2, RAJA::cuda_block_y_loop, // g
RAJA::statement::For<3, RAJA::cuda_thread_x_loop, // m

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume this is a non-size loop policy because it is in ltimes. Here we know the block size at compile time, is that also true in kripke?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In Kripke, the block sizes are determined by parameters passed in at runtime. For example, like in this version of LTimes, it will be blocked on zones and groups. The exact parameters which will be blocked are not always the same for each Kripke run. For instance, if we use the DZG layout at runtime, then the loops will be blocked with directions and zones (while groups are threaded).

RAJA::statement::For<0, RAJA::seq_exec, //d
RAJA::statement::Lambda<0>
>
Expand Down Expand Up @@ -174,29 +173,24 @@ void LTIMES::runCudaVariantImpl(VariantID vid)

constexpr bool async = true;

using launch_policy = RAJA::LaunchPolicy<RAJA::cuda_launch_t<async, m_block_sz*g_block_sz*z_block_sz>>;
using launch_policy =
RAJA::LaunchPolicy<RAJA::cuda_launch_t<async, block_size>>;

using z_policy = RAJA::LoopPolicy<RAJA::cuda_global_size_z_loop<z_block_sz>>;
using z_policy = RAJA::LoopPolicy<RAJA::cuda_block_x_loop>;

using g_policy = RAJA::LoopPolicy<RAJA::cuda_global_size_y_loop<g_block_sz>>;
using g_policy = RAJA::LoopPolicy<RAJA::cuda_block_y_loop>;

using m_policy = RAJA::LoopPolicy<RAJA::cuda_global_size_x_loop<m_block_sz>>;
using m_policy = RAJA::LoopPolicy<RAJA::cuda_thread_x_loop>;

using d_policy = RAJA::LoopPolicy<RAJA::seq_exec>;

const size_t z_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_z, z_block_sz);

const size_t g_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_g, g_block_sz);

const size_t m_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_m, m_block_sz);

startTimer();
// Loop counter increment uses macro to quiet C++20 compiler warning
for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) {

RAJA::launch<launch_policy>( res,
RAJA::LaunchParams(RAJA::Teams(m_grid_sz, g_grid_sz, z_grid_sz),
RAJA::Threads(m_block_sz, g_block_sz, z_block_sz)),
RAJA::LaunchParams(RAJA::Teams(*num_z, *num_g, 1),
RAJA::Threads(block_size, 1, 1)),
[=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {

RAJA::loop<z_policy>(ctx, IZRange(0, *num_z),
Expand Down
64 changes: 29 additions & 35 deletions src/apps/LTIMES-Hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ using namespace ltimes_idx;
//
// Define thread block shape for Hip execution
//
#define m_block_sz (32)
#define g_block_sz (integer::greater_of_squarest_factor_pair(block_size/m_block_sz))
#define z_block_sz (integer::lesser_of_squarest_factor_pair(block_size/m_block_sz))
#define m_block_sz (block_size)
#define g_block_sz (1)
#define z_block_sz (1)

#define LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP \
m_block_sz, g_block_sz, z_block_sz
Expand All @@ -38,19 +38,19 @@ using namespace ltimes_idx;
dim3 nthreads_per_block(LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP);

#define LTIMES_NBLOCKS_HIP \
dim3 nblocks(static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(*num_m, m_block_sz)), \
static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(*num_g, g_block_sz)), \
static_cast<size_t>(RAJA_DIVIDE_CEILING_INT(*num_z, z_block_sz)));
dim3 nblocks(static_cast<size_t>(*num_z), \
static_cast<size_t>(*num_g), \
1);


template < size_t m_block_size, size_t g_block_size, size_t z_block_size >
__launch_bounds__(m_block_size*g_block_size*z_block_size)
template < size_t block_size >
__launch_bounds__(block_size)
__global__ void ltimes(PHI_VIEW phi, ELL_VIEW ell, PSI_VIEW psi,
ID num_d, IM num_m, IG num_g, IZ num_z)
{
IM m(blockIdx.x * m_block_size + threadIdx.x);
IG g(blockIdx.y * g_block_size + threadIdx.y);
IZ z(blockIdx.z * z_block_size + threadIdx.z);
IM m(threadIdx.x);
IG g(blockIdx.y);
IZ z(blockIdx.x);

if (m < num_m && g < num_g && z < num_z) {
for (ID d(0); d < num_d; ++d ) {
Expand All @@ -59,14 +59,14 @@ __global__ void ltimes(PHI_VIEW phi, ELL_VIEW ell, PSI_VIEW psi,
}
}

template < size_t m_block_size, size_t g_block_size, size_t z_block_size, typename Lambda >
__launch_bounds__(m_block_size*g_block_size*z_block_size)
template < size_t block_size, typename Lambda >
__launch_bounds__(block_size)
__global__ void ltimes_lam(IM num_m, IG num_g, IZ num_z,
Lambda body)
{
IM m(blockIdx.x * m_block_size + threadIdx.x);
IG g(blockIdx.y * g_block_size + threadIdx.y);
IZ z(blockIdx.z * z_block_size + threadIdx.z);
IM m(threadIdx.x);
IG g(blockIdx.y);
IZ z(blockIdx.x);

if (m < num_m && g < num_g && z < num_z) {
body(z, g, m);
Expand Down Expand Up @@ -96,7 +96,7 @@ void LTIMES::runHipVariantImpl(VariantID vid)
constexpr size_t shmem = 0;

RPlaunchHipKernel(
(ltimes<LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP>),
(ltimes<block_size>),
nblocks, nthreads_per_block,
shmem, res.get_stream(),
phi, ell, psi,
Expand All @@ -122,8 +122,7 @@ void LTIMES::runHipVariantImpl(VariantID vid)
constexpr size_t shmem = 0;

RPlaunchHipKernel(
(ltimes_lam<LTIMES_THREADS_PER_BLOCK_TEMPLATE_PARAMS_HIP,
decltype(ltimes_lambda)>),
(ltimes_lam<block_size, decltype(ltimes_lambda)>),
nblocks, nthreads_per_block,
shmem, res.get_stream(),
num_m, num_g, num_z,
Expand All @@ -138,10 +137,10 @@ void LTIMES::runHipVariantImpl(VariantID vid)

using EXEC_POL =
RAJA::KernelPolicy<
RAJA::statement::HipKernelFixedAsync<m_block_sz*g_block_sz*z_block_sz,
RAJA::statement::For<1, RAJA::hip_global_size_z_direct<z_block_sz>, //z
RAJA::statement::For<2, RAJA::hip_global_size_y_direct<g_block_sz>, //g
RAJA::statement::For<3, RAJA::hip_global_size_x_direct<m_block_sz>, //m
RAJA::statement::HipKernelAsync<
RAJA::statement::For<1, RAJA::hip_block_x_loop, // z
RAJA::statement::For<2, RAJA::hip_block_y_loop, // g
RAJA::statement::For<3, RAJA::hip_thread_x_loop, // m
RAJA::statement::For<0, RAJA::seq_exec, //d
RAJA::statement::Lambda<0>
>
Expand Down Expand Up @@ -173,29 +172,24 @@ void LTIMES::runHipVariantImpl(VariantID vid)

constexpr bool async = true;

using launch_policy = RAJA::LaunchPolicy<RAJA::hip_launch_t<async, m_block_sz*g_block_sz*z_block_sz>>;
using launch_policy =
RAJA::LaunchPolicy<RAJA::hip_launch_t<async, block_size>>;

using z_policy = RAJA::LoopPolicy<RAJA::hip_global_size_z_loop<z_block_sz>>;
using z_policy = RAJA::LoopPolicy<RAJA::hip_block_x_loop>;

using g_policy = RAJA::LoopPolicy<RAJA::hip_global_size_y_loop<g_block_sz>>;
using g_policy = RAJA::LoopPolicy<RAJA::hip_block_y_loop>;

using m_policy = RAJA::LoopPolicy<RAJA::hip_global_size_x_loop<m_block_sz>>;
using m_policy = RAJA::LoopPolicy<RAJA::hip_thread_x_loop>;

using d_policy = RAJA::LoopPolicy<RAJA::seq_exec>;

const size_t z_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_z, z_block_sz);

const size_t g_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_g, g_block_sz);

const size_t m_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_m, m_block_sz);

startTimer();
// Loop counter increment uses macro to quiet C++20 compiler warning
for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) {

RAJA::launch<launch_policy>( res,
RAJA::LaunchParams(RAJA::Teams(m_grid_sz, g_grid_sz, z_grid_sz),
RAJA::Threads(m_block_sz, g_block_sz, z_block_sz)),
RAJA::LaunchParams(RAJA::Teams(*num_z, *num_g, 1),
RAJA::Threads(block_size, 1, 1)),
[=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {

RAJA::loop<z_policy>(ctx, IZRange(0, *num_z),
Expand Down
14 changes: 7 additions & 7 deletions src/apps/LTIMES-Sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,9 @@ using namespace ltimes_idx;
//
// Define work-group shape for SYCL execution
//
#define m_wg_sz (32)
#define g_wg_sz (integer::greater_of_squarest_factor_pair(work_group_size/m_wg_sz))
#define z_wg_sz (integer::lesser_of_squarest_factor_pair(work_group_size/m_wg_sz))
#define m_wg_sz (work_group_size)
#define g_wg_sz (1)
#define z_wg_sz (1)

template <size_t work_group_size, size_t tune_idx >
void LTIMES::runSyclVariantImpl(VariantID vid)
Expand All @@ -45,8 +45,8 @@ void LTIMES::runSyclVariantImpl(VariantID vid)

if ( vid == Base_SYCL ) {

sycl::range<3> global_dim(z_wg_sz * RAJA_DIVIDE_CEILING_INT(*num_z, z_wg_sz),
g_wg_sz * RAJA_DIVIDE_CEILING_INT(*num_g, g_wg_sz),
sycl::range<3> global_dim(*num_z,
*num_g,
m_wg_sz * RAJA_DIVIDE_CEILING_INT(*num_m, m_wg_sz));
sycl::range<3> wkgroup_dim(z_wg_sz, g_wg_sz, m_wg_sz);

Expand Down Expand Up @@ -124,9 +124,9 @@ void LTIMES::runSyclVariantImpl(VariantID vid)

using d_policy = RAJA::LoopPolicy<RAJA::seq_exec>;

const size_t z_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_z, z_wg_sz);
const size_t z_grid_sz = *num_z;

const size_t g_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_g, g_wg_sz);
const size_t g_grid_sz = *num_g;

const size_t m_grid_sz = RAJA_DIVIDE_CEILING_INT(*num_m, m_wg_sz);

Expand Down
18 changes: 10 additions & 8 deletions src/apps/LTIMES.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,16 @@

///
/// LTIMES kernel reference implementation:
/// Dependent on running in Kripke --layout ZGD
/// https://github.com/llnl/Kripke/blob/develop/src/Kripke/Kernel/LTimes.cpp#L27
///
/// for (Index_type z = 0; z < num_z; ++z ) {
/// for (Index_type g = 0; g < num_g; ++g ) {
/// for (Index_type m = 0; m < num_m; ++m ) {
/// for (Index_type d = 0; d < num_d; ++d ) {
///
/// phi[m+ (g * num_m) + (z * num_m * num_g)] +=
/// ell[d+ (m * num_d)] * psi[d+ (g * num_d) + (z * num_d * num_g];
/// ell[m+ (d * num_m)] * psi[d+ (g * num_d) + (z * num_d * num_g];
///
/// }
/// }
Expand All @@ -42,7 +44,7 @@
RAJA::as_array<RAJA::Perm<0, 1, 2> >::get() ) ); \
ELL_VIEW ell(m_elldat, \
RAJA::make_permuted_layout( {{*num_m, *num_d}}, \
RAJA::as_array<RAJA::Perm<0, 1> >::get() ) ); \
RAJA::as_array<RAJA::Perm<1, 0> >::get() ) ); \
PHI_VIEW phi(m_phidat, \
RAJA::make_permuted_layout( {{*num_z, *num_g, *num_m}}, \
RAJA::as_array<RAJA::Perm<0, 1, 2> >::get() ) );
Expand Down Expand Up @@ -76,7 +78,7 @@ namespace ltimes_idx {
RAJA::Layout<3, Index_type, 2>,
IZ, IG, ID>;
using ELL_VIEW = RAJA::TypedView<Real_type,
RAJA::Layout<2, Index_type, 1>,
RAJA::Layout<2, Index_type, 0>,
IM, ID>;
Comment thread
MrBurmark marked this conversation as resolved.
using PHI_VIEW = RAJA::TypedView<Real_type,
RAJA::Layout<3, Index_type, 2>,
Expand Down Expand Up @@ -114,17 +116,17 @@ class LTIMES : public KernelBase
void runOpenMPVariant(VariantID vid);
void runOpenMPTargetVariant(VariantID vid);

template < size_t block_size, size_t tune_idx >
template < size_t m_block_sz, size_t tune_idx >
void runCudaVariantImpl(VariantID vid);
template < size_t block_size, size_t tune_idx >
template < size_t m_block_sz, size_t tune_idx >
void runHipVariantImpl(VariantID vid);
template < size_t work_group_size, size_t tune_idx >
void runSyclVariantImpl(VariantID vid);

private:
static const size_t default_gpu_block_size = 256;
using gpu_block_sizes_type = integer::make_gpu_block_size_list_type<default_gpu_block_size,
integer::MultipleOf<32>>;
static const size_t default_gpu_block_size = 25;
using gpu_block_sizes_type =
integer::make_gpu_block_size_list_type<default_gpu_block_size>;

Real_ptr m_phidat;
Real_ptr m_elldat;
Expand Down
Loading
Loading