Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
263 changes: 182 additions & 81 deletions src/apps/MASS3DPA-Cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,16 @@ namespace apps {
template < size_t block_size >
__launch_bounds__(block_size)
__global__ void Mass3DPA(const Real_ptr B, const Real_ptr Bt,
const Real_ptr D, const Real_ptr X, Real_ptr Y) {
const Real_ptr D, const Real_ptr X, Real_ptr Y,
Index_type NE) {

const Index_type e = blockIdx.x;
const Index_type zbatch = threadIdx.z;
const Index_type e = blockIdx.x * blockDim.z + zbatch;
const bool valid_e = e < NE;
if (!valid_e) { return; }

MASS3DPA_0_GPU
MASS3DPA_GPU_SMEM_DECL(mpa::TBATCH)
MASS3DPA_GPU_SMEM_SLICE(zbatch)

GPU_FOREACH_THREAD_INC(dy, y, mpa::D1D, mpa::Q1D) {
GPU_FOREACH_THREAD_INC(dx, x, mpa::D1D, mpa::Q1D){
Expand All @@ -40,6 +45,13 @@ __global__ void Mass3DPA(const Real_ptr B, const Real_ptr Bt,
MASS3DPA_2
}
}
if (threadIdx.z == 0) {
GPU_FOREACH_THREAD_INC(dy, y, mpa::D1D, mpa::Q1D) {
GPU_FOREACH_THREAD_INC(dx, x, mpa::Q1D, mpa::Q1D) {
MASS3DPA_2
}
}
}
__syncthreads();
GPU_FOREACH_THREAD_INC(dy, y, mpa::D1D, mpa::Q1D) {

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be interesting to compare having this be a for loop vs. doing a simple if statement check (blockDim.x and blockDim.y should always be >= Q1D).

This seems to help with the hip compiler.

GPU_FOREACH_THREAD_INC(qx, x, mpa::Q1D, mpa::Q1D) {
Expand All @@ -60,9 +72,11 @@ __global__ void Mass3DPA(const Real_ptr B, const Real_ptr Bt,
}

__syncthreads();
GPU_FOREACH_THREAD_INC(d, y, mpa::D1D, mpa::Q1D) {
GPU_FOREACH_THREAD_INC(q, x, mpa::Q1D, mpa::Q1D) {
MASS3DPA_6
if (threadIdx.z == 0) {
GPU_FOREACH_THREAD_INC(d, y, mpa::D1D, mpa::Q1D) {
GPU_FOREACH_THREAD_INC(q, x, mpa::Q1D, mpa::Q1D) {
MASS3DPA_6
}
}
}

Expand Down Expand Up @@ -97,6 +111,8 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) {
auto res{getCudaResource()};

MASS3DPA_DATA_SETUP;
const Index_type num_elem_blocks =
RAJA_DIVIDE_CEILING_INT(NE, static_cast<Index_type>(mpa::TBATCH));

switch (vid) {

Expand All @@ -106,13 +122,13 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) {
// Loop counter increment uses macro to quiet C++20 compiler warning
for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) {

dim3 nthreads_per_block(mpa::Q1D, mpa::Q1D, 1);
dim3 nthreads_per_block(mpa::Q1D, mpa::Q1D, mpa::TBATCH);
constexpr size_t shmem = 0;

RPlaunchCudaKernel( (Mass3DPA<block_size>),
NE, nthreads_per_block,
num_elem_blocks, nthreads_per_block,
shmem, res.get_stream(),
B, Bt, D, X, Y );
B, Bt, D, X, Y, NE );
}
stopTimer();

Expand All @@ -123,130 +139,215 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) {

constexpr bool async = true;

using launch_policy = RAJA::LaunchPolicy<RAJA::cuda_launch_t<async, mpa::Q1D*mpa::Q1D>>;
using launch_policy = RAJA::LaunchPolicy<RAJA::cuda_launch_t<async, block_size>>;

using outer_x = RAJA::LoopPolicy<RAJA::cuda_block_x_direct>;

using inner_x = RAJA::LoopPolicy<RAJA::cuda_thread_size_x_loop<mpa::Q1D>>;

using inner_y = RAJA::LoopPolicy<RAJA::cuda_thread_size_y_loop<mpa::Q1D>>;

using inner_z = RAJA::LoopPolicy<RAJA::cuda_thread_size_z_direct<mpa::TBATCH>>;

startTimer();
// Loop counter increment uses macro to quiet C++20 compiler warning
for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) {

//clang-format off
RAJA::launch<launch_policy>( res,
RAJA::LaunchParams(RAJA::Teams(NE),
RAJA::Threads(mpa::Q1D, mpa::Q1D, 1)),
RAJA::LaunchParams(RAJA::Teams(num_elem_blocks),
RAJA::Threads(mpa::Q1D, mpa::Q1D, mpa::TBATCH)),
[=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {

RAJA::loop<outer_x>(ctx, RAJA::RangeSegment(0, NE),
[&](Index_type e) {

MASS3DPA_0_GPU

RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, mpa::D1D),
[&](Index_type dy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, mpa::D1D),
[&](Index_type dx) {
MASS3DPA_1
}
); // RAJA::loop<inner_x>

RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, mpa::Q1D),
[&](Index_type dx) {
MASS3DPA_2
RAJA::loop<outer_x>(ctx, RAJA::RangeSegment(0, num_elem_blocks),
[&](Index_type elem_block) {

MASS3DPA_GPU_SMEM_DECL(mpa::TBATCH)

RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, mpa::TBATCH),
[&](Index_type zbatch) {
const Index_type e = elem_block * mpa::TBATCH + zbatch;
const bool valid_e = e < NE;
if (!valid_e) { return; }
MASS3DPA_GPU_SMEM_SLICE(zbatch)

RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, mpa::D1D),
[&](Index_type dy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, mpa::D1D),
[&](Index_type dx) {
MASS3DPA_1
}
); // RAJA::loop<inner_x>

RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, mpa::Q1D),
[&](Index_type dx) {
MASS3DPA_2
}
); // RAJA::loop<inner_x>
} // lambda (dy)
); // RAJA::loop<inner_y>
}
); // RAJA::loop<inner_z>

RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, 1),
[&](Index_type RAJA_UNUSED_ARG(zbatch)) {
RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, mpa::D1D),
[&](Index_type dy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, mpa::Q1D),
[&](Index_type dx) {
MASS3DPA_2
}
); // RAJA::loop<inner_x>
}
); // RAJA::loop<inner_x>
} // lambda (dy)
); // RAJA::loop<inner_y>
); // RAJA::loop<inner_y>
}
); // RAJA::loop<inner_z>

ctx.teamSync();

RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, mpa::D1D),
[&](Index_type dy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, mpa::Q1D),
[&](Index_type qx) {
MASS3DPA_3
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, mpa::TBATCH),
[&](Index_type zbatch) {
const Index_type e = elem_block * mpa::TBATCH + zbatch;
const bool valid_e = e < NE;
if (!valid_e) { return; }
MASS3DPA_GPU_SMEM_SLICE(zbatch)

RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, mpa::D1D),
[&](Index_type dy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, mpa::Q1D),
[&](Index_type qx) {
MASS3DPA_3
}
); // RAJA::loop<inner_x>
}
); // RAJA::loop<inner_x>
); // RAJA::loop<inner_y>
}
); // RAJA::loop<inner_y>
); // RAJA::loop<inner_z>

ctx.teamSync();

RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, mpa::Q1D),
[&](Index_type qy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, mpa::Q1D),
[&](Index_type qx) {
MASS3DPA_4
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, mpa::TBATCH),
[&](Index_type zbatch) {
const Index_type e = elem_block * mpa::TBATCH + zbatch;
const bool valid_e = e < NE;
if (!valid_e) { return; }
MASS3DPA_GPU_SMEM_SLICE(zbatch)

RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, mpa::Q1D),
[&](Index_type qy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, mpa::Q1D),
[&](Index_type qx) {
MASS3DPA_4
}
); // RAJA::loop<inner_x>
}
); // RAJA::loop<inner_x>
); // RAJA::loop<inner_y>
}
); // RAJA::loop<inner_y>
); // RAJA::loop<inner_z>

ctx.teamSync();

RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, mpa::Q1D),
[&](Index_type qy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, mpa::Q1D),
[&](Index_type qx) {
MASS3DPA_5
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, mpa::TBATCH),
[&](Index_type zbatch) {
const Index_type e = elem_block * mpa::TBATCH + zbatch;
const bool valid_e = e < NE;
if (!valid_e) { return; }
MASS3DPA_GPU_SMEM_SLICE(zbatch)

RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, mpa::Q1D),
[&](Index_type qy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, mpa::Q1D),
[&](Index_type qx) {
MASS3DPA_5
}
); // RAJA::loop<inner_x>
}
); // RAJA::loop<inner_x>
); // RAJA::loop<inner_y>
}
); // RAJA::loop<inner_y>
); // RAJA::loop<inner_z>

ctx.teamSync();

RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, mpa::D1D),
[&](Index_type d) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, mpa::Q1D),
[&](Index_type q) {
MASS3DPA_6
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, 1),
[&](Index_type zbatch) {
MASS3DPA_GPU_SMEM_SLICE(zbatch)

RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, mpa::D1D),
[&](Index_type d) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, mpa::Q1D),
[&](Index_type q) {
MASS3DPA_6
}
); // RAJA::loop<inner_x>
}
); // RAJA::loop<inner_x>
); // RAJA::loop<inner_y>
}
); // RAJA::loop<inner_y>
); // RAJA::loop<inner_z>

ctx.teamSync();

RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, mpa::Q1D),
[&](Index_type qy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, mpa::D1D),
[&](Index_type dx) {
MASS3DPA_7
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, mpa::TBATCH),
[&](Index_type zbatch) {
const Index_type e = elem_block * mpa::TBATCH + zbatch;
const bool valid_e = e < NE;
if (!valid_e) { return; }
MASS3DPA_GPU_SMEM_SLICE(zbatch)

RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, mpa::Q1D),
[&](Index_type qy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, mpa::D1D),
[&](Index_type dx) {
MASS3DPA_7
}
); // RAJA::loop<inner_x>
}
); // RAJA::loop<inner_x>
); // RAJA::loop<inner_y>
}
); // RAJA::loop<inner_y>
); // RAJA::loop<inner_z>

ctx.teamSync();

RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, mpa::D1D),
[&](Index_type dy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, mpa::D1D),
[&](Index_type dx) {
MASS3DPA_8
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, mpa::TBATCH),
[&](Index_type zbatch) {
const Index_type e = elem_block * mpa::TBATCH + zbatch;
const bool valid_e = e < NE;
if (!valid_e) { return; }
MASS3DPA_GPU_SMEM_SLICE(zbatch)

RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, mpa::D1D),
[&](Index_type dy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, mpa::D1D),
[&](Index_type dx) {
MASS3DPA_8
}
); // RAJA::loop<inner_x>
}
); // RAJA::loop<inner_x>
); // RAJA::loop<inner_y>
}
); // RAJA::loop<inner_y>
); // RAJA::loop<inner_z>

ctx.teamSync();

RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, mpa::D1D),
[&](Index_type dy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, mpa::D1D),
[&](Index_type dx) {
MASS3DPA_9
RAJA::loop<inner_z>(ctx, RAJA::RangeSegment(0, mpa::TBATCH),
[&](Index_type zbatch) {
const Index_type e = elem_block * mpa::TBATCH + zbatch;
const bool valid_e = e < NE;
if (!valid_e) { return; }
MASS3DPA_GPU_SMEM_SLICE(zbatch)

RAJA::loop<inner_y>(ctx, RAJA::RangeSegment(0, mpa::D1D),
[&](Index_type dy) {
RAJA::loop<inner_x>(ctx, RAJA::RangeSegment(0, mpa::D1D),
[&](Index_type dx) {
MASS3DPA_9
}
); // RAJA::loop<inner_x>
}
); // RAJA::loop<inner_x>
); // RAJA::loop<inner_y>
}
); // RAJA::loop<inner_y>
); // RAJA::loop<inner_z>

} // lambda (e)
} // lambda (elem_block)
); // RAJA::loop<outer_x>

} // outer lambda (ctx)
Expand Down
Loading
Loading