From 5e9ba945d804c41cf8ed12900bc79ebc49e5bd08 Mon Sep 17 00:00:00 2001 From: Michael McKinsey Date: Fri, 19 Jun 2026 13:53:28 -0700 Subject: [PATCH 1/2] Use batching --- src/apps/MASS3DPA-Cuda.cpp | 297 ++++++++++++++++++++++++++----------- src/apps/MASS3DPA.hpp | 45 ++++-- 2 files changed, 242 insertions(+), 100 deletions(-) diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index 0805d35e0..3fcee06c7 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -26,64 +26,91 @@ namespace apps { template < size_t block_size > __launch_bounds__(block_size) __global__ void Mass3DPA(const Real_ptr B, const Real_ptr Bt, - const Real_ptr D, const Real_ptr X, Real_ptr Y) { + const Real_ptr D, const Real_ptr X, Real_ptr Y, + Index_type NE) { - const Index_type e = blockIdx.x; + const Index_type zbatch = threadIdx.z; + const Index_type e = blockIdx.x * blockDim.z + zbatch; + const bool valid_e = e < NE; - MASS3DPA_0_GPU + MASS3DPA_GPU_SMEM_DECL(mpa::TBATCH) + MASS3DPA_GPU_SMEM_SLICE(zbatch) GPU_FOREACH_THREAD_INC(dy, y, mpa::D1D, mpa::Q1D) { GPU_FOREACH_THREAD_INC(dx, x, mpa::D1D, mpa::Q1D){ - MASS3DPA_1 + if (valid_e) { + MASS3DPA_1 + } } GPU_FOREACH_THREAD_INC(dx, x, mpa::Q1D, mpa::Q1D) { MASS3DPA_2 } } + if (threadIdx.z == 0) { + GPU_FOREACH_THREAD_INC(dy, y, mpa::D1D, mpa::Q1D) { + GPU_FOREACH_THREAD_INC(dx, x, mpa::Q1D, mpa::Q1D) { + MASS3DPA_2 + } + } + } __syncthreads(); GPU_FOREACH_THREAD_INC(dy, y, mpa::D1D, mpa::Q1D) { GPU_FOREACH_THREAD_INC(qx, x, mpa::Q1D, mpa::Q1D) { - MASS3DPA_3 + if (valid_e) { + MASS3DPA_3 + } } } __syncthreads(); GPU_FOREACH_THREAD_INC(qy, y, mpa::Q1D, mpa::Q1D) { GPU_FOREACH_THREAD_INC(qx, x, mpa::Q1D, mpa::Q1D) { - MASS3DPA_4 + if (valid_e) { + MASS3DPA_4 + } } } __syncthreads(); GPU_FOREACH_THREAD_INC(qy, y, mpa::Q1D, mpa::Q1D) { GPU_FOREACH_THREAD_INC(qx, x, mpa::Q1D, mpa::Q1D) { - MASS3DPA_5 + if (valid_e) { + MASS3DPA_5 + } } } __syncthreads(); - GPU_FOREACH_THREAD_INC(d, y, mpa::D1D, mpa::Q1D) { - GPU_FOREACH_THREAD_INC(q, x, mpa::Q1D, mpa::Q1D) { - MASS3DPA_6 + if (threadIdx.z == 0) { + GPU_FOREACH_THREAD_INC(d, y, mpa::D1D, mpa::Q1D) { + GPU_FOREACH_THREAD_INC(q, x, mpa::Q1D, mpa::Q1D) { + MASS3DPA_6 + } } } __syncthreads(); GPU_FOREACH_THREAD_INC(qy, y, mpa::Q1D, mpa::Q1D) { GPU_FOREACH_THREAD_INC(dx, x, mpa::D1D, mpa::Q1D) { - MASS3DPA_7 + if (valid_e) { + MASS3DPA_7 + } } } __syncthreads(); GPU_FOREACH_THREAD_INC(dy, y, mpa::D1D, mpa::Q1D) { GPU_FOREACH_THREAD_INC(dx, x, mpa::D1D, mpa::Q1D) { - MASS3DPA_8 + if (valid_e) { + MASS3DPA_8 + } } } __syncthreads(); GPU_FOREACH_THREAD_INC(dy, y, mpa::D1D, mpa::Q1D) { GPU_FOREACH_THREAD_INC(dx, x, mpa::D1D, mpa::Q1D) { - MASS3DPA_9 + if (valid_e) { + MASS3DPA_9 + } } } } @@ -97,6 +124,8 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { auto res{getCudaResource()}; MASS3DPA_DATA_SETUP; + const Index_type num_elem_blocks = + RAJA_DIVIDE_CEILING_INT(NE, static_cast(mpa::TBATCH)); switch (vid) { @@ -106,13 +135,13 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { - dim3 nthreads_per_block(mpa::Q1D, mpa::Q1D, 1); + dim3 nthreads_per_block(mpa::Q1D, mpa::Q1D, mpa::TBATCH); constexpr size_t shmem = 0; RPlaunchCudaKernel( (Mass3DPA), - NE, nthreads_per_block, + num_elem_blocks, nthreads_per_block, shmem, res.get_stream(), - B, Bt, D, X, Y ); + B, Bt, D, X, Y, NE ); } stopTimer(); @@ -123,7 +152,7 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { constexpr bool async = true; - using launch_policy = RAJA::LaunchPolicy>; + using launch_policy = RAJA::LaunchPolicy>; using outer_x = RAJA::LoopPolicy; @@ -131,122 +160,214 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { using inner_y = RAJA::LoopPolicy>; + using inner_z = RAJA::LoopPolicy>; + startTimer(); // Loop counter increment uses macro to quiet C++20 compiler warning for (RepIndex_type irep = 0; irep < run_reps; RP_REPCOUNTINC(irep)) { //clang-format off RAJA::launch( res, - RAJA::LaunchParams(RAJA::Teams(NE), - RAJA::Threads(mpa::Q1D, mpa::Q1D, 1)), + RAJA::LaunchParams(RAJA::Teams(num_elem_blocks), + RAJA::Threads(mpa::Q1D, mpa::Q1D, mpa::TBATCH)), [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) { - RAJA::loop(ctx, RAJA::RangeSegment(0, NE), - [&](Index_type e) { - - MASS3DPA_0_GPU - - RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), - [&](Index_type dx) { - MASS3DPA_1 - } - ); // RAJA::loop - - RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), - [&](Index_type dx) { - MASS3DPA_2 + RAJA::loop(ctx, RAJA::RangeSegment(0, num_elem_blocks), + [&](Index_type elem_block) { + + MASS3DPA_GPU_SMEM_DECL(mpa::TBATCH) + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::TBATCH), + [&](Index_type zbatch) { + const Index_type e = elem_block * mpa::TBATCH + zbatch; + const bool valid_e = e < NE; + MASS3DPA_GPU_SMEM_SLICE(zbatch) + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), + [&](Index_type dx) { + if (valid_e) { + MASS3DPA_1 + } + } + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), + [&](Index_type dx) { + MASS3DPA_2 + } + ); // RAJA::loop + } // lambda (dy) + ); // RAJA::loop + } + ); // RAJA::loop + + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](Index_type RAJA_UNUSED_ARG(zbatch)) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), + [&](Index_type dx) { + MASS3DPA_2 + } + ); // RAJA::loop } - ); // RAJA::loop - } // lambda (dy) - ); // RAJA::loop + ); // RAJA::loop + } + ); // RAJA::loop ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), - [&](Index_type qx) { - MASS3DPA_3 + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::TBATCH), + [&](Index_type zbatch) { + const Index_type e = elem_block * mpa::TBATCH + zbatch; + const bool valid_e = e < NE; + MASS3DPA_GPU_SMEM_SLICE(zbatch) + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), + [&](Index_type qx) { + if (valid_e) { + MASS3DPA_3 + } + } + ); // RAJA::loop } - ); // RAJA::loop + ); // RAJA::loop } - ); // RAJA::loop + ); // RAJA::loop ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), - [&](Index_type qx) { - MASS3DPA_4 + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::TBATCH), + [&](Index_type zbatch) { + const Index_type e = elem_block * mpa::TBATCH + zbatch; + const bool valid_e = e < NE; + MASS3DPA_GPU_SMEM_SLICE(zbatch) + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), + [&](Index_type qx) { + if (valid_e) { + MASS3DPA_4 + } + } + ); // RAJA::loop } - ); // RAJA::loop + ); // RAJA::loop } - ); // RAJA::loop + ); // RAJA::loop ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), - [&](Index_type qx) { - MASS3DPA_5 + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::TBATCH), + [&](Index_type zbatch) { + const Index_type e = elem_block * mpa::TBATCH + zbatch; + const bool valid_e = e < NE; + MASS3DPA_GPU_SMEM_SLICE(zbatch) + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), + [&](Index_type qx) { + if (valid_e) { + MASS3DPA_5 + } + } + ); // RAJA::loop } - ); // RAJA::loop + ); // RAJA::loop } - ); // RAJA::loop + ); // RAJA::loop ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), - [&](Index_type d) { - RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), - [&](Index_type q) { - MASS3DPA_6 + RAJA::loop(ctx, RAJA::RangeSegment(0, 1), + [&](Index_type zbatch) { + MASS3DPA_GPU_SMEM_SLICE(zbatch) + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), + [&](Index_type d) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), + [&](Index_type q) { + MASS3DPA_6 + } + ); // RAJA::loop } - ); // RAJA::loop + ); // RAJA::loop } - ); // RAJA::loop + ); // RAJA::loop ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), - [&](Index_type qy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), - [&](Index_type dx) { - MASS3DPA_7 + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::TBATCH), + [&](Index_type zbatch) { + const Index_type e = elem_block * mpa::TBATCH + zbatch; + const bool valid_e = e < NE; + MASS3DPA_GPU_SMEM_SLICE(zbatch) + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), + [&](Index_type qy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), + [&](Index_type dx) { + if (valid_e) { + MASS3DPA_7 + } + } + ); // RAJA::loop } - ); // RAJA::loop + ); // RAJA::loop } - ); // RAJA::loop + ); // RAJA::loop ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), - [&](Index_type dx) { - MASS3DPA_8 + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::TBATCH), + [&](Index_type zbatch) { + const Index_type e = elem_block * mpa::TBATCH + zbatch; + const bool valid_e = e < NE; + MASS3DPA_GPU_SMEM_SLICE(zbatch) + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), + [&](Index_type dx) { + if (valid_e) { + MASS3DPA_8 + } + } + ); // RAJA::loop } - ); // RAJA::loop + ); // RAJA::loop } - ); // RAJA::loop + ); // RAJA::loop ctx.teamSync(); - RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), - [&](Index_type dy) { - RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), - [&](Index_type dx) { - MASS3DPA_9 + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::TBATCH), + [&](Index_type zbatch) { + const Index_type e = elem_block * mpa::TBATCH + zbatch; + const bool valid_e = e < NE; + MASS3DPA_GPU_SMEM_SLICE(zbatch) + + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), + [&](Index_type dy) { + RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), + [&](Index_type dx) { + if (valid_e) { + MASS3DPA_9 + } + } + ); // RAJA::loop } - ); // RAJA::loop + ); // RAJA::loop } - ); // RAJA::loop + ); // RAJA::loop - } // lambda (e) + } // lambda (elem_block) ); // RAJA::loop } // outer lambda (ctx) diff --git a/src/apps/MASS3DPA.hpp b/src/apps/MASS3DPA.hpp index b0916e1d5..308a7058e 100644 --- a/src/apps/MASS3DPA.hpp +++ b/src/apps/MASS3DPA.hpp @@ -172,8 +172,22 @@ // Number of Dofs/Qpts in 1D namespace mpa { -constexpr RAJA::Index_type D1D = 4; -constexpr RAJA::Index_type Q1D = 5; +// linear +// constexpr RAJA::Index_type D1D = 2; +// constexpr RAJA::Index_type Q1D = 2; +// } +// quadratic +// constexpr RAJA::Index_type D1D = 4; +// constexpr RAJA::Index_type Q1D = 4; +// cubic +// constexpr RAJA::Index_type D1D = 6; +// constexpr RAJA::Index_type Q1D = 6; + constexpr RAJA::Index_type D1D = 2; + constexpr RAJA::Index_type Q1D = 2; + + constexpr RAJA::Index_type TBATCH = 16; // linear + // constexpr RAJA::Index_type TBATCH = 2; // quadratic + // constexpr RAJA::Index_type TBATCH = 1; // cubic } // namespace mpa #define MPA_B(x, y) B[x + mpa::Q1D * y] #define MPA_Bt(x, y) Bt[x + mpa::D1D * y] @@ -203,21 +217,28 @@ constexpr RAJA::Index_type Q1D = 5; Real_type(*QQD)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])sm0; \ Real_type(*QDD)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm1; -#define MASS3DPA_0_GPU \ +#define MASS3DPA_GPU_SMEM_DECL(TBATCH) \ constexpr Index_type MQ1 = mpa::Q1D; \ constexpr Index_type MD1 = mpa::D1D; \ constexpr Index_type MDQ = (MQ1 > MD1) ? MQ1 : MD1; \ RAJA_TEAM_SHARED Real_type sDQ[MQ1 * MD1]; \ Real_type(*Bsmem)[MD1] = (Real_type(*)[MD1])sDQ; \ Real_type(*Btsmem)[MQ1] = (Real_type(*)[MQ1])sDQ; \ - RAJA_TEAM_SHARED Real_type sm0[MDQ * MDQ * MDQ]; \ - RAJA_TEAM_SHARED Real_type sm1[MDQ * MDQ * MDQ]; \ - Real_type(*Xsmem)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm0; \ - Real_type(*DDQ)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])sm1; \ - Real_type(*DQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm0; \ - Real_type(*QQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm1; \ - Real_type(*QQD)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])sm0; \ - Real_type(*QDD)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm1; + RAJA_TEAM_SHARED Real_type sm0[TBATCH][MDQ * MDQ * MDQ]; \ + RAJA_TEAM_SHARED Real_type sm1[TBATCH][MDQ * MDQ * MDQ]; + +#define MASS3DPA_GPU_SMEM_SLICE(TBATCH) \ + Real_type(*Xsmem)[MD1][MD1] = \ + (Real_type(*)[MD1][MD1])sm0[TBATCH]; \ + Real_type(*DDQ)[MD1][MQ1] = (Real_type(*)[MD1][MQ1])sm1[TBATCH]; \ + Real_type(*DQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm0[TBATCH]; \ + Real_type(*QQQ)[MQ1][MQ1] = (Real_type(*)[MQ1][MQ1])sm1[TBATCH]; \ + Real_type(*QQD)[MQ1][MD1] = (Real_type(*)[MQ1][MD1])sm0[TBATCH]; \ + Real_type(*QDD)[MD1][MD1] = (Real_type(*)[MD1][MD1])sm1[TBATCH]; + +#define MASS3DPA_0_GPU \ + MASS3DPA_GPU_SMEM_DECL(mpa::TBATCH) \ + MASS3DPA_GPU_SMEM_SLICE(0) #define MASS3DPA_1 \ RAJAPERF_UNROLL(MD1) \ @@ -375,7 +396,7 @@ class MASS3DPA : public KernelBase { template void runSyclVariantImpl(VariantID vid); private: - static const size_t default_gpu_block_size = mpa::Q1D * mpa::Q1D; + static const size_t default_gpu_block_size = mpa::Q1D * mpa::Q1D * mpa::TBATCH; using gpu_block_sizes_type = integer::list_type; Real_ptr m_B; From efaa7903c014434c48690b4b71453d87f3fccfa1 Mon Sep 17 00:00:00 2001 From: Michael McKinsey Date: Wed, 24 Jun 2026 13:47:44 -0700 Subject: [PATCH 2/2] Refactor check --- src/apps/MASS3DPA-Cuda.cpp | 64 +++++++++++++------------------------- 1 file changed, 22 insertions(+), 42 deletions(-) diff --git a/src/apps/MASS3DPA-Cuda.cpp b/src/apps/MASS3DPA-Cuda.cpp index 3fcee06c7..08e5113fb 100644 --- a/src/apps/MASS3DPA-Cuda.cpp +++ b/src/apps/MASS3DPA-Cuda.cpp @@ -32,15 +32,14 @@ __global__ void Mass3DPA(const Real_ptr B, const Real_ptr Bt, const Index_type zbatch = threadIdx.z; const Index_type e = blockIdx.x * blockDim.z + zbatch; const bool valid_e = e < NE; + if (!valid_e) { return; } MASS3DPA_GPU_SMEM_DECL(mpa::TBATCH) MASS3DPA_GPU_SMEM_SLICE(zbatch) GPU_FOREACH_THREAD_INC(dy, y, mpa::D1D, mpa::Q1D) { GPU_FOREACH_THREAD_INC(dx, x, mpa::D1D, mpa::Q1D){ - if (valid_e) { - MASS3DPA_1 - } + MASS3DPA_1 } GPU_FOREACH_THREAD_INC(dx, x, mpa::Q1D, mpa::Q1D) { MASS3DPA_2 @@ -56,25 +55,19 @@ __global__ void Mass3DPA(const Real_ptr B, const Real_ptr Bt, __syncthreads(); GPU_FOREACH_THREAD_INC(dy, y, mpa::D1D, mpa::Q1D) { GPU_FOREACH_THREAD_INC(qx, x, mpa::Q1D, mpa::Q1D) { - if (valid_e) { - MASS3DPA_3 - } + MASS3DPA_3 } } __syncthreads(); GPU_FOREACH_THREAD_INC(qy, y, mpa::Q1D, mpa::Q1D) { GPU_FOREACH_THREAD_INC(qx, x, mpa::Q1D, mpa::Q1D) { - if (valid_e) { - MASS3DPA_4 - } + MASS3DPA_4 } } __syncthreads(); GPU_FOREACH_THREAD_INC(qy, y, mpa::Q1D, mpa::Q1D) { GPU_FOREACH_THREAD_INC(qx, x, mpa::Q1D, mpa::Q1D) { - if (valid_e) { - MASS3DPA_5 - } + MASS3DPA_5 } } @@ -90,27 +83,21 @@ __global__ void Mass3DPA(const Real_ptr B, const Real_ptr Bt, __syncthreads(); GPU_FOREACH_THREAD_INC(qy, y, mpa::Q1D, mpa::Q1D) { GPU_FOREACH_THREAD_INC(dx, x, mpa::D1D, mpa::Q1D) { - if (valid_e) { - MASS3DPA_7 - } + MASS3DPA_7 } } __syncthreads(); GPU_FOREACH_THREAD_INC(dy, y, mpa::D1D, mpa::Q1D) { GPU_FOREACH_THREAD_INC(dx, x, mpa::D1D, mpa::Q1D) { - if (valid_e) { - MASS3DPA_8 - } + MASS3DPA_8 } } __syncthreads(); GPU_FOREACH_THREAD_INC(dy, y, mpa::D1D, mpa::Q1D) { GPU_FOREACH_THREAD_INC(dx, x, mpa::D1D, mpa::Q1D) { - if (valid_e) { - MASS3DPA_9 - } + MASS3DPA_9 } } } @@ -181,15 +168,14 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { [&](Index_type zbatch) { const Index_type e = elem_block * mpa::TBATCH + zbatch; const bool valid_e = e < NE; + if (!valid_e) { return; } MASS3DPA_GPU_SMEM_SLICE(zbatch) RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { - if (valid_e) { - MASS3DPA_1 - } + MASS3DPA_1 } ); // RAJA::loop @@ -223,15 +209,14 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { [&](Index_type zbatch) { const Index_type e = elem_block * mpa::TBATCH + zbatch; const bool valid_e = e < NE; + if (!valid_e) { return; } MASS3DPA_GPU_SMEM_SLICE(zbatch) RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qx) { - if (valid_e) { - MASS3DPA_3 - } + MASS3DPA_3 } ); // RAJA::loop } @@ -245,15 +230,14 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { [&](Index_type zbatch) { const Index_type e = elem_block * mpa::TBATCH + zbatch; const bool valid_e = e < NE; + if (!valid_e) { return; } MASS3DPA_GPU_SMEM_SLICE(zbatch) RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qy) { RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qx) { - if (valid_e) { - MASS3DPA_4 - } + MASS3DPA_4 } ); // RAJA::loop } @@ -267,15 +251,14 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { [&](Index_type zbatch) { const Index_type e = elem_block * mpa::TBATCH + zbatch; const bool valid_e = e < NE; + if (!valid_e) { return; } MASS3DPA_GPU_SMEM_SLICE(zbatch) RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qy) { RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qx) { - if (valid_e) { - MASS3DPA_5 - } + MASS3DPA_5 } ); // RAJA::loop } @@ -307,15 +290,14 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { [&](Index_type zbatch) { const Index_type e = elem_block * mpa::TBATCH + zbatch; const bool valid_e = e < NE; + if (!valid_e) { return; } MASS3DPA_GPU_SMEM_SLICE(zbatch) RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::Q1D), [&](Index_type qy) { RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { - if (valid_e) { - MASS3DPA_7 - } + MASS3DPA_7 } ); // RAJA::loop } @@ -329,15 +311,14 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { [&](Index_type zbatch) { const Index_type e = elem_block * mpa::TBATCH + zbatch; const bool valid_e = e < NE; + if (!valid_e) { return; } MASS3DPA_GPU_SMEM_SLICE(zbatch) RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { - if (valid_e) { - MASS3DPA_8 - } + MASS3DPA_8 } ); // RAJA::loop } @@ -351,15 +332,14 @@ void MASS3DPA::runCudaVariantImpl(VariantID vid) { [&](Index_type zbatch) { const Index_type e = elem_block * mpa::TBATCH + zbatch; const bool valid_e = e < NE; + if (!valid_e) { return; } MASS3DPA_GPU_SMEM_SLICE(zbatch) RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dy) { RAJA::loop(ctx, RAJA::RangeSegment(0, mpa::D1D), [&](Index_type dx) { - if (valid_e) { - MASS3DPA_9 - } + MASS3DPA_9 } ); // RAJA::loop }