From 406c6c43b1bde52b63d67b91aac0eeeef2d7dd3f Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Tue, 13 Jan 2026 19:05:37 +0100 Subject: [PATCH 01/46] Add amrex::Launch --- Src/Base/AMReX_GpuControl.H | 8 ++ Src/Base/AMReX_GpuLaunchFunctsG.H | 26 +++++ Src/Base/AMReX_GpuTypes.H | 164 ++++++++++++++++++++++++++++++ 3 files changed, 198 insertions(+) diff --git a/Src/Base/AMReX_GpuControl.H b/Src/Base/AMReX_GpuControl.H index c5643e2268d..20084bc34cc 100644 --- a/Src/Base/AMReX_GpuControl.H +++ b/Src/Base/AMReX_GpuControl.H @@ -33,6 +33,14 @@ #define AMREX_HIP_OR_CUDA_OR_SYCL(a,b,c) ((void)0); #endif +#if defined(AMREX_USE_HIP) || defined(AMREX_USE_CUDA) +#define AMREX_HIP_CUDA_OR_SYCL_OR_CPU(a,b,c) a +#elif defined(AMREX_USE_SYCL) +#define AMREX_HIP_CUDA_OR_SYCL_OR_CPU(a,b,c) b +#else +#define AMREX_HIP_CUDA_OR_SYCL_OR_CPU(a,b,c) c +#endif + #ifdef AMREX_USE_GPU #define AMREX_GPU_OR_CPU(a,b) a #else diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index 275956aaea2..29042d06135 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -705,6 +705,32 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, } } +template +void Launch (IntVectND nblocks, std::size_t shared_mem_bytes, L const& f) +{ + const auto nthreads_total = MT * std::size_t(nblocks); + const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1) + / sizeof(unsigned long long); + auto& q = *(stream.queue); + try { + q.submit([&] (sycl::handler& h) { + sycl::local_accessor + shared_data(sycl::range<1>(shared_mem_numull), h); + h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total), + sycl::range<1>(MT)), + [=] (sycl::nd_item<1> item) + [[sycl::reqd_work_group_size(MT)]] + [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] + { + f(Gpu::Handler{&item,shared_data.get_multi_ptr().get()}); + }); + }); + } catch (sycl::exception const& ex) { + amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!"); + } +} + + #else // CUDA or HIP diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index e01303656ff..a1e07436c21 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -102,6 +102,170 @@ struct Handler {}; #endif +template +struct LaunchHandler +{ +#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) + LaunchHandler() = default; +#elif defined(AMREX_USE_SYCL) + LaunchHandler(sycl::nd_item const* a_item, char * a_shared_mem) + : m_item{a_item}, m_shared_mem{a_shared_mem} {}; +#else + LaunchHandler(IntVectND a_blockid, IntVectND a_griddim, char * a_shared_mem) + : m_blockid{a_blockid}, m_griddim{a_griddim}, m_shared_mem{a_shared_mem} {}; +#endif + + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + unsigned int threadIdx1D () const { + return static_cast(AMREX_HIP_CUDA_OR_SYCL_OR_CPU( + threadIdx.x, + m_item->get_local_linear_id(), + 0 + )); + } + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + IntVectND threadIdxND () const { + static_assert(sizeof...(nd_block_size) == dim && + (1 * ... * nd_block_size) == threads_per_block); + constexpr IntVectND iv_block_size = blockDimND(); + IntVectND ret(0); + unsigned int idx = threadIdx(); + if constexpr (dim == 3) { + ret[2] = idx / iv_block_size[2]; + idx = idx - ret[2] * iv_block_size[2]; + } + if constexpr (dim >= 2) { + ret[1] = idx / iv_block_size[1]; + idx = idx - ret[1] * iv_block_size[1]; + } + ret[0] = idx; + return ret; + } + + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + unsigned int blockIdx1D () const { + if constexpr (dim == 1) { + return static_cast(AMREX_HIP_CUDA_OR_SYCL_OR_CPU( + blockIdx.x, + m_item->get_group_id(0), + m_blockid[0] + )); + } else if constexpr (dim == 2) { + return static_cast(AMREX_HIP_CUDA_OR_SYCL_OR_CPU( + blockIdx.x + gridDim.x * blockIdx.y, + m_item->get_group_linear_id(), + m_blockid[0] + m_griddim[0] * m_blockid[1] + )); + } else { + return static_cast(AMREX_HIP_CUDA_OR_SYCL_OR_CPU( + blockIdx.x + gridDim.x * blockIdx.y + gridDim.x * gridDim.y * blockIdx.z, + m_item->get_group_linear_id(), + m_blockid[0] + m_griddim[0] * m_blockid[1] + + m_griddim[0] * m_griddim[1] * m_blockid[2] + )); + } + } + + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + IntVectND blockIdxND () const { + if constexpr (dim == 1) { + return AMREX_HIP_CUDA_OR_SYCL_OR_CPU( + IntVectND(static_cast(blockIdx.x)), + IntVectND(static_cast(m_item->get_group_id(0))), + m_blockid + ); + } else if constexpr (dim == 2) { + return AMREX_HIP_CUDA_OR_SYCL_OR_CPU( + IntVectND(static_cast(blockIdx.x), + static_cast(blockIdx.y)), + IntVectND(static_cast(m_item->get_group_id(1)), + static_cast(m_item->get_group_id(0))), + m_blockid + ); + } else { + return AMREX_HIP_CUDA_OR_SYCL_OR_CPU( + IntVectND(static_cast(blockIdx.x), + static_cast(blockIdx.y), + static_cast(blockIdx.z)), + IntVectND(static_cast(m_item->get_group_id(2)), + static_cast(m_item->get_group_id(1)), + static_cast(m_item->get_group_id(0))), + m_blockid + ); + } + } + + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + constexpr unsigned int blockDim1D () const { + return threads_per_block; + } + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + constexpr IntVectND blockDimND () const { + static_assert(sizeof...(nd_block_size) == dim && + (1 * ... * nd_block_size) == threads_per_block); + return {nd_block_size...}; + } + + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + unsigned int globalIdx1D () const { + return blockIdx1D() * threads_per_block + threadIdx1D(); + } + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + IntVectND globalIdxND () const { + return blockIdxND() * blockDimND() + threadIdxND(); + } + + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + void syncthreads () const { + AMREX_HIP_CUDA_OR_SYCL_OR_CPU( + __syncthreads(), + m_item->barrier(sycl::access::fence_space::global_and_local), + (void)0 + ); + } + + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + T* shared_memory () const { +#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) + alignas(32) extern __shared__ char smem[]; + return reinterpret_cast(smem); +#else + return reinterpret_cast(m_shared_mem); +#endif + } + + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + auto sycl_item () const { +#if defined(AMREX_USE_SYCL) + return m_item; +#else + return 0; +#endif + } + +private: + +#if defined(AMREX_USE_SYCL) + sycl::nd_item const* m_item; + char * m_shared_mem; +#endif + +#if !defined(AMREX_USE_GPU) + IntVectND m_blockid; + IntVectND m_griddim; + char * m_shared_mem; +#endif +}; + + + } #endif From 2ac0e24b12095481e76fa1c3f4108ff28af2d884 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Thu, 15 Jan 2026 15:40:18 +0100 Subject: [PATCH 02/46] add CUDA and CPU versions --- Src/Base/AMReX_GpuLaunchFunctsC.H | 51 ++++++++++++++ Src/Base/AMReX_GpuLaunchFunctsG.H | 110 +++++++++++++++++++++++++----- Src/Base/AMReX_GpuTypes.H | 14 ++-- 3 files changed, 152 insertions(+), 23 deletions(-) diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H index 5091970ad38..e12e8c4a376 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsC.H +++ b/Src/Base/AMReX_GpuLaunchFunctsC.H @@ -149,6 +149,57 @@ void launch (T const& n, L&& f) noexcept std::forward(f)(n); } +template +void LaunchRaw (IntVectND nblocks, L const& f) +{ + static_assert(MT == 1); + if constexpr(dim == 1) { + for (int bx=0; bx < nblocks[0]; ++bx) { + f(Gpu::LaunchHandler{{bx}, nblocks, nullptr}); + } + } else if constexpr(dim == 2) { + for (int by=0; by < nblocks[1]; ++by) { + for (int bx=0; bx < nblocks[0]; ++bx) { + f(Gpu::LaunchHandler{{bx, by}, nblocks, nullptr}); + } + } + } else { + for (int bz=0; bz < nblocks[2]; ++bz) { + for (int by=0; by < nblocks[1]; ++by) { + for (int bx=0; bx < nblocks[0]; ++bx) { + f(Gpu::LaunchHandler{{bx, by, bz}, nblocks, nullptr}); + } + } + } + } +} + +template +void LaunchRaw (IntVectND nblocks, std::size_t shared_mem_elements, L const& f) +{ + static_assert(MT == 1); + T smem[shared_mem_elements]; + if constexpr(dim == 1) { + for (int bx=0; bx < nblocks[0]; ++bx) { + f(Gpu::LaunchHandler{{bx}, nblocks, &smem}); + } + } else if constexpr(dim == 2) { + for (int by=0; by < nblocks[1]; ++by) { + for (int bx=0; bx < nblocks[0]; ++bx) { + f(Gpu::LaunchHandler{{bx, by}, nblocks, &smem}); + } + } + } else { + for (int bz=0; bz < nblocks[2]; ++bz) { + for (int by=0; by < nblocks[1]; ++by) { + for (int bx=0; bx < nblocks[0]; ++bx) { + f(Gpu::LaunchHandler{{bx, by, bz}, nblocks, &smem}); + } + } + } + } +} + template > > AMREX_ATTRIBUTE_FLATTEN_FOR void For (T n, L const& f) noexcept diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index 29042d06135..b08acf8cc6c 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -706,27 +706,67 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, } template -void Launch (IntVectND nblocks, std::size_t shared_mem_bytes, L const& f) +void LaunchRaw (IntVectND nblocks, L const& f) { - const auto nthreads_total = MT * std::size_t(nblocks); - const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1) - / sizeof(unsigned long long); - auto& q = *(stream.queue); + auto& q = Gpu::Device::streamQueue(); + + sycl::range threads_per_block; + sycl::range threads_total; + + for (int i=0; i - shared_data(sycl::range<1>(shared_mem_numull), h); - h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total), - sycl::range<1>(MT)), - [=] (sycl::nd_item<1> item) - [[sycl::reqd_work_group_size(MT)]] - [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] - { - f(Gpu::Handler{&item,shared_data.get_multi_ptr().get()}); - }); + + h.parallel_for(sycl::nd_range(threads_total, threads_per_block), + [=] (sycl::nd_item item) + [[sycl::reqd_work_group_size(1, 1, MT)]] + [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] + { + f(Gpu::LaunchHandler{ + &item, + nullptr + }); + }); }); } catch (sycl::exception const& ex) { - amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!"); + amrex::Abort(std::string("LaunchRaw: ")+ex.what()+"!!!!!"); + } +} + +template +void LaunchRaw (IntVectND nblocks, std::size_t shared_mem_elements, L const& f) +{ + auto& q = Gpu::Device::streamQueue(); + + sycl::range threads_per_block; + sycl::range threads_total; + + for (int i=0; i shared_data(sycl::range<1>(shared_mem_elements), h); + + h.parallel_for(sycl::nd_range(threads_total, threads_per_block), + [=] (sycl::nd_item item) + [[sycl::reqd_work_group_size(1, 1, MT)]] + [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] + { + f(Gpu::LaunchHandler{ + &item, + shared_data.get_multi_ptr().get() + }); + }); + }); + } catch (sycl::exception const& ex) { + amrex::Abort(std::string("LaunchRaw: ")+ex.what()+"!!!!!"); } } @@ -1123,6 +1163,44 @@ ParallelFor (Gpu::KernelInfo const&, AMREX_GPU_ERROR_CHECK(); } +template +void LaunchRaw (IntVectND nblocks, L const& f) +{ + dim3 num_blocks; + num_blocks.x = nblocks[0]; + if constexpr (dim >= 2) { + num_blocks.y = nblocks[1]; + } + if constexpr (dim == 3) { + num_blocks.z = nblocks[2]; + } + + AMREX_LAUNCH_KERNEL(MT, num_blocks, MT, 0, Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + f(Gpu::LaunchHandler{}); + }); + AMREX_GPU_ERROR_CHECK(); +} + +template +void LaunchRaw (IntVectND nblocks, std::size_t shared_mem_elements, L const& f) +{ + dim3 num_blocks; + num_blocks.x = nblocks[0]; + if constexpr (dim >= 2) { + num_blocks.y = nblocks[1]; + } + if constexpr (dim == 3) { + num_blocks.z = nblocks[2]; + } + + AMREX_LAUNCH_KERNEL(MT, num_blocks, MT, shared_mem_elements * sizeof(T), Gpu::gpuStream(), + [=] AMREX_GPU_DEVICE () noexcept { + f(Gpu::LaunchHandler{}); + }); + AMREX_GPU_ERROR_CHECK(); +} + #endif template diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index a1e07436c21..7ed05df3e6f 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -102,16 +102,16 @@ struct Handler {}; #endif -template +template struct LaunchHandler { #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) LaunchHandler() = default; #elif defined(AMREX_USE_SYCL) - LaunchHandler(sycl::nd_item const* a_item, char * a_shared_mem) + LaunchHandler(sycl::nd_item const* a_item, T * a_shared_mem) : m_item{a_item}, m_shared_mem{a_shared_mem} {}; #else - LaunchHandler(IntVectND a_blockid, IntVectND a_griddim, char * a_shared_mem) + LaunchHandler(IntVectND a_blockid, IntVectND a_griddim, T * a_shared_mem) : m_blockid{a_blockid}, m_griddim{a_griddim}, m_shared_mem{a_shared_mem} {}; #endif @@ -230,14 +230,14 @@ struct LaunchHandler ); } - template AMREX_GPU_DEVICE AMREX_FORCE_INLINE T* shared_memory () const { #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) + static_assert(32 >= alignof(T)); alignas(32) extern __shared__ char smem[]; return reinterpret_cast(smem); #else - return reinterpret_cast(m_shared_mem); + return m_shared_mem; #endif } @@ -254,13 +254,13 @@ private: #if defined(AMREX_USE_SYCL) sycl::nd_item const* m_item; - char * m_shared_mem; + T * m_shared_mem; #endif #if !defined(AMREX_USE_GPU) IntVectND m_blockid; IntVectND m_griddim; - char * m_shared_mem; + T * m_shared_mem; #endif }; From 12be03339268b244246a8fa79f13fa6af4753aa6 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Thu, 15 Jan 2026 17:30:54 +0100 Subject: [PATCH 03/46] add includes --- Src/Base/AMReX_GpuTypes.H | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index 7ed05df3e6f..d37ed86096a 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -3,6 +3,8 @@ #include #include +#include +#include #ifdef AMREX_USE_GPU From 9912b9be38e8ac5403696e9624542ab892a78bae Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Mon, 19 Jan 2026 18:26:42 +0100 Subject: [PATCH 04/46] Fix circular include --- Src/Base/AMReX_GpuControl.H | 1 - Src/Base/AMReX_GpuTypes.H | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/Src/Base/AMReX_GpuControl.H b/Src/Base/AMReX_GpuControl.H index 20084bc34cc..d2f41800c90 100644 --- a/Src/Base/AMReX_GpuControl.H +++ b/Src/Base/AMReX_GpuControl.H @@ -3,7 +3,6 @@ #include #include -#include #include diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index d37ed86096a..c16cd0ee631 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -133,7 +133,7 @@ struct LaunchHandler (1 * ... * nd_block_size) == threads_per_block); constexpr IntVectND iv_block_size = blockDimND(); IntVectND ret(0); - unsigned int idx = threadIdx(); + unsigned int idx = threadIdx1D(); if constexpr (dim == 3) { ret[2] = idx / iv_block_size[2]; idx = idx - ret[2] * iv_block_size[2]; From 288428f1c93d1a6cdb4faf60df50a3a06bc14093 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Mon, 19 Jan 2026 19:08:10 +0100 Subject: [PATCH 05/46] fix index calculation --- Src/Base/AMReX_GpuTypes.H | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index c16cd0ee631..f75a99ca2df 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -135,12 +135,12 @@ struct LaunchHandler IntVectND ret(0); unsigned int idx = threadIdx1D(); if constexpr (dim == 3) { - ret[2] = idx / iv_block_size[2]; - idx = idx - ret[2] * iv_block_size[2]; + ret[2] = idx / iv_block_size[1]; + idx = idx - ret[2] * iv_block_size[1]; } if constexpr (dim >= 2) { - ret[1] = idx / iv_block_size[1]; - idx = idx - ret[1] * iv_block_size[1]; + ret[1] = idx / iv_block_size[0]; + idx = idx - ret[1] * iv_block_size[0]; } ret[0] = idx; return ret; @@ -200,13 +200,13 @@ struct LaunchHandler } AMREX_GPU_DEVICE AMREX_FORCE_INLINE - constexpr unsigned int blockDim1D () const { + static constexpr unsigned int blockDim1D () { return threads_per_block; } template AMREX_GPU_DEVICE AMREX_FORCE_INLINE - constexpr IntVectND blockDimND () const { + static constexpr IntVectND blockDimND () { static_assert(sizeof...(nd_block_size) == dim && (1 * ... * nd_block_size) == threads_per_block); return {nd_block_size...}; From 418ba21a5095fe3e2980e267f5d7104d3c6d8df8 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Wed, 28 Jan 2026 17:45:21 +0100 Subject: [PATCH 06/46] add function to get the runtime value of blockDim.x --- Src/Base/AMReX_GpuTypes.H | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index f75a99ca2df..81d5c75e881 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -204,6 +204,15 @@ struct LaunchHandler return threads_per_block; } + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + unsigned int blockDim1Drt () const { + return static_cast(AMREX_HIP_CUDA_OR_SYCL_OR_CPU( + blockDim.x, + threads_per_block, + threads_per_block + )); + } + template AMREX_GPU_DEVICE AMREX_FORCE_INLINE static constexpr IntVectND blockDimND () { From 87d2dcf08ca708f4c2d8bcfae93d3b5d12e79459 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Wed, 4 Feb 2026 09:57:59 +0100 Subject: [PATCH 07/46] fix --- Src/Base/AMReX_GpuLaunchFunctsC.H | 10 ++++++---- Src/Base/AMReX_GpuTypes.H | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H index 6421082524e..28d75aa2e84 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsC.H +++ b/Src/Base/AMReX_GpuLaunchFunctsC.H @@ -2,6 +2,8 @@ #define AMREX_GPU_LAUNCH_FUNCTS_C_H_ #include +#include + namespace amrex { /** Helper type to store/access the SIMD width in ParallelForSIMD lambdas @@ -178,22 +180,22 @@ template void LaunchRaw (IntVectND nblocks, std::size_t shared_mem_elements, L const& f) { static_assert(MT == 1); - T smem[shared_mem_elements]; + std::vector smem(shared_mem_elements); if constexpr(dim == 1) { for (int bx=0; bx < nblocks[0]; ++bx) { - f(Gpu::LaunchHandler{{bx}, nblocks, &smem}); + f(Gpu::LaunchHandler{{bx}, nblocks, smem.data()}); } } else if constexpr(dim == 2) { for (int by=0; by < nblocks[1]; ++by) { for (int bx=0; bx < nblocks[0]; ++bx) { - f(Gpu::LaunchHandler{{bx, by}, nblocks, &smem}); + f(Gpu::LaunchHandler{{bx, by}, nblocks, smem.data()}); } } } else { for (int bz=0; bz < nblocks[2]; ++bz) { for (int by=0; by < nblocks[1]; ++by) { for (int bx=0; bx < nblocks[0]; ++bx) { - f(Gpu::LaunchHandler{{bx, by, bz}, nblocks, &smem}); + f(Gpu::LaunchHandler{{bx, by, bz}, nblocks, smem.data()}); } } } diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index 81d5c75e881..2c24c0facfb 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -111,10 +111,10 @@ struct LaunchHandler LaunchHandler() = default; #elif defined(AMREX_USE_SYCL) LaunchHandler(sycl::nd_item const* a_item, T * a_shared_mem) - : m_item{a_item}, m_shared_mem{a_shared_mem} {}; + : m_item{a_item}, m_shared_mem{a_shared_mem} {} #else LaunchHandler(IntVectND a_blockid, IntVectND a_griddim, T * a_shared_mem) - : m_blockid{a_blockid}, m_griddim{a_griddim}, m_shared_mem{a_shared_mem} {}; + : m_blockid{a_blockid}, m_griddim{a_griddim}, m_shared_mem{a_shared_mem} {} #endif AMREX_GPU_DEVICE AMREX_FORCE_INLINE From 2636e1986f48388cc22bb546bdea84428a4f2cac Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Wed, 4 Feb 2026 10:23:50 +0100 Subject: [PATCH 08/46] add template for SYCL --- Src/Base/AMReX_GpuLaunchFunctsG.H | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index 678fc24d857..3afe26ec9e9 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -178,7 +178,9 @@ void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes, [=] (sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - f(Gpu::Handler{&item,shared_data.get_multi_ptr().get()}); + f(Gpu::Handler{&item, + shared_data.template get_multi_ptr().get() + }); }); }); } catch (sycl::exception const& ex) { @@ -224,7 +226,9 @@ void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream, [[sycl::reqd_work_group_size(MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - f(Gpu::Handler{&item,shared_data.get_multi_ptr().get()}); + f(Gpu::Handler{&item, + shared_data.template get_multi_ptr().get() + }); }); }); } catch (sycl::exception const& ex) { @@ -302,8 +306,9 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept i < std::size_t(n); i += stride) { int n_active_threads = amrex::min(std::size_t(n)-i+item.get_local_id(0), item.get_local_range(0)); - detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item, shared_data.get_multi_ptr().get(), - n_active_threads}); + detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item, + shared_data.template get_multi_ptr().get(), + n_active_threads}); } }); }); @@ -352,8 +357,9 @@ void ParallelFor (Gpu::KernelInfo const& info, BoxND const& box, L const& f auto iv = indexer.intVect(icell); int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)), std::uint64_t(item.get_local_range(0))); - detail::call_f_intvect_handler(f, iv, Gpu::Handler{&item, shared_data.get_multi_ptr().get(), - n_active_threads}); + detail::call_f_intvect_handler(f, iv, Gpu::Handler{&item, + shared_data.template get_multi_ptr().get(), + n_active_threads}); } }); }); @@ -403,9 +409,9 @@ void ParallelFor (Gpu::KernelInfo const& info, BoxND const& box, T ncomp, L auto iv = indexer.intVect(icell); int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)), std::uint64_t(item.get_local_range(0))); - detail::call_f_intvect_ncomp_handler(f, iv, ncomp, - Gpu::Handler{&item, shared_data.get_multi_ptr().get(), - n_active_threads}); + detail::call_f_intvect_ncomp_handler(f, iv, ncomp, Gpu::Handler{&item, + shared_data.template get_multi_ptr().get(), + n_active_threads}); } }); }); @@ -761,7 +767,7 @@ void LaunchRaw (IntVectND nblocks, std::size_t shared_mem_elements, L const { f(Gpu::LaunchHandler{ &item, - shared_data.get_multi_ptr().get() + shared_data.template get_multi_ptr().get() }); }); }); From 536707fe248a90c08753c85055fbae6de6070f6b Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Mon, 9 Feb 2026 15:20:38 +0100 Subject: [PATCH 09/46] remove blockDim1Drt --- Src/Base/AMReX_GpuTypes.H | 9 --------- 1 file changed, 9 deletions(-) diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index 2c24c0facfb..9731f23f5cf 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -204,15 +204,6 @@ struct LaunchHandler return threads_per_block; } - AMREX_GPU_DEVICE AMREX_FORCE_INLINE - unsigned int blockDim1Drt () const { - return static_cast(AMREX_HIP_CUDA_OR_SYCL_OR_CPU( - blockDim.x, - threads_per_block, - threads_per_block - )); - } - template AMREX_GPU_DEVICE AMREX_FORCE_INLINE static constexpr IntVectND blockDimND () { From e1facb833e55800d1f9b5a77089f4b8d5b5d0d03 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Mon, 9 Feb 2026 18:44:09 +0100 Subject: [PATCH 10/46] convert some functions to use LaunchRaw --- Src/AmrCore/AMReX_TagBox.cpp | 86 +++++-------------- Src/Base/AMReX_BaseFabUtility.H | 48 +++-------- Src/Base/AMReX_FBI.H | 27 ++---- Src/Base/AMReX_GpuContainers.H | 40 ++------- Src/Base/AMReX_GpuLaunchFunctsC.H | 6 +- Src/Base/AMReX_GpuLaunchFunctsG.H | 2 - Src/Base/AMReX_GpuReduce.H | 7 ++ Src/Base/AMReX_GpuTypes.H | 19 ++-- .../MLMG/AMReX_MLEBTensorOp_bc.cpp | 18 ++-- Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp | 16 +--- Src/Particle/AMReX_ParticleUtil.H | 45 +++++----- 11 files changed, 107 insertions(+), 207 deletions(-) diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp index a0a3a0e9940..066c863a01f 100644 --- a/Src/AmrCore/AMReX_TagBox.cpp +++ b/Src/AmrCore/AMReX_TagBox.cpp @@ -446,33 +446,18 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector& v) const int* ntags = dv_ntags.data() + blockoffset[li]; const int ncells = fai.fabbox().numPts(); const char* tags = (*this)[fai].dataPtr(); -#ifdef AMREX_USE_SYCL - amrex::launch(nblocks[li], sizeof(int)*Gpu::Device::warp_size, - Gpu::Device::gpuStream(), - [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept - { - int bid = h.item->get_group_linear_id(); - int tid = h.item->get_local_id(0); - int icell = h.item->get_global_id(0); - - int t = 0; - if (icell < ncells && tags[icell] != TagBox::CLEAR) { - t = 1; - } - t = Gpu::blockReduce - (t, Gpu::warpReduce >(), 0, h); - if (tid == 0) { - ntags[bid] = t; - } - }); + amrex::LaunchRaw(amrex::IntVectND<1>{nblocks[li]}, +#ifdef AMREX_USE_SYCL + Gpu::Device::warp_size, #else - amrex::launch(nblocks[li], Gpu::Device::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept + 0, +#endif + [=] AMREX_GPU_DEVICE (auto lh) noexcept { - int bid = blockIdx.x; - int tid = threadIdx.x; - int icell = block_size*blockIdx.x+threadIdx.x; + int bid = lh.blockIdx1D(); + int tid = lh.threadIdx1D(); + int icell = lh.globalIdx1D(); int t = 0; if (icell < ncells && tags[icell] != TagBox::CLEAR) { @@ -480,12 +465,12 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector& v) const } t = Gpu::blockReduce - (t, Gpu::warpReduce >(), 0); + (t, Gpu::warpReduce >(), 0, + lh.handler()); if (tid == 0) { ntags[bid] = t; } }); -#endif } Gpu::PinnedVector hv_ntags(ntotblocks); @@ -524,51 +509,27 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector& v) const const auto lenx = len.x; const int ncells = bx.numPts(); const char* tags = (*this)[fai].dataPtr(); -#ifdef AMREX_USE_SYCL - amrex::launch(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(), - [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept + amrex::LaunchRaw(amrex::IntVectND<1>{nblocks[li]}, 1, + [=] AMREX_GPU_DEVICE (auto lh) noexcept { - int bid = h.item->get_group(0); - int tid = h.item->get_local_id(0); - int icell = h.item->get_global_id(0); + int bid = lh.blockIdx1D(); + int tid = lh.threadIdx1D(); + int icell = lh.globalIdx1D(); - unsigned int* shared_counter = (unsigned int*)h.local; + unsigned int * shared_counter = lh.shared_memory(); if (tid == 0) { *shared_counter = 0; } - h.item->barrier(sycl::access::fence_space::local_space); + lh.syncthreads(); if (icell < ncells && tags[icell] != TagBox::CLEAR) { - unsigned int itag = Gpu::Atomic::Add - (shared_counter, 1u); - IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid]; - int k = icell / lenxy; - int j = (icell - k*lenxy) / lenx; - int i = (icell - k*lenxy) - j*lenx; - i += lo.x; - j += lo.y; - k += lo.z; - p[itag] = IntVect(AMREX_D_DECL(i,j,k)); - } - }); -#else - amrex::launch(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept - { - int bid = blockIdx.x; - int tid = threadIdx.x; - int icell = block_size*blockIdx.x+threadIdx.x; - Gpu::SharedMemory gsm; - unsigned int * shared_counter = gsm.dataPtr(); - if (tid == 0) { - *shared_counter = 0; - } - __syncthreads(); + unsigned int itag = Gpu::Atomic::Add +#ifdef AMREX_USE_SYCL + +#endif + (shared_counter, 1u); - if (icell < ncells && tags[icell] != TagBox::CLEAR) { - unsigned int itag = Gpu::Atomic::Add(shared_counter, 1u); IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid]; int k = icell / lenxy; int j = (icell - k*lenxy) / lenx; @@ -579,7 +540,6 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector& v) const p[itag] = IntVect(AMREX_D_DECL(i,j,k)); } }); -#endif } } diff --git a/Src/Base/AMReX_BaseFabUtility.H b/Src/Base/AMReX_BaseFabUtility.H index d0c1e78847b..c6c125db5be 100644 --- a/Src/Base/AMReX_BaseFabUtility.H +++ b/Src/Base/AMReX_BaseFabUtility.H @@ -42,49 +42,27 @@ void fill (BaseFab& aos_fab, F const& f) std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block; AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits::max())); auto nblocks = int(nblocks_long); - std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT); + std::size_t shared_mem_elem = nthreads_per_block * STRUCTSIZE; T* p = (T*)aos_fab.dataPtr(); -#ifdef AMREX_USE_SYCL - amrex::launch(nblocks, shared_mem_bytes, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept + amrex::LaunchRaw(amrex::IntVectND<1>{nblocks}, shared_mem_elem, + [=] AMREX_GPU_DEVICE (auto lh) noexcept { - auto const icell = std::uint64_t(handler.globalIdx()); - std::uint64_t const blockDimx = handler.blockDim(); - std::uint64_t const threadIdxx = handler.threadIdx(); - std::uint64_t const blockIdxx = handler.blockIdx(); - auto const shared = (T*)handler.sharedMemory(); + std::uint64_t const icell = + std::uint64_t(lh.blockDim1D())*lh.blockIdx1D()+lh.threadIdx1D(); + T* const shared = lh.shared_memory(); if (icell < indexer.numPts()) { - auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT; + auto ga = new(shared+std::uint64_t(lh.threadIdx1D())*STRUCTSIZE) STRUCT; auto [i, j, k] = indexer(icell); f(*ga, i, j, k); } - handler.sharedBarrier(); - for (std::uint64_t m = threadIdxx, - mend = amrex::min(blockDimx, indexer.numPts()-blockDimx*blockIdxx) * STRUCTSIZE; - m < mend; m += blockDimx) { - p[blockDimx*blockIdxx*STRUCTSIZE+m] = shared[m]; + lh.syncthreads(); + for (std::uint64_t m = lh.threadIdx1D(), + mend = amrex::min(lh.blockDim1D(), + indexer.numPts()-std::uint64_t(lh.blockDim1D())*lh.blockIdx1D()) * STRUCTSIZE; + m < mend; m += lh.blockDim1D()) { + p[std::uint64_t(lh.blockDim1D())*lh.blockIdx1D()*STRUCTSIZE+m] = shared[m]; } }); -#else - amrex::launch(nblocks, shared_mem_bytes, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept - { - std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x; - Gpu::SharedMemory gsm; - T* const shared = gsm.dataPtr(); - if (icell < indexer.numPts()) { - auto ga = new(shared+std::uint64_t(threadIdx.x)*STRUCTSIZE) STRUCT; - auto [i, j, k] = indexer(icell); - f(*ga, i, j, k); - } - __syncthreads(); - for (std::uint64_t m = threadIdx.x, - mend = amrex::min(blockDim.x, indexer.numPts()-std::uint64_t(blockDim.x)*blockIdx.x) * STRUCTSIZE; - m < mend; m += blockDim.x) { - p[std::uint64_t(blockDim.x)*blockIdx.x*STRUCTSIZE+m] = shared[m]; - } - }); -#endif } else #endif { diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H index 1b3f8cc13f0..ee9584a4c0e 100644 --- a/Src/Base/AMReX_FBI.H +++ b/Src/Base/AMReX_FBI.H @@ -295,25 +295,16 @@ void deterministic_fab_to_fab (Vector> const& a_tags, int s auto const* pntags = d_ntags.data(); auto const nblocks = int(h_ntags.size()-1); constexpr auto nthreads = 256; - amrex::launch(nblocks, Gpu::gpuStream(), -#ifdef AMREX_USE_SYCL - [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept - [[sycl::reqd_work_group_size(nthreads)]] -#else - [=] AMREX_GPU_DEVICE () noexcept -#endif - { -#ifdef AMREX_USE_SYCL - Dim1 blockIdx{item.get_group_linear_id()}; - Dim1 threadIdx{item.get_local_linear_id()}; -#endif - for (unsigned int itag = pntags[blockIdx.x]; itag < pntags[blockIdx.x+1]; ++itag) { + amrex::LaunchRaw(amrex::IntVectND<1>{nblocks}, + [=] AMREX_GPU_DEVICE (auto lh) noexcept + { + for (unsigned int itag = pntags[lh.blockIdx1D()]; itag < pntags[lh.blockIdx1D()+1]; ++itag) { auto const tag = ptag[itag]; auto ncells = int(tag.dbox.numPts()); const auto len = amrex::length(tag.dbox); const auto lo = amrex::lbound(tag.dbox); - for (int icell = int(threadIdx.x); icell < ncells; icell += nthreads) { + for (int icell = int(lh.threadIdx1D()); icell < ncells; icell += nthreads) { int k = icell / (len.x*len.y); int j = (icell - k*(len.x*len.y)) / len.x; int i = (icell - k*(len.x*len.y)) - j*len.x; @@ -328,12 +319,8 @@ void deterministic_fab_to_fab (Vector> const& a_tags, int s } } - if (itag+1 < pntags[blockIdx.x+1]) { -#ifdef AMREX_USE_SYCL - sycl::group_barrier(item.get_group()); -#else - __syncthreads(); -#endif + if (itag+1 < pntags[lh.blockIdx1D()+1]) { + lh.syncthreads(); } } }); diff --git a/Src/Base/AMReX_GpuContainers.H b/Src/Base/AMReX_GpuContainers.H index 11f4585feda..b1d0be9de89 100644 --- a/Src/Base/AMReX_GpuContainers.H +++ b/Src/Base/AMReX_GpuContainers.H @@ -438,51 +438,27 @@ namespace amrex::Gpu { auto pu = reinterpret_cast(p); constexpr int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128; int nblocks = static_cast((N+nthreads_per_block-1)/nthreads_per_block); - std::size_t shared_mem_bytes = nthreads_per_block * sizeof(T); -#ifdef AMREX_USE_SYCL - amrex::launch(nblocks, shared_mem_bytes, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept + std::size_t shared_mem_Uelem = nthreads_per_block * nU; + amrex::LaunchRaw(amrex::IntVectND<1>{nblocks}, shared_mem_Uelem, + [=] AMREX_GPU_DEVICE (auto lh) noexcept { - Long i = handler.globalIdx(); - Long blockDimx = handler.blockDim(); - Long threadIdxx = handler.threadIdx(); - Long blockIdxx = handler.blockIdx(); - auto const shared_U = (U*)handler.sharedMemory(); - auto const shared_T = (T*)shared_U; - if (i < N) { - auto ga = new(shared_T+threadIdxx) T; - f(*ga, i); - } - handler.sharedBarrier(); - for (Long m = threadIdxx, - mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx); - m < mend; m += blockDimx) { - pu[blockDimx*blockIdxx*nU+m] = shared_U[m]; - } - }); -#else - amrex::launch(nblocks, shared_mem_bytes, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept - { - Long blockDimx = blockDim.x; - Long threadIdxx = threadIdx.x; - Long blockIdxx = blockIdx.x; + Long blockDimx = lh.blockDim1D(); + Long threadIdxx = lh.threadIdx1D(); + Long blockIdxx = lh.blockIdx1D(); Long i = blockDimx*blockIdxx + threadIdxx; - Gpu::SharedMemory gsm; - auto const shared_U = gsm.dataPtr(); + auto const shared_U = lh.shared_memory(); auto const shared_T = (T*)shared_U; if (i < N) { auto ga = new(shared_T+threadIdxx) T; f(*ga, i); } - __syncthreads(); + lh.syncthreads(); for (Long m = threadIdxx, mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx); m < mend; m += blockDimx) { pu[blockDimx*blockIdxx*nU+m] = shared_U[m]; } }); -#endif } #endif } diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H index 0c13b1cb6bd..4a10a3a70ed 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsC.H +++ b/Src/Base/AMReX_GpuLaunchFunctsC.H @@ -165,19 +165,19 @@ void LaunchRaw (IntVectND nblocks, std::size_t shared_mem_elements, L const std::vector smem(shared_mem_elements); if constexpr(dim == 1) { for (int bx=0; bx < nblocks[0]; ++bx) { - f(Gpu::LaunchHandler{{bx}, nblocks, smem.data()}); + f(Gpu::LaunchHandler{{bx}, nblocks, smem.data()}); } } else if constexpr(dim == 2) { for (int by=0; by < nblocks[1]; ++by) { for (int bx=0; bx < nblocks[0]; ++bx) { - f(Gpu::LaunchHandler{{bx, by}, nblocks, smem.data()}); + f(Gpu::LaunchHandler{{bx, by}, nblocks, smem.data()}); } } } else { for (int bz=0; bz < nblocks[2]; ++bz) { for (int by=0; by < nblocks[1]; ++by) { for (int bx=0; bx < nblocks[0]; ++bx) { - f(Gpu::LaunchHandler{{bx, by, bz}, nblocks, smem.data()}); + f(Gpu::LaunchHandler{{bx, by, bz}, nblocks, smem.data()}); } } } diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index 3afe26ec9e9..b647c58bd3f 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -726,7 +726,6 @@ void LaunchRaw (IntVectND nblocks, L const& f) try { q.submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range(threads_total, threads_per_block), [=] (sycl::nd_item item) [[sycl::reqd_work_group_size(1, 1, MT)]] @@ -759,7 +758,6 @@ void LaunchRaw (IntVectND nblocks, std::size_t shared_mem_elements, L const try { q.submit([&] (sycl::handler& h) { sycl::local_accessor shared_data(sycl::range<1>(shared_mem_elements), h); - h.parallel_for(sycl::nd_range(threads_total, threads_per_block), [=] (sycl::nd_item item) [[sycl::reqd_work_group_size(1, 1, MT)]] diff --git a/Src/Base/AMReX_GpuReduce.H b/Src/Base/AMReX_GpuReduce.H index b0d8c451427..45aca944157 100644 --- a/Src/Base/AMReX_GpuReduce.H +++ b/Src/Base/AMReX_GpuReduce.H @@ -330,6 +330,13 @@ T blockReduce (T x, WARPREDUCE && warp_reduce, T x0) return x; } +template +AMREX_GPU_DEVICE AMREX_FORCE_INLINE +T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const&) +{ + return blockReduce(x, warp_reduce, x0); +} + template AMREX_GPU_DEVICE AMREX_FORCE_INLINE void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op, diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index 9731f23f5cf..7a63dd050cf 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -107,6 +107,8 @@ struct Handler {}; template struct LaunchHandler { + static_assert(dim >= 1 && dim <= 3); + #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) LaunchHandler() = default; #elif defined(AMREX_USE_SYCL) @@ -243,6 +245,17 @@ struct LaunchHandler #endif } + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + auto handler () const { +#if defined(AMREX_USE_SYCL) + return Gpu::Handler(m_item, reinterpret_cast(m_shared_mem), threads_per_block); +#elif defined(AMREX_USE_GPU) + return Gpu::Handler(threads_per_block); +#else + return 0; +#endif + } + AMREX_GPU_DEVICE AMREX_FORCE_INLINE auto sycl_item () const { #if defined(AMREX_USE_SYCL) @@ -257,17 +270,13 @@ private: #if defined(AMREX_USE_SYCL) sycl::nd_item const* m_item; T * m_shared_mem; -#endif - -#if !defined(AMREX_USE_GPU) +#elif !defined(AMREX_USE_GPU) IntVectND m_blockid; IntVectND m_griddim; T * m_shared_mem; #endif }; - - } #endif diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp b/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp index 5e1fb62fc20..d9a29036d70 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp @@ -85,20 +85,12 @@ MLEBTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, #ifdef AMREX_USE_GPU if (Gpu::inLaunchRegion()) { - amrex::launch<64>(12, Gpu::gpuStream(), -#ifdef AMREX_USE_SYCL - [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) + amrex::LaunchRaw<64>(amrex::IntVectND<1>{12}, + [=] AMREX_GPU_DEVICE (auto lh) { - int bid = item.get_group_linear_id(); - int tid = item.get_local_linear_id(); - int bdim = item.get_local_range(0); -#else - [=] AMREX_GPU_DEVICE () - { - int bid = blockIdx.x; - int tid = threadIdx.x; - int bdim = blockDim.x; -#endif + int bid = lh.blockIdx1D(); + int tid = lh.threadIdx1D(); + int bdim = lh.blockDim1D(); mltensor_fill_edges(bid, tid, bdim, vbx, velfab, mxlo, mylo, mzlo, mxhi, myhi, mzhi, bvxlo, bvylo, bvzlo, bvxhi, bvyhi, bvzhi, diff --git a/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp index 8d72107d541..dfd8dce0544 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp @@ -408,20 +408,12 @@ MLTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, // NOLINT(reada // only edge vals used in 3D stencil #ifdef AMREX_USE_GPU if (Gpu::inLaunchRegion()) { - amrex::launch<64>(12, Gpu::gpuStream(), -#ifdef AMREX_USE_SYCL + amrex::LaunchRaw<64>(amrex::IntVectND<1>{12}, [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) { - int bid = item.get_group_linear_id(); - int tid = item.get_local_linear_id(); - int bdim = item.get_local_range(0); -#else - [=] AMREX_GPU_DEVICE () - { - int bid = blockIdx.x; - int tid = threadIdx.x; - int bdim = blockDim.x; -#endif + int bid = lh.blockIdx1D(); + int tid = lh.threadIdx1D(); + int bdim = lh.blockDim1D(); mltensor_fill_edges(bid, tid, bdim, vbx, velfab, mxlo, mylo, mzlo, mxhi, myhi, mzhi, bvxlo, bvylo, bvzlo, bvxhi, bvyhi, bvzhi, diff --git a/Src/Particle/AMReX_ParticleUtil.H b/Src/Particle/AMReX_ParticleUtil.H index fe1b599f0e4..c2278f80042 100644 --- a/Src/Particle/AMReX_ParticleUtil.H +++ b/Src/Particle/AMReX_ParticleUtil.H @@ -799,12 +799,13 @@ void PermutationForDeposition (Gpu::DeviceVector& perm, index_type n pllist_next[i] = Gpu::Atomic::Exch(pllist_start + f(i), i); }); -#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) - amrex::launch(nbins / gpu_block_size, Gpu::gpuStream(), - [pllist_start,pllist_next,pperm,pglobal_idx] AMREX_GPU_DEVICE () { - __shared__ index_type sdata[gpu_block_size]; - __shared__ index_type global_idx_start; - __shared__ index_type idx_start; +#if defined(AMREX_USE_GPU) + amrex::LaunchRaw(amrex::IntVectND<1>{nbins / gpu_block_size}, + gpu_block_size + 2, + [pllist_start,pllist_next,pperm,pglobal_idx] AMREX_GPU_DEVICE (auto lh) { + index_type* sdata = lh.shared_memory(); + index_type& global_idx_start = *(sdata + gpu_block_size); + index_type& idx_start = *(sdata + gpu_block_size + 1); index_type current_idx = 0; @@ -812,7 +813,7 @@ void PermutationForDeposition (Gpu::DeviceVector& perm, index_type n // Compressed layout: subsequent sweeps of up to gpu_block_size contiguous particles // are put right next to each other, while without the compressed layout, // there can be other particle sweeps from different locations between them. - current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x]; + current_idx = pllist_start[lh.threadIdx1D() + gpu_block_size * lh.blockIdx1D()]; index_type num_particles_thread = 0; while (current_idx != llist_guard) { @@ -823,33 +824,33 @@ void PermutationForDeposition (Gpu::DeviceVector& perm, index_type n index_type num_particles_block = Gpu::blockReduceSum(num_particles_thread); - if (threadIdx.x == 0) { + if (lh.threadIdx1D() == 0) { global_idx_start = Gpu::Atomic::Add(pglobal_idx, num_particles_block); } } - current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x]; + current_idx = pllist_start[lh.threadIdx1D() + gpu_block_size * lh.blockIdx1D()]; while (true) { - sdata[threadIdx.x] = index_type(current_idx != llist_guard); + sdata[lh.threadIdx1D()] = index_type(current_idx != llist_guard); index_type x = 0; // simple block wide prefix sum for (index_type i = 1; i= i) { - x = sdata[threadIdx.x - i]; + lh.syncthreads(); + if (lh.threadIdx1D() >= i) { + x = sdata[lh.threadIdx1D() - i]; } - __syncthreads(); - if (threadIdx.x >= i) { - sdata[threadIdx.x] += x; + lh.syncthreads(); + if (lh.threadIdx1D() >= i) { + sdata[lh.threadIdx1D()] += x; } } - __syncthreads(); + lh.syncthreads(); if (sdata[gpu_block_size_m1] == 0) { break; } - if (threadIdx.x == gpu_block_size_m1) { + if (lh.threadIdx1D() == gpu_block_size_m1) { if constexpr (compressed_layout) { idx_start = global_idx_start; global_idx_start += sdata[gpu_block_size_m1]; @@ -857,17 +858,17 @@ void PermutationForDeposition (Gpu::DeviceVector& perm, index_type n idx_start = Gpu::Atomic::Add(pglobal_idx, sdata[gpu_block_size_m1]); } } - __syncthreads(); - sdata[threadIdx.x] += idx_start; + lh.syncthreads(); + sdata[lh.threadIdx1D()] += idx_start; if (current_idx != llist_guard) { - pperm[sdata[threadIdx.x] - 1] = current_idx; + pperm[sdata[lh.threadIdx1D()] - 1] = current_idx; current_idx = pllist_next[current_idx]; } } }); #else amrex::ignore_unused(pperm, pglobal_idx, compressed_layout); - Abort("PermutationForDeposition only implemented for CUDA and HIP"); + Abort("PermutationForDeposition only implemented for GPU"); #endif Gpu::Device::streamSynchronize(); From 1a0897b3e7296778cbeb6e729fb66beb3f7a65e0 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Tue, 10 Feb 2026 17:26:52 +0100 Subject: [PATCH 11/46] Use LaunchRaw in more functions --- Src/AmrCore/AMReX_TagBox.cpp | 10 +- Src/Base/AMReX_GpuReduce.H | 7 - Src/Base/AMReX_MultiFabUtil.H | 22 +- Src/Base/AMReX_MultiFabUtil.cpp | 29 +-- Src/Base/AMReX_Reduce.H | 247 +++++++------------- Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp | 2 +- 6 files changed, 105 insertions(+), 212 deletions(-) diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp index 066c863a01f..3600d42cc62 100644 --- a/Src/AmrCore/AMReX_TagBox.cpp +++ b/Src/AmrCore/AMReX_TagBox.cpp @@ -448,11 +448,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector& v) const const char* tags = (*this)[fai].dataPtr(); amrex::LaunchRaw(amrex::IntVectND<1>{nblocks[li]}, -#ifdef AMREX_USE_SYCL - Gpu::Device::warp_size, -#else - 0, -#endif + AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size), [=] AMREX_GPU_DEVICE (auto lh) noexcept { int bid = lh.blockIdx1D(); @@ -465,8 +461,8 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector& v) const } t = Gpu::blockReduce - (t, Gpu::warpReduce >(), 0, - lh.handler()); + (t, Gpu::warpReduce >(), 0 + AMREX_SYCL_REDUCE_HANDLER(lh.handler())); if (tid == 0) { ntags[bid] = t; } diff --git a/Src/Base/AMReX_GpuReduce.H b/Src/Base/AMReX_GpuReduce.H index 45aca944157..b0d8c451427 100644 --- a/Src/Base/AMReX_GpuReduce.H +++ b/Src/Base/AMReX_GpuReduce.H @@ -330,13 +330,6 @@ T blockReduce (T x, WARPREDUCE && warp_reduce, T x0) return x; } -template -AMREX_GPU_DEVICE AMREX_FORCE_INLINE -T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const&) -{ - return blockReduce(x, warp_reduce, x0); -} - template AMREX_GPU_DEVICE AMREX_FORCE_INLINE void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op, diff --git a/Src/Base/AMReX_MultiFabUtil.H b/Src/Base/AMReX_MultiFabUtil.H index 440f5e27f2a..ce890a249a0 100644 --- a/Src/Base/AMReX_MultiFabUtil.H +++ b/Src/Base/AMReX_MultiFabUtil.H @@ -1209,20 +1209,14 @@ void reduce_to_plane (Array4 const& ar, int direction, Box const& bx, int box const auto len = amrex::length(bx); constexpr int nthreads = 128; auto nblocks = static_cast(b2d.numPts()); -#ifdef AMREX_USE_SYCL - constexpr std::size_t shared_mem_bytes = sizeof(T)*Gpu::Device::warp_size; - amrex::launch(nblocks, shared_mem_bytes, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) - { - int bid = h.blockIdx(); - int tid = h.threadIdx(); -#else - amrex::launch(nblocks, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () + + amrex::LaunchRaw(amrex::IntVectND<1>{nblocks}, + AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size), + [=] AMREX_GPU_DEVICE (auto lh) { - int bid = blockIdx.x; - int tid = threadIdx.x; -#endif + int bid = lh.blockIdx1D(); + int tid = lh.threadIdx1D(); + T tmp; Op().init(tmp); T* p; @@ -1255,7 +1249,7 @@ void reduce_to_plane (Array4 const& ar, int direction, Box const& bx, int box p = ar.ptr(i,j,0); } #ifdef AMREX_USE_SYCL - Op().template parallel_update(*p, tmp, h); + Op().template parallel_update(*p, tmp, lh.handler()); #else Op().template parallel_update(*p, tmp); #endif diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp index 16b7afc362f..de1685b9b5a 100644 --- a/Src/Base/AMReX_MultiFabUtil.cpp +++ b/Src/Base/AMReX_MultiFabUtil.cpp @@ -866,22 +866,14 @@ namespace amrex } int n2dblocks = (n2d+AMREX_GPU_MAX_THREADS-1)/AMREX_GPU_MAX_THREADS; int nblocks = n2dblocks * b.length(direction); -#ifdef AMREX_USE_SYCL - std::size_t shared_mem_byte = sizeof(Real)*Gpu::Device::warp_size; - amrex::launch(nblocks, shared_mem_byte, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept -#else - amrex::launch(nblocks, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept -#endif + + amrex::LaunchRaw(amrex::IntVectND<1>{nblocks}, + AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size), + [=] AMREX_GPU_DEVICE (auto lh) noexcept { -#ifdef AMREX_USE_SYCL - int i1d = h.blockIdx() / n2dblocks; - int i2d = h.threadIdx() + AMREX_GPU_MAX_THREADS*(h.blockIdx()-i1d*n2dblocks); -#else - int i1d = blockIdx.x / n2dblocks; - int i2d = threadIdx.x + AMREX_GPU_MAX_THREADS*(blockIdx.x-i1d*n2dblocks); -#endif + int i1d = lh.blockIdx1D() / n2dblocks; + int i2d = lh.threadIdx1D() + + AMREX_GPU_MAX_THREADS*(lh.blockIdx1D()-i1d*n2dblocks); int i2dy = i2d / n2dx; int i2dx = i2d - i2dy*n2dx; int i, j, k, idir; @@ -903,11 +895,8 @@ namespace amrex } for (int n = 0; n < ncomp; ++n) { Real r = (i2d < n2d) ? fab(i,j,k,n+icomp) : Real(0.0); -#ifdef AMREX_USE_SYCL - Gpu::deviceReduceSum_full(p+n+ncomp*idir, r, h); -#else - Gpu::deviceReduceSum_full(p+n+ncomp*idir, r); -#endif + Gpu::deviceReduceSum_full(p+n+ncomp*idir, r + AMREX_SYCL_REDUCE_HANDLER(lh.handler())); } }); } diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H index f97ecbaaa9e..e7f33db5569 100644 --- a/Src/Base/AMReX_Reduce.H +++ b/Src/Base/AMReX_Reduce.H @@ -229,6 +229,10 @@ namespace Reduce::detail { P().parallel_update(amrex::get(d), amrex::get(s), h); for_each_parallel(d, s, h); } + +#define AMREX_SYCL_REDUCE_SMEM(warp_size) warp_size +#define AMREX_SYCL_REDUCE_HANDLER(launch_handler) , launch_handler + #else template AMREX_GPU_DEVICE AMREX_FORCE_INLINE @@ -244,6 +248,10 @@ namespace Reduce::detail { P().parallel_update(amrex::get(d), amrex::get(s)); for_each_parallel(d, s); } + +#define AMREX_SYCL_REDUCE_SMEM(warp_size) 0 +#define AMREX_SYCL_REDUCE_HANDLER(launch_handler) + #endif #endif @@ -604,29 +612,22 @@ public: reduce_data.nBlocks(stream) = nblocks_ec; reduce_data.updateMaxStreamIndex(stream); -#ifdef AMREX_USE_SYCL // device reduce needs local(i.e., shared) memory - constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size; - amrex::launch(nblocks_ec, shared_mem_bytes, stream, - [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept + amrex::LaunchRaw( + amrex::IntVectND<1>{nblocks_ec}, + AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size), + [=] AMREX_GPU_DEVICE (auto lh) noexcept { - Dim1 blockIdx {gh.blockIdx()}; - Dim1 threadIdx{gh.threadIdx()}; -#else - amrex::launch_global - <<>> - ([=] AMREX_GPU_DEVICE () noexcept - { -#endif ReduceTuple r; Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r); ReduceTuple& dst = pdst[blockIdx.x]; - if (threadIdx.x == 0) { + if (lh.threadIdx1D() == 0) { dst = r; } - for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) { + for (int iblock = lh.blockIdx1D(); iblock < nblocks; iblock += nblocks_ec) { int ibox = iblock / nblocks_per_box; - auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + threadIdx.x; + auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + + lh.threadIdx1D(); BoxIndexer const& indexer = dp_boxes[ibox]; if (icell < indexer.numPts()) { @@ -635,11 +636,8 @@ public: (f, ibox, i, j, k, ncomp, r); } } -#ifdef AMREX_USE_SYCL - Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh); -#else - Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r); -#endif + Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r + AMREX_SYCL_REDUCE_HANDLER(lh.handler())); }); } } @@ -659,28 +657,20 @@ public: / (nitems_per_thread*AMREX_GPU_MAX_THREADS); nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks()); reduce_data.updateMaxStreamIndex(stream); -#ifdef AMREX_USE_SYCL - // device reduce needs local(i.e., shared) memory - constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size; - amrex::launch(nblocks_ec, shared_mem_bytes, stream, - [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept + amrex::LaunchRaw( + amrex::IntVectND<1>{static_cast(nblocks_ec)}, + AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size), + [=] AMREX_GPU_DEVICE (auto lh) noexcept { - Dim1 blockIdx {gh.blockIdx()}; - Dim1 threadIdx{gh.threadIdx()}; - Dim1 gridDim {gh.gridDim()}; -#else - amrex::launch(nblocks_ec, 0, stream, - [=] AMREX_GPU_DEVICE () noexcept - { -#endif ReduceTuple r; Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r); - ReduceTuple& dst = *(dp+blockIdx.x); - if (threadIdx.x == 0 && static_cast(blockIdx.x) >= nblocks) { + ReduceTuple& dst = *(dp+lh.blockIdx1D()); + if (lh.threadIdx1D() == 0 && static_cast(lh.blockIdx1D()) >= nblocks) { dst = r; } - for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x, - stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; + for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*lh.blockIdx1D() + + lh.threadIdx1D(), + stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*nblocks_ec; icell < indexer.numPts(); icell += stride) { @@ -696,11 +686,8 @@ public: } } } -#ifdef AMREX_USE_SYCL - Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh); -#else - Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r); -#endif + Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r + AMREX_SYCL_REDUCE_HANDLER(lh.handler())); }); nblocks = std::max(nblocks, static_cast(nblocks_ec)); } @@ -792,39 +779,28 @@ public: / (nitems_per_thread*AMREX_GPU_MAX_THREADS); nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks()); reduce_data.updateMaxStreamIndex(stream); -#ifdef AMREX_USE_SYCL // device reduce needs local(i.e., shared) memory - constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size; - amrex::launch(nblocks_ec, shared_mem_bytes, stream, - [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept + amrex::LaunchRaw( + amrex::IntVectND<1>{nblocks_ec}, + AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size), + [=] AMREX_GPU_DEVICE (auto lh) noexcept { - Dim1 blockIdx {gh.blockIdx()}; - Dim1 threadIdx{gh.threadIdx()}; - Dim1 gridDim {gh.gridDim()}; -#else - amrex::launch(nblocks_ec, 0, stream, - [=] AMREX_GPU_DEVICE () noexcept - { -#endif ReduceTuple r; Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r); - ReduceTuple& dst = *(dp+blockIdx.x); - if (threadIdx.x == 0 && static_cast(blockIdx.x) >= nblocks) { + ReduceTuple& dst = *(dp+lh.blockIdx1D()); + if (lh.threadIdx1D() == 0 && static_cast(lh.blockIdx1D()) >= nblocks) { dst = r; } - for (N i = N(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x, - stride = N(AMREX_GPU_MAX_THREADS)*gridDim.x; + for (N i = N(AMREX_GPU_MAX_THREADS)*lh.blockIdx1D()+lh.threadIdx1D(), + stride = N(AMREX_GPU_MAX_THREADS)*nblocks_ec; i < n; i += stride) { auto pr = f(i); Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr); } -#ifdef AMREX_USE_SYCL - Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh); -#else - Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r); -#endif + Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r + AMREX_SYCL_REDUCE_HANDLER(lh.handler())); }); nblocks = amrex::max(nblocks, nblocks_ec); } @@ -860,52 +836,34 @@ public: #endif { int maxblocks = reduce_data.maxBlocks(); -#ifdef AMREX_USE_SYCL - // device reduce needs local(i.e., shared) memory - constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size; -#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND +#if !defined(AMREX_NO_SYCL_REDUCE_WORKAROUND) && defined(AMREX_USE_SYCL) // xxxxx SYCL todo: reduce bug workaround Gpu::DeviceVector dtmp(1); auto presult = dtmp.data(); #else auto presult = hp; #endif - amrex::launch(1, shared_mem_bytes, stream, - [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept + amrex::LaunchRaw( + amrex::IntVectND<1>{1}, + AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size), + [=] AMREX_GPU_DEVICE (auto lh) noexcept { ReduceTuple r; Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r); ReduceTuple dst = r; for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) { auto dp_stream = dp+istream*maxblocks; - for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0); + for (int i = lh.globalIdx1D(), stride = AMREX_GPU_MAX_THREADS; i < nblocks[istream]; i += stride) { Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]); } } - Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh); - if (gh.threadIdx() == 0) { *presult = dst; } + Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r + AMREX_SYCL_REDUCE_HANDLER(lh.handler())); + if (lh.threadIdx1D() == 0) { *presult = dst; } }); -#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND +#if !defined(AMREX_NO_SYCL_REDUCE_WORKAROUND) && defined(AMREX_USE_SYCL) Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple)); -#endif -#else - amrex::launch(1, 0, stream, - [=] AMREX_GPU_DEVICE () noexcept - { - ReduceTuple r; - Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r); - ReduceTuple dst = r; - for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) { - auto dp_stream = dp+istream*maxblocks; - for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x; - i < nblocks[istream]; i += stride) { - Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]); - } - } - Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r); - if (threadIdx.x == 0) { *hp = dst; } - }); #endif Gpu::streamSynchronize(); } @@ -1036,51 +994,34 @@ bool AnyOf (N n, T const* v, P const& pred) int* dp = ds.dataPtr(); auto ec = Gpu::ExecutionConfig(n); ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()); - -#ifdef AMREX_USE_SYCL - const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1; - const std::size_t shared_mem_bytes = num_ints*sizeof(int); - amrex::launch(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { - int* has_any = &(static_cast(gh.sharedMemory())[num_ints-1]); - if (gh.threadIdx() == 0) { *has_any = *dp; } - gh.sharedBarrier(); - - if (!(*has_any)) - { - int r = false; - for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim(); - i < n && !r; i += stride) - { - r = pred(v[i]) ? 1 : 0; - } - - r = Gpu::blockReduce - (r, Gpu::warpReduce >(), 0, gh); - if (gh.threadIdx() == 0 && r) { *dp = 1; } - } - }); -#else - amrex::launch(ec.numBlocks.x, 0, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - __shared__ int has_any; - if (threadIdx.x == 0) { has_any = *dp; } - __syncthreads(); + const int nblocks_ec = ec.numBlocks.x; + [[maybe_unused]] const int num_ints = + std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size); + + amrex::LaunchRaw( + amrex::IntVectND<1>{nblocks_ec}, + AMREX_SYCL_REDUCE_SMEM(num_ints) + 1, + [=] AMREX_GPU_DEVICE (auto lh) noexcept + { + int& has_any = *(lh.shared_memory() + AMREX_SYCL_REDUCE_SMEM(num_ints)); + if (lh.threadIdx1D() == 0) { has_any = *dp; } + lh.syncthreads(); if (!has_any) { int r = false; - for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x; - i < n && !r; i += stride) + for (N i = AMREX_GPU_MAX_THREADS*lh.blockIdx1D()+lh.threadIdx1D(), + stride = AMREX_GPU_MAX_THREADS*nblocks_ec; + i < n && !r; i += stride) { r = pred(v[i]) ? 1 : 0; } r = Gpu::blockReduce - (r, Gpu::warpReduce >(), 0); + (r, Gpu::warpReduce >(), 0 + AMREX_SYCL_REDUCE_HANDLER(lh.handler())); if (threadIdx.x == 0 && r) *dp = 1; } }); -#endif return ds.dataValue(); } @@ -1093,45 +1034,25 @@ bool AnyOf (BoxND const& box, P const& pred) const BoxIndexerND indexer(box); auto ec = Gpu::ExecutionConfig(box.numPts()); ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch()); - -#ifdef AMREX_USE_SYCL - const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1; - const std::size_t shared_mem_bytes = num_ints*sizeof(int); - amrex::launch(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept { - int* has_any = &(static_cast(gh.sharedMemory())[num_ints-1]); - if (gh.threadIdx() == 0) { *has_any = *dp; } - gh.sharedBarrier(); - - if (!(*has_any)) - { - int r = false; - for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.blockIdx()+gh.threadIdx(), - stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.gridDim(); - icell < indexer.numPts() && !r; - icell += stride) - { - auto iv = indexer.intVect(icell); - r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0; - } - r = Gpu::blockReduce - (r, Gpu::warpReduce >(), 0, gh); - if (gh.threadIdx() == 0 && r) { *dp = 1; } - } - }); -#else - AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0, - Gpu::gpuStream(), - [=] AMREX_GPU_DEVICE () noexcept { - __shared__ int has_any; - if (threadIdx.x == 0) { has_any = *dp; } - __syncthreads(); + const int nblocks_ec = ec.numBlocks.x; + [[maybe_unused]] const int num_ints = + std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size); + + amrex::LaunchRaw( + amrex::IntVectND<1>{nblocks_ec}, + AMREX_SYCL_REDUCE_SMEM(num_ints) + 1, + [=] AMREX_GPU_DEVICE (auto lh) noexcept + { + int& has_any = *(lh.shared_memory() + AMREX_SYCL_REDUCE_SMEM(num_ints)); + if (lh.threadIdx1D() == 0) { has_any = *dp; } + lh.syncthreads(); if (!has_any) { int r = false; - for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x, - stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x; + for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)* + lh.blockIdx1D()+lh.threadIdx1D(), + stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*nblocks_ec; icell < indexer.numPts() && !r; icell += stride) { @@ -1139,11 +1060,11 @@ bool AnyOf (BoxND const& box, P const& pred) r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0; } r = Gpu::blockReduce - (r, Gpu::warpReduce >(), 0); - if (threadIdx.x == 0 && r) *dp = 1; + (r, Gpu::warpReduce >(), 0 + AMREX_SYCL_REDUCE_HANDLER(lh.handler())); + if (lh.threadIdx1D() == 0 && r) *dp = 1; } }); -#endif return ds.dataValue(); } diff --git a/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp index dfd8dce0544..372fd1ae09f 100644 --- a/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp +++ b/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp @@ -409,7 +409,7 @@ MLTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, // NOLINT(reada #ifdef AMREX_USE_GPU if (Gpu::inLaunchRegion()) { amrex::LaunchRaw<64>(amrex::IntVectND<1>{12}, - [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) + [=] AMREX_GPU_DEVICE (auto lh) { int bid = lh.blockIdx1D(); int tid = lh.threadIdx1D(); From 4956fde3e02934a8f13d5b1958af9fd511756a5d Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Tue, 10 Feb 2026 17:34:52 +0100 Subject: [PATCH 12/46] fix --- Src/Base/AMReX_Reduce.H | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H index e7f33db5569..260f8be11f8 100644 --- a/Src/Base/AMReX_Reduce.H +++ b/Src/Base/AMReX_Reduce.H @@ -620,7 +620,7 @@ public: { ReduceTuple r; Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r); - ReduceTuple& dst = pdst[blockIdx.x]; + ReduceTuple& dst = pdst[lh.blockIdx1D()]; if (lh.threadIdx1D() == 0) { dst = r; } @@ -815,7 +815,6 @@ public: } using ReduceTuple = typename D::Type; - auto const& stream = Gpu::gpuStream(); auto dp = reduce_data.devicePtr(); auto const& nblocks = reduce_data.nBlocks(); #if defined(AMREX_USE_SYCL) @@ -1019,7 +1018,7 @@ bool AnyOf (N n, T const* v, P const& pred) r = Gpu::blockReduce (r, Gpu::warpReduce >(), 0 AMREX_SYCL_REDUCE_HANDLER(lh.handler())); - if (threadIdx.x == 0 && r) *dp = 1; + if (lh.threadIdx1D() == 0 && r) *dp = 1; } }); return ds.dataValue(); From 19c909a79ae97d9f9077031d6cf9897272114757 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Tue, 10 Feb 2026 17:43:08 +0100 Subject: [PATCH 13/46] fix sycl --- Src/Base/AMReX_GpuTypes.H | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index 7a63dd050cf..4928328ce7e 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -153,7 +153,7 @@ struct LaunchHandler if constexpr (dim == 1) { return static_cast(AMREX_HIP_CUDA_OR_SYCL_OR_CPU( blockIdx.x, - m_item->get_group_id(0), + m_item->get_group(0), m_blockid[0] )); } else if constexpr (dim == 2) { @@ -177,15 +177,15 @@ struct LaunchHandler if constexpr (dim == 1) { return AMREX_HIP_CUDA_OR_SYCL_OR_CPU( IntVectND(static_cast(blockIdx.x)), - IntVectND(static_cast(m_item->get_group_id(0))), + IntVectND(static_cast(m_item->get_group(0))), m_blockid ); } else if constexpr (dim == 2) { return AMREX_HIP_CUDA_OR_SYCL_OR_CPU( IntVectND(static_cast(blockIdx.x), static_cast(blockIdx.y)), - IntVectND(static_cast(m_item->get_group_id(1)), - static_cast(m_item->get_group_id(0))), + IntVectND(static_cast(m_item->get_group(1)), + static_cast(m_item->get_group(0))), m_blockid ); } else { @@ -193,9 +193,9 @@ struct LaunchHandler IntVectND(static_cast(blockIdx.x), static_cast(blockIdx.y), static_cast(blockIdx.z)), - IntVectND(static_cast(m_item->get_group_id(2)), - static_cast(m_item->get_group_id(1)), - static_cast(m_item->get_group_id(0))), + IntVectND(static_cast(m_item->get_group(2)), + static_cast(m_item->get_group(1)), + static_cast(m_item->get_group(0))), m_blockid ); } From c0a70a9240d55a18f3210a997372f9d7ae7f15c3 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Tue, 10 Mar 2026 11:35:17 +0100 Subject: [PATCH 14/46] Add AMREX_IF_SYCL and nodiscard --- Src/AmrCore/AMReX_TagBox.cpp | 4 ++-- Src/Base/AMReX_GpuControl.H | 8 ++++++++ Src/Base/AMReX_GpuTypes.H | 24 +++++++++++----------- Src/Base/AMReX_MultiFabUtil.H | 2 +- Src/Base/AMReX_MultiFabUtil.cpp | 4 ++-- Src/Base/AMReX_Reduce.H | 36 +++++++++++++-------------------- 6 files changed, 39 insertions(+), 39 deletions(-) diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp index 2c71eafbfbe..62e6d051490 100644 --- a/Src/AmrCore/AMReX_TagBox.cpp +++ b/Src/AmrCore/AMReX_TagBox.cpp @@ -448,7 +448,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector& v) const const char* tags = (*this)[fai].dataPtr(); amrex::LaunchRaw(amrex::IntVectND<1>{nblocks[li]}, - AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size), + AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0), [=] AMREX_GPU_DEVICE (auto lh) noexcept { int bid = lh.blockIdx1D(); @@ -462,7 +462,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector& v) const t = Gpu::blockReduce (t, Gpu::warpReduce >(), 0 - AMREX_SYCL_REDUCE_HANDLER(lh.handler())); + AMREX_IF_SYCL(, lh.handler())); if (tid == 0) { ntags[bid] = t; } diff --git a/Src/Base/AMReX_GpuControl.H b/Src/Base/AMReX_GpuControl.H index 6a7ebca1762..e5213f5440b 100644 --- a/Src/Base/AMReX_GpuControl.H +++ b/Src/Base/AMReX_GpuControl.H @@ -46,6 +46,14 @@ #define AMREX_GPU_OR_CPU(a,b) b #endif +#ifdef AMREX_USE_SYCL +#define AMREX_IF_SYCL(...) __VA_ARGS__ +#define AMREX_IF_NOT_SYCL(...) +#else +#define AMREX_IF_SYCL(...) +#define AMREX_IF_NOT_SYCL(...) __VA_ARGS__ +#endif + #ifdef AMREX_USE_SYCL #define AMREX_SYCL_ONLY(a) a #else diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index 5a43a6426ba..433984ee786 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -119,7 +119,7 @@ struct LaunchHandler : m_blockid{a_blockid}, m_griddim{a_griddim}, m_shared_mem{a_shared_mem} {} #endif - AMREX_GPU_DEVICE AMREX_FORCE_INLINE + AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] unsigned int threadIdx1D () const { return static_cast(AMREX_HIP_CUDA_OR_SYCL_OR_CPU( threadIdx.x, @@ -129,7 +129,7 @@ struct LaunchHandler } template - AMREX_GPU_DEVICE AMREX_FORCE_INLINE + AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] IntVectND threadIdxND () const { static_assert(sizeof...(nd_block_size) == dim && (1 * ... * nd_block_size) == threads_per_block); @@ -148,7 +148,7 @@ struct LaunchHandler return ret; } - AMREX_GPU_DEVICE AMREX_FORCE_INLINE + AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] unsigned int blockIdx1D () const { if constexpr (dim == 1) { return static_cast(AMREX_HIP_CUDA_OR_SYCL_OR_CPU( @@ -172,7 +172,7 @@ struct LaunchHandler } } - AMREX_GPU_DEVICE AMREX_FORCE_INLINE + AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] IntVectND blockIdxND () const { if constexpr (dim == 1) { return AMREX_HIP_CUDA_OR_SYCL_OR_CPU( @@ -201,31 +201,31 @@ struct LaunchHandler } } - AMREX_GPU_DEVICE AMREX_FORCE_INLINE + AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] static constexpr unsigned int blockDim1D () { return threads_per_block; } template - AMREX_GPU_DEVICE AMREX_FORCE_INLINE + AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] static constexpr IntVectND blockDimND () { static_assert(sizeof...(nd_block_size) == dim && (1 * ... * nd_block_size) == threads_per_block); return {nd_block_size...}; } - AMREX_GPU_DEVICE AMREX_FORCE_INLINE + AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] unsigned int globalIdx1D () const { return blockIdx1D() * threads_per_block + threadIdx1D(); } template - AMREX_GPU_DEVICE AMREX_FORCE_INLINE + AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] IntVectND globalIdxND () const { return blockIdxND() * blockDimND() + threadIdxND(); } - AMREX_GPU_DEVICE AMREX_FORCE_INLINE + AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] void syncthreads () const { AMREX_HIP_CUDA_OR_SYCL_OR_CPU( __syncthreads(), @@ -234,7 +234,7 @@ struct LaunchHandler ); } - AMREX_GPU_DEVICE AMREX_FORCE_INLINE + AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] T* shared_memory () const { #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) static_assert(32 >= alignof(T)); @@ -245,7 +245,7 @@ struct LaunchHandler #endif } - AMREX_GPU_DEVICE AMREX_FORCE_INLINE + AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] auto handler () const { #if defined(AMREX_USE_SYCL) return Gpu::Handler(m_item, reinterpret_cast(m_shared_mem), threads_per_block); @@ -256,7 +256,7 @@ struct LaunchHandler #endif } - AMREX_GPU_DEVICE AMREX_FORCE_INLINE + AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] auto sycl_item () const { #if defined(AMREX_USE_SYCL) return m_item; diff --git a/Src/Base/AMReX_MultiFabUtil.H b/Src/Base/AMReX_MultiFabUtil.H index 96ee5e58043..a01fcd0c831 100644 --- a/Src/Base/AMReX_MultiFabUtil.H +++ b/Src/Base/AMReX_MultiFabUtil.H @@ -1255,7 +1255,7 @@ void reduce_to_plane (Array4 const& ar, int direction, Box const& bx, int box auto nblocks = static_cast(b2d.numPts()); amrex::LaunchRaw(amrex::IntVectND<1>{nblocks}, - AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size), + AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0), [=] AMREX_GPU_DEVICE (auto lh) { int bid = lh.blockIdx1D(); diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp index 7adba546ac7..3870394bd86 100644 --- a/Src/Base/AMReX_MultiFabUtil.cpp +++ b/Src/Base/AMReX_MultiFabUtil.cpp @@ -872,7 +872,7 @@ namespace amrex int nblocks = n2dblocks * b.length(direction); amrex::LaunchRaw(amrex::IntVectND<1>{nblocks}, - AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size), + AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0), [=] AMREX_GPU_DEVICE (auto lh) noexcept { int i1d = lh.blockIdx1D() / n2dblocks; @@ -900,7 +900,7 @@ namespace amrex for (int n = 0; n < ncomp; ++n) { Real r = (i2d < n2d) ? fab(i,j,k,n+icomp) : Real(0.0); Gpu::deviceReduceSum_full(p+n+ncomp*idir, r - AMREX_SYCL_REDUCE_HANDLER(lh.handler())); + AMREX_IF_SYCL(, lh.handler())); } }); } diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H index d2e1f538310..18ceb3abe51 100644 --- a/Src/Base/AMReX_Reduce.H +++ b/Src/Base/AMReX_Reduce.H @@ -229,10 +229,6 @@ namespace Reduce::detail { P().parallel_update(amrex::get(d), amrex::get(s), h); for_each_parallel(d, s, h); } - -#define AMREX_SYCL_REDUCE_SMEM(warp_size) warp_size -#define AMREX_SYCL_REDUCE_HANDLER(launch_handler) , launch_handler - #else template AMREX_GPU_DEVICE AMREX_FORCE_INLINE @@ -248,10 +244,6 @@ namespace Reduce::detail { P().parallel_update(amrex::get(d), amrex::get(s)); for_each_parallel(d, s); } - -#define AMREX_SYCL_REDUCE_SMEM(warp_size) 0 -#define AMREX_SYCL_REDUCE_HANDLER(launch_handler) - #endif #endif @@ -615,7 +607,7 @@ public: // device reduce needs local(i.e., shared) memory amrex::LaunchRaw( amrex::IntVectND<1>{nblocks_ec}, - AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size), + AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0), [=] AMREX_GPU_DEVICE (auto lh) noexcept { ReduceTuple r; @@ -637,7 +629,7 @@ public: } } Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r - AMREX_SYCL_REDUCE_HANDLER(lh.handler())); + AMREX_IF_SYCL(, lh.handler())); }); } } @@ -659,7 +651,7 @@ public: reduce_data.updateMaxStreamIndex(stream); amrex::LaunchRaw( amrex::IntVectND<1>{static_cast(nblocks_ec)}, - AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size), + AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0), [=] AMREX_GPU_DEVICE (auto lh) noexcept { ReduceTuple r; @@ -687,7 +679,7 @@ public: } } Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r - AMREX_SYCL_REDUCE_HANDLER(lh.handler())); + AMREX_IF_SYCL(, lh.handler())); }); nblocks = std::max(nblocks, static_cast(nblocks_ec)); } @@ -782,7 +774,7 @@ public: // device reduce needs local(i.e., shared) memory amrex::LaunchRaw( amrex::IntVectND<1>{nblocks_ec}, - AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size), + AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0), [=] AMREX_GPU_DEVICE (auto lh) noexcept { ReduceTuple r; @@ -800,7 +792,7 @@ public: Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr); } Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r - AMREX_SYCL_REDUCE_HANDLER(lh.handler())); + AMREX_IF_SYCL(, lh.handler())); }); nblocks = amrex::max(nblocks, nblocks_ec); } @@ -844,7 +836,7 @@ public: #endif amrex::LaunchRaw( amrex::IntVectND<1>{1}, - AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size), + AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0), [=] AMREX_GPU_DEVICE (auto lh) noexcept { ReduceTuple r; @@ -858,7 +850,7 @@ public: } } Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r - AMREX_SYCL_REDUCE_HANDLER(lh.handler())); + AMREX_IF_SYCL(, lh.handler())); if (lh.threadIdx1D() == 0) { *presult = dst; } }); #if !defined(AMREX_NO_SYCL_REDUCE_WORKAROUND) && defined(AMREX_USE_SYCL) @@ -999,10 +991,10 @@ bool AnyOf (N n, T const* v, P const& pred) amrex::LaunchRaw( amrex::IntVectND<1>{nblocks_ec}, - AMREX_SYCL_REDUCE_SMEM(num_ints) + 1, + AMREX_IF_SYCL(num_ints + 1) AMREX_IF_NOT_SYCL(1), [=] AMREX_GPU_DEVICE (auto lh) noexcept { - int& has_any = *(lh.shared_memory() + AMREX_SYCL_REDUCE_SMEM(num_ints)); + int& has_any = *(lh.shared_memory() AMREX_IF_SYCL(+ num_ints)); if (lh.threadIdx1D() == 0) { has_any = *dp; } lh.syncthreads(); @@ -1017,7 +1009,7 @@ bool AnyOf (N n, T const* v, P const& pred) } r = Gpu::blockReduce (r, Gpu::warpReduce >(), 0 - AMREX_SYCL_REDUCE_HANDLER(lh.handler())); + AMREX_IF_SYCL(, lh.handler())); if (lh.threadIdx1D() == 0 && r) *dp = 1; } }); @@ -1039,10 +1031,10 @@ bool AnyOf (BoxND const& box, P const& pred) amrex::LaunchRaw( amrex::IntVectND<1>{nblocks_ec}, - AMREX_SYCL_REDUCE_SMEM(num_ints) + 1, + AMREX_IF_SYCL(num_ints + 1) AMREX_IF_NOT_SYCL(1), [=] AMREX_GPU_DEVICE (auto lh) noexcept { - int& has_any = *(lh.shared_memory() + AMREX_SYCL_REDUCE_SMEM(num_ints)); + int& has_any = *(lh.shared_memory() AMREX_IF_SYCL(+ num_ints)); if (lh.threadIdx1D() == 0) { has_any = *dp; } lh.syncthreads(); @@ -1060,7 +1052,7 @@ bool AnyOf (BoxND const& box, P const& pred) } r = Gpu::blockReduce (r, Gpu::warpReduce >(), 0 - AMREX_SYCL_REDUCE_HANDLER(lh.handler())); + AMREX_IF_SYCL(, lh.handler())); if (lh.threadIdx1D() == 0 && r) *dp = 1; } }); From 232988f88ded71486114378143f00105985acf95 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Tue, 10 Mar 2026 11:39:32 +0100 Subject: [PATCH 15/46] fix nodiscard --- Src/Base/AMReX_GpuTypes.H | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index 433984ee786..6d5f295ad29 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -119,7 +119,7 @@ struct LaunchHandler : m_blockid{a_blockid}, m_griddim{a_griddim}, m_shared_mem{a_shared_mem} {} #endif - AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE unsigned int threadIdx1D () const { return static_cast(AMREX_HIP_CUDA_OR_SYCL_OR_CPU( threadIdx.x, @@ -129,7 +129,7 @@ struct LaunchHandler } template - AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE IntVectND threadIdxND () const { static_assert(sizeof...(nd_block_size) == dim && (1 * ... * nd_block_size) == threads_per_block); @@ -148,7 +148,7 @@ struct LaunchHandler return ret; } - AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE unsigned int blockIdx1D () const { if constexpr (dim == 1) { return static_cast(AMREX_HIP_CUDA_OR_SYCL_OR_CPU( @@ -172,7 +172,7 @@ struct LaunchHandler } } - AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE IntVectND blockIdxND () const { if constexpr (dim == 1) { return AMREX_HIP_CUDA_OR_SYCL_OR_CPU( @@ -201,31 +201,31 @@ struct LaunchHandler } } - AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE static constexpr unsigned int blockDim1D () { return threads_per_block; } template - AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE static constexpr IntVectND blockDimND () { static_assert(sizeof...(nd_block_size) == dim && (1 * ... * nd_block_size) == threads_per_block); return {nd_block_size...}; } - AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE unsigned int globalIdx1D () const { return blockIdx1D() * threads_per_block + threadIdx1D(); } template - AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE IntVectND globalIdxND () const { return blockIdxND() * blockDimND() + threadIdxND(); } - AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE void syncthreads () const { AMREX_HIP_CUDA_OR_SYCL_OR_CPU( __syncthreads(), @@ -234,7 +234,7 @@ struct LaunchHandler ); } - AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE T* shared_memory () const { #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) static_assert(32 >= alignof(T)); @@ -245,7 +245,7 @@ struct LaunchHandler #endif } - AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE auto handler () const { #if defined(AMREX_USE_SYCL) return Gpu::Handler(m_item, reinterpret_cast(m_shared_mem), threads_per_block); @@ -256,7 +256,7 @@ struct LaunchHandler #endif } - AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]] + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE auto sycl_item () const { #if defined(AMREX_USE_SYCL) return m_item; From c392c70d289b62a6be39bf20996f32f375156d9a Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Tue, 10 Mar 2026 11:43:49 +0100 Subject: [PATCH 16/46] fix nodiscard 2 --- Src/Base/AMReX_GpuTypes.H | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index 6d5f295ad29..ae01f08178d 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -225,7 +225,7 @@ struct LaunchHandler return blockIdxND() * blockDimND() + threadIdxND(); } - [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE + AMREX_GPU_DEVICE AMREX_FORCE_INLINE void syncthreads () const { AMREX_HIP_CUDA_OR_SYCL_OR_CPU( __syncthreads(), From 62565bb21fa61fde3c20b2d2c95358988972e4c0 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Tue, 10 Mar 2026 12:42:55 +0100 Subject: [PATCH 17/46] try fix void return issue --- Src/Base/AMReX_GpuTypes.H | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index ae01f08178d..be186c19c35 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -252,7 +252,7 @@ struct LaunchHandler #elif defined(AMREX_USE_GPU) return Gpu::Handler(threads_per_block); #else - return 0; + return int{0}; #endif } @@ -261,7 +261,7 @@ struct LaunchHandler #if defined(AMREX_USE_SYCL) return m_item; #else - return 0; + return int{0}; #endif } From 5c59a308ccb0e79d69269486b2f856235e40a270 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Tue, 10 Mar 2026 12:51:28 +0100 Subject: [PATCH 18/46] remove nodiscard --- Src/Base/AMReX_GpuTypes.H | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index be186c19c35..fcc6b072bf5 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -245,23 +245,23 @@ struct LaunchHandler #endif } - [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE + AMREX_GPU_DEVICE AMREX_FORCE_INLINE auto handler () const { #if defined(AMREX_USE_SYCL) return Gpu::Handler(m_item, reinterpret_cast(m_shared_mem), threads_per_block); #elif defined(AMREX_USE_GPU) return Gpu::Handler(threads_per_block); #else - return int{0}; + return 0; #endif } - [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE + AMREX_GPU_DEVICE AMREX_FORCE_INLINE auto sycl_item () const { #if defined(AMREX_USE_SYCL) return m_item; #else - return int{0}; + return 0; #endif } From 6161d9015ed0caaf9cf3ef0267a44b0577291d8f Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Tue, 10 Mar 2026 13:19:59 +0100 Subject: [PATCH 19/46] try fix cuda first capture issue --- Src/Base/AMReX_Reduce.H | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H index 18ceb3abe51..db006dd71ab 100644 --- a/Src/Base/AMReX_Reduce.H +++ b/Src/Base/AMReX_Reduce.H @@ -667,13 +667,16 @@ public: icell += stride) { auto iv = indexer.intVect(icell); - amrex::ignore_unused(f,ncomp,ixtype); // work around first-capture + // work around first-capture + auto f2 = f; + auto ncomp2 = ncomp; + auto ixtype2 = ixtype; if constexpr (std::is_same_v) { - auto pr = Reduce::detail::call_f_intvect_box(f, iv, ixtype); + auto pr = Reduce::detail::call_f_intvect_box(f2, iv, ixtype2); Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr); } else { - for (int n = 0; n < ncomp; ++n) { - auto pr = Reduce::detail::call_f_intvect_n(f, iv, n); + for (int n = 0; n < ncomp2; ++n) { + auto pr = Reduce::detail::call_f_intvect_n(f2, iv, n); Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr); } } From ec26b0b305a27276dbd1cd576cc00c6fd78331c3 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Tue, 10 Mar 2026 14:25:26 +0100 Subject: [PATCH 20/46] add documentation --- Docs/Doxygen/groups.dox | 1 + Src/Base/AMReX_GpuLaunchFunctsG.H | 43 +++++++++++++++++++++++++++++++ Src/Base/AMReX_GpuTypes.H | 42 ++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+) diff --git a/Docs/Doxygen/groups.dox b/Docs/Doxygen/groups.dox index c2d946159fd..1265c9a5050 100644 --- a/Docs/Doxygen/groups.dox +++ b/Docs/Doxygen/groups.dox @@ -187,6 +187,7 @@ * - \ref amrex::ParallelFor * - \ref amrex::ParallelForOMP * - \ref amrex::ParallelForRNG + * - \ref amrex::LaunchRaw */ /** diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index fe9578793cd..24b58376a58 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -1348,6 +1348,24 @@ ParallelFor (Gpu::KernelInfo const&, AMREX_GPU_ERROR_CHECK(); } +/** + * \ingroup amrex_execution + * \brief Performance-portable kernel launch function + * that provides low-level access to GPU thread blocks through \ref amrex::Gpu::LaunchHandler. + * + * The number of threads per block is a compile-time-known one-dimensional value that usually + * should be one of 128, 256, 512 or 1024. + * + * The number of total blocks can be a 1D, 2D or 3D IntVectND. Internally this uses the native + * way to split the index, such as blockIdx.x, blockIdx.y and blockIdx.z. + * Note that this uses types int and unsigned int which might overflow if many blocks or + * total threads are needed. In case this is an issue, it is necessary to explicitly add + * a 64-bit grid-strided loop or to call LaunchRaw multiple times with fewer blocks. + * + * \tparam MT number of threads per GPU block. + * \param nblocks number of GPU blocks to launch. + * \param f a callable object that takes amrex::Gpu::LaunchHandler as input. + */ template void LaunchRaw (IntVectND nblocks, L const& f) { @@ -1367,6 +1385,31 @@ void LaunchRaw (IntVectND nblocks, L const& f) AMREX_GPU_ERROR_CHECK(); } +/** + * \ingroup amrex_execution + * \brief Performance-portable kernel launch function + * that provides low-level access to GPU thread blocks and shared memory + * through \ref amrex::Gpu::LaunchHandler. + * + * The number of threads per block is a compile-time-known one-dimensional value that usually + * should be one of 128, 256, 512 or 1024. + * + * The number of total blocks can be a 1D, 2D or 3D IntVectND. Internally this uses the native + * way to split the index, such as blockIdx.x, blockIdx.y and blockIdx.z. + * Note that this uses types int and unsigned int which might overflow if many blocks or + * total threads are needed. In case this is an issue, it is necessary to explicitly add + * a 64-bit grid-strided loop or to call LaunchRaw multiple times with fewer blocks. + * + * This version of LaunchRaw supports the use of dynamic shared memory inside the thread block. + * Shared memory is a fast cache local to thread blocks. It has different names in + * different GPU backends (CUDA: shared memory, HIP: local data share, SYCL: local memory). + * + * \tparam MT number of threads per GPU block. + * \tparam T data type of shared memory elements. + * \param nblocks number of GPU blocks to launch. + * \param shared_mem_elements number of shared memory elements per block to allocate. + * \param f a callable object that takes amrex::Gpu::LaunchHandler as input. + */ template void LaunchRaw (IntVectND nblocks, std::size_t shared_mem_elements, L const& f) { diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index fcc6b072bf5..db647cbf9a7 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -119,6 +119,9 @@ struct LaunchHandler : m_blockid{a_blockid}, m_griddim{a_griddim}, m_shared_mem{a_shared_mem} {} #endif + /** + * \brief Returns the thread ID in the local block. + */ [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE unsigned int threadIdx1D () const { return static_cast(AMREX_HIP_CUDA_OR_SYCL_OR_CPU( @@ -128,6 +131,10 @@ struct LaunchHandler )); } + /** + * \brief Splits the local thread ID into N dimensions. + * The sizes of the dimensions are supplied as template arguments. + */ template [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE IntVectND threadIdxND () const { @@ -148,6 +155,9 @@ struct LaunchHandler return ret; } + /** + * \brief Returns the block ID flattened to 1D. + */ [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE unsigned int blockIdx1D () const { if constexpr (dim == 1) { @@ -172,6 +182,9 @@ struct LaunchHandler } } + /** + * \brief Returns the 1D/2D/3D block ID. + */ [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE IntVectND blockIdxND () const { if constexpr (dim == 1) { @@ -201,11 +214,18 @@ struct LaunchHandler } } + /** + * \brief Returns the number of threads inside a block. + */ [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE static constexpr unsigned int blockDim1D () { return threads_per_block; } + /** + * \brief Analogous to threadIdxND but returns the ND block size. + * This effectively just returns the template arguments as an IntVectND. + */ template [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE static constexpr IntVectND blockDimND () { @@ -214,17 +234,27 @@ struct LaunchHandler return {nd_block_size...}; } + /** + * \brief Returns the global thread index flattened to 1D. + */ [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE unsigned int globalIdx1D () const { return blockIdx1D() * threads_per_block + threadIdx1D(); } + /** + * \brief Returns the global 1D/2D/3D thread index. + */ template [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE IntVectND globalIdxND () const { return blockIdxND() * blockDimND() + threadIdxND(); } + /** + * \brief Synchronizes all threads within a block. This is needed before + * accessing (shared) memory that was previously written by another thread in the block. + */ AMREX_GPU_DEVICE AMREX_FORCE_INLINE void syncthreads () const { AMREX_HIP_CUDA_OR_SYCL_OR_CPU( @@ -234,6 +264,12 @@ struct LaunchHandler ); } + /** + * \brief Returns a pointer to block-local shared memory. If multiple shared memory + * allocations are needed in a block, then the allocation must be manually split by adding + * offsets to it. For CUDA and HIP the memory is aligned to 32 bytes, + * for SYCL to the alignment of the chosen data type. + */ [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE T* shared_memory () const { #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) @@ -245,6 +281,9 @@ struct LaunchHandler #endif } + /** + * \brief Returns an amrex::Gpu::Handler object, which is sometimes needed for reductions. + */ AMREX_GPU_DEVICE AMREX_FORCE_INLINE auto handler () const { #if defined(AMREX_USE_SYCL) @@ -256,6 +295,9 @@ struct LaunchHandler #endif } + /** + * \brief Returns the internal sycl::nd_item. + */ AMREX_GPU_DEVICE AMREX_FORCE_INLINE auto sycl_item () const { #if defined(AMREX_USE_SYCL) From cf44395d9f03eaa183786cfafa1d1eea3090f5c3 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Tue, 10 Mar 2026 18:14:20 +0100 Subject: [PATCH 21/46] start to add test --- Tests/GPU/LaunchRaw/GNUmakefile | 23 ++++++++++ Tests/GPU/LaunchRaw/Make.package | 2 + Tests/GPU/LaunchRaw/inputs | 1 + Tests/GPU/LaunchRaw/main.cpp | 73 ++++++++++++++++++++++++++++++++ 4 files changed, 99 insertions(+) create mode 100644 Tests/GPU/LaunchRaw/GNUmakefile create mode 100644 Tests/GPU/LaunchRaw/Make.package create mode 100644 Tests/GPU/LaunchRaw/inputs create mode 100644 Tests/GPU/LaunchRaw/main.cpp diff --git a/Tests/GPU/LaunchRaw/GNUmakefile b/Tests/GPU/LaunchRaw/GNUmakefile new file mode 100644 index 00000000000..42cbc3ea210 --- /dev/null +++ b/Tests/GPU/LaunchRaw/GNUmakefile @@ -0,0 +1,23 @@ +AMREX_HOME = ../../../ + +DEBUG = FALSE + +DIM = 3 + +COMP = gcc + +USE_CUDA = FALSE +USE_ACC = FALSE +USE_OMP_OFFLOAD = FALSE + +USE_MPI = FALSE +USE_OMP = FALSE + +TINY_PROFILE = FALSE + +include $(AMREX_HOME)/Tools/GNUMake/Make.defs + +include ./Make.package +include $(AMREX_HOME)/Src/Base/Make.package + +include $(AMREX_HOME)/Tools/GNUMake/Make.rules diff --git a/Tests/GPU/LaunchRaw/Make.package b/Tests/GPU/LaunchRaw/Make.package new file mode 100644 index 00000000000..7f43e5e87cb --- /dev/null +++ b/Tests/GPU/LaunchRaw/Make.package @@ -0,0 +1,2 @@ +CEXE_sources += main.cpp + diff --git a/Tests/GPU/LaunchRaw/inputs b/Tests/GPU/LaunchRaw/inputs new file mode 100644 index 00000000000..bba7f4f305b --- /dev/null +++ b/Tests/GPU/LaunchRaw/inputs @@ -0,0 +1 @@ +amrex.the_arena_is_managed = 0 diff --git a/Tests/GPU/LaunchRaw/main.cpp b/Tests/GPU/LaunchRaw/main.cpp new file mode 100644 index 00000000000..e3408685679 --- /dev/null +++ b/Tests/GPU/LaunchRaw/main.cpp @@ -0,0 +1,73 @@ +#include +#include +#include +#include +#include +#include + +using namespace amrex; + +void test3d () { + + constexpr int num_threads = 256; + IntVectND<3> num_blocks {31, 23, 11}; + + Gpu::DeviceVector vect(num_threads * num_blocks[0] * num_blocks[1] * num_blocks[2], -999); + + auto data = vect.dataPtr(); + + LaunchRaw(num_blocks, + [=](auto lh){ + data[lh.globalIdx1D()] = lh.blockIdx1D(); + }); + + LaunchRaw(num_blocks, + [=](auto lh){ + data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D(); + }); + + LaunchRaw(num_blocks, + [=](auto lh){ + auto block = lh.blockIdxND(); + auto thread = lh.template threadIdxND<2, 8, 16>(); + auto tmp = data[ + (block[0] + block[1] * num_blocks[0] + + block[2] * num_blocks[0] * num_blocks[1]) * num_threads + + thread[2] + thread[1] * 16 + + thread[0] * 16 * 8 + ]; + lh.synctheads(); + data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp; + }); + + LaunchRaw(num_blocks, num_threads, + [=](auto lh){ + auto smem = lh.shared_memory(); + auto thread1 = lh.template threadIdxND<16, 8, 2>(); + auto locid1 = thread1[2] + thread1[1] * 2 + thread1[0] * 2 * 8; + auto thread2 = lh.template threadIdxND<16, 2, 8>(); + auto locid2 = thread2[0] + thread2[2] * 16 + thread2[1] * 16 * 8; + + smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid1]; + lh.syncthreads(); + data[lh.blockIdx1D() * lh.blockDim1D() + locid2] = smem[locid2]; + }); + + LaunchRaw(num_blocks, + [=](auto lh){ + data[lh.globalIdx1D()] = data[lh.globalIdx1D()] == (lh.blockIdx1D() + lh.threadIdx1D()); + }); + + AMREX_ALWAYS_ASSERT(Reduce::Sum(vect.size(), data, 0) == vect.size()); +} + +int main (int argc, char* argv[]) +{ + amrex::Initialize(argc,argv); + { + test3d(); + + amrex::Print() << "Passed! \n"; + } + amrex::Finalize(); +} From 09d75ee02b27714d27f6ff93a0b2c6153436ae6b Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Wed, 11 Mar 2026 13:43:57 +0100 Subject: [PATCH 22/46] update test and see if it is compiled --- Tests/GPU/LaunchRaw/GNUmakefile | 6 ------ Tests/GPU/LaunchRaw/main.cpp | 28 +++++++++++++++++----------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/Tests/GPU/LaunchRaw/GNUmakefile b/Tests/GPU/LaunchRaw/GNUmakefile index 42cbc3ea210..8c8ce8057ac 100644 --- a/Tests/GPU/LaunchRaw/GNUmakefile +++ b/Tests/GPU/LaunchRaw/GNUmakefile @@ -4,12 +4,6 @@ DEBUG = FALSE DIM = 3 -COMP = gcc - -USE_CUDA = FALSE -USE_ACC = FALSE -USE_OMP_OFFLOAD = FALSE - USE_MPI = FALSE USE_OMP = FALSE diff --git a/Tests/GPU/LaunchRaw/main.cpp b/Tests/GPU/LaunchRaw/main.cpp index e3408685679..6d83980797b 100644 --- a/Tests/GPU/LaunchRaw/main.cpp +++ b/Tests/GPU/LaunchRaw/main.cpp @@ -9,8 +9,13 @@ using namespace amrex; void test3d () { - constexpr int num_threads = 256; - IntVectND<3> num_blocks {31, 23, 11}; + const IntVectND<3> num_blocks {31, 23, 11}; +#ifdef AMREX_USE_GPU + static constexpr IntVectND<3> blockdim {2, 8, 16}; +#else + static constexpr IntVectND<3> blockdim {1, 1, 1}; +#endif + static constexpr int num_threads = blockdim[0] * blockdim[1] * blockdim[2]; Gpu::DeviceVector vect(num_threads * num_blocks[0] * num_blocks[1] * num_blocks[2], -999); @@ -29,12 +34,12 @@ void test3d () { LaunchRaw(num_blocks, [=](auto lh){ auto block = lh.blockIdxND(); - auto thread = lh.template threadIdxND<2, 8, 16>(); + auto thread = lh.template threadIdxND(); auto tmp = data[ (block[0] + block[1] * num_blocks[0] + block[2] * num_blocks[0] * num_blocks[1]) * num_threads + - thread[2] + thread[1] * 16 + - thread[0] * 16 * 8 + thread[2] + thread[1] * blockdim[2] + + thread[0] * blockdim[2] * blockdim[1] ]; lh.synctheads(); data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp; @@ -43,11 +48,12 @@ void test3d () { LaunchRaw(num_blocks, num_threads, [=](auto lh){ auto smem = lh.shared_memory(); - auto thread1 = lh.template threadIdxND<16, 8, 2>(); - auto locid1 = thread1[2] + thread1[1] * 2 + thread1[0] * 2 * 8; - auto thread2 = lh.template threadIdxND<16, 2, 8>(); - auto locid2 = thread2[0] + thread2[2] * 16 + thread2[1] * 16 * 8; - + auto thread1 = lh.template threadIdxND(); + auto locid1 = thread1[2] + thread1[1] * blockdim[0] + + thread1[0] * blockdim[0] * blockdim[1]; + auto thread2 = lh.template threadIdxND(); + auto locid2 = thread2[0] + thread2[2] * blockdim[2] + + thread2[1] * blockdim[2] * blockdim[1]; smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid1]; lh.syncthreads(); data[lh.blockIdx1D() * lh.blockDim1D() + locid2] = smem[locid2]; @@ -65,7 +71,7 @@ int main (int argc, char* argv[]) { amrex::Initialize(argc,argv); { - test3d(); + test3d() amrex::Print() << "Passed! \n"; } From 76a84019b0dfce40c66a6791bf71e6f330349928 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Wed, 11 Mar 2026 14:39:46 +0100 Subject: [PATCH 23/46] move test dir --- Tests/CMakeLists.txt | 2 +- Tests/LaunchRaw/CMakeLists.txt | 9 +++++++++ Tests/{GPU => }/LaunchRaw/GNUmakefile | 2 -- Tests/{GPU => }/LaunchRaw/Make.package | 0 Tests/{GPU => }/LaunchRaw/inputs | 0 Tests/{GPU => }/LaunchRaw/main.cpp | 0 6 files changed, 10 insertions(+), 3 deletions(-) create mode 100644 Tests/LaunchRaw/CMakeLists.txt rename Tests/{GPU => }/LaunchRaw/GNUmakefile (96%) rename Tests/{GPU => }/LaunchRaw/Make.package (100%) rename Tests/{GPU => }/LaunchRaw/inputs (100%) rename Tests/{GPU => }/LaunchRaw/main.cpp (100%) diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt index 975b575c951..e4e8b7d2794 100644 --- a/Tests/CMakeLists.txt +++ b/Tests/CMakeLists.txt @@ -126,7 +126,7 @@ else() # List of subdirectories to search for CMakeLists. # set( AMREX_TESTS_SUBDIRS Amr ArrayND AsyncOut CallNoinline CLZ CommType CTOParFor DeviceGlobal - Enum HeatEquation MultiBlock MultiPeriod ParmParse Parser Parser2 + Enum HeatEquation LaunchRaw MultiBlock MultiPeriod ParmParse Parser Parser2 ParserUserFn Reducer ReduceToPlanePatchy Reinit RoundoffDomain SIMD SmallMatrix SumBoundary TOML) diff --git a/Tests/LaunchRaw/CMakeLists.txt b/Tests/LaunchRaw/CMakeLists.txt new file mode 100644 index 00000000000..224c4563c83 --- /dev/null +++ b/Tests/LaunchRaw/CMakeLists.txt @@ -0,0 +1,9 @@ +foreach(D IN LISTS AMReX_SPACEDIM) + set(_sources main.cpp) + set(_input_files) + + setup_test(${D} _sources _input_files) + + unset(_sources) + unset(_input_files) +endforeach() diff --git a/Tests/GPU/LaunchRaw/GNUmakefile b/Tests/LaunchRaw/GNUmakefile similarity index 96% rename from Tests/GPU/LaunchRaw/GNUmakefile rename to Tests/LaunchRaw/GNUmakefile index 8c8ce8057ac..173eef4c670 100644 --- a/Tests/GPU/LaunchRaw/GNUmakefile +++ b/Tests/LaunchRaw/GNUmakefile @@ -2,8 +2,6 @@ AMREX_HOME = ../../../ DEBUG = FALSE -DIM = 3 - USE_MPI = FALSE USE_OMP = FALSE diff --git a/Tests/GPU/LaunchRaw/Make.package b/Tests/LaunchRaw/Make.package similarity index 100% rename from Tests/GPU/LaunchRaw/Make.package rename to Tests/LaunchRaw/Make.package diff --git a/Tests/GPU/LaunchRaw/inputs b/Tests/LaunchRaw/inputs similarity index 100% rename from Tests/GPU/LaunchRaw/inputs rename to Tests/LaunchRaw/inputs diff --git a/Tests/GPU/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp similarity index 100% rename from Tests/GPU/LaunchRaw/main.cpp rename to Tests/LaunchRaw/main.cpp From 9df5358d58359f8a31c7ff60b9e949e28de55658 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Wed, 11 Mar 2026 16:09:12 +0100 Subject: [PATCH 24/46] fix test --- Tests/LaunchRaw/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp index 6d83980797b..2b3cf0c0efa 100644 --- a/Tests/LaunchRaw/main.cpp +++ b/Tests/LaunchRaw/main.cpp @@ -71,7 +71,7 @@ int main (int argc, char* argv[]) { amrex::Initialize(argc,argv); { - test3d() + test3d(); amrex::Print() << "Passed! \n"; } From 8a78b4f6081b9888806141477144c807a33ac5e4 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Wed, 11 Mar 2026 16:16:53 +0100 Subject: [PATCH 25/46] add static cast --- Tests/LaunchRaw/main.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp index 2b3cf0c0efa..d5cdb0316b7 100644 --- a/Tests/LaunchRaw/main.cpp +++ b/Tests/LaunchRaw/main.cpp @@ -41,7 +41,7 @@ void test3d () { thread[2] + thread[1] * blockdim[2] + thread[0] * blockdim[2] * blockdim[1] ]; - lh.synctheads(); + lh.syncthreads(); data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp; }); @@ -61,10 +61,11 @@ void test3d () { LaunchRaw(num_blocks, [=](auto lh){ - data[lh.globalIdx1D()] = data[lh.globalIdx1D()] == (lh.blockIdx1D() + lh.threadIdx1D()); + data[lh.globalIdx1D()] = + data[lh.globalIdx1D()] == static_cast(lh.blockIdx1D() + lh.threadIdx1D()); }); - AMREX_ALWAYS_ASSERT(Reduce::Sum(vect.size(), data, 0) == vect.size()); + AMREX_ALWAYS_ASSERT(Reduce::Sum(vect.size(), data, 0) == static_cast(vect.size())); } int main (int argc, char* argv[]) From f29aef1eab8664fdc56c6ab58dcda828af9eb1ce Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Wed, 11 Mar 2026 16:38:25 +0100 Subject: [PATCH 26/46] add 1d and 2d tests --- Src/Base/AMReX_GpuTypes.H | 14 +++-- Tests/LaunchRaw/main.cpp | 119 +++++++++++++++++++++++++++++++++++++- 2 files changed, 125 insertions(+), 8 deletions(-) diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index db647cbf9a7..4b666a5847f 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -284,28 +284,30 @@ struct LaunchHandler /** * \brief Returns an amrex::Gpu::Handler object, which is sometimes needed for reductions. */ - AMREX_GPU_DEVICE AMREX_FORCE_INLINE - auto handler () const { + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE + Gpu::Handler handler () const { #if defined(AMREX_USE_SYCL) return Gpu::Handler(m_item, reinterpret_cast(m_shared_mem), threads_per_block); #elif defined(AMREX_USE_GPU) return Gpu::Handler(threads_per_block); #else - return 0; + return Gpu::Handler{}; #endif } /** * \brief Returns the internal sycl::nd_item. */ - AMREX_GPU_DEVICE AMREX_FORCE_INLINE - auto sycl_item () const { + [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE #if defined(AMREX_USE_SYCL) + sycl::nd_item sycl_item () const { return m_item; + } #else + int sycl_item () const { return 0; -#endif } +#endif private: diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp index d5cdb0316b7..eb53f64971d 100644 --- a/Tests/LaunchRaw/main.cpp +++ b/Tests/LaunchRaw/main.cpp @@ -7,6 +7,117 @@ using namespace amrex; +void test1d () { + + const IntVectND<1> num_blocks {31}; +#ifdef AMREX_USE_GPU + static constexpr IntVectND<1> blockdim {256}; +#else + static constexpr IntVectND<1> blockdim {1}; +#endif + static constexpr int num_threads = blockdim[0]; + + Gpu::DeviceVector vect(num_threads * num_blocks[0], -999); + + auto * data = vect.dataPtr(); + + LaunchRaw(num_blocks, + [=](auto lh){ + data[lh.globalIdx1D()] = lh.blockIdx1D(); + }); + + LaunchRaw(num_blocks, + [=](auto lh){ + data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D(); + }); + + LaunchRaw(num_blocks, + [=](auto lh){ + auto block = lh.blockIdxND(); + auto thread = lh.template threadIdxND(); + auto tmp = data[ + block[0] * num_threads + thread[0] + ]; + lh.syncthreads(); + data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp; + }); + + LaunchRaw(num_blocks, num_threads, + [=](auto lh){ + auto smem = lh.shared_memory(); + auto thread = lh.template threadIdxND(); + auto locid = thread[0]; + smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid]; + lh.syncthreads(); + data[lh.blockIdx1D() * lh.blockDim1D() + locid] = smem[locid]; + }); + + LaunchRaw(num_blocks, + [=](auto lh){ + data[lh.globalIdx1D()] = + data[lh.globalIdx1D()] == static_cast(lh.blockIdx1D() + lh.threadIdx1D()); + }); + + AMREX_ALWAYS_ASSERT(Reduce::Sum(vect.size(), data, 0) == static_cast(vect.size())); +} + +void test2d () { + + const IntVectND<2> num_blocks {31, 23}; +#ifdef AMREX_USE_GPU + static constexpr IntVectND<2> blockdim {8, 32}; +#else + static constexpr IntVectND<2> blockdim {1, 1}; +#endif + static constexpr int num_threads = blockdim[0] * blockdim[1]; + + Gpu::DeviceVector vect(num_threads * num_blocks[0] * num_blocks[1], -999); + + auto * data = vect.dataPtr(); + + LaunchRaw(num_blocks, + [=](auto lh){ + data[lh.globalIdx1D()] = lh.blockIdx1D(); + }); + + LaunchRaw(num_blocks, + [=](auto lh){ + data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D(); + }); + + LaunchRaw(num_blocks, + [=](auto lh){ + auto block = lh.blockIdxND(); + auto thread = lh.template threadIdxND(); + auto tmp = data[ + (block[0] + block[1] * num_blocks[0]) * num_threads + + thread[1] + thread[0] * blockdim[1] + ]; + lh.syncthreads(); + data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp; + }); + + LaunchRaw(num_blocks, num_threads, + [=](auto lh){ + auto smem = lh.shared_memory(); + auto thread1 = lh.template threadIdxND(); + auto locid1 = thread1[1] + thread1[0] * blockdim[0]; + auto thread2 = lh.template threadIdxND(); + auto locid2 = thread2[1] + thread2[0] * blockdim[1]; + smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid1]; + lh.syncthreads(); + data[lh.blockIdx1D() * lh.blockDim1D() + locid2] = smem[locid2]; + }); + + LaunchRaw(num_blocks, + [=](auto lh){ + data[lh.globalIdx1D()] = + data[lh.globalIdx1D()] == static_cast(lh.blockIdx1D() + lh.threadIdx1D()); + }); + + AMREX_ALWAYS_ASSERT(Reduce::Sum(vect.size(), data, 0) == static_cast(vect.size())); +} + void test3d () { const IntVectND<3> num_blocks {31, 23, 11}; @@ -19,7 +130,7 @@ void test3d () { Gpu::DeviceVector vect(num_threads * num_blocks[0] * num_blocks[1] * num_blocks[2], -999); - auto data = vect.dataPtr(); + auto * data = vect.dataPtr(); LaunchRaw(num_blocks, [=](auto lh){ @@ -70,8 +181,12 @@ void test3d () { int main (int argc, char* argv[]) { - amrex::Initialize(argc,argv); + amrex::Initialize(argc, argv); { + test1d(); + + test2d(); + test3d(); amrex::Print() << "Passed! \n"; From 28b39497412e51131765926ef430d59c3f762424 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Wed, 11 Mar 2026 16:51:23 +0100 Subject: [PATCH 27/46] fix constructor --- Src/Base/AMReX_GpuLaunchFunctsC.H | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H index 4a10a3a70ed..6f39150b460 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsC.H +++ b/Src/Base/AMReX_GpuLaunchFunctsC.H @@ -139,19 +139,22 @@ void LaunchRaw (IntVectND nblocks, L const& f) static_assert(MT == 1); if constexpr(dim == 1) { for (int bx=0; bx < nblocks[0]; ++bx) { - f(Gpu::LaunchHandler{{bx}, nblocks, nullptr}); + f(Gpu::LaunchHandler{ + IntVectND<1>{bx}, nblocks, nullptr}); } } else if constexpr(dim == 2) { for (int by=0; by < nblocks[1]; ++by) { for (int bx=0; bx < nblocks[0]; ++bx) { - f(Gpu::LaunchHandler{{bx, by}, nblocks, nullptr}); + f(Gpu::LaunchHandler{ + IntVectND<2>{bx, by}, nblocks, nullptr}); } } } else { for (int bz=0; bz < nblocks[2]; ++bz) { for (int by=0; by < nblocks[1]; ++by) { for (int bx=0; bx < nblocks[0]; ++bx) { - f(Gpu::LaunchHandler{{bx, by, bz}, nblocks, nullptr}); + f(Gpu::LaunchHandler{ + IntVectND<3>{bx, by, bz}, nblocks, nullptr}); } } } @@ -165,19 +168,22 @@ void LaunchRaw (IntVectND nblocks, std::size_t shared_mem_elements, L const std::vector smem(shared_mem_elements); if constexpr(dim == 1) { for (int bx=0; bx < nblocks[0]; ++bx) { - f(Gpu::LaunchHandler{{bx}, nblocks, smem.data()}); + f(Gpu::LaunchHandler{ + IntVectND<1>{bx}, nblocks, smem.data()}); } } else if constexpr(dim == 2) { for (int by=0; by < nblocks[1]; ++by) { for (int bx=0; bx < nblocks[0]; ++bx) { - f(Gpu::LaunchHandler{{bx, by}, nblocks, smem.data()}); + f(Gpu::LaunchHandler{ + IntVectND<2>{bx, by}, nblocks, smem.data()}); } } } else { for (int bz=0; bz < nblocks[2]; ++bz) { for (int by=0; by < nblocks[1]; ++by) { for (int bx=0; bx < nblocks[0]; ++bx) { - f(Gpu::LaunchHandler{{bx, by, bz}, nblocks, smem.data()}); + f(Gpu::LaunchHandler{ + IntVectND<3>{bx, by, bz}, nblocks, smem.data()}); } } } From aa9bbead67ada28e008d2b8ecfb6efba61d97596 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Wed, 11 Mar 2026 16:58:30 +0100 Subject: [PATCH 28/46] fix constructor 2 --- Src/Base/AMReX_GpuTypes.H | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index 4b666a5847f..f031f170d15 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -231,7 +231,7 @@ struct LaunchHandler static constexpr IntVectND blockDimND () { static_assert(sizeof...(nd_block_size) == dim && (1 * ... * nd_block_size) == threads_per_block); - return {nd_block_size...}; + return IntVectND{nd_block_size...}; } /** From 4bb24ae28c9e3ebc4f57fa4e5031e8c4b71dca46 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Wed, 11 Mar 2026 17:16:46 +0100 Subject: [PATCH 29/46] test --- Src/Base/AMReX_Reduce.H | 18 +++++++++--------- Tests/LaunchRaw/main.cpp | 30 +++++++++++++++--------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H index db006dd71ab..83e48ebe7d2 100644 --- a/Src/Base/AMReX_Reduce.H +++ b/Src/Base/AMReX_Reduce.H @@ -526,19 +526,19 @@ namespace Reduce::detail { template AMREX_GPU_DEVICE AMREX_FORCE_INLINE - auto call_f_intvect_box (F const& f, IntVectND iv, IndexTypeND) noexcept -> - decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence(), f, iv)) + auto call_f_intvect_box (F const& f, IntVectND iv, IndexTypeND) noexcept //-> + // decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence(), f, iv)) { return amrex::detail::call_f_intvect_inner(std::make_index_sequence(), f, iv); } - template - AMREX_GPU_DEVICE AMREX_FORCE_INLINE - auto call_f_intvect_box (F const& f, IntVectND iv, IndexTypeND t) noexcept -> - decltype(f(BoxND(iv, iv, t))) - { - return f(BoxND(iv, iv, t)); - } + // template + // AMREX_GPU_DEVICE AMREX_FORCE_INLINE + // auto call_f_intvect_box (F const& f, IntVectND iv, IndexTypeND t) noexcept -> + // decltype(f(BoxND(iv, iv, t))) + // { + // return f(BoxND(iv, iv, t)); + // } // call_f_intvect_n template diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp index eb53f64971d..68b9366dcea 100644 --- a/Tests/LaunchRaw/main.cpp +++ b/Tests/LaunchRaw/main.cpp @@ -22,17 +22,17 @@ void test1d () { auto * data = vect.dataPtr(); LaunchRaw(num_blocks, - [=](auto lh){ + [=] AMREX_GPU_DEVICE (auto lh) { data[lh.globalIdx1D()] = lh.blockIdx1D(); }); LaunchRaw(num_blocks, - [=](auto lh){ + [=] AMREX_GPU_DEVICE (auto lh) { data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D(); }); LaunchRaw(num_blocks, - [=](auto lh){ + [=] AMREX_GPU_DEVICE (auto lh) { auto block = lh.blockIdxND(); auto thread = lh.template threadIdxND(); auto tmp = data[ @@ -43,7 +43,7 @@ void test1d () { }); LaunchRaw(num_blocks, num_threads, - [=](auto lh){ + [=] AMREX_GPU_DEVICE (auto lh) { auto smem = lh.shared_memory(); auto thread = lh.template threadIdxND(); auto locid = thread[0]; @@ -53,7 +53,7 @@ void test1d () { }); LaunchRaw(num_blocks, - [=](auto lh){ + [=] AMREX_GPU_DEVICE (auto lh) { data[lh.globalIdx1D()] = data[lh.globalIdx1D()] == static_cast(lh.blockIdx1D() + lh.threadIdx1D()); }); @@ -76,17 +76,17 @@ void test2d () { auto * data = vect.dataPtr(); LaunchRaw(num_blocks, - [=](auto lh){ + [=] AMREX_GPU_DEVICE (auto lh) { data[lh.globalIdx1D()] = lh.blockIdx1D(); }); LaunchRaw(num_blocks, - [=](auto lh){ + [=] AMREX_GPU_DEVICE (auto lh) { data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D(); }); LaunchRaw(num_blocks, - [=](auto lh){ + [=] AMREX_GPU_DEVICE (auto lh) { auto block = lh.blockIdxND(); auto thread = lh.template threadIdxND(); auto tmp = data[ @@ -98,7 +98,7 @@ void test2d () { }); LaunchRaw(num_blocks, num_threads, - [=](auto lh){ + [=] AMREX_GPU_DEVICE (auto lh) { auto smem = lh.shared_memory(); auto thread1 = lh.template threadIdxND(); auto locid1 = thread1[1] + thread1[0] * blockdim[0]; @@ -110,7 +110,7 @@ void test2d () { }); LaunchRaw(num_blocks, - [=](auto lh){ + [=] AMREX_GPU_DEVICE (auto lh) { data[lh.globalIdx1D()] = data[lh.globalIdx1D()] == static_cast(lh.blockIdx1D() + lh.threadIdx1D()); }); @@ -133,17 +133,17 @@ void test3d () { auto * data = vect.dataPtr(); LaunchRaw(num_blocks, - [=](auto lh){ + [=] AMREX_GPU_DEVICE (auto lh) { data[lh.globalIdx1D()] = lh.blockIdx1D(); }); LaunchRaw(num_blocks, - [=](auto lh){ + [=] AMREX_GPU_DEVICE (auto lh) { data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D(); }); LaunchRaw(num_blocks, - [=](auto lh){ + [=] AMREX_GPU_DEVICE (auto lh) { auto block = lh.blockIdxND(); auto thread = lh.template threadIdxND(); auto tmp = data[ @@ -157,7 +157,7 @@ void test3d () { }); LaunchRaw(num_blocks, num_threads, - [=](auto lh){ + [=] AMREX_GPU_DEVICE (auto lh) { auto smem = lh.shared_memory(); auto thread1 = lh.template threadIdxND(); auto locid1 = thread1[2] + thread1[1] * blockdim[0] + @@ -171,7 +171,7 @@ void test3d () { }); LaunchRaw(num_blocks, - [=](auto lh){ + [=] AMREX_GPU_DEVICE (auto lh) { data[lh.globalIdx1D()] = data[lh.globalIdx1D()] == static_cast(lh.blockIdx1D() + lh.threadIdx1D()); }); From 8bcefc0097c211aed038e3e77015a02ba08ca590 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Wed, 11 Mar 2026 17:48:02 +0100 Subject: [PATCH 30/46] test2 --- Src/Base/AMReX_Reduce.H | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H index 83e48ebe7d2..cd042dbfa5c 100644 --- a/Src/Base/AMReX_Reduce.H +++ b/Src/Base/AMReX_Reduce.H @@ -526,19 +526,19 @@ namespace Reduce::detail { template AMREX_GPU_DEVICE AMREX_FORCE_INLINE - auto call_f_intvect_box (F const& f, IntVectND iv, IndexTypeND) noexcept //-> - // decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence(), f, iv)) + auto call_f_intvect_box (F const& f, IntVectND iv, IndexTypeND) noexcept -> + decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence(), f, iv)) { return amrex::detail::call_f_intvect_inner(std::make_index_sequence(), f, iv); } - // template - // AMREX_GPU_DEVICE AMREX_FORCE_INLINE - // auto call_f_intvect_box (F const& f, IntVectND iv, IndexTypeND t) noexcept -> - // decltype(f(BoxND(iv, iv, t))) - // { - // return f(BoxND(iv, iv, t)); - // } + template + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + auto call_f_intvect_box (F const& f, IntVectND iv, IndexTypeND t) noexcept -> + decltype(f(BoxND(iv, iv, t))) + { + return f(BoxND(iv, iv, t)); + } // call_f_intvect_n template @@ -652,7 +652,9 @@ public: amrex::LaunchRaw( amrex::IntVectND<1>{static_cast(nblocks_ec)}, AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0), - [=] AMREX_GPU_DEVICE (auto lh) noexcept + [=] AMREX_GPU_DEVICE ( + amrex::LaunchHandler lh + ) noexcept { ReduceTuple r; Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r); From cb9a3fdef2c550d7949a0707ea5d4b2959a0cd71 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Wed, 11 Mar 2026 17:51:31 +0100 Subject: [PATCH 31/46] fix --- Src/Base/AMReX_Reduce.H | 2 +- Tests/LaunchRaw/main.cpp | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H index cd042dbfa5c..1eac0423616 100644 --- a/Src/Base/AMReX_Reduce.H +++ b/Src/Base/AMReX_Reduce.H @@ -653,7 +653,7 @@ public: amrex::IntVectND<1>{static_cast(nblocks_ec)}, AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0), [=] AMREX_GPU_DEVICE ( - amrex::LaunchHandler lh + Gpu::LaunchHandler lh ) noexcept { ReduceTuple r; diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp index 68b9366dcea..231975fde4f 100644 --- a/Tests/LaunchRaw/main.cpp +++ b/Tests/LaunchRaw/main.cpp @@ -17,7 +17,7 @@ void test1d () { #endif static constexpr int num_threads = blockdim[0]; - Gpu::DeviceVector vect(num_threads * num_blocks[0], -999); + Gpu::DeviceVector vect(static_cast(num_threads) * num_blocks[0], -999); auto * data = vect.dataPtr(); @@ -71,7 +71,8 @@ void test2d () { #endif static constexpr int num_threads = blockdim[0] * blockdim[1]; - Gpu::DeviceVector vect(num_threads * num_blocks[0] * num_blocks[1], -999); + Gpu::DeviceVector vect(static_cast(num_threads) + * num_blocks[0] * num_blocks[1], -999); auto * data = vect.dataPtr(); @@ -128,7 +129,8 @@ void test3d () { #endif static constexpr int num_threads = blockdim[0] * blockdim[1] * blockdim[2]; - Gpu::DeviceVector vect(num_threads * num_blocks[0] * num_blocks[1] * num_blocks[2], -999); + Gpu::DeviceVector vect(static_cast(num_threads) + * num_blocks[0] * num_blocks[1] * num_blocks[2], -999); auto * data = vect.dataPtr(); From d4566ec4802f89d9378ff419593398cec9181cb5 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Wed, 11 Mar 2026 18:11:33 +0100 Subject: [PATCH 32/46] remove static --- Tests/LaunchRaw/main.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp index 231975fde4f..ac125be7edd 100644 --- a/Tests/LaunchRaw/main.cpp +++ b/Tests/LaunchRaw/main.cpp @@ -11,11 +11,11 @@ void test1d () { const IntVectND<1> num_blocks {31}; #ifdef AMREX_USE_GPU - static constexpr IntVectND<1> blockdim {256}; + constexpr IntVectND<1> blockdim {256}; #else - static constexpr IntVectND<1> blockdim {1}; + constexpr IntVectND<1> blockdim {1}; #endif - static constexpr int num_threads = blockdim[0]; + constexpr int num_threads = blockdim[0]; Gpu::DeviceVector vect(static_cast(num_threads) * num_blocks[0], -999); @@ -65,11 +65,11 @@ void test2d () { const IntVectND<2> num_blocks {31, 23}; #ifdef AMREX_USE_GPU - static constexpr IntVectND<2> blockdim {8, 32}; + constexpr IntVectND<2> blockdim {8, 32}; #else - static constexpr IntVectND<2> blockdim {1, 1}; + constexpr IntVectND<2> blockdim {1, 1}; #endif - static constexpr int num_threads = blockdim[0] * blockdim[1]; + constexpr int num_threads = blockdim[0] * blockdim[1]; Gpu::DeviceVector vect(static_cast(num_threads) * num_blocks[0] * num_blocks[1], -999); @@ -123,11 +123,11 @@ void test3d () { const IntVectND<3> num_blocks {31, 23, 11}; #ifdef AMREX_USE_GPU - static constexpr IntVectND<3> blockdim {2, 8, 16}; + constexpr IntVectND<3> blockdim {2, 8, 16}; #else - static constexpr IntVectND<3> blockdim {1, 1, 1}; + constexpr IntVectND<3> blockdim {1, 1, 1}; #endif - static constexpr int num_threads = blockdim[0] * blockdim[1] * blockdim[2]; + constexpr int num_threads = blockdim[0] * blockdim[1] * blockdim[2]; Gpu::DeviceVector vect(static_cast(num_threads) * num_blocks[0] * num_blocks[1] * num_blocks[2], -999); From 13b8a4248994805a408d423c2485bbe01d30f725 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Wed, 11 Mar 2026 18:25:07 +0100 Subject: [PATCH 33/46] fix27 --- Tests/LaunchRaw/main.cpp | 58 ++++++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp index ac125be7edd..4b5ced4d1d4 100644 --- a/Tests/LaunchRaw/main.cpp +++ b/Tests/LaunchRaw/main.cpp @@ -11,11 +11,11 @@ void test1d () { const IntVectND<1> num_blocks {31}; #ifdef AMREX_USE_GPU - constexpr IntVectND<1> blockdim {256}; + static constexpr int blockdim_x = 256; #else - constexpr IntVectND<1> blockdim {1}; + static constexpr int blockdim_x = 1; #endif - constexpr int num_threads = blockdim[0]; + static constexpr int num_threads = blockdim_x; Gpu::DeviceVector vect(static_cast(num_threads) * num_blocks[0], -999); @@ -34,7 +34,7 @@ void test1d () { LaunchRaw(num_blocks, [=] AMREX_GPU_DEVICE (auto lh) { auto block = lh.blockIdxND(); - auto thread = lh.template threadIdxND(); + auto thread = lh.template threadIdxND(); auto tmp = data[ block[0] * num_threads + thread[0] ]; @@ -45,7 +45,7 @@ void test1d () { LaunchRaw(num_blocks, num_threads, [=] AMREX_GPU_DEVICE (auto lh) { auto smem = lh.shared_memory(); - auto thread = lh.template threadIdxND(); + auto thread = lh.template threadIdxND(); auto locid = thread[0]; smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid]; lh.syncthreads(); @@ -65,11 +65,13 @@ void test2d () { const IntVectND<2> num_blocks {31, 23}; #ifdef AMREX_USE_GPU - constexpr IntVectND<2> blockdim {8, 32}; + static constexpr int blockdim_x = 8; + static constexpr int blockdim_y = 32; #else - constexpr IntVectND<2> blockdim {1, 1}; + static constexpr int blockdim_x = 1; + static constexpr int blockdim_y = 1; #endif - constexpr int num_threads = blockdim[0] * blockdim[1]; + static constexpr int num_threads = blockdim_x * blockdim_y; Gpu::DeviceVector vect(static_cast(num_threads) * num_blocks[0] * num_blocks[1], -999); @@ -89,10 +91,10 @@ void test2d () { LaunchRaw(num_blocks, [=] AMREX_GPU_DEVICE (auto lh) { auto block = lh.blockIdxND(); - auto thread = lh.template threadIdxND(); + auto thread = lh.template threadIdxND(); auto tmp = data[ (block[0] + block[1] * num_blocks[0]) * num_threads + - thread[1] + thread[0] * blockdim[1] + thread[1] + thread[0] * blockdim_y ]; lh.syncthreads(); data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp; @@ -101,10 +103,10 @@ void test2d () { LaunchRaw(num_blocks, num_threads, [=] AMREX_GPU_DEVICE (auto lh) { auto smem = lh.shared_memory(); - auto thread1 = lh.template threadIdxND(); - auto locid1 = thread1[1] + thread1[0] * blockdim[0]; - auto thread2 = lh.template threadIdxND(); - auto locid2 = thread2[1] + thread2[0] * blockdim[1]; + auto thread1 = lh.template threadIdxND(); + auto locid1 = thread1[1] + thread1[0] * blockdim_x; + auto thread2 = lh.template threadIdxND(); + auto locid2 = thread2[1] + thread2[0] * blockdim_y; smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid1]; lh.syncthreads(); data[lh.blockIdx1D() * lh.blockDim1D() + locid2] = smem[locid2]; @@ -123,11 +125,15 @@ void test3d () { const IntVectND<3> num_blocks {31, 23, 11}; #ifdef AMREX_USE_GPU - constexpr IntVectND<3> blockdim {2, 8, 16}; + static constexpr int blockdim_x = 2; + static constexpr int blockdim_y = 8; + static constexpr int blockdim_z = 16; #else - constexpr IntVectND<3> blockdim {1, 1, 1}; + static constexpr int blockdim_x = 1; + static constexpr int blockdim_y = 1; + static constexpr int blockdim_z = 1; #endif - constexpr int num_threads = blockdim[0] * blockdim[1] * blockdim[2]; + static constexpr int num_threads = blockdim_x * blockdim_y * blockdim_z; Gpu::DeviceVector vect(static_cast(num_threads) * num_blocks[0] * num_blocks[1] * num_blocks[2], -999); @@ -147,12 +153,12 @@ void test3d () { LaunchRaw(num_blocks, [=] AMREX_GPU_DEVICE (auto lh) { auto block = lh.blockIdxND(); - auto thread = lh.template threadIdxND(); + auto thread = lh.template threadIdxND(); auto tmp = data[ (block[0] + block[1] * num_blocks[0] + block[2] * num_blocks[0] * num_blocks[1]) * num_threads + - thread[2] + thread[1] * blockdim[2] + - thread[0] * blockdim[2] * blockdim[1] + thread[2] + thread[1] * blockdim_z + + thread[0] * blockdim_z * blockdim_y ]; lh.syncthreads(); data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp; @@ -161,12 +167,12 @@ void test3d () { LaunchRaw(num_blocks, num_threads, [=] AMREX_GPU_DEVICE (auto lh) { auto smem = lh.shared_memory(); - auto thread1 = lh.template threadIdxND(); - auto locid1 = thread1[2] + thread1[1] * blockdim[0] + - thread1[0] * blockdim[0] * blockdim[1]; - auto thread2 = lh.template threadIdxND(); - auto locid2 = thread2[0] + thread2[2] * blockdim[2] + - thread2[1] * blockdim[2] * blockdim[1]; + auto thread1 = lh.template threadIdxND(); + auto locid1 = thread1[2] + thread1[1] * blockdim_x + + thread1[0] * blockdim_x * blockdim_y; + auto thread2 = lh.template threadIdxND(); + auto locid2 = thread2[0] + thread2[2] * blockdim_z + + thread2[1] * blockdim_z * blockdim_y; smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid1]; lh.syncthreads(); data[lh.blockIdx1D() * lh.blockDim1D() + locid2] = smem[locid2]; From 528d8dc9857a3276b26cc63478506bc264d054e0 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Thu, 12 Mar 2026 10:08:05 +0100 Subject: [PATCH 34/46] try fix MSVC --- Tests/LaunchRaw/main.cpp | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp index 4b5ced4d1d4..b2ad5fb8481 100644 --- a/Tests/LaunchRaw/main.cpp +++ b/Tests/LaunchRaw/main.cpp @@ -22,17 +22,17 @@ void test1d () { auto * data = vect.dataPtr(); LaunchRaw(num_blocks, - [=] AMREX_GPU_DEVICE (auto lh) { + [data] AMREX_GPU_DEVICE (auto lh) { data[lh.globalIdx1D()] = lh.blockIdx1D(); }); LaunchRaw(num_blocks, - [=] AMREX_GPU_DEVICE (auto lh) { + [data] AMREX_GPU_DEVICE (auto lh) { data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D(); }); LaunchRaw(num_blocks, - [=] AMREX_GPU_DEVICE (auto lh) { + [data] AMREX_GPU_DEVICE (auto lh) { auto block = lh.blockIdxND(); auto thread = lh.template threadIdxND(); auto tmp = data[ @@ -43,7 +43,7 @@ void test1d () { }); LaunchRaw(num_blocks, num_threads, - [=] AMREX_GPU_DEVICE (auto lh) { + [data] AMREX_GPU_DEVICE (auto lh) { auto smem = lh.shared_memory(); auto thread = lh.template threadIdxND(); auto locid = thread[0]; @@ -53,7 +53,7 @@ void test1d () { }); LaunchRaw(num_blocks, - [=] AMREX_GPU_DEVICE (auto lh) { + [data] AMREX_GPU_DEVICE (auto lh) { data[lh.globalIdx1D()] = data[lh.globalIdx1D()] == static_cast(lh.blockIdx1D() + lh.threadIdx1D()); }); @@ -79,17 +79,17 @@ void test2d () { auto * data = vect.dataPtr(); LaunchRaw(num_blocks, - [=] AMREX_GPU_DEVICE (auto lh) { + [data] AMREX_GPU_DEVICE (auto lh) { data[lh.globalIdx1D()] = lh.blockIdx1D(); }); LaunchRaw(num_blocks, - [=] AMREX_GPU_DEVICE (auto lh) { + [data] AMREX_GPU_DEVICE (auto lh) { data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D(); }); LaunchRaw(num_blocks, - [=] AMREX_GPU_DEVICE (auto lh) { + [data, num_blocks] AMREX_GPU_DEVICE (auto lh) { auto block = lh.blockIdxND(); auto thread = lh.template threadIdxND(); auto tmp = data[ @@ -101,7 +101,7 @@ void test2d () { }); LaunchRaw(num_blocks, num_threads, - [=] AMREX_GPU_DEVICE (auto lh) { + [data] AMREX_GPU_DEVICE (auto lh) { auto smem = lh.shared_memory(); auto thread1 = lh.template threadIdxND(); auto locid1 = thread1[1] + thread1[0] * blockdim_x; @@ -113,7 +113,7 @@ void test2d () { }); LaunchRaw(num_blocks, - [=] AMREX_GPU_DEVICE (auto lh) { + [data] AMREX_GPU_DEVICE (auto lh) { data[lh.globalIdx1D()] = data[lh.globalIdx1D()] == static_cast(lh.blockIdx1D() + lh.threadIdx1D()); }); @@ -141,17 +141,17 @@ void test3d () { auto * data = vect.dataPtr(); LaunchRaw(num_blocks, - [=] AMREX_GPU_DEVICE (auto lh) { + [data] AMREX_GPU_DEVICE (auto lh) { data[lh.globalIdx1D()] = lh.blockIdx1D(); }); LaunchRaw(num_blocks, - [=] AMREX_GPU_DEVICE (auto lh) { + [data] AMREX_GPU_DEVICE (auto lh) { data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D(); }); LaunchRaw(num_blocks, - [=] AMREX_GPU_DEVICE (auto lh) { + [data, num_blocks] AMREX_GPU_DEVICE (auto lh) { auto block = lh.blockIdxND(); auto thread = lh.template threadIdxND(); auto tmp = data[ @@ -165,7 +165,7 @@ void test3d () { }); LaunchRaw(num_blocks, num_threads, - [=] AMREX_GPU_DEVICE (auto lh) { + [data] AMREX_GPU_DEVICE (auto lh) { auto smem = lh.shared_memory(); auto thread1 = lh.template threadIdxND(); auto locid1 = thread1[2] + thread1[1] * blockdim_x + @@ -179,7 +179,7 @@ void test3d () { }); LaunchRaw(num_blocks, - [=] AMREX_GPU_DEVICE (auto lh) { + [data] AMREX_GPU_DEVICE (auto lh) { data[lh.globalIdx1D()] = data[lh.globalIdx1D()] == static_cast(lh.blockIdx1D() + lh.threadIdx1D()); }); From e13206d115d43c49ebdc5717eddec3c7e610e680 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Thu, 12 Mar 2026 11:17:23 +0100 Subject: [PATCH 35/46] try fix MSVC 2 --- Tests/LaunchRaw/main.cpp | 88 +++++++++++++++++++--------------------- 1 file changed, 41 insertions(+), 47 deletions(-) diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp index b2ad5fb8481..231975fde4f 100644 --- a/Tests/LaunchRaw/main.cpp +++ b/Tests/LaunchRaw/main.cpp @@ -11,30 +11,30 @@ void test1d () { const IntVectND<1> num_blocks {31}; #ifdef AMREX_USE_GPU - static constexpr int blockdim_x = 256; + static constexpr IntVectND<1> blockdim {256}; #else - static constexpr int blockdim_x = 1; + static constexpr IntVectND<1> blockdim {1}; #endif - static constexpr int num_threads = blockdim_x; + static constexpr int num_threads = blockdim[0]; Gpu::DeviceVector vect(static_cast(num_threads) * num_blocks[0], -999); auto * data = vect.dataPtr(); LaunchRaw(num_blocks, - [data] AMREX_GPU_DEVICE (auto lh) { + [=] AMREX_GPU_DEVICE (auto lh) { data[lh.globalIdx1D()] = lh.blockIdx1D(); }); LaunchRaw(num_blocks, - [data] AMREX_GPU_DEVICE (auto lh) { + [=] AMREX_GPU_DEVICE (auto lh) { data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D(); }); LaunchRaw(num_blocks, - [data] AMREX_GPU_DEVICE (auto lh) { + [=] AMREX_GPU_DEVICE (auto lh) { auto block = lh.blockIdxND(); - auto thread = lh.template threadIdxND(); + auto thread = lh.template threadIdxND(); auto tmp = data[ block[0] * num_threads + thread[0] ]; @@ -43,9 +43,9 @@ void test1d () { }); LaunchRaw(num_blocks, num_threads, - [data] AMREX_GPU_DEVICE (auto lh) { + [=] AMREX_GPU_DEVICE (auto lh) { auto smem = lh.shared_memory(); - auto thread = lh.template threadIdxND(); + auto thread = lh.template threadIdxND(); auto locid = thread[0]; smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid]; lh.syncthreads(); @@ -53,7 +53,7 @@ void test1d () { }); LaunchRaw(num_blocks, - [data] AMREX_GPU_DEVICE (auto lh) { + [=] AMREX_GPU_DEVICE (auto lh) { data[lh.globalIdx1D()] = data[lh.globalIdx1D()] == static_cast(lh.blockIdx1D() + lh.threadIdx1D()); }); @@ -65,13 +65,11 @@ void test2d () { const IntVectND<2> num_blocks {31, 23}; #ifdef AMREX_USE_GPU - static constexpr int blockdim_x = 8; - static constexpr int blockdim_y = 32; + static constexpr IntVectND<2> blockdim {8, 32}; #else - static constexpr int blockdim_x = 1; - static constexpr int blockdim_y = 1; + static constexpr IntVectND<2> blockdim {1, 1}; #endif - static constexpr int num_threads = blockdim_x * blockdim_y; + static constexpr int num_threads = blockdim[0] * blockdim[1]; Gpu::DeviceVector vect(static_cast(num_threads) * num_blocks[0] * num_blocks[1], -999); @@ -79,41 +77,41 @@ void test2d () { auto * data = vect.dataPtr(); LaunchRaw(num_blocks, - [data] AMREX_GPU_DEVICE (auto lh) { + [=] AMREX_GPU_DEVICE (auto lh) { data[lh.globalIdx1D()] = lh.blockIdx1D(); }); LaunchRaw(num_blocks, - [data] AMREX_GPU_DEVICE (auto lh) { + [=] AMREX_GPU_DEVICE (auto lh) { data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D(); }); LaunchRaw(num_blocks, - [data, num_blocks] AMREX_GPU_DEVICE (auto lh) { + [=] AMREX_GPU_DEVICE (auto lh) { auto block = lh.blockIdxND(); - auto thread = lh.template threadIdxND(); + auto thread = lh.template threadIdxND(); auto tmp = data[ (block[0] + block[1] * num_blocks[0]) * num_threads + - thread[1] + thread[0] * blockdim_y + thread[1] + thread[0] * blockdim[1] ]; lh.syncthreads(); data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp; }); LaunchRaw(num_blocks, num_threads, - [data] AMREX_GPU_DEVICE (auto lh) { + [=] AMREX_GPU_DEVICE (auto lh) { auto smem = lh.shared_memory(); - auto thread1 = lh.template threadIdxND(); - auto locid1 = thread1[1] + thread1[0] * blockdim_x; - auto thread2 = lh.template threadIdxND(); - auto locid2 = thread2[1] + thread2[0] * blockdim_y; + auto thread1 = lh.template threadIdxND(); + auto locid1 = thread1[1] + thread1[0] * blockdim[0]; + auto thread2 = lh.template threadIdxND(); + auto locid2 = thread2[1] + thread2[0] * blockdim[1]; smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid1]; lh.syncthreads(); data[lh.blockIdx1D() * lh.blockDim1D() + locid2] = smem[locid2]; }); LaunchRaw(num_blocks, - [data] AMREX_GPU_DEVICE (auto lh) { + [=] AMREX_GPU_DEVICE (auto lh) { data[lh.globalIdx1D()] = data[lh.globalIdx1D()] == static_cast(lh.blockIdx1D() + lh.threadIdx1D()); }); @@ -125,15 +123,11 @@ void test3d () { const IntVectND<3> num_blocks {31, 23, 11}; #ifdef AMREX_USE_GPU - static constexpr int blockdim_x = 2; - static constexpr int blockdim_y = 8; - static constexpr int blockdim_z = 16; + static constexpr IntVectND<3> blockdim {2, 8, 16}; #else - static constexpr int blockdim_x = 1; - static constexpr int blockdim_y = 1; - static constexpr int blockdim_z = 1; + static constexpr IntVectND<3> blockdim {1, 1, 1}; #endif - static constexpr int num_threads = blockdim_x * blockdim_y * blockdim_z; + static constexpr int num_threads = blockdim[0] * blockdim[1] * blockdim[2]; Gpu::DeviceVector vect(static_cast(num_threads) * num_blocks[0] * num_blocks[1] * num_blocks[2], -999); @@ -141,45 +135,45 @@ void test3d () { auto * data = vect.dataPtr(); LaunchRaw(num_blocks, - [data] AMREX_GPU_DEVICE (auto lh) { + [=] AMREX_GPU_DEVICE (auto lh) { data[lh.globalIdx1D()] = lh.blockIdx1D(); }); LaunchRaw(num_blocks, - [data] AMREX_GPU_DEVICE (auto lh) { + [=] AMREX_GPU_DEVICE (auto lh) { data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D(); }); LaunchRaw(num_blocks, - [data, num_blocks] AMREX_GPU_DEVICE (auto lh) { + [=] AMREX_GPU_DEVICE (auto lh) { auto block = lh.blockIdxND(); - auto thread = lh.template threadIdxND(); + auto thread = lh.template threadIdxND(); auto tmp = data[ (block[0] + block[1] * num_blocks[0] + block[2] * num_blocks[0] * num_blocks[1]) * num_threads + - thread[2] + thread[1] * blockdim_z + - thread[0] * blockdim_z * blockdim_y + thread[2] + thread[1] * blockdim[2] + + thread[0] * blockdim[2] * blockdim[1] ]; lh.syncthreads(); data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp; }); LaunchRaw(num_blocks, num_threads, - [data] AMREX_GPU_DEVICE (auto lh) { + [=] AMREX_GPU_DEVICE (auto lh) { auto smem = lh.shared_memory(); - auto thread1 = lh.template threadIdxND(); - auto locid1 = thread1[2] + thread1[1] * blockdim_x + - thread1[0] * blockdim_x * blockdim_y; - auto thread2 = lh.template threadIdxND(); - auto locid2 = thread2[0] + thread2[2] * blockdim_z + - thread2[1] * blockdim_z * blockdim_y; + auto thread1 = lh.template threadIdxND(); + auto locid1 = thread1[2] + thread1[1] * blockdim[0] + + thread1[0] * blockdim[0] * blockdim[1]; + auto thread2 = lh.template threadIdxND(); + auto locid2 = thread2[0] + thread2[2] * blockdim[2] + + thread2[1] * blockdim[2] * blockdim[1]; smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid1]; lh.syncthreads(); data[lh.blockIdx1D() * lh.blockDim1D() + locid2] = smem[locid2]; }); LaunchRaw(num_blocks, - [data] AMREX_GPU_DEVICE (auto lh) { + [=] AMREX_GPU_DEVICE (auto lh) { data[lh.globalIdx1D()] = data[lh.globalIdx1D()] == static_cast(lh.blockIdx1D() + lh.threadIdx1D()); }); From 72e5d88bc18af7d3bb46a06848c6d0d14dd62410 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Thu, 12 Mar 2026 11:31:23 +0100 Subject: [PATCH 36/46] try fix clang --- Tests/LaunchRaw/main.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp index 231975fde4f..b34b590e530 100644 --- a/Tests/LaunchRaw/main.cpp +++ b/Tests/LaunchRaw/main.cpp @@ -11,9 +11,9 @@ void test1d () { const IntVectND<1> num_blocks {31}; #ifdef AMREX_USE_GPU - static constexpr IntVectND<1> blockdim {256}; + static constexpr int blockdim[1] {256}; #else - static constexpr IntVectND<1> blockdim {1}; + static constexpr int blockdim[1] {1}; #endif static constexpr int num_threads = blockdim[0]; @@ -65,9 +65,9 @@ void test2d () { const IntVectND<2> num_blocks {31, 23}; #ifdef AMREX_USE_GPU - static constexpr IntVectND<2> blockdim {8, 32}; + static constexpr int blockdim[2] {8, 32}; #else - static constexpr IntVectND<2> blockdim {1, 1}; + static constexpr int blockdim[2] {1, 1}; #endif static constexpr int num_threads = blockdim[0] * blockdim[1]; @@ -123,9 +123,9 @@ void test3d () { const IntVectND<3> num_blocks {31, 23, 11}; #ifdef AMREX_USE_GPU - static constexpr IntVectND<3> blockdim {2, 8, 16}; + static constexpr int blockdim[3] {2, 8, 16}; #else - static constexpr IntVectND<3> blockdim {1, 1, 1}; + static constexpr int blockdim[3] {1, 1, 1}; #endif static constexpr int num_threads = blockdim[0] * blockdim[1] * blockdim[2]; From 0c60372a4231b8286efb7d8255d02b8a9311d49a Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Thu, 12 Mar 2026 13:51:00 +0100 Subject: [PATCH 37/46] relax dim constraint for threadIdxND --- Src/Base/AMReX_GpuTypes.H | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index f031f170d15..973aeff1ff6 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -137,17 +137,18 @@ struct LaunchHandler */ template [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE - IntVectND threadIdxND () const { - static_assert(sizeof...(nd_block_size) == dim && + IntVectND threadIdxND () const { + static_assert(sizeof...(nd_block_size) <= 3 && (1 * ... * nd_block_size) == threads_per_block); - constexpr IntVectND iv_block_size = blockDimND(); - IntVectND ret(0); + constexpr IntVectND iv_block_size = + blockDimND(); + IntVectND ret(0); unsigned int idx = threadIdx1D(); - if constexpr (dim == 3) { + if constexpr (sizeof...(nd_block_size) == 3) { ret[2] = idx / iv_block_size[1]; idx = idx - ret[2] * iv_block_size[1]; } - if constexpr (dim >= 2) { + if constexpr (sizeof...(nd_block_size) >= 2) { ret[1] = idx / iv_block_size[0]; idx = idx - ret[1] * iv_block_size[0]; } @@ -228,10 +229,9 @@ struct LaunchHandler */ template [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE - static constexpr IntVectND blockDimND () { - static_assert(sizeof...(nd_block_size) == dim && - (1 * ... * nd_block_size) == threads_per_block); - return IntVectND{nd_block_size...}; + static constexpr IntVectND blockDimND () { + static_assert((1 * ... * nd_block_size) == threads_per_block); + return IntVectND{nd_block_size...}; } /** @@ -248,6 +248,8 @@ struct LaunchHandler template [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE IntVectND globalIdxND () const { + static_assert(sizeof...(nd_block_size) == dim && + (1 * ... * nd_block_size) == threads_per_block); return blockIdxND() * blockDimND() + threadIdxND(); } From a9f0cfc96248a5a7bed5b2b74b0fa4255b996625 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Sat, 14 Mar 2026 15:43:09 +0100 Subject: [PATCH 38/46] Fix 3D threadIdxND, nd_item* and try guess fix for reqd_work_group_size --- Src/Base/AMReX_GpuLaunchFunctsG.H | 4 ++-- Src/Base/AMReX_GpuTypes.H | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index 24b58376a58..34ea2ca566f 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -913,7 +913,7 @@ void LaunchRaw (IntVectND nblocks, L const& f) q.submit([&] (sycl::handler& h) { h.parallel_for(sycl::nd_range(threads_total, threads_per_block), [=] (sycl::nd_item item) - [[sycl::reqd_work_group_size(1, 1, MT)]] + [[sycl::reqd_work_group_size(MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { f(Gpu::LaunchHandler{ @@ -945,7 +945,7 @@ void LaunchRaw (IntVectND nblocks, std::size_t shared_mem_elements, L const sycl::local_accessor shared_data(sycl::range<1>(shared_mem_elements), h); h.parallel_for(sycl::nd_range(threads_total, threads_per_block), [=] (sycl::nd_item item) - [[sycl::reqd_work_group_size(1, 1, MT)]] + [[sycl::reqd_work_group_size(MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { f(Gpu::LaunchHandler{ diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index 973aeff1ff6..a2a87a6d5cc 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -145,8 +145,8 @@ struct LaunchHandler IntVectND ret(0); unsigned int idx = threadIdx1D(); if constexpr (sizeof...(nd_block_size) == 3) { - ret[2] = idx / iv_block_size[1]; - idx = idx - ret[2] * iv_block_size[1]; + ret[2] = idx / (iv_block_size[0] * iv_block_size[1]); + idx = idx - ret[2] * (iv_block_size[0] * iv_block_size[1]); } if constexpr (sizeof...(nd_block_size) >= 2) { ret[1] = idx / iv_block_size[0]; @@ -302,7 +302,7 @@ struct LaunchHandler */ [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE #if defined(AMREX_USE_SYCL) - sycl::nd_item sycl_item () const { + const sycl::nd_item* sycl_item () const { return m_item; } #else From b69038493cd919067fff3ff148f20629140b251b Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Sat, 14 Mar 2026 20:12:23 +0100 Subject: [PATCH 39/46] Add some suggestions from review --- Src/Base/AMReX_GpuLaunchFunctsC.H | 6 ++-- Src/Base/AMReX_GpuLaunchFunctsG.H | 46 +++++++++++++++++++++++++------ Src/Base/AMReX_GpuTypes.H | 4 +-- Src/Particle/AMReX_ParticleUtil.H | 5 ++-- 4 files changed, 46 insertions(+), 15 deletions(-) diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H index 6f39150b460..9f88f8de884 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsC.H +++ b/Src/Base/AMReX_GpuLaunchFunctsC.H @@ -136,7 +136,8 @@ void launch (T const& n, L&& f) noexcept template void LaunchRaw (IntVectND nblocks, L const& f) { - static_assert(MT == 1); + static_assert(MT == 1, "LaunchRaw with CPU backend only with one thread per block! " + "Otherwise the syncthreads function would not work"); if constexpr(dim == 1) { for (int bx=0; bx < nblocks[0]; ++bx) { f(Gpu::LaunchHandler{ @@ -164,7 +165,8 @@ void LaunchRaw (IntVectND nblocks, L const& f) template void LaunchRaw (IntVectND nblocks, std::size_t shared_mem_elements, L const& f) { - static_assert(MT == 1); + static_assert(MT == 1, "LaunchRaw with CPU backend only with one thread per block! " + "Otherwise the syncthreads function would not work"); std::vector smem(shared_mem_elements); if constexpr(dim == 1) { for (int bx=0; bx < nblocks[0]; ++bx) { diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index 34ea2ca566f..902f0768312 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -899,6 +899,10 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, template void LaunchRaw (IntVectND nblocks, L const& f) { + detail::SyclKernelDevPtr skdp(f, Gpu::gpuStream()); + L const* pf = skdp.template get<0>(); + amrex::ignore_unused(pf); + auto& q = Gpu::Device::streamQueue(); sycl::range threads_per_block; @@ -916,10 +920,17 @@ void LaunchRaw (IntVectND nblocks, L const& f) [[sycl::reqd_work_group_size(MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - f(Gpu::LaunchHandler{ - &item, - nullptr - }); + if constexpr (detail::is_big_kernel()) { + (*pf)(Gpu::LaunchHandler{ + &item, + nullptr + }); + } else { + f(Gpu::LaunchHandler{ + &item, + nullptr + }); + } }); }); } catch (sycl::exception const& ex) { @@ -930,7 +941,15 @@ void LaunchRaw (IntVectND nblocks, L const& f) template void LaunchRaw (IntVectND nblocks, std::size_t shared_mem_elements, L const& f) { + detail::SyclKernelDevPtr skdp(f, Gpu::gpuStream()); + L const* pf = skdp.template get<0>(); + amrex::ignore_unused(pf); + auto& q = Gpu::Device::streamQueue(); + // use double4 for shared memory as it has the largest alignment of types that might be used + using ST = sycl::double4; + static_assert(alignof(ST) >= alignof(T)); + const std::size_t shared_mem_num = (shared_mem_elements*sizeof(T)+sizeof(ST)-1) / sizeof(ST); sycl::range threads_per_block; sycl::range threads_total; @@ -942,16 +961,25 @@ void LaunchRaw (IntVectND nblocks, std::size_t shared_mem_elements, L const try { q.submit([&] (sycl::handler& h) { - sycl::local_accessor shared_data(sycl::range<1>(shared_mem_elements), h); + sycl::local_accessor shared_data(sycl::range<1>(shared_mem_num), h); h.parallel_for(sycl::nd_range(threads_total, threads_per_block), [=] (sycl::nd_item item) [[sycl::reqd_work_group_size(MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { - f(Gpu::LaunchHandler{ - &item, - shared_data.template get_multi_ptr().get() - }); + T* shared_mem = reinterpret_cast( + shared_data.template get_multi_ptr().get()); + if constexpr (detail::is_big_kernel()) { + (*pf)(Gpu::LaunchHandler{ + &item, + shared_mem + }); + } else { + f(Gpu::LaunchHandler{ + &item, + shared_mem + }); + } }); }); } catch (sycl::exception const& ex) { diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index a2a87a6d5cc..3c8224ee4d8 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -269,12 +269,12 @@ struct LaunchHandler /** * \brief Returns a pointer to block-local shared memory. If multiple shared memory * allocations are needed in a block, then the allocation must be manually split by adding - * offsets to it. For CUDA and HIP the memory is aligned to 32 bytes, - * for SYCL to the alignment of the chosen data type. + * offsets to it. The memory is aligned to 32 bytes. */ [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE T* shared_memory () const { #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) + // 32 bytes is sufficient for double4_32a static_assert(32 >= alignof(T)); alignas(32) extern __shared__ char smem[]; return reinterpret_cast(smem); diff --git a/Src/Particle/AMReX_ParticleUtil.H b/Src/Particle/AMReX_ParticleUtil.H index c2278f80042..b99dda6bfc8 100644 --- a/Src/Particle/AMReX_ParticleUtil.H +++ b/Src/Particle/AMReX_ParticleUtil.H @@ -821,8 +821,9 @@ void PermutationForDeposition (Gpu::DeviceVector& perm, index_type n current_idx = pllist_next[current_idx]; } - index_type num_particles_block = - Gpu::blockReduceSum(num_particles_thread); + index_type num_particles_block = Gpu::blockReduceSum + AMREX_IF_NOT_SYCL() + (num_particles_thread AMREX_IF_SYCL(, lh.handler())); if (lh.threadIdx1D() == 0) { global_idx_start = Gpu::Atomic::Add(pglobal_idx, num_particles_block); From 1f33724d3cb518889aac32431e6aa4774f9be643 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Sat, 14 Mar 2026 20:25:22 +0100 Subject: [PATCH 40/46] typo --- Src/Base/AMReX_GpuLaunchFunctsC.H | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H index 9f88f8de884..a5a4ec4cdbc 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsC.H +++ b/Src/Base/AMReX_GpuLaunchFunctsC.H @@ -136,7 +136,7 @@ void launch (T const& n, L&& f) noexcept template void LaunchRaw (IntVectND nblocks, L const& f) { - static_assert(MT == 1, "LaunchRaw with CPU backend only with one thread per block! " + static_assert(MT == 1, "LaunchRaw with CPU backend only works with one thread per block! " "Otherwise the syncthreads function would not work"); if constexpr(dim == 1) { for (int bx=0; bx < nblocks[0]; ++bx) { @@ -165,7 +165,7 @@ void LaunchRaw (IntVectND nblocks, L const& f) template void LaunchRaw (IntVectND nblocks, std::size_t shared_mem_elements, L const& f) { - static_assert(MT == 1, "LaunchRaw with CPU backend only with one thread per block! " + static_assert(MT == 1, "LaunchRaw with CPU backend only works with one thread per block! " "Otherwise the syncthreads function would not work"); std::vector smem(shared_mem_elements); if constexpr(dim == 1) { From 155dd6f001bc6dbd6e79b58656844470ddab0c30 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Sat, 14 Mar 2026 21:35:18 +0100 Subject: [PATCH 41/46] Add include and use BoxIndexer for SYCL --- Src/Base/AMReX_GpuLaunchFunctsG.H | 40 ++++++++++++++++++++----------- Src/Base/AMReX_GpuTypes.H | 25 ++++++++++--------- Tests/LaunchRaw/main.cpp | 1 + 3 files changed, 39 insertions(+), 27 deletions(-) diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H index 902f0768312..a75e071655c 100644 --- a/Src/Base/AMReX_GpuLaunchFunctsG.H +++ b/Src/Base/AMReX_GpuLaunchFunctsG.H @@ -905,29 +905,35 @@ void LaunchRaw (IntVectND nblocks, L const& f) auto& q = Gpu::Device::streamQueue(); - sycl::range threads_per_block; - sycl::range threads_total; + sycl::range<1> threads_per_block{MT}; + sycl::range<1> threads_total{MT}; for (int i=0; i(nblocks[i]); } + // With SYCL it is difficult to combine a 1D blockDim with an ND gridDim, + // so we use a 1D sycl range and split the blockID with BoxIndexerND. Note that + // BoxIndexerND is a bit inefficient since it is adding the smallEnd which is always zero here. + BoxIndexerND bxi(BoxND{IntVectND(0), nblocks - 1}); + try { q.submit([&] (sycl::handler& h) { - h.parallel_for(sycl::nd_range(threads_total, threads_per_block), - [=] (sycl::nd_item item) + h.parallel_for(sycl::nd_range<1>(threads_total, threads_per_block), + [=] (sycl::nd_item<1> item) [[sycl::reqd_work_group_size(MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { if constexpr (detail::is_big_kernel()) { (*pf)(Gpu::LaunchHandler{ &item, + bxi.intVect(item.get_group(0)), nullptr }); } else { f(Gpu::LaunchHandler{ &item, + bxi.intVect(item.get_group(0)), nullptr }); } @@ -951,19 +957,23 @@ void LaunchRaw (IntVectND nblocks, std::size_t shared_mem_elements, L const static_assert(alignof(ST) >= alignof(T)); const std::size_t shared_mem_num = (shared_mem_elements*sizeof(T)+sizeof(ST)-1) / sizeof(ST); - sycl::range threads_per_block; - sycl::range threads_total; + sycl::range<1> threads_per_block{MT}; + sycl::range<1> threads_total{MT}; for (int i=0; i(nblocks[i]); } + // With SYCL it is difficult to combine a 1D blockDim with an ND gridDim, + // so we use a 1D sycl range and split the blockID with BoxIndexerND. Note that + // BoxIndexerND is a bit inefficient since it is adding the smallEnd which is always zero here. + BoxIndexerND bxi(BoxND{IntVectND(0), nblocks - 1}); + try { q.submit([&] (sycl::handler& h) { sycl::local_accessor shared_data(sycl::range<1>(shared_mem_num), h); - h.parallel_for(sycl::nd_range(threads_total, threads_per_block), - [=] (sycl::nd_item item) + h.parallel_for(sycl::nd_range<1>(threads_total, threads_per_block), + [=] (sycl::nd_item<1> item) [[sycl::reqd_work_group_size(MT)]] [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] { @@ -972,11 +982,13 @@ void LaunchRaw (IntVectND nblocks, std::size_t shared_mem_elements, L const if constexpr (detail::is_big_kernel()) { (*pf)(Gpu::LaunchHandler{ &item, + bxi.intVect(item.get_group(0)), shared_mem }); } else { f(Gpu::LaunchHandler{ &item, + bxi.intVect(item.get_group(0)), shared_mem }); } @@ -1385,7 +1397,7 @@ ParallelFor (Gpu::KernelInfo const&, * should be one of 128, 256, 512 or 1024. * * The number of total blocks can be a 1D, 2D or 3D IntVectND. Internally this uses the native - * way to split the index, such as blockIdx.x, blockIdx.y and blockIdx.z. + * way to split the index for CUDA and HIP, using blockIdx.x, blockIdx.y and blockIdx.z. * Note that this uses types int and unsigned int which might overflow if many blocks or * total threads are needed. In case this is an issue, it is necessary to explicitly add * a 64-bit grid-strided loop or to call LaunchRaw multiple times with fewer blocks. @@ -1423,7 +1435,7 @@ void LaunchRaw (IntVectND nblocks, L const& f) * should be one of 128, 256, 512 or 1024. * * The number of total blocks can be a 1D, 2D or 3D IntVectND. Internally this uses the native - * way to split the index, such as blockIdx.x, blockIdx.y and blockIdx.z. + * way to split the index for CUDA and HIP, using blockIdx.x, blockIdx.y and blockIdx.z. * Note that this uses types int and unsigned int which might overflow if many blocks or * total threads are needed. In case this is an issue, it is necessary to explicitly add * a 64-bit grid-strided loop or to call LaunchRaw multiple times with fewer blocks. diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index 3c8224ee4d8..bf53792b8b8 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -5,6 +5,7 @@ #include #include #include +#include #ifdef AMREX_USE_GPU @@ -112,8 +113,8 @@ struct LaunchHandler #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) LaunchHandler() = default; #elif defined(AMREX_USE_SYCL) - LaunchHandler(sycl::nd_item const* a_item, T * a_shared_mem) - : m_item{a_item}, m_shared_mem{a_shared_mem} {} + LaunchHandler(sycl::nd_item<1> const* a_item, IntVectND a_blockid, T * a_shared_mem) + : m_item{a_item}, m_blockid{a_blockid}, m_shared_mem{a_shared_mem} {} #else LaunchHandler(IntVectND a_blockid, IntVectND a_griddim, T * a_shared_mem) : m_blockid{a_blockid}, m_griddim{a_griddim}, m_shared_mem{a_shared_mem} {} @@ -170,13 +171,13 @@ struct LaunchHandler } else if constexpr (dim == 2) { return static_cast(AMREX_HIP_CUDA_OR_SYCL_OR_CPU( blockIdx.x + gridDim.x * blockIdx.y, - m_item->get_group_linear_id(), + m_item->get_group(0), m_blockid[0] + m_griddim[0] * m_blockid[1] )); } else { return static_cast(AMREX_HIP_CUDA_OR_SYCL_OR_CPU( blockIdx.x + gridDim.x * blockIdx.y + gridDim.x * gridDim.y * blockIdx.z, - m_item->get_group_linear_id(), + m_item->get_group(0)), m_blockid[0] + m_griddim[0] * m_blockid[1] + m_griddim[0] * m_griddim[1] * m_blockid[2] )); @@ -191,15 +192,14 @@ struct LaunchHandler if constexpr (dim == 1) { return AMREX_HIP_CUDA_OR_SYCL_OR_CPU( IntVectND(static_cast(blockIdx.x)), - IntVectND(static_cast(m_item->get_group(0))), + m_blockid, m_blockid ); } else if constexpr (dim == 2) { return AMREX_HIP_CUDA_OR_SYCL_OR_CPU( IntVectND(static_cast(blockIdx.x), static_cast(blockIdx.y)), - IntVectND(static_cast(m_item->get_group(1)), - static_cast(m_item->get_group(0))), + m_blockid, m_blockid ); } else { @@ -207,9 +207,7 @@ struct LaunchHandler IntVectND(static_cast(blockIdx.x), static_cast(blockIdx.y), static_cast(blockIdx.z)), - IntVectND(static_cast(m_item->get_group(2)), - static_cast(m_item->get_group(1)), - static_cast(m_item->get_group(0))), + m_blockid, m_blockid ); } @@ -298,11 +296,11 @@ struct LaunchHandler } /** - * \brief Returns the internal sycl::nd_item. + * \brief Returns the internal sycl::nd_item<1>. */ [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE #if defined(AMREX_USE_SYCL) - const sycl::nd_item* sycl_item () const { + const sycl::nd_item<1>* sycl_item () const { return m_item; } #else @@ -314,7 +312,8 @@ struct LaunchHandler private: #if defined(AMREX_USE_SYCL) - sycl::nd_item const* m_item; + sycl::nd_item<1> const* m_item; + IntVectND m_blockid; T * m_shared_mem; #elif !defined(AMREX_USE_GPU) IntVectND m_blockid; diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp index b34b590e530..10c4c948381 100644 --- a/Tests/LaunchRaw/main.cpp +++ b/Tests/LaunchRaw/main.cpp @@ -1,3 +1,4 @@ +#include #include #include #include From a3dfac39e936be787004bfdaf60f0baf14bade21 Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Sat, 14 Mar 2026 21:37:42 +0100 Subject: [PATCH 42/46] fix --- Src/Base/AMReX_GpuTypes.H | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index bf53792b8b8..a6ac2b09401 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -177,7 +177,7 @@ struct LaunchHandler } else { return static_cast(AMREX_HIP_CUDA_OR_SYCL_OR_CPU( blockIdx.x + gridDim.x * blockIdx.y + gridDim.x * gridDim.y * blockIdx.z, - m_item->get_group(0)), + m_item->get_group(0), m_blockid[0] + m_griddim[0] * m_blockid[1] + m_griddim[0] * m_griddim[1] * m_blockid[2] )); From 3f2a3f88b8d03996a8c45d7dff1fe77d15fccbdf Mon Sep 17 00:00:00 2001 From: AlexanderSinn Date: Mon, 16 Mar 2026 19:35:56 +0100 Subject: [PATCH 43/46] Add missing include to Arena.H --- Src/Base/AMReX_Arena.H | 1 + 1 file changed, 1 insertion(+) diff --git a/Src/Base/AMReX_Arena.H b/Src/Base/AMReX_Arena.H index dfcd6e20289..97ab30f8232 100644 --- a/Src/Base/AMReX_Arena.H +++ b/Src/Base/AMReX_Arena.H @@ -6,6 +6,7 @@ #include #ifdef AMREX_USE_GPU #include +#include #endif #ifdef AMREX_TINY_PROFILING From cbad36a91f1a5c5ded55a2bffdf8bf12ae09b9e2 Mon Sep 17 00:00:00 2001 From: AlexanderSinn Date: Mon, 16 Mar 2026 20:09:57 +0100 Subject: [PATCH 44/46] move definition of gpuStream_t --- Src/Base/AMReX_Arena.H | 1 - Src/Base/AMReX_GpuControl.H | 10 +--------- Src/Base/AMReX_GpuTypes.H | 9 +++++++++ 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Src/Base/AMReX_Arena.H b/Src/Base/AMReX_Arena.H index 97ab30f8232..1cfcfc9bc70 100644 --- a/Src/Base/AMReX_Arena.H +++ b/Src/Base/AMReX_Arena.H @@ -5,7 +5,6 @@ #include #include #ifdef AMREX_USE_GPU -#include #include #endif diff --git a/Src/Base/AMReX_GpuControl.H b/Src/Base/AMReX_GpuControl.H index e5213f5440b..cad51dec070 100644 --- a/Src/Base/AMReX_GpuControl.H +++ b/Src/Base/AMReX_GpuControl.H @@ -90,15 +90,7 @@ namespace amrex { #define AMREX_DEFAULT_RUNON =amrex::RunOn::Host // by default run on Host when compiling for Cpu #endif -namespace amrex { // NOLINT(modernize-concat-nested-namespaces) - -#ifdef AMREX_USE_HIP -using gpuStream_t = hipStream_t; -#elif defined(AMREX_USE_CUDA) -using gpuStream_t = cudaStream_t; -#endif - -namespace Gpu { +namespace amrex::Gpu { #if defined(AMREX_USE_GPU) diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H index a6ac2b09401..3c96b046851 100644 --- a/Src/Base/AMReX_GpuTypes.H +++ b/Src/Base/AMReX_GpuTypes.H @@ -5,6 +5,7 @@ #include #include #include +#include #include #ifdef AMREX_USE_GPU @@ -35,6 +36,14 @@ struct gpuStream_t { bool operator!= (gpuStream_t const& rhs) const noexcept { return queue != rhs.queue; } }; +#elif defined(AMREX_USE_HIP) + +using gpuStream_t = hipStream_t; + +#elif defined(AMREX_USE_CUDA) + +using gpuStream_t = cudaStream_t; + #endif } From 93dd7b24ce7bc40a47acec9a7e039d4187fc1ca7 Mon Sep 17 00:00:00 2001 From: AlexanderSinn Date: Mon, 16 Mar 2026 20:14:19 +0100 Subject: [PATCH 45/46] fix --- Src/Base/AMReX_GpuControl.H | 1 - 1 file changed, 1 deletion(-) diff --git a/Src/Base/AMReX_GpuControl.H b/Src/Base/AMReX_GpuControl.H index cad51dec070..9310f9ffae1 100644 --- a/Src/Base/AMReX_GpuControl.H +++ b/Src/Base/AMReX_GpuControl.H @@ -232,7 +232,6 @@ namespace amrex::Gpu { #endif -} } #endif From b25c2a166cd86433a245ed1e38b4a00c2286d7bb Mon Sep 17 00:00:00 2001 From: Alexander Sinn Date: Tue, 17 Mar 2026 08:52:48 +0100 Subject: [PATCH 46/46] Update AMREX_HOME --- Tests/LaunchRaw/GNUmakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Tests/LaunchRaw/GNUmakefile b/Tests/LaunchRaw/GNUmakefile index 173eef4c670..05ed32d3446 100644 --- a/Tests/LaunchRaw/GNUmakefile +++ b/Tests/LaunchRaw/GNUmakefile @@ -1,4 +1,4 @@ -AMREX_HOME = ../../../ +AMREX_HOME = ../../ DEBUG = FALSE