From 406c6c43b1bde52b63d67b91aac0eeeef2d7dd3f Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Tue, 13 Jan 2026 19:05:37 +0100
Subject: [PATCH 01/46] Add amrex::Launch

---
 Src/Base/AMReX_GpuControl.H       |   8 ++
 Src/Base/AMReX_GpuLaunchFunctsG.H |  26 +++++
 Src/Base/AMReX_GpuTypes.H         | 164 ++++++++++++++++++++++++++++++
 3 files changed, 198 insertions(+)
diff --git a/Src/Base/AMReX_GpuControl.H b/Src/Base/AMReX_GpuControl.H
index c5643e2268d..20084bc34cc 100644
--- a/Src/Base/AMReX_GpuControl.H
+++ b/Src/Base/AMReX_GpuControl.H
@@ -33,6 +33,14 @@
 #define AMREX_HIP_OR_CUDA_OR_SYCL(a,b,c) ((void)0);
 #endif
 
+#if defined(AMREX_USE_HIP) || defined(AMREX_USE_CUDA)
+#define AMREX_HIP_CUDA_OR_SYCL_OR_CPU(a,b,c) a
+#elif defined(AMREX_USE_SYCL)
+#define AMREX_HIP_CUDA_OR_SYCL_OR_CPU(a,b,c) b
+#else
+#define AMREX_HIP_CUDA_OR_SYCL_OR_CPU(a,b,c) c
+#endif
+
 #ifdef AMREX_USE_GPU
 #define AMREX_GPU_OR_CPU(a,b) a
 #else
diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
index 275956aaea2..29042d06135 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsG.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -705,6 +705,32 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
     }
 }
 
+template<int MT, int dim, typename L>
+void Launch (IntVectND<dim> nblocks, std::size_t shared_mem_bytes, L const& f)
+{
+    const auto nthreads_total = MT * std::size_t(nblocks);
+    const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
+        / sizeof(unsigned long long);
+    auto& q = *(stream.queue);
+    try {
+        q.submit([&] (sycl::handler& h) {
+            sycl::local_accessor<unsigned long long>
+                shared_data(sycl::range<1>(shared_mem_numull), h);
+            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
+                                             sycl::range<1>(MT)),
+            [=] (sycl::nd_item<1> item)
+            [[sycl::reqd_work_group_size(MT)]]
+            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
+            {
+                f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
+            });
+        });
+    } catch (sycl::exception const& ex) {
+        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
+    }
+}
+
+
 #else
 // CUDA or HIP
 
diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index e01303656ff..a1e07436c21 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -102,6 +102,170 @@ struct Handler {};
 
 #endif
 
+template<unsigned int threads_per_block, int dim>
+struct LaunchHandler
+{
+#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
+    LaunchHandler() = default;
+#elif defined(AMREX_USE_SYCL)
+    LaunchHandler(sycl::nd_item<dim> const* a_item, char * a_shared_mem)
+        : m_item{a_item}, m_shared_mem{a_shared_mem} {};
+#else
+    LaunchHandler(IntVectND<dim> a_blockid, IntVectND<dim> a_griddim, char * a_shared_mem)
+        : m_blockid{a_blockid}, m_griddim{a_griddim}, m_shared_mem{a_shared_mem} {};
+#endif
+
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    unsigned int threadIdx1D () const {
+        return static_cast<unsigned int>(AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
+            threadIdx.x,
+            m_item->get_local_linear_id(),
+            0
+        ));
+    }
+
+    template<int ...nd_block_size>
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    IntVectND<dim> threadIdxND () const {
+        static_assert(sizeof...(nd_block_size) == dim &&
+                      (1 * ... * nd_block_size) == threads_per_block);
+        constexpr IntVectND<dim> iv_block_size = blockDimND<nd_block_size...>();
+        IntVectND<dim> ret(0);
+        unsigned int idx = threadIdx();
+        if constexpr (dim == 3) {
+            ret[2] = idx / iv_block_size[2];
+            idx = idx - ret[2] * iv_block_size[2];
+        }
+        if constexpr (dim >= 2) {
+            ret[1] = idx / iv_block_size[1];
+            idx = idx - ret[1] * iv_block_size[1];
+        }
+        ret[0] = idx;
+        return ret;
+    }
+
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    unsigned int blockIdx1D () const {
+        if constexpr (dim == 1) {
+            return static_cast<unsigned int>(AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
+                blockIdx.x,
+                m_item->get_group_id(0),
+                m_blockid[0]
+            ));
+        } else if constexpr (dim == 2) {
+            return static_cast<unsigned int>(AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
+                blockIdx.x + gridDim.x * blockIdx.y,
+                m_item->get_group_linear_id(),
+                m_blockid[0] + m_griddim[0] * m_blockid[1]
+            ));
+        } else {
+            return static_cast<unsigned int>(AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
+                blockIdx.x + gridDim.x * blockIdx.y + gridDim.x * gridDim.y * blockIdx.z,
+                m_item->get_group_linear_id(),
+                m_blockid[0] + m_griddim[0] * m_blockid[1] +
+                m_griddim[0] * m_griddim[1] * m_blockid[2]
+            ));
+        }
+    }
+
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    IntVectND<dim> blockIdxND () const {
+        if constexpr (dim == 1) {
+            return AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
+                IntVectND<dim>(static_cast<int>(blockIdx.x)),
+                IntVectND<dim>(static_cast<int>(m_item->get_group_id(0))),
+                m_blockid
+            );
+        } else if constexpr (dim == 2) {
+            return AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
+                IntVectND<dim>(static_cast<int>(blockIdx.x),
+                               static_cast<int>(blockIdx.y)),
+                IntVectND<dim>(static_cast<int>(m_item->get_group_id(1)),
+                               static_cast<int>(m_item->get_group_id(0))),
+                m_blockid
+            );
+        } else {
+            return AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
+                IntVectND<dim>(static_cast<int>(blockIdx.x),
+                               static_cast<int>(blockIdx.y),
+                               static_cast<int>(blockIdx.z)),
+                IntVectND<dim>(static_cast<int>(m_item->get_group_id(2)),
+                               static_cast<int>(m_item->get_group_id(1)),
+                               static_cast<int>(m_item->get_group_id(0))),
+                m_blockid
+            );
+        }
+    }
+
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    constexpr unsigned int blockDim1D () const {
+        return threads_per_block;
+    }
+
+    template<int ...nd_block_size>
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    constexpr IntVectND<dim> blockDimND () const {
+        static_assert(sizeof...(nd_block_size) == dim &&
+                      (1 * ... * nd_block_size) == threads_per_block);
+        return {nd_block_size...};
+    }
+
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    unsigned int globalIdx1D () const {
+        return blockIdx1D() * threads_per_block + threadIdx1D();
+    }
+
+    template<int ...nd_block_size>
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    IntVectND<dim> globalIdxND () const {
+        return blockIdxND() * blockDimND<nd_block_size...>() + threadIdxND<nd_block_size...>();
+    }
+
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    void syncthreads () const {
+        AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
+            __syncthreads(),
+            m_item->barrier(sycl::access::fence_space::global_and_local),
+            (void)0
+        );
+    }
+
+    template<class T>
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    T* shared_memory () const {
+#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
+        alignas(32) extern __shared__ char smem[];
+        return reinterpret_cast<T*>(smem);
+#else
+        return reinterpret_cast<T*>(m_shared_mem);
+#endif
+    }
+
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    auto sycl_item () const {
+#if defined(AMREX_USE_SYCL)
+        return m_item;
+#else
+        return 0;
+#endif
+    }
+
+private:
+
+#if defined(AMREX_USE_SYCL)
+    sycl::nd_item<dim> const* m_item;
+    char * m_shared_mem;
+#endif
+
+#if !defined(AMREX_USE_GPU)
+    IntVectND<dim> m_blockid;
+    IntVectND<dim> m_griddim;
+    char * m_shared_mem;
+#endif
+};
+
+
+
 }
 
 #endif

From 2ac0e24b12095481e76fa1c3f4108ff28af2d884 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Thu, 15 Jan 2026 15:40:18 +0100
Subject: [PATCH 02/46] add CUDA and CPU versions

---
 Src/Base/AMReX_GpuLaunchFunctsC.H |  51 ++++++++++++++
 Src/Base/AMReX_GpuLaunchFunctsG.H | 110 +++++++++++++++++++++++++-----
 Src/Base/AMReX_GpuTypes.H         |  14 ++--
 3 files changed, 152 insertions(+), 23 deletions(-)

diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H
index 5091970ad38..e12e8c4a376 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsC.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsC.H
@@ -149,6 +149,57 @@ void launch (T const& n, L&& f) noexcept
     std::forward<L>(f)(n);
 }
 
+template<int MT, int dim, typename L>
+void LaunchRaw (IntVectND<dim> nblocks, L const& f)
+{
+    static_assert(MT == 1);
+    if constexpr(dim == 1) {
+        for (int bx=0; bx < nblocks[0]; ++bx) {
+            f(Gpu::LaunchHandler<MT, void, dim>{{bx}, nblocks, nullptr});
+        }
+    } else if constexpr(dim == 2) {
+        for (int by=0; by < nblocks[1]; ++by) {
+            for (int bx=0; bx < nblocks[0]; ++bx) {
+                f(Gpu::LaunchHandler<MT, void, dim>{{bx, by}, nblocks, nullptr});
+            }
+        }
+    } else {
+        for (int bz=0; bz < nblocks[2]; ++bz) {
+            for (int by=0; by < nblocks[1]; ++by) {
+                for (int bx=0; bx < nblocks[0]; ++bx) {
+                    f(Gpu::LaunchHandler<MT, void, dim>{{bx, by, bz}, nblocks, nullptr});
+                }
+            }
+        }
+    }
+}
+
+template<int MT, typename T, int dim, typename L>
+void LaunchRaw (IntVectND<dim> nblocks, std::size_t shared_mem_elements, L const& f)
+{
+    static_assert(MT == 1);
+    T smem[shared_mem_elements];
+    if constexpr(dim == 1) {
+        for (int bx=0; bx < nblocks[0]; ++bx) {
+            f(Gpu::LaunchHandler<MT, void, dim>{{bx}, nblocks, &smem});
+        }
+    } else if constexpr(dim == 2) {
+        for (int by=0; by < nblocks[1]; ++by) {
+            for (int bx=0; bx < nblocks[0]; ++bx) {
+                f(Gpu::LaunchHandler<MT, void, dim>{{bx, by}, nblocks, &smem});
+            }
+        }
+    } else {
+        for (int bz=0; bz < nblocks[2]; ++bz) {
+            for (int by=0; by < nblocks[1]; ++by) {
+                for (int bx=0; bx < nblocks[0]; ++bx) {
+                    f(Gpu::LaunchHandler<MT, void, dim>{{bx, by, bz}, nblocks, &smem});
+                }
+            }
+        }
+    }
+}
+
 template <typename T, typename L, typename M=std::enable_if_t<std::is_integral_v<T>> >
 AMREX_ATTRIBUTE_FLATTEN_FOR
 void For (T n, L const& f) noexcept
diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
index 29042d06135..b08acf8cc6c 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsG.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -706,27 +706,67 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
 }
 
 template<int MT, int dim, typename L>
-void Launch (IntVectND<dim> nblocks, std::size_t shared_mem_bytes, L const& f)
+void LaunchRaw (IntVectND<dim> nblocks, L const& f)
 {
-    const auto nthreads_total = MT * std::size_t(nblocks);
-    const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
-        / sizeof(unsigned long long);
-    auto& q = *(stream.queue);
+    auto& q = Gpu::Device::streamQueue();
+
+    sycl::range<dim> threads_per_block;
+    sycl::range<dim> threads_total;
+
+    for (int i=0; i<dim; ++i) {
+        threads_per_block[i] = (i == dim-1) ? MT : 1;
+        threads_total[i] = nblocks[dim-1-i] * threads_per_block[i];
+    }
+
     try {
         q.submit([&] (sycl::handler& h) {
-            sycl::local_accessor<unsigned long long>
-                shared_data(sycl::range<1>(shared_mem_numull), h);
-            h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
-                                             sycl::range<1>(MT)),
-            [=] (sycl::nd_item<1> item)
-            [[sycl::reqd_work_group_size(MT)]]
-            [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
-            {
-                f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
-            });
+
+            h.parallel_for(sycl::nd_range<dim>(threads_total, threads_per_block),
+                [=] (sycl::nd_item<dim> item)
+                [[sycl::reqd_work_group_size(1, 1, MT)]]
+                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
+                {
+                    f(Gpu::LaunchHandler<MT, void, dim>{
+                        &item,
+                        nullptr
+                    });
+                });
         });
     } catch (sycl::exception const& ex) {
-        amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
+        amrex::Abort(std::string("LaunchRaw: ")+ex.what()+"!!!!!");
+    }
+}
+
+template<int MT, typename T, int dim, typename L>
+void LaunchRaw (IntVectND<dim> nblocks, std::size_t shared_mem_elements, L const& f)
+{
+    auto& q = Gpu::Device::streamQueue();
+
+    sycl::range<dim> threads_per_block;
+    sycl::range<dim> threads_total;
+
+    for (int i=0; i<dim; ++i) {
+        threads_per_block[i] = (i == dim-1) ? MT : 1;
+        threads_total[i] = nblocks[dim-1-i] * threads_per_block[i];
+    }
+
+    try {
+        q.submit([&] (sycl::handler& h) {
+            sycl::local_accessor<T> shared_data(sycl::range<1>(shared_mem_elements), h);
+
+            h.parallel_for(sycl::nd_range<dim>(threads_total, threads_per_block),
+                [=] (sycl::nd_item<dim> item)
+                [[sycl::reqd_work_group_size(1, 1, MT)]]
+                [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
+                {
+                    f(Gpu::LaunchHandler<MT, T, dim>{
+                        &item,
+                        shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()
+                    });
+                });
+        });
+    } catch (sycl::exception const& ex) {
+        amrex::Abort(std::string("LaunchRaw: ")+ex.what()+"!!!!!");
     }
 }
 
@@ -1123,6 +1163,44 @@ ParallelFor (Gpu::KernelInfo const&,
     AMREX_GPU_ERROR_CHECK();
 }
 
+template<int MT, int dim, typename L>
+void LaunchRaw (IntVectND<dim> nblocks, L const& f)
+{
+    dim3 num_blocks;
+    num_blocks.x = nblocks[0];
+    if constexpr (dim >= 2) {
+        num_blocks.y = nblocks[1];
+    }
+    if constexpr (dim == 3) {
+        num_blocks.z = nblocks[2];
+    }
+
+    AMREX_LAUNCH_KERNEL(MT, num_blocks, MT, 0, Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            f(Gpu::LaunchHandler<MT, void, dim>{});
+        });
+    AMREX_GPU_ERROR_CHECK();
+}
+
+template<int MT, typename T, int dim, typename L>
+void LaunchRaw (IntVectND<dim> nblocks, std::size_t shared_mem_elements, L const& f)
+{
+    dim3 num_blocks;
+    num_blocks.x = nblocks[0];
+    if constexpr (dim >= 2) {
+        num_blocks.y = nblocks[1];
+    }
+    if constexpr (dim == 3) {
+        num_blocks.z = nblocks[2];
+    }
+
+    AMREX_LAUNCH_KERNEL(MT, num_blocks, MT, shared_mem_elements * sizeof(T), Gpu::gpuStream(),
+        [=] AMREX_GPU_DEVICE () noexcept {
+            f(Gpu::LaunchHandler<MT, T, dim>{});
+        });
+    AMREX_GPU_ERROR_CHECK();
+}
+
 #endif
 
 template <typename L>
diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index a1e07436c21..7ed05df3e6f 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -102,16 +102,16 @@ struct Handler {};
 
 #endif
 
-template<unsigned int threads_per_block, int dim>
+template<unsigned int threads_per_block, typename T, int dim>
 struct LaunchHandler
 {
 #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
     LaunchHandler() = default;
 #elif defined(AMREX_USE_SYCL)
-    LaunchHandler(sycl::nd_item<dim> const* a_item, char * a_shared_mem)
+    LaunchHandler(sycl::nd_item<dim> const* a_item, T * a_shared_mem)
         : m_item{a_item}, m_shared_mem{a_shared_mem} {};
 #else
-    LaunchHandler(IntVectND<dim> a_blockid, IntVectND<dim> a_griddim, char * a_shared_mem)
+    LaunchHandler(IntVectND<dim> a_blockid, IntVectND<dim> a_griddim, T * a_shared_mem)
         : m_blockid{a_blockid}, m_griddim{a_griddim}, m_shared_mem{a_shared_mem} {};
 #endif
 
@@ -230,14 +230,14 @@ struct LaunchHandler
         );
     }
 
-    template<class T>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     T* shared_memory () const {
 #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
+        static_assert(32 >= alignof(T));
         alignas(32) extern __shared__ char smem[];
         return reinterpret_cast<T*>(smem);
 #else
-        return reinterpret_cast<T*>(m_shared_mem);
+        return m_shared_mem;
 #endif
     }
 
@@ -254,13 +254,13 @@ private:
 
 #if defined(AMREX_USE_SYCL)
     sycl::nd_item<dim> const* m_item;
-    char * m_shared_mem;
+    T * m_shared_mem;
 #endif
 
 #if !defined(AMREX_USE_GPU)
     IntVectND<dim> m_blockid;
     IntVectND<dim> m_griddim;
-    char * m_shared_mem;
+    T * m_shared_mem;
 #endif
 };
 

From 12be03339268b244246a8fa79f13fa6af4753aa6 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Thu, 15 Jan 2026 17:30:54 +0100
Subject: [PATCH 03/46] add includes

---
 Src/Base/AMReX_GpuTypes.H | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index 7ed05df3e6f..d37ed86096a 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -3,6 +3,8 @@
 #include <AMReX_Config.H>
 
 #include <AMReX_Extension.H>
+#include <AMReX_BaseFwd.H>
+#include <AMReX_GpuControl.H>
 
 #ifdef AMREX_USE_GPU
 

From 9912b9be38e8ac5403696e9624542ab892a78bae Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Mon, 19 Jan 2026 18:26:42 +0100
Subject: [PATCH 04/46] Fix circular include

---
 Src/Base/AMReX_GpuControl.H | 1 -
 Src/Base/AMReX_GpuTypes.H   | 2 +-
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/Src/Base/AMReX_GpuControl.H b/Src/Base/AMReX_GpuControl.H
index 20084bc34cc..d2f41800c90 100644
--- a/Src/Base/AMReX_GpuControl.H
+++ b/Src/Base/AMReX_GpuControl.H
@@ -3,7 +3,6 @@
 #include <AMReX_Config.H>
 
 #include <AMReX_GpuQualifiers.H>
-#include <AMReX_GpuTypes.H>
 
 #include <utility>
 
diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index d37ed86096a..c16cd0ee631 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -133,7 +133,7 @@ struct LaunchHandler
                       (1 * ... * nd_block_size) == threads_per_block);
         constexpr IntVectND<dim> iv_block_size = blockDimND<nd_block_size...>();
         IntVectND<dim> ret(0);
-        unsigned int idx = threadIdx();
+        unsigned int idx = threadIdx1D();
         if constexpr (dim == 3) {
             ret[2] = idx / iv_block_size[2];
             idx = idx - ret[2] * iv_block_size[2];

From 288428f1c93d1a6cdb4faf60df50a3a06bc14093 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Mon, 19 Jan 2026 19:08:10 +0100
Subject: [PATCH 05/46] fix index calculation

---
 Src/Base/AMReX_GpuTypes.H | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index c16cd0ee631..f75a99ca2df 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -135,12 +135,12 @@ struct LaunchHandler
         IntVectND<dim> ret(0);
         unsigned int idx = threadIdx1D();
         if constexpr (dim == 3) {
-            ret[2] = idx / iv_block_size[2];
-            idx = idx - ret[2] * iv_block_size[2];
+            ret[2] = idx / iv_block_size[1];
+            idx = idx - ret[2] * iv_block_size[1];
         }
         if constexpr (dim >= 2) {
-            ret[1] = idx / iv_block_size[1];
-            idx = idx - ret[1] * iv_block_size[1];
+            ret[1] = idx / iv_block_size[0];
+            idx = idx - ret[1] * iv_block_size[0];
         }
         ret[0] = idx;
         return ret;
@@ -200,13 +200,13 @@ struct LaunchHandler
     }
 
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-    constexpr unsigned int blockDim1D () const {
+    static constexpr unsigned int blockDim1D () {
         return threads_per_block;
     }
 
     template<int ...nd_block_size>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-    constexpr IntVectND<dim> blockDimND () const {
+    static constexpr IntVectND<dim> blockDimND () {
         static_assert(sizeof...(nd_block_size) == dim &&
                       (1 * ... * nd_block_size) == threads_per_block);
         return {nd_block_size...};

From 418ba21a5095fe3e2980e267f5d7104d3c6d8df8 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Wed, 28 Jan 2026 17:45:21 +0100
Subject: [PATCH 06/46] add function to get the runtime value of blockDim.x

---
 Src/Base/AMReX_GpuTypes.H | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index f75a99ca2df..81d5c75e881 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -204,6 +204,15 @@ struct LaunchHandler
         return threads_per_block;
     }
 
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    unsigned int blockDim1Drt () const {
+        return static_cast<unsigned int>(AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
+            blockDim.x,
+            threads_per_block,
+            threads_per_block
+        ));
+    }
+
     template<int ...nd_block_size>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     static constexpr IntVectND<dim> blockDimND () {

From 87d2dcf08ca708f4c2d8bcfae93d3b5d12e79459 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Wed, 4 Feb 2026 09:57:59 +0100
Subject: [PATCH 07/46] fix

---
 Src/Base/AMReX_GpuLaunchFunctsC.H | 10 ++++++----
 Src/Base/AMReX_GpuTypes.H         |  4 ++--
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H
index 6421082524e..28d75aa2e84 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsC.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsC.H
@@ -2,6 +2,8 @@
 #define AMREX_GPU_LAUNCH_FUNCTS_C_H_
 #include <AMReX_Config.H>
 
+#include <vector>
+
 namespace amrex {
 
 /** Helper type to store/access the SIMD width in ParallelForSIMD lambdas
@@ -178,22 +180,22 @@ template<int MT, typename T, int dim, typename L>
 void LaunchRaw (IntVectND<dim> nblocks, std::size_t shared_mem_elements, L const& f)
 {
     static_assert(MT == 1);
-    T smem[shared_mem_elements];
+    std::vector<T> smem(shared_mem_elements);
     if constexpr(dim == 1) {
         for (int bx=0; bx < nblocks[0]; ++bx) {
-            f(Gpu::LaunchHandler<MT, void, dim>{{bx}, nblocks, &smem});
+            f(Gpu::LaunchHandler<MT, void, dim>{{bx}, nblocks, smem.data()});
         }
     } else if constexpr(dim == 2) {
         for (int by=0; by < nblocks[1]; ++by) {
             for (int bx=0; bx < nblocks[0]; ++bx) {
-                f(Gpu::LaunchHandler<MT, void, dim>{{bx, by}, nblocks, &smem});
+                f(Gpu::LaunchHandler<MT, void, dim>{{bx, by}, nblocks, smem.data()});
             }
         }
     } else {
         for (int bz=0; bz < nblocks[2]; ++bz) {
             for (int by=0; by < nblocks[1]; ++by) {
                 for (int bx=0; bx < nblocks[0]; ++bx) {
-                    f(Gpu::LaunchHandler<MT, void, dim>{{bx, by, bz}, nblocks, &smem});
+                    f(Gpu::LaunchHandler<MT, void, dim>{{bx, by, bz}, nblocks, smem.data()});
                 }
             }
         }
diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index 81d5c75e881..2c24c0facfb 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -111,10 +111,10 @@ struct LaunchHandler
     LaunchHandler() = default;
 #elif defined(AMREX_USE_SYCL)
     LaunchHandler(sycl::nd_item<dim> const* a_item, T * a_shared_mem)
-        : m_item{a_item}, m_shared_mem{a_shared_mem} {};
+        : m_item{a_item}, m_shared_mem{a_shared_mem} {}
 #else
     LaunchHandler(IntVectND<dim> a_blockid, IntVectND<dim> a_griddim, T * a_shared_mem)
-        : m_blockid{a_blockid}, m_griddim{a_griddim}, m_shared_mem{a_shared_mem} {};
+        : m_blockid{a_blockid}, m_griddim{a_griddim}, m_shared_mem{a_shared_mem} {}
 #endif
 
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE

From 2636e1986f48388cc22bb546bdea84428a4f2cac Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Wed, 4 Feb 2026 10:23:50 +0100
Subject: [PATCH 08/46] add template for SYCL

---
 Src/Base/AMReX_GpuLaunchFunctsG.H | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
index 678fc24d857..3afe26ec9e9 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsG.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -178,7 +178,9 @@ void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
             [=] (sycl::nd_item<1> item)
             [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
             {
-                f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
+                f(Gpu::Handler{&item,
+                    shared_data.template get_multi_ptr<sycl::access::decorated::yes>().get()
+                });
             });
         });
     } catch (sycl::exception const& ex) {
@@ -224,7 +226,9 @@ void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
             [[sycl::reqd_work_group_size(MT)]]
             [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
             {
-                f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
+                f(Gpu::Handler{&item,
+                    shared_data.template get_multi_ptr<sycl::access::decorated::yes>().get()
+                });
             });
         });
     } catch (sycl::exception const& ex) {
@@ -302,8 +306,9 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept
                          i < std::size_t(n); i += stride) {
                         int n_active_threads = amrex::min(std::size_t(n)-i+item.get_local_id(0),
                                                           item.get_local_range(0));
-                        detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
-                                                          n_active_threads});
+                        detail::call_f_scalar_handler(f, T(i), Gpu::Handler{&item,
+                            shared_data.template get_multi_ptr<sycl::access::decorated::yes>().get(),
+                            n_active_threads});
                     }
                 });
             });
@@ -352,8 +357,9 @@ void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, L const& f
                         auto iv = indexer.intVect(icell);
                         int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
                                                           std::uint64_t(item.get_local_range(0)));
-                        detail::call_f_intvect_handler(f, iv, Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
-                                                                n_active_threads});
+                        detail::call_f_intvect_handler(f, iv, Gpu::Handler{&item,
+                            shared_data.template get_multi_ptr<sycl::access::decorated::yes>().get(),
+                            n_active_threads});
                     }
                 });
             });
@@ -403,9 +409,9 @@ void ParallelFor (Gpu::KernelInfo const& info, BoxND<dim> const& box, T ncomp, L
                         auto iv = indexer.intVect(icell);
                         int n_active_threads = amrex::min(indexer.numPts()-icell+std::uint64_t(item.get_local_id(0)),
                                                           std::uint64_t(item.get_local_range(0)));
-                        detail::call_f_intvect_ncomp_handler(f, iv, ncomp,
-                                       Gpu::Handler{&item, shared_data.get_multi_ptr<sycl::access::decorated::yes>().get(),
-                                                    n_active_threads});
+                        detail::call_f_intvect_ncomp_handler(f, iv, ncomp, Gpu::Handler{&item,
+                            shared_data.template get_multi_ptr<sycl::access::decorated::yes>().get(),
+                            n_active_threads});
                     }
                 });
             });
@@ -761,7 +767,7 @@ void LaunchRaw (IntVectND<dim> nblocks, std::size_t shared_mem_elements, L const
                 {
                     f(Gpu::LaunchHandler<MT, T, dim>{
                         &item,
-                        shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()
+                        shared_data.template get_multi_ptr<sycl::access::decorated::yes>().get()
                     });
                 });
         });

From 536707fe248a90c08753c85055fbae6de6070f6b Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Mon, 9 Feb 2026 15:20:38 +0100
Subject: [PATCH 09/46] remove blockDim1Drt

---
 Src/Base/AMReX_GpuTypes.H | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index 2c24c0facfb..9731f23f5cf 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -204,15 +204,6 @@ struct LaunchHandler
         return threads_per_block;
     }
 
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-    unsigned int blockDim1Drt () const {
-        return static_cast<unsigned int>(AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
-            blockDim.x,
-            threads_per_block,
-            threads_per_block
-        ));
-    }
-
     template<int ...nd_block_size>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     static constexpr IntVectND<dim> blockDimND () {

From e1facb833e55800d1f9b5a77089f4b8d5b5d0d03 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Mon, 9 Feb 2026 18:44:09 +0100
Subject: [PATCH 10/46] convert some functions to use LaunchRaw

---
 Src/AmrCore/AMReX_TagBox.cpp                  | 86 +++++--------------
 Src/Base/AMReX_BaseFabUtility.H               | 48 +++--------
 Src/Base/AMReX_FBI.H                          | 27 ++----
 Src/Base/AMReX_GpuContainers.H                | 40 ++-------
 Src/Base/AMReX_GpuLaunchFunctsC.H             |  6 +-
 Src/Base/AMReX_GpuLaunchFunctsG.H             |  2 -
 Src/Base/AMReX_GpuReduce.H                    |  7 ++
 Src/Base/AMReX_GpuTypes.H                     | 19 ++--
 .../MLMG/AMReX_MLEBTensorOp_bc.cpp            | 18 ++--
 Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp   | 16 +---
 Src/Particle/AMReX_ParticleUtil.H             | 45 +++++-----
 11 files changed, 107 insertions(+), 207 deletions(-)

diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp
index a0a3a0e9940..066c863a01f 100644
--- a/Src/AmrCore/AMReX_TagBox.cpp
+++ b/Src/AmrCore/AMReX_TagBox.cpp
@@ -446,33 +446,18 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
         int* ntags = dv_ntags.data() + blockoffset[li];
         const int ncells = fai.fabbox().numPts();
         const char* tags = (*this)[fai].dataPtr();
-#ifdef AMREX_USE_SYCL
-        amrex::launch<block_size>(nblocks[li], sizeof(int)*Gpu::Device::warp_size,
-                                  Gpu::Device::gpuStream(),
-        [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
-        {
-            int bid = h.item->get_group_linear_id();
-            int tid = h.item->get_local_id(0);
-            int icell = h.item->get_global_id(0);
-
-            int t = 0;
-            if (icell < ncells && tags[icell] != TagBox::CLEAR) {
-                t = 1;
-            }
 
-            t = Gpu::blockReduce<Gpu::Device::warp_size>
-                (t, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, h);
-            if (tid == 0) {
-                ntags[bid] = t;
-            }
-        });
+        amrex::LaunchRaw<block_size, int>(amrex::IntVectND<1>{nblocks[li]},
+#ifdef AMREX_USE_SYCL
+            Gpu::Device::warp_size,
 #else
-        amrex::launch<block_size>(nblocks[li], Gpu::Device::gpuStream(),
-        [=] AMREX_GPU_DEVICE () noexcept
+            0,
+#endif
+        [=] AMREX_GPU_DEVICE (auto lh) noexcept
         {
-            int bid = blockIdx.x;
-            int tid = threadIdx.x;
-            int icell = block_size*blockIdx.x+threadIdx.x;
+            int bid = lh.blockIdx1D();
+            int tid = lh.threadIdx1D();
+            int icell = lh.globalIdx1D();
 
             int t = 0;
             if (icell < ncells && tags[icell] != TagBox::CLEAR) {
@@ -480,12 +465,12 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
             }
 
             t = Gpu::blockReduce<Gpu::Device::warp_size>
-                (t, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
+                (t, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0,
+                lh.handler());
             if (tid == 0) {
                 ntags[bid] = t;
             }
         });
-#endif
     }
 
     Gpu::PinnedVector<int> hv_ntags(ntotblocks);
@@ -524,51 +509,27 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
             const auto lenx  = len.x;
             const int ncells = bx.numPts();
             const char* tags = (*this)[fai].dataPtr();
-#ifdef AMREX_USE_SYCL
-            amrex::launch<block_size>(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(),
-            [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
+            amrex::LaunchRaw<block_size, unsigned int>(amrex::IntVectND<1>{nblocks[li]}, 1,
+            [=] AMREX_GPU_DEVICE (auto lh) noexcept
             {
-                int bid = h.item->get_group(0);
-                int tid = h.item->get_local_id(0);
-                int icell = h.item->get_global_id(0);
+                int bid = lh.blockIdx1D();
+                int tid = lh.threadIdx1D();
+                int icell = lh.globalIdx1D();
 
-                unsigned int* shared_counter = (unsigned int*)h.local;
+                unsigned int * shared_counter = lh.shared_memory();
                 if (tid == 0) {
                     *shared_counter = 0;
                 }
-                h.item->barrier(sycl::access::fence_space::local_space);
+                lh.syncthreads();
 
                 if (icell < ncells && tags[icell] != TagBox::CLEAR) {
-                    unsigned int itag = Gpu::Atomic::Add<unsigned int,
-                                                         sycl::access::address_space::local_space>
-                        (shared_counter, 1u);
-                    IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
-                    int k =  icell /   lenxy;
-                    int j = (icell - k*lenxy) /   lenx;
-                    int i = (icell - k*lenxy) - j*lenx;
-                    i += lo.x;
-                    j += lo.y;
-                    k += lo.z;
-                    p[itag] = IntVect(AMREX_D_DECL(i,j,k));
-                }
-            });
-#else
-            amrex::launch<block_size>(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(),
-            [=] AMREX_GPU_DEVICE () noexcept
-            {
-                int bid = blockIdx.x;
-                int tid = threadIdx.x;
-                int icell = block_size*blockIdx.x+threadIdx.x;
 
-                Gpu::SharedMemory<unsigned int> gsm;
-                unsigned int * shared_counter = gsm.dataPtr();
-                if (tid == 0) {
-                    *shared_counter = 0;
-                }
-                __syncthreads();
+                    unsigned int itag = Gpu::Atomic::Add
+#ifdef AMREX_USE_SYCL
+                        <unsigned int, sycl::access::address_space::local_space>
+#endif
+                        (shared_counter, 1u);
 
-                if (icell < ncells && tags[icell] != TagBox::CLEAR) {
-                    unsigned int itag = Gpu::Atomic::Add(shared_counter, 1u);
                     IntVect* p = dp_tags + dp_tags_offset[iblock_begin+bid];
                     int k =  icell /   lenxy;
                     int j = (icell - k*lenxy) /   lenx;
@@ -579,7 +540,6 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
                     p[itag] = IntVect(AMREX_D_DECL(i,j,k));
                 }
             });
-#endif
         }
     }
 
diff --git a/Src/Base/AMReX_BaseFabUtility.H b/Src/Base/AMReX_BaseFabUtility.H
index d0c1e78847b..c6c125db5be 100644
--- a/Src/Base/AMReX_BaseFabUtility.H
+++ b/Src/Base/AMReX_BaseFabUtility.H
@@ -42,49 +42,27 @@ void fill (BaseFab<STRUCT>& aos_fab, F const& f)
         std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
         AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits<int>::max()));
         auto nblocks = int(nblocks_long);
-        std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT);
+        std::size_t shared_mem_elem = nthreads_per_block * STRUCTSIZE;
         T* p = (T*)aos_fab.dataPtr();
-#ifdef AMREX_USE_SYCL
-        amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
-        [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
+        amrex::LaunchRaw<nthreads_per_block, T>(amrex::IntVectND<1>{nblocks}, shared_mem_elem,
+        [=] AMREX_GPU_DEVICE (auto lh) noexcept
         {
-            auto const icell = std::uint64_t(handler.globalIdx());
-            std::uint64_t const blockDimx = handler.blockDim();
-            std::uint64_t const threadIdxx = handler.threadIdx();
-            std::uint64_t const blockIdxx = handler.blockIdx();
-            auto const shared = (T*)handler.sharedMemory();
+            std::uint64_t const icell =
+                std::uint64_t(lh.blockDim1D())*lh.blockIdx1D()+lh.threadIdx1D();
+            T* const shared = lh.shared_memory();
             if (icell < indexer.numPts()) {
-                auto ga = new(shared+threadIdxx*STRUCTSIZE) STRUCT;
+                auto ga = new(shared+std::uint64_t(lh.threadIdx1D())*STRUCTSIZE) STRUCT;
                 auto [i, j, k] = indexer(icell);
                 f(*ga, i, j, k);
             }
-            handler.sharedBarrier();
-            for (std::uint64_t m = threadIdxx,
-                     mend = amrex::min<std::uint64_t>(blockDimx, indexer.numPts()-blockDimx*blockIdxx) * STRUCTSIZE;
-                 m < mend; m += blockDimx) {
-                p[blockDimx*blockIdxx*STRUCTSIZE+m] = shared[m];
+            lh.syncthreads();
+            for (std::uint64_t m = lh.threadIdx1D(),
+                    mend = amrex::min<std::uint64_t>(lh.blockDim1D(),
+                    indexer.numPts()-std::uint64_t(lh.blockDim1D())*lh.blockIdx1D()) * STRUCTSIZE;
+                 m < mend; m += lh.blockDim1D()) {
+                p[std::uint64_t(lh.blockDim1D())*lh.blockIdx1D()*STRUCTSIZE+m] = shared[m];
             }
         });
-#else
-        amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
-        [=] AMREX_GPU_DEVICE () noexcept
-        {
-            std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
-            Gpu::SharedMemory<T> gsm;
-            T* const shared = gsm.dataPtr();
-            if (icell < indexer.numPts()) {
-                auto ga = new(shared+std::uint64_t(threadIdx.x)*STRUCTSIZE) STRUCT;
-                auto [i, j, k] = indexer(icell);
-                f(*ga, i, j, k);
-            }
-            __syncthreads();
-            for (std::uint64_t m = threadIdx.x,
-                     mend = amrex::min<std::uint64_t>(blockDim.x, indexer.numPts()-std::uint64_t(blockDim.x)*blockIdx.x) * STRUCTSIZE;
-                 m < mend; m += blockDim.x) {
-                p[std::uint64_t(blockDim.x)*blockIdx.x*STRUCTSIZE+m] = shared[m];
-            }
-        });
-#endif
     } else
 #endif
     {
diff --git a/Src/Base/AMReX_FBI.H b/Src/Base/AMReX_FBI.H
index 1b3f8cc13f0..ee9584a4c0e 100644
--- a/Src/Base/AMReX_FBI.H
+++ b/Src/Base/AMReX_FBI.H
@@ -295,25 +295,16 @@ void deterministic_fab_to_fab (Vector<Array4CopyTag<T0,T1>> const& a_tags, int s
     auto const* pntags = d_ntags.data();
     auto const nblocks = int(h_ntags.size()-1);
     constexpr auto nthreads = 256;
-    amrex::launch<nthreads>(nblocks, Gpu::gpuStream(),
-#ifdef AMREX_USE_SYCL
-        [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
-        [[sycl::reqd_work_group_size(nthreads)]]
-#else
-        [=] AMREX_GPU_DEVICE () noexcept
-#endif
-    {
-#ifdef AMREX_USE_SYCL
-        Dim1 blockIdx{item.get_group_linear_id()};
-        Dim1 threadIdx{item.get_local_linear_id()};
-#endif
 
-        for (unsigned int itag = pntags[blockIdx.x]; itag < pntags[blockIdx.x+1]; ++itag) {
+    amrex::LaunchRaw<nthreads>(amrex::IntVectND<1>{nblocks},
+    [=] AMREX_GPU_DEVICE (auto lh) noexcept
+    {
+        for (unsigned int itag = pntags[lh.blockIdx1D()]; itag < pntags[lh.blockIdx1D()+1]; ++itag) {
             auto const tag = ptag[itag];
             auto ncells = int(tag.dbox.numPts());
             const auto len = amrex::length(tag.dbox);
             const auto lo  = amrex::lbound(tag.dbox);
-            for (int icell = int(threadIdx.x); icell < ncells; icell += nthreads) {
+            for (int icell = int(lh.threadIdx1D()); icell < ncells; icell += nthreads) {
                 int k =  icell /   (len.x*len.y);
                 int j = (icell - k*(len.x*len.y)) /   len.x;
                 int i = (icell - k*(len.x*len.y)) - j*len.x;
@@ -328,12 +319,8 @@ void deterministic_fab_to_fab (Vector<Array4CopyTag<T0,T1>> const& a_tags, int s
                 }
             }
 
-            if (itag+1 < pntags[blockIdx.x+1]) {
-#ifdef AMREX_USE_SYCL
-                sycl::group_barrier(item.get_group());
-#else
-                __syncthreads();
-#endif
+            if (itag+1 < pntags[lh.blockIdx1D()+1]) {
+                lh.syncthreads();
             }
         }
     });
diff --git a/Src/Base/AMReX_GpuContainers.H b/Src/Base/AMReX_GpuContainers.H
index 11f4585feda..b1d0be9de89 100644
--- a/Src/Base/AMReX_GpuContainers.H
+++ b/Src/Base/AMReX_GpuContainers.H
@@ -438,51 +438,27 @@ namespace amrex::Gpu {
             auto pu = reinterpret_cast<U*>(p);
             constexpr int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
             int nblocks = static_cast<int>((N+nthreads_per_block-1)/nthreads_per_block);
-            std::size_t shared_mem_bytes = nthreads_per_block * sizeof(T);
-#ifdef AMREX_USE_SYCL
-            amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
-            [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
+            std::size_t shared_mem_Uelem = nthreads_per_block * nU;
+            amrex::LaunchRaw<nthreads_per_block, U>(amrex::IntVectND<1>{nblocks}, shared_mem_Uelem,
+            [=] AMREX_GPU_DEVICE (auto lh) noexcept
             {
-                Long i = handler.globalIdx();
-                Long blockDimx = handler.blockDim();
-                Long threadIdxx = handler.threadIdx();
-                Long blockIdxx = handler.blockIdx();
-                auto const shared_U = (U*)handler.sharedMemory();
-                auto const shared_T = (T*)shared_U;
-                if (i < N) {
-                    auto ga = new(shared_T+threadIdxx) T;
-                    f(*ga, i);
-                }
-                handler.sharedBarrier();
-                for (Long m = threadIdxx,
-                         mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
-                     m < mend; m += blockDimx) {
-                    pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
-                }
-            });
-#else
-            amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
-                          [=] AMREX_GPU_DEVICE () noexcept
-            {
-                Long blockDimx = blockDim.x;
-                Long threadIdxx = threadIdx.x;
-                Long blockIdxx = blockIdx.x;
+                Long blockDimx = lh.blockDim1D();
+                Long threadIdxx = lh.threadIdx1D();
+                Long blockIdxx = lh.blockIdx1D();
                 Long i = blockDimx*blockIdxx + threadIdxx;
-                Gpu::SharedMemory<U> gsm;
-                auto const shared_U = gsm.dataPtr();
+                auto const shared_U = lh.shared_memory();
                 auto const shared_T = (T*)shared_U;
                 if (i < N) {
                     auto ga = new(shared_T+threadIdxx) T;
                     f(*ga, i);
                 }
-                __syncthreads();
+                lh.syncthreads();
                 for (Long m = threadIdxx,
                          mend = nU * amrex::min(blockDimx, N-blockDimx*blockIdxx);
                      m < mend; m += blockDimx) {
                     pu[blockDimx*blockIdxx*nU+m] = shared_U[m];
                 }
             });
-#endif
         }
 #endif
     }
diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H
index 0c13b1cb6bd..4a10a3a70ed 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsC.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsC.H
@@ -165,19 +165,19 @@ void LaunchRaw (IntVectND<dim> nblocks, std::size_t shared_mem_elements, L const
     std::vector<T> smem(shared_mem_elements);
     if constexpr(dim == 1) {
         for (int bx=0; bx < nblocks[0]; ++bx) {
-            f(Gpu::LaunchHandler<MT, void, dim>{{bx}, nblocks, smem.data()});
+            f(Gpu::LaunchHandler<MT, T, dim>{{bx}, nblocks, smem.data()});
         }
     } else if constexpr(dim == 2) {
         for (int by=0; by < nblocks[1]; ++by) {
             for (int bx=0; bx < nblocks[0]; ++bx) {
-                f(Gpu::LaunchHandler<MT, void, dim>{{bx, by}, nblocks, smem.data()});
+                f(Gpu::LaunchHandler<MT, T, dim>{{bx, by}, nblocks, smem.data()});
             }
         }
     } else {
         for (int bz=0; bz < nblocks[2]; ++bz) {
             for (int by=0; by < nblocks[1]; ++by) {
                 for (int bx=0; bx < nblocks[0]; ++bx) {
-                    f(Gpu::LaunchHandler<MT, void, dim>{{bx, by, bz}, nblocks, smem.data()});
+                    f(Gpu::LaunchHandler<MT, T, dim>{{bx, by, bz}, nblocks, smem.data()});
                 }
             }
         }
diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
index 3afe26ec9e9..b647c58bd3f 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsG.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -726,7 +726,6 @@ void LaunchRaw (IntVectND<dim> nblocks, L const& f)
 
     try {
         q.submit([&] (sycl::handler& h) {
-
             h.parallel_for(sycl::nd_range<dim>(threads_total, threads_per_block),
                 [=] (sycl::nd_item<dim> item)
                 [[sycl::reqd_work_group_size(1, 1, MT)]]
@@ -759,7 +758,6 @@ void LaunchRaw (IntVectND<dim> nblocks, std::size_t shared_mem_elements, L const
     try {
         q.submit([&] (sycl::handler& h) {
             sycl::local_accessor<T> shared_data(sycl::range<1>(shared_mem_elements), h);
-
             h.parallel_for(sycl::nd_range<dim>(threads_total, threads_per_block),
                 [=] (sycl::nd_item<dim> item)
                 [[sycl::reqd_work_group_size(1, 1, MT)]]
diff --git a/Src/Base/AMReX_GpuReduce.H b/Src/Base/AMReX_GpuReduce.H
index b0d8c451427..45aca944157 100644
--- a/Src/Base/AMReX_GpuReduce.H
+++ b/Src/Base/AMReX_GpuReduce.H
@@ -330,6 +330,13 @@ T blockReduce (T x, WARPREDUCE && warp_reduce, T x0)
     return x;
 }
 
+template <int warpSize, typename T, typename WARPREDUCE>
+AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const&)
+{
+    return blockReduce<warpSize>(x, warp_reduce, x0);
+}
+
 template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
 void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index 9731f23f5cf..7a63dd050cf 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -107,6 +107,8 @@ struct Handler {};
 template<unsigned int threads_per_block, typename T, int dim>
 struct LaunchHandler
 {
+    static_assert(dim >= 1 && dim <= 3);
+
 #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
     LaunchHandler() = default;
 #elif defined(AMREX_USE_SYCL)
@@ -243,6 +245,17 @@ struct LaunchHandler
 #endif
     }
 
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    auto handler () const {
+#if defined(AMREX_USE_SYCL)
+        return Gpu::Handler(m_item, reinterpret_cast<void*>(m_shared_mem), threads_per_block);
+#elif defined(AMREX_USE_GPU)
+        return Gpu::Handler(threads_per_block);
+#else
+        return 0;
+#endif
+    }
+
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     auto sycl_item () const {
 #if defined(AMREX_USE_SYCL)
@@ -257,17 +270,13 @@ private:
 #if defined(AMREX_USE_SYCL)
     sycl::nd_item<dim> const* m_item;
     T * m_shared_mem;
-#endif
-
-#if !defined(AMREX_USE_GPU)
+#elif !defined(AMREX_USE_GPU)
     IntVectND<dim> m_blockid;
     IntVectND<dim> m_griddim;
     T * m_shared_mem;
 #endif
 };
 
-
-
 }
 
 #endif
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp b/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp
index 5e1fb62fc20..d9a29036d70 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp
@@ -85,20 +85,12 @@ MLEBTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel,
 
 #ifdef AMREX_USE_GPU
             if (Gpu::inLaunchRegion()) {
-                amrex::launch<64>(12, Gpu::gpuStream(),
-#ifdef AMREX_USE_SYCL
-                [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item)
+                amrex::LaunchRaw<64>(amrex::IntVectND<1>{12},
+                [=] AMREX_GPU_DEVICE (auto lh)
                 {
-                    int bid = item.get_group_linear_id();
-                    int tid = item.get_local_linear_id();
-                    int bdim = item.get_local_range(0);
-#else
-                [=] AMREX_GPU_DEVICE ()
-                {
-                    int bid = blockIdx.x;
-                    int tid = threadIdx.x;
-                    int bdim = blockDim.x;
-#endif
+                    int bid = lh.blockIdx1D();
+                    int tid = lh.threadIdx1D();
+                    int bdim = lh.blockDim1D();
                     mltensor_fill_edges(bid, tid, bdim, vbx, velfab,
                                         mxlo, mylo, mzlo, mxhi, myhi, mzhi,
                                         bvxlo, bvylo, bvzlo, bvxhi, bvyhi, bvzhi,
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp
index 8d72107d541..dfd8dce0544 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp
@@ -408,20 +408,12 @@ MLTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, // NOLINT(reada
         // only edge vals used in 3D stencil
 #ifdef AMREX_USE_GPU
         if (Gpu::inLaunchRegion()) {
-            amrex::launch<64>(12, Gpu::gpuStream(),
-#ifdef AMREX_USE_SYCL
+            amrex::LaunchRaw<64>(amrex::IntVectND<1>{12},
             [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item)
             {
-                int bid = item.get_group_linear_id();
-                int tid = item.get_local_linear_id();
-                int bdim = item.get_local_range(0);
-#else
-            [=] AMREX_GPU_DEVICE ()
-            {
-                int bid = blockIdx.x;
-                int tid = threadIdx.x;
-                int bdim = blockDim.x;
-#endif
+                int bid = lh.blockIdx1D();
+                int tid = lh.threadIdx1D();
+                int bdim = lh.blockDim1D();
                 mltensor_fill_edges(bid, tid, bdim, vbx, velfab,
                                     mxlo, mylo, mzlo, mxhi, myhi, mzhi,
                                     bvxlo, bvylo, bvzlo, bvxhi, bvyhi, bvzhi,
diff --git a/Src/Particle/AMReX_ParticleUtil.H b/Src/Particle/AMReX_ParticleUtil.H
index fe1b599f0e4..c2278f80042 100644
--- a/Src/Particle/AMReX_ParticleUtil.H
+++ b/Src/Particle/AMReX_ParticleUtil.H
@@ -799,12 +799,13 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n
         pllist_next[i] = Gpu::Atomic::Exch(pllist_start + f(i), i);
     });
 
-#if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
-    amrex::launch<gpu_block_size>(nbins / gpu_block_size, Gpu::gpuStream(),
-        [pllist_start,pllist_next,pperm,pglobal_idx] AMREX_GPU_DEVICE () {
-            __shared__ index_type sdata[gpu_block_size];
-            __shared__ index_type global_idx_start;
-            __shared__ index_type idx_start;
+#if defined(AMREX_USE_GPU)
+    amrex::LaunchRaw<gpu_block_size, index_type>(amrex::IntVectND<1>{nbins / gpu_block_size},
+        gpu_block_size + 2,
+        [pllist_start,pllist_next,pperm,pglobal_idx] AMREX_GPU_DEVICE (auto lh) {
+            index_type* sdata = lh.shared_memory();
+            index_type& global_idx_start = *(sdata + gpu_block_size);
+            index_type& idx_start = *(sdata + gpu_block_size + 1);
 
             index_type current_idx = 0;
 
@@ -812,7 +813,7 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n
                 // Compressed layout: subsequent sweeps of up to gpu_block_size contiguous particles
                 // are put right next to each other, while without the compressed layout,
                 // there can be other particle sweeps from different locations between them.
-                current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x];
+                current_idx = pllist_start[lh.threadIdx1D() + gpu_block_size * lh.blockIdx1D()];
 
                 index_type num_particles_thread = 0;
                 while (current_idx != llist_guard) {
@@ -823,33 +824,33 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n
                 index_type num_particles_block =
                     Gpu::blockReduceSum<gpu_block_size>(num_particles_thread);
 
-                if (threadIdx.x == 0) {
+                if (lh.threadIdx1D() == 0) {
                     global_idx_start = Gpu::Atomic::Add(pglobal_idx, num_particles_block);
                 }
             }
 
-            current_idx = pllist_start[threadIdx.x + gpu_block_size * blockIdx.x];
+            current_idx = pllist_start[lh.threadIdx1D() + gpu_block_size * lh.blockIdx1D()];
 
             while (true) {
-                sdata[threadIdx.x] = index_type(current_idx != llist_guard);
+                sdata[lh.threadIdx1D()] = index_type(current_idx != llist_guard);
                 index_type x = 0;
 
                 // simple block wide prefix sum
                 for (index_type i = 1; i<gpu_block_size; i*=2) {
-                    __syncthreads();
-                    if (threadIdx.x >= i) {
-                        x = sdata[threadIdx.x - i];
+                    lh.syncthreads();
+                    if (lh.threadIdx1D() >= i) {
+                        x = sdata[lh.threadIdx1D() - i];
                     }
-                    __syncthreads();
-                    if (threadIdx.x >= i) {
-                        sdata[threadIdx.x] += x;
+                    lh.syncthreads();
+                    if (lh.threadIdx1D() >= i) {
+                        sdata[lh.threadIdx1D()] += x;
                     }
                 }
-                __syncthreads();
+                lh.syncthreads();
                 if (sdata[gpu_block_size_m1] == 0) {
                     break;
                 }
-                if (threadIdx.x == gpu_block_size_m1) {
+                if (lh.threadIdx1D() == gpu_block_size_m1) {
                     if constexpr (compressed_layout) {
                         idx_start = global_idx_start;
                         global_idx_start += sdata[gpu_block_size_m1];
@@ -857,17 +858,17 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n
                         idx_start = Gpu::Atomic::Add(pglobal_idx, sdata[gpu_block_size_m1]);
                     }
                 }
-                __syncthreads();
-                sdata[threadIdx.x] += idx_start;
+                lh.syncthreads();
+                sdata[lh.threadIdx1D()] += idx_start;
                 if (current_idx != llist_guard) {
-                    pperm[sdata[threadIdx.x] - 1] = current_idx;
+                    pperm[sdata[lh.threadIdx1D()] - 1] = current_idx;
                     current_idx = pllist_next[current_idx];
                 }
             }
         });
 #else
     amrex::ignore_unused(pperm, pglobal_idx, compressed_layout);
-    Abort("PermutationForDeposition only implemented for CUDA and HIP");
+    Abort("PermutationForDeposition only implemented for GPU");
 #endif
 
     Gpu::Device::streamSynchronize();

From 1a0897b3e7296778cbeb6e729fb66beb3f7a65e0 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Tue, 10 Feb 2026 17:26:52 +0100
Subject: [PATCH 11/46] Use LaunchRaw in more functions

---
 Src/AmrCore/AMReX_TagBox.cpp                |  10 +-
 Src/Base/AMReX_GpuReduce.H                  |   7 -
 Src/Base/AMReX_MultiFabUtil.H               |  22 +-
 Src/Base/AMReX_MultiFabUtil.cpp             |  29 +--
 Src/Base/AMReX_Reduce.H                     | 247 +++++++-------------
 Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp |   2 +-
 6 files changed, 105 insertions(+), 212 deletions(-)

diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp
index 066c863a01f..3600d42cc62 100644
--- a/Src/AmrCore/AMReX_TagBox.cpp
+++ b/Src/AmrCore/AMReX_TagBox.cpp
@@ -448,11 +448,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
         const char* tags = (*this)[fai].dataPtr();
 
         amrex::LaunchRaw<block_size, int>(amrex::IntVectND<1>{nblocks[li]},
-#ifdef AMREX_USE_SYCL
-            Gpu::Device::warp_size,
-#else
-            0,
-#endif
+        AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size),
         [=] AMREX_GPU_DEVICE (auto lh) noexcept
         {
             int bid = lh.blockIdx1D();
@@ -465,8 +461,8 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
             }
 
             t = Gpu::blockReduce<Gpu::Device::warp_size>
-                (t, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0,
-                lh.handler());
+                (t, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0
+                AMREX_SYCL_REDUCE_HANDLER(lh.handler()));
             if (tid == 0) {
                 ntags[bid] = t;
             }
diff --git a/Src/Base/AMReX_GpuReduce.H b/Src/Base/AMReX_GpuReduce.H
index 45aca944157..b0d8c451427 100644
--- a/Src/Base/AMReX_GpuReduce.H
+++ b/Src/Base/AMReX_GpuReduce.H
@@ -330,13 +330,6 @@ T blockReduce (T x, WARPREDUCE && warp_reduce, T x0)
     return x;
 }
 
-template <int warpSize, typename T, typename WARPREDUCE>
-AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-T blockReduce (T x, WARPREDUCE && warp_reduce, T x0, Gpu::Handler const&)
-{
-    return blockReduce<warpSize>(x, warp_reduce, x0);
-}
-
 template <int warpSize, typename T, typename WARPREDUCE, typename ATOMICOP>
 AMREX_GPU_DEVICE AMREX_FORCE_INLINE
 void blockReduce_partial (T* dest, T x, WARPREDUCE && warp_reduce, ATOMICOP && atomic_op,
diff --git a/Src/Base/AMReX_MultiFabUtil.H b/Src/Base/AMReX_MultiFabUtil.H
index 440f5e27f2a..ce890a249a0 100644
--- a/Src/Base/AMReX_MultiFabUtil.H
+++ b/Src/Base/AMReX_MultiFabUtil.H
@@ -1209,20 +1209,14 @@ void reduce_to_plane (Array4<T> const& ar, int direction, Box const& bx, int box
             const auto len = amrex::length(bx);
             constexpr int nthreads = 128;
             auto nblocks = static_cast<int>(b2d.numPts());
-#ifdef AMREX_USE_SYCL
-            constexpr std::size_t shared_mem_bytes = sizeof(T)*Gpu::Device::warp_size;
-            amrex::launch<nthreads>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
-                                    [=] AMREX_GPU_DEVICE (Gpu::Handler const& h)
-            {
-                int bid = h.blockIdx();
-                int tid = h.threadIdx();
-#else
-            amrex::launch<nthreads>(nblocks, Gpu::gpuStream(),
-                                    [=] AMREX_GPU_DEVICE ()
+
+            amrex::LaunchRaw<nthreads, T>(amrex::IntVectND<1>{nblocks},
+                AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size),
+                [=] AMREX_GPU_DEVICE (auto lh)
             {
-                int bid = blockIdx.x;
-                int tid = threadIdx.x;
-#endif
+                int bid = lh.blockIdx1D();
+                int tid = lh.threadIdx1D();
+
                 T tmp;
                 Op().init(tmp);
                 T* p;
@@ -1255,7 +1249,7 @@ void reduce_to_plane (Array4<T> const& ar, int direction, Box const& bx, int box
                     p = ar.ptr(i,j,0);
                 }
 #ifdef AMREX_USE_SYCL
-                Op().template parallel_update<T>(*p, tmp, h);
+                Op().template parallel_update<T>(*p, tmp, lh.handler());
 #else
                 Op().template parallel_update<T,nthreads>(*p, tmp);
 #endif
diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp
index 16b7afc362f..de1685b9b5a 100644
--- a/Src/Base/AMReX_MultiFabUtil.cpp
+++ b/Src/Base/AMReX_MultiFabUtil.cpp
@@ -866,22 +866,14 @@ namespace amrex
                 }
                 int n2dblocks = (n2d+AMREX_GPU_MAX_THREADS-1)/AMREX_GPU_MAX_THREADS;
                 int nblocks = n2dblocks * b.length(direction);
-#ifdef AMREX_USE_SYCL
-                std::size_t shared_mem_byte = sizeof(Real)*Gpu::Device::warp_size;
-                amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks, shared_mem_byte, Gpu::gpuStream(),
-                              [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
-#else
-                amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks, Gpu::gpuStream(),
-                              [=] AMREX_GPU_DEVICE () noexcept
-#endif
+
+                amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, amrex::Real>(amrex::IntVectND<1>{nblocks},
+                    AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size),
+                    [=] AMREX_GPU_DEVICE (auto lh) noexcept
                 {
-#ifdef AMREX_USE_SYCL
-                    int i1d = h.blockIdx() / n2dblocks;
-                    int i2d = h.threadIdx() + AMREX_GPU_MAX_THREADS*(h.blockIdx()-i1d*n2dblocks);
-#else
-                    int i1d = blockIdx.x / n2dblocks;
-                    int i2d = threadIdx.x + AMREX_GPU_MAX_THREADS*(blockIdx.x-i1d*n2dblocks);
-#endif
+                    int i1d = lh.blockIdx1D() / n2dblocks;
+                    int i2d = lh.threadIdx1D() +
+                        AMREX_GPU_MAX_THREADS*(lh.blockIdx1D()-i1d*n2dblocks);
                     int i2dy = i2d / n2dx;
                     int i2dx = i2d - i2dy*n2dx;
                     int i, j, k, idir;
@@ -903,11 +895,8 @@ namespace amrex
                     }
                     for (int n = 0; n < ncomp; ++n) {
                         Real r = (i2d < n2d) ? fab(i,j,k,n+icomp) : Real(0.0);
-#ifdef AMREX_USE_SYCL
-                        Gpu::deviceReduceSum_full(p+n+ncomp*idir, r, h);
-#else
-                        Gpu::deviceReduceSum_full(p+n+ncomp*idir, r);
-#endif
+                        Gpu::deviceReduceSum_full(p+n+ncomp*idir, r
+                            AMREX_SYCL_REDUCE_HANDLER(lh.handler()));
                     }
                 });
             }
diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H
index f97ecbaaa9e..e7f33db5569 100644
--- a/Src/Base/AMReX_Reduce.H
+++ b/Src/Base/AMReX_Reduce.H
@@ -229,6 +229,10 @@ namespace Reduce::detail {
         P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
         for_each_parallel<I+1,T,P1,Ps...>(d, s, h);
     }
+
+#define AMREX_SYCL_REDUCE_SMEM(warp_size) warp_size
+#define AMREX_SYCL_REDUCE_HANDLER(launch_handler) , launch_handler
+
 #else
     template <std::size_t I, typename T, typename P>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
@@ -244,6 +248,10 @@ namespace Reduce::detail {
         P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
         for_each_parallel<I+1,T,P1,Ps...>(d, s);
     }
+
+#define AMREX_SYCL_REDUCE_SMEM(warp_size) 0
+#define AMREX_SYCL_REDUCE_HANDLER(launch_handler)
+
 #endif
 #endif
 
@@ -604,29 +612,22 @@ public:
             reduce_data.nBlocks(stream) = nblocks_ec;
             reduce_data.updateMaxStreamIndex(stream);
 
-#ifdef AMREX_USE_SYCL
             // device reduce needs local(i.e., shared) memory
-            constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-            amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
-                          [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
+            amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, unsigned long long>(
+                amrex::IntVectND<1>{nblocks_ec},
+                AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size),
+                [=] AMREX_GPU_DEVICE (auto lh) noexcept
             {
-                Dim1 blockIdx {gh.blockIdx()};
-                Dim1 threadIdx{gh.threadIdx()};
-#else
-            amrex::launch_global<AMREX_GPU_MAX_THREADS>
-                <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
-                ([=] AMREX_GPU_DEVICE () noexcept
-            {
-#endif
                 ReduceTuple r;
                 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
                 ReduceTuple& dst = pdst[blockIdx.x];
-                if (threadIdx.x == 0) {
+                if (lh.threadIdx1D() == 0) {
                     dst = r;
                 }
-                for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
+                for (int iblock = lh.blockIdx1D(); iblock < nblocks; iblock += nblocks_ec) {
                     int ibox = iblock / nblocks_per_box;
-                    auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + threadIdx.x;
+                    auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS
+                        + lh.threadIdx1D();
 
                     BoxIndexer const& indexer = dp_boxes[ibox];
                     if (icell < indexer.numPts()) {
@@ -635,11 +636,8 @@ public:
                             (f, ibox, i, j, k, ncomp, r);
                     }
                 }
-#ifdef AMREX_USE_SYCL
-                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
-#else
-                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
-#endif
+                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r
+                    AMREX_SYCL_REDUCE_HANDLER(lh.handler()));
             });
         }
     }
@@ -659,28 +657,20 @@ public:
             / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
         nblocks_ec = std::min<Long>(nblocks_ec, reduce_data.maxBlocks());
         reduce_data.updateMaxStreamIndex(stream);
-#ifdef AMREX_USE_SYCL
-        // device reduce needs local(i.e., shared) memory
-        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
-        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
+        amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, unsigned long long>(
+            amrex::IntVectND<1>{static_cast<int>(nblocks_ec)},
+            AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size),
+            [=] AMREX_GPU_DEVICE (auto lh) noexcept
         {
-            Dim1 blockIdx {gh.blockIdx()};
-            Dim1 threadIdx{gh.threadIdx()};
-            Dim1 gridDim  {gh.gridDim()};
-#else
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
-        [=] AMREX_GPU_DEVICE () noexcept
-        {
-#endif
             ReduceTuple r;
             Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
-            ReduceTuple& dst = *(dp+blockIdx.x);
-            if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
+            ReduceTuple& dst = *(dp+lh.blockIdx1D());
+            if (lh.threadIdx1D() == 0 && static_cast<int>(lh.blockIdx1D()) >= nblocks) {
                 dst = r;
             }
-            for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
-                 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
+            for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*lh.blockIdx1D()
+                + lh.threadIdx1D(),
+                 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*nblocks_ec;
                  icell < indexer.numPts();
                  icell += stride)
             {
@@ -696,11 +686,8 @@ public:
                     }
                 }
             }
-#ifdef AMREX_USE_SYCL
-            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
-#else
-            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
-#endif
+            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r
+                AMREX_SYCL_REDUCE_HANDLER(lh.handler()));
         });
         nblocks = std::max(nblocks, static_cast<int>(nblocks_ec));
     }
@@ -792,39 +779,28 @@ public:
             / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
         nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
         reduce_data.updateMaxStreamIndex(stream);
-#ifdef AMREX_USE_SYCL
         // device reduce needs local(i.e., shared) memory
-        constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
-        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
+        amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, unsigned long long>(
+            amrex::IntVectND<1>{nblocks_ec},
+            AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size),
+            [=] AMREX_GPU_DEVICE (auto lh) noexcept
         {
-            Dim1 blockIdx {gh.blockIdx()};
-            Dim1 threadIdx{gh.threadIdx()};
-            Dim1 gridDim  {gh.gridDim()};
-#else
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
-        [=] AMREX_GPU_DEVICE () noexcept
-        {
-#endif
             ReduceTuple r;
             Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
-            ReduceTuple& dst = *(dp+blockIdx.x);
-            if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
+            ReduceTuple& dst = *(dp+lh.blockIdx1D());
+            if (lh.threadIdx1D() == 0 && static_cast<int>(lh.blockIdx1D()) >= nblocks) {
                 dst = r;
             }
-            for (N i = N(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
-                 stride = N(AMREX_GPU_MAX_THREADS)*gridDim.x;
+            for (N i = N(AMREX_GPU_MAX_THREADS)*lh.blockIdx1D()+lh.threadIdx1D(),
+                 stride = N(AMREX_GPU_MAX_THREADS)*nblocks_ec;
                  i < n;
                  i += stride)
             {
                 auto pr = f(i);
                 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
             }
-#ifdef AMREX_USE_SYCL
-            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
-#else
-            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
-#endif
+            Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r
+                AMREX_SYCL_REDUCE_HANDLER(lh.handler()));
         });
         nblocks = amrex::max(nblocks, nblocks_ec);
     }
@@ -860,52 +836,34 @@ public:
 #endif
         {
             int maxblocks = reduce_data.maxBlocks();
-#ifdef AMREX_USE_SYCL
-            // device reduce needs local(i.e., shared) memory
-            constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
+#if !defined(AMREX_NO_SYCL_REDUCE_WORKAROUND) && defined(AMREX_USE_SYCL)
             // xxxxx SYCL todo: reduce bug workaround
             Gpu::DeviceVector<ReduceTuple> dtmp(1);
             auto presult = dtmp.data();
 #else
             auto presult = hp;
 #endif
-            amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
-            [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
+            amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, unsigned long long>(
+                amrex::IntVectND<1>{1},
+                AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size),
+                [=] AMREX_GPU_DEVICE (auto lh) noexcept
             {
                 ReduceTuple r;
                 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
                 ReduceTuple dst = r;
                 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
                     auto dp_stream = dp+istream*maxblocks;
-                    for (int i = gh.item->get_global_id(0), stride = gh.item->get_global_range(0);
+                    for (int i = lh.globalIdx1D(), stride = AMREX_GPU_MAX_THREADS;
                          i < nblocks[istream]; i += stride) {
                         Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
                     }
                 }
-                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r, gh);
-                if (gh.threadIdx() == 0) { *presult = dst; }
+                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r
+                    AMREX_SYCL_REDUCE_HANDLER(lh.handler()));
+                if (lh.threadIdx1D() == 0) { *presult = dst; }
             });
-#ifndef AMREX_NO_SYCL_REDUCE_WORKAROUND
+#if !defined(AMREX_NO_SYCL_REDUCE_WORKAROUND) && defined(AMREX_USE_SYCL)
             Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
-#endif
-#else
-            amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
-            [=] AMREX_GPU_DEVICE () noexcept
-            {
-                ReduceTuple r;
-                Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
-                ReduceTuple dst = r;
-                for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
-                    auto dp_stream = dp+istream*maxblocks;
-                    for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
-                         i < nblocks[istream]; i += stride) {
-                        Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
-                    }
-                }
-                Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r);
-                if (threadIdx.x == 0) { *hp = dst; }
-            });
 #endif
             Gpu::streamSynchronize();
         }
@@ -1036,51 +994,34 @@ bool AnyOf (N n, T const* v, P const& pred)
     int* dp = ds.dataPtr();
     auto ec = Gpu::ExecutionConfig(n);
     ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
-
-#ifdef AMREX_USE_SYCL
-    const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
-    const std::size_t shared_mem_bytes = num_ints*sizeof(int);
-    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
-        int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
-        if (gh.threadIdx() == 0) { *has_any = *dp; }
-        gh.sharedBarrier();
-
-        if (!(*has_any))
-        {
-            int r = false;
-            for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
-                 i < n && !r; i += stride)
-            {
-                r = pred(v[i]) ? 1 : 0;
-            }
-
-            r = Gpu::blockReduce<Gpu::Device::warp_size>
-                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
-            if (gh.threadIdx() == 0 && r) { *dp = 1; }
-        }
-    });
-#else
-    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE () noexcept {
-        __shared__ int has_any;
-        if (threadIdx.x == 0) { has_any = *dp; }
-        __syncthreads();
+    const int nblocks_ec = ec.numBlocks.x;
+    [[maybe_unused]] const int num_ints =
+        std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size);
+
+    amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, int>(
+        amrex::IntVectND<1>{nblocks_ec},
+        AMREX_SYCL_REDUCE_SMEM(num_ints) + 1,
+        [=] AMREX_GPU_DEVICE (auto lh) noexcept
+    {
+        int& has_any = *(lh.shared_memory() + AMREX_SYCL_REDUCE_SMEM(num_ints));
+        if (lh.threadIdx1D() == 0) { has_any = *dp; }
+        lh.syncthreads();
 
         if (!has_any)
         {
             int r = false;
-            for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
-                 i < n && !r; i += stride)
+            for (N i = AMREX_GPU_MAX_THREADS*lh.blockIdx1D()+lh.threadIdx1D(),
+                stride = AMREX_GPU_MAX_THREADS*nblocks_ec;
+                i < n && !r; i += stride)
             {
                 r = pred(v[i]) ? 1 : 0;
             }
             r = Gpu::blockReduce<Gpu::Device::warp_size>
-                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
+                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0
+                AMREX_SYCL_REDUCE_HANDLER(lh.handler()));
             if (threadIdx.x == 0 && r) *dp = 1;
         }
     });
-#endif
     return ds.dataValue();
 }
 
@@ -1093,45 +1034,25 @@ bool AnyOf (BoxND<dim> const& box, P const& pred)
     const BoxIndexerND<dim> indexer(box);
     auto ec = Gpu::ExecutionConfig(box.numPts());
     ec.numBlocks.x = std::min(ec.numBlocks.x, Gpu::Device::maxBlocksPerLaunch());
-
-#ifdef AMREX_USE_SYCL
-    const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
-    const std::size_t shared_mem_bytes = num_ints*sizeof(int);
-    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
-        int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
-        if (gh.threadIdx() == 0) { *has_any = *dp; }
-        gh.sharedBarrier();
-
-        if (!(*has_any))
-        {
-            int r = false;
-            for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.blockIdx()+gh.threadIdx(),
-                 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gh.gridDim();
-                 icell < indexer.numPts() && !r;
-                 icell += stride)
-            {
-                auto iv = indexer.intVect(icell);
-                r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0;
-            }
-            r = Gpu::blockReduce<Gpu::Device::warp_size>
-                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0, gh);
-            if (gh.threadIdx() == 0 && r) { *dp = 1; }
-        }
-    });
-#else
-    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0,
-                        Gpu::gpuStream(),
-    [=] AMREX_GPU_DEVICE () noexcept {
-        __shared__ int has_any;
-        if (threadIdx.x == 0) { has_any = *dp; }
-        __syncthreads();
+    const int nblocks_ec = ec.numBlocks.x;
+    [[maybe_unused]] const int num_ints =
+        std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size);
+
+    amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, int>(
+        amrex::IntVectND<1>{nblocks_ec},
+        AMREX_SYCL_REDUCE_SMEM(num_ints) + 1,
+        [=] AMREX_GPU_DEVICE (auto lh) noexcept
+    {
+        int& has_any = *(lh.shared_memory() + AMREX_SYCL_REDUCE_SMEM(num_ints));
+        if (lh.threadIdx1D() == 0) { has_any = *dp; }
+        lh.syncthreads();
 
         if (!has_any)
         {
             int r = false;
-            for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*blockIdx.x+threadIdx.x,
-                 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*gridDim.x;
+            for (std::uint64_t icell = std::uint64_t(AMREX_GPU_MAX_THREADS)*
+                    lh.blockIdx1D()+lh.threadIdx1D(),
+                 stride = std::uint64_t(AMREX_GPU_MAX_THREADS)*nblocks_ec;
                  icell < indexer.numPts() && !r;
                  icell += stride)
             {
@@ -1139,11 +1060,11 @@ bool AnyOf (BoxND<dim> const& box, P const& pred)
                 r = amrex::detail::call_f_intvect(pred, iv) ? 1 : 0;
             }
             r = Gpu::blockReduce<Gpu::Device::warp_size>
-                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0);
-            if (threadIdx.x == 0 && r) *dp = 1;
+                (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0
+                AMREX_SYCL_REDUCE_HANDLER(lh.handler()));
+            if (lh.threadIdx1D() == 0 && r) *dp = 1;
         }
     });
-#endif
     return ds.dataValue();
 }
 
diff --git a/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp b/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp
index dfd8dce0544..372fd1ae09f 100644
--- a/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp
+++ b/Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp
@@ -409,7 +409,7 @@ MLTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, // NOLINT(reada
 #ifdef AMREX_USE_GPU
         if (Gpu::inLaunchRegion()) {
             amrex::LaunchRaw<64>(amrex::IntVectND<1>{12},
-            [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item)
+            [=] AMREX_GPU_DEVICE (auto lh)
             {
                 int bid = lh.blockIdx1D();
                 int tid = lh.threadIdx1D();

From 4956fde3e02934a8f13d5b1958af9fd511756a5d Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Tue, 10 Feb 2026 17:34:52 +0100
Subject: [PATCH 12/46] fix

---
 Src/Base/AMReX_Reduce.H | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H
index e7f33db5569..260f8be11f8 100644
--- a/Src/Base/AMReX_Reduce.H
+++ b/Src/Base/AMReX_Reduce.H
@@ -620,7 +620,7 @@ public:
             {
                 ReduceTuple r;
                 Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
-                ReduceTuple& dst = pdst[blockIdx.x];
+                ReduceTuple& dst = pdst[lh.blockIdx1D()];
                 if (lh.threadIdx1D() == 0) {
                     dst = r;
                 }
@@ -815,7 +815,6 @@ public:
         }
 
         using ReduceTuple = typename D::Type;
-        auto const& stream = Gpu::gpuStream();
         auto dp = reduce_data.devicePtr();
         auto const& nblocks = reduce_data.nBlocks();
 #if defined(AMREX_USE_SYCL)
@@ -1019,7 +1018,7 @@ bool AnyOf (N n, T const* v, P const& pred)
             r = Gpu::blockReduce<Gpu::Device::warp_size>
                 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0
                 AMREX_SYCL_REDUCE_HANDLER(lh.handler()));
-            if (threadIdx.x == 0 && r) *dp = 1;
+            if (lh.threadIdx1D() == 0 && r) *dp = 1;
         }
     });
     return ds.dataValue();

From 19c909a79ae97d9f9077031d6cf9897272114757 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Tue, 10 Feb 2026 17:43:08 +0100
Subject: [PATCH 13/46] fix sycl

---
 Src/Base/AMReX_GpuTypes.H | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index 7a63dd050cf..4928328ce7e 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -153,7 +153,7 @@ struct LaunchHandler
         if constexpr (dim == 1) {
             return static_cast<unsigned int>(AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
                 blockIdx.x,
-                m_item->get_group_id(0),
+                m_item->get_group(0),
                 m_blockid[0]
             ));
         } else if constexpr (dim == 2) {
@@ -177,15 +177,15 @@ struct LaunchHandler
         if constexpr (dim == 1) {
             return AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
                 IntVectND<dim>(static_cast<int>(blockIdx.x)),
-                IntVectND<dim>(static_cast<int>(m_item->get_group_id(0))),
+                IntVectND<dim>(static_cast<int>(m_item->get_group(0))),
                 m_blockid
             );
         } else if constexpr (dim == 2) {
             return AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
                 IntVectND<dim>(static_cast<int>(blockIdx.x),
                                static_cast<int>(blockIdx.y)),
-                IntVectND<dim>(static_cast<int>(m_item->get_group_id(1)),
-                               static_cast<int>(m_item->get_group_id(0))),
+                IntVectND<dim>(static_cast<int>(m_item->get_group(1)),
+                               static_cast<int>(m_item->get_group(0))),
                 m_blockid
             );
         } else {
@@ -193,9 +193,9 @@ struct LaunchHandler
                 IntVectND<dim>(static_cast<int>(blockIdx.x),
                                static_cast<int>(blockIdx.y),
                                static_cast<int>(blockIdx.z)),
-                IntVectND<dim>(static_cast<int>(m_item->get_group_id(2)),
-                               static_cast<int>(m_item->get_group_id(1)),
-                               static_cast<int>(m_item->get_group_id(0))),
+                IntVectND<dim>(static_cast<int>(m_item->get_group(2)),
+                               static_cast<int>(m_item->get_group(1)),
+                               static_cast<int>(m_item->get_group(0))),
                 m_blockid
             );
         }

From c0a70a9240d55a18f3210a997372f9d7ae7f15c3 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Tue, 10 Mar 2026 11:35:17 +0100
Subject: [PATCH 14/46] Add AMREX_IF_SYCL and nodiscard

---
 Src/AmrCore/AMReX_TagBox.cpp    |  4 ++--
 Src/Base/AMReX_GpuControl.H     |  8 ++++++++
 Src/Base/AMReX_GpuTypes.H       | 24 +++++++++++-----------
 Src/Base/AMReX_MultiFabUtil.H   |  2 +-
 Src/Base/AMReX_MultiFabUtil.cpp |  4 ++--
 Src/Base/AMReX_Reduce.H         | 36 +++++++++++++--------------------
 6 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/Src/AmrCore/AMReX_TagBox.cpp b/Src/AmrCore/AMReX_TagBox.cpp
index 2c71eafbfbe..62e6d051490 100644
--- a/Src/AmrCore/AMReX_TagBox.cpp
+++ b/Src/AmrCore/AMReX_TagBox.cpp
@@ -448,7 +448,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
         const char* tags = (*this)[fai].dataPtr();
 
         amrex::LaunchRaw<block_size, int>(amrex::IntVectND<1>{nblocks[li]},
-        AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size),
+        AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0),
         [=] AMREX_GPU_DEVICE (auto lh) noexcept
         {
             int bid = lh.blockIdx1D();
@@ -462,7 +462,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
 
             t = Gpu::blockReduce<Gpu::Device::warp_size>
                 (t, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0
-                AMREX_SYCL_REDUCE_HANDLER(lh.handler()));
+                AMREX_IF_SYCL(, lh.handler()));
             if (tid == 0) {
                 ntags[bid] = t;
             }
diff --git a/Src/Base/AMReX_GpuControl.H b/Src/Base/AMReX_GpuControl.H
index 6a7ebca1762..e5213f5440b 100644
--- a/Src/Base/AMReX_GpuControl.H
+++ b/Src/Base/AMReX_GpuControl.H
@@ -46,6 +46,14 @@
 #define AMREX_GPU_OR_CPU(a,b) b
 #endif
 
+#ifdef AMREX_USE_SYCL
+#define AMREX_IF_SYCL(...) __VA_ARGS__
+#define AMREX_IF_NOT_SYCL(...)
+#else
+#define AMREX_IF_SYCL(...)
+#define AMREX_IF_NOT_SYCL(...) __VA_ARGS__
+#endif
+
 #ifdef AMREX_USE_SYCL
 #define AMREX_SYCL_ONLY(a) a
 #else
diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index 5a43a6426ba..433984ee786 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -119,7 +119,7 @@ struct LaunchHandler
         : m_blockid{a_blockid}, m_griddim{a_griddim}, m_shared_mem{a_shared_mem} {}
 #endif
 
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
     unsigned int threadIdx1D () const {
         return static_cast<unsigned int>(AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
             threadIdx.x,
@@ -129,7 +129,7 @@ struct LaunchHandler
     }
 
     template<int ...nd_block_size>
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
     IntVectND<dim> threadIdxND () const {
         static_assert(sizeof...(nd_block_size) == dim &&
                       (1 * ... * nd_block_size) == threads_per_block);
@@ -148,7 +148,7 @@ struct LaunchHandler
         return ret;
     }
 
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
     unsigned int blockIdx1D () const {
         if constexpr (dim == 1) {
             return static_cast<unsigned int>(AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
@@ -172,7 +172,7 @@ struct LaunchHandler
         }
     }
 
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
     IntVectND<dim> blockIdxND () const {
         if constexpr (dim == 1) {
             return AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
@@ -201,31 +201,31 @@ struct LaunchHandler
         }
     }
 
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
     static constexpr unsigned int blockDim1D () {
         return threads_per_block;
     }
 
     template<int ...nd_block_size>
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
     static constexpr IntVectND<dim> blockDimND () {
         static_assert(sizeof...(nd_block_size) == dim &&
                       (1 * ... * nd_block_size) == threads_per_block);
         return {nd_block_size...};
     }
 
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
     unsigned int globalIdx1D () const {
         return blockIdx1D() * threads_per_block + threadIdx1D();
     }
 
     template<int ...nd_block_size>
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
     IntVectND<dim> globalIdxND () const {
         return blockIdxND() * blockDimND<nd_block_size...>() + threadIdxND<nd_block_size...>();
     }
 
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
     void syncthreads () const {
         AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
             __syncthreads(),
@@ -234,7 +234,7 @@ struct LaunchHandler
         );
     }
 
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
     T* shared_memory () const {
 #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
         static_assert(32 >= alignof(T));
@@ -245,7 +245,7 @@ struct LaunchHandler
 #endif
     }
 
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
     auto handler () const {
 #if defined(AMREX_USE_SYCL)
         return Gpu::Handler(m_item, reinterpret_cast<void*>(m_shared_mem), threads_per_block);
@@ -256,7 +256,7 @@ struct LaunchHandler
 #endif
     }
 
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
     auto sycl_item () const {
 #if defined(AMREX_USE_SYCL)
         return m_item;
diff --git a/Src/Base/AMReX_MultiFabUtil.H b/Src/Base/AMReX_MultiFabUtil.H
index 96ee5e58043..a01fcd0c831 100644
--- a/Src/Base/AMReX_MultiFabUtil.H
+++ b/Src/Base/AMReX_MultiFabUtil.H
@@ -1255,7 +1255,7 @@ void reduce_to_plane (Array4<T> const& ar, int direction, Box const& bx, int box
             auto nblocks = static_cast<int>(b2d.numPts());
 
             amrex::LaunchRaw<nthreads, T>(amrex::IntVectND<1>{nblocks},
-                AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size),
+                AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0),
                 [=] AMREX_GPU_DEVICE (auto lh)
             {
                 int bid = lh.blockIdx1D();
diff --git a/Src/Base/AMReX_MultiFabUtil.cpp b/Src/Base/AMReX_MultiFabUtil.cpp
index 7adba546ac7..3870394bd86 100644
--- a/Src/Base/AMReX_MultiFabUtil.cpp
+++ b/Src/Base/AMReX_MultiFabUtil.cpp
@@ -872,7 +872,7 @@ namespace amrex
                 int nblocks = n2dblocks * b.length(direction);
 
                 amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, amrex::Real>(amrex::IntVectND<1>{nblocks},
-                    AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size),
+                    AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0),
                     [=] AMREX_GPU_DEVICE (auto lh) noexcept
                 {
                     int i1d = lh.blockIdx1D() / n2dblocks;
@@ -900,7 +900,7 @@ namespace amrex
                     for (int n = 0; n < ncomp; ++n) {
                         Real r = (i2d < n2d) ? fab(i,j,k,n+icomp) : Real(0.0);
                         Gpu::deviceReduceSum_full(p+n+ncomp*idir, r
-                            AMREX_SYCL_REDUCE_HANDLER(lh.handler()));
+                            AMREX_IF_SYCL(, lh.handler()));
                     }
                 });
             }
diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H
index d2e1f538310..18ceb3abe51 100644
--- a/Src/Base/AMReX_Reduce.H
+++ b/Src/Base/AMReX_Reduce.H
@@ -229,10 +229,6 @@ namespace Reduce::detail {
         P().parallel_update(amrex::get<I>(d), amrex::get<I>(s), h);
         for_each_parallel<I+1,T,P1,Ps...>(d, s, h);
     }
-
-#define AMREX_SYCL_REDUCE_SMEM(warp_size) warp_size
-#define AMREX_SYCL_REDUCE_HANDLER(launch_handler) , launch_handler
-
 #else
     template <std::size_t I, typename T, typename P>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
@@ -248,10 +244,6 @@ namespace Reduce::detail {
         P().parallel_update(amrex::get<I>(d), amrex::get<I>(s));
         for_each_parallel<I+1,T,P1,Ps...>(d, s);
     }
-
-#define AMREX_SYCL_REDUCE_SMEM(warp_size) 0
-#define AMREX_SYCL_REDUCE_HANDLER(launch_handler)
-
 #endif
 #endif
 
@@ -615,7 +607,7 @@ public:
             // device reduce needs local(i.e., shared) memory
             amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, unsigned long long>(
                 amrex::IntVectND<1>{nblocks_ec},
-                AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size),
+                AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0),
                 [=] AMREX_GPU_DEVICE (auto lh) noexcept
             {
                 ReduceTuple r;
@@ -637,7 +629,7 @@ public:
                     }
                 }
                 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r
-                    AMREX_SYCL_REDUCE_HANDLER(lh.handler()));
+                    AMREX_IF_SYCL(, lh.handler()));
             });
         }
     }
@@ -659,7 +651,7 @@ public:
         reduce_data.updateMaxStreamIndex(stream);
         amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, unsigned long long>(
             amrex::IntVectND<1>{static_cast<int>(nblocks_ec)},
-            AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size),
+            AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0),
             [=] AMREX_GPU_DEVICE (auto lh) noexcept
         {
             ReduceTuple r;
@@ -687,7 +679,7 @@ public:
                 }
             }
             Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r
-                AMREX_SYCL_REDUCE_HANDLER(lh.handler()));
+                AMREX_IF_SYCL(, lh.handler()));
         });
         nblocks = std::max(nblocks, static_cast<int>(nblocks_ec));
     }
@@ -782,7 +774,7 @@ public:
         // device reduce needs local(i.e., shared) memory
         amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, unsigned long long>(
             amrex::IntVectND<1>{nblocks_ec},
-            AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size),
+            AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0),
             [=] AMREX_GPU_DEVICE (auto lh) noexcept
         {
             ReduceTuple r;
@@ -800,7 +792,7 @@ public:
                 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
             }
             Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r
-                AMREX_SYCL_REDUCE_HANDLER(lh.handler()));
+                AMREX_IF_SYCL(, lh.handler()));
         });
         nblocks = amrex::max(nblocks, nblocks_ec);
     }
@@ -844,7 +836,7 @@ public:
 #endif
             amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, unsigned long long>(
                 amrex::IntVectND<1>{1},
-                AMREX_SYCL_REDUCE_SMEM(Gpu::Device::warp_size),
+                AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0),
                 [=] AMREX_GPU_DEVICE (auto lh) noexcept
             {
                 ReduceTuple r;
@@ -858,7 +850,7 @@ public:
                     }
                 }
                 Reduce::detail::for_each_parallel<0, ReduceTuple, Ps...>(dst, r
-                    AMREX_SYCL_REDUCE_HANDLER(lh.handler()));
+                    AMREX_IF_SYCL(, lh.handler()));
                 if (lh.threadIdx1D() == 0) { *presult = dst; }
             });
 #if !defined(AMREX_NO_SYCL_REDUCE_WORKAROUND) && defined(AMREX_USE_SYCL)
@@ -999,10 +991,10 @@ bool AnyOf (N n, T const* v, P const& pred)
 
     amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, int>(
         amrex::IntVectND<1>{nblocks_ec},
-        AMREX_SYCL_REDUCE_SMEM(num_ints) + 1,
+        AMREX_IF_SYCL(num_ints + 1) AMREX_IF_NOT_SYCL(1),
         [=] AMREX_GPU_DEVICE (auto lh) noexcept
     {
-        int& has_any = *(lh.shared_memory() + AMREX_SYCL_REDUCE_SMEM(num_ints));
+        int& has_any = *(lh.shared_memory() AMREX_IF_SYCL(+ num_ints));
         if (lh.threadIdx1D() == 0) { has_any = *dp; }
         lh.syncthreads();
 
@@ -1017,7 +1009,7 @@ bool AnyOf (N n, T const* v, P const& pred)
             }
             r = Gpu::blockReduce<Gpu::Device::warp_size>
                 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0
-                AMREX_SYCL_REDUCE_HANDLER(lh.handler()));
+                AMREX_IF_SYCL(, lh.handler()));
             if (lh.threadIdx1D() == 0 && r) *dp = 1;
         }
     });
@@ -1039,10 +1031,10 @@ bool AnyOf (BoxND<dim> const& box, P const& pred)
 
     amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, int>(
         amrex::IntVectND<1>{nblocks_ec},
-        AMREX_SYCL_REDUCE_SMEM(num_ints) + 1,
+        AMREX_IF_SYCL(num_ints + 1) AMREX_IF_NOT_SYCL(1),
         [=] AMREX_GPU_DEVICE (auto lh) noexcept
     {
-        int& has_any = *(lh.shared_memory() + AMREX_SYCL_REDUCE_SMEM(num_ints));
+        int& has_any = *(lh.shared_memory() AMREX_IF_SYCL(+ num_ints));
         if (lh.threadIdx1D() == 0) { has_any = *dp; }
         lh.syncthreads();
 
@@ -1060,7 +1052,7 @@ bool AnyOf (BoxND<dim> const& box, P const& pred)
             }
             r = Gpu::blockReduce<Gpu::Device::warp_size>
                 (r, Gpu::warpReduce<Gpu::Device::warp_size,int,amrex::Plus<int> >(), 0
-                AMREX_SYCL_REDUCE_HANDLER(lh.handler()));
+                AMREX_IF_SYCL(, lh.handler()));
             if (lh.threadIdx1D() == 0 && r) *dp = 1;
         }
     });

From 232988f88ded71486114378143f00105985acf95 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Tue, 10 Mar 2026 11:39:32 +0100
Subject: [PATCH 15/46] fix nodiscard

---
 Src/Base/AMReX_GpuTypes.H | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index 433984ee786..6d5f295ad29 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -119,7 +119,7 @@ struct LaunchHandler
         : m_blockid{a_blockid}, m_griddim{a_griddim}, m_shared_mem{a_shared_mem} {}
 #endif
 
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     unsigned int threadIdx1D () const {
         return static_cast<unsigned int>(AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
             threadIdx.x,
@@ -129,7 +129,7 @@ struct LaunchHandler
     }
 
     template<int ...nd_block_size>
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     IntVectND<dim> threadIdxND () const {
         static_assert(sizeof...(nd_block_size) == dim &&
                       (1 * ... * nd_block_size) == threads_per_block);
@@ -148,7 +148,7 @@ struct LaunchHandler
         return ret;
     }
 
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     unsigned int blockIdx1D () const {
         if constexpr (dim == 1) {
             return static_cast<unsigned int>(AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
@@ -172,7 +172,7 @@ struct LaunchHandler
         }
     }
 
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     IntVectND<dim> blockIdxND () const {
         if constexpr (dim == 1) {
             return AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
@@ -201,31 +201,31 @@ struct LaunchHandler
         }
     }
 
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     static constexpr unsigned int blockDim1D () {
         return threads_per_block;
     }
 
     template<int ...nd_block_size>
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     static constexpr IntVectND<dim> blockDimND () {
         static_assert(sizeof...(nd_block_size) == dim &&
                       (1 * ... * nd_block_size) == threads_per_block);
         return {nd_block_size...};
     }
 
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     unsigned int globalIdx1D () const {
         return blockIdx1D() * threads_per_block + threadIdx1D();
     }
 
     template<int ...nd_block_size>
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     IntVectND<dim> globalIdxND () const {
         return blockIdxND() * blockDimND<nd_block_size...>() + threadIdxND<nd_block_size...>();
     }
 
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     void syncthreads () const {
         AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
             __syncthreads(),
@@ -234,7 +234,7 @@ struct LaunchHandler
         );
     }
 
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     T* shared_memory () const {
 #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
         static_assert(32 >= alignof(T));
@@ -245,7 +245,7 @@ struct LaunchHandler
 #endif
     }
 
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     auto handler () const {
 #if defined(AMREX_USE_SYCL)
         return Gpu::Handler(m_item, reinterpret_cast<void*>(m_shared_mem), threads_per_block);
@@ -256,7 +256,7 @@ struct LaunchHandler
 #endif
     }
 
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE [[nodiscard]]
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     auto sycl_item () const {
 #if defined(AMREX_USE_SYCL)
         return m_item;

From c392c70d289b62a6be39bf20996f32f375156d9a Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Tue, 10 Mar 2026 11:43:49 +0100
Subject: [PATCH 16/46] fix nodiscard 2

---
 Src/Base/AMReX_GpuTypes.H | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index 6d5f295ad29..ae01f08178d 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -225,7 +225,7 @@ struct LaunchHandler
         return blockIdxND() * blockDimND<nd_block_size...>() + threadIdxND<nd_block_size...>();
     }
 
-    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     void syncthreads () const {
         AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
             __syncthreads(),

From 62565bb21fa61fde3c20b2d2c95358988972e4c0 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Tue, 10 Mar 2026 12:42:55 +0100
Subject: [PATCH 17/46] try fix void return issue

---
 Src/Base/AMReX_GpuTypes.H | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index ae01f08178d..be186c19c35 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -252,7 +252,7 @@ struct LaunchHandler
 #elif defined(AMREX_USE_GPU)
         return Gpu::Handler(threads_per_block);
 #else
-        return 0;
+        return int{0};
 #endif
     }
 
@@ -261,7 +261,7 @@ struct LaunchHandler
 #if defined(AMREX_USE_SYCL)
         return m_item;
 #else
-        return 0;
+        return int{0};
 #endif
     }
 

From 5c59a308ccb0e79d69269486b2f856235e40a270 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Tue, 10 Mar 2026 12:51:28 +0100
Subject: [PATCH 18/46] remove nodiscard

---
 Src/Base/AMReX_GpuTypes.H | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index be186c19c35..fcc6b072bf5 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -245,23 +245,23 @@ struct LaunchHandler
 #endif
     }
 
-    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     auto handler () const {
 #if defined(AMREX_USE_SYCL)
         return Gpu::Handler(m_item, reinterpret_cast<void*>(m_shared_mem), threads_per_block);
 #elif defined(AMREX_USE_GPU)
         return Gpu::Handler(threads_per_block);
 #else
-        return int{0};
+        return 0;
 #endif
     }
 
-    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     auto sycl_item () const {
 #if defined(AMREX_USE_SYCL)
         return m_item;
 #else
-        return int{0};
+        return 0;
 #endif
     }
 

From 6161d9015ed0caaf9cf3ef0267a44b0577291d8f Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Tue, 10 Mar 2026 13:19:59 +0100
Subject: [PATCH 19/46] try fix cuda first capture issue

---
 Src/Base/AMReX_Reduce.H | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H
index 18ceb3abe51..db006dd71ab 100644
--- a/Src/Base/AMReX_Reduce.H
+++ b/Src/Base/AMReX_Reduce.H
@@ -667,13 +667,16 @@ public:
                  icell += stride)
             {
                 auto iv = indexer.intVect(icell);
-                amrex::ignore_unused(f,ncomp,ixtype); // work around first-capture
+                // work around first-capture
+                auto f2 = f;
+                auto ncomp2 = ncomp;
+                auto ixtype2 = ixtype;
                 if constexpr (std::is_same_v<Reduce::detail::iterate_box,I>) {
-                    auto pr = Reduce::detail::call_f_intvect_box(f, iv, ixtype);
+                    auto pr = Reduce::detail::call_f_intvect_box(f2, iv, ixtype2);
                     Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
                 } else {
-                    for (int n = 0; n < ncomp; ++n) {
-                        auto pr = Reduce::detail::call_f_intvect_n(f, iv, n);
+                    for (int n = 0; n < ncomp2; ++n) {
+                        auto pr = Reduce::detail::call_f_intvect_n(f2, iv, n);
                         Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, pr);
                     }
                 }

From ec26b0b305a27276dbd1cd576cc00c6fd78331c3 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Tue, 10 Mar 2026 14:25:26 +0100
Subject: [PATCH 20/46] add documentation

---
 Docs/Doxygen/groups.dox           |  1 +
 Src/Base/AMReX_GpuLaunchFunctsG.H | 43 +++++++++++++++++++++++++++++++
 Src/Base/AMReX_GpuTypes.H         | 42 ++++++++++++++++++++++++++++++
 3 files changed, 86 insertions(+)

diff --git a/Docs/Doxygen/groups.dox b/Docs/Doxygen/groups.dox
index c2d946159fd..1265c9a5050 100644
--- a/Docs/Doxygen/groups.dox
+++ b/Docs/Doxygen/groups.dox
@@ -187,6 +187,7 @@
  * - \ref amrex::ParallelFor
  * - \ref amrex::ParallelForOMP
  * - \ref amrex::ParallelForRNG
+ * - \ref amrex::LaunchRaw
  */
 
 /**
diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
index fe9578793cd..24b58376a58 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsG.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -1348,6 +1348,24 @@ ParallelFor (Gpu::KernelInfo const&,
     AMREX_GPU_ERROR_CHECK();
 }
 
+/**
+ * \ingroup amrex_execution
+ * \brief Performance-portable kernel launch function
+ * that provides low-level access to GPU thread blocks through \ref amrex::Gpu::LaunchHandler.
+ *
+ * The number of threads per block is a compile-time-known one-dimensional value that usually
+ * should be one of 128, 256, 512 or 1024.
+ *
+ * The number of total blocks can be a 1D, 2D or 3D IntVectND. Internally this uses the native
+ * way to split the index, such as blockIdx.x, blockIdx.y and blockIdx.z.
+ * Note that this uses types int and unsigned int which might overflow if many blocks or
+ * total threads are needed. In case this is an issue, it is necessary to explicitly add
+ * a 64-bit grid-strided loop or to call LaunchRaw multiple times with fewer blocks.
+ *
+ * \tparam MT number of threads per GPU block.
+ * \param nblocks number of GPU blocks to launch.
+ * \param f a callable object that takes amrex::Gpu::LaunchHandler as input.
+ */
 template<int MT, int dim, typename L>
 void LaunchRaw (IntVectND<dim> nblocks, L const& f)
 {
@@ -1367,6 +1385,31 @@ void LaunchRaw (IntVectND<dim> nblocks, L const& f)
     AMREX_GPU_ERROR_CHECK();
 }
 
+/**
+ * \ingroup amrex_execution
+ * \brief Performance-portable kernel launch function
+ * that provides low-level access to GPU thread blocks and shared memory
+ * through \ref amrex::Gpu::LaunchHandler.
+ *
+ * The number of threads per block is a compile-time-known one-dimensional value that usually
+ * should be one of 128, 256, 512 or 1024.
+ *
+ * The number of total blocks can be a 1D, 2D or 3D IntVectND. Internally this uses the native
+ * way to split the index, such as blockIdx.x, blockIdx.y and blockIdx.z.
+ * Note that this uses types int and unsigned int which might overflow if many blocks or
+ * total threads are needed. In case this is an issue, it is necessary to explicitly add
+ * a 64-bit grid-strided loop or to call LaunchRaw multiple times with fewer blocks.
+ *
+ * This version of LaunchRaw supports the use of dynamic shared memory inside the thread block.
+ * Shared memory is a fast cache local to thread blocks. It has different names in
+ * different GPU backends (CUDA: shared memory, HIP: local data share, SYCL: local memory).
+ *
+ * \tparam MT number of threads per GPU block.
+ * \tparam T data type of shared memory elements.
+ * \param nblocks number of GPU blocks to launch.
+ * \param shared_mem_elements number of shared memory elements per block to allocate.
+ * \param f a callable object that takes amrex::Gpu::LaunchHandler as input.
+ */
 template<int MT, typename T, int dim, typename L>
 void LaunchRaw (IntVectND<dim> nblocks, std::size_t shared_mem_elements, L const& f)
 {
diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index fcc6b072bf5..db647cbf9a7 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -119,6 +119,9 @@ struct LaunchHandler
         : m_blockid{a_blockid}, m_griddim{a_griddim}, m_shared_mem{a_shared_mem} {}
 #endif
 
+    /**
+     * \brief Returns the thread ID in the local block.
+     */
     [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     unsigned int threadIdx1D () const {
         return static_cast<unsigned int>(AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
@@ -128,6 +131,10 @@ struct LaunchHandler
         ));
     }
 
+    /**
+     * \brief Splits the local thread ID into N dimensions.
+     * The sizes of the dimensions are supplied as template arguments.
+     */
     template<int ...nd_block_size>
     [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     IntVectND<dim> threadIdxND () const {
@@ -148,6 +155,9 @@ struct LaunchHandler
         return ret;
     }
 
+    /**
+     * \brief Returns the block ID flattened to 1D.
+     */
     [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     unsigned int blockIdx1D () const {
         if constexpr (dim == 1) {
@@ -172,6 +182,9 @@ struct LaunchHandler
         }
     }
 
+    /**
+     * \brief Returns the 1D/2D/3D block ID.
+     */
     [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     IntVectND<dim> blockIdxND () const {
         if constexpr (dim == 1) {
@@ -201,11 +214,18 @@ struct LaunchHandler
         }
     }
 
+    /**
+     * \brief Returns the number of threads inside a block.
+     */
     [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     static constexpr unsigned int blockDim1D () {
         return threads_per_block;
     }
 
+    /**
+     * \brief Analogous to threadIdxND but returns the ND block size.
+     * This effectively just returns the template arguments as an IntVectND.
+     */
     template<int ...nd_block_size>
     [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     static constexpr IntVectND<dim> blockDimND () {
@@ -214,17 +234,27 @@ struct LaunchHandler
         return {nd_block_size...};
     }
 
+    /**
+     * \brief Returns the global thread index flattened to 1D.
+     */
     [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     unsigned int globalIdx1D () const {
         return blockIdx1D() * threads_per_block + threadIdx1D();
     }
 
+    /**
+     * \brief Returns the global 1D/2D/3D thread index.
+     */
     template<int ...nd_block_size>
     [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     IntVectND<dim> globalIdxND () const {
         return blockIdxND() * blockDimND<nd_block_size...>() + threadIdxND<nd_block_size...>();
     }
 
+    /**
+     * \brief Synchronizes all threads within a block. This is needed before
+     * accessing (shared) memory that was previously written by another thread in the block.
+     */
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     void syncthreads () const {
         AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
@@ -234,6 +264,12 @@ struct LaunchHandler
         );
     }
 
+    /**
+     * \brief Returns a pointer to block-local shared memory. If multiple shared memory
+     * allocations are needed in a block, then the allocation must be manually split by adding
+     * offsets to it. For CUDA and HIP the memory is aligned to 32 bytes,
+     * for SYCL to the alignment of the chosen data type.
+     */
     [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     T* shared_memory () const {
 #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
@@ -245,6 +281,9 @@ struct LaunchHandler
 #endif
     }
 
+    /**
+     * \brief Returns an amrex::Gpu::Handler object, which is sometimes needed for reductions.
+     */
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     auto handler () const {
 #if defined(AMREX_USE_SYCL)
@@ -256,6 +295,9 @@ struct LaunchHandler
 #endif
     }
 
+    /**
+     * \brief Returns the internal sycl::nd_item.
+     */
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     auto sycl_item () const {
 #if defined(AMREX_USE_SYCL)

From cf44395d9f03eaa183786cfafa1d1eea3090f5c3 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Tue, 10 Mar 2026 18:14:20 +0100
Subject: [PATCH 21/46] start to add test

---
 Tests/GPU/LaunchRaw/GNUmakefile  | 23 ++++++++++
 Tests/GPU/LaunchRaw/Make.package |  2 +
 Tests/GPU/LaunchRaw/inputs       |  1 +
 Tests/GPU/LaunchRaw/main.cpp     | 73 ++++++++++++++++++++++++++++++++
 4 files changed, 99 insertions(+)
 create mode 100644 Tests/GPU/LaunchRaw/GNUmakefile
 create mode 100644 Tests/GPU/LaunchRaw/Make.package
 create mode 100644 Tests/GPU/LaunchRaw/inputs
 create mode 100644 Tests/GPU/LaunchRaw/main.cpp

diff --git a/Tests/GPU/LaunchRaw/GNUmakefile b/Tests/GPU/LaunchRaw/GNUmakefile
new file mode 100644
index 00000000000..42cbc3ea210
--- /dev/null
+++ b/Tests/GPU/LaunchRaw/GNUmakefile
@@ -0,0 +1,23 @@
+AMREX_HOME = ../../../
+
+DEBUG	= FALSE
+
+DIM	= 3
+
+COMP    = gcc
+
+USE_CUDA  = FALSE
+USE_ACC   = FALSE
+USE_OMP_OFFLOAD = FALSE
+
+USE_MPI   = FALSE
+USE_OMP   = FALSE
+
+TINY_PROFILE = FALSE
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.defs
+
+include ./Make.package
+include $(AMREX_HOME)/Src/Base/Make.package
+
+include $(AMREX_HOME)/Tools/GNUMake/Make.rules
diff --git a/Tests/GPU/LaunchRaw/Make.package b/Tests/GPU/LaunchRaw/Make.package
new file mode 100644
index 00000000000..7f43e5e87cb
--- /dev/null
+++ b/Tests/GPU/LaunchRaw/Make.package
@@ -0,0 +1,2 @@
+CEXE_sources += main.cpp
+
diff --git a/Tests/GPU/LaunchRaw/inputs b/Tests/GPU/LaunchRaw/inputs
new file mode 100644
index 00000000000..bba7f4f305b
--- /dev/null
+++ b/Tests/GPU/LaunchRaw/inputs
@@ -0,0 +1 @@
+amrex.the_arena_is_managed = 0
diff --git a/Tests/GPU/LaunchRaw/main.cpp b/Tests/GPU/LaunchRaw/main.cpp
new file mode 100644
index 00000000000..e3408685679
--- /dev/null
+++ b/Tests/GPU/LaunchRaw/main.cpp
@@ -0,0 +1,73 @@
+#include <AMReX.H>
+#include <AMReX_Gpu.H>
+#include <AMReX_GpuContainers.H>
+#include <AMReX_ParmParse.H>
+#include <AMReX_GpuPrint.H>
+#include <AMReX_Reduce.H>
+
+using namespace amrex;
+
+void test3d () {
+
+    constexpr int num_threads = 256;
+    IntVectND<3> num_blocks {31, 23, 11};
+
+    Gpu::DeviceVector<int> vect(num_threads * num_blocks[0] * num_blocks[1] * num_blocks[2], -999);
+
+    auto data = vect.dataPtr();
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=](auto lh){
+            data[lh.globalIdx1D()] = lh.blockIdx1D();
+        });
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=](auto lh){
+            data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D();
+        });
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=](auto lh){
+            auto block = lh.blockIdxND();
+            auto thread = lh.template threadIdxND<2, 8, 16>();
+            auto tmp = data[
+                (block[0] + block[1] * num_blocks[0] +
+                block[2] * num_blocks[0] * num_blocks[1]) * num_threads +
+                thread[2] + thread[1] * 16 +
+                thread[0] * 16 * 8
+            ];
+            lh.synctheads();
+            data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp;
+        });
+
+    LaunchRaw<num_threads, int>(num_blocks, num_threads,
+        [=](auto lh){
+            auto smem = lh.shared_memory();
+            auto thread1 = lh.template threadIdxND<16, 8, 2>();
+            auto locid1 = thread1[2] + thread1[1] * 2 + thread1[0] * 2 * 8;
+            auto thread2 = lh.template threadIdxND<16, 2, 8>();
+            auto locid2 = thread2[0] + thread2[2] * 16 + thread2[1] * 16 * 8;
+
+            smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid1];
+            lh.syncthreads();
+            data[lh.blockIdx1D() * lh.blockDim1D() + locid2] = smem[locid2];
+        });
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=](auto lh){
+            data[lh.globalIdx1D()] = data[lh.globalIdx1D()] == (lh.blockIdx1D() + lh.threadIdx1D());
+        });
+
+    AMREX_ALWAYS_ASSERT(Reduce::Sum(vect.size(), data, 0) == vect.size());
+}
+
+int main (int argc, char* argv[])
+{
+    amrex::Initialize(argc,argv);
+    {
+        test3d();
+
+        amrex::Print() << "Passed! \n";
+    }
+    amrex::Finalize();
+}

From 09d75ee02b27714d27f6ff93a0b2c6153436ae6b Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Wed, 11 Mar 2026 13:43:57 +0100
Subject: [PATCH 22/46] update test and see if it is compiled

---
 Tests/GPU/LaunchRaw/GNUmakefile |  6 ------
 Tests/GPU/LaunchRaw/main.cpp    | 28 +++++++++++++++++-----------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/Tests/GPU/LaunchRaw/GNUmakefile b/Tests/GPU/LaunchRaw/GNUmakefile
index 42cbc3ea210..8c8ce8057ac 100644
--- a/Tests/GPU/LaunchRaw/GNUmakefile
+++ b/Tests/GPU/LaunchRaw/GNUmakefile
@@ -4,12 +4,6 @@ DEBUG	= FALSE
 
 DIM	= 3
 
-COMP    = gcc
-
-USE_CUDA  = FALSE
-USE_ACC   = FALSE
-USE_OMP_OFFLOAD = FALSE
-
 USE_MPI   = FALSE
 USE_OMP   = FALSE
 
diff --git a/Tests/GPU/LaunchRaw/main.cpp b/Tests/GPU/LaunchRaw/main.cpp
index e3408685679..6d83980797b 100644
--- a/Tests/GPU/LaunchRaw/main.cpp
+++ b/Tests/GPU/LaunchRaw/main.cpp
@@ -9,8 +9,13 @@ using namespace amrex;
 
 void test3d () {
 
-    constexpr int num_threads = 256;
-    IntVectND<3> num_blocks {31, 23, 11};
+    const IntVectND<3> num_blocks {31, 23, 11};
+#ifdef AMREX_USE_GPU
+    static constexpr IntVectND<3> blockdim {2, 8, 16};
+#else
+    static constexpr IntVectND<3> blockdim {1, 1, 1};
+#endif
+    static constexpr int num_threads = blockdim[0] * blockdim[1] * blockdim[2];
 
     Gpu::DeviceVector<int> vect(num_threads * num_blocks[0] * num_blocks[1] * num_blocks[2], -999);
 
@@ -29,12 +34,12 @@ void test3d () {
     LaunchRaw<num_threads>(num_blocks,
         [=](auto lh){
             auto block = lh.blockIdxND();
-            auto thread = lh.template threadIdxND<2, 8, 16>();
+            auto thread = lh.template threadIdxND<blockdim[0], blockdim[1], blockdim[2]>();
             auto tmp = data[
                 (block[0] + block[1] * num_blocks[0] +
                 block[2] * num_blocks[0] * num_blocks[1]) * num_threads +
-                thread[2] + thread[1] * 16 +
-                thread[0] * 16 * 8
+                thread[2] + thread[1] * blockdim[2] +
+                thread[0] * blockdim[2] * blockdim[1]
             ];
             lh.synctheads();
             data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp;
@@ -43,11 +48,12 @@ void test3d () {
     LaunchRaw<num_threads, int>(num_blocks, num_threads,
         [=](auto lh){
             auto smem = lh.shared_memory();
-            auto thread1 = lh.template threadIdxND<16, 8, 2>();
-            auto locid1 = thread1[2] + thread1[1] * 2 + thread1[0] * 2 * 8;
-            auto thread2 = lh.template threadIdxND<16, 2, 8>();
-            auto locid2 = thread2[0] + thread2[2] * 16 + thread2[1] * 16 * 8;
-
+            auto thread1 = lh.template threadIdxND<blockdim[2], blockdim[1], blockdim[0]>();
+            auto locid1 = thread1[2] + thread1[1] * blockdim[0] +
+                          thread1[0] * blockdim[0] * blockdim[1];
+            auto thread2 = lh.template threadIdxND<blockdim[2], blockdim[0], blockdim[1]>();
+            auto locid2 = thread2[0] + thread2[2] * blockdim[2] +
+                          thread2[1] * blockdim[2] * blockdim[1];
             smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid1];
             lh.syncthreads();
             data[lh.blockIdx1D() * lh.blockDim1D() + locid2] = smem[locid2];
@@ -65,7 +71,7 @@ int main (int argc, char* argv[])
 {
     amrex::Initialize(argc,argv);
     {
-        test3d();
+        test3d()
 
         amrex::Print() << "Passed! \n";
     }

From 76a84019b0dfce40c66a6791bf71e6f330349928 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Wed, 11 Mar 2026 14:39:46 +0100
Subject: [PATCH 23/46] move test dir

---
 Tests/CMakeLists.txt                   | 2 +-
 Tests/LaunchRaw/CMakeLists.txt         | 9 +++++++++
 Tests/{GPU => }/LaunchRaw/GNUmakefile  | 2 --
 Tests/{GPU => }/LaunchRaw/Make.package | 0
 Tests/{GPU => }/LaunchRaw/inputs       | 0
 Tests/{GPU => }/LaunchRaw/main.cpp     | 0
 6 files changed, 10 insertions(+), 3 deletions(-)
 create mode 100644 Tests/LaunchRaw/CMakeLists.txt
 rename Tests/{GPU => }/LaunchRaw/GNUmakefile (96%)
 rename Tests/{GPU => }/LaunchRaw/Make.package (100%)
 rename Tests/{GPU => }/LaunchRaw/inputs (100%)
 rename Tests/{GPU => }/LaunchRaw/main.cpp (100%)

diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt
index 975b575c951..e4e8b7d2794 100644
--- a/Tests/CMakeLists.txt
+++ b/Tests/CMakeLists.txt
@@ -126,7 +126,7 @@ else()
    # List of subdirectories to search for CMakeLists.
    #
    set( AMREX_TESTS_SUBDIRS Amr ArrayND AsyncOut CallNoinline CLZ CommType CTOParFor DeviceGlobal
-                            Enum HeatEquation MultiBlock MultiPeriod ParmParse Parser Parser2
+                            Enum HeatEquation LaunchRaw MultiBlock MultiPeriod ParmParse Parser Parser2
                             ParserUserFn Reducer ReduceToPlanePatchy Reinit RoundoffDomain SIMD
                             SmallMatrix SumBoundary TOML)
 
diff --git a/Tests/LaunchRaw/CMakeLists.txt b/Tests/LaunchRaw/CMakeLists.txt
new file mode 100644
index 00000000000..224c4563c83
--- /dev/null
+++ b/Tests/LaunchRaw/CMakeLists.txt
@@ -0,0 +1,9 @@
+foreach(D IN LISTS AMReX_SPACEDIM)
+    set(_sources     main.cpp)
+    set(_input_files)
+
+    setup_test(${D} _sources _input_files)
+
+    unset(_sources)
+    unset(_input_files)
+endforeach()
diff --git a/Tests/GPU/LaunchRaw/GNUmakefile b/Tests/LaunchRaw/GNUmakefile
similarity index 96%
rename from Tests/GPU/LaunchRaw/GNUmakefile
rename to Tests/LaunchRaw/GNUmakefile
index 8c8ce8057ac..173eef4c670 100644
--- a/Tests/GPU/LaunchRaw/GNUmakefile
+++ b/Tests/LaunchRaw/GNUmakefile
@@ -2,8 +2,6 @@ AMREX_HOME = ../../../
 
 DEBUG	= FALSE
 
-DIM	= 3
-
 USE_MPI   = FALSE
 USE_OMP   = FALSE
 
diff --git a/Tests/GPU/LaunchRaw/Make.package b/Tests/LaunchRaw/Make.package
similarity index 100%
rename from Tests/GPU/LaunchRaw/Make.package
rename to Tests/LaunchRaw/Make.package
diff --git a/Tests/GPU/LaunchRaw/inputs b/Tests/LaunchRaw/inputs
similarity index 100%
rename from Tests/GPU/LaunchRaw/inputs
rename to Tests/LaunchRaw/inputs
diff --git a/Tests/GPU/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp
similarity index 100%
rename from Tests/GPU/LaunchRaw/main.cpp
rename to Tests/LaunchRaw/main.cpp

From 9df5358d58359f8a31c7ff60b9e949e28de55658 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Wed, 11 Mar 2026 16:09:12 +0100
Subject: [PATCH 24/46] fix test

---
 Tests/LaunchRaw/main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp
index 6d83980797b..2b3cf0c0efa 100644
--- a/Tests/LaunchRaw/main.cpp
+++ b/Tests/LaunchRaw/main.cpp
@@ -71,7 +71,7 @@ int main (int argc, char* argv[])
 {
     amrex::Initialize(argc,argv);
     {
-        test3d()
+        test3d();
 
         amrex::Print() << "Passed! \n";
     }

From 8a78b4f6081b9888806141477144c807a33ac5e4 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Wed, 11 Mar 2026 16:16:53 +0100
Subject: [PATCH 25/46] add static cast

---
 Tests/LaunchRaw/main.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp
index 2b3cf0c0efa..d5cdb0316b7 100644
--- a/Tests/LaunchRaw/main.cpp
+++ b/Tests/LaunchRaw/main.cpp
@@ -41,7 +41,7 @@ void test3d () {
                 thread[2] + thread[1] * blockdim[2] +
                 thread[0] * blockdim[2] * blockdim[1]
             ];
-            lh.synctheads();
+            lh.syncthreads();
             data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp;
         });
 
@@ -61,10 +61,11 @@ void test3d () {
 
     LaunchRaw<num_threads>(num_blocks,
         [=](auto lh){
-            data[lh.globalIdx1D()] = data[lh.globalIdx1D()] == (lh.blockIdx1D() + lh.threadIdx1D());
+            data[lh.globalIdx1D()] =
+                data[lh.globalIdx1D()] == static_cast<int>(lh.blockIdx1D() + lh.threadIdx1D());
         });
 
-    AMREX_ALWAYS_ASSERT(Reduce::Sum(vect.size(), data, 0) == vect.size());
+    AMREX_ALWAYS_ASSERT(Reduce::Sum(vect.size(), data, 0) == static_cast<int>(vect.size()));
 }
 
 int main (int argc, char* argv[])

From f29aef1eab8664fdc56c6ab58dcda828af9eb1ce Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Wed, 11 Mar 2026 16:38:25 +0100
Subject: [PATCH 26/46] add 1d and 2d tests

---
 Src/Base/AMReX_GpuTypes.H |  14 +++--
 Tests/LaunchRaw/main.cpp  | 119 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 125 insertions(+), 8 deletions(-)

diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index db647cbf9a7..4b666a5847f 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -284,28 +284,30 @@ struct LaunchHandler
     /**
      * \brief Returns an amrex::Gpu::Handler object, which is sometimes needed for reductions.
      */
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-    auto handler () const {
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    Gpu::Handler handler () const {
 #if defined(AMREX_USE_SYCL)
         return Gpu::Handler(m_item, reinterpret_cast<void*>(m_shared_mem), threads_per_block);
 #elif defined(AMREX_USE_GPU)
         return Gpu::Handler(threads_per_block);
 #else
-        return 0;
+        return Gpu::Handler{};
 #endif
     }
 
     /**
      * \brief Returns the internal sycl::nd_item.
      */
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-    auto sycl_item () const {
+    [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
 #if defined(AMREX_USE_SYCL)
+    sycl::nd_item<dim> sycl_item () const {
         return m_item;
+    }
 #else
+    int sycl_item () const {
         return 0;
-#endif
     }
+#endif
 
 private:
 
diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp
index d5cdb0316b7..eb53f64971d 100644
--- a/Tests/LaunchRaw/main.cpp
+++ b/Tests/LaunchRaw/main.cpp
@@ -7,6 +7,117 @@
 
 using namespace amrex;
 
+void test1d () {
+
+    const IntVectND<1> num_blocks {31};
+#ifdef AMREX_USE_GPU
+    static constexpr IntVectND<1> blockdim {256};
+#else
+    static constexpr IntVectND<1> blockdim {1};
+#endif
+    static constexpr int num_threads = blockdim[0];
+
+    Gpu::DeviceVector<int> vect(num_threads * num_blocks[0], -999);
+
+    auto * data = vect.dataPtr();
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=](auto lh){
+            data[lh.globalIdx1D()] = lh.blockIdx1D();
+        });
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=](auto lh){
+            data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D();
+        });
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=](auto lh){
+            auto block = lh.blockIdxND();
+            auto thread = lh.template threadIdxND<blockdim[0]>();
+            auto tmp = data[
+                block[0] * num_threads + thread[0]
+            ];
+            lh.syncthreads();
+            data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp;
+        });
+
+    LaunchRaw<num_threads, int>(num_blocks, num_threads,
+        [=](auto lh){
+            auto smem = lh.shared_memory();
+            auto thread = lh.template threadIdxND<blockdim[0]>();
+            auto locid = thread[0];
+            smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid];
+            lh.syncthreads();
+            data[lh.blockIdx1D() * lh.blockDim1D() + locid] = smem[locid];
+        });
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=](auto lh){
+            data[lh.globalIdx1D()] =
+                data[lh.globalIdx1D()] == static_cast<int>(lh.blockIdx1D() + lh.threadIdx1D());
+        });
+
+    AMREX_ALWAYS_ASSERT(Reduce::Sum(vect.size(), data, 0) == static_cast<int>(vect.size()));
+}
+
+void test2d () {
+
+    const IntVectND<2> num_blocks {31, 23};
+#ifdef AMREX_USE_GPU
+    static constexpr IntVectND<2> blockdim {8, 32};
+#else
+    static constexpr IntVectND<2> blockdim {1, 1};
+#endif
+    static constexpr int num_threads = blockdim[0] * blockdim[1];
+
+    Gpu::DeviceVector<int> vect(num_threads * num_blocks[0] * num_blocks[1], -999);
+
+    auto * data = vect.dataPtr();
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=](auto lh){
+            data[lh.globalIdx1D()] = lh.blockIdx1D();
+        });
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=](auto lh){
+            data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D();
+        });
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=](auto lh){
+            auto block = lh.blockIdxND();
+            auto thread = lh.template threadIdxND<blockdim[0], blockdim[1]>();
+            auto tmp = data[
+                (block[0] + block[1] * num_blocks[0]) * num_threads +
+                thread[1] + thread[0] * blockdim[1]
+            ];
+            lh.syncthreads();
+            data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp;
+        });
+
+    LaunchRaw<num_threads, int>(num_blocks, num_threads,
+        [=](auto lh){
+            auto smem = lh.shared_memory();
+            auto thread1 = lh.template threadIdxND<blockdim[1], blockdim[0]>();
+            auto locid1 = thread1[1] + thread1[0] * blockdim[0];
+            auto thread2 = lh.template threadIdxND<blockdim[0], blockdim[1]>();
+            auto locid2 = thread2[1] + thread2[0] * blockdim[1];
+            smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid1];
+            lh.syncthreads();
+            data[lh.blockIdx1D() * lh.blockDim1D() + locid2] = smem[locid2];
+        });
+
+    LaunchRaw<num_threads>(num_blocks,
+        [=](auto lh){
+            data[lh.globalIdx1D()] =
+                data[lh.globalIdx1D()] == static_cast<int>(lh.blockIdx1D() + lh.threadIdx1D());
+        });
+
+    AMREX_ALWAYS_ASSERT(Reduce::Sum(vect.size(), data, 0) == static_cast<int>(vect.size()));
+}
+
 void test3d () {
 
     const IntVectND<3> num_blocks {31, 23, 11};
@@ -19,7 +130,7 @@ void test3d () {
 
     Gpu::DeviceVector<int> vect(num_threads * num_blocks[0] * num_blocks[1] * num_blocks[2], -999);
 
-    auto data = vect.dataPtr();
+    auto * data = vect.dataPtr();
 
     LaunchRaw<num_threads>(num_blocks,
         [=](auto lh){
@@ -70,8 +181,12 @@ void test3d () {
 
 int main (int argc, char* argv[])
 {
-    amrex::Initialize(argc,argv);
+    amrex::Initialize(argc, argv);
     {
+        test1d();
+
+        test2d();
+
         test3d();
 
         amrex::Print() << "Passed! \n";

From 28b39497412e51131765926ef430d59c3f762424 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Wed, 11 Mar 2026 16:51:23 +0100
Subject: [PATCH 27/46] fix constructor

---
 Src/Base/AMReX_GpuLaunchFunctsC.H | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H
index 4a10a3a70ed..6f39150b460 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsC.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsC.H
@@ -139,19 +139,22 @@ void LaunchRaw (IntVectND<dim> nblocks, L const& f)
     static_assert(MT == 1);
     if constexpr(dim == 1) {
         for (int bx=0; bx < nblocks[0]; ++bx) {
-            f(Gpu::LaunchHandler<MT, void, dim>{{bx}, nblocks, nullptr});
+            f(Gpu::LaunchHandler<MT, void, dim>{
+                IntVectND<1>{bx}, nblocks, nullptr});
         }
     } else if constexpr(dim == 2) {
         for (int by=0; by < nblocks[1]; ++by) {
             for (int bx=0; bx < nblocks[0]; ++bx) {
-                f(Gpu::LaunchHandler<MT, void, dim>{{bx, by}, nblocks, nullptr});
+                f(Gpu::LaunchHandler<MT, void, dim>{
+                    IntVectND<2>{bx, by}, nblocks, nullptr});
             }
         }
     } else {
         for (int bz=0; bz < nblocks[2]; ++bz) {
             for (int by=0; by < nblocks[1]; ++by) {
                 for (int bx=0; bx < nblocks[0]; ++bx) {
-                    f(Gpu::LaunchHandler<MT, void, dim>{{bx, by, bz}, nblocks, nullptr});
+                    f(Gpu::LaunchHandler<MT, void, dim>{
+                        IntVectND<3>{bx, by, bz}, nblocks, nullptr});
                 }
             }
         }
@@ -165,19 +168,22 @@ void LaunchRaw (IntVectND<dim> nblocks, std::size_t shared_mem_elements, L const
     std::vector<T> smem(shared_mem_elements);
     if constexpr(dim == 1) {
         for (int bx=0; bx < nblocks[0]; ++bx) {
-            f(Gpu::LaunchHandler<MT, T, dim>{{bx}, nblocks, smem.data()});
+            f(Gpu::LaunchHandler<MT, T, dim>{
+                IntVectND<1>{bx}, nblocks, smem.data()});
         }
     } else if constexpr(dim == 2) {
         for (int by=0; by < nblocks[1]; ++by) {
             for (int bx=0; bx < nblocks[0]; ++bx) {
-                f(Gpu::LaunchHandler<MT, T, dim>{{bx, by}, nblocks, smem.data()});
+                f(Gpu::LaunchHandler<MT, T, dim>{
+                    IntVectND<2>{bx, by}, nblocks, smem.data()});
             }
         }
     } else {
         for (int bz=0; bz < nblocks[2]; ++bz) {
             for (int by=0; by < nblocks[1]; ++by) {
                 for (int bx=0; bx < nblocks[0]; ++bx) {
-                    f(Gpu::LaunchHandler<MT, T, dim>{{bx, by, bz}, nblocks, smem.data()});
+                    f(Gpu::LaunchHandler<MT, T, dim>{
+                        IntVectND<3>{bx, by, bz}, nblocks, smem.data()});
                 }
             }
         }

From aa9bbead67ada28e008d2b8ecfb6efba61d97596 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Wed, 11 Mar 2026 16:58:30 +0100
Subject: [PATCH 28/46] fix constructor 2

---
 Src/Base/AMReX_GpuTypes.H | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index 4b666a5847f..f031f170d15 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -231,7 +231,7 @@ struct LaunchHandler
     static constexpr IntVectND<dim> blockDimND () {
         static_assert(sizeof...(nd_block_size) == dim &&
                       (1 * ... * nd_block_size) == threads_per_block);
-        return {nd_block_size...};
+        return IntVectND<dim>{nd_block_size...};
     }
 
     /**

From 4bb24ae28c9e3ebc4f57fa4e5031e8c4b71dca46 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Wed, 11 Mar 2026 17:16:46 +0100
Subject: [PATCH 29/46] test

---
 Src/Base/AMReX_Reduce.H  | 18 +++++++++---------
 Tests/LaunchRaw/main.cpp | 30 +++++++++++++++---------------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H
index db006dd71ab..83e48ebe7d2 100644
--- a/Src/Base/AMReX_Reduce.H
+++ b/Src/Base/AMReX_Reduce.H
@@ -526,19 +526,19 @@ namespace Reduce::detail {
 
     template <typename F, int dim>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-    auto call_f_intvect_box (F const& f, IntVectND<dim> iv, IndexTypeND<dim>) noexcept ->
-        decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
+    auto call_f_intvect_box (F const& f, IntVectND<dim> iv, IndexTypeND<dim>) noexcept //->
+        // decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
     {
         return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
     }
 
-    template <typename F, int dim>
-    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-    auto call_f_intvect_box (F const& f, IntVectND<dim> iv, IndexTypeND<dim> t) noexcept ->
-        decltype(f(BoxND<dim>(iv, iv, t)))
-    {
-        return f(BoxND<dim>(iv, iv, t));
-    }
+    // template <typename F, int dim>
+    // AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    // auto call_f_intvect_box (F const& f, IntVectND<dim> iv, IndexTypeND<dim> t) noexcept ->
+    //     decltype(f(BoxND<dim>(iv, iv, t)))
+    // {
+    //     return f(BoxND<dim>(iv, iv, t));
+    // }
 
     // call_f_intvect_n
     template <typename F, typename T, int dim>
diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp
index eb53f64971d..68b9366dcea 100644
--- a/Tests/LaunchRaw/main.cpp
+++ b/Tests/LaunchRaw/main.cpp
@@ -22,17 +22,17 @@ void test1d () {
     auto * data = vect.dataPtr();
 
     LaunchRaw<num_threads>(num_blocks,
-        [=](auto lh){
+        [=] AMREX_GPU_DEVICE (auto lh) {
             data[lh.globalIdx1D()] = lh.blockIdx1D();
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [=](auto lh){
+        [=] AMREX_GPU_DEVICE (auto lh) {
             data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D();
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [=](auto lh){
+        [=] AMREX_GPU_DEVICE (auto lh) {
             auto block = lh.blockIdxND();
             auto thread = lh.template threadIdxND<blockdim[0]>();
             auto tmp = data[
@@ -43,7 +43,7 @@ void test1d () {
         });
 
     LaunchRaw<num_threads, int>(num_blocks, num_threads,
-        [=](auto lh){
+        [=] AMREX_GPU_DEVICE (auto lh) {
             auto smem = lh.shared_memory();
             auto thread = lh.template threadIdxND<blockdim[0]>();
             auto locid = thread[0];
@@ -53,7 +53,7 @@ void test1d () {
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [=](auto lh){
+        [=] AMREX_GPU_DEVICE (auto lh) {
             data[lh.globalIdx1D()] =
                 data[lh.globalIdx1D()] == static_cast<int>(lh.blockIdx1D() + lh.threadIdx1D());
         });
@@ -76,17 +76,17 @@ void test2d () {
     auto * data = vect.dataPtr();
 
     LaunchRaw<num_threads>(num_blocks,
-        [=](auto lh){
+        [=] AMREX_GPU_DEVICE (auto lh) {
             data[lh.globalIdx1D()] = lh.blockIdx1D();
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [=](auto lh){
+        [=] AMREX_GPU_DEVICE (auto lh) {
             data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D();
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [=](auto lh){
+        [=] AMREX_GPU_DEVICE (auto lh) {
             auto block = lh.blockIdxND();
             auto thread = lh.template threadIdxND<blockdim[0], blockdim[1]>();
             auto tmp = data[
@@ -98,7 +98,7 @@ void test2d () {
         });
 
     LaunchRaw<num_threads, int>(num_blocks, num_threads,
-        [=](auto lh){
+        [=] AMREX_GPU_DEVICE (auto lh) {
             auto smem = lh.shared_memory();
             auto thread1 = lh.template threadIdxND<blockdim[1], blockdim[0]>();
             auto locid1 = thread1[1] + thread1[0] * blockdim[0];
@@ -110,7 +110,7 @@ void test2d () {
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [=](auto lh){
+        [=] AMREX_GPU_DEVICE (auto lh) {
             data[lh.globalIdx1D()] =
                 data[lh.globalIdx1D()] == static_cast<int>(lh.blockIdx1D() + lh.threadIdx1D());
         });
@@ -133,17 +133,17 @@ void test3d () {
     auto * data = vect.dataPtr();
 
     LaunchRaw<num_threads>(num_blocks,
-        [=](auto lh){
+        [=] AMREX_GPU_DEVICE (auto lh) {
             data[lh.globalIdx1D()] = lh.blockIdx1D();
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [=](auto lh){
+        [=] AMREX_GPU_DEVICE (auto lh) {
             data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D();
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [=](auto lh){
+        [=] AMREX_GPU_DEVICE (auto lh) {
             auto block = lh.blockIdxND();
             auto thread = lh.template threadIdxND<blockdim[0], blockdim[1], blockdim[2]>();
             auto tmp = data[
@@ -157,7 +157,7 @@ void test3d () {
         });
 
     LaunchRaw<num_threads, int>(num_blocks, num_threads,
-        [=](auto lh){
+        [=] AMREX_GPU_DEVICE (auto lh) {
             auto smem = lh.shared_memory();
             auto thread1 = lh.template threadIdxND<blockdim[2], blockdim[1], blockdim[0]>();
             auto locid1 = thread1[2] + thread1[1] * blockdim[0] +
@@ -171,7 +171,7 @@ void test3d () {
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [=](auto lh){
+        [=] AMREX_GPU_DEVICE (auto lh) {
             data[lh.globalIdx1D()] =
                 data[lh.globalIdx1D()] == static_cast<int>(lh.blockIdx1D() + lh.threadIdx1D());
         });

From 8bcefc0097c211aed038e3e77015a02ba08ca590 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Wed, 11 Mar 2026 17:48:02 +0100
Subject: [PATCH 30/46] test2

---
 Src/Base/AMReX_Reduce.H | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H
index 83e48ebe7d2..cd042dbfa5c 100644
--- a/Src/Base/AMReX_Reduce.H
+++ b/Src/Base/AMReX_Reduce.H
@@ -526,19 +526,19 @@ namespace Reduce::detail {
 
     template <typename F, int dim>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-    auto call_f_intvect_box (F const& f, IntVectND<dim> iv, IndexTypeND<dim>) noexcept //->
-        // decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
+    auto call_f_intvect_box (F const& f, IntVectND<dim> iv, IndexTypeND<dim>) noexcept ->
+        decltype(amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv))
     {
         return amrex::detail::call_f_intvect_inner(std::make_index_sequence<dim>(), f, iv);
     }
 
-    // template <typename F, int dim>
-    // AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-    // auto call_f_intvect_box (F const& f, IntVectND<dim> iv, IndexTypeND<dim> t) noexcept ->
-    //     decltype(f(BoxND<dim>(iv, iv, t)))
-    // {
-    //     return f(BoxND<dim>(iv, iv, t));
-    // }
+    template <typename F, int dim>
+    AMREX_GPU_DEVICE AMREX_FORCE_INLINE
+    auto call_f_intvect_box (F const& f, IntVectND<dim> iv, IndexTypeND<dim> t) noexcept ->
+        decltype(f(BoxND<dim>(iv, iv, t)))
+    {
+        return f(BoxND<dim>(iv, iv, t));
+    }
 
     // call_f_intvect_n
     template <typename F, typename T, int dim>
@@ -652,7 +652,9 @@ public:
         amrex::LaunchRaw<AMREX_GPU_MAX_THREADS, unsigned long long>(
             amrex::IntVectND<1>{static_cast<int>(nblocks_ec)},
             AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0),
-            [=] AMREX_GPU_DEVICE (auto lh) noexcept
+            [=] AMREX_GPU_DEVICE (
+                amrex::LaunchHandler<AMREX_GPU_MAX_THREADS, unsigned long long, 1> lh
+            ) noexcept
         {
             ReduceTuple r;
             Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);

From cb9a3fdef2c550d7949a0707ea5d4b2959a0cd71 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Wed, 11 Mar 2026 17:51:31 +0100
Subject: [PATCH 31/46] fix

---
 Src/Base/AMReX_Reduce.H  | 2 +-
 Tests/LaunchRaw/main.cpp | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H
index cd042dbfa5c..1eac0423616 100644
--- a/Src/Base/AMReX_Reduce.H
+++ b/Src/Base/AMReX_Reduce.H
@@ -653,7 +653,7 @@ public:
             amrex::IntVectND<1>{static_cast<int>(nblocks_ec)},
             AMREX_IF_SYCL(Gpu::Device::warp_size) AMREX_IF_NOT_SYCL(0),
             [=] AMREX_GPU_DEVICE (
-                amrex::LaunchHandler<AMREX_GPU_MAX_THREADS, unsigned long long, 1> lh
+                Gpu::LaunchHandler<AMREX_GPU_MAX_THREADS, unsigned long long, 1> lh
             ) noexcept
         {
             ReduceTuple r;
diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp
index 68b9366dcea..231975fde4f 100644
--- a/Tests/LaunchRaw/main.cpp
+++ b/Tests/LaunchRaw/main.cpp
@@ -17,7 +17,7 @@ void test1d () {
 #endif
     static constexpr int num_threads = blockdim[0];
 
-    Gpu::DeviceVector<int> vect(num_threads * num_blocks[0], -999);
+    Gpu::DeviceVector<int> vect(static_cast<std::size_t>(num_threads) * num_blocks[0], -999);
 
     auto * data = vect.dataPtr();
 
@@ -71,7 +71,8 @@ void test2d () {
 #endif
     static constexpr int num_threads = blockdim[0] * blockdim[1];
 
-    Gpu::DeviceVector<int> vect(num_threads * num_blocks[0] * num_blocks[1], -999);
+    Gpu::DeviceVector<int> vect(static_cast<std::size_t>(num_threads)
+        * num_blocks[0] * num_blocks[1], -999);
 
     auto * data = vect.dataPtr();
 
@@ -128,7 +129,8 @@ void test3d () {
 #endif
     static constexpr int num_threads = blockdim[0] * blockdim[1] * blockdim[2];
 
-    Gpu::DeviceVector<int> vect(num_threads * num_blocks[0] * num_blocks[1] * num_blocks[2], -999);
+    Gpu::DeviceVector<int> vect(static_cast<std::size_t>(num_threads)
+         * num_blocks[0] * num_blocks[1] * num_blocks[2], -999);
 
     auto * data = vect.dataPtr();
 

From d4566ec4802f89d9378ff419593398cec9181cb5 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Wed, 11 Mar 2026 18:11:33 +0100
Subject: [PATCH 32/46] remove static

---
 Tests/LaunchRaw/main.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp
index 231975fde4f..ac125be7edd 100644
--- a/Tests/LaunchRaw/main.cpp
+++ b/Tests/LaunchRaw/main.cpp
@@ -11,11 +11,11 @@ void test1d () {
 
     const IntVectND<1> num_blocks {31};
 #ifdef AMREX_USE_GPU
-    static constexpr IntVectND<1> blockdim {256};
+    constexpr IntVectND<1> blockdim {256};
 #else
-    static constexpr IntVectND<1> blockdim {1};
+    constexpr IntVectND<1> blockdim {1};
 #endif
-    static constexpr int num_threads = blockdim[0];
+    constexpr int num_threads = blockdim[0];
 
     Gpu::DeviceVector<int> vect(static_cast<std::size_t>(num_threads) * num_blocks[0], -999);
 
@@ -65,11 +65,11 @@ void test2d () {
 
     const IntVectND<2> num_blocks {31, 23};
 #ifdef AMREX_USE_GPU
-    static constexpr IntVectND<2> blockdim {8, 32};
+    constexpr IntVectND<2> blockdim {8, 32};
 #else
-    static constexpr IntVectND<2> blockdim {1, 1};
+    constexpr IntVectND<2> blockdim {1, 1};
 #endif
-    static constexpr int num_threads = blockdim[0] * blockdim[1];
+    constexpr int num_threads = blockdim[0] * blockdim[1];
 
     Gpu::DeviceVector<int> vect(static_cast<std::size_t>(num_threads)
         * num_blocks[0] * num_blocks[1], -999);
@@ -123,11 +123,11 @@ void test3d () {
 
     const IntVectND<3> num_blocks {31, 23, 11};
 #ifdef AMREX_USE_GPU
-    static constexpr IntVectND<3> blockdim {2, 8, 16};
+    constexpr IntVectND<3> blockdim {2, 8, 16};
 #else
-    static constexpr IntVectND<3> blockdim {1, 1, 1};
+    constexpr IntVectND<3> blockdim {1, 1, 1};
 #endif
-    static constexpr int num_threads = blockdim[0] * blockdim[1] * blockdim[2];
+    constexpr int num_threads = blockdim[0] * blockdim[1] * blockdim[2];
 
     Gpu::DeviceVector<int> vect(static_cast<std::size_t>(num_threads)
          * num_blocks[0] * num_blocks[1] * num_blocks[2], -999);

From 13b8a4248994805a408d423c2485bbe01d30f725 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Wed, 11 Mar 2026 18:25:07 +0100
Subject: [PATCH 33/46] fix27

---
 Tests/LaunchRaw/main.cpp | 58 ++++++++++++++++++++++------------------
 1 file changed, 32 insertions(+), 26 deletions(-)

diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp
index ac125be7edd..4b5ced4d1d4 100644
--- a/Tests/LaunchRaw/main.cpp
+++ b/Tests/LaunchRaw/main.cpp
@@ -11,11 +11,11 @@ void test1d () {
 
     const IntVectND<1> num_blocks {31};
 #ifdef AMREX_USE_GPU
-    constexpr IntVectND<1> blockdim {256};
+    static constexpr int blockdim_x = 256;
 #else
-    constexpr IntVectND<1> blockdim {1};
+    static constexpr int blockdim_x = 1;
 #endif
-    constexpr int num_threads = blockdim[0];
+    static constexpr int num_threads = blockdim_x;
 
     Gpu::DeviceVector<int> vect(static_cast<std::size_t>(num_threads) * num_blocks[0], -999);
 
@@ -34,7 +34,7 @@ void test1d () {
     LaunchRaw<num_threads>(num_blocks,
         [=] AMREX_GPU_DEVICE (auto lh) {
             auto block = lh.blockIdxND();
-            auto thread = lh.template threadIdxND<blockdim[0]>();
+            auto thread = lh.template threadIdxND<blockdim_x>();
             auto tmp = data[
                 block[0] * num_threads + thread[0]
             ];
@@ -45,7 +45,7 @@ void test1d () {
     LaunchRaw<num_threads, int>(num_blocks, num_threads,
         [=] AMREX_GPU_DEVICE (auto lh) {
             auto smem = lh.shared_memory();
-            auto thread = lh.template threadIdxND<blockdim[0]>();
+            auto thread = lh.template threadIdxND<blockdim_x>();
             auto locid = thread[0];
             smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid];
             lh.syncthreads();
@@ -65,11 +65,13 @@ void test2d () {
 
     const IntVectND<2> num_blocks {31, 23};
 #ifdef AMREX_USE_GPU
-    constexpr IntVectND<2> blockdim {8, 32};
+    static constexpr int blockdim_x = 8;
+    static constexpr int blockdim_y = 32;
 #else
-    constexpr IntVectND<2> blockdim {1, 1};
+    static constexpr int blockdim_x = 1;
+    static constexpr int blockdim_y = 1;
 #endif
-    constexpr int num_threads = blockdim[0] * blockdim[1];
+    static constexpr int num_threads = blockdim_x * blockdim_y;
 
     Gpu::DeviceVector<int> vect(static_cast<std::size_t>(num_threads)
         * num_blocks[0] * num_blocks[1], -999);
@@ -89,10 +91,10 @@ void test2d () {
     LaunchRaw<num_threads>(num_blocks,
         [=] AMREX_GPU_DEVICE (auto lh) {
             auto block = lh.blockIdxND();
-            auto thread = lh.template threadIdxND<blockdim[0], blockdim[1]>();
+            auto thread = lh.template threadIdxND<blockdim_x, blockdim_y>();
             auto tmp = data[
                 (block[0] + block[1] * num_blocks[0]) * num_threads +
-                thread[1] + thread[0] * blockdim[1]
+                thread[1] + thread[0] * blockdim_y
             ];
             lh.syncthreads();
             data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp;
@@ -101,10 +103,10 @@ void test2d () {
     LaunchRaw<num_threads, int>(num_blocks, num_threads,
         [=] AMREX_GPU_DEVICE (auto lh) {
             auto smem = lh.shared_memory();
-            auto thread1 = lh.template threadIdxND<blockdim[1], blockdim[0]>();
-            auto locid1 = thread1[1] + thread1[0] * blockdim[0];
-            auto thread2 = lh.template threadIdxND<blockdim[0], blockdim[1]>();
-            auto locid2 = thread2[1] + thread2[0] * blockdim[1];
+            auto thread1 = lh.template threadIdxND<blockdim_y, blockdim_x>();
+            auto locid1 = thread1[1] + thread1[0] * blockdim_x;
+            auto thread2 = lh.template threadIdxND<blockdim_x, blockdim_y>();
+            auto locid2 = thread2[1] + thread2[0] * blockdim_y;
             smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid1];
             lh.syncthreads();
             data[lh.blockIdx1D() * lh.blockDim1D() + locid2] = smem[locid2];
@@ -123,11 +125,15 @@ void test3d () {
 
     const IntVectND<3> num_blocks {31, 23, 11};
 #ifdef AMREX_USE_GPU
-    constexpr IntVectND<3> blockdim {2, 8, 16};
+    static constexpr int blockdim_x = 2;
+    static constexpr int blockdim_y = 8;
+    static constexpr int blockdim_z = 16;
 #else
-    constexpr IntVectND<3> blockdim {1, 1, 1};
+    static constexpr int blockdim_x = 1;
+    static constexpr int blockdim_y = 1;
+    static constexpr int blockdim_z = 1;
 #endif
-    constexpr int num_threads = blockdim[0] * blockdim[1] * blockdim[2];
+    static constexpr int num_threads = blockdim_x * blockdim_y * blockdim_z;
 
     Gpu::DeviceVector<int> vect(static_cast<std::size_t>(num_threads)
          * num_blocks[0] * num_blocks[1] * num_blocks[2], -999);
@@ -147,12 +153,12 @@ void test3d () {
     LaunchRaw<num_threads>(num_blocks,
         [=] AMREX_GPU_DEVICE (auto lh) {
             auto block = lh.blockIdxND();
-            auto thread = lh.template threadIdxND<blockdim[0], blockdim[1], blockdim[2]>();
+            auto thread = lh.template threadIdxND<blockdim_x, blockdim_y, blockdim_z>();
             auto tmp = data[
                 (block[0] + block[1] * num_blocks[0] +
                 block[2] * num_blocks[0] * num_blocks[1]) * num_threads +
-                thread[2] + thread[1] * blockdim[2] +
-                thread[0] * blockdim[2] * blockdim[1]
+                thread[2] + thread[1] * blockdim_z +
+                thread[0] * blockdim_z * blockdim_y
             ];
             lh.syncthreads();
             data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp;
@@ -161,12 +167,12 @@ void test3d () {
     LaunchRaw<num_threads, int>(num_blocks, num_threads,
         [=] AMREX_GPU_DEVICE (auto lh) {
             auto smem = lh.shared_memory();
-            auto thread1 = lh.template threadIdxND<blockdim[2], blockdim[1], blockdim[0]>();
-            auto locid1 = thread1[2] + thread1[1] * blockdim[0] +
-                          thread1[0] * blockdim[0] * blockdim[1];
-            auto thread2 = lh.template threadIdxND<blockdim[2], blockdim[0], blockdim[1]>();
-            auto locid2 = thread2[0] + thread2[2] * blockdim[2] +
-                          thread2[1] * blockdim[2] * blockdim[1];
+            auto thread1 = lh.template threadIdxND<blockdim_z, blockdim_y, blockdim_x>();
+            auto locid1 = thread1[2] + thread1[1] * blockdim_x +
+                          thread1[0] * blockdim_x * blockdim_y;
+            auto thread2 = lh.template threadIdxND<blockdim_z, blockdim_x, blockdim_y>();
+            auto locid2 = thread2[0] + thread2[2] * blockdim_z +
+                          thread2[1] * blockdim_z * blockdim_y;
             smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid1];
             lh.syncthreads();
             data[lh.blockIdx1D() * lh.blockDim1D() + locid2] = smem[locid2];

From 528d8dc9857a3276b26cc63478506bc264d054e0 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Thu, 12 Mar 2026 10:08:05 +0100
Subject: [PATCH 34/46] try fix MSVC

---
 Tests/LaunchRaw/main.cpp | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp
index 4b5ced4d1d4..b2ad5fb8481 100644
--- a/Tests/LaunchRaw/main.cpp
+++ b/Tests/LaunchRaw/main.cpp
@@ -22,17 +22,17 @@ void test1d () {
     auto * data = vect.dataPtr();
 
     LaunchRaw<num_threads>(num_blocks,
-        [=] AMREX_GPU_DEVICE (auto lh) {
+        [data] AMREX_GPU_DEVICE (auto lh) {
             data[lh.globalIdx1D()] = lh.blockIdx1D();
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [=] AMREX_GPU_DEVICE (auto lh) {
+        [data] AMREX_GPU_DEVICE (auto lh) {
             data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D();
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [=] AMREX_GPU_DEVICE (auto lh) {
+        [data] AMREX_GPU_DEVICE (auto lh) {
             auto block = lh.blockIdxND();
             auto thread = lh.template threadIdxND<blockdim_x>();
             auto tmp = data[
@@ -43,7 +43,7 @@ void test1d () {
         });
 
     LaunchRaw<num_threads, int>(num_blocks, num_threads,
-        [=] AMREX_GPU_DEVICE (auto lh) {
+        [data] AMREX_GPU_DEVICE (auto lh) {
             auto smem = lh.shared_memory();
             auto thread = lh.template threadIdxND<blockdim_x>();
             auto locid = thread[0];
@@ -53,7 +53,7 @@ void test1d () {
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [=] AMREX_GPU_DEVICE (auto lh) {
+        [data] AMREX_GPU_DEVICE (auto lh) {
             data[lh.globalIdx1D()] =
                 data[lh.globalIdx1D()] == static_cast<int>(lh.blockIdx1D() + lh.threadIdx1D());
         });
@@ -79,17 +79,17 @@ void test2d () {
     auto * data = vect.dataPtr();
 
     LaunchRaw<num_threads>(num_blocks,
-        [=] AMREX_GPU_DEVICE (auto lh) {
+        [data] AMREX_GPU_DEVICE (auto lh) {
             data[lh.globalIdx1D()] = lh.blockIdx1D();
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [=] AMREX_GPU_DEVICE (auto lh) {
+        [data] AMREX_GPU_DEVICE (auto lh) {
             data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D();
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [=] AMREX_GPU_DEVICE (auto lh) {
+        [data, num_blocks] AMREX_GPU_DEVICE (auto lh) {
             auto block = lh.blockIdxND();
             auto thread = lh.template threadIdxND<blockdim_x, blockdim_y>();
             auto tmp = data[
@@ -101,7 +101,7 @@ void test2d () {
         });
 
     LaunchRaw<num_threads, int>(num_blocks, num_threads,
-        [=] AMREX_GPU_DEVICE (auto lh) {
+        [data] AMREX_GPU_DEVICE (auto lh) {
             auto smem = lh.shared_memory();
             auto thread1 = lh.template threadIdxND<blockdim_y, blockdim_x>();
             auto locid1 = thread1[1] + thread1[0] * blockdim_x;
@@ -113,7 +113,7 @@ void test2d () {
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [=] AMREX_GPU_DEVICE (auto lh) {
+        [data] AMREX_GPU_DEVICE (auto lh) {
             data[lh.globalIdx1D()] =
                 data[lh.globalIdx1D()] == static_cast<int>(lh.blockIdx1D() + lh.threadIdx1D());
         });
@@ -141,17 +141,17 @@ void test3d () {
     auto * data = vect.dataPtr();
 
     LaunchRaw<num_threads>(num_blocks,
-        [=] AMREX_GPU_DEVICE (auto lh) {
+        [data] AMREX_GPU_DEVICE (auto lh) {
             data[lh.globalIdx1D()] = lh.blockIdx1D();
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [=] AMREX_GPU_DEVICE (auto lh) {
+        [data] AMREX_GPU_DEVICE (auto lh) {
             data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D();
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [=] AMREX_GPU_DEVICE (auto lh) {
+        [data, num_blocks] AMREX_GPU_DEVICE (auto lh) {
             auto block = lh.blockIdxND();
             auto thread = lh.template threadIdxND<blockdim_x, blockdim_y, blockdim_z>();
             auto tmp = data[
@@ -165,7 +165,7 @@ void test3d () {
         });
 
     LaunchRaw<num_threads, int>(num_blocks, num_threads,
-        [=] AMREX_GPU_DEVICE (auto lh) {
+        [data] AMREX_GPU_DEVICE (auto lh) {
             auto smem = lh.shared_memory();
             auto thread1 = lh.template threadIdxND<blockdim_z, blockdim_y, blockdim_x>();
             auto locid1 = thread1[2] + thread1[1] * blockdim_x +
@@ -179,7 +179,7 @@ void test3d () {
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [=] AMREX_GPU_DEVICE (auto lh) {
+        [data] AMREX_GPU_DEVICE (auto lh) {
             data[lh.globalIdx1D()] =
                 data[lh.globalIdx1D()] == static_cast<int>(lh.blockIdx1D() + lh.threadIdx1D());
         });

From e13206d115d43c49ebdc5717eddec3c7e610e680 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Thu, 12 Mar 2026 11:17:23 +0100
Subject: [PATCH 35/46] try fix MSVC 2

---
 Tests/LaunchRaw/main.cpp | 88 +++++++++++++++++++---------------------
 1 file changed, 41 insertions(+), 47 deletions(-)

diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp
index b2ad5fb8481..231975fde4f 100644
--- a/Tests/LaunchRaw/main.cpp
+++ b/Tests/LaunchRaw/main.cpp
@@ -11,30 +11,30 @@ void test1d () {
 
     const IntVectND<1> num_blocks {31};
 #ifdef AMREX_USE_GPU
-    static constexpr int blockdim_x = 256;
+    static constexpr IntVectND<1> blockdim {256};
 #else
-    static constexpr int blockdim_x = 1;
+    static constexpr IntVectND<1> blockdim {1};
 #endif
-    static constexpr int num_threads = blockdim_x;
+    static constexpr int num_threads = blockdim[0];
 
     Gpu::DeviceVector<int> vect(static_cast<std::size_t>(num_threads) * num_blocks[0], -999);
 
     auto * data = vect.dataPtr();
 
     LaunchRaw<num_threads>(num_blocks,
-        [data] AMREX_GPU_DEVICE (auto lh) {
+        [=] AMREX_GPU_DEVICE (auto lh) {
             data[lh.globalIdx1D()] = lh.blockIdx1D();
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [data] AMREX_GPU_DEVICE (auto lh) {
+        [=] AMREX_GPU_DEVICE (auto lh) {
             data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D();
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [data] AMREX_GPU_DEVICE (auto lh) {
+        [=] AMREX_GPU_DEVICE (auto lh) {
             auto block = lh.blockIdxND();
-            auto thread = lh.template threadIdxND<blockdim_x>();
+            auto thread = lh.template threadIdxND<blockdim[0]>();
             auto tmp = data[
                 block[0] * num_threads + thread[0]
             ];
@@ -43,9 +43,9 @@ void test1d () {
         });
 
     LaunchRaw<num_threads, int>(num_blocks, num_threads,
-        [data] AMREX_GPU_DEVICE (auto lh) {
+        [=] AMREX_GPU_DEVICE (auto lh) {
             auto smem = lh.shared_memory();
-            auto thread = lh.template threadIdxND<blockdim_x>();
+            auto thread = lh.template threadIdxND<blockdim[0]>();
             auto locid = thread[0];
             smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid];
             lh.syncthreads();
@@ -53,7 +53,7 @@ void test1d () {
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [data] AMREX_GPU_DEVICE (auto lh) {
+        [=] AMREX_GPU_DEVICE (auto lh) {
             data[lh.globalIdx1D()] =
                 data[lh.globalIdx1D()] == static_cast<int>(lh.blockIdx1D() + lh.threadIdx1D());
         });
@@ -65,13 +65,11 @@ void test2d () {
 
     const IntVectND<2> num_blocks {31, 23};
 #ifdef AMREX_USE_GPU
-    static constexpr int blockdim_x = 8;
-    static constexpr int blockdim_y = 32;
+    static constexpr IntVectND<2> blockdim {8, 32};
 #else
-    static constexpr int blockdim_x = 1;
-    static constexpr int blockdim_y = 1;
+    static constexpr IntVectND<2> blockdim {1, 1};
 #endif
-    static constexpr int num_threads = blockdim_x * blockdim_y;
+    static constexpr int num_threads = blockdim[0] * blockdim[1];
 
     Gpu::DeviceVector<int> vect(static_cast<std::size_t>(num_threads)
         * num_blocks[0] * num_blocks[1], -999);
@@ -79,41 +77,41 @@ void test2d () {
     auto * data = vect.dataPtr();
 
     LaunchRaw<num_threads>(num_blocks,
-        [data] AMREX_GPU_DEVICE (auto lh) {
+        [=] AMREX_GPU_DEVICE (auto lh) {
             data[lh.globalIdx1D()] = lh.blockIdx1D();
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [data] AMREX_GPU_DEVICE (auto lh) {
+        [=] AMREX_GPU_DEVICE (auto lh) {
             data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D();
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [data, num_blocks] AMREX_GPU_DEVICE (auto lh) {
+        [=] AMREX_GPU_DEVICE (auto lh) {
             auto block = lh.blockIdxND();
-            auto thread = lh.template threadIdxND<blockdim_x, blockdim_y>();
+            auto thread = lh.template threadIdxND<blockdim[0], blockdim[1]>();
             auto tmp = data[
                 (block[0] + block[1] * num_blocks[0]) * num_threads +
-                thread[1] + thread[0] * blockdim_y
+                thread[1] + thread[0] * blockdim[1]
             ];
             lh.syncthreads();
             data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp;
         });
 
     LaunchRaw<num_threads, int>(num_blocks, num_threads,
-        [data] AMREX_GPU_DEVICE (auto lh) {
+        [=] AMREX_GPU_DEVICE (auto lh) {
             auto smem = lh.shared_memory();
-            auto thread1 = lh.template threadIdxND<blockdim_y, blockdim_x>();
-            auto locid1 = thread1[1] + thread1[0] * blockdim_x;
-            auto thread2 = lh.template threadIdxND<blockdim_x, blockdim_y>();
-            auto locid2 = thread2[1] + thread2[0] * blockdim_y;
+            auto thread1 = lh.template threadIdxND<blockdim[1], blockdim[0]>();
+            auto locid1 = thread1[1] + thread1[0] * blockdim[0];
+            auto thread2 = lh.template threadIdxND<blockdim[0], blockdim[1]>();
+            auto locid2 = thread2[1] + thread2[0] * blockdim[1];
             smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid1];
             lh.syncthreads();
             data[lh.blockIdx1D() * lh.blockDim1D() + locid2] = smem[locid2];
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [data] AMREX_GPU_DEVICE (auto lh) {
+        [=] AMREX_GPU_DEVICE (auto lh) {
             data[lh.globalIdx1D()] =
                 data[lh.globalIdx1D()] == static_cast<int>(lh.blockIdx1D() + lh.threadIdx1D());
         });
@@ -125,15 +123,11 @@ void test3d () {
 
     const IntVectND<3> num_blocks {31, 23, 11};
 #ifdef AMREX_USE_GPU
-    static constexpr int blockdim_x = 2;
-    static constexpr int blockdim_y = 8;
-    static constexpr int blockdim_z = 16;
+    static constexpr IntVectND<3> blockdim {2, 8, 16};
 #else
-    static constexpr int blockdim_x = 1;
-    static constexpr int blockdim_y = 1;
-    static constexpr int blockdim_z = 1;
+    static constexpr IntVectND<3> blockdim {1, 1, 1};
 #endif
-    static constexpr int num_threads = blockdim_x * blockdim_y * blockdim_z;
+    static constexpr int num_threads = blockdim[0] * blockdim[1] * blockdim[2];
 
     Gpu::DeviceVector<int> vect(static_cast<std::size_t>(num_threads)
          * num_blocks[0] * num_blocks[1] * num_blocks[2], -999);
@@ -141,45 +135,45 @@ void test3d () {
     auto * data = vect.dataPtr();
 
     LaunchRaw<num_threads>(num_blocks,
-        [data] AMREX_GPU_DEVICE (auto lh) {
+        [=] AMREX_GPU_DEVICE (auto lh) {
             data[lh.globalIdx1D()] = lh.blockIdx1D();
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [data] AMREX_GPU_DEVICE (auto lh) {
+        [=] AMREX_GPU_DEVICE (auto lh) {
             data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] += lh.threadIdx1D();
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [data, num_blocks] AMREX_GPU_DEVICE (auto lh) {
+        [=] AMREX_GPU_DEVICE (auto lh) {
             auto block = lh.blockIdxND();
-            auto thread = lh.template threadIdxND<blockdim_x, blockdim_y, blockdim_z>();
+            auto thread = lh.template threadIdxND<blockdim[0], blockdim[1], blockdim[2]>();
             auto tmp = data[
                 (block[0] + block[1] * num_blocks[0] +
                 block[2] * num_blocks[0] * num_blocks[1]) * num_threads +
-                thread[2] + thread[1] * blockdim_z +
-                thread[0] * blockdim_z * blockdim_y
+                thread[2] + thread[1] * blockdim[2] +
+                thread[0] * blockdim[2] * blockdim[1]
             ];
             lh.syncthreads();
             data[lh.blockIdx1D() * lh.blockDim1D() + lh.threadIdx1D()] = tmp;
         });
 
     LaunchRaw<num_threads, int>(num_blocks, num_threads,
-        [data] AMREX_GPU_DEVICE (auto lh) {
+        [=] AMREX_GPU_DEVICE (auto lh) {
             auto smem = lh.shared_memory();
-            auto thread1 = lh.template threadIdxND<blockdim_z, blockdim_y, blockdim_x>();
-            auto locid1 = thread1[2] + thread1[1] * blockdim_x +
-                          thread1[0] * blockdim_x * blockdim_y;
-            auto thread2 = lh.template threadIdxND<blockdim_z, blockdim_x, blockdim_y>();
-            auto locid2 = thread2[0] + thread2[2] * blockdim_z +
-                          thread2[1] * blockdim_z * blockdim_y;
+            auto thread1 = lh.template threadIdxND<blockdim[2], blockdim[1], blockdim[0]>();
+            auto locid1 = thread1[2] + thread1[1] * blockdim[0] +
+                          thread1[0] * blockdim[0] * blockdim[1];
+            auto thread2 = lh.template threadIdxND<blockdim[2], blockdim[0], blockdim[1]>();
+            auto locid2 = thread2[0] + thread2[2] * blockdim[2] +
+                          thread2[1] * blockdim[2] * blockdim[1];
             smem[lh.threadIdx1D()] = data[lh.blockIdx1D() * lh.blockDim1D() + locid1];
             lh.syncthreads();
             data[lh.blockIdx1D() * lh.blockDim1D() + locid2] = smem[locid2];
         });
 
     LaunchRaw<num_threads>(num_blocks,
-        [data] AMREX_GPU_DEVICE (auto lh) {
+        [=] AMREX_GPU_DEVICE (auto lh) {
             data[lh.globalIdx1D()] =
                 data[lh.globalIdx1D()] == static_cast<int>(lh.blockIdx1D() + lh.threadIdx1D());
         });

From 72e5d88bc18af7d3bb46a06848c6d0d14dd62410 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Thu, 12 Mar 2026 11:31:23 +0100
Subject: [PATCH 36/46] try fix clang

---
 Tests/LaunchRaw/main.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp
index 231975fde4f..b34b590e530 100644
--- a/Tests/LaunchRaw/main.cpp
+++ b/Tests/LaunchRaw/main.cpp
@@ -11,9 +11,9 @@ void test1d () {
 
     const IntVectND<1> num_blocks {31};
 #ifdef AMREX_USE_GPU
-    static constexpr IntVectND<1> blockdim {256};
+    static constexpr int blockdim[1] {256};
 #else
-    static constexpr IntVectND<1> blockdim {1};
+    static constexpr int blockdim[1] {1};
 #endif
     static constexpr int num_threads = blockdim[0];
 
@@ -65,9 +65,9 @@ void test2d () {
 
     const IntVectND<2> num_blocks {31, 23};
 #ifdef AMREX_USE_GPU
-    static constexpr IntVectND<2> blockdim {8, 32};
+    static constexpr int blockdim[2] {8, 32};
 #else
-    static constexpr IntVectND<2> blockdim {1, 1};
+    static constexpr int blockdim[2] {1, 1};
 #endif
     static constexpr int num_threads = blockdim[0] * blockdim[1];
 
@@ -123,9 +123,9 @@ void test3d () {
 
     const IntVectND<3> num_blocks {31, 23, 11};
 #ifdef AMREX_USE_GPU
-    static constexpr IntVectND<3> blockdim {2, 8, 16};
+    static constexpr int blockdim[3] {2, 8, 16};
 #else
-    static constexpr IntVectND<3> blockdim {1, 1, 1};
+    static constexpr int blockdim[3] {1, 1, 1};
 #endif
     static constexpr int num_threads = blockdim[0] * blockdim[1] * blockdim[2];
 

From 0c60372a4231b8286efb7d8255d02b8a9311d49a Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Thu, 12 Mar 2026 13:51:00 +0100
Subject: [PATCH 37/46] relax dim constraint for threadIdxND

---
 Src/Base/AMReX_GpuTypes.H | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index f031f170d15..973aeff1ff6 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -137,17 +137,18 @@ struct LaunchHandler
      */
     template<int ...nd_block_size>
     [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-    IntVectND<dim> threadIdxND () const {
-        static_assert(sizeof...(nd_block_size) == dim &&
+    IntVectND<sizeof...(nd_block_size)> threadIdxND () const {
+        static_assert(sizeof...(nd_block_size) <= 3 &&
                       (1 * ... * nd_block_size) == threads_per_block);
-        constexpr IntVectND<dim> iv_block_size = blockDimND<nd_block_size...>();
-        IntVectND<dim> ret(0);
+        constexpr IntVectND<sizeof...(nd_block_size)> iv_block_size =
+            blockDimND<nd_block_size...>();
+        IntVectND<sizeof...(nd_block_size)> ret(0);
         unsigned int idx = threadIdx1D();
-        if constexpr (dim == 3) {
+        if constexpr (sizeof...(nd_block_size) == 3) {
             ret[2] = idx / iv_block_size[1];
             idx = idx - ret[2] * iv_block_size[1];
         }
-        if constexpr (dim >= 2) {
+        if constexpr (sizeof...(nd_block_size) >= 2) {
             ret[1] = idx / iv_block_size[0];
             idx = idx - ret[1] * iv_block_size[0];
         }
@@ -228,10 +229,9 @@ struct LaunchHandler
      */
     template<int ...nd_block_size>
     [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
-    static constexpr IntVectND<dim> blockDimND () {
-        static_assert(sizeof...(nd_block_size) == dim &&
-                      (1 * ... * nd_block_size) == threads_per_block);
-        return IntVectND<dim>{nd_block_size...};
+    static constexpr IntVectND<sizeof...(nd_block_size)> blockDimND () {
+        static_assert((1 * ... * nd_block_size) == threads_per_block);
+        return IntVectND<sizeof...(nd_block_size)>{nd_block_size...};
     }
 
     /**
@@ -248,6 +248,8 @@ struct LaunchHandler
     template<int ...nd_block_size>
     [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     IntVectND<dim> globalIdxND () const {
+        static_assert(sizeof...(nd_block_size) == dim &&
+                      (1 * ... * nd_block_size) == threads_per_block);
         return blockIdxND() * blockDimND<nd_block_size...>() + threadIdxND<nd_block_size...>();
     }
 

From a9f0cfc96248a5a7bed5b2b74b0fa4255b996625 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Sat, 14 Mar 2026 15:43:09 +0100
Subject: [PATCH 38/46] Fix 3D threadIdxND, nd_item* and try guess fix for
 reqd_work_group_size

---
 Src/Base/AMReX_GpuLaunchFunctsG.H | 4 ++--
 Src/Base/AMReX_GpuTypes.H         | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
index 24b58376a58..34ea2ca566f 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsG.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -913,7 +913,7 @@ void LaunchRaw (IntVectND<dim> nblocks, L const& f)
         q.submit([&] (sycl::handler& h) {
             h.parallel_for(sycl::nd_range<dim>(threads_total, threads_per_block),
                 [=] (sycl::nd_item<dim> item)
-                [[sycl::reqd_work_group_size(1, 1, MT)]]
+                [[sycl::reqd_work_group_size(MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
                     f(Gpu::LaunchHandler<MT, void, dim>{
@@ -945,7 +945,7 @@ void LaunchRaw (IntVectND<dim> nblocks, std::size_t shared_mem_elements, L const
             sycl::local_accessor<T> shared_data(sycl::range<1>(shared_mem_elements), h);
             h.parallel_for(sycl::nd_range<dim>(threads_total, threads_per_block),
                 [=] (sycl::nd_item<dim> item)
-                [[sycl::reqd_work_group_size(1, 1, MT)]]
+                [[sycl::reqd_work_group_size(MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
                     f(Gpu::LaunchHandler<MT, T, dim>{
diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index 973aeff1ff6..a2a87a6d5cc 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -145,8 +145,8 @@ struct LaunchHandler
         IntVectND<sizeof...(nd_block_size)> ret(0);
         unsigned int idx = threadIdx1D();
         if constexpr (sizeof...(nd_block_size) == 3) {
-            ret[2] = idx / iv_block_size[1];
-            idx = idx - ret[2] * iv_block_size[1];
+            ret[2] = idx / (iv_block_size[0] * iv_block_size[1]);
+            idx = idx - ret[2] * (iv_block_size[0] * iv_block_size[1]);
         }
         if constexpr (sizeof...(nd_block_size) >= 2) {
             ret[1] = idx / iv_block_size[0];
@@ -302,7 +302,7 @@ struct LaunchHandler
      */
     [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
 #if defined(AMREX_USE_SYCL)
-    sycl::nd_item<dim> sycl_item () const {
+    const sycl::nd_item<dim>* sycl_item () const {
         return m_item;
     }
 #else

From b69038493cd919067fff3ff148f20629140b251b Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Sat, 14 Mar 2026 20:12:23 +0100
Subject: [PATCH 39/46] Add some suggestions from review

---
 Src/Base/AMReX_GpuLaunchFunctsC.H |  6 ++--
 Src/Base/AMReX_GpuLaunchFunctsG.H | 46 +++++++++++++++++++++++++------
 Src/Base/AMReX_GpuTypes.H         |  4 +--
 Src/Particle/AMReX_ParticleUtil.H |  5 ++--
 4 files changed, 46 insertions(+), 15 deletions(-)

diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H
index 6f39150b460..9f88f8de884 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsC.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsC.H
@@ -136,7 +136,8 @@ void launch (T const& n, L&& f) noexcept
 template<int MT, int dim, typename L>
 void LaunchRaw (IntVectND<dim> nblocks, L const& f)
 {
-    static_assert(MT == 1);
+    static_assert(MT == 1, "LaunchRaw with CPU backend only with one thread per block! "
+        "Otherwise the syncthreads function would not work");
     if constexpr(dim == 1) {
         for (int bx=0; bx < nblocks[0]; ++bx) {
             f(Gpu::LaunchHandler<MT, void, dim>{
@@ -164,7 +165,8 @@ void LaunchRaw (IntVectND<dim> nblocks, L const& f)
 template<int MT, typename T, int dim, typename L>
 void LaunchRaw (IntVectND<dim> nblocks, std::size_t shared_mem_elements, L const& f)
 {
-    static_assert(MT == 1);
+    static_assert(MT == 1, "LaunchRaw with CPU backend only with one thread per block! "
+        "Otherwise the syncthreads function would not work");
     std::vector<T> smem(shared_mem_elements);
     if constexpr(dim == 1) {
         for (int bx=0; bx < nblocks[0]; ++bx) {
diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
index 34ea2ca566f..902f0768312 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsG.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -899,6 +899,10 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
 template<int MT, int dim, typename L>
 void LaunchRaw (IntVectND<dim> nblocks, L const& f)
 {
+    detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
+    L const* pf = skdp.template get<0>();
+    amrex::ignore_unused(pf);
+
     auto& q = Gpu::Device::streamQueue();
 
     sycl::range<dim> threads_per_block;
@@ -916,10 +920,17 @@ void LaunchRaw (IntVectND<dim> nblocks, L const& f)
                 [[sycl::reqd_work_group_size(MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
-                    f(Gpu::LaunchHandler<MT, void, dim>{
-                        &item,
-                        nullptr
-                    });
+                    if constexpr (detail::is_big_kernel<L>()) {
+                        (*pf)(Gpu::LaunchHandler<MT, void, dim>{
+                            &item,
+                            nullptr
+                        });
+                    } else {
+                        f(Gpu::LaunchHandler<MT, void, dim>{
+                            &item,
+                            nullptr
+                        });
+                    }
                 });
         });
     } catch (sycl::exception const& ex) {
@@ -930,7 +941,15 @@ void LaunchRaw (IntVectND<dim> nblocks, L const& f)
 template<int MT, typename T, int dim, typename L>
 void LaunchRaw (IntVectND<dim> nblocks, std::size_t shared_mem_elements, L const& f)
 {
+    detail::SyclKernelDevPtr<L> skdp(f, Gpu::gpuStream());
+    L const* pf = skdp.template get<0>();
+    amrex::ignore_unused(pf);
+
     auto& q = Gpu::Device::streamQueue();
+    // use double4 for shared memory as it has the largest alignment of types that might be used
+    using ST = sycl::double4;
+    static_assert(alignof(ST) >= alignof(T));
+    const std::size_t shared_mem_num = (shared_mem_elements*sizeof(T)+sizeof(ST)-1) / sizeof(ST);
 
     sycl::range<dim> threads_per_block;
     sycl::range<dim> threads_total;
@@ -942,16 +961,25 @@ void LaunchRaw (IntVectND<dim> nblocks, std::size_t shared_mem_elements, L const
 
     try {
         q.submit([&] (sycl::handler& h) {
-            sycl::local_accessor<T> shared_data(sycl::range<1>(shared_mem_elements), h);
+            sycl::local_accessor<ST> shared_data(sycl::range<1>(shared_mem_num), h);
             h.parallel_for(sycl::nd_range<dim>(threads_total, threads_per_block),
                 [=] (sycl::nd_item<dim> item)
                 [[sycl::reqd_work_group_size(MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
-                    f(Gpu::LaunchHandler<MT, T, dim>{
-                        &item,
-                        shared_data.template get_multi_ptr<sycl::access::decorated::yes>().get()
-                    });
+                    T* shared_mem = reinterpret_cast<T*>(
+                        shared_data.template get_multi_ptr<sycl::access::decorated::yes>().get());
+                    if constexpr (detail::is_big_kernel<L>()) {
+                        (*pf)(Gpu::LaunchHandler<MT, T, dim>{
+                            &item,
+                            shared_mem
+                        });
+                    } else {
+                        f(Gpu::LaunchHandler<MT, T, dim>{
+                            &item,
+                            shared_mem
+                        });
+                    }
                 });
         });
     } catch (sycl::exception const& ex) {
diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index a2a87a6d5cc..3c8224ee4d8 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -269,12 +269,12 @@ struct LaunchHandler
     /**
      * \brief Returns a pointer to block-local shared memory. If multiple shared memory
      * allocations are needed in a block, then the allocation must be manually split by adding
-     * offsets to it. For CUDA and HIP the memory is aligned to 32 bytes,
-     * for SYCL to the alignment of the chosen data type.
+     * offsets to it. The memory is aligned to 32 bytes.
      */
     [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     T* shared_memory () const {
 #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
+        // 32 bytes is sufficient for double4_32a
         static_assert(32 >= alignof(T));
         alignas(32) extern __shared__ char smem[];
         return reinterpret_cast<T*>(smem);
diff --git a/Src/Particle/AMReX_ParticleUtil.H b/Src/Particle/AMReX_ParticleUtil.H
index c2278f80042..b99dda6bfc8 100644
--- a/Src/Particle/AMReX_ParticleUtil.H
+++ b/Src/Particle/AMReX_ParticleUtil.H
@@ -821,8 +821,9 @@ void PermutationForDeposition (Gpu::DeviceVector<index_type>& perm, index_type n
                     current_idx = pllist_next[current_idx];
                 }
 
-                index_type num_particles_block =
-                    Gpu::blockReduceSum<gpu_block_size>(num_particles_thread);
+                index_type num_particles_block = Gpu::blockReduceSum
+                    AMREX_IF_NOT_SYCL(<gpu_block_size>)
+                    (num_particles_thread AMREX_IF_SYCL(, lh.handler()));
 
                 if (lh.threadIdx1D() == 0) {
                     global_idx_start = Gpu::Atomic::Add(pglobal_idx, num_particles_block);

From 1f33724d3cb518889aac32431e6aa4774f9be643 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Sat, 14 Mar 2026 20:25:22 +0100
Subject: [PATCH 40/46] typo

---
 Src/Base/AMReX_GpuLaunchFunctsC.H | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Src/Base/AMReX_GpuLaunchFunctsC.H b/Src/Base/AMReX_GpuLaunchFunctsC.H
index 9f88f8de884..a5a4ec4cdbc 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsC.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsC.H
@@ -136,7 +136,7 @@ void launch (T const& n, L&& f) noexcept
 template<int MT, int dim, typename L>
 void LaunchRaw (IntVectND<dim> nblocks, L const& f)
 {
-    static_assert(MT == 1, "LaunchRaw with CPU backend only with one thread per block! "
+    static_assert(MT == 1, "LaunchRaw with CPU backend only works with one thread per block! "
         "Otherwise the syncthreads function would not work");
     if constexpr(dim == 1) {
         for (int bx=0; bx < nblocks[0]; ++bx) {
@@ -165,7 +165,7 @@ void LaunchRaw (IntVectND<dim> nblocks, L const& f)
 template<int MT, typename T, int dim, typename L>
 void LaunchRaw (IntVectND<dim> nblocks, std::size_t shared_mem_elements, L const& f)
 {
-    static_assert(MT == 1, "LaunchRaw with CPU backend only with one thread per block! "
+    static_assert(MT == 1, "LaunchRaw with CPU backend only works with one thread per block! "
         "Otherwise the syncthreads function would not work");
     std::vector<T> smem(shared_mem_elements);
     if constexpr(dim == 1) {

From 155dd6f001bc6dbd6e79b58656844470ddab0c30 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Sat, 14 Mar 2026 21:35:18 +0100
Subject: [PATCH 41/46] Add include and use BoxIndexer for SYCL

---
 Src/Base/AMReX_GpuLaunchFunctsG.H | 40 ++++++++++++++++++++-----------
 Src/Base/AMReX_GpuTypes.H         | 25 ++++++++++---------
 Tests/LaunchRaw/main.cpp          |  1 +
 3 files changed, 39 insertions(+), 27 deletions(-)

diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
index 902f0768312..a75e071655c 100644
--- a/Src/Base/AMReX_GpuLaunchFunctsG.H
+++ b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -905,29 +905,35 @@ void LaunchRaw (IntVectND<dim> nblocks, L const& f)
 
     auto& q = Gpu::Device::streamQueue();
 
-    sycl::range<dim> threads_per_block;
-    sycl::range<dim> threads_total;
+    sycl::range<1> threads_per_block{MT};
+    sycl::range<1> threads_total{MT};
 
     for (int i=0; i<dim; ++i) {
-        threads_per_block[i] = (i == dim-1) ? MT : 1;
-        threads_total[i] = nblocks[dim-1-i] * threads_per_block[i];
+        threads_total[0] *= static_cast<std::size_t>(nblocks[i]);
     }
 
+    // With SYCL it is difficult to combine a 1D blockDim with an ND gridDim,
+    // so we use a 1D sycl range and split the blockID with BoxIndexerND. Note that
+    // BoxIndexerND is a bit inefficient since it is adding the smallEnd which is always zero here.
+    BoxIndexerND<dim> bxi(BoxND<dim>{IntVectND<dim>(0), nblocks - 1});
+
     try {
         q.submit([&] (sycl::handler& h) {
-            h.parallel_for(sycl::nd_range<dim>(threads_total, threads_per_block),
-                [=] (sycl::nd_item<dim> item)
+            h.parallel_for(sycl::nd_range<1>(threads_total, threads_per_block),
+                [=] (sycl::nd_item<1> item)
                 [[sycl::reqd_work_group_size(MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
                     if constexpr (detail::is_big_kernel<L>()) {
                         (*pf)(Gpu::LaunchHandler<MT, void, dim>{
                             &item,
+                            bxi.intVect(item.get_group(0)),
                             nullptr
                         });
                     } else {
                         f(Gpu::LaunchHandler<MT, void, dim>{
                             &item,
+                            bxi.intVect(item.get_group(0)),
                             nullptr
                         });
                     }
@@ -951,19 +957,23 @@ void LaunchRaw (IntVectND<dim> nblocks, std::size_t shared_mem_elements, L const
     static_assert(alignof(ST) >= alignof(T));
     const std::size_t shared_mem_num = (shared_mem_elements*sizeof(T)+sizeof(ST)-1) / sizeof(ST);
 
-    sycl::range<dim> threads_per_block;
-    sycl::range<dim> threads_total;
+    sycl::range<1> threads_per_block{MT};
+    sycl::range<1> threads_total{MT};
 
     for (int i=0; i<dim; ++i) {
-        threads_per_block[i] = (i == dim-1) ? MT : 1;
-        threads_total[i] = nblocks[dim-1-i] * threads_per_block[i];
+        threads_total[0] *= static_cast<std::size_t>(nblocks[i]);
     }
 
+    // With SYCL it is difficult to combine a 1D blockDim with an ND gridDim,
+    // so we use a 1D sycl range and split the blockID with BoxIndexerND. Note that
+    // BoxIndexerND is a bit inefficient since it is adding the smallEnd which is always zero here.
+    BoxIndexerND<dim> bxi(BoxND<dim>{IntVectND<dim>(0), nblocks - 1});
+
     try {
         q.submit([&] (sycl::handler& h) {
             sycl::local_accessor<ST> shared_data(sycl::range<1>(shared_mem_num), h);
-            h.parallel_for(sycl::nd_range<dim>(threads_total, threads_per_block),
-                [=] (sycl::nd_item<dim> item)
+            h.parallel_for(sycl::nd_range<1>(threads_total, threads_per_block),
+                [=] (sycl::nd_item<1> item)
                 [[sycl::reqd_work_group_size(MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
@@ -972,11 +982,13 @@ void LaunchRaw (IntVectND<dim> nblocks, std::size_t shared_mem_elements, L const
                     if constexpr (detail::is_big_kernel<L>()) {
                         (*pf)(Gpu::LaunchHandler<MT, T, dim>{
                             &item,
+                            bxi.intVect(item.get_group(0)),
                             shared_mem
                         });
                     } else {
                         f(Gpu::LaunchHandler<MT, T, dim>{
                             &item,
+                            bxi.intVect(item.get_group(0)),
                             shared_mem
                         });
                     }
@@ -1385,7 +1397,7 @@ ParallelFor (Gpu::KernelInfo const&,
  * should be one of 128, 256, 512 or 1024.
  *
  * The number of total blocks can be a 1D, 2D or 3D IntVectND. Internally this uses the native
- * way to split the index, such as blockIdx.x, blockIdx.y and blockIdx.z.
+ * way to split the index for CUDA and HIP, using blockIdx.x, blockIdx.y and blockIdx.z.
  * Note that this uses types int and unsigned int which might overflow if many blocks or
  * total threads are needed. In case this is an issue, it is necessary to explicitly add
  * a 64-bit grid-strided loop or to call LaunchRaw multiple times with fewer blocks.
@@ -1423,7 +1435,7 @@ void LaunchRaw (IntVectND<dim> nblocks, L const& f)
  * should be one of 128, 256, 512 or 1024.
  *
  * The number of total blocks can be a 1D, 2D or 3D IntVectND. Internally this uses the native
- * way to split the index, such as blockIdx.x, blockIdx.y and blockIdx.z.
+ * way to split the index for CUDA and HIP, using blockIdx.x, blockIdx.y and blockIdx.z.
  * Note that this uses types int and unsigned int which might overflow if many blocks or
  * total threads are needed. In case this is an issue, it is necessary to explicitly add
  * a 64-bit grid-strided loop or to call LaunchRaw multiple times with fewer blocks.
diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index 3c8224ee4d8..bf53792b8b8 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -5,6 +5,7 @@
 #include <AMReX_Extension.H>
 #include <AMReX_BaseFwd.H>
 #include <AMReX_GpuControl.H>
+#include <AMReX_IntVect.H>
 
 #ifdef AMREX_USE_GPU
 
@@ -112,8 +113,8 @@ struct LaunchHandler
 #if defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
     LaunchHandler() = default;
 #elif defined(AMREX_USE_SYCL)
-    LaunchHandler(sycl::nd_item<dim> const* a_item, T * a_shared_mem)
-        : m_item{a_item}, m_shared_mem{a_shared_mem} {}
+    LaunchHandler(sycl::nd_item<1> const* a_item, IntVectND<dim> a_blockid, T * a_shared_mem)
+        : m_item{a_item}, m_blockid{a_blockid}, m_shared_mem{a_shared_mem} {}
 #else
     LaunchHandler(IntVectND<dim> a_blockid, IntVectND<dim> a_griddim, T * a_shared_mem)
         : m_blockid{a_blockid}, m_griddim{a_griddim}, m_shared_mem{a_shared_mem} {}
@@ -170,13 +171,13 @@ struct LaunchHandler
         } else if constexpr (dim == 2) {
             return static_cast<unsigned int>(AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
                 blockIdx.x + gridDim.x * blockIdx.y,
-                m_item->get_group_linear_id(),
+                m_item->get_group(0),
                 m_blockid[0] + m_griddim[0] * m_blockid[1]
             ));
         } else {
             return static_cast<unsigned int>(AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
                 blockIdx.x + gridDim.x * blockIdx.y + gridDim.x * gridDim.y * blockIdx.z,
-                m_item->get_group_linear_id(),
+                m_item->get_group(0)),
                 m_blockid[0] + m_griddim[0] * m_blockid[1] +
                 m_griddim[0] * m_griddim[1] * m_blockid[2]
             ));
@@ -191,15 +192,14 @@ struct LaunchHandler
         if constexpr (dim == 1) {
             return AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
                 IntVectND<dim>(static_cast<int>(blockIdx.x)),
-                IntVectND<dim>(static_cast<int>(m_item->get_group(0))),
+                m_blockid,
                 m_blockid
             );
         } else if constexpr (dim == 2) {
             return AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
                 IntVectND<dim>(static_cast<int>(blockIdx.x),
                                static_cast<int>(blockIdx.y)),
-                IntVectND<dim>(static_cast<int>(m_item->get_group(1)),
-                               static_cast<int>(m_item->get_group(0))),
+                m_blockid,
                 m_blockid
             );
         } else {
@@ -207,9 +207,7 @@ struct LaunchHandler
                 IntVectND<dim>(static_cast<int>(blockIdx.x),
                                static_cast<int>(blockIdx.y),
                                static_cast<int>(blockIdx.z)),
-                IntVectND<dim>(static_cast<int>(m_item->get_group(2)),
-                               static_cast<int>(m_item->get_group(1)),
-                               static_cast<int>(m_item->get_group(0))),
+                m_blockid,
                 m_blockid
             );
         }
@@ -298,11 +296,11 @@ struct LaunchHandler
     }
 
     /**
-     * \brief Returns the internal sycl::nd_item.
+     * \brief Returns the internal sycl::nd_item<1>.
      */
     [[nodiscard]] AMREX_GPU_DEVICE AMREX_FORCE_INLINE
 #if defined(AMREX_USE_SYCL)
-    const sycl::nd_item<dim>* sycl_item () const {
+    const sycl::nd_item<1>* sycl_item () const {
         return m_item;
     }
 #else
@@ -314,7 +312,8 @@ struct LaunchHandler
 private:
 
 #if defined(AMREX_USE_SYCL)
-    sycl::nd_item<dim> const* m_item;
+    sycl::nd_item<1> const* m_item;
+    IntVectND<dim> m_blockid;
     T * m_shared_mem;
 #elif !defined(AMREX_USE_GPU)
     IntVectND<dim> m_blockid;
diff --git a/Tests/LaunchRaw/main.cpp b/Tests/LaunchRaw/main.cpp
index b34b590e530..10c4c948381 100644
--- a/Tests/LaunchRaw/main.cpp
+++ b/Tests/LaunchRaw/main.cpp
@@ -1,3 +1,4 @@
+#include <AMReX_GpuTypes.H>
 #include <AMReX.H>
 #include <AMReX_Gpu.H>
 #include <AMReX_GpuContainers.H>

From a3dfac39e936be787004bfdaf60f0baf14bade21 Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Sat, 14 Mar 2026 21:37:42 +0100
Subject: [PATCH 42/46] fix

---
 Src/Base/AMReX_GpuTypes.H | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index bf53792b8b8..a6ac2b09401 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -177,7 +177,7 @@ struct LaunchHandler
         } else {
             return static_cast<unsigned int>(AMREX_HIP_CUDA_OR_SYCL_OR_CPU(
                 blockIdx.x + gridDim.x * blockIdx.y + gridDim.x * gridDim.y * blockIdx.z,
-                m_item->get_group(0)),
+                m_item->get_group(0),
                 m_blockid[0] + m_griddim[0] * m_blockid[1] +
                 m_griddim[0] * m_griddim[1] * m_blockid[2]
             ));

From 3f2a3f88b8d03996a8c45d7dff1fe77d15fccbdf Mon Sep 17 00:00:00 2001
From: AlexanderSinn <alexander.sinn@desy.de>
Date: Mon, 16 Mar 2026 19:35:56 +0100
Subject: [PATCH 43/46] Add missing include to Arena.H

---
 Src/Base/AMReX_Arena.H | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Src/Base/AMReX_Arena.H b/Src/Base/AMReX_Arena.H
index dfcd6e20289..97ab30f8232 100644
--- a/Src/Base/AMReX_Arena.H
+++ b/Src/Base/AMReX_Arena.H
@@ -6,6 +6,7 @@
 #include <AMReX_INT.H>
 #ifdef AMREX_USE_GPU
 #include <AMReX_GpuControl.H>
+#include <AMReX_GpuTypes.H>
 #endif
 
 #ifdef AMREX_TINY_PROFILING

From cbad36a91f1a5c5ded55a2bffdf8bf12ae09b9e2 Mon Sep 17 00:00:00 2001
From: AlexanderSinn <alexander.sinn@desy.de>
Date: Mon, 16 Mar 2026 20:09:57 +0100
Subject: [PATCH 44/46] move definition of gpuStream_t

---
 Src/Base/AMReX_Arena.H      |  1 -
 Src/Base/AMReX_GpuControl.H | 10 +---------
 Src/Base/AMReX_GpuTypes.H   |  9 +++++++++
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/Src/Base/AMReX_Arena.H b/Src/Base/AMReX_Arena.H
index 97ab30f8232..1cfcfc9bc70 100644
--- a/Src/Base/AMReX_Arena.H
+++ b/Src/Base/AMReX_Arena.H
@@ -5,7 +5,6 @@
 #include <AMReX_BLassert.H>
 #include <AMReX_INT.H>
 #ifdef AMREX_USE_GPU
-#include <AMReX_GpuControl.H>
 #include <AMReX_GpuTypes.H>
 #endif
 
diff --git a/Src/Base/AMReX_GpuControl.H b/Src/Base/AMReX_GpuControl.H
index e5213f5440b..cad51dec070 100644
--- a/Src/Base/AMReX_GpuControl.H
+++ b/Src/Base/AMReX_GpuControl.H
@@ -90,15 +90,7 @@ namespace amrex {
 #define AMREX_DEFAULT_RUNON =amrex::RunOn::Host // by default run on Host when compiling for Cpu
 #endif
 
-namespace amrex { // NOLINT(modernize-concat-nested-namespaces)
-
-#ifdef AMREX_USE_HIP
-using gpuStream_t = hipStream_t;
-#elif defined(AMREX_USE_CUDA)
-using gpuStream_t = cudaStream_t;
-#endif
-
-namespace Gpu {
+namespace amrex::Gpu {
 
 #if defined(AMREX_USE_GPU)
 
diff --git a/Src/Base/AMReX_GpuTypes.H b/Src/Base/AMReX_GpuTypes.H
index a6ac2b09401..3c96b046851 100644
--- a/Src/Base/AMReX_GpuTypes.H
+++ b/Src/Base/AMReX_GpuTypes.H
@@ -5,6 +5,7 @@
 #include <AMReX_Extension.H>
 #include <AMReX_BaseFwd.H>
 #include <AMReX_GpuControl.H>
+#include <AMReX_GpuQualifiers.H>
 #include <AMReX_IntVect.H>
 
 #ifdef AMREX_USE_GPU
@@ -35,6 +36,14 @@ struct gpuStream_t {
     bool operator!= (gpuStream_t const& rhs) const noexcept { return queue != rhs.queue; }
 };
 
+#elif defined(AMREX_USE_HIP)
+
+using gpuStream_t = hipStream_t;
+
+#elif defined(AMREX_USE_CUDA)
+
+using gpuStream_t = cudaStream_t;
+
 #endif
 
 }

From 93dd7b24ce7bc40a47acec9a7e039d4187fc1ca7 Mon Sep 17 00:00:00 2001
From: AlexanderSinn <alexander.sinn@desy.de>
Date: Mon, 16 Mar 2026 20:14:19 +0100
Subject: [PATCH 45/46] fix

---
 Src/Base/AMReX_GpuControl.H | 1 -
 1 file changed, 1 deletion(-)

diff --git a/Src/Base/AMReX_GpuControl.H b/Src/Base/AMReX_GpuControl.H
index cad51dec070..9310f9ffae1 100644
--- a/Src/Base/AMReX_GpuControl.H
+++ b/Src/Base/AMReX_GpuControl.H
@@ -232,7 +232,6 @@ namespace amrex::Gpu {
 
 #endif
 
-}
 }
 
 #endif

From b25c2a166cd86433a245ed1e38b4a00c2286d7bb Mon Sep 17 00:00:00 2001
From: Alexander Sinn <alexander.sinn@desy.de>
Date: Tue, 17 Mar 2026 08:52:48 +0100
Subject: [PATCH 46/46] Update AMREX_HOME

---
 Tests/LaunchRaw/GNUmakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Tests/LaunchRaw/GNUmakefile b/Tests/LaunchRaw/GNUmakefile
index 173eef4c670..05ed32d3446 100644
--- a/Tests/LaunchRaw/GNUmakefile
+++ b/Tests/LaunchRaw/GNUmakefile
@@ -1,4 +1,4 @@
-AMREX_HOME = ../../../
+AMREX_HOME = ../../
 
 DEBUG	= FALSE