diff --git a/CUDADataFormats/Common/BuildFile.xml b/CUDADataFormats/Common/BuildFile.xml
deleted file mode 100644
index e4971bdf3ebbe..0000000000000
--- a/CUDADataFormats/Common/BuildFile.xml
+++ /dev/null
@@ -1,9 +0,0 @@
-
-
-
-
-
-
-
-
-
diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
deleted file mode 100644
index 8cfa5c9f5ffde..0000000000000
--- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h
+++ /dev/null
@@ -1,194 +0,0 @@
-#ifndef CUDADataFormatsCommonHeterogeneousSoA_H
-#define CUDADataFormatsCommonHeterogeneousSoA_H
-
-#include
-
-#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-
-// a heterogeneous unique pointer...
-template
-class HeterogeneousSoA {
-public:
- using Product = T;
-
- HeterogeneousSoA() = default; // make root happy
- ~HeterogeneousSoA() = default;
- HeterogeneousSoA(HeterogeneousSoA &&) = default;
- HeterogeneousSoA &operator=(HeterogeneousSoA &&) = default;
-
- explicit HeterogeneousSoA(cms::cuda::device::unique_ptr &&p) : dm_ptr(std::move(p)) {}
- explicit HeterogeneousSoA(cms::cuda::host::unique_ptr &&p) : hm_ptr(std::move(p)) {}
- explicit HeterogeneousSoA(std::unique_ptr &&p) : std_ptr(std::move(p)) {}
-
- auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
-
- auto const &operator*() const { return *get(); }
-
- auto const *operator->() const { return get(); }
-
- auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
-
- auto &operator*() { return *get(); }
-
- auto *operator->() { return get(); }
-
- // in reality valid only for GPU version...
- cms::cuda::host::unique_ptr toHostAsync(cudaStream_t stream) const {
- assert(dm_ptr);
- auto ret = cms::cuda::make_host_unique(stream);
- cudaCheck(cudaMemcpyAsync(ret.get(), dm_ptr.get(), sizeof(T), cudaMemcpyDefault, stream));
- return ret;
- }
-
-private:
- // a union wan't do it, a variant will not be more efficienct
- cms::cuda::device::unique_ptr dm_ptr; //!
- cms::cuda::host::unique_ptr hm_ptr; //!
- std::unique_ptr std_ptr; //!
-};
-
-namespace cms {
- namespace cudacompat {
-
- struct GPUTraits {
- template
- using unique_ptr = cms::cuda::device::unique_ptr;
-
- template
- static auto make_unique(cudaStream_t stream) {
- return cms::cuda::make_device_unique(stream);
- }
-
- template
- static auto make_unique(size_t size, cudaStream_t stream) {
- return cms::cuda::make_device_unique(size, stream);
- }
-
- template
- static auto make_host_unique(cudaStream_t stream) {
- return cms::cuda::make_host_unique(stream);
- }
-
- template
- static auto make_device_unique(cudaStream_t stream) {
- return cms::cuda::make_device_unique(stream);
- }
-
- template
- static auto make_device_unique(size_t size, cudaStream_t stream) {
- return cms::cuda::make_device_unique(size, stream);
- }
- };
-
- struct HostTraits {
- template
- using unique_ptr = cms::cuda::host::unique_ptr;
-
- template
- static auto make_unique(cudaStream_t stream) {
- return cms::cuda::make_host_unique(stream);
- }
-
- template
- static auto make_unique(size_t size, cudaStream_t stream) {
- return cms::cuda::make_host_unique(size, stream);
- }
-
- template
- static auto make_host_unique(cudaStream_t stream) {
- return cms::cuda::make_host_unique(stream);
- }
-
- template
- static auto make_device_unique(cudaStream_t stream) {
- return cms::cuda::make_device_unique(stream);
- }
-
- template
- static auto make_device_unique(size_t size, cudaStream_t stream) {
- return cms::cuda::make_device_unique(size, stream);
- }
- };
-
- struct CPUTraits {
- template
- using unique_ptr = std::unique_ptr;
-
- template
- static auto make_unique(cudaStream_t) {
- return std::make_unique();
- }
-
- template
- static auto make_unique(size_t size, cudaStream_t) {
- return std::make_unique(size);
- }
-
- template
- static auto make_host_unique(cudaStream_t) {
- return std::make_unique();
- }
-
- template
- static auto make_device_unique(cudaStream_t) {
- return std::make_unique();
- }
-
- template
- static auto make_device_unique(size_t size, cudaStream_t) {
- return std::make_unique(size);
- }
- };
-
- } // namespace cudacompat
-} // namespace cms
-
-// a heterogeneous unique pointer (of a different sort) ...
-template
-class HeterogeneousSoAImpl {
-public:
- template
- using unique_ptr = typename Traits::template unique_ptr;
-
- HeterogeneousSoAImpl() = default; // make root happy
- ~HeterogeneousSoAImpl() = default;
- HeterogeneousSoAImpl(HeterogeneousSoAImpl &&) = default;
- HeterogeneousSoAImpl &operator=(HeterogeneousSoAImpl &&) = default;
-
- explicit HeterogeneousSoAImpl(unique_ptr &&p) : m_ptr(std::move(p)) {}
- explicit HeterogeneousSoAImpl(cudaStream_t stream);
-
- T const *get() const { return m_ptr.get(); }
-
- T *get() { return m_ptr.get(); }
-
- cms::cuda::host::unique_ptr toHostAsync(cudaStream_t stream) const;
-
-private:
- unique_ptr m_ptr; //!
-};
-
-template
-HeterogeneousSoAImpl::HeterogeneousSoAImpl(cudaStream_t stream) {
- m_ptr = Traits::template make_unique(stream);
-}
-
-// in reality valid only for GPU version...
-template
-cms::cuda::host::unique_ptr HeterogeneousSoAImpl::toHostAsync(cudaStream_t stream) const {
- auto ret = cms::cuda::make_host_unique(stream);
- cudaCheck(cudaMemcpyAsync(ret.get(), get(), sizeof(T), cudaMemcpyDefault, stream));
- return ret;
-}
-
-template
-using HeterogeneousSoAGPU = HeterogeneousSoAImpl;
-template
-using HeterogeneousSoACPU = HeterogeneousSoAImpl;
-template
-using HeterogeneousSoAHost = HeterogeneousSoAImpl;
-
-#endif
diff --git a/CUDADataFormats/Common/interface/HostProduct.h b/CUDADataFormats/Common/interface/HostProduct.h
deleted file mode 100644
index 63a152298e42b..0000000000000
--- a/CUDADataFormats/Common/interface/HostProduct.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef CUDADataFormatsCommonHostProduct_H
-#define CUDADataFormatsCommonHostProduct_H
-
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-
-// a heterogeneous unique pointer...
-template
-class HostProduct {
-public:
- HostProduct() = default; // make root happy
- ~HostProduct() = default;
- HostProduct(HostProduct&&) = default;
- HostProduct& operator=(HostProduct&&) = default;
-
- explicit HostProduct(cms::cuda::host::unique_ptr&& p) : hm_ptr(std::move(p)) {}
- explicit HostProduct(std::unique_ptr&& p) : std_ptr(std::move(p)) {}
-
- auto const* get() const { return hm_ptr ? hm_ptr.get() : std_ptr.get(); }
-
- auto const& operator*() const { return *get(); }
-
- auto const* operator->() const { return get(); }
-
-private:
- cms::cuda::host::unique_ptr hm_ptr; //!
- std::unique_ptr std_ptr; //!
-};
-
-#endif
diff --git a/CUDADataFormats/Common/interface/PortableDeviceCollection.h b/CUDADataFormats/Common/interface/PortableDeviceCollection.h
deleted file mode 100644
index 78f72cb3d5437..0000000000000
--- a/CUDADataFormats/Common/interface/PortableDeviceCollection.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef CUDADataFormats_Common_interface_PortableDeviceCollection_h
-#define CUDADataFormats_Common_interface_PortableDeviceCollection_h
-
-#include
-#include
-
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-
-namespace cms::cuda {
-
- // generic SoA-based product in device memory
- template
- class PortableDeviceCollection {
- public:
- using Layout = T;
- using View = typename Layout::View;
- using ConstView = typename Layout::ConstView;
- using Buffer = cms::cuda::device::unique_ptr;
-
- PortableDeviceCollection() = default;
-
- PortableDeviceCollection(int32_t elements, cudaStream_t stream)
- : buffer_{cms::cuda::make_device_unique(Layout::computeDataSize(elements), stream)},
- layout_{buffer_.get(), elements},
- view_{layout_} {
- // CUDA device memory uses a default alignment of at least 128 bytes
- assert(reinterpret_cast(buffer_.get()) % Layout::alignment == 0);
- }
-
- // non-copyable
- PortableDeviceCollection(PortableDeviceCollection const&) = delete;
- PortableDeviceCollection& operator=(PortableDeviceCollection const&) = delete;
-
- // movable
- PortableDeviceCollection(PortableDeviceCollection&&) = default;
- PortableDeviceCollection& operator=(PortableDeviceCollection&&) = default;
-
- // default destructor
- ~PortableDeviceCollection() = default;
-
- // access the View
- View& view() { return view_; }
- ConstView const& view() const { return view_; }
- ConstView const& const_view() const { return view_; }
-
- View& operator*() { return view_; }
- ConstView const& operator*() const { return view_; }
-
- View* operator->() { return &view_; }
- ConstView const* operator->() const { return &view_; }
-
- // access the Buffer
- Buffer& buffer() { return buffer_; }
- Buffer const& buffer() const { return buffer_; }
- Buffer const& const_buffer() const { return buffer_; }
-
- size_t bufferSize() const { return layout_.metadata().byteSize(); }
-
- private:
- Buffer buffer_; //!
- Layout layout_; //
- View view_; //!
- };
-
-} // namespace cms::cuda
-
-#endif // CUDADataFormats_Common_interface_PortableDeviceCollection_h
diff --git a/CUDADataFormats/Common/interface/PortableHostCollection.h b/CUDADataFormats/Common/interface/PortableHostCollection.h
deleted file mode 100644
index cfaf40c85b3bc..0000000000000
--- a/CUDADataFormats/Common/interface/PortableHostCollection.h
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef CUDADataFormats_Common_interface_PortableHostCollection_h
-#define CUDADataFormats_Common_interface_PortableHostCollection_h
-
-#include
-#include
-
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-
-namespace cms::cuda {
-
- // generic SoA-based product in host memory
- template
- class PortableHostCollection {
- public:
- using Layout = T;
- using View = typename Layout::View;
- using ConstView = typename Layout::ConstView;
- using Buffer = cms::cuda::host::unique_ptr;
-
- PortableHostCollection() = default;
-
- PortableHostCollection(int32_t elements)
- // allocate pageable host memory
- : buffer_{cms::cuda::make_host_unique(Layout::computeDataSize(elements))},
- layout_{buffer_.get(), elements},
- view_{layout_} {
- // make_host_unique for pageable host memory uses a default alignment of 128 bytes
- assert(reinterpret_cast(buffer_.get()) % Layout::alignment == 0);
- }
-
- PortableHostCollection(int32_t elements, cudaStream_t stream)
- // allocate pinned host memory, accessible by the current device
- : buffer_{cms::cuda::make_host_unique(Layout::computeDataSize(elements), stream)},
- layout_{buffer_.get(), elements},
- view_{layout_} {
- // CUDA pinned host memory uses a default alignment of at least 128 bytes
- assert(reinterpret_cast(buffer_.get()) % Layout::alignment == 0);
- }
-
- // non-copyable
- PortableHostCollection(PortableHostCollection const&) = delete;
- PortableHostCollection& operator=(PortableHostCollection const&) = delete;
-
- // movable
- PortableHostCollection(PortableHostCollection&&) = default;
- PortableHostCollection& operator=(PortableHostCollection&&) = default;
-
- // default destructor
- ~PortableHostCollection() = default;
-
- // access the View
- View& view() { return view_; }
- ConstView const& view() const { return view_; }
- ConstView const& const_view() const { return view_; }
-
- View& operator*() { return view_; }
- ConstView const& operator*() const { return view_; }
-
- View* operator->() { return &view_; }
- ConstView const* operator->() const { return &view_; }
-
- // access the Buffer
- Buffer& buffer() { return buffer_; }
- Buffer const& buffer() const { return buffer_; }
- Buffer const& const_buffer() const { return buffer_; }
-
- size_t bufferSize() const { return layout_.metadata().byteSize(); }
-
- // part of the ROOT read streamer
- static void ROOTReadStreamer(PortableHostCollection* newObj, Layout const& layout) {
- newObj->~PortableHostCollection();
- // allocate pinned host memory using the legacy stream, that synchronises with all (blocking) streams
- new (newObj) PortableHostCollection(layout.metadata().size());
- newObj->layout_.ROOTReadStreamer(layout);
- }
-
- private:
- Buffer buffer_; //!
- Layout layout_; //
- View view_; //!
- };
-
-} // namespace cms::cuda
-
-#endif // CUDADataFormats_Common_interface_PortableHostCollection_h
diff --git a/CUDADataFormats/Common/interface/Product.h b/CUDADataFormats/Common/interface/Product.h
deleted file mode 100644
index 41bb8356e67cf..0000000000000
--- a/CUDADataFormats/Common/interface/Product.h
+++ /dev/null
@@ -1,60 +0,0 @@
-#ifndef CUDADataFormats_Common_Product_h
-#define CUDADataFormats_Common_Product_h
-
-#include
-
-#include "CUDADataFormats/Common/interface/ProductBase.h"
-
-namespace edm {
- template
- class Wrapper;
-}
-
-namespace cms {
- namespace cuda {
- namespace impl {
- class ScopedContextGetterBase;
- }
-
- /**
- * The purpose of this class is to wrap CUDA data to edm::Event in a
- * way which forces correct use of various utilities.
- *
- * The non-default construction has to be done with cms::cuda::ScopedContext
- * (in order to properly register the CUDA event).
- *
- * The default constructor is needed only for the ROOT dictionary generation.
- *
- * The CUDA event is in practice needed only for stream-stream
- * synchronization, but someone with long-enough lifetime has to own
- * it. Here is a somewhat natural place. If overhead is too much, we
- * can use them only where synchronization between streams is needed.
- */
- template
- class Product : public ProductBase {
- public:
- Product() = default; // Needed only for ROOT dictionary generation
-
- Product(const Product&) = delete;
- Product& operator=(const Product&) = delete;
- Product(Product&&) = default;
- Product& operator=(Product&&) = default;
-
- private:
- friend class impl::ScopedContextGetterBase;
- friend class ScopedContextProduce;
- friend class edm::Wrapper>;
-
- explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, T data)
- : ProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {}
-
- template
- explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, Args&&... args)
- : ProductBase(device, std::move(stream), std::move(event)), data_(std::forward(args)...) {}
-
- T data_; //!
- };
- } // namespace cuda
-} // namespace cms
-
-#endif
diff --git a/CUDADataFormats/Common/interface/ProductBase.h b/CUDADataFormats/Common/interface/ProductBase.h
deleted file mode 100644
index efe2242903bd0..0000000000000
--- a/CUDADataFormats/Common/interface/ProductBase.h
+++ /dev/null
@@ -1,93 +0,0 @@
-#ifndef CUDADataFormats_Common_ProductBase_h
-#define CUDADataFormats_Common_ProductBase_h
-
-#include
-#include
-
-#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h"
-
-namespace cms {
- namespace cuda {
- namespace impl {
- class ScopedContextBase;
- }
-
- /**
- * Base class for all instantiations of CUDA to hold the
- * non-T-dependent members.
- */
- class ProductBase {
- public:
- ProductBase() = default; // Needed only for ROOT dictionary generation
- ~ProductBase();
-
- ProductBase(const ProductBase&) = delete;
- ProductBase& operator=(const ProductBase&) = delete;
- ProductBase(ProductBase&& other)
- : stream_{std::move(other.stream_)},
- event_{std::move(other.event_)},
- mayReuseStream_{other.mayReuseStream_.load()},
- device_{other.device_} {}
- ProductBase& operator=(ProductBase&& other) {
- stream_ = std::move(other.stream_);
- event_ = std::move(other.event_);
- mayReuseStream_ = other.mayReuseStream_.load();
- device_ = other.device_;
- return *this;
- }
-
- bool isValid() const { return stream_.get() != nullptr; }
- bool isAvailable() const;
-
- int device() const { return device_; }
-
- // cudaStream_t is a pointer to a thread-safe object, for which a
- // mutable access is needed even if the cms::cuda::ScopedContext itself
- // would be const. Therefore it is ok to return a non-const
- // pointer from a const method here.
- cudaStream_t stream() const { return stream_.get(); }
-
- // cudaEvent_t is a pointer to a thread-safe object, for which a
- // mutable access is needed even if the cms::cuda::ScopedContext itself
- // would be const. Therefore it is ok to return a non-const
- // pointer from a const method here.
- cudaEvent_t event() const { return event_.get(); }
-
- protected:
- explicit ProductBase(int device, SharedStreamPtr stream, SharedEventPtr event)
- : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {}
-
- private:
- friend class impl::ScopedContextBase;
- friend class ScopedContextProduce;
-
- // The following function is intended to be used only from ScopedContext
- const SharedStreamPtr& streamPtr() const { return stream_; }
-
- bool mayReuseStream() const {
- bool expected = true;
- bool changed = mayReuseStream_.compare_exchange_strong(expected, false);
- // If the current thread is the one flipping the flag, it may
- // reuse the stream.
- return changed;
- }
-
- // The cudaStream_t is really shared among edm::Event products, so
- // using shared_ptr also here
- SharedStreamPtr stream_; //!
- // shared_ptr because of caching in cms::cuda::EventCache
- SharedEventPtr event_; //!
-
- // This flag tells whether the CUDA stream may be reused by a
- // consumer or not. The goal is to have a "chain" of modules to
- // queue their work to the same stream.
- mutable std::atomic mayReuseStream_ = true; //!
-
- // The CUDA device associated with this product
- int device_ = -1; //!
- };
- } // namespace cuda
-} // namespace cms
-
-#endif
diff --git a/CUDADataFormats/Common/src/ProductBase.cc b/CUDADataFormats/Common/src/ProductBase.cc
deleted file mode 100644
index 8e1cf64b17122..0000000000000
--- a/CUDADataFormats/Common/src/ProductBase.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-#include "CUDADataFormats/Common/interface/ProductBase.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"
-
-namespace cms::cuda {
- bool ProductBase::isAvailable() const {
- // if default-constructed, the product is not available
- if (not event_) {
- return false;
- }
- return eventWorkHasCompleted(event_.get());
- }
-
- ProductBase::~ProductBase() {
- // Make sure that the production of the product in the GPU is
- // complete before destructing the product. This is to make sure
- // that the EDM stream does not move to the next event before all
- // asynchronous processing of the current is complete.
-
- // TODO: a callback notifying a WaitingTaskHolder (or similar)
- // would avoid blocking the CPU, but would also require more work.
- //
- // Intentionally not checking the return value to avoid throwing
- // exceptions. If this call would fail, we should get failures
- // elsewhere as well.
- if (event_) {
- cudaEventSynchronize(event_.get());
- }
- }
-} // namespace cms::cuda
diff --git a/CUDADataFormats/Common/src/classes.h b/CUDADataFormats/Common/src/classes.h
deleted file mode 100644
index 239e071d513a2..0000000000000
--- a/CUDADataFormats/Common/src/classes.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef CUDADataFormats_Common_src_classes_h
-#define CUDADataFormats_Common_src_classes_h
-
-#include "CUDADataFormats/Common/interface/HostProduct.h"
-#include "DataFormats/Common/interface/Wrapper.h"
-
-#endif // CUDADataFormats_Common_src_classes_h
diff --git a/CUDADataFormats/Common/src/classes_def.xml b/CUDADataFormats/Common/src/classes_def.xml
deleted file mode 100644
index d8514251c807a..0000000000000
--- a/CUDADataFormats/Common/src/classes_def.xml
+++ /dev/null
@@ -1,4 +0,0 @@
-
-
-
-
diff --git a/CUDADataFormats/Common/test/BuildFile.xml b/CUDADataFormats/Common/test/BuildFile.xml
deleted file mode 100644
index a0cbbdd8a7858..0000000000000
--- a/CUDADataFormats/Common/test/BuildFile.xml
+++ /dev/null
@@ -1,8 +0,0 @@
-
-
-
-
-
-
-
-
diff --git a/CUDADataFormats/Common/test/test_Product.cc b/CUDADataFormats/Common/test/test_Product.cc
deleted file mode 100644
index 5790d07bec56d..0000000000000
--- a/CUDADataFormats/Common/test/test_Product.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-#include "catch2/catch_all.hpp"
-
-#include "CUDADataFormats/Common/interface/Product.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
-
-#include
-
-namespace cms::cudatest {
- class TestScopedContext {
- public:
- static cuda::ScopedContextProduce make(int dev, bool createEvent) {
- cms::cuda::SharedEventPtr event;
- if (createEvent) {
- event = cms::cuda::getEventCache().get();
- }
- return cuda::ScopedContextProduce(dev, cms::cuda::getStreamCache().get(), std::move(event));
- }
- };
-} // namespace cms::cudatest
-
-TEST_CASE("Use of cms::cuda::Product template", "[CUDACore]") {
- SECTION("Default constructed") {
- auto foo = cms::cuda::Product();
- REQUIRE(!foo.isValid());
-
- auto bar = std::move(foo);
- }
-
- if (not cms::cudatest::testDevices()) {
- return;
- }
-
- constexpr int defaultDevice = 0;
- cudaCheck(cudaSetDevice(defaultDevice));
- {
- auto ctx = cms::cudatest::TestScopedContext::make(defaultDevice, true);
- std::unique_ptr> dataPtr = ctx.wrap(10);
- auto& data = *dataPtr;
-
- SECTION("Construct from cms::cuda::ScopedContext") {
- REQUIRE(data.isValid());
- REQUIRE(data.device() == defaultDevice);
- REQUIRE(data.stream() == ctx.stream());
- REQUIRE(data.event() != nullptr);
- }
-
- SECTION("Move constructor") {
- auto data2 = cms::cuda::Product(std::move(data));
- REQUIRE(data2.isValid());
- REQUIRE(!data.isValid());
- }
-
- SECTION("Move assignment") {
- cms::cuda::Product data2;
- data2 = std::move(data);
- REQUIRE(data2.isValid());
- REQUIRE(!data.isValid());
- }
- }
-
- cudaCheck(cudaSetDevice(defaultDevice));
- cudaCheck(cudaDeviceSynchronize());
- // Note: CUDA resources are cleaned up by the destructors of the global cache objects
-}
diff --git a/CUDADataFormats/Common/test/test_main.cc b/CUDADataFormats/Common/test/test_main.cc
deleted file mode 100644
index b3ea47c29c7a7..0000000000000
--- a/CUDADataFormats/Common/test/test_main.cc
+++ /dev/null
@@ -1,2 +0,0 @@
-#define CATCH_CONFIG_MAIN
-#include "catch2/catch_all.hpp"
diff --git a/CUDADataFormats/PortableTestObjects/BuildFile.xml b/CUDADataFormats/PortableTestObjects/BuildFile.xml
deleted file mode 100644
index 595a743a6c4c5..0000000000000
--- a/CUDADataFormats/PortableTestObjects/BuildFile.xml
+++ /dev/null
@@ -1,7 +0,0 @@
-
-
-
-
-
-
-
diff --git a/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h b/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h
deleted file mode 100644
index 621f0939116d7..0000000000000
--- a/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h
-#define CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h
-
-#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
-#include "DataFormats/PortableTestObjects/interface/TestSoA.h"
-
-namespace cudatest {
-
- // Eigen matrix
- using Matrix = portabletest::Matrix;
- using Array = portabletest::Array;
-
- // SoA with x, y, z, id fields, r scalar, m matrix, in device global memory
- using TestDeviceCollection = cms::cuda::PortableDeviceCollection;
-
-} // namespace cudatest
-
-#endif // CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h
diff --git a/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h b/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h
deleted file mode 100644
index 9426b6c6a8275..0000000000000
--- a/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h
-#define CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h
-
-#include "CUDADataFormats/Common/interface/PortableHostCollection.h"
-#include "DataFormats/PortableTestObjects/interface/TestSoA.h"
-
-namespace cudatest {
-
- // Eigen matrix
- using Matrix = portabletest::Matrix;
- using Array = portabletest::Array;
-
- // SoA with x, y, z, id fields, r scalar, m matrix, in host memory
- using TestHostCollection = cms::cuda::PortableHostCollection;
-
-} // namespace cudatest
-
-#endif // CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h
diff --git a/HeterogeneousCore/CUDACore/BuildFile.xml b/HeterogeneousCore/CUDACore/BuildFile.xml
deleted file mode 100644
index 42f7db8fc72d6..0000000000000
--- a/HeterogeneousCore/CUDACore/BuildFile.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md
deleted file mode 100644
index 92a29e1460f19..0000000000000
--- a/HeterogeneousCore/CUDACore/README.md
+++ /dev/null
@@ -1,812 +0,0 @@
-# CUDA algorithms in CMSSW
-
-## Outline
-
-* [Introduction](#introduction)
- * [Design goals](#design-goals)
- * [Overall guidelines](#overall-guidelines)
-* [Sub-packages](#sub-packages)
-* [Examples](#examples)
- * [Isolated producer (no CUDA input nor output)](#isolated-producer-no-cuda-input-nor-output)
- * [Producer with CUDA output](#producer-with-cuda-output)
- * [Producer with CUDA input](#producer-with-cuda-input)
- * [Producer with CUDA input and output (with ExternalWork)](#producer-with-cuda-input-and-output-with-externalwork)
- * [Producer with CUDA input and output, and internal chain of CPU and GPU tasks (with ExternalWork)](producer-with-cuda-input-and-output-and-internal-chain-of-cpu-and-gpu-tasks-with-externalwork)
- * [Producer with CUDA input and output (without ExternalWork)](#producer-with-cuda-input-and-output-without-externalwork)
- * [Analyzer with CUDA input](#analyzer-with-cuda-input)
- * [Configuration](#configuration)
- * [GPU-only configuration](#gpu-only-configuration)
-* [More details](#more-details)
- * [Device choice](#device-choice)
- * [Data model](#data-model)
- * [CUDA EDProducer](#cuda-edproducer)
- * [Class declaration](#class-declaration)
- * [Memory allocation](#memory-allocation)
- * [Caching allocator](#caching-allocator)
- * [Non-cached pinned host `unique_ptr`](#non-cached-pinned-host-unique_ptr)
- * [CUDA API](#cuda-api)
- * [Setting the current device](#setting-the-current-device)
- * [Getting input](#getting-input)
- * [Calling the CUDA kernels](#calling-the-cuda-kernels)
- * [Putting output](#putting-output)
- * [`ExternalWork` extension](#externalwork-extension)
- * [Module-internal chain of CPU and GPU tasks](#module-internal-chain-of-cpu-and-gpu-tasks)
- * [Transferring GPU data to CPU](#transferring-gpu-data-to-cpu)
- * [Synchronizing between CUDA streams](#synchronizing-between-cuda-streams)
- * [CUDA ESProduct](#cuda-esproduct)
-
-## Introduction
-
-This page documents the CUDA integration within CMSSW
-
-### Design goals
-
-1. Provide a mechanism for a chain of modules to share a resource
- * Resource can be e.g. CUDA device memory or a CUDA stream
-2. Minimize data movements between the CPU and the device
-3. Support multiple devices
-4. Allow the same job configuration to be used on all hardware combinations
-
-### Overall guidelines
-
-1. Within the `acquire()`/`produce()` functions all CUDA operations should be asynchronous, i.e.
- * Use `cudaMemcpyAsync()`, `cudaMemsetAsync()`, `cudaMemPrefetchAsync()` etc.
- * Avoid `cudaMalloc*()`, `cudaHostAlloc()`, `cudaFree*()`, `cudaHostRegister()`, `cudaHostUnregister()` on every event
- * Occasional calls are permitted through a caching mechanism that amortizes the cost (see also [Caching allocator](#caching-allocator))
- * Avoid `assert()` in device functions, or use `#include HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h`
- * With the latter the `assert()` calls in CUDA code are disabled by
- default, but can be enabled by defining a `GPU_DEBUG` macro
- (before the aforementioned include)
-2. Synchronization needs should be fulfilled with
- [`ExternalWork`](https://twiki.cern.ch/twiki/bin/view/CMSPublic/FWMultithreadedFrameworkStreamModuleInterface#edm_ExternalWork)
- extension to EDProducers
- * `ExternalWork` can be used to replace one synchronization point
- (e.g. between device kernels and copying a known amount of data
- back to CPU).
- * For further synchronization points (e.g. copying data whose
- amount is known only at the device side), split the work to
- multiple `ExternalWork` producers. This approach has the added
- benefit that e.g. data transfers to CPU become on-demand automatically
- * A general breakdown of the possible steps:
- * Convert input legacy CPU data format to CPU SoA
- * Transfer input CPU SoA to GPU
- * Launch kernels
- * Transfer the number of output elements to CPU
- * Transfer the output data from GPU to CPU SoA
- * Convert the output SoA to legacy CPU data formats
-3. Within `acquire()`/`produce()`, the current CUDA device is set
- implicitly and the CUDA stream is provided by the system (with
- `cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`)
- * It is strongly recommended to use the provided CUDA stream for all operations
- * If that is not feasible for some reason, the provided CUDA
- stream must synchronize with the work queued on other CUDA
- streams (with CUDA events and `cudaStreamWaitEvent()`)
-4. Outside of `acquire()`/`produce()`, CUDA API functions may be
- called only if the `CUDAService` implementation of the `CUDAInterface`
- is available and `CUDAService::enabled()` returns `true`:
- ```c++
- edm::Service cuda;
- if (cuda and cuda->enabled()) {
- // CUDA calls ca be made here
- }
- ```
- * With point 3 it follows that in these cases multiple devices have
- to be dealt with explicitly, as well as CUDA streams
-
-## Sub-packages
-* [`HeterogeneousCore/CUDACore`](#cuda-integration) CUDA-specific core components
-* [`HeterogeneousCore/CUDAServices`](../CUDAServices) Various edm::Services related to CUDA
-* [`HeterogeneousCore/CUDAUtilities`](../CUDAUtilities) Various utilities for CUDA kernel code
-* [`HeterogeneousCore/CUDATest`](../CUDATest) Test modules and configurations
-* [`CUDADataFormats/Common`](../../CUDADataFormats/Common) Utilities for event products with CUDA data
-
-## Examples
-
-### Isolated producer (no CUDA input nor output)
-
-```cpp
-class IsolatedProducerCUDA: public edm::stream::EDProducer {
-public:
- ...
- void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
- void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
- ...
-private:
- ...
- IsolatedProducerGPUAlgo gpuAlgo_;
- edm::EDGetTokenT inputToken_;
- edm::EDPutTokenT outputToken_;
-};
-...
-void IsolatedProducerCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
- // Sets the current device and creates a CUDA stream
- cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
-
- auto const& inputData = iEvent.get(inputToken_);
-
- // Queues asynchronous data transfers and kernels to the CUDA stream
- // returned by cms::cuda::ScopedContextAcquire::stream()
- gpuAlgo_.makeAsync(inputData, ctx.stream());
-
- // Destructor of ctx queues a callback to the CUDA stream notifying
- // waitingTaskHolder when the queued asynchronous work has finished
-}
-
-// Called after the asynchronous work has finished
-void IsolatedProducerCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
- // Real life is likely more complex than this simple example. Here
- // getResult() returns some data in CPU memory that is passed
- // directly to the OutputData constructor.
- iEvent.emplace(outputToken_, gpuAlgo_.getResult());
-}
-```
-
-### Producer with CUDA output
-
-```cpp
-class ProducerOutputCUDA: public edm::stream::EDProducer {
-public:
- ...
- void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
- void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
- ...
-private:
- ...
- ProducerOutputGPUAlgo gpuAlgo_;
- edm::EDGetTokenT inputToken_;
- edm::EDPutTokenT> outputToken_;
- cms::cuda::ContextState ctxState_;
-};
-...
-void ProducerOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
- // Sets the current device and creates a CUDA stream
- cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_};
-
- auto const& inputData = iEvent.get(inputToken_);
-
- // Queues asynchronous data transfers and kernels to the CUDA stream
- // returned by cms::cuda::ScopedContextAcquire::stream()
- gpuAlgo.makeAsync(inputData, ctx.stream());
-
- // Destructor of ctx queues a callback to the CUDA stream notifying
- // waitingTaskHolder when the queued asynchronous work has finished,
- // and saves the device and CUDA stream to ctxState_
-}
-
-// Called after the asynchronous work has finished
-void ProducerOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
- // Sets again the current device, uses the CUDA stream created in the acquire()
- cms::cuda::ScopedContextProduce ctx{ctxState_};
-
- // Now getResult() returns data in GPU memory that is passed to the
- // constructor of OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the
- // OutputData to cms::cuda::Product. cms::cuda::Product stores also
- // the current device and the CUDA stream since those will be needed
- // in the consumer side.
- ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult());
-}
-```
-
-### Producer with CUDA input
-
-```cpp
-class ProducerInputCUDA: public edm::stream::EDProducer {
-public:
- ...
- void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
- void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
- ...
-private:
- ...
- ProducerInputGPUAlgo gpuAlgo_;
- edm::EDGetTokenT> inputToken_;
- edm::EDGetTokenT> otherInputToken_;
- edm::EDPutTokenT outputToken_;
-};
-...
-void ProducerInputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
- cms::cuda::Product const& inputDataWrapped = iEvent.get(inputToken_);
-
- // Set the current device to the same that was used to produce
- // InputData, and possibly use the same CUDA stream
- cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
-
- // Grab the real input data. Checks that the input data is on the
- // current device. If the input data was produced in a different CUDA
- // stream than the cms::cuda::ScopedContextAcquire holds, create an inter-stream
- // synchronization point with CUDA event and cudaStreamWaitEvent()
- auto const& inputData = ctx.get(inputDataWrapped);
-
- // Input data from another producer
- auto const& otherInputData = ctx.get(iEvent.get(otherInputToken_));
- // or
- auto const& otherInputData = ctx.get(iEvent, otherInputToken_);
-
-
- // Queues asynchronous data transfers and kernels to the CUDA stream
- // returned by cms::cuda::ScopedContextAcquire::stream()
- gpuAlgo.makeAsync(inputData, otherInputData, ctx.stream());
-
- // Destructor of ctx queues a callback to the CUDA stream notifying
- // waitingTaskHolder when the queued asynchronous work has finished
-}
-
-// Called after the asynchronous work has finished
-void ProducerInputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) {
- // Real life is likely more complex than this simple example. Here
- // getResult() returns some data in CPU memory that is passed
- // directly to the OutputData constructor.
- iEvent.emplace(outputToken_, gpuAlgo_.getResult());
-}
-```
-
-See [further below](#setting-the-current-device) for the conditions
-when the `cms::cuda::ScopedContextAcquire` constructor reuses the CUDA stream. Note
-that the `cms::cuda::ScopedContextAcquire` constructor taking `edm::StreamID` is
-allowed, it will just always create a new CUDA stream.
-
-
-### Producer with CUDA input and output (with ExternalWork)
-
-```cpp
-class ProducerInputOutputCUDA: public edm::stream::EDProducer {
-public:
- ...
- void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
- void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override;
- ...
-private:
- ...
- ProducerInputGPUAlgo gpuAlgo_;
- edm::EDGetTokenT> inputToken_;
- edm::EDPutTokenT> outputToken_;
-};
-...
-void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
- cms::cuda::Product const& inputDataWrapped = iEvent.get(inputToken_);
-
- // Set the current device to the same that was used to produce
- // InputData, and also use the same CUDA stream
- cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_};
-
- // Grab the real input data. Checks that the input data is on the
- // current device. If the input data was produced in a different CUDA
- // stream than the cms::cuda::ScopedContextAcquire holds, create an inter-stream
- // synchronization point with CUDA event and cudaStreamWaitEvent()
- auto const& inputData = ctx.get(inputDataWrapped);
-
- // Queues asynchronous data transfers and kernels to the CUDA stream
- // returned by cms::cuda::ScopedContextAcquire::stream()
- gpuAlgo.makeAsync(inputData, ctx.stream());
-
- // Destructor of ctx queues a callback to the CUDA stream notifying
- // waitingTaskHolder when the queued asynchronous work has finished,
- // and saves the device and CUDA stream to ctxState_
-}
-
-// Called after the asynchronous work has finished
-void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) {
- // Sets again the current device, uses the CUDA stream created in the acquire()
- cms::cuda::ScopedContextProduce ctx{ctxState_};
-
- // Now getResult() returns data in GPU memory that is passed to the
- // constructor of OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the
- // OutputData to cms::cuda::Product. cms::cuda::Product stores also
- // the current device and the CUDA stream since those will be needed
- // in the consumer side.
- ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult());
-}
-```
-
-[Complete example](../CUDATest/plugins/TestCUDAProducerGPUEW.cc)
-
-
-### Producer with CUDA input and output (without ExternalWork)
-
-If the producer does not need to transfer anything back to CPU (like
-the number of output elements), the `ExternalWork` extension is not
-needed as there is no need to synchronize.
-
-```cpp
-class ProducerInputOutputCUDA: public edm::global::EDProducer<> {
-public:
- ...
- void produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup& iSetup) const override;
- ...
-private:
- ...
- ProducerInputGPUAlgo gpuAlgo_;
- edm::EDGetTokenT> inputToken_;
- edm::EDPutTokenT> outputToken_;
-};
-...
-void ProducerInputOutputCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup& iSetup) const {
- cms::cuda::Product const& inputDataWrapped = iEvent.get(inputToken_);
-
- // Set the current device to the same that was used to produce
- // InputData, and possibly use the same CUDA stream
- cms::cuda::ScopedContextProduce ctx{inputDataWrapped};
-
- // Grab the real input data. Checks that the input data is on the
- // current device. If the input data was produced in a different CUDA
- // stream than the cms::cuda::ScopedContextProduce holds, create an inter-stream
- // synchronization point with CUDA event and cudaStreamWaitEvent()
- auto const& inputData = ctx.get(inputDataWrapped);
-
- // Queues asynchronous data transfers and kernels to the CUDA stream
- // returned by cms::cuda::ScopedContextProduce::stream(). Here makeAsync() also
- // returns data in GPU memory that is passed to the constructor of
- // OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the OutputData to
- // cms::cuda::Product. cms::cuda::Product stores also the current
- // device and the CUDA stream since those will be needed in the
- // consumer side.
- ctx.emplace(iEvent, outputToken, gpuAlgo.makeAsync(inputData, ctx.stream());
-
- // Destructor of ctx queues a callback to the CUDA stream notifying
- // waitingTaskHolder when the queued asynchronous work has finished
-}
-```
-
-[Complete example](../CUDATest/plugins/TestCUDAProducerGPU.cc)
-
-
-### Analyzer with CUDA input
-
-Analyzer with CUDA input is similar to [producer with CUDA
-input](#producer-with-cuda-input). Note that currently we do not have
-a mechanism for portable configurations with analyzers. This means
-that a configuration with a CUDA analyzer can only run on a machine
-with CUDA device(s).
-
-```cpp
-class AnalyzerInputCUDA: public edm::global::EDAnalyzer<> {
-public:
- ...
- void analyzer(edm::Event const& iEvent, edm::EventSetup const& iSetup) override;
- ...
-private:
- ...
- AnalyzerInputGPUAlgo gpuAlgo_;
- edm::EDGetTokenT> inputToken_;
- edm::EDGetTokenT> otherInputToken_;
-};
-...
-void AnalyzerInputCUDA::analyze(edm::Event const& iEvent, edm::EventSetup& iSetup) {
- cms::cuda::Product const& inputDataWrapped = iEvent.get(inputToken_);
-
- // Set the current device to the same that was used to produce
- // InputData, and possibly use the same CUDA stream
- cms::cuda::ScopedContextAnalyze ctx{inputDataWrapped};
-
- // Grab the real input data. Checks that the input data is on the
- // current device. If the input data was produced in a different CUDA
- // stream than the cms::cuda::ScopedContextAnalyze holds, create an inter-stream
- // synchronization point with CUDA event and cudaStreamWaitEvent()
- auto const& inputData = ctx.get(inputDataWrapped);
-
- // Input data from another producer
- auto const& otherInputData = ctx.get(iEvent.get(otherInputToken_));
- // or
- auto const& otherInputData = ctx.get(iEvent, otherInputToken_);
-
-
- // Queues asynchronous data transfers and kernels to the CUDA stream
- // returned by cms::cuda::ScopedContextAnalyze::stream()
- gpuAlgo.analyzeAsync(inputData, otherInputData, ctx.stream());
-}
-```
-
-[Complete example](../CUDATest/plugins/TestCUDAAnalyzerGPU.cc)
-
-
-### Configuration
-
-#### GPU-only configuration
-
-For a GPU-only configuration there is nothing special to be done, just
-construct the Paths/Sequences/Tasks from the GPU modules.
-
-## More details
-
-### Device choice
-
-For multi-GPU setup the device is chosen in the first CUDA module in a
-chain of modules by one of the constructors of
-`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`
-```cpp
-// In ExternalWork acquire()
-cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), ...};
-
-// In normal produce() (or filter())
-cms::cuda::ScopedContextProduce ctx{iEvent.streamID()};
-```
-As the choice is still the static EDM stream to device assignment, the
-EDM stream ID is needed. The logic will likely evolve in the future to
-be more dynamic, and likely the device choice has to be made for the
-full event.
-
-### Data model
-
-The "GPU data product" should be a class/struct containing smart
-pointer(s) to device data (see [Memory allocation](#memory-allocation)).
-When putting the data to event, the data is wrapped to
-`cms::cuda::Product` template, which holds
-* the GPU data product
- * must be moveable, but no other restrictions
-* the current device where the data was produced, and the CUDA stream the data was produced with
-* [CUDA event for synchronization between multiple CUDA streams](#synchronizing-between-cuda-streams)
-
-Note that the `cms::cuda::Product` wrapper can be constructed only with
-`cms::cuda::ScopedContextProduce::wrap()`, and the data `T` can be obtained
-from it only with
-`cms::cuda::ScopedContextAcquire::get()`/`cms::cuda::ScopedContextProduce::get()`/`cms::cuda::ScopedContextAnalyze::get()`,
-as described further below. When putting the data product directly to
-`edm::Event`, also `cms::cuda::SCopedContextProduce::emplace()` can be used.
-
-The GPU data products that depend on the CUDA runtime should be placed
-under `CUDADataFormats` package, using the same name for sub-package
-that would be used in `DataFormats`. Everything else, e.g. SoA for
-CPU, should go under `DataFormats` as usual.
-
-
-### CUDA EDProducer
-
-#### Class declaration
-
-The CUDA producers are normal EDProducers. The `ExternalWork`
-extension should be used if a synchronization between the GPU and CPU
-is needed, e.g. when transferring data from GPU to CPU.
-
-#### Memory allocation
-
-##### Caching allocator
-
-The memory allocations should be done dynamically with the following functions
-```cpp
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-
-cms::cuda::device::unique_ptr device_buffer = cms::cuda::make_device_unique(50, cudaStream);
-cms::cuda::host::unique_ptr host_buffer = cms::cuda::make_host_unique(50, cudaStream);
-```
-
-in the `acquire()` and `produce()` functions. The same
-`cudaStream_t` object that is used for transfers and kernels
-should be passed to the allocator.
-
-The allocator is based on [`cub::CachingDeviceAllocator`](https://nvlabs.github.io/cub/structcub_1_1_caching_device_allocator.html).
-The memory is guaranteed to be reserved
-* for the host: up to the destructor of the `unique_ptr`
-* for the device: until all work queued in the `cudaStream` up to the point when the `unique_ptr` destructor is called has finished
-
-##### Non-cached pinned host `unique_ptr`
-
-In producers transferring data to GPU one may want to pinned host
-memory allocated with `cudaHostAllocWriteCombined`. As of now we don't
-want to include the flag dimension to the caching allocator. The CUDA
-API wrapper library does not support allocation flags, so we add our
-own `unique_ptr` for that.
-
-```cpp
-#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"
-
-cms::cuda::host::noncached_unique_ptr host_buffer = cms::cuda::make_host_noncached_unique(50, flags);
-```
-The `flags` is passed directly to `cudaHostAlloc()`.
-
-##### CUDA API
-
-The `cudaMalloc()` etc may be used outside of the event loop, but that
-should be limited to only relatively small allocations in order to
-allow as much re-use of device memory as possible.
-
-If really needed, the `cudaMalloc()` etc may be used also within the
-event loop, but then the cost of allocation and implicit
-synchronization should be explicitly amortized e.g. by caching.
-
-#### Setting the current device
-
-A CUDA producer should construct `cms::cuda::ScopedContextAcquire` in
-`acquire()` (`cms::cuda::ScopedContextProduce` `produce()` if not using
-`ExternalWork`) either with `edm::StreamID`, or with a
-`cms::cuda::Product` read as an input.
-
-```cpp
-// From edm::StreamID
-cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), ...};
-// or
-cms::cuda::ScopedContextProduce ctx{iEvent.streamID()};
-
-
-// From cms::cuda::Product
-cms::cuda::Product const& cclus = iEvent.get(srcToken_);
-cms::cuda::ScopedContextAcquire ctx{cclus, ...};
-// or
-cms::cuda::ScopedContextProduce ctx{cclus};
-```
-
-A CUDA analyzer should construct `cms::cuda::ScopedContextAnalyze` with a
-`cms::cuda::Product` read as an input.
-
-```cpp
-cms::cuda::Product const& cclus = iEvent.get(srcToken_);
-cms::cuda::ScopedContextAnalyze ctx{cclus};
-```
-
-`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze` work in the RAII way and does the following
-* Sets the current device for the current scope
- - If constructed from the `edm::StreamID`, chooses the device and creates a new CUDA stream
- - If constructed from the `cms::cuda::Product`, uses the same device and possibly the same CUDA stream as was used to produce the `cms::cuda::Product`
- * The CUDA stream is reused if this producer is the first consumer
- of the `cms::cuda::Product`, otherwise a new CUDA stream is created.
- This approach is simple compromise to automatically express the work of
- parallel producers in different CUDA streams, and at the same
- time allow a chain of producers to queue their work to the same
- CUDA stream.
-* Gives access to the CUDA stream the algorithm should use to queue asynchronous work
-* `cms::cuda::ScopedContextAcquire` calls `edm::WaitingTaskWithArenaHolder::doneWaiting()` when necessary (in its destructor)
-* [Synchronizes between CUDA streams if necessary](#synchronizing-between-cuda-streams)
-* Needed to get `cms::cuda::Product` from the event
- * `cms::cuda::ScopedContextProduce` is needed to put `cms::cuda::Product` to the event
-
-In case of multiple input products, from possibly different CUDA
-streams and/or CUDA devices, this approach gives the developer full
-control in which of them the kernels of the algorithm should be run.
-
-#### Getting input
-
-The real product (`T`) can be obtained from `cms::cuda::Product` only with
-the help of
-`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze`.
-
-```cpp
-// From cms::cuda::Product
-cms::cuda::Product cclus = iEvent.get(srcToken_);
-GPUClusters const& clus = ctx.get(cclus);
-
-// Directly from Event
-GPUClusters const& clus = ctx.get(iEvent, srcToken_);
-```
-
-This step is needed to
-* check that the data are on the same CUDA device
- * if not, throw an exception (with unified memory could prefetch instead)
-* if the CUDA streams are different, synchronize between them
-
-#### Calling the CUDA kernels
-
-It is usually best to wrap the CUDA kernel calls to a separate class,
-and then call methods of that class from the EDProducer. The only
-requirement is that the CUDA stream where to queue the operations
-should be the one from the
-`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze`.
-
-```cpp
-gpuAlgo.makeClustersAsync(..., ctx.stream());
-```
-
-If necessary, different CUDA streams may be used internally, but they
-should to be made to synchronize with the provided CUDA stream with
-CUDA events and `cudaStreamWaitEvent()`.
-
-
-#### Putting output
-
-The GPU data needs to be wrapped to `cms::cuda::Product` template with
-`cms::cuda::ScopedContextProduce::wrap()` or `cms::cuda::ScopedContextProduce::emplace()`
-
-```cpp
-GPUClusters clusters = gpuAlgo.makeClustersAsync(..., ctx.stream());
-std::unique_ptr> ret = ctx.wrap(clusters);
-iEvent.put(std::move(ret));
-
-// or with one line
-iEvent.put(ctx.wrap(gpuAlgo.makeClustersAsync(ctx.stream())));
-
-// or avoid one unique_ptr with emplace
-edm::PutTokenT> putToken_ = produces>(); // in constructor
-...
-ctx.emplace(iEvent, putToken_, gpuAlgo.makeClustersAsync(ctx.stream()));
-```
-
-This step is needed to
-* store the current device and CUDA stream into `cms::cuda::Product`
-* record the CUDA event needed for CUDA stream synchronization
-
-#### `ExternalWork` extension
-
-Everything above works both with and without `ExternalWork`.
-
-Without `ExternalWork` the `EDProducer`s act similar to TBB
-flowgraph's "streaming node". In other words, they just queue more
-asynchronous work to the CUDA stream in their `produce()`.
-
-The `ExternalWork` is needed when one would otherwise call
-`cudeStreamSynchronize()`. For example transferring something to CPU
-needed for downstream DQM, or queueing more asynchronous work. With
-`ExternalWork` an `acquire()` method needs to be implemented that gets
-an `edm::WaitingTaskWithArenaHolder` parameter. The
-`edm::WaitingTaskWithArenaHolder` should then be passed to the
-constructor of `cms::cuda::ScopedContextAcquire` along
-
-```cpp
-void acquire(..., edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
- cms::cuda::Product const& cclus = iEvent.get(token_);
- cms::cuda::ScopedContextAcquire ctx{cclus, std::move(waitingTaskHolder)}; // can also copy instead of move if waitingTaskHolder is needed for something else as well
- ...
-```
-
-When constructed this way, `cms::cuda::ScopedContextAcquire` registers a
-callback function to the CUDA stream in its destructor to call
-`waitingTaskHolder.doneWaiting()`.
-
-A GPU->GPU producer needs a `cms::cuda::ScopedContext` also in its
-`produce()`. The device and CUDA stream are transferred via
-`cms::cuda::ContextState` member variable:
-
-```cpp
-class FooProducerCUDA ... {
- ...
- cms::cuda::ContextState ctxState_;
-};
-
-void FooProducerCUDA::acquire(...) {
- ...
- cms::cuda::ScopedContextAcquire ctx{..., std::move(waitingTaskHolder), ctxState_};
- ...
-}
-
-void FooProducerCUDA::produce(...( {
- ...
- cms::cuda::ScopedContextProduce ctx{ctxState_};
-}
-```
-
-The `cms::cuda::ScopedContextAcquire` saves its state to the `ctxState_` in
-the destructor, and `cms::cuda::ScopedContextProduce` then restores the
-context.
-
-
-#### Transferring GPU data to CPU
-
-The GPU->CPU data transfer needs synchronization to ensure the CPU
-memory to have all data before putting that to the event. This means
-the `ExternalWork` needs to be used along
-* In `acquire()`
- * (allocate CPU memory buffers)
- * Queue all GPU->CPU transfers asynchronously
-* In `produce()`
- * If needed, read additional CPU products (e.g. from `edm::Ref`s)
- * Reformat data back to legacy data formats
- * Note: `cms::cuda::ScopedContextProduce` is **not** needed in `produce()`
-
-#### Synchronizing between CUDA streams
-
-In case the producer needs input data that were produced in two (or
-more) CUDA streams, these streams have to be synchronized. Here this
-synchronization is achieved with CUDA events.
-
-Each `cms::cuda::Product` constains also a CUDA event object. The call to
-`cms::cuda::ScopedContextProduce::wrap()` will *record* the event in the CUDA
-stream. This means that when all work queued to the CUDA stream up to
-that point has been finished, the CUDA event becomes *occurred*. Then,
-in
-`cms::cuda::ScopedContextAcquire::get()`/`cms::cuda::ScopedContextProduce::get()`/`cms::cuda::ScopedContextAnalyze::get()`,
-if the `cms::cuda::Product` to get from has a different CUDA stream than
-the
-`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze`,
-`cudaStreamWaitEvent(stream, event)` is called. This means that all
-subsequent work queued to the CUDA stream will wait for the CUDA event
-to become occurred. Therefore this subsequent work can assume that the
-to-be-getted CUDA product exists.
-
-
-### CUDA ESProduct
-
-Conditions data can be transferred to the device with the following
-pattern.
-
-1. Define a `class`/`struct` for the data to be transferred in the format accessed in the device (hereafter referred to as "payload")
-2. Define a wrapper ESProduct that holds the aforementioned data in the pinned host memory
-3. The wrapper should have a function returning the payload on the
- device memory. The function should transfer the data to the device
- asynchronously with the help of `cms::cuda::ESProduct`.
-
-#### Example
-
-```cpp
-#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
-
-// Declare the struct for the payload to be transferred. Here the
-// example is an array with (potentially) dynamic size. Note that all of
-// below becomes simpler if the array has compile-time size.
-struct ESProductExampleCUDA {
- float *someData;
- unsigned int size;
-};
-
-// Declare the wrapper ESProduct. The corresponding ESProducer should
-// produce objects of this type.
-class ESProductExampleCUDAWrapper {
-public:
- // Constructor takes the standard CPU ESProduct, and transforms the
- // necessary data to array(s) in pinned host memory
- ESProductExampleCUDAWrapper(ESProductExample const&);
-
- // Deallocates all pinned host memory
- ~ESProductExampleCUDAWrapper();
-
- // Function to return the actual payload on the memory of the current device
- ESProductExampleCUDA const *getGPUProductAsync(cudaStream_t stream) const;
-
-private:
- // Holds the data in pinned CPU memory
- float *someData_;
- unsigned int size_;
-
- // Helper struct to hold all information that has to be allocated and
- // deallocated per device
- struct GPUData {
- // Destructor should free all member pointers
- ~GPUData();
- // internal pointers are on device, struct itself is on CPU
- ESProductExampleCUDA *esproductHost = nullptr;
- // internal pounters and struct are on device
- ESProductExampleCUDA *esproductDevice = nullptr;
- };
-
- // Helper that takes care of complexity of transferring the data to
- // multiple devices
- cms::cuda::ESProduct gpuData_;
-};
-
-ESProductExampleCUDAWrapper::ESProductExampleCUDAWrapper(ESProductExample const& cpuProduct) {
- cudaCheck(cudaMallocHost(&someData_, sizeof(float)*NUM_ELEMENTS));
- // fill someData_ and size_ from cpuProduct
-}
-
-ESProductExampleCUDA const *ESProductExampleCUDAWrapper::getGPUProductAsync(cudaStream_t stream) const {
- // cms::cuda::ESProduct essentially holds an array of GPUData objects,
- // one per device. If the data have already been transferred to the
- // current device (or the transfer has been queued), the helper just
- // returns a reference to that GPUData object. Otherwise, i.e. data are
- // not yet on the current device, the helper calls the lambda to do the
- // necessary memory allocations and to queue the transfers.
- auto const& data = gpuData_.dataForCurrentDeviceAsync(stream, [this](GPUData& data, cudaStream_t stream) {
- // Allocate memory. Currently this can be with the CUDA API,
- // sometime we'll migrate to the caching allocator. Assumption is
- // that IOV changes are rare enough that adding global synchronization
- // points is not that bad (for now).
-
- // Allocate the payload object on pinned host memory.
- cudaCheck(cudaMallocHost(&data.esproductHost, sizeof(ESProductExampleCUDA)));
- // Allocate the payload array(s) on device memory.
- cudaCheck(cudaMalloc(&data.esproductHost->someData, sizeof(float)*NUM_ELEMENTS));
-
- // Allocate the payload object on the device memory.
- cudaCheck(cudaMalloc(&data.esproductDevice, sizeof(ESProductDevice)));
-
- // Complete the host-side information on the payload
- data.cablingMapHost->size = this->size_;
-
-
- // Transfer the payload, first the array(s) ...
- cudaCheck(cudaMemcpyAsync(data.esproductHost->someData, this->someData, sizeof(float)*NUM_ELEMENTS, cudaMemcpyDefault, stream));
- // ... and then the payload object
- cudaCheck(cudaMemcpyAsync(data.esproductDevice, data.esproduceHost, sizeof(ESProductExampleCUDA), cudaMemcpyDefault, stream));
-});
-
- // Returns the payload object on the memory of the current device
- return data.esproductDevice;
-}
-
-// Destructor frees all member pointers
-ESProductExampleCUDA::GPUData::~GPUData() {
- if(esproductHost != nullptr) {
- cudaCheck(cudaFree(esproductHost->someData));
- cudaCheck(cudaFreeHost(esproductHost));
- }
- cudaCheck(cudaFree(esProductDevice));
-}
-
-```
diff --git a/HeterogeneousCore/CUDACore/interface/ContextState.h b/HeterogeneousCore/CUDACore/interface/ContextState.h
deleted file mode 100644
index 9c52113cc1e8d..0000000000000
--- a/HeterogeneousCore/CUDACore/interface/ContextState.h
+++ /dev/null
@@ -1,60 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_ContextState_h
-#define HeterogeneousCore_CUDACore_ContextState_h
-
-#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
-
-#include
-
-namespace cms {
- namespace cuda {
- /**
- * The purpose of this class is to deliver the device and CUDA stream
- * information from ExternalWork's acquire() to producer() via a
- * member/StreamCache variable.
- */
- class ContextState {
- public:
- ContextState() = default;
- ~ContextState() = default;
-
- ContextState(const ContextState&) = delete;
- ContextState& operator=(const ContextState&) = delete;
- ContextState(ContextState&&) = delete;
- ContextState& operator=(ContextState&& other) = delete;
-
- private:
- friend class ScopedContextAcquire;
- friend class ScopedContextProduce;
-
- void set(int device, SharedStreamPtr stream) {
- throwIfStream();
- device_ = device;
- stream_ = std::move(stream);
- }
-
- int device() const { return device_; }
-
- const SharedStreamPtr& streamPtr() const {
- throwIfNoStream();
- return stream_;
- }
-
- SharedStreamPtr releaseStreamPtr() {
- throwIfNoStream();
- // This function needs to effectively reset stream_ (i.e. stream_
- // must be empty after this function). This behavior ensures that
- // the SharedStreamPtr is not hold for inadvertedly long (i.e. to
- // the next event), and is checked at run time.
- return std::move(stream_);
- }
-
- void throwIfStream() const;
- void throwIfNoStream() const;
-
- SharedStreamPtr stream_;
- int device_;
- };
- } // namespace cuda
-} // namespace cms
-
-#endif
diff --git a/HeterogeneousCore/CUDACore/interface/ConvertingESProducerT.h b/HeterogeneousCore/CUDACore/interface/ConvertingESProducerT.h
deleted file mode 100644
index 8018fcede7809..0000000000000
--- a/HeterogeneousCore/CUDACore/interface/ConvertingESProducerT.h
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_interface_ConvertingESProducerT_h
-#define HeterogeneousCore_CUDACore_interface_ConvertingESProducerT_h
-
-#include "FWCore/Framework/interface/ESProducer.h"
-#include "FWCore/Framework/interface/ESTransientHandle.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-#include "FWCore/Framework/interface/ModuleFactory.h"
-#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/Utilities/interface/typelookup.h"
-
-/* class template: ConvertingESProducerT
- *
- * This class template can be used to simplify the implementation of any ESProducer that reads
- * conditions data from a record and pushes derived conditions data to the same record.
- * The current use case is to convert and copy the calibrations from the CPU to the GPUs.
- */
-
-template
-class ConvertingESProducerT : public edm::ESProducer {
-public:
- explicit ConvertingESProducerT(edm::ParameterSet const& ps) {
- auto const& label = ps.getParameter("label");
- auto const& name = ps.getParameter("ComponentName");
- auto cc = setWhatProduced(this, name);
- token_ = cc.consumes(edm::ESInputTag{"", label});
- }
-
- std::unique_ptr produce(Record const& record) {
- // retrieve conditions in the old format and build a product in the new format
- return std::make_unique(record.get(token_));
- }
-
- static void fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
- edm::ParameterSetDescription desc;
-
- desc.add("ComponentName", "");
- desc.add("label", "")->setComment("ESProduct label");
- confDesc.addWithDefaultLabel(desc);
- }
-
-private:
- edm::ESGetToken token_;
-};
-
-#endif // HeterogeneousCore_CUDACore_interface_ConvertingESProducerT_h
diff --git a/HeterogeneousCore/CUDACore/interface/ConvertingESProducerWithDependenciesT.h b/HeterogeneousCore/CUDACore/interface/ConvertingESProducerWithDependenciesT.h
deleted file mode 100644
index 9a57e405ceb5c..0000000000000
--- a/HeterogeneousCore/CUDACore/interface/ConvertingESProducerWithDependenciesT.h
+++ /dev/null
@@ -1,118 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_interface_ConvertingESProducerWithDependenciesT_h
-#define HeterogeneousCore_CUDACore_interface_ConvertingESProducerWithDependenciesT_h
-
-#include
-#include
-
-#include "FWCore/Framework/interface/ESProducer.h"
-#include "FWCore/Framework/interface/ESHandle.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-#include "FWCore/Framework/interface/ModuleFactory.h"
-#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/Utilities/interface/typelookup.h"
-
-/* class template: ConvertingESProducerWithDependenciesT
- *
- * This class template can be used to simplify the implementation of any ESProducer that reads
- * multiple conditions data from one or more records record and pushes derived conditions data
- * to a combined dependent record.
- * The current use case is to convert and copy the calibrations from the CPU to the GPUs.
- */
-
-namespace detail {
- // simple implementation of a type zipper over 2 tuples
- // here, the main requirement is the default constructor for Gen template
- // which __does__ exist for ESGetToken
-
- template class Gen, typename Tuple1, typename Tuple2>
- struct TypeZipper;
-
- template class Gen, typename Tuple1, typename Tuple2, std::size_t... Is>
- auto TypeZipperImpl(Tuple1 const& t1, Tuple2 const& t2, std::index_sequence) {
- return std::make_tuple(
- Gen::type, typename std::tuple_element::type>{}...);
- }
-
- template class Gen, typename... Ts1, typename... Ts2>
- struct TypeZipper, std::tuple> {
- static_assert(sizeof...(Ts1) == sizeof...(Ts2));
- using type = typename std::decay(
- std::tuple{}, std::tuple{}, std::index_sequence_for{}))>::type;
- };
-
-} // namespace detail
-
-template
-class ConvertingESProducerWithDependenciesT;
-
-template typename CombinedRecord,
- typename... DepsRecords,
- typename Target,
- typename... Dependencies>
-class ConvertingESProducerWithDependenciesT, Target, Dependencies...>
- : public edm::ESProducer {
-public:
- static constexpr std::size_t nsources = sizeof...(Dependencies);
- static_assert(sizeof...(Dependencies) == sizeof...(DepsRecords));
-
- explicit ConvertingESProducerWithDependenciesT(edm::ParameterSet const& ps) {
- std::vector tags(nsources);
- for (std::size_t i = 0; i < nsources; i++)
- tags[i] = edm::ESInputTag{"", ps.getParameter("label" + std::to_string(i))};
-
- std::string const& name = ps.getParameter("ComponentName");
- edm::ESConsumesCollectorT> cc = setWhatProduced(this, name);
- WalkConsumes::iterate(cc, tokens_, tags);
- }
-
- std::unique_ptr produce(CombinedRecord const& record) {
- auto handles = std::tuple...>{};
- WalkAndCall...>::iterate(record, handles, tokens_);
-
- return std::apply([](auto const&... handles) { return std::make_unique((*handles)...); }, handles);
- }
-
- static void fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
- edm::ParameterSetDescription desc;
-
- desc.add("ComponentName", "");
- for (std::size_t i = 0; i < nsources; i++)
- desc.add("label" + std::to_string(i), "")->setComment("Product Label");
- confDesc.addWithDefaultLabel(desc);
- }
-
-private:
- using TokenType =
- typename detail::TypeZipper, std::tuple>::type;
- TokenType tokens_;
-
-private:
- template
- struct WalkConsumes {
- static void iterate(edm::ESConsumesCollectorT>& cc,
- TokenType& tokens,
- std::vector const& tags) {
- if constexpr (N > 0)
- WalkConsumes::iterate(cc, tokens, tags);
- std::get(tokens) = cc.consumes(tags[N]);
- }
- };
-
- template
- struct WalkAndCall {
- static void iterate(CombinedRecord const& containingRecord,
- std::tuple& ts,
- TokenType const& tokens) {
- using Record = typename std::tuple_element>::type;
- if constexpr (N > 0)
- WalkAndCall::iterate(containingRecord, ts, tokens);
- // get the right dependent record
- auto const& record = containingRecord.template getRecord();
- // assign the right element of the tuple
- std::get(ts) = record.getHandle(std::get(tokens));
- }
- };
-};
-
-#endif // HeterogeneousCore_CUDACore_interface_ConvertingESProducerWithDependenciesT_h
diff --git a/HeterogeneousCore/CUDACore/interface/ESProduct.h b/HeterogeneousCore/CUDACore/interface/ESProduct.h
deleted file mode 100644
index fbe1825b0fa4d..0000000000000
--- a/HeterogeneousCore/CUDACore/interface/ESProduct.h
+++ /dev/null
@@ -1,110 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_ESProduct_h
-#define HeterogeneousCore_CUDACore_ESProduct_h
-
-#include
-#include
-#include
-#include
-
-#include "FWCore/Utilities/interface/thread_safety_macros.h"
-#include "HeterogeneousCore/CUDAServices/interface/numberOfDevices.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"
-
-namespace cms {
- namespace cuda {
- template
- class ESProduct {
- public:
- ESProduct() : gpuDataPerDevice_(numberOfDevices()) {
- if (not gpuDataPerDevice_.empty()) {
- cms::cuda::ScopedSetDevice scopedDevice;
- for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) {
- scopedDevice.set(i);
- gpuDataPerDevice_[i].m_event = getEventCache().get();
- }
- }
- }
-
- ~ESProduct() = default;
-
- // transferAsync should be a function of (T&, cudaStream_t)
- // which enqueues asynchronous transfers (possibly kernels as well)
- // to the CUDA stream
- template
- const T& dataForCurrentDeviceAsync(cudaStream_t cudaStream, F transferAsync) const {
- int device = currentDevice();
- auto& data = gpuDataPerDevice_[device];
-
- // If the GPU data has already been filled, we can return it immediately
- if (not data.m_filled.load()) {
- // It wasn't, so need to fill it
- std::scoped_lock lk{data.m_mutex};
-
- if (data.m_filled.load()) {
- // Other thread marked it filled while we were locking the mutex, so we're free to return it
- return data.m_data;
- }
-
- if (data.m_fillingStream != nullptr) {
- // Someone else is filling
-
- // Check first if the recorded event has occurred
- if (eventWorkHasCompleted(data.m_event.get())) {
- // It was, so data is accessible from all CUDA streams on
- // the device. Set the 'filled' for all subsequent calls and
- // return the value
- auto should_be_false = data.m_filled.exchange(true);
- assert(not should_be_false);
- data.m_fillingStream = nullptr;
- } else if (data.m_fillingStream != cudaStream) {
- // Filling is still going on. For other CUDA stream, add
- // wait on the CUDA stream and return the value. Subsequent
- // work queued on the stream will wait for the event to
- // occur (i.e. transfer to finish).
- cudaCheck(cudaStreamWaitEvent(cudaStream, data.m_event.get(), 0),
- "Failed to make a stream to wait for an event");
- }
- // else: filling is still going on. But for the same CUDA
- // stream (which would be a bit strange but fine), we can just
- // return as all subsequent work should be enqueued to the
- // same CUDA stream (or stream to be explicitly synchronized
- // by the caller)
- } else {
- // Now we can be sure that the data is not yet on the GPU, and
- // this thread is the first to try that.
- transferAsync(data.m_data, cudaStream);
- assert(data.m_fillingStream == nullptr);
- data.m_fillingStream = cudaStream;
- // Record in the cudaStream an event to mark the readiness of the
- // EventSetup data on the GPU, so other streams can check for it
- cudaCheck(cudaEventRecord(data.m_event.get(), cudaStream));
- // Now the filling has been enqueued to the cudaStream, so we
- // can return the GPU data immediately, since all subsequent
- // work must be either enqueued to the cudaStream, or the cudaStream
- // must be synchronized by the caller
- }
- }
-
- return data.m_data;
- }
-
- private:
- struct Item {
- mutable std::mutex m_mutex;
- CMS_THREAD_GUARD(m_mutex) mutable SharedEventPtr m_event;
- // non-null if some thread is already filling (cudaStream_t is just a pointer)
- CMS_THREAD_GUARD(m_mutex) mutable cudaStream_t m_fillingStream = nullptr;
- mutable std::atomic m_filled = false; // easy check if data has been filled already or not
- CMS_THREAD_GUARD(m_mutex) mutable T m_data;
- };
-
- std::vector gpuDataPerDevice_;
- };
- } // namespace cuda
-} // namespace cms
-
-#endif // HeterogeneousCore_CUDACore_ESProduct_h
diff --git a/HeterogeneousCore/CUDACore/interface/JobConfigurationGPURecord.h b/HeterogeneousCore/CUDACore/interface/JobConfigurationGPURecord.h
deleted file mode 100644
index fd0b3864e909c..0000000000000
--- a/HeterogeneousCore/CUDACore/interface/JobConfigurationGPURecord.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_interface_JobConfigurationGPURecord_h
-#define HeterogeneousCore_CUDACore_interface_JobConfigurationGPURecord_h
-
-#include "FWCore/Framework/interface/EventSetupRecordImplementation.h"
-
-class JobConfigurationGPURecord : public edm::eventsetup::EventSetupRecordImplementation {};
-
-#endif // HeterogeneousCore_CUDACore_interface_JobConfigurationGPURecord_h
diff --git a/HeterogeneousCore/CUDACore/interface/ScopedContext.h b/HeterogeneousCore/CUDACore/interface/ScopedContext.h
deleted file mode 100644
index 68dfa57e7719d..0000000000000
--- a/HeterogeneousCore/CUDACore/interface/ScopedContext.h
+++ /dev/null
@@ -1,200 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_ScopedContext_h
-#define HeterogeneousCore_CUDACore_ScopedContext_h
-
-#include
-
-#include "CUDADataFormats/Common/interface/Product.h"
-#include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Utilities/interface/EDGetToken.h"
-#include "FWCore/Utilities/interface/EDPutToken.h"
-#include "FWCore/Utilities/interface/StreamID.h"
-#include "HeterogeneousCore/CUDACore/interface/ContextState.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
-
-namespace cms {
- namespace cudatest {
- class TestScopedContext;
- }
-
- namespace cuda {
-
- namespace impl {
- // This class is intended to be derived by other ScopedContext*, not for general use
- class ScopedContextBase {
- public:
- int device() const { return currentDevice_; }
-
- // cudaStream_t is a pointer to a thread-safe object, for which a
- // mutable access is needed even if the ScopedContext itself
- // would be const. Therefore it is ok to return a non-const
- // pointer from a const method here.
- cudaStream_t stream() const { return stream_.get(); }
- const SharedStreamPtr& streamPtr() const { return stream_; }
-
- protected:
- // The constructors set the current device, but the device
- // is not set back to the previous value at the destructor. This
- // should be sufficient (and tiny bit faster) as all CUDA API
- // functions relying on the current device should be called from
- // the scope where this context is. The current device doesn't
- // really matter between modules (or across TBB tasks).
- explicit ScopedContextBase(edm::StreamID streamID);
-
- explicit ScopedContextBase(const ProductBase& data);
-
- explicit ScopedContextBase(int device, SharedStreamPtr stream);
-
- private:
- int currentDevice_;
- SharedStreamPtr stream_;
- };
-
- class ScopedContextGetterBase : public ScopedContextBase {
- public:
- template
- const T& get(const Product& data) {
- synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event());
- return data.data_;
- }
-
- template
- const T& get(const edm::Event& iEvent, edm::EDGetTokenT> token) {
- return get(iEvent.get(token));
- }
-
- protected:
- template
- ScopedContextGetterBase(Args&&... args) : ScopedContextBase(std::forward(args)...) {}
-
- void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent);
- };
-
- class ScopedContextHolderHelper {
- public:
- ScopedContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder)
- : waitingTaskHolder_{std::move(waitingTaskHolder)} {}
-
- template
- void pushNextTask(F&& f, ContextState const* state);
-
- void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
- waitingTaskHolder_ = std::move(waitingTaskHolder);
- }
-
- void enqueueCallback(int device, cudaStream_t stream);
-
- private:
- edm::WaitingTaskWithArenaHolder waitingTaskHolder_;
- };
- } // namespace impl
-
- /**
- * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire():
- * - setting the current device
- * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
- * - synchronizing between CUDA streams if necessary
- * and enforce that those get done in a proper way in RAII fashion.
- */
- class ScopedContextAcquire : public impl::ScopedContextGetterBase {
- public:
- /// Constructor to create a new CUDA stream (no need for context beyond acquire())
- explicit ScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
- : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)} {}
-
- /// Constructor to create a new CUDA stream, and the context is needed after acquire()
- explicit ScopedContextAcquire(edm::StreamID streamID,
- edm::WaitingTaskWithArenaHolder waitingTaskHolder,
- ContextState& state)
- : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {}
-
- /// Constructor to (possibly) re-use a CUDA stream (no need for context beyond acquire())
- explicit ScopedContextAcquire(const ProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
- : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)} {}
-
- /// Constructor to (possibly) re-use a CUDA stream, and the context is needed after acquire()
- explicit ScopedContextAcquire(const ProductBase& data,
- edm::WaitingTaskWithArenaHolder waitingTaskHolder,
- ContextState& state)
- : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {}
-
- ~ScopedContextAcquire() noexcept(false);
-
- template
- void pushNextTask(F&& f) {
- if (contextState_ == nullptr)
- throwNoState();
- holderHelper_.pushNextTask(std::forward(f), contextState_);
- }
-
- void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
- holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
- }
-
- private:
- void throwNoState();
-
- impl::ScopedContextHolderHelper holderHelper_;
- ContextState* contextState_ = nullptr;
- };
-
- /**
- * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce():
- * - setting the current device
- * - synchronizing between CUDA streams if necessary
- * and enforce that those get done in a proper way in RAII fashion.
- */
- class ScopedContextProduce : public impl::ScopedContextGetterBase {
- public:
- /// Constructor to create a new CUDA stream (non-ExternalWork module)
- explicit ScopedContextProduce(edm::StreamID streamID) : ScopedContextGetterBase(streamID) {}
-
- /// Constructor to (possibly) re-use a CUDA stream (non-ExternalWork module)
- explicit ScopedContextProduce(const ProductBase& data) : ScopedContextGetterBase(data) {}
-
- /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
- explicit ScopedContextProduce(ContextState& state)
- : ScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {}
-
- /// Record the CUDA event, all asynchronous work must have been queued before the destructor
- ~ScopedContextProduce();
-
- template
- std::unique_ptr> wrap(T data) {
- // make_unique doesn't work because of private constructor
- return std::unique_ptr>(new Product(device(), streamPtr(), event_, std::move(data)));
- }
-
- template
- auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... args) {
- return iEvent.emplace(token, device(), streamPtr(), event_, std::forward(args)...);
- }
-
- private:
- friend class cudatest::TestScopedContext;
-
- // This construcor is only meant for testing
- explicit ScopedContextProduce(int device, SharedStreamPtr stream, SharedEventPtr event)
- : ScopedContextGetterBase(device, std::move(stream)), event_{std::move(event)} {}
-
- // create the CUDA Event upfront to catch possible errors from its creation
- SharedEventPtr event_ = getEventCache().get();
- };
-
- /**
- * The aim of this class is to do necessary per-event "initialization" in analyze()
- * - setting the current device
- * - synchronizing between CUDA streams if necessary
- * and enforce that those get done in a proper way in RAII fashion.
- */
- class ScopedContextAnalyze : public impl::ScopedContextGetterBase {
- public:
- /// Constructor to (possibly) re-use a CUDA stream
- explicit ScopedContextAnalyze(const ProductBase& data) : ScopedContextGetterBase(data) {}
- };
- } // namespace cuda
-} // namespace cms
-
-#endif
diff --git a/HeterogeneousCore/CUDACore/src/ContextState.cc b/HeterogeneousCore/CUDACore/src/ContextState.cc
deleted file mode 100644
index 0670f01d472f3..0000000000000
--- a/HeterogeneousCore/CUDACore/src/ContextState.cc
+++ /dev/null
@@ -1,16 +0,0 @@
-#include "HeterogeneousCore/CUDACore/interface/ContextState.h"
-#include "FWCore/Utilities/interface/Exception.h"
-
-namespace cms::cuda {
- void ContextState::throwIfStream() const {
- if (stream_) {
- throw cms::Exception("LogicError") << "Trying to set ContextState, but it already had a valid state";
- }
- }
-
- void ContextState::throwIfNoStream() const {
- if (not stream_) {
- throw cms::Exception("LogicError") << "Trying to get ContextState, but it did not have a valid state";
- }
- }
-} // namespace cms::cuda
diff --git a/HeterogeneousCore/CUDACore/src/JobConfigurationGPURecord.cc b/HeterogeneousCore/CUDACore/src/JobConfigurationGPURecord.cc
deleted file mode 100644
index b4bff09f07090..0000000000000
--- a/HeterogeneousCore/CUDACore/src/JobConfigurationGPURecord.cc
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h"
-#include "HeterogeneousCore/CUDACore/interface/JobConfigurationGPURecord.h"
-
-EVENTSETUP_RECORD_REG(JobConfigurationGPURecord);
diff --git a/HeterogeneousCore/CUDACore/src/ScopedContext.cc b/HeterogeneousCore/CUDACore/src/ScopedContext.cc
deleted file mode 100644
index 367e0fef6c8ac..0000000000000
--- a/HeterogeneousCore/CUDACore/src/ScopedContext.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-
-#include "FWCore/Concurrency/interface/Async.h"
-#include "FWCore/MessageLogger/interface/MessageLogger.h"
-#include "FWCore/ServiceRegistry/interface/Service.h"
-#include "FWCore/Utilities/interface/Exception.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-
-#include "chooseDevice.h"
-
-namespace cms::cuda {
- namespace impl {
- ScopedContextBase::ScopedContextBase(edm::StreamID streamID) : currentDevice_(chooseDevice(streamID)) {
- cudaCheck(cudaSetDevice(currentDevice_));
- stream_ = getStreamCache().get();
- }
-
- ScopedContextBase::ScopedContextBase(const ProductBase& data) : currentDevice_(data.device()) {
- cudaCheck(cudaSetDevice(currentDevice_));
- if (data.mayReuseStream()) {
- stream_ = data.streamPtr();
- } else {
- stream_ = getStreamCache().get();
- }
- }
-
- ScopedContextBase::ScopedContextBase(int device, SharedStreamPtr stream)
- : currentDevice_(device), stream_(std::move(stream)) {
- cudaCheck(cudaSetDevice(currentDevice_));
- }
-
- ////////////////////
-
- void ScopedContextGetterBase::synchronizeStreams(int dataDevice,
- cudaStream_t dataStream,
- bool available,
- cudaEvent_t dataEvent) {
- if (dataDevice != device()) {
- // Eventually replace with prefetch to current device (assuming unified memory works)
- // If we won't go to unified memory, need to figure out something else...
- throw cms::Exception("LogicError") << "Handling data from multiple devices is not yet supported";
- }
-
- if (dataStream != stream()) {
- // Different streams, need to synchronize
- if (not available) {
- // Event not yet occurred, so need to add synchronization
- // here. Sychronization is done by making the CUDA stream to
- // wait for an event, so all subsequent work in the stream
- // will run only after the event has "occurred" (i.e. data
- // product became available).
- cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event");
- }
- }
- }
-
- void ScopedContextHolderHelper::enqueueCallback(int device, cudaStream_t stream) {
- edm::Service async;
- SharedEventPtr event = getEventCache().get();
- cudaCheck(cudaEventRecord(event.get(), stream));
- async->runAsync(
- std::move(waitingTaskHolder_),
- [event = std::move(event)]() mutable { cudaCheck(cudaEventSynchronize(event.get())); },
- []() { return "Enqueued by cms::cuda::ScopedContextHolderHelper::enqueueCallback()"; });
- }
- } // namespace impl
-
- ////////////////////
-
- ScopedContextAcquire::~ScopedContextAcquire() noexcept(false) {
- holderHelper_.enqueueCallback(device(), stream());
- if (contextState_) {
- contextState_->set(device(), streamPtr());
- }
- }
-
- void ScopedContextAcquire::throwNoState() {
- throw cms::Exception("LogicError")
- << "Calling ScopedContextAcquire::insertNextTask() requires ScopedContextAcquire to be constructed with "
- "ContextState, but that was not the case";
- }
-
- ////////////////////
-
- ScopedContextProduce::~ScopedContextProduce() {
- // Intentionally not checking the return value to avoid throwing
- // exceptions. If this call would fail, we should get failures
- // elsewhere as well.
- cudaEventRecord(event_.get(), stream());
- }
-} // namespace cms::cuda
diff --git a/HeterogeneousCore/CUDACore/src/chooseDevice.cc b/HeterogeneousCore/CUDACore/src/chooseDevice.cc
deleted file mode 100644
index a768cea02c5ca..0000000000000
--- a/HeterogeneousCore/CUDACore/src/chooseDevice.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-#include "FWCore/ServiceRegistry/interface/Service.h"
-#include "FWCore/Utilities/interface/Exception.h"
-#include "HeterogeneousCore/CUDAServices/interface/CUDAInterface.h"
-
-#include "chooseDevice.h"
-
-namespace cms::cuda {
- int chooseDevice(edm::StreamID id) {
- edm::Service cuda;
- if (not cuda or not cuda->enabled()) {
- cms::Exception ex("CUDAError");
- ex << "Unable to choose current device because CUDAService is not preset or disabled.\n"
- << "If CUDAService was not explicitly disabled in the configuration, the probable\n"
- << "cause is that there is no GPU or there is some problem in the CUDA runtime or\n"
- << "drivers.";
- ex.addContext("Calling cms::cuda::chooseDevice()");
- throw ex;
- }
-
- // For startes we "statically" assign the device based on
- // edm::Stream number. This is suboptimal if the number of
- // edm::Streams is not a multiple of the number of CUDA devices
- // (and even then there is no load balancing).
- //
- // TODO: improve the "assignment" logic
- return id % cuda->numberOfDevices();
- }
-} // namespace cms::cuda
diff --git a/HeterogeneousCore/CUDACore/src/chooseDevice.h b/HeterogeneousCore/CUDACore/src/chooseDevice.h
deleted file mode 100644
index ab642325aaecf..0000000000000
--- a/HeterogeneousCore/CUDACore/src/chooseDevice.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_chooseDevice_h
-#define HeterogeneousCore_CUDACore_chooseDevice_h
-
-#include "FWCore/Utilities/interface/StreamID.h"
-
-namespace cms::cuda {
- int chooseDevice(edm::StreamID id);
-}
-
-#endif
diff --git a/HeterogeneousCore/CUDACore/test/BuildFile.xml b/HeterogeneousCore/CUDACore/test/BuildFile.xml
index c11ac87aeb176..19807bdf7928b 100644
--- a/HeterogeneousCore/CUDACore/test/BuildFile.xml
+++ b/HeterogeneousCore/CUDACore/test/BuildFile.xml
@@ -1,16 +1,4 @@
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/HeterogeneousCore/CUDACore/test/test_ScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_ScopedContext.cc
deleted file mode 100644
index f2a514e3a7d48..0000000000000
--- a/HeterogeneousCore/CUDACore/test/test_ScopedContext.cc
+++ /dev/null
@@ -1,145 +0,0 @@
-#include "catch2/catch_all.hpp"
-
-#include "CUDADataFormats/Common/interface/Product.h"
-#include "FWCore/Concurrency/interface/FinalWaitingTask.h"
-#include "FWCore/Concurrency/interface/WaitingTask.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h"
-
-#include "test_ScopedContextKernels.h"
-
-#include "oneapi/tbb/task_arena.h"
-#include "oneapi/tbb/task_group.h"
-
-namespace cms::cudatest {
- class TestScopedContext {
- public:
- static cuda::ScopedContextProduce make(int dev, bool createEvent) {
- cms::cuda::SharedEventPtr event;
- if (createEvent) {
- event = cms::cuda::getEventCache().get();
- }
- return cuda::ScopedContextProduce(dev, cms::cuda::getStreamCache().get(), std::move(event));
- }
- };
-} // namespace cms::cudatest
-
-namespace {
- std::unique_ptr> produce(int device, int* d, int* h) {
- auto ctx = cms::cudatest::TestScopedContext::make(device, true);
- cudaCheck(cudaMemcpyAsync(d, h, sizeof(int), cudaMemcpyHostToDevice, ctx.stream()));
- cms::cudatest::testScopedContextKernels_single(d, ctx.stream());
- return ctx.wrap(d);
- }
-} // namespace
-
-TEST_CASE("Use of cms::cuda::ScopedContext", "[CUDACore]") {
- if (not cms::cudatest::testDevices()) {
- return;
- }
-
- constexpr int defaultDevice = 0;
- {
- auto ctx = cms::cudatest::TestScopedContext::make(defaultDevice, true);
-
- SECTION("Construct from device ID") { REQUIRE(cms::cuda::currentDevice() == defaultDevice); }
-
- SECTION("Wrap T to cms::cuda::Product") {
- std::unique_ptr> dataPtr = ctx.wrap(10);
- REQUIRE(dataPtr.get() != nullptr);
- REQUIRE(dataPtr->device() == ctx.device());
- REQUIRE(dataPtr->stream() == ctx.stream());
- }
-
- SECTION("Construct from from cms::cuda::Product") {
- std::unique_ptr> dataPtr = ctx.wrap(10);
- const auto& data = *dataPtr;
-
- cms::cuda::ScopedContextProduce ctx2{data};
- REQUIRE(cms::cuda::currentDevice() == data.device());
- REQUIRE(ctx2.stream() == data.stream());
-
- // Second use of a product should lead to new stream
- cms::cuda::ScopedContextProduce ctx3{data};
- REQUIRE(cms::cuda::currentDevice() == data.device());
- REQUIRE(ctx3.stream() != data.stream());
- }
-
- SECTION("Storing state in cms::cuda::ContextState") {
- oneapi::tbb::task_arena arena(1);
- arena.execute([&ctx]() {
- cms::cuda::ContextState ctxstate;
- { // acquire
- std::unique_ptr> dataPtr = ctx.wrap(10);
- const auto& data = *dataPtr;
- oneapi::tbb::task_group group;
- edm::FinalWaitingTask waitTask{group};
- {
- edm::WaitingTaskWithArenaHolder dummy{group, &waitTask};
- cms::cuda::ScopedContextAcquire ctx2{data, dummy, ctxstate};
- }
- waitTask.wait();
- }
-
- { // produce
- cms::cuda::ScopedContextProduce ctx2{ctxstate};
- REQUIRE(cms::cuda::currentDevice() == ctx.device());
- REQUIRE(ctx2.stream() == ctx.stream());
- }
- });
- }
-
- SECTION("Joining multiple CUDA streams") {
- cms::cuda::ScopedSetDevice setDeviceForThisScope(defaultDevice);
-
- // Mimick a producer on the first CUDA stream
- int h_a1 = 1;
- auto d_a1 = cms::cuda::make_device_unique(nullptr);
- auto wprod1 = produce(defaultDevice, d_a1.get(), &h_a1);
-
- // Mimick a producer on the second CUDA stream
- int h_a2 = 2;
- auto d_a2 = cms::cuda::make_device_unique(nullptr);
- auto wprod2 = produce(defaultDevice, d_a2.get(), &h_a2);
-
- REQUIRE(wprod1->stream() != wprod2->stream());
-
- // Mimick a third producer "joining" the two streams
- cms::cuda::ScopedContextProduce ctx2{*wprod1};
-
- auto prod1 = ctx2.get(*wprod1);
- auto prod2 = ctx2.get(*wprod2);
-
- auto d_a3 = cms::cuda::make_device_unique(nullptr);
- cms::cudatest::testScopedContextKernels_join(prod1, prod2, d_a3.get(), ctx2.stream());
- cudaCheck(cudaStreamSynchronize(ctx2.stream()));
- REQUIRE(wprod2->isAvailable());
- REQUIRE(cms::cuda::eventWorkHasCompleted(wprod2->event()));
-
- h_a1 = 0;
- h_a2 = 0;
- int h_a3 = 0;
-
- cudaCheck(cudaMemcpyAsync(&h_a1, d_a1.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
- cudaCheck(cudaMemcpyAsync(&h_a2, d_a2.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
- cudaCheck(cudaMemcpyAsync(&h_a3, d_a3.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
-
- REQUIRE(h_a1 == 2);
- REQUIRE(h_a2 == 4);
- REQUIRE(h_a3 == 6);
- }
- }
-
- cudaCheck(cudaSetDevice(defaultDevice));
- cudaCheck(cudaDeviceSynchronize());
- // Note: CUDA resources are cleaned up by the destructors of the global cache objects
-}
diff --git a/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.cu b/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.cu
deleted file mode 100644
index b87f1e20a5f24..0000000000000
--- a/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.cu
+++ /dev/null
@@ -1,17 +0,0 @@
-#include "test_ScopedContextKernels.h"
-
-namespace {
- __global__ void single_mul(int *d) { d[0] = d[0] * 2; }
-
- __global__ void join_add(const int *d1, const int *d2, int *d3) { d3[0] = d1[0] + d2[0]; }
-} // namespace
-
-namespace cms {
- namespace cudatest {
- void testScopedContextKernels_single(int *d, cudaStream_t stream) { single_mul<<<1, 1, 0, stream>>>(d); }
-
- void testScopedContextKernels_join(const int *d1, const int *d2, int *d3, cudaStream_t stream) {
- join_add<<<1, 1, 0, stream>>>(d1, d2, d3);
- }
- } // namespace cudatest
-} // namespace cms
diff --git a/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.h b/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.h
deleted file mode 100644
index dfc55682afc76..0000000000000
--- a/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_test_ScopedContextKernels_h
-#define HeterogeneousCore_CUDACore_test_ScopedContextKernels_h
-
-#include