diff --git a/CUDADataFormats/Common/BuildFile.xml b/CUDADataFormats/Common/BuildFile.xml deleted file mode 100644 index e4971bdf3ebbe..0000000000000 --- a/CUDADataFormats/Common/BuildFile.xml +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - - - - diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h deleted file mode 100644 index 8cfa5c9f5ffde..0000000000000 --- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h +++ /dev/null @@ -1,194 +0,0 @@ -#ifndef CUDADataFormatsCommonHeterogeneousSoA_H -#define CUDADataFormatsCommonHeterogeneousSoA_H - -#include - -#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" - -// a heterogeneous unique pointer... -template -class HeterogeneousSoA { -public: - using Product = T; - - HeterogeneousSoA() = default; // make root happy - ~HeterogeneousSoA() = default; - HeterogeneousSoA(HeterogeneousSoA &&) = default; - HeterogeneousSoA &operator=(HeterogeneousSoA &&) = default; - - explicit HeterogeneousSoA(cms::cuda::device::unique_ptr &&p) : dm_ptr(std::move(p)) {} - explicit HeterogeneousSoA(cms::cuda::host::unique_ptr &&p) : hm_ptr(std::move(p)) {} - explicit HeterogeneousSoA(std::unique_ptr &&p) : std_ptr(std::move(p)) {} - - auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); } - - auto const &operator*() const { return *get(); } - - auto const *operator->() const { return get(); } - - auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); } - - auto &operator*() { return *get(); } - - auto *operator->() { return get(); } - - // in reality valid only for GPU version... - cms::cuda::host::unique_ptr toHostAsync(cudaStream_t stream) const { - assert(dm_ptr); - auto ret = cms::cuda::make_host_unique(stream); - cudaCheck(cudaMemcpyAsync(ret.get(), dm_ptr.get(), sizeof(T), cudaMemcpyDefault, stream)); - return ret; - } - -private: - // a union wan't do it, a variant will not be more efficienct - cms::cuda::device::unique_ptr dm_ptr; //! - cms::cuda::host::unique_ptr hm_ptr; //! - std::unique_ptr std_ptr; //! -}; - -namespace cms { - namespace cudacompat { - - struct GPUTraits { - template - using unique_ptr = cms::cuda::device::unique_ptr; - - template - static auto make_unique(cudaStream_t stream) { - return cms::cuda::make_device_unique(stream); - } - - template - static auto make_unique(size_t size, cudaStream_t stream) { - return cms::cuda::make_device_unique(size, stream); - } - - template - static auto make_host_unique(cudaStream_t stream) { - return cms::cuda::make_host_unique(stream); - } - - template - static auto make_device_unique(cudaStream_t stream) { - return cms::cuda::make_device_unique(stream); - } - - template - static auto make_device_unique(size_t size, cudaStream_t stream) { - return cms::cuda::make_device_unique(size, stream); - } - }; - - struct HostTraits { - template - using unique_ptr = cms::cuda::host::unique_ptr; - - template - static auto make_unique(cudaStream_t stream) { - return cms::cuda::make_host_unique(stream); - } - - template - static auto make_unique(size_t size, cudaStream_t stream) { - return cms::cuda::make_host_unique(size, stream); - } - - template - static auto make_host_unique(cudaStream_t stream) { - return cms::cuda::make_host_unique(stream); - } - - template - static auto make_device_unique(cudaStream_t stream) { - return cms::cuda::make_device_unique(stream); - } - - template - static auto make_device_unique(size_t size, cudaStream_t stream) { - return cms::cuda::make_device_unique(size, stream); - } - }; - - struct CPUTraits { - template - using unique_ptr = std::unique_ptr; - - template - static auto make_unique(cudaStream_t) { - return std::make_unique(); - } - - template - static auto make_unique(size_t size, cudaStream_t) { - return std::make_unique(size); - } - - template - static auto make_host_unique(cudaStream_t) { - return std::make_unique(); - } - - template - static auto make_device_unique(cudaStream_t) { - return std::make_unique(); - } - - template - static auto make_device_unique(size_t size, cudaStream_t) { - return std::make_unique(size); - } - }; - - } // namespace cudacompat -} // namespace cms - -// a heterogeneous unique pointer (of a different sort) ... -template -class HeterogeneousSoAImpl { -public: - template - using unique_ptr = typename Traits::template unique_ptr; - - HeterogeneousSoAImpl() = default; // make root happy - ~HeterogeneousSoAImpl() = default; - HeterogeneousSoAImpl(HeterogeneousSoAImpl &&) = default; - HeterogeneousSoAImpl &operator=(HeterogeneousSoAImpl &&) = default; - - explicit HeterogeneousSoAImpl(unique_ptr &&p) : m_ptr(std::move(p)) {} - explicit HeterogeneousSoAImpl(cudaStream_t stream); - - T const *get() const { return m_ptr.get(); } - - T *get() { return m_ptr.get(); } - - cms::cuda::host::unique_ptr toHostAsync(cudaStream_t stream) const; - -private: - unique_ptr m_ptr; //! -}; - -template -HeterogeneousSoAImpl::HeterogeneousSoAImpl(cudaStream_t stream) { - m_ptr = Traits::template make_unique(stream); -} - -// in reality valid only for GPU version... -template -cms::cuda::host::unique_ptr HeterogeneousSoAImpl::toHostAsync(cudaStream_t stream) const { - auto ret = cms::cuda::make_host_unique(stream); - cudaCheck(cudaMemcpyAsync(ret.get(), get(), sizeof(T), cudaMemcpyDefault, stream)); - return ret; -} - -template -using HeterogeneousSoAGPU = HeterogeneousSoAImpl; -template -using HeterogeneousSoACPU = HeterogeneousSoAImpl; -template -using HeterogeneousSoAHost = HeterogeneousSoAImpl; - -#endif diff --git a/CUDADataFormats/Common/interface/HostProduct.h b/CUDADataFormats/Common/interface/HostProduct.h deleted file mode 100644 index 63a152298e42b..0000000000000 --- a/CUDADataFormats/Common/interface/HostProduct.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef CUDADataFormatsCommonHostProduct_H -#define CUDADataFormatsCommonHostProduct_H - -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" - -// a heterogeneous unique pointer... -template -class HostProduct { -public: - HostProduct() = default; // make root happy - ~HostProduct() = default; - HostProduct(HostProduct&&) = default; - HostProduct& operator=(HostProduct&&) = default; - - explicit HostProduct(cms::cuda::host::unique_ptr&& p) : hm_ptr(std::move(p)) {} - explicit HostProduct(std::unique_ptr&& p) : std_ptr(std::move(p)) {} - - auto const* get() const { return hm_ptr ? hm_ptr.get() : std_ptr.get(); } - - auto const& operator*() const { return *get(); } - - auto const* operator->() const { return get(); } - -private: - cms::cuda::host::unique_ptr hm_ptr; //! - std::unique_ptr std_ptr; //! -}; - -#endif diff --git a/CUDADataFormats/Common/interface/PortableDeviceCollection.h b/CUDADataFormats/Common/interface/PortableDeviceCollection.h deleted file mode 100644 index 78f72cb3d5437..0000000000000 --- a/CUDADataFormats/Common/interface/PortableDeviceCollection.h +++ /dev/null @@ -1,67 +0,0 @@ -#ifndef CUDADataFormats_Common_interface_PortableDeviceCollection_h -#define CUDADataFormats_Common_interface_PortableDeviceCollection_h - -#include -#include - -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" - -namespace cms::cuda { - - // generic SoA-based product in device memory - template - class PortableDeviceCollection { - public: - using Layout = T; - using View = typename Layout::View; - using ConstView = typename Layout::ConstView; - using Buffer = cms::cuda::device::unique_ptr; - - PortableDeviceCollection() = default; - - PortableDeviceCollection(int32_t elements, cudaStream_t stream) - : buffer_{cms::cuda::make_device_unique(Layout::computeDataSize(elements), stream)}, - layout_{buffer_.get(), elements}, - view_{layout_} { - // CUDA device memory uses a default alignment of at least 128 bytes - assert(reinterpret_cast(buffer_.get()) % Layout::alignment == 0); - } - - // non-copyable - PortableDeviceCollection(PortableDeviceCollection const&) = delete; - PortableDeviceCollection& operator=(PortableDeviceCollection const&) = delete; - - // movable - PortableDeviceCollection(PortableDeviceCollection&&) = default; - PortableDeviceCollection& operator=(PortableDeviceCollection&&) = default; - - // default destructor - ~PortableDeviceCollection() = default; - - // access the View - View& view() { return view_; } - ConstView const& view() const { return view_; } - ConstView const& const_view() const { return view_; } - - View& operator*() { return view_; } - ConstView const& operator*() const { return view_; } - - View* operator->() { return &view_; } - ConstView const* operator->() const { return &view_; } - - // access the Buffer - Buffer& buffer() { return buffer_; } - Buffer const& buffer() const { return buffer_; } - Buffer const& const_buffer() const { return buffer_; } - - size_t bufferSize() const { return layout_.metadata().byteSize(); } - - private: - Buffer buffer_; //! - Layout layout_; // - View view_; //! - }; - -} // namespace cms::cuda - -#endif // CUDADataFormats_Common_interface_PortableDeviceCollection_h diff --git a/CUDADataFormats/Common/interface/PortableHostCollection.h b/CUDADataFormats/Common/interface/PortableHostCollection.h deleted file mode 100644 index cfaf40c85b3bc..0000000000000 --- a/CUDADataFormats/Common/interface/PortableHostCollection.h +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef CUDADataFormats_Common_interface_PortableHostCollection_h -#define CUDADataFormats_Common_interface_PortableHostCollection_h - -#include -#include - -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" - -namespace cms::cuda { - - // generic SoA-based product in host memory - template - class PortableHostCollection { - public: - using Layout = T; - using View = typename Layout::View; - using ConstView = typename Layout::ConstView; - using Buffer = cms::cuda::host::unique_ptr; - - PortableHostCollection() = default; - - PortableHostCollection(int32_t elements) - // allocate pageable host memory - : buffer_{cms::cuda::make_host_unique(Layout::computeDataSize(elements))}, - layout_{buffer_.get(), elements}, - view_{layout_} { - // make_host_unique for pageable host memory uses a default alignment of 128 bytes - assert(reinterpret_cast(buffer_.get()) % Layout::alignment == 0); - } - - PortableHostCollection(int32_t elements, cudaStream_t stream) - // allocate pinned host memory, accessible by the current device - : buffer_{cms::cuda::make_host_unique(Layout::computeDataSize(elements), stream)}, - layout_{buffer_.get(), elements}, - view_{layout_} { - // CUDA pinned host memory uses a default alignment of at least 128 bytes - assert(reinterpret_cast(buffer_.get()) % Layout::alignment == 0); - } - - // non-copyable - PortableHostCollection(PortableHostCollection const&) = delete; - PortableHostCollection& operator=(PortableHostCollection const&) = delete; - - // movable - PortableHostCollection(PortableHostCollection&&) = default; - PortableHostCollection& operator=(PortableHostCollection&&) = default; - - // default destructor - ~PortableHostCollection() = default; - - // access the View - View& view() { return view_; } - ConstView const& view() const { return view_; } - ConstView const& const_view() const { return view_; } - - View& operator*() { return view_; } - ConstView const& operator*() const { return view_; } - - View* operator->() { return &view_; } - ConstView const* operator->() const { return &view_; } - - // access the Buffer - Buffer& buffer() { return buffer_; } - Buffer const& buffer() const { return buffer_; } - Buffer const& const_buffer() const { return buffer_; } - - size_t bufferSize() const { return layout_.metadata().byteSize(); } - - // part of the ROOT read streamer - static void ROOTReadStreamer(PortableHostCollection* newObj, Layout const& layout) { - newObj->~PortableHostCollection(); - // allocate pinned host memory using the legacy stream, that synchronises with all (blocking) streams - new (newObj) PortableHostCollection(layout.metadata().size()); - newObj->layout_.ROOTReadStreamer(layout); - } - - private: - Buffer buffer_; //! - Layout layout_; // - View view_; //! - }; - -} // namespace cms::cuda - -#endif // CUDADataFormats_Common_interface_PortableHostCollection_h diff --git a/CUDADataFormats/Common/interface/Product.h b/CUDADataFormats/Common/interface/Product.h deleted file mode 100644 index 41bb8356e67cf..0000000000000 --- a/CUDADataFormats/Common/interface/Product.h +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef CUDADataFormats_Common_Product_h -#define CUDADataFormats_Common_Product_h - -#include - -#include "CUDADataFormats/Common/interface/ProductBase.h" - -namespace edm { - template - class Wrapper; -} - -namespace cms { - namespace cuda { - namespace impl { - class ScopedContextGetterBase; - } - - /** - * The purpose of this class is to wrap CUDA data to edm::Event in a - * way which forces correct use of various utilities. - * - * The non-default construction has to be done with cms::cuda::ScopedContext - * (in order to properly register the CUDA event). - * - * The default constructor is needed only for the ROOT dictionary generation. - * - * The CUDA event is in practice needed only for stream-stream - * synchronization, but someone with long-enough lifetime has to own - * it. Here is a somewhat natural place. If overhead is too much, we - * can use them only where synchronization between streams is needed. - */ - template - class Product : public ProductBase { - public: - Product() = default; // Needed only for ROOT dictionary generation - - Product(const Product&) = delete; - Product& operator=(const Product&) = delete; - Product(Product&&) = default; - Product& operator=(Product&&) = default; - - private: - friend class impl::ScopedContextGetterBase; - friend class ScopedContextProduce; - friend class edm::Wrapper>; - - explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, T data) - : ProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {} - - template - explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, Args&&... args) - : ProductBase(device, std::move(stream), std::move(event)), data_(std::forward(args)...) {} - - T data_; //! - }; - } // namespace cuda -} // namespace cms - -#endif diff --git a/CUDADataFormats/Common/interface/ProductBase.h b/CUDADataFormats/Common/interface/ProductBase.h deleted file mode 100644 index efe2242903bd0..0000000000000 --- a/CUDADataFormats/Common/interface/ProductBase.h +++ /dev/null @@ -1,93 +0,0 @@ -#ifndef CUDADataFormats_Common_ProductBase_h -#define CUDADataFormats_Common_ProductBase_h - -#include -#include - -#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h" - -namespace cms { - namespace cuda { - namespace impl { - class ScopedContextBase; - } - - /** - * Base class for all instantiations of CUDA to hold the - * non-T-dependent members. - */ - class ProductBase { - public: - ProductBase() = default; // Needed only for ROOT dictionary generation - ~ProductBase(); - - ProductBase(const ProductBase&) = delete; - ProductBase& operator=(const ProductBase&) = delete; - ProductBase(ProductBase&& other) - : stream_{std::move(other.stream_)}, - event_{std::move(other.event_)}, - mayReuseStream_{other.mayReuseStream_.load()}, - device_{other.device_} {} - ProductBase& operator=(ProductBase&& other) { - stream_ = std::move(other.stream_); - event_ = std::move(other.event_); - mayReuseStream_ = other.mayReuseStream_.load(); - device_ = other.device_; - return *this; - } - - bool isValid() const { return stream_.get() != nullptr; } - bool isAvailable() const; - - int device() const { return device_; } - - // cudaStream_t is a pointer to a thread-safe object, for which a - // mutable access is needed even if the cms::cuda::ScopedContext itself - // would be const. Therefore it is ok to return a non-const - // pointer from a const method here. - cudaStream_t stream() const { return stream_.get(); } - - // cudaEvent_t is a pointer to a thread-safe object, for which a - // mutable access is needed even if the cms::cuda::ScopedContext itself - // would be const. Therefore it is ok to return a non-const - // pointer from a const method here. - cudaEvent_t event() const { return event_.get(); } - - protected: - explicit ProductBase(int device, SharedStreamPtr stream, SharedEventPtr event) - : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {} - - private: - friend class impl::ScopedContextBase; - friend class ScopedContextProduce; - - // The following function is intended to be used only from ScopedContext - const SharedStreamPtr& streamPtr() const { return stream_; } - - bool mayReuseStream() const { - bool expected = true; - bool changed = mayReuseStream_.compare_exchange_strong(expected, false); - // If the current thread is the one flipping the flag, it may - // reuse the stream. - return changed; - } - - // The cudaStream_t is really shared among edm::Event products, so - // using shared_ptr also here - SharedStreamPtr stream_; //! - // shared_ptr because of caching in cms::cuda::EventCache - SharedEventPtr event_; //! - - // This flag tells whether the CUDA stream may be reused by a - // consumer or not. The goal is to have a "chain" of modules to - // queue their work to the same stream. - mutable std::atomic mayReuseStream_ = true; //! - - // The CUDA device associated with this product - int device_ = -1; //! - }; - } // namespace cuda -} // namespace cms - -#endif diff --git a/CUDADataFormats/Common/src/ProductBase.cc b/CUDADataFormats/Common/src/ProductBase.cc deleted file mode 100644 index 8e1cf64b17122..0000000000000 --- a/CUDADataFormats/Common/src/ProductBase.cc +++ /dev/null @@ -1,29 +0,0 @@ -#include "CUDADataFormats/Common/interface/ProductBase.h" -#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h" - -namespace cms::cuda { - bool ProductBase::isAvailable() const { - // if default-constructed, the product is not available - if (not event_) { - return false; - } - return eventWorkHasCompleted(event_.get()); - } - - ProductBase::~ProductBase() { - // Make sure that the production of the product in the GPU is - // complete before destructing the product. This is to make sure - // that the EDM stream does not move to the next event before all - // asynchronous processing of the current is complete. - - // TODO: a callback notifying a WaitingTaskHolder (or similar) - // would avoid blocking the CPU, but would also require more work. - // - // Intentionally not checking the return value to avoid throwing - // exceptions. If this call would fail, we should get failures - // elsewhere as well. - if (event_) { - cudaEventSynchronize(event_.get()); - } - } -} // namespace cms::cuda diff --git a/CUDADataFormats/Common/src/classes.h b/CUDADataFormats/Common/src/classes.h deleted file mode 100644 index 239e071d513a2..0000000000000 --- a/CUDADataFormats/Common/src/classes.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef CUDADataFormats_Common_src_classes_h -#define CUDADataFormats_Common_src_classes_h - -#include "CUDADataFormats/Common/interface/HostProduct.h" -#include "DataFormats/Common/interface/Wrapper.h" - -#endif // CUDADataFormats_Common_src_classes_h diff --git a/CUDADataFormats/Common/src/classes_def.xml b/CUDADataFormats/Common/src/classes_def.xml deleted file mode 100644 index d8514251c807a..0000000000000 --- a/CUDADataFormats/Common/src/classes_def.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - diff --git a/CUDADataFormats/Common/test/BuildFile.xml b/CUDADataFormats/Common/test/BuildFile.xml deleted file mode 100644 index a0cbbdd8a7858..0000000000000 --- a/CUDADataFormats/Common/test/BuildFile.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - diff --git a/CUDADataFormats/Common/test/test_Product.cc b/CUDADataFormats/Common/test/test_Product.cc deleted file mode 100644 index 5790d07bec56d..0000000000000 --- a/CUDADataFormats/Common/test/test_Product.cc +++ /dev/null @@ -1,68 +0,0 @@ -#include "catch2/catch_all.hpp" - -#include "CUDADataFormats/Common/interface/Product.h" -#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" -#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h" -#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h" - -#include - -namespace cms::cudatest { - class TestScopedContext { - public: - static cuda::ScopedContextProduce make(int dev, bool createEvent) { - cms::cuda::SharedEventPtr event; - if (createEvent) { - event = cms::cuda::getEventCache().get(); - } - return cuda::ScopedContextProduce(dev, cms::cuda::getStreamCache().get(), std::move(event)); - } - }; -} // namespace cms::cudatest - -TEST_CASE("Use of cms::cuda::Product template", "[CUDACore]") { - SECTION("Default constructed") { - auto foo = cms::cuda::Product(); - REQUIRE(!foo.isValid()); - - auto bar = std::move(foo); - } - - if (not cms::cudatest::testDevices()) { - return; - } - - constexpr int defaultDevice = 0; - cudaCheck(cudaSetDevice(defaultDevice)); - { - auto ctx = cms::cudatest::TestScopedContext::make(defaultDevice, true); - std::unique_ptr> dataPtr = ctx.wrap(10); - auto& data = *dataPtr; - - SECTION("Construct from cms::cuda::ScopedContext") { - REQUIRE(data.isValid()); - REQUIRE(data.device() == defaultDevice); - REQUIRE(data.stream() == ctx.stream()); - REQUIRE(data.event() != nullptr); - } - - SECTION("Move constructor") { - auto data2 = cms::cuda::Product(std::move(data)); - REQUIRE(data2.isValid()); - REQUIRE(!data.isValid()); - } - - SECTION("Move assignment") { - cms::cuda::Product data2; - data2 = std::move(data); - REQUIRE(data2.isValid()); - REQUIRE(!data.isValid()); - } - } - - cudaCheck(cudaSetDevice(defaultDevice)); - cudaCheck(cudaDeviceSynchronize()); - // Note: CUDA resources are cleaned up by the destructors of the global cache objects -} diff --git a/CUDADataFormats/Common/test/test_main.cc b/CUDADataFormats/Common/test/test_main.cc deleted file mode 100644 index b3ea47c29c7a7..0000000000000 --- a/CUDADataFormats/Common/test/test_main.cc +++ /dev/null @@ -1,2 +0,0 @@ -#define CATCH_CONFIG_MAIN -#include "catch2/catch_all.hpp" diff --git a/CUDADataFormats/PortableTestObjects/BuildFile.xml b/CUDADataFormats/PortableTestObjects/BuildFile.xml deleted file mode 100644 index 595a743a6c4c5..0000000000000 --- a/CUDADataFormats/PortableTestObjects/BuildFile.xml +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - - diff --git a/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h b/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h deleted file mode 100644 index 621f0939116d7..0000000000000 --- a/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h -#define CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h - -#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h" -#include "DataFormats/PortableTestObjects/interface/TestSoA.h" - -namespace cudatest { - - // Eigen matrix - using Matrix = portabletest::Matrix; - using Array = portabletest::Array; - - // SoA with x, y, z, id fields, r scalar, m matrix, in device global memory - using TestDeviceCollection = cms::cuda::PortableDeviceCollection; - -} // namespace cudatest - -#endif // CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h diff --git a/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h b/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h deleted file mode 100644 index 9426b6c6a8275..0000000000000 --- a/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h -#define CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h - -#include "CUDADataFormats/Common/interface/PortableHostCollection.h" -#include "DataFormats/PortableTestObjects/interface/TestSoA.h" - -namespace cudatest { - - // Eigen matrix - using Matrix = portabletest::Matrix; - using Array = portabletest::Array; - - // SoA with x, y, z, id fields, r scalar, m matrix, in host memory - using TestHostCollection = cms::cuda::PortableHostCollection; - -} // namespace cudatest - -#endif // CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h diff --git a/HeterogeneousCore/CUDACore/BuildFile.xml b/HeterogeneousCore/CUDACore/BuildFile.xml deleted file mode 100644 index 42f7db8fc72d6..0000000000000 --- a/HeterogeneousCore/CUDACore/BuildFile.xml +++ /dev/null @@ -1,15 +0,0 @@ - - - - - - - - - - - - - - - diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md deleted file mode 100644 index 92a29e1460f19..0000000000000 --- a/HeterogeneousCore/CUDACore/README.md +++ /dev/null @@ -1,812 +0,0 @@ -# CUDA algorithms in CMSSW - -## Outline - -* [Introduction](#introduction) - * [Design goals](#design-goals) - * [Overall guidelines](#overall-guidelines) -* [Sub-packages](#sub-packages) -* [Examples](#examples) - * [Isolated producer (no CUDA input nor output)](#isolated-producer-no-cuda-input-nor-output) - * [Producer with CUDA output](#producer-with-cuda-output) - * [Producer with CUDA input](#producer-with-cuda-input) - * [Producer with CUDA input and output (with ExternalWork)](#producer-with-cuda-input-and-output-with-externalwork) - * [Producer with CUDA input and output, and internal chain of CPU and GPU tasks (with ExternalWork)](producer-with-cuda-input-and-output-and-internal-chain-of-cpu-and-gpu-tasks-with-externalwork) - * [Producer with CUDA input and output (without ExternalWork)](#producer-with-cuda-input-and-output-without-externalwork) - * [Analyzer with CUDA input](#analyzer-with-cuda-input) - * [Configuration](#configuration) - * [GPU-only configuration](#gpu-only-configuration) -* [More details](#more-details) - * [Device choice](#device-choice) - * [Data model](#data-model) - * [CUDA EDProducer](#cuda-edproducer) - * [Class declaration](#class-declaration) - * [Memory allocation](#memory-allocation) - * [Caching allocator](#caching-allocator) - * [Non-cached pinned host `unique_ptr`](#non-cached-pinned-host-unique_ptr) - * [CUDA API](#cuda-api) - * [Setting the current device](#setting-the-current-device) - * [Getting input](#getting-input) - * [Calling the CUDA kernels](#calling-the-cuda-kernels) - * [Putting output](#putting-output) - * [`ExternalWork` extension](#externalwork-extension) - * [Module-internal chain of CPU and GPU tasks](#module-internal-chain-of-cpu-and-gpu-tasks) - * [Transferring GPU data to CPU](#transferring-gpu-data-to-cpu) - * [Synchronizing between CUDA streams](#synchronizing-between-cuda-streams) - * [CUDA ESProduct](#cuda-esproduct) - -## Introduction - -This page documents the CUDA integration within CMSSW - -### Design goals - -1. Provide a mechanism for a chain of modules to share a resource - * Resource can be e.g. CUDA device memory or a CUDA stream -2. Minimize data movements between the CPU and the device -3. Support multiple devices -4. Allow the same job configuration to be used on all hardware combinations - -### Overall guidelines - -1. Within the `acquire()`/`produce()` functions all CUDA operations should be asynchronous, i.e. - * Use `cudaMemcpyAsync()`, `cudaMemsetAsync()`, `cudaMemPrefetchAsync()` etc. - * Avoid `cudaMalloc*()`, `cudaHostAlloc()`, `cudaFree*()`, `cudaHostRegister()`, `cudaHostUnregister()` on every event - * Occasional calls are permitted through a caching mechanism that amortizes the cost (see also [Caching allocator](#caching-allocator)) - * Avoid `assert()` in device functions, or use `#include HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h` - * With the latter the `assert()` calls in CUDA code are disabled by - default, but can be enabled by defining a `GPU_DEBUG` macro - (before the aforementioned include) -2. Synchronization needs should be fulfilled with - [`ExternalWork`](https://twiki.cern.ch/twiki/bin/view/CMSPublic/FWMultithreadedFrameworkStreamModuleInterface#edm_ExternalWork) - extension to EDProducers - * `ExternalWork` can be used to replace one synchronization point - (e.g. between device kernels and copying a known amount of data - back to CPU). - * For further synchronization points (e.g. copying data whose - amount is known only at the device side), split the work to - multiple `ExternalWork` producers. This approach has the added - benefit that e.g. data transfers to CPU become on-demand automatically - * A general breakdown of the possible steps: - * Convert input legacy CPU data format to CPU SoA - * Transfer input CPU SoA to GPU - * Launch kernels - * Transfer the number of output elements to CPU - * Transfer the output data from GPU to CPU SoA - * Convert the output SoA to legacy CPU data formats -3. Within `acquire()`/`produce()`, the current CUDA device is set - implicitly and the CUDA stream is provided by the system (with - `cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`) - * It is strongly recommended to use the provided CUDA stream for all operations - * If that is not feasible for some reason, the provided CUDA - stream must synchronize with the work queued on other CUDA - streams (with CUDA events and `cudaStreamWaitEvent()`) -4. Outside of `acquire()`/`produce()`, CUDA API functions may be - called only if the `CUDAService` implementation of the `CUDAInterface` - is available and `CUDAService::enabled()` returns `true`: - ```c++ - edm::Service cuda; - if (cuda and cuda->enabled()) { - // CUDA calls ca be made here - } - ``` - * With point 3 it follows that in these cases multiple devices have - to be dealt with explicitly, as well as CUDA streams - -## Sub-packages -* [`HeterogeneousCore/CUDACore`](#cuda-integration) CUDA-specific core components -* [`HeterogeneousCore/CUDAServices`](../CUDAServices) Various edm::Services related to CUDA -* [`HeterogeneousCore/CUDAUtilities`](../CUDAUtilities) Various utilities for CUDA kernel code -* [`HeterogeneousCore/CUDATest`](../CUDATest) Test modules and configurations -* [`CUDADataFormats/Common`](../../CUDADataFormats/Common) Utilities for event products with CUDA data - -## Examples - -### Isolated producer (no CUDA input nor output) - -```cpp -class IsolatedProducerCUDA: public edm::stream::EDProducer { -public: - ... - void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - ... -private: - ... - IsolatedProducerGPUAlgo gpuAlgo_; - edm::EDGetTokenT inputToken_; - edm::EDPutTokenT outputToken_; -}; -... -void IsolatedProducerCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - // Sets the current device and creates a CUDA stream - cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; - - auto const& inputData = iEvent.get(inputToken_); - - // Queues asynchronous data transfers and kernels to the CUDA stream - // returned by cms::cuda::ScopedContextAcquire::stream() - gpuAlgo_.makeAsync(inputData, ctx.stream()); - - // Destructor of ctx queues a callback to the CUDA stream notifying - // waitingTaskHolder when the queued asynchronous work has finished -} - -// Called after the asynchronous work has finished -void IsolatedProducerCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { - // Real life is likely more complex than this simple example. Here - // getResult() returns some data in CPU memory that is passed - // directly to the OutputData constructor. - iEvent.emplace(outputToken_, gpuAlgo_.getResult()); -} -``` - -### Producer with CUDA output - -```cpp -class ProducerOutputCUDA: public edm::stream::EDProducer { -public: - ... - void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - ... -private: - ... - ProducerOutputGPUAlgo gpuAlgo_; - edm::EDGetTokenT inputToken_; - edm::EDPutTokenT> outputToken_; - cms::cuda::ContextState ctxState_; -}; -... -void ProducerOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - // Sets the current device and creates a CUDA stream - cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_}; - - auto const& inputData = iEvent.get(inputToken_); - - // Queues asynchronous data transfers and kernels to the CUDA stream - // returned by cms::cuda::ScopedContextAcquire::stream() - gpuAlgo.makeAsync(inputData, ctx.stream()); - - // Destructor of ctx queues a callback to the CUDA stream notifying - // waitingTaskHolder when the queued asynchronous work has finished, - // and saves the device and CUDA stream to ctxState_ -} - -// Called after the asynchronous work has finished -void ProducerOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { - // Sets again the current device, uses the CUDA stream created in the acquire() - cms::cuda::ScopedContextProduce ctx{ctxState_}; - - // Now getResult() returns data in GPU memory that is passed to the - // constructor of OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the - // OutputData to cms::cuda::Product. cms::cuda::Product stores also - // the current device and the CUDA stream since those will be needed - // in the consumer side. - ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult()); -} -``` - -### Producer with CUDA input - -```cpp -class ProducerInputCUDA: public edm::stream::EDProducer { -public: - ... - void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; - ... -private: - ... - ProducerInputGPUAlgo gpuAlgo_; - edm::EDGetTokenT> inputToken_; - edm::EDGetTokenT> otherInputToken_; - edm::EDPutTokenT outputToken_; -}; -... -void ProducerInputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(inputToken_); - - // Set the current device to the same that was used to produce - // InputData, and possibly use the same CUDA stream - cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - - // Grab the real input data. Checks that the input data is on the - // current device. If the input data was produced in a different CUDA - // stream than the cms::cuda::ScopedContextAcquire holds, create an inter-stream - // synchronization point with CUDA event and cudaStreamWaitEvent() - auto const& inputData = ctx.get(inputDataWrapped); - - // Input data from another producer - auto const& otherInputData = ctx.get(iEvent.get(otherInputToken_)); - // or - auto const& otherInputData = ctx.get(iEvent, otherInputToken_); - - - // Queues asynchronous data transfers and kernels to the CUDA stream - // returned by cms::cuda::ScopedContextAcquire::stream() - gpuAlgo.makeAsync(inputData, otherInputData, ctx.stream()); - - // Destructor of ctx queues a callback to the CUDA stream notifying - // waitingTaskHolder when the queued asynchronous work has finished -} - -// Called after the asynchronous work has finished -void ProducerInputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { - // Real life is likely more complex than this simple example. Here - // getResult() returns some data in CPU memory that is passed - // directly to the OutputData constructor. - iEvent.emplace(outputToken_, gpuAlgo_.getResult()); -} -``` - -See [further below](#setting-the-current-device) for the conditions -when the `cms::cuda::ScopedContextAcquire` constructor reuses the CUDA stream. Note -that the `cms::cuda::ScopedContextAcquire` constructor taking `edm::StreamID` is -allowed, it will just always create a new CUDA stream. - - -### Producer with CUDA input and output (with ExternalWork) - -```cpp -class ProducerInputOutputCUDA: public edm::stream::EDProducer { -public: - ... - void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override; - ... -private: - ... - ProducerInputGPUAlgo gpuAlgo_; - edm::EDGetTokenT> inputToken_; - edm::EDPutTokenT> outputToken_; -}; -... -void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(inputToken_); - - // Set the current device to the same that was used to produce - // InputData, and also use the same CUDA stream - cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_}; - - // Grab the real input data. Checks that the input data is on the - // current device. If the input data was produced in a different CUDA - // stream than the cms::cuda::ScopedContextAcquire holds, create an inter-stream - // synchronization point with CUDA event and cudaStreamWaitEvent() - auto const& inputData = ctx.get(inputDataWrapped); - - // Queues asynchronous data transfers and kernels to the CUDA stream - // returned by cms::cuda::ScopedContextAcquire::stream() - gpuAlgo.makeAsync(inputData, ctx.stream()); - - // Destructor of ctx queues a callback to the CUDA stream notifying - // waitingTaskHolder when the queued asynchronous work has finished, - // and saves the device and CUDA stream to ctxState_ -} - -// Called after the asynchronous work has finished -void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { - // Sets again the current device, uses the CUDA stream created in the acquire() - cms::cuda::ScopedContextProduce ctx{ctxState_}; - - // Now getResult() returns data in GPU memory that is passed to the - // constructor of OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the - // OutputData to cms::cuda::Product. cms::cuda::Product stores also - // the current device and the CUDA stream since those will be needed - // in the consumer side. - ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult()); -} -``` - -[Complete example](../CUDATest/plugins/TestCUDAProducerGPUEW.cc) - - -### Producer with CUDA input and output (without ExternalWork) - -If the producer does not need to transfer anything back to CPU (like -the number of output elements), the `ExternalWork` extension is not -needed as there is no need to synchronize. - -```cpp -class ProducerInputOutputCUDA: public edm::global::EDProducer<> { -public: - ... - void produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup& iSetup) const override; - ... -private: - ... - ProducerInputGPUAlgo gpuAlgo_; - edm::EDGetTokenT> inputToken_; - edm::EDPutTokenT> outputToken_; -}; -... -void ProducerInputOutputCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup& iSetup) const { - cms::cuda::Product const& inputDataWrapped = iEvent.get(inputToken_); - - // Set the current device to the same that was used to produce - // InputData, and possibly use the same CUDA stream - cms::cuda::ScopedContextProduce ctx{inputDataWrapped}; - - // Grab the real input data. Checks that the input data is on the - // current device. If the input data was produced in a different CUDA - // stream than the cms::cuda::ScopedContextProduce holds, create an inter-stream - // synchronization point with CUDA event and cudaStreamWaitEvent() - auto const& inputData = ctx.get(inputDataWrapped); - - // Queues asynchronous data transfers and kernels to the CUDA stream - // returned by cms::cuda::ScopedContextProduce::stream(). Here makeAsync() also - // returns data in GPU memory that is passed to the constructor of - // OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the OutputData to - // cms::cuda::Product. cms::cuda::Product stores also the current - // device and the CUDA stream since those will be needed in the - // consumer side. - ctx.emplace(iEvent, outputToken, gpuAlgo.makeAsync(inputData, ctx.stream()); - - // Destructor of ctx queues a callback to the CUDA stream notifying - // waitingTaskHolder when the queued asynchronous work has finished -} -``` - -[Complete example](../CUDATest/plugins/TestCUDAProducerGPU.cc) - - -### Analyzer with CUDA input - -Analyzer with CUDA input is similar to [producer with CUDA -input](#producer-with-cuda-input). Note that currently we do not have -a mechanism for portable configurations with analyzers. This means -that a configuration with a CUDA analyzer can only run on a machine -with CUDA device(s). - -```cpp -class AnalyzerInputCUDA: public edm::global::EDAnalyzer<> { -public: - ... - void analyzer(edm::Event const& iEvent, edm::EventSetup const& iSetup) override; - ... -private: - ... - AnalyzerInputGPUAlgo gpuAlgo_; - edm::EDGetTokenT> inputToken_; - edm::EDGetTokenT> otherInputToken_; -}; -... -void AnalyzerInputCUDA::analyze(edm::Event const& iEvent, edm::EventSetup& iSetup) { - cms::cuda::Product const& inputDataWrapped = iEvent.get(inputToken_); - - // Set the current device to the same that was used to produce - // InputData, and possibly use the same CUDA stream - cms::cuda::ScopedContextAnalyze ctx{inputDataWrapped}; - - // Grab the real input data. Checks that the input data is on the - // current device. If the input data was produced in a different CUDA - // stream than the cms::cuda::ScopedContextAnalyze holds, create an inter-stream - // synchronization point with CUDA event and cudaStreamWaitEvent() - auto const& inputData = ctx.get(inputDataWrapped); - - // Input data from another producer - auto const& otherInputData = ctx.get(iEvent.get(otherInputToken_)); - // or - auto const& otherInputData = ctx.get(iEvent, otherInputToken_); - - - // Queues asynchronous data transfers and kernels to the CUDA stream - // returned by cms::cuda::ScopedContextAnalyze::stream() - gpuAlgo.analyzeAsync(inputData, otherInputData, ctx.stream()); -} -``` - -[Complete example](../CUDATest/plugins/TestCUDAAnalyzerGPU.cc) - - -### Configuration - -#### GPU-only configuration - -For a GPU-only configuration there is nothing special to be done, just -construct the Paths/Sequences/Tasks from the GPU modules. - -## More details - -### Device choice - -For multi-GPU setup the device is chosen in the first CUDA module in a -chain of modules by one of the constructors of -`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce` -```cpp -// In ExternalWork acquire() -cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), ...}; - -// In normal produce() (or filter()) -cms::cuda::ScopedContextProduce ctx{iEvent.streamID()}; -``` -As the choice is still the static EDM stream to device assignment, the -EDM stream ID is needed. The logic will likely evolve in the future to -be more dynamic, and likely the device choice has to be made for the -full event. - -### Data model - -The "GPU data product" should be a class/struct containing smart -pointer(s) to device data (see [Memory allocation](#memory-allocation)). -When putting the data to event, the data is wrapped to -`cms::cuda::Product` template, which holds -* the GPU data product - * must be moveable, but no other restrictions -* the current device where the data was produced, and the CUDA stream the data was produced with -* [CUDA event for synchronization between multiple CUDA streams](#synchronizing-between-cuda-streams) - -Note that the `cms::cuda::Product` wrapper can be constructed only with -`cms::cuda::ScopedContextProduce::wrap()`, and the data `T` can be obtained -from it only with -`cms::cuda::ScopedContextAcquire::get()`/`cms::cuda::ScopedContextProduce::get()`/`cms::cuda::ScopedContextAnalyze::get()`, -as described further below. When putting the data product directly to -`edm::Event`, also `cms::cuda::SCopedContextProduce::emplace()` can be used. - -The GPU data products that depend on the CUDA runtime should be placed -under `CUDADataFormats` package, using the same name for sub-package -that would be used in `DataFormats`. Everything else, e.g. SoA for -CPU, should go under `DataFormats` as usual. - - -### CUDA EDProducer - -#### Class declaration - -The CUDA producers are normal EDProducers. The `ExternalWork` -extension should be used if a synchronization between the GPU and CPU -is needed, e.g. when transferring data from GPU to CPU. - -#### Memory allocation - -##### Caching allocator - -The memory allocations should be done dynamically with the following functions -```cpp -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" -#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" - -cms::cuda::device::unique_ptr device_buffer = cms::cuda::make_device_unique(50, cudaStream); -cms::cuda::host::unique_ptr host_buffer = cms::cuda::make_host_unique(50, cudaStream); -``` - -in the `acquire()` and `produce()` functions. The same -`cudaStream_t` object that is used for transfers and kernels -should be passed to the allocator. - -The allocator is based on [`cub::CachingDeviceAllocator`](https://nvlabs.github.io/cub/structcub_1_1_caching_device_allocator.html). -The memory is guaranteed to be reserved -* for the host: up to the destructor of the `unique_ptr` -* for the device: until all work queued in the `cudaStream` up to the point when the `unique_ptr` destructor is called has finished - -##### Non-cached pinned host `unique_ptr` - -In producers transferring data to GPU one may want to pinned host -memory allocated with `cudaHostAllocWriteCombined`. As of now we don't -want to include the flag dimension to the caching allocator. The CUDA -API wrapper library does not support allocation flags, so we add our -own `unique_ptr` for that. - -```cpp -#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h" - -cms::cuda::host::noncached_unique_ptr host_buffer = cms::cuda::make_host_noncached_unique(50, flags); -``` -The `flags` is passed directly to `cudaHostAlloc()`. - -##### CUDA API - -The `cudaMalloc()` etc may be used outside of the event loop, but that -should be limited to only relatively small allocations in order to -allow as much re-use of device memory as possible. - -If really needed, the `cudaMalloc()` etc may be used also within the -event loop, but then the cost of allocation and implicit -synchronization should be explicitly amortized e.g. by caching. - -#### Setting the current device - -A CUDA producer should construct `cms::cuda::ScopedContextAcquire` in -`acquire()` (`cms::cuda::ScopedContextProduce` `produce()` if not using -`ExternalWork`) either with `edm::StreamID`, or with a -`cms::cuda::Product` read as an input. - -```cpp -// From edm::StreamID -cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), ...}; -// or -cms::cuda::ScopedContextProduce ctx{iEvent.streamID()}; - - -// From cms::cuda::Product -cms::cuda::Product const& cclus = iEvent.get(srcToken_); -cms::cuda::ScopedContextAcquire ctx{cclus, ...}; -// or -cms::cuda::ScopedContextProduce ctx{cclus}; -``` - -A CUDA analyzer should construct `cms::cuda::ScopedContextAnalyze` with a -`cms::cuda::Product` read as an input. - -```cpp -cms::cuda::Product const& cclus = iEvent.get(srcToken_); -cms::cuda::ScopedContextAnalyze ctx{cclus}; -``` - -`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze` work in the RAII way and does the following -* Sets the current device for the current scope - - If constructed from the `edm::StreamID`, chooses the device and creates a new CUDA stream - - If constructed from the `cms::cuda::Product`, uses the same device and possibly the same CUDA stream as was used to produce the `cms::cuda::Product` - * The CUDA stream is reused if this producer is the first consumer - of the `cms::cuda::Product`, otherwise a new CUDA stream is created. - This approach is simple compromise to automatically express the work of - parallel producers in different CUDA streams, and at the same - time allow a chain of producers to queue their work to the same - CUDA stream. -* Gives access to the CUDA stream the algorithm should use to queue asynchronous work -* `cms::cuda::ScopedContextAcquire` calls `edm::WaitingTaskWithArenaHolder::doneWaiting()` when necessary (in its destructor) -* [Synchronizes between CUDA streams if necessary](#synchronizing-between-cuda-streams) -* Needed to get `cms::cuda::Product` from the event - * `cms::cuda::ScopedContextProduce` is needed to put `cms::cuda::Product` to the event - -In case of multiple input products, from possibly different CUDA -streams and/or CUDA devices, this approach gives the developer full -control in which of them the kernels of the algorithm should be run. - -#### Getting input - -The real product (`T`) can be obtained from `cms::cuda::Product` only with -the help of -`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze`. - -```cpp -// From cms::cuda::Product -cms::cuda::Product cclus = iEvent.get(srcToken_); -GPUClusters const& clus = ctx.get(cclus); - -// Directly from Event -GPUClusters const& clus = ctx.get(iEvent, srcToken_); -``` - -This step is needed to -* check that the data are on the same CUDA device - * if not, throw an exception (with unified memory could prefetch instead) -* if the CUDA streams are different, synchronize between them - -#### Calling the CUDA kernels - -It is usually best to wrap the CUDA kernel calls to a separate class, -and then call methods of that class from the EDProducer. The only -requirement is that the CUDA stream where to queue the operations -should be the one from the -`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze`. - -```cpp -gpuAlgo.makeClustersAsync(..., ctx.stream()); -``` - -If necessary, different CUDA streams may be used internally, but they -should to be made to synchronize with the provided CUDA stream with -CUDA events and `cudaStreamWaitEvent()`. - - -#### Putting output - -The GPU data needs to be wrapped to `cms::cuda::Product` template with -`cms::cuda::ScopedContextProduce::wrap()` or `cms::cuda::ScopedContextProduce::emplace()` - -```cpp -GPUClusters clusters = gpuAlgo.makeClustersAsync(..., ctx.stream()); -std::unique_ptr> ret = ctx.wrap(clusters); -iEvent.put(std::move(ret)); - -// or with one line -iEvent.put(ctx.wrap(gpuAlgo.makeClustersAsync(ctx.stream()))); - -// or avoid one unique_ptr with emplace -edm::PutTokenT> putToken_ = produces>(); // in constructor -... -ctx.emplace(iEvent, putToken_, gpuAlgo.makeClustersAsync(ctx.stream())); -``` - -This step is needed to -* store the current device and CUDA stream into `cms::cuda::Product` -* record the CUDA event needed for CUDA stream synchronization - -#### `ExternalWork` extension - -Everything above works both with and without `ExternalWork`. - -Without `ExternalWork` the `EDProducer`s act similar to TBB -flowgraph's "streaming node". In other words, they just queue more -asynchronous work to the CUDA stream in their `produce()`. - -The `ExternalWork` is needed when one would otherwise call -`cudeStreamSynchronize()`. For example transferring something to CPU -needed for downstream DQM, or queueing more asynchronous work. With -`ExternalWork` an `acquire()` method needs to be implemented that gets -an `edm::WaitingTaskWithArenaHolder` parameter. The -`edm::WaitingTaskWithArenaHolder` should then be passed to the -constructor of `cms::cuda::ScopedContextAcquire` along - -```cpp -void acquire(..., edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - cms::cuda::Product const& cclus = iEvent.get(token_); - cms::cuda::ScopedContextAcquire ctx{cclus, std::move(waitingTaskHolder)}; // can also copy instead of move if waitingTaskHolder is needed for something else as well - ... -``` - -When constructed this way, `cms::cuda::ScopedContextAcquire` registers a -callback function to the CUDA stream in its destructor to call -`waitingTaskHolder.doneWaiting()`. - -A GPU->GPU producer needs a `cms::cuda::ScopedContext` also in its -`produce()`. The device and CUDA stream are transferred via -`cms::cuda::ContextState` member variable: - -```cpp -class FooProducerCUDA ... { - ... - cms::cuda::ContextState ctxState_; -}; - -void FooProducerCUDA::acquire(...) { - ... - cms::cuda::ScopedContextAcquire ctx{..., std::move(waitingTaskHolder), ctxState_}; - ... -} - -void FooProducerCUDA::produce(...( { - ... - cms::cuda::ScopedContextProduce ctx{ctxState_}; -} -``` - -The `cms::cuda::ScopedContextAcquire` saves its state to the `ctxState_` in -the destructor, and `cms::cuda::ScopedContextProduce` then restores the -context. - - -#### Transferring GPU data to CPU - -The GPU->CPU data transfer needs synchronization to ensure the CPU -memory to have all data before putting that to the event. This means -the `ExternalWork` needs to be used along -* In `acquire()` - * (allocate CPU memory buffers) - * Queue all GPU->CPU transfers asynchronously -* In `produce()` - * If needed, read additional CPU products (e.g. from `edm::Ref`s) - * Reformat data back to legacy data formats - * Note: `cms::cuda::ScopedContextProduce` is **not** needed in `produce()` - -#### Synchronizing between CUDA streams - -In case the producer needs input data that were produced in two (or -more) CUDA streams, these streams have to be synchronized. Here this -synchronization is achieved with CUDA events. - -Each `cms::cuda::Product` constains also a CUDA event object. The call to -`cms::cuda::ScopedContextProduce::wrap()` will *record* the event in the CUDA -stream. This means that when all work queued to the CUDA stream up to -that point has been finished, the CUDA event becomes *occurred*. Then, -in -`cms::cuda::ScopedContextAcquire::get()`/`cms::cuda::ScopedContextProduce::get()`/`cms::cuda::ScopedContextAnalyze::get()`, -if the `cms::cuda::Product` to get from has a different CUDA stream than -the -`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze`, -`cudaStreamWaitEvent(stream, event)` is called. This means that all -subsequent work queued to the CUDA stream will wait for the CUDA event -to become occurred. Therefore this subsequent work can assume that the -to-be-getted CUDA product exists. - - -### CUDA ESProduct - -Conditions data can be transferred to the device with the following -pattern. - -1. Define a `class`/`struct` for the data to be transferred in the format accessed in the device (hereafter referred to as "payload") -2. Define a wrapper ESProduct that holds the aforementioned data in the pinned host memory -3. The wrapper should have a function returning the payload on the - device memory. The function should transfer the data to the device - asynchronously with the help of `cms::cuda::ESProduct`. - -#### Example - -```cpp -#include "HeterogeneousCore/CUDACore/interface/ESProduct.h" - -// Declare the struct for the payload to be transferred. Here the -// example is an array with (potentially) dynamic size. Note that all of -// below becomes simpler if the array has compile-time size. -struct ESProductExampleCUDA { - float *someData; - unsigned int size; -}; - -// Declare the wrapper ESProduct. The corresponding ESProducer should -// produce objects of this type. -class ESProductExampleCUDAWrapper { -public: - // Constructor takes the standard CPU ESProduct, and transforms the - // necessary data to array(s) in pinned host memory - ESProductExampleCUDAWrapper(ESProductExample const&); - - // Deallocates all pinned host memory - ~ESProductExampleCUDAWrapper(); - - // Function to return the actual payload on the memory of the current device - ESProductExampleCUDA const *getGPUProductAsync(cudaStream_t stream) const; - -private: - // Holds the data in pinned CPU memory - float *someData_; - unsigned int size_; - - // Helper struct to hold all information that has to be allocated and - // deallocated per device - struct GPUData { - // Destructor should free all member pointers - ~GPUData(); - // internal pointers are on device, struct itself is on CPU - ESProductExampleCUDA *esproductHost = nullptr; - // internal pounters and struct are on device - ESProductExampleCUDA *esproductDevice = nullptr; - }; - - // Helper that takes care of complexity of transferring the data to - // multiple devices - cms::cuda::ESProduct gpuData_; -}; - -ESProductExampleCUDAWrapper::ESProductExampleCUDAWrapper(ESProductExample const& cpuProduct) { - cudaCheck(cudaMallocHost(&someData_, sizeof(float)*NUM_ELEMENTS)); - // fill someData_ and size_ from cpuProduct -} - -ESProductExampleCUDA const *ESProductExampleCUDAWrapper::getGPUProductAsync(cudaStream_t stream) const { - // cms::cuda::ESProduct essentially holds an array of GPUData objects, - // one per device. If the data have already been transferred to the - // current device (or the transfer has been queued), the helper just - // returns a reference to that GPUData object. Otherwise, i.e. data are - // not yet on the current device, the helper calls the lambda to do the - // necessary memory allocations and to queue the transfers. - auto const& data = gpuData_.dataForCurrentDeviceAsync(stream, [this](GPUData& data, cudaStream_t stream) { - // Allocate memory. Currently this can be with the CUDA API, - // sometime we'll migrate to the caching allocator. Assumption is - // that IOV changes are rare enough that adding global synchronization - // points is not that bad (for now). - - // Allocate the payload object on pinned host memory. - cudaCheck(cudaMallocHost(&data.esproductHost, sizeof(ESProductExampleCUDA))); - // Allocate the payload array(s) on device memory. - cudaCheck(cudaMalloc(&data.esproductHost->someData, sizeof(float)*NUM_ELEMENTS)); - - // Allocate the payload object on the device memory. - cudaCheck(cudaMalloc(&data.esproductDevice, sizeof(ESProductDevice))); - - // Complete the host-side information on the payload - data.cablingMapHost->size = this->size_; - - - // Transfer the payload, first the array(s) ... - cudaCheck(cudaMemcpyAsync(data.esproductHost->someData, this->someData, sizeof(float)*NUM_ELEMENTS, cudaMemcpyDefault, stream)); - // ... and then the payload object - cudaCheck(cudaMemcpyAsync(data.esproductDevice, data.esproduceHost, sizeof(ESProductExampleCUDA), cudaMemcpyDefault, stream)); -}); - - // Returns the payload object on the memory of the current device - return data.esproductDevice; -} - -// Destructor frees all member pointers -ESProductExampleCUDA::GPUData::~GPUData() { - if(esproductHost != nullptr) { - cudaCheck(cudaFree(esproductHost->someData)); - cudaCheck(cudaFreeHost(esproductHost)); - } - cudaCheck(cudaFree(esProductDevice)); -} - -``` diff --git a/HeterogeneousCore/CUDACore/interface/ContextState.h b/HeterogeneousCore/CUDACore/interface/ContextState.h deleted file mode 100644 index 9c52113cc1e8d..0000000000000 --- a/HeterogeneousCore/CUDACore/interface/ContextState.h +++ /dev/null @@ -1,60 +0,0 @@ -#ifndef HeterogeneousCore_CUDACore_ContextState_h -#define HeterogeneousCore_CUDACore_ContextState_h - -#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h" - -#include - -namespace cms { - namespace cuda { - /** - * The purpose of this class is to deliver the device and CUDA stream - * information from ExternalWork's acquire() to producer() via a - * member/StreamCache variable. - */ - class ContextState { - public: - ContextState() = default; - ~ContextState() = default; - - ContextState(const ContextState&) = delete; - ContextState& operator=(const ContextState&) = delete; - ContextState(ContextState&&) = delete; - ContextState& operator=(ContextState&& other) = delete; - - private: - friend class ScopedContextAcquire; - friend class ScopedContextProduce; - - void set(int device, SharedStreamPtr stream) { - throwIfStream(); - device_ = device; - stream_ = std::move(stream); - } - - int device() const { return device_; } - - const SharedStreamPtr& streamPtr() const { - throwIfNoStream(); - return stream_; - } - - SharedStreamPtr releaseStreamPtr() { - throwIfNoStream(); - // This function needs to effectively reset stream_ (i.e. stream_ - // must be empty after this function). This behavior ensures that - // the SharedStreamPtr is not hold for inadvertedly long (i.e. to - // the next event), and is checked at run time. - return std::move(stream_); - } - - void throwIfStream() const; - void throwIfNoStream() const; - - SharedStreamPtr stream_; - int device_; - }; - } // namespace cuda -} // namespace cms - -#endif diff --git a/HeterogeneousCore/CUDACore/interface/ConvertingESProducerT.h b/HeterogeneousCore/CUDACore/interface/ConvertingESProducerT.h deleted file mode 100644 index 8018fcede7809..0000000000000 --- a/HeterogeneousCore/CUDACore/interface/ConvertingESProducerT.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef HeterogeneousCore_CUDACore_interface_ConvertingESProducerT_h -#define HeterogeneousCore_CUDACore_interface_ConvertingESProducerT_h - -#include "FWCore/Framework/interface/ESProducer.h" -#include "FWCore/Framework/interface/ESTransientHandle.h" -#include "FWCore/Framework/interface/MakerMacros.h" -#include "FWCore/Framework/interface/ModuleFactory.h" -#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/Utilities/interface/typelookup.h" - -/* class template: ConvertingESProducerT - * - * This class template can be used to simplify the implementation of any ESProducer that reads - * conditions data from a record and pushes derived conditions data to the same record. - * The current use case is to convert and copy the calibrations from the CPU to the GPUs. - */ - -template -class ConvertingESProducerT : public edm::ESProducer { -public: - explicit ConvertingESProducerT(edm::ParameterSet const& ps) { - auto const& label = ps.getParameter("label"); - auto const& name = ps.getParameter("ComponentName"); - auto cc = setWhatProduced(this, name); - token_ = cc.consumes(edm::ESInputTag{"", label}); - } - - std::unique_ptr produce(Record const& record) { - // retrieve conditions in the old format and build a product in the new format - return std::make_unique(record.get(token_)); - } - - static void fillDescriptions(edm::ConfigurationDescriptions& confDesc) { - edm::ParameterSetDescription desc; - - desc.add("ComponentName", ""); - desc.add("label", "")->setComment("ESProduct label"); - confDesc.addWithDefaultLabel(desc); - } - -private: - edm::ESGetToken token_; -}; - -#endif // HeterogeneousCore_CUDACore_interface_ConvertingESProducerT_h diff --git a/HeterogeneousCore/CUDACore/interface/ConvertingESProducerWithDependenciesT.h b/HeterogeneousCore/CUDACore/interface/ConvertingESProducerWithDependenciesT.h deleted file mode 100644 index 9a57e405ceb5c..0000000000000 --- a/HeterogeneousCore/CUDACore/interface/ConvertingESProducerWithDependenciesT.h +++ /dev/null @@ -1,118 +0,0 @@ -#ifndef HeterogeneousCore_CUDACore_interface_ConvertingESProducerWithDependenciesT_h -#define HeterogeneousCore_CUDACore_interface_ConvertingESProducerWithDependenciesT_h - -#include -#include - -#include "FWCore/Framework/interface/ESProducer.h" -#include "FWCore/Framework/interface/ESHandle.h" -#include "FWCore/Framework/interface/MakerMacros.h" -#include "FWCore/Framework/interface/ModuleFactory.h" -#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/Utilities/interface/typelookup.h" - -/* class template: ConvertingESProducerWithDependenciesT - * - * This class template can be used to simplify the implementation of any ESProducer that reads - * multiple conditions data from one or more records record and pushes derived conditions data - * to a combined dependent record. - * The current use case is to convert and copy the calibrations from the CPU to the GPUs. - */ - -namespace detail { - // simple implementation of a type zipper over 2 tuples - // here, the main requirement is the default constructor for Gen template - // which __does__ exist for ESGetToken - - template