diff --git a/CUDADataFormats/Common/BuildFile.xml b/CUDADataFormats/Common/BuildFile.xml
deleted file mode 100644
index e4971bdf3ebbe..0000000000000
--- a/CUDADataFormats/Common/BuildFile.xml
+++ /dev/null
@@ -1,9 +0,0 @@
-<iftool name="cuda">
-  <use name="cuda"/>
-  <use name="rootcore"/>
-  <use name="DataFormats/Common"/>
-  <use name="HeterogeneousCore/CUDAUtilities"/>
-  <export>
-    <lib name="1"/>
-  </export>
-</iftool>
diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
deleted file mode 100644
index 8cfa5c9f5ffde..0000000000000
--- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h
+++ /dev/null
@@ -1,194 +0,0 @@
-#ifndef CUDADataFormatsCommonHeterogeneousSoA_H
-#define CUDADataFormatsCommonHeterogeneousSoA_H
-
-#include <cassert>
-
-#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-
-// a heterogeneous unique pointer...
-template <typename T>
-class HeterogeneousSoA {
-public:
-  using Product = T;
-
-  HeterogeneousSoA() = default;  // make root happy
-  ~HeterogeneousSoA() = default;
-  HeterogeneousSoA(HeterogeneousSoA &&) = default;
-  HeterogeneousSoA &operator=(HeterogeneousSoA &&) = default;
-
-  explicit HeterogeneousSoA(cms::cuda::device::unique_ptr<T> &&p) : dm_ptr(std::move(p)) {}
-  explicit HeterogeneousSoA(cms::cuda::host::unique_ptr<T> &&p) : hm_ptr(std::move(p)) {}
-  explicit HeterogeneousSoA(std::unique_ptr<T> &&p) : std_ptr(std::move(p)) {}
-
-  auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
-
-  auto const &operator*() const { return *get(); }
-
-  auto const *operator->() const { return get(); }
-
-  auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
-
-  auto &operator*() { return *get(); }
-
-  auto *operator->() { return get(); }
-
-  // in reality valid only for GPU version...
-  cms::cuda::host::unique_ptr<T> toHostAsync(cudaStream_t stream) const {
-    assert(dm_ptr);
-    auto ret = cms::cuda::make_host_unique<T>(stream);
-    cudaCheck(cudaMemcpyAsync(ret.get(), dm_ptr.get(), sizeof(T), cudaMemcpyDefault, stream));
-    return ret;
-  }
-
-private:
-  // a union won't do it, a variant will not be more efficient
-  cms::cuda::device::unique_ptr<T> dm_ptr;  //!
-  cms::cuda::host::unique_ptr<T> hm_ptr;    //!
-  std::unique_ptr<T> std_ptr;               //!
-};
-
-namespace cms {
-  namespace cudacompat {
-
-    struct GPUTraits {
-      template <typename T>
-      using unique_ptr = cms::cuda::device::unique_ptr<T>;
-
-      template <typename T>
-      static auto make_unique(cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(stream);
-      }
-
-      template <typename T>
-      static auto make_unique(size_t size, cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(size, stream);
-      }
-
-      template <typename T>
-      static auto make_host_unique(cudaStream_t stream) {
-        return cms::cuda::make_host_unique<T>(stream);
-      }
-
-      template <typename T>
-      static auto make_device_unique(cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(stream);
-      }
-
-      template <typename T>
-      static auto make_device_unique(size_t size, cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(size, stream);
-      }
-    };
-
-    struct HostTraits {
-      template <typename T>
-      using unique_ptr = cms::cuda::host::unique_ptr<T>;
-
-      template <typename T>
-      static auto make_unique(cudaStream_t stream) {
-        return cms::cuda::make_host_unique<T>(stream);
-      }
-
-      template <typename T>
-      static auto make_unique(size_t size, cudaStream_t stream) {
-        return cms::cuda::make_host_unique<T>(size, stream);
-      }
-
-      template <typename T>
-      static auto make_host_unique(cudaStream_t stream) {
-        return cms::cuda::make_host_unique<T>(stream);
-      }
-
-      template <typename T>
-      static auto make_device_unique(cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(stream);
-      }
-
-      template <typename T>
-      static auto make_device_unique(size_t size, cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(size, stream);
-      }
-    };
-
-    struct CPUTraits {
-      template <typename T>
-      using unique_ptr = std::unique_ptr<T>;
-
-      template <typename T>
-      static auto make_unique(cudaStream_t) {
-        return std::make_unique<T>();
-      }
-
-      template <typename T>
-      static auto make_unique(size_t size, cudaStream_t) {
-        return std::make_unique<T>(size);
-      }
-
-      template <typename T>
-      static auto make_host_unique(cudaStream_t) {
-        return std::make_unique<T>();
-      }
-
-      template <typename T>
-      static auto make_device_unique(cudaStream_t) {
-        return std::make_unique<T>();
-      }
-
-      template <typename T>
-      static auto make_device_unique(size_t size, cudaStream_t) {
-        return std::make_unique<T>(size);
-      }
-    };
-
-  }  // namespace cudacompat
-}  // namespace cms
-
-// a heterogeneous unique pointer (of a different sort) ...
-template <typename T, typename Traits>
-class HeterogeneousSoAImpl {
-public:
-  template <typename V>
-  using unique_ptr = typename Traits::template unique_ptr<V>;
-
-  HeterogeneousSoAImpl() = default;  // make root happy
-  ~HeterogeneousSoAImpl() = default;
-  HeterogeneousSoAImpl(HeterogeneousSoAImpl &&) = default;
-  HeterogeneousSoAImpl &operator=(HeterogeneousSoAImpl &&) = default;
-
-  explicit HeterogeneousSoAImpl(unique_ptr<T> &&p) : m_ptr(std::move(p)) {}
-  explicit HeterogeneousSoAImpl(cudaStream_t stream);
-
-  T const *get() const { return m_ptr.get(); }
-
-  T *get() { return m_ptr.get(); }
-
-  cms::cuda::host::unique_ptr<T> toHostAsync(cudaStream_t stream) const;
-
-private:
-  unique_ptr<T> m_ptr;  //!
-};
-
-template <typename T, typename Traits>
-HeterogeneousSoAImpl<T, Traits>::HeterogeneousSoAImpl(cudaStream_t stream) {
-  m_ptr = Traits::template make_unique<T>(stream);
-}
-
-// in reality valid only for GPU version...
-template <typename T, typename Traits>
-cms::cuda::host::unique_ptr<T> HeterogeneousSoAImpl<T, Traits>::toHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<T>(stream);
-  cudaCheck(cudaMemcpyAsync(ret.get(), get(), sizeof(T), cudaMemcpyDefault, stream));
-  return ret;
-}
-
-template <typename T>
-using HeterogeneousSoAGPU = HeterogeneousSoAImpl<T, cms::cudacompat::GPUTraits>;
-template <typename T>
-using HeterogeneousSoACPU = HeterogeneousSoAImpl<T, cms::cudacompat::CPUTraits>;
-template <typename T>
-using HeterogeneousSoAHost = HeterogeneousSoAImpl<T, cms::cudacompat::HostTraits>;
-
-#endif
diff --git a/CUDADataFormats/Common/interface/HostProduct.h b/CUDADataFormats/Common/interface/HostProduct.h
deleted file mode 100644
index 63a152298e42b..0000000000000
--- a/CUDADataFormats/Common/interface/HostProduct.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#ifndef CUDADataFormatsCommonHostProduct_H
-#define CUDADataFormatsCommonHostProduct_H
-
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-
-// a host-side unique pointer: wraps either a cms::cuda::host::unique_ptr or a std::unique_ptr
-template <typename T>
-class HostProduct {
-public:
-  HostProduct() = default;  // make root happy
-  ~HostProduct() = default;
-  HostProduct(HostProduct&&) = default;
-  HostProduct& operator=(HostProduct&&) = default;
-
-  explicit HostProduct(cms::cuda::host::unique_ptr<T>&& p) : hm_ptr(std::move(p)) {}
-  explicit HostProduct(std::unique_ptr<T>&& p) : std_ptr(std::move(p)) {}
-
-  auto const* get() const { return hm_ptr ? hm_ptr.get() : std_ptr.get(); }
-
-  auto const& operator*() const { return *get(); }
-
-  auto const* operator->() const { return get(); }
-
-private:
-  cms::cuda::host::unique_ptr<T> hm_ptr;  //!
-  std::unique_ptr<T> std_ptr;             //!
-};
-
-#endif
diff --git a/CUDADataFormats/Common/interface/PortableDeviceCollection.h b/CUDADataFormats/Common/interface/PortableDeviceCollection.h
deleted file mode 100644
index 78f72cb3d5437..0000000000000
--- a/CUDADataFormats/Common/interface/PortableDeviceCollection.h
+++ /dev/null
@@ -1,67 +0,0 @@
-#ifndef CUDADataFormats_Common_interface_PortableDeviceCollection_h
-#define CUDADataFormats_Common_interface_PortableDeviceCollection_h
-
-#include <cassert>
-#include <cstdlib>
-
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-
-namespace cms::cuda {
-
-  // generic SoA-based product in device memory
-  template <typename T>
-  class PortableDeviceCollection {
-  public:
-    using Layout = T;
-    using View = typename Layout::View;
-    using ConstView = typename Layout::ConstView;
-    using Buffer = cms::cuda::device::unique_ptr<std::byte[]>;
-
-    PortableDeviceCollection() = default;
-
-    PortableDeviceCollection(int32_t elements, cudaStream_t stream)
-        : buffer_{cms::cuda::make_device_unique<std::byte[]>(Layout::computeDataSize(elements), stream)},
-          layout_{buffer_.get(), elements},
-          view_{layout_} {
-      // CUDA device memory uses a default alignment of at least 128 bytes
-      assert(reinterpret_cast<uintptr_t>(buffer_.get()) % Layout::alignment == 0);
-    }
-
-    // non-copyable
-    PortableDeviceCollection(PortableDeviceCollection const&) = delete;
-    PortableDeviceCollection& operator=(PortableDeviceCollection const&) = delete;
-
-    // movable
-    PortableDeviceCollection(PortableDeviceCollection&&) = default;
-    PortableDeviceCollection& operator=(PortableDeviceCollection&&) = default;
-
-    // default destructor
-    ~PortableDeviceCollection() = default;
-
-    // access the View
-    View& view() { return view_; }
-    ConstView const& view() const { return view_; }
-    ConstView const& const_view() const { return view_; }
-
-    View& operator*() { return view_; }
-    ConstView const& operator*() const { return view_; }
-
-    View* operator->() { return &view_; }
-    ConstView const* operator->() const { return &view_; }
-
-    // access the Buffer
-    Buffer& buffer() { return buffer_; }
-    Buffer const& buffer() const { return buffer_; }
-    Buffer const& const_buffer() const { return buffer_; }
-
-    size_t bufferSize() const { return layout_.metadata().byteSize(); }
-
-  private:
-    Buffer buffer_;  //!
-    Layout layout_;  //
-    View view_;      //!
-  };
-
-}  // namespace cms::cuda
-
-#endif  // CUDADataFormats_Common_interface_PortableDeviceCollection_h
diff --git a/CUDADataFormats/Common/interface/PortableHostCollection.h b/CUDADataFormats/Common/interface/PortableHostCollection.h
deleted file mode 100644
index cfaf40c85b3bc..0000000000000
--- a/CUDADataFormats/Common/interface/PortableHostCollection.h
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef CUDADataFormats_Common_interface_PortableHostCollection_h
-#define CUDADataFormats_Common_interface_PortableHostCollection_h
-
-#include <cassert>
-#include <cstdlib>
-
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-
-namespace cms::cuda {
-
-  // generic SoA-based product in host memory
-  template <typename T>
-  class PortableHostCollection {
-  public:
-    using Layout = T;
-    using View = typename Layout::View;
-    using ConstView = typename Layout::ConstView;
-    using Buffer = cms::cuda::host::unique_ptr<std::byte[]>;
-
-    PortableHostCollection() = default;
-
-    PortableHostCollection(int32_t elements)
-        // allocate pageable host memory
-        : buffer_{cms::cuda::make_host_unique<std::byte[]>(Layout::computeDataSize(elements))},
-          layout_{buffer_.get(), elements},
-          view_{layout_} {
-      // make_host_unique for pageable host memory uses a default alignment of 128 bytes
-      assert(reinterpret_cast<uintptr_t>(buffer_.get()) % Layout::alignment == 0);
-    }
-
-    PortableHostCollection(int32_t elements, cudaStream_t stream)
-        // allocate pinned host memory, accessible by the current device
-        : buffer_{cms::cuda::make_host_unique<std::byte[]>(Layout::computeDataSize(elements), stream)},
-          layout_{buffer_.get(), elements},
-          view_{layout_} {
-      // CUDA pinned host memory uses a default alignment of at least 128 bytes
-      assert(reinterpret_cast<uintptr_t>(buffer_.get()) % Layout::alignment == 0);
-    }
-
-    // non-copyable
-    PortableHostCollection(PortableHostCollection const&) = delete;
-    PortableHostCollection& operator=(PortableHostCollection const&) = delete;
-
-    // movable
-    PortableHostCollection(PortableHostCollection&&) = default;
-    PortableHostCollection& operator=(PortableHostCollection&&) = default;
-
-    // default destructor
-    ~PortableHostCollection() = default;
-
-    // access the View
-    View& view() { return view_; }
-    ConstView const& view() const { return view_; }
-    ConstView const& const_view() const { return view_; }
-
-    View& operator*() { return view_; }
-    ConstView const& operator*() const { return view_; }
-
-    View* operator->() { return &view_; }
-    ConstView const* operator->() const { return &view_; }
-
-    // access the Buffer
-    Buffer& buffer() { return buffer_; }
-    Buffer const& buffer() const { return buffer_; }
-    Buffer const& const_buffer() const { return buffer_; }
-
-    size_t bufferSize() const { return layout_.metadata().byteSize(); }
-
-    // part of the ROOT read streamer
-    static void ROOTReadStreamer(PortableHostCollection* newObj, Layout const& layout) {
-      newObj->~PortableHostCollection();
-      // allocate pageable host memory: the single-argument constructor does not take a CUDA stream
-      new (newObj) PortableHostCollection(layout.metadata().size());
-      newObj->layout_.ROOTReadStreamer(layout);
-    }
-
-  private:
-    Buffer buffer_;  //!
-    Layout layout_;  //
-    View view_;      //!
-  };
-
-}  // namespace cms::cuda
-
-#endif  // CUDADataFormats_Common_interface_PortableHostCollection_h
diff --git a/CUDADataFormats/Common/interface/Product.h b/CUDADataFormats/Common/interface/Product.h
deleted file mode 100644
index 41bb8356e67cf..0000000000000
--- a/CUDADataFormats/Common/interface/Product.h
+++ /dev/null
@@ -1,60 +0,0 @@
-#ifndef CUDADataFormats_Common_Product_h
-#define CUDADataFormats_Common_Product_h
-
-#include <memory>
-
-#include "CUDADataFormats/Common/interface/ProductBase.h"
-
-namespace edm {
-  template <typename T>
-  class Wrapper;
-}
-
-namespace cms {
-  namespace cuda {
-    namespace impl {
-      class ScopedContextGetterBase;
-    }
-
-    /**
-     * The purpose of this class is to wrap CUDA data to edm::Event in a
-     * way which forces correct use of various utilities.
-     *
-     * The non-default construction has to be done with cms::cuda::ScopedContext
-     * (in order to properly register the CUDA event).
-     *
-     * The default constructor is needed only for the ROOT dictionary generation.
-     *
-     * The CUDA event is in practice needed only for stream-stream
-     * synchronization, but someone with long-enough lifetime has to own
-     * it. Here is a somewhat natural place. If overhead is too much, we
-     * can use them only where synchronization between streams is needed.
-     */
-    template <typename T>
-    class Product : public ProductBase {
-    public:
-      Product() = default;  // Needed only for ROOT dictionary generation
-
-      Product(const Product&) = delete;
-      Product& operator=(const Product&) = delete;
-      Product(Product&&) = default;
-      Product& operator=(Product&&) = default;
-
-    private:
-      friend class impl::ScopedContextGetterBase;
-      friend class ScopedContextProduce;
-      friend class edm::Wrapper<Product<T>>;
-
-      explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, T data)
-          : ProductBase(device, std::move(stream), std::move(event)), data_(std::move(data)) {}
-
-      template <typename... Args>
-      explicit Product(int device, SharedStreamPtr stream, SharedEventPtr event, Args&&... args)
-          : ProductBase(device, std::move(stream), std::move(event)), data_(std::forward<Args>(args)...) {}
-
-      T data_;  //!
-    };
-  }  // namespace cuda
-}  // namespace cms
-
-#endif
diff --git a/CUDADataFormats/Common/interface/ProductBase.h b/CUDADataFormats/Common/interface/ProductBase.h
deleted file mode 100644
index efe2242903bd0..0000000000000
--- a/CUDADataFormats/Common/interface/ProductBase.h
+++ /dev/null
@@ -1,93 +0,0 @@
-#ifndef CUDADataFormats_Common_ProductBase_h
-#define CUDADataFormats_Common_ProductBase_h
-
-#include <atomic>
-#include <memory>
-
-#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h"
-
-namespace cms {
-  namespace cuda {
-    namespace impl {
-      class ScopedContextBase;
-    }
-
-    /**
-     * Base class for all instantiations of cms::cuda::Product<T> to hold the
-     * non-T-dependent members.
-     */
-    class ProductBase {
-    public:
-      ProductBase() = default;  // Needed only for ROOT dictionary generation
-      ~ProductBase();
-
-      ProductBase(const ProductBase&) = delete;
-      ProductBase& operator=(const ProductBase&) = delete;
-      ProductBase(ProductBase&& other)
-          : stream_{std::move(other.stream_)},
-            event_{std::move(other.event_)},
-            mayReuseStream_{other.mayReuseStream_.load()},
-            device_{other.device_} {}
-      ProductBase& operator=(ProductBase&& other) {
-        stream_ = std::move(other.stream_);
-        event_ = std::move(other.event_);
-        mayReuseStream_ = other.mayReuseStream_.load();
-        device_ = other.device_;
-        return *this;
-      }
-
-      bool isValid() const { return stream_.get() != nullptr; }
-      bool isAvailable() const;
-
-      int device() const { return device_; }
-
-      // cudaStream_t is a pointer to a thread-safe object, for which a
-      // mutable access is needed even if the cms::cuda::ProductBase itself
-      // would be const. Therefore it is ok to return a non-const
-      // pointer from a const method here.
-      cudaStream_t stream() const { return stream_.get(); }
-
-      // cudaEvent_t is a pointer to a thread-safe object, for which a
-      // mutable access is needed even if the cms::cuda::ProductBase itself
-      // would be const. Therefore it is ok to return a non-const
-      // pointer from a const method here.
-      cudaEvent_t event() const { return event_.get(); }
-
-    protected:
-      explicit ProductBase(int device, SharedStreamPtr stream, SharedEventPtr event)
-          : stream_{std::move(stream)}, event_{std::move(event)}, device_{device} {}
-
-    private:
-      friend class impl::ScopedContextBase;
-      friend class ScopedContextProduce;
-
-      // The following function is intended to be used only from ScopedContext
-      const SharedStreamPtr& streamPtr() const { return stream_; }
-
-      bool mayReuseStream() const {
-        bool expected = true;
-        bool changed = mayReuseStream_.compare_exchange_strong(expected, false);
-        // If the current thread is the one flipping the flag, it may
-        // reuse the stream.
-        return changed;
-      }
-
-      // The cudaStream_t is really shared among edm::Event products, so
-      // using shared_ptr also here
-      SharedStreamPtr stream_;  //!
-      // shared_ptr because of caching in cms::cuda::EventCache
-      SharedEventPtr event_;  //!
-
-      // This flag tells whether the CUDA stream may be reused by a
-      // consumer or not. The goal is to have a "chain" of modules to
-      // queue their work to the same stream.
-      mutable std::atomic<bool> mayReuseStream_ = true;  //!
-
-      // The CUDA device associated with this product
-      int device_ = -1;  //!
-    };
-  }  // namespace cuda
-}  // namespace cms
-
-#endif
diff --git a/CUDADataFormats/Common/src/ProductBase.cc b/CUDADataFormats/Common/src/ProductBase.cc
deleted file mode 100644
index 8e1cf64b17122..0000000000000
--- a/CUDADataFormats/Common/src/ProductBase.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-#include "CUDADataFormats/Common/interface/ProductBase.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"
-
-namespace cms::cuda {
-  bool ProductBase::isAvailable() const {
-    // if default-constructed, the product is not available
-    if (not event_) {
-      return false;
-    }
-    return eventWorkHasCompleted(event_.get());
-  }
-
-  ProductBase::~ProductBase() {
-    // Make sure that the production of the product in the GPU is
-    // complete before destructing the product. This is to make sure
-    // that the EDM stream does not move to the next event before all
-    // asynchronous processing of the current is complete.
-
-    // TODO: a callback notifying a WaitingTaskHolder (or similar)
-    // would avoid blocking the CPU, but would also require more work.
-    //
-    // Intentionally not checking the return value to avoid throwing
-    // exceptions. If this call would fail, we should get failures
-    // elsewhere as well.
-    if (event_) {
-      cudaEventSynchronize(event_.get());
-    }
-  }
-}  // namespace cms::cuda
diff --git a/CUDADataFormats/Common/src/classes.h b/CUDADataFormats/Common/src/classes.h
deleted file mode 100644
index 239e071d513a2..0000000000000
--- a/CUDADataFormats/Common/src/classes.h
+++ /dev/null
@@ -1,7 +0,0 @@
-#ifndef CUDADataFormats_Common_src_classes_h
-#define CUDADataFormats_Common_src_classes_h
-
-#include "CUDADataFormats/Common/interface/HostProduct.h"
-#include "DataFormats/Common/interface/Wrapper.h"
-
-#endif  // CUDADataFormats_Common_src_classes_h
diff --git a/CUDADataFormats/Common/src/classes_def.xml b/CUDADataFormats/Common/src/classes_def.xml
deleted file mode 100644
index d8514251c807a..0000000000000
--- a/CUDADataFormats/Common/src/classes_def.xml
+++ /dev/null
@@ -1,4 +0,0 @@
-<lcgdict>
-  <class name="HostProduct<uint32_t[]>" persistent="false"/>
-  <class name="edm::Wrapper<HostProduct<uint32_t[]>>" persistent="false"/>
-</lcgdict>
diff --git a/CUDADataFormats/Common/test/BuildFile.xml b/CUDADataFormats/Common/test/BuildFile.xml
deleted file mode 100644
index a0cbbdd8a7858..0000000000000
--- a/CUDADataFormats/Common/test/BuildFile.xml
+++ /dev/null
@@ -1,8 +0,0 @@
-<iftool name="cuda-gcc-support">
-  <bin file="test*.cc" name="testCUDADataFormatsCommon">
-    <use name="HeterogeneousCore/CUDACore"/>
-    <use name="catch2"/>
-    <use name="cuda"/>
-  </bin>
-
-</iftool>
diff --git a/CUDADataFormats/Common/test/test_Product.cc b/CUDADataFormats/Common/test/test_Product.cc
deleted file mode 100644
index 5790d07bec56d..0000000000000
--- a/CUDADataFormats/Common/test/test_Product.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-#include "catch2/catch_all.hpp"
-
-#include "CUDADataFormats/Common/interface/Product.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
-
-#include <cuda_runtime_api.h>
-
-namespace cms::cudatest {
-  class TestScopedContext {
-  public:
-    static cuda::ScopedContextProduce make(int dev, bool createEvent) {
-      cms::cuda::SharedEventPtr event;
-      if (createEvent) {
-        event = cms::cuda::getEventCache().get();
-      }
-      return cuda::ScopedContextProduce(dev, cms::cuda::getStreamCache().get(), std::move(event));
-    }
-  };
-}  // namespace cms::cudatest
-
-TEST_CASE("Use of cms::cuda::Product template", "[CUDACore]") {
-  SECTION("Default constructed") {
-    auto foo = cms::cuda::Product<int>();
-    REQUIRE(!foo.isValid());
-
-    auto bar = std::move(foo);
-  }
-
-  if (not cms::cudatest::testDevices()) {
-    return;
-  }
-
-  constexpr int defaultDevice = 0;
-  cudaCheck(cudaSetDevice(defaultDevice));
-  {
-    auto ctx = cms::cudatest::TestScopedContext::make(defaultDevice, true);
-    std::unique_ptr<cms::cuda::Product<int>> dataPtr = ctx.wrap(10);
-    auto& data = *dataPtr;
-
-    SECTION("Construct from cms::cuda::ScopedContext") {
-      REQUIRE(data.isValid());
-      REQUIRE(data.device() == defaultDevice);
-      REQUIRE(data.stream() == ctx.stream());
-      REQUIRE(data.event() != nullptr);
-    }
-
-    SECTION("Move constructor") {
-      auto data2 = cms::cuda::Product<int>(std::move(data));
-      REQUIRE(data2.isValid());
-      REQUIRE(!data.isValid());
-    }
-
-    SECTION("Move assignment") {
-      cms::cuda::Product<int> data2;
-      data2 = std::move(data);
-      REQUIRE(data2.isValid());
-      REQUIRE(!data.isValid());
-    }
-  }
-
-  cudaCheck(cudaSetDevice(defaultDevice));
-  cudaCheck(cudaDeviceSynchronize());
-  // Note: CUDA resources are cleaned up by the destructors of the global cache objects
-}
diff --git a/CUDADataFormats/Common/test/test_main.cc b/CUDADataFormats/Common/test/test_main.cc
deleted file mode 100644
index b3ea47c29c7a7..0000000000000
--- a/CUDADataFormats/Common/test/test_main.cc
+++ /dev/null
@@ -1,2 +0,0 @@
-#define CATCH_CONFIG_MAIN
-#include "catch2/catch_all.hpp"
diff --git a/CUDADataFormats/PortableTestObjects/BuildFile.xml b/CUDADataFormats/PortableTestObjects/BuildFile.xml
deleted file mode 100644
index 595a743a6c4c5..0000000000000
--- a/CUDADataFormats/PortableTestObjects/BuildFile.xml
+++ /dev/null
@@ -1,7 +0,0 @@
-<use name="rootcore"/>
-<use name="CUDADataFormats/Common"/>
-<use name="DataFormats/Common"/>
-<use name="DataFormats/PortableTestObjects"/>
-<export>
-  <lib name="1"/>
-</export>
diff --git a/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h b/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h
deleted file mode 100644
index 621f0939116d7..0000000000000
--- a/CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h
-#define CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h
-
-#include "CUDADataFormats/Common/interface/PortableDeviceCollection.h"
-#include "DataFormats/PortableTestObjects/interface/TestSoA.h"
-
-namespace cudatest {
-
-  // Eigen matrix
-  using Matrix = portabletest::Matrix;
-  using Array = portabletest::Array;
-
-  // SoA with x, y, z, id fields, r scalar, m matrix, in device global memory
-  using TestDeviceCollection = cms::cuda::PortableDeviceCollection<portabletest::TestSoA>;
-
-}  // namespace cudatest
-
-#endif  // CUDADataFormats_PortableTestObjects_interface_TestDeviceCollection_h
diff --git a/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h b/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h
deleted file mode 100644
index 9426b6c6a8275..0000000000000
--- a/CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h
+++ /dev/null
@@ -1,18 +0,0 @@
-#ifndef CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h
-#define CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h
-
-#include "CUDADataFormats/Common/interface/PortableHostCollection.h"
-#include "DataFormats/PortableTestObjects/interface/TestSoA.h"
-
-namespace cudatest {
-
-  // Eigen matrix
-  using Matrix = portabletest::Matrix;
-  using Array = portabletest::Array;
-
-  // SoA with x, y, z, id fields, r scalar, m matrix, in host memory
-  using TestHostCollection = cms::cuda::PortableHostCollection<portabletest::TestSoA>;
-
-}  // namespace cudatest
-
-#endif  // CUDADataFormats_PortableTestObjects_interface_TestHostCollection_h
diff --git a/HeterogeneousCore/CUDACore/BuildFile.xml b/HeterogeneousCore/CUDACore/BuildFile.xml
deleted file mode 100644
index 42f7db8fc72d6..0000000000000
--- a/HeterogeneousCore/CUDACore/BuildFile.xml
+++ /dev/null
@@ -1,15 +0,0 @@
-<iftool name="cuda">
-  <use name="FWCore/Concurrency"/>
-  <use name="FWCore/Framework"/>
-  <use name="FWCore/MessageLogger"/>
-  <use name="FWCore/ServiceRegistry"/>
-  <use name="FWCore/ParameterSet"/>
-  <use name="FWCore/Utilities"/>
-  <use name="CUDADataFormats/Common"/>
-  <use name="HeterogeneousCore/CUDAServices"/>
-  <use name="HeterogeneousCore/CUDAUtilities"/>
-  <use name="cuda"/>
-  <export>
-    <lib name="1"/>
-  </export>
-</iftool>
diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md
deleted file mode 100644
index 92a29e1460f19..0000000000000
--- a/HeterogeneousCore/CUDACore/README.md
+++ /dev/null
@@ -1,812 +0,0 @@
-# CUDA algorithms in CMSSW
-
-## Outline
-
-* [Introduction](#introduction)
-  * [Design goals](#design-goals)
-  * [Overall guidelines](#overall-guidelines)
-* [Sub-packages](#sub-packages)
-* [Examples](#examples)
-  * [Isolated producer (no CUDA input nor output)](#isolated-producer-no-cuda-input-nor-output)
-  * [Producer with CUDA output](#producer-with-cuda-output)
-  * [Producer with CUDA input](#producer-with-cuda-input)
-  * [Producer with CUDA input and output (with ExternalWork)](#producer-with-cuda-input-and-output-with-externalwork)
-  * [Producer with CUDA input and output, and internal chain of CPU and GPU tasks (with ExternalWork)](#producer-with-cuda-input-and-output-and-internal-chain-of-cpu-and-gpu-tasks-with-externalwork)
-  * [Producer with CUDA input and output (without ExternalWork)](#producer-with-cuda-input-and-output-without-externalwork)
-  * [Analyzer with CUDA input](#analyzer-with-cuda-input)
-  * [Configuration](#configuration)
-    * [GPU-only configuration](#gpu-only-configuration)
-* [More details](#more-details)
-  * [Device choice](#device-choice)
-  * [Data model](#data-model)
-  * [CUDA EDProducer](#cuda-edproducer)
-    * [Class declaration](#class-declaration)
-    * [Memory allocation](#memory-allocation)
-      * [Caching allocator](#caching-allocator)
-      * [Non-cached pinned host `unique_ptr`](#non-cached-pinned-host-unique_ptr)
-      * [CUDA API](#cuda-api)
-    * [Setting the current device](#setting-the-current-device)
-    * [Getting input](#getting-input)
-    * [Calling the CUDA kernels](#calling-the-cuda-kernels)
-    * [Putting output](#putting-output)
-    * [`ExternalWork` extension](#externalwork-extension)
-    * [Module-internal chain of CPU and GPU tasks](#module-internal-chain-of-cpu-and-gpu-tasks)
-    * [Transferring GPU data to CPU](#transferring-gpu-data-to-cpu)
-    * [Synchronizing between CUDA streams](#synchronizing-between-cuda-streams)
-  * [CUDA ESProduct](#cuda-esproduct)
-
-## Introduction
-
-This page documents the CUDA integration within CMSSW.
-
-### Design goals
-
-1. Provide a mechanism for a chain of modules to share a resource
-   * Resource can be e.g. CUDA device memory or a CUDA stream
-2. Minimize data movements between the CPU and the device
-3. Support multiple devices
-4. Allow the same job configuration to be used on all hardware combinations
-
-### Overall guidelines
-
-1. Within the `acquire()`/`produce()` functions all CUDA operations should be asynchronous, i.e.
-   * Use `cudaMemcpyAsync()`, `cudaMemsetAsync()`, `cudaMemPrefetchAsync()` etc.
-   * Avoid `cudaMalloc*()`, `cudaHostAlloc()`, `cudaFree*()`, `cudaHostRegister()`, `cudaHostUnregister()` on every event
-     * Occasional calls are permitted through a caching mechanism that amortizes the cost (see also [Caching allocator](#caching-allocator))
-   * Avoid `assert()` in device functions, or use `#include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"`
-     * With the latter the `assert()` calls in CUDA code are disabled by
-       default, but can be enabled by defining a `GPU_DEBUG` macro
-       (before the aforementioned include)
-2. Synchronization needs should be fulfilled with
-   [`ExternalWork`](https://twiki.cern.ch/twiki/bin/view/CMSPublic/FWMultithreadedFrameworkStreamModuleInterface#edm_ExternalWork)
-   extension to EDProducers
-   * `ExternalWork` can be used to replace one synchronization point
-     (e.g. between device kernels and copying a known amount of data
-     back to CPU).
-   * For further synchronization points (e.g. copying data whose
-     amount is known only at the device side), split the work to
-     multiple `ExternalWork` producers. This approach has the added
-     benefit that e.g. data transfers to CPU become on-demand automatically
-   * A general breakdown of the possible steps:
-     * Convert input legacy CPU data format to CPU SoA
-     * Transfer input CPU SoA to GPU
-     * Launch kernels
-     * Transfer the number of output elements to CPU
-     * Transfer the output data from GPU to CPU SoA
-     * Convert the output SoA to legacy CPU data formats
-3. Within `acquire()`/`produce()`, the current CUDA device is set
-   implicitly and the CUDA stream is provided by the system (with
-   `cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`)
-   * It is strongly recommended to use the provided CUDA stream for all operations
-     * If that is not feasible for some reason, the provided CUDA
-       stream must synchronize with the work queued on other CUDA
-       streams (with CUDA events and `cudaStreamWaitEvent()`)
-4. Outside of `acquire()`/`produce()`, CUDA API functions may be
-   called only if the `CUDAService` implementation of the `CUDAInterface`
-   is available and `CUDAService::enabled()` returns `true`:
-     ```c++
-     edm::Service<CUDAInterface> cuda;
-     if (cuda and cuda->enabled()) {
-       // CUDA calls can be made here
-     }
-     ```
-   * With point 3 it follows that in these cases multiple devices have
-     to be dealt with explicitly, as well as CUDA streams
-
-## Sub-packages
-* [`HeterogeneousCore/CUDACore`](#cuda-integration) CUDA-specific core components
-* [`HeterogeneousCore/CUDAServices`](../CUDAServices) Various edm::Services related to CUDA
-* [`HeterogeneousCore/CUDAUtilities`](../CUDAUtilities) Various utilities for CUDA kernel code
-* [`HeterogeneousCore/CUDATest`](../CUDATest) Test modules and configurations
-* [`CUDADataFormats/Common`](../../CUDADataFormats/Common) Utilities for event products with CUDA data
-
-## Examples
-
-### Isolated producer (no CUDA input nor output)
-
-```cpp
-class IsolatedProducerCUDA: public edm::stream::EDProducer<ExternalWork> {
-public:
-  ...
-  void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
-  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
-  ...
-private:
-  ...
-  IsolatedProducerGPUAlgo gpuAlgo_;
-  edm::EDGetTokenT<InputData> inputToken_;
-  edm::EDPutTokenT<OutputData> outputToken_;
-};
-...
-void IsolatedProducerCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  // Sets the current device and creates a CUDA stream
-  cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder)};
-
-  auto const& inputData = iEvent.get(inputToken_);
-
-  // Queues asynchronous data transfers and kernels to the CUDA stream
-  // returned by cms::cuda::ScopedContextAcquire::stream()
-  gpuAlgo_.makeAsync(inputData, ctx.stream());
-
-  // Destructor of ctx queues a callback to the CUDA stream notifying
-  // waitingTaskHolder when the queued asynchronous work has finished
-}
-
-// Called after the asynchronous work has finished
-void IsolatedProducerCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
-  // Real life is likely more complex than this simple example. Here
-  // getResult() returns some data in CPU memory that is passed
-  // directly to the OutputData constructor.
-  iEvent.emplace(outputToken_, gpuAlgo_.getResult());
-}
-```
-
-### Producer with CUDA output
-
-```cpp
-class ProducerOutputCUDA: public edm::stream::EDProducer<ExternalWork> {
-public:
-  ...
-  void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
-  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
-  ...
-private:
-  ...
-  ProducerOutputGPUAlgo gpuAlgo_;
-  edm::EDGetTokenT<InputData> inputToken_;
-  edm::EDPutTokenT<cms::cuda::Product<OutputData>> outputToken_;
-  cms::cuda::ContextState ctxState_;
-};
-...
-void ProducerOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  // Sets the current device and creates a CUDA stream
-  cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), std::move(waitingTaskHolder), ctxState_};
-
-  auto const& inputData = iEvent.get(inputToken_);
-
-  // Queues asynchronous data transfers and kernels to the CUDA stream
-  // returned by cms::cuda::ScopedContextAcquire::stream()
-  gpuAlgo.makeAsync(inputData, ctx.stream());
-
-  // Destructor of ctx queues a callback to the CUDA stream notifying
-  // waitingTaskHolder when the queued asynchronous work has finished,
-  // and saves the device and CUDA stream to ctxState_
-}
-
-// Called after the asynchronous work has finished
-void ProducerOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
-  // Sets again the current device, uses the CUDA stream created in the acquire()
-  cms::cuda::ScopedContextProduce ctx{ctxState_};
-
-  // Now getResult() returns data in GPU memory that is passed to the
-  // constructor of OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the
-  // OutputData to cms::cuda::Product<OutputData>. cms::cuda::Product<T> stores also
-  // the current device and the CUDA stream since those will be needed
-  // in the consumer side.
-  ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult());
-}
-```
-
-### Producer with CUDA input
-
-```cpp
-class ProducerInputCUDA: public edm::stream::EDProducer<ExternalWork> {
-public:
-  ...
-  void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
-  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
-  ...
-private:
-  ...
-  ProducerInputGPUAlgo gpuAlgo_;
-  edm::EDGetTokenT<cms::cuda::Product<InputData>> inputToken_;
-  edm::EDGetTokenT<cms::cuda::Product<OtherInputData>> otherInputToken_;
-  edm::EDPutTokenT<OutputData> outputToken_;
-};
-...
-void ProducerInputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  cms::cuda::Product<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
-
-  // Set the current device to the same that was used to produce
-  // InputData, and possibly use the same CUDA stream
-  cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
-
-  // Grab the real input data. Checks that the input data is on the
-  // current device. If the input data was produced in a different CUDA
-  // stream than the cms::cuda::ScopedContextAcquire holds, create an inter-stream
-  // synchronization point with CUDA event and cudaStreamWaitEvent()
-  auto const& inputData = ctx.get(inputDataWrapped);
-
-  // Input data from another producer
-  auto const& otherInputData = ctx.get(iEvent.get(otherInputToken_));
-  // or
-  auto const& otherInputData = ctx.get(iEvent, otherInputToken_);
-
-
-  // Queues asynchronous data transfers and kernels to the CUDA stream
-  // returned by cms::cuda::ScopedContextAcquire::stream()
-  gpuAlgo.makeAsync(inputData, otherInputData, ctx.stream());
-
-  // Destructor of ctx queues a callback to the CUDA stream notifying
-  // waitingTaskHolder when the queued asynchronous work has finished
-}
-
-// Called after the asynchronous work has finished
-void ProducerInputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) {
-  // Real life is likely more complex than this simple example. Here
-  // getResult() returns some data in CPU memory that is passed
-  // directly to the OutputData constructor.
-  iEvent.emplace(outputToken_, gpuAlgo_.getResult());
-}
-```
-
-See [further below](#setting-the-current-device) for the conditions
-when the `cms::cuda::ScopedContextAcquire` constructor reuses the CUDA stream. Note
-that the `cms::cuda::ScopedContextAcquire` constructor taking `edm::StreamID` is
-allowed, it will just always create a new CUDA stream.
-
-
-### Producer with CUDA input and output (with ExternalWork)
-
-```cpp
-class ProducerInputOutputCUDA: public edm::stream::EDProducer<ExternalWork> {
-public:
-  ...
-  void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
-  void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override;
-  ...
-private:
-  ...
-  ProducerInputGPUAlgo gpuAlgo_;
-  edm::EDGetTokenT<cms::cuda::Product<InputData>> inputToken_;
-  edm::EDPutTokenT<cms::cuda::Product<OutputData>> outputToken_;
-};
-...
-void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  cms::cuda::Product<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
-
-  // Set the current device to the same that was used to produce
-  // InputData, and also use the same CUDA stream
-  cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder), ctxState_};
-
-  // Grab the real input data. Checks that the input data is on the
-  // current device. If the input data was produced in a different CUDA
-  // stream than the cms::cuda::ScopedContextAcquire holds, create an inter-stream
-  // synchronization point with CUDA event and cudaStreamWaitEvent()
-  auto const& inputData = ctx.get(inputDataWrapped);
-
-  // Queues asynchronous data transfers and kernels to the CUDA stream
-  // returned by cms::cuda::ScopedContextAcquire::stream()
-  gpuAlgo.makeAsync(inputData, ctx.stream());
-
-  // Destructor of ctx queues a callback to the CUDA stream notifying
-  // waitingTaskHolder when the queued asynchronous work has finished,
-  // and saves the device and CUDA stream to ctxState_
-}
-
-// Called after the asynchronous work has finished
-void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) {
-  // Sets again the current device, uses the CUDA stream created in the acquire()
-  cms::cuda::ScopedContextProduce ctx{ctxState_};
-
-  // Now getResult() returns data in GPU memory that is passed to the
-  // constructor of OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the
-  // OutputData to cms::cuda::Product<OutputData>. cms::cuda::Product<T> stores also
-  // the current device and the CUDA stream since those will be needed
-  // in the consumer side.
-  ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult());
-}
-```
-
-[Complete example](../CUDATest/plugins/TestCUDAProducerGPUEW.cc)
-
-
-### Producer with CUDA input and output (without ExternalWork)
-
-If the producer does not need to transfer anything back to CPU (like
-the number of output elements), the `ExternalWork` extension is not
-needed as there is no need to synchronize.
-
-```cpp
-class ProducerInputOutputCUDA: public edm::global::EDProducer<> {
-public:
-  ...
-  void produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup& iSetup) const override;
-  ...
-private:
-  ...
-  ProducerInputGPUAlgo gpuAlgo_;
-  edm::EDGetTokenT<cms::cuda::Product<InputData>> inputToken_;
-  edm::EDPutTokenT<cms::cuda::Product<OutputData>> outputToken_;
-};
-...
-void ProducerInputOutputCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup& iSetup) const {
-  cms::cuda::Product<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
-
-  // Set the current device to the same that was used to produce
-  // InputData, and possibly use the same CUDA stream
-  cms::cuda::ScopedContextProduce ctx{inputDataWrapped};
-
-  // Grab the real input data. Checks that the input data is on the
-  // current device. If the input data was produced in a different CUDA
-  // stream than the cms::cuda::ScopedContextProduce holds, create an inter-stream
-  // synchronization point with CUDA event and cudaStreamWaitEvent()
-  auto const& inputData = ctx.get(inputDataWrapped);
-
-  // Queues asynchronous data transfers and kernels to the CUDA stream
-  // returned by cms::cuda::ScopedContextProduce::stream(). Here makeAsync() also
-  // returns data in GPU memory that is passed to the constructor of
-  // OutputData. cms::cuda::ScopedContextProduce::emplace() wraps the OutputData to
-  // cms::cuda::Product<OutputData>. cms::cuda::Product<T> stores also the current
-  // device and the CUDA stream since those will be needed in the
-  // consumer side.
-  ctx.emplace(iEvent, outputToken_, gpuAlgo_.makeAsync(inputData, ctx.stream()));
-
-  // Destructor of ctx queues a callback to the CUDA stream notifying
-  // waitingTaskHolder when the queued asynchronous work has finished
-}
-```
-
-[Complete example](../CUDATest/plugins/TestCUDAProducerGPU.cc)
-
-
-### Analyzer with CUDA input
-
-Analyzer with CUDA input is similar to [producer with CUDA
-input](#producer-with-cuda-input). Note that currently we do not have
-a mechanism for portable configurations with analyzers. This means
-that a configuration with a CUDA analyzer can only run on a machine
-with CUDA device(s).
-
-```cpp
-class AnalyzerInputCUDA: public edm::global::EDAnalyzer<> {
-public:
-  ...
-  void analyze(edm::Event const& iEvent, edm::EventSetup const& iSetup) override;
-  ...
-private:
-  ...
-  AnalyzerInputGPUAlgo gpuAlgo_;
-  edm::EDGetTokenT<cms::cuda::Product<InputData>> inputToken_;
-  edm::EDGetTokenT<cms::cuda::Product<OtherInputData>> otherInputToken_;
-};
-...
-void AnalyzerInputCUDA::analyze(edm::Event const& iEvent, edm::EventSetup& iSetup) {
-  cms::cuda::Product<InputData> const& inputDataWrapped = iEvent.get(inputToken_);
-
-  // Set the current device to the same that was used to produce
-  // InputData, and possibly use the same CUDA stream
-  cms::cuda::ScopedContextAnalyze ctx{inputDataWrapped};
-
-  // Grab the real input data. Checks that the input data is on the
-  // current device. If the input data was produced in a different CUDA
-  // stream than the cms::cuda::ScopedContextAnalyze holds, create an inter-stream
-  // synchronization point with CUDA event and cudaStreamWaitEvent()
-  auto const& inputData = ctx.get(inputDataWrapped);
-
-  // Input data from another producer
-  auto const& otherInputData = ctx.get(iEvent.get(otherInputToken_));
-  // or
-  auto const& otherInputData = ctx.get(iEvent, otherInputToken_);
-
-
-  // Queues asynchronous data transfers and kernels to the CUDA stream
-  // returned by cms::cuda::ScopedContextAnalyze::stream()
-  gpuAlgo.analyzeAsync(inputData, otherInputData, ctx.stream());
-}
-```
-
-[Complete example](../CUDATest/plugins/TestCUDAAnalyzerGPU.cc)
-
-
-### Configuration
-
-#### GPU-only configuration
-
-For a GPU-only configuration there is nothing special to be done, just
-construct the Paths/Sequences/Tasks from the GPU modules.
-
-## More details
-
-### Device choice
-
-For multi-GPU setup the device is chosen in the first CUDA module in a
-chain of modules by one of the constructors of
-`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`
-```cpp
-// In ExternalWork acquire()
-cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), ...};
-
-// In normal produce() (or filter())
-cms::cuda::ScopedContextProduce ctx{iEvent.streamID()};
-```
-As the choice is still the static EDM stream to device assignment, the
-EDM stream ID is needed. The logic will likely evolve in the future to
-be more dynamic, and likely the device choice has to be made for the
-full event.
-
-### Data model
-
-The "GPU data product" should be a class/struct containing smart
-pointer(s) to device data (see [Memory allocation](#memory-allocation)).
-When putting the data to event, the data is wrapped to
-`cms::cuda::Product<T>` template, which holds
-* the GPU data product
-  * must be moveable, but no other restrictions
-* the current device where the data was produced, and the CUDA stream the data was produced with
-* [CUDA event for synchronization between multiple CUDA streams](#synchronizing-between-cuda-streams)
-
-Note that the `cms::cuda::Product<T>` wrapper can be constructed only with
-`cms::cuda::ScopedContextProduce::wrap()`, and the data `T` can be obtained
-from it only with
-`cms::cuda::ScopedContextAcquire::get()`/`cms::cuda::ScopedContextProduce::get()`/`cms::cuda::ScopedContextAnalyze::get()`,
-as described further below. When putting the data product directly to
-`edm::Event`, also `cms::cuda::ScopedContextProduce::emplace()` can be used.
-
-The GPU data products that depend on the CUDA runtime should be placed
-under `CUDADataFormats` package, using the same name for sub-package
-that would be used in `DataFormats`. Everything else, e.g. SoA for
-CPU, should go under `DataFormats` as usual.
-
-
-### CUDA EDProducer
-
-#### Class declaration
-
-The CUDA producers are normal EDProducers. The `ExternalWork`
-extension should be used if a synchronization between the GPU and CPU
-is needed, e.g. when transferring data from GPU to CPU.
-
-#### Memory allocation
-
-##### Caching allocator
-
-The memory allocations should be done dynamically with the following functions
-```cpp
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-
-cms::cuda::device::unique_ptr<float[]> device_buffer = cms::cuda::make_device_unique<float[]>(50, cudaStream);
-cms::cuda::host::unique_ptr<float[]>   host_buffer   = cms::cuda::make_host_unique<float[]>(50, cudaStream);
-```
-
-in the `acquire()` and `produce()` functions. The same
-`cudaStream_t` object that is used for transfers and kernels
-should be passed to the allocator.
-
-The allocator is based on [`cub::CachingDeviceAllocator`](https://nvlabs.github.io/cub/structcub_1_1_caching_device_allocator.html).
-The memory is guaranteed to be reserved
-* for the host: up to the destructor of the `unique_ptr`
-* for the device: until all work queued in the `cudaStream` up to the point when the `unique_ptr` destructor is called has finished
-
-##### Non-cached pinned host `unique_ptr`
-
-In producers transferring data to GPU one may want to use pinned host
-memory allocated with `cudaHostAllocWriteCombined`. As of now we don't
-want to include the flag dimension to the caching allocator. The CUDA
-API wrapper library does not support allocation flags, so we add our
-own `unique_ptr` for that.
-
-```cpp
-#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"
-
-cms::cuda::host::noncached_unique_ptr<float[]> host_buffer = cms::cuda::make_host_noncached_unique<float[]>(50, flags);
-```
-The `flags` is passed directly to `cudaHostAlloc()`.
-
-##### CUDA API
-
-The `cudaMalloc()` etc may be used outside of the event loop, but that
-should be limited to only relatively small allocations in order to
-allow as much re-use of device memory as possible.
-
-If really needed, the `cudaMalloc()` etc may be used also within the
-event loop, but then the cost of allocation and implicit
-synchronization should be explicitly amortized e.g. by caching.
-
-#### Setting the current device
-
-A CUDA producer should construct `cms::cuda::ScopedContextAcquire` in
-`acquire()` (`cms::cuda::ScopedContextProduce` in `produce()` if not using
-`ExternalWork`) either with `edm::StreamID`, or with a
-`cms::cuda::Product<T>` read as an input.
-
-```cpp
-// From edm::StreamID
-cms::cuda::ScopedContextAcquire ctx{iEvent.streamID(), ...};
-// or
-cms::cuda::ScopedContextProduce ctx{iEvent.streamID()};
-
-
-// From cms::cuda::Product<T>
-cms::cuda::Product<GPUClusters> const& cclus = iEvent.get(srcToken_);
-cms::cuda::ScopedContextAcquire ctx{cclus, ...};
-// or
-cms::cuda::ScopedContextProduce ctx{cclus};
-```
-
-A CUDA analyzer should construct `cms::cuda::ScopedContextAnalyze` with a
-`cms::cuda::Product<T>` read as an input.
-
-```cpp
-cms::cuda::Product<GPUClusters> const& cclus = iEvent.get(srcToken_);
-cms::cuda::ScopedContextAnalyze ctx{cclus};
-```
-
-`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze` work in the RAII way and do the following
-* Sets the current device for the current scope
-  - If constructed from the `edm::StreamID`, chooses the device and creates a new CUDA stream
-  - If constructed from the `cms::cuda::Product<T>`, uses the same device and possibly the same CUDA stream as was used to produce the `cms::cuda::Product<T>`
-    * The CUDA stream is reused if this producer is the first consumer
-      of the `cms::cuda::Product<T>`, otherwise a new CUDA stream is created.
-      This approach is a simple compromise to automatically express the work of
-      parallel producers in different CUDA streams, and at the same
-      time allow a chain of producers to queue their work to the same
-      CUDA stream.
-* Gives access to the CUDA stream the algorithm should use to queue asynchronous work
-* `cms::cuda::ScopedContextAcquire` calls `edm::WaitingTaskWithArenaHolder::doneWaiting()` when necessary (in its destructor)
-* [Synchronizes between CUDA streams if necessary](#synchronizing-between-cuda-streams)
-* Needed to get `cms::cuda::Product<T>` from the event
-  * `cms::cuda::ScopedContextProduce` is needed to put `cms::cuda::Product<T>` to the event
-
-In case of multiple input products, from possibly different CUDA
-streams and/or CUDA devices, this approach gives the developer full
-control in which of them the kernels of the algorithm should be run.
-
-#### Getting input
-
-The real product (`T`) can be obtained from `cms::cuda::Product<T>` only with
-the help of
-`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze`.
-
-```cpp
-// From cms::cuda::Product<T>
-cms::cuda::Product<GPUClusters> cclus = iEvent.get(srcToken_);
-GPUClusters const& clus = ctx.get(cclus);
-
-// Directly from Event
-GPUClusters const& clus = ctx.get(iEvent, srcToken_);
-```
-
-This step is needed to
-* check that the data are on the same CUDA device
-  * if not, throw an exception (with unified memory could prefetch instead)
-* if the CUDA streams are different, synchronize between them
-
-#### Calling the CUDA kernels
-
-It is usually best to wrap the CUDA kernel calls to a separate class,
-and then call methods of that class from the EDProducer. The only
-requirement is that the CUDA stream where to queue the operations
-should be the one from the
-`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze`.
-
-```cpp
-gpuAlgo.makeClustersAsync(..., ctx.stream());
-```
-
-If necessary, different CUDA streams may be used internally, but they
-should be made to synchronize with the provided CUDA stream with
-CUDA events and `cudaStreamWaitEvent()`.
-
-
-#### Putting output
-
-The GPU data needs to be wrapped to `cms::cuda::Product<T>` template with
-`cms::cuda::ScopedContextProduce::wrap()` or `cms::cuda::ScopedContextProduce::emplace()`
-
-```cpp
-GPUClusters clusters = gpuAlgo.makeClustersAsync(..., ctx.stream());
-std::unique_ptr<cms::cuda::Product<GPUClusters>> ret = ctx.wrap(clusters);
-iEvent.put(std::move(ret));
-
-// or with one line
-iEvent.put(ctx.wrap(gpuAlgo.makeClustersAsync(ctx.stream())));
-
-// or avoid one unique_ptr with emplace
-edm::EDPutTokenT<cms::cuda::Product<GPUClusters>> putToken_ = produces<cms::cuda::Product<GPUClusters>>(); // in constructor
-...
-ctx.emplace(iEvent, putToken_, gpuAlgo.makeClustersAsync(ctx.stream()));
-```
-
-This step is needed to
-* store the current device and CUDA stream into `cms::cuda::Product<T>`
-* record the CUDA event needed for CUDA stream synchronization
-
-#### `ExternalWork` extension
-
-Everything above works both with and without `ExternalWork`.
-
-Without `ExternalWork` the `EDProducer`s act similar to TBB
-flowgraph's "streaming node". In other words, they just queue more
-asynchronous work to the CUDA stream in their `produce()`.
-
-The `ExternalWork` is needed when one would otherwise call
-`cudaStreamSynchronize()`. For example transferring something to CPU
-needed for downstream DQM, or queueing more asynchronous work. With
-`ExternalWork` an `acquire()` method needs to be implemented that gets
-an `edm::WaitingTaskWithArenaHolder` parameter. The
-`edm::WaitingTaskWithArenaHolder` should then be passed to the
-constructor of `cms::cuda::ScopedContextAcquire`, along these lines:
-
-```cpp
-void acquire(..., edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  cms::cuda::Product<GPUClusters> const& cclus = iEvent.get(token_);
-  cms::cuda::ScopedContextAcquire ctx{cclus, std::move(waitingTaskHolder)}; // can also copy instead of move if waitingTaskHolder is needed for something else as well
-  ...
-```
-
-When constructed this way, `cms::cuda::ScopedContextAcquire` registers a
-callback function to the CUDA stream in its destructor to call
-`waitingTaskHolder.doneWaiting()`.
-
-A GPU->GPU producer needs a `cms::cuda::ScopedContext` also in its
-`produce()`. The device and CUDA stream are transferred via
-`cms::cuda::ContextState` member variable:
-
-```cpp
-class FooProducerCUDA ... {
-  ...
-  cms::cuda::ContextState ctxState_;
-};
-
-void FooProducerCUDA::acquire(...) {
-  ...
-  cms::cuda::ScopedContextAcquire ctx{..., std::move(waitingTaskHolder), ctxState_};
-  ...
-}
-
-void FooProducerCUDA::produce(...) {
-  ...
-  cms::cuda::ScopedContextProduce ctx{ctxState_};
-}
-```
-
-The `cms::cuda::ScopedContextAcquire` saves its state to the `ctxState_` in
-the destructor, and `cms::cuda::ScopedContextProduce` then restores the
-context.
-
-
-#### Transferring GPU data to CPU
-
-The GPU->CPU data transfer needs synchronization to ensure the CPU
-memory to have all data before putting that to the event. This means
-the `ExternalWork` needs to be used along these lines:
-* In `acquire()`
-  * (allocate CPU memory buffers)
-  * Queue all GPU->CPU transfers asynchronously
-* In `produce()`
-  * If needed, read additional CPU products (e.g. from `edm::Ref`s)
-  * Reformat data back to legacy data formats
-  * Note: `cms::cuda::ScopedContextProduce` is **not** needed in `produce()`
-
-#### Synchronizing between CUDA streams
-
-In case the producer needs input data that were produced in two (or
-more) CUDA streams, these streams have to be synchronized. Here this
-synchronization is achieved with CUDA events.
-
-Each `cms::cuda::Product<T>` contains also a CUDA event object. The call to
-`cms::cuda::ScopedContextProduce::wrap()` will *record* the event in the CUDA
-stream. This means that when all work queued to the CUDA stream up to
-that point has been finished, the CUDA event becomes *occurred*. Then,
-in
-`cms::cuda::ScopedContextAcquire::get()`/`cms::cuda::ScopedContextProduce::get()`/`cms::cuda::ScopedContextAnalyze::get()`,
-if the `cms::cuda::Product<T>` to get from has a different CUDA stream than
-the
-`cms::cuda::ScopedContextAcquire`/`cms::cuda::ScopedContextProduce`/`cms::cuda::ScopedContextAnalyze`,
-`cudaStreamWaitEvent(stream, event)` is called. This means that all
-subsequent work queued to the CUDA stream will wait for the CUDA event
-to occur. Therefore this subsequent work can assume that the CUDA
-product being read exists.
-
-
-### CUDA ESProduct
-
-Conditions data can be transferred to the device with the following
-pattern.
-
-1. Define a `class`/`struct` for the data to be transferred in the format accessed in the device (hereafter referred to as "payload")
-2. Define a wrapper ESProduct that holds the aforementioned data in the pinned host memory
-3. The wrapper should have a function returning the payload on the
-   device memory. The function should transfer the data to the device
-   asynchronously with the help of `cms::cuda::ESProduct<T>`.
-
-#### Example
-
-```cpp
-#include "HeterogeneousCore/CUDACore/interface/ESProduct.h"
-
-// Declare the struct for the payload to be transferred. Here the
-// example is an array with (potentially) dynamic size. Note that all of
-// below becomes simpler if the array has compile-time size.
-struct ESProductExampleCUDA {
-  float *someData;
-  unsigned int size;
-};
-
-// Declare the wrapper ESProduct. The corresponding ESProducer should
-// produce objects of this type.
-class ESProductExampleCUDAWrapper {
-public:
-  // Constructor takes the standard CPU ESProduct, and transforms the
-  // necessary data to array(s) in pinned host memory
-  ESProductExampleCUDAWrapper(ESProductExample const&);
-
-  // Deallocates all pinned host memory
-  ~ESProductExampleCUDAWrapper();
-
-  // Function to return the actual payload on the memory of the current device
-  ESProductExampleCUDA const *getGPUProductAsync(cudaStream_t stream) const;
-
-private:
-  // Holds the data in pinned CPU memory
-  float *someData_;
-  unsigned int size_;
-
-  // Helper struct to hold all information that has to be allocated and
-  // deallocated per device
-  struct GPUData {
-    // Destructor should free all member pointers
-    ~GPUData();
-    // internal pointers are on device, struct itself is on CPU
-    ESProductExampleCUDA *esproductHost = nullptr;
-    // internal pointers and struct are on device
-    ESProductExampleCUDA *esproductDevice = nullptr;
-  };
-
-  // Helper that takes care of complexity of transferring the data to
-  // multiple devices
-  cms::cuda::ESProduct<GPUData> gpuData_;
-};
-
-ESProductExampleCUDAWrapper::ESProductExampleCUDAWrapper(ESProductExample const& cpuProduct) {
-  cudaCheck(cudaMallocHost(&someData_, sizeof(float)*NUM_ELEMENTS));
-  // fill someData_ and size_ from cpuProduct
-}
-
-ESProductExampleCUDA const *ESProductExampleCUDAWrapper::getGPUProductAsync(cudaStream_t stream) const {
-  // cms::cuda::ESProduct<T> essentially holds an array of GPUData objects,
-  // one per device. If the data have already been transferred to the
-  // current device (or the transfer has been queued), the helper just
-  // returns a reference to that GPUData object. Otherwise, i.e. data are
-  // not yet on the current device, the helper calls the lambda to do the
-  // necessary memory allocations and to queue the transfers.
-  auto const& data = gpuData_.dataForCurrentDeviceAsync(stream, [this](GPUData& data, cudaStream_t stream) {
-    // Allocate memory. Currently this can be with the CUDA API,
-    // sometime we'll migrate to the caching allocator. Assumption is
-    // that IOV changes are rare enough that adding global synchronization
-    // points is not that bad (for now).
-
-    // Allocate the payload object on pinned host memory.
-    cudaCheck(cudaMallocHost(&data.esproductHost, sizeof(ESProductExampleCUDA)));
-    // Allocate the payload array(s) on device memory.
-    cudaCheck(cudaMalloc(&data.esproductHost->someData, sizeof(float)*NUM_ELEMENTS));
-
-    // Allocate the payload object on the device memory.
-    cudaCheck(cudaMalloc(&data.esproductDevice, sizeof(ESProductExampleCUDA)));
-
-    // Complete the host-side information on the payload
-    data.esproductHost->size = this->size_;
-
-
-    // Transfer the payload, first the array(s) ...
-    cudaCheck(cudaMemcpyAsync(data.esproductHost->someData, this->someData_, sizeof(float)*NUM_ELEMENTS, cudaMemcpyDefault, stream));
-    // ... and then the payload object
-    cudaCheck(cudaMemcpyAsync(data.esproductDevice, data.esproductHost, sizeof(ESProductExampleCUDA), cudaMemcpyDefault, stream));
-});
-
-  // Returns the payload object on the memory of the current device
-  return data.esproductDevice;
-}
-
-// Destructor frees all member pointers
-ESProductExampleCUDAWrapper::GPUData::~GPUData() {
-  if(esproductHost != nullptr) {
-    cudaCheck(cudaFree(esproductHost->someData));
-    cudaCheck(cudaFreeHost(esproductHost));
-  }
-  cudaCheck(cudaFree(esproductDevice));
-}
-
-```
diff --git a/HeterogeneousCore/CUDACore/interface/ContextState.h b/HeterogeneousCore/CUDACore/interface/ContextState.h
deleted file mode 100644
index 9c52113cc1e8d..0000000000000
--- a/HeterogeneousCore/CUDACore/interface/ContextState.h
+++ /dev/null
@@ -1,60 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_ContextState_h
-#define HeterogeneousCore_CUDACore_ContextState_h
-
-#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
-
-#include <memory>
-
-namespace cms {
-  namespace cuda {
-    /**
-     * The purpose of this class is to deliver the device and CUDA stream
-     * information from ExternalWork's acquire() to producer() via a
-     * member/StreamCache variable.
-     */
-    class ContextState {
-    public:
-      ContextState() = default;
-      ~ContextState() = default;
-
-      ContextState(const ContextState&) = delete;
-      ContextState& operator=(const ContextState&) = delete;
-      ContextState(ContextState&&) = delete;
-      ContextState& operator=(ContextState&& other) = delete;
-
-    private:
-      friend class ScopedContextAcquire;
-      friend class ScopedContextProduce;
-
-      void set(int device, SharedStreamPtr stream) {
-        throwIfStream();
-        device_ = device;
-        stream_ = std::move(stream);
-      }
-
-      int device() const { return device_; }
-
-      const SharedStreamPtr& streamPtr() const {
-        throwIfNoStream();
-        return stream_;
-      }
-
-      SharedStreamPtr releaseStreamPtr() {
-        throwIfNoStream();
-        // This function needs to effectively reset stream_ (i.e. stream_
-        // must be empty after this function). This behavior ensures that
-        // the SharedStreamPtr is not hold for inadvertedly long (i.e. to
-        // the next event), and is checked at run time.
-        return std::move(stream_);
-      }
-
-      void throwIfStream() const;
-      void throwIfNoStream() const;
-
-      SharedStreamPtr stream_;
-      int device_;
-    };
-  }  // namespace cuda
-}  // namespace cms
-
-#endif
diff --git a/HeterogeneousCore/CUDACore/interface/ConvertingESProducerT.h b/HeterogeneousCore/CUDACore/interface/ConvertingESProducerT.h
deleted file mode 100644
index 8018fcede7809..0000000000000
--- a/HeterogeneousCore/CUDACore/interface/ConvertingESProducerT.h
+++ /dev/null
@@ -1,46 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_interface_ConvertingESProducerT_h
-#define HeterogeneousCore_CUDACore_interface_ConvertingESProducerT_h
-
-#include "FWCore/Framework/interface/ESProducer.h"
-#include "FWCore/Framework/interface/ESTransientHandle.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-#include "FWCore/Framework/interface/ModuleFactory.h"
-#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/Utilities/interface/typelookup.h"
-
-/* class template: ConvertingESProducerT
- * 
- * This class template can be used to simplify the implementation of any ESProducer that reads
- * conditions data from a record and pushes derived conditions data to the same record.
- * The current use case is to convert and copy the calibrations from the CPU to the GPUs.
- */
-
-template <typename Record, typename Target, typename Source>
-class ConvertingESProducerT : public edm::ESProducer {
-public:
-  explicit ConvertingESProducerT(edm::ParameterSet const& ps) {
-    auto const& label = ps.getParameter<std::string>("label");
-    auto const& name = ps.getParameter<std::string>("ComponentName");
-    auto cc = setWhatProduced(this, name);
-    token_ = cc.consumes(edm::ESInputTag{"", label});
-  }
-
-  std::unique_ptr<Target> produce(Record const& record) {
-    // retrieve conditions in the old format and build a product in the new format
-    return std::make_unique<Target>(record.get(token_));
-  }
-
-  static void fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
-    edm::ParameterSetDescription desc;
-
-    desc.add<std::string>("ComponentName", "");
-    desc.add<std::string>("label", "")->setComment("ESProduct label");
-    confDesc.addWithDefaultLabel(desc);
-  }
-
-private:
-  edm::ESGetToken<Source, Record> token_;
-};
-
-#endif  // HeterogeneousCore_CUDACore_interface_ConvertingESProducerT_h
diff --git a/HeterogeneousCore/CUDACore/interface/ConvertingESProducerWithDependenciesT.h b/HeterogeneousCore/CUDACore/interface/ConvertingESProducerWithDependenciesT.h
deleted file mode 100644
index 9a57e405ceb5c..0000000000000
--- a/HeterogeneousCore/CUDACore/interface/ConvertingESProducerWithDependenciesT.h
+++ /dev/null
@@ -1,118 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_interface_ConvertingESProducerWithDependenciesT_h
-#define HeterogeneousCore_CUDACore_interface_ConvertingESProducerWithDependenciesT_h
-
-#include <tuple>
-#include <utility>
-
-#include "FWCore/Framework/interface/ESProducer.h"
-#include "FWCore/Framework/interface/ESHandle.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-#include "FWCore/Framework/interface/ModuleFactory.h"
-#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/Utilities/interface/typelookup.h"
-
-/* class template: ConvertingESProducerWithDependenciesT
- * 
- * This class template can be used to simplify the implementation of any ESProducer that reads
- * multiple conditions data from one or more records record and pushes derived conditions data
- * to a combined dependent record.
- * The current use case is to convert and copy the calibrations from the CPU to the GPUs.
- */
-
-namespace detail {
-  // simple implementation of a type zipper over 2 tuples
-  // here, the main requirement is the default constructor for Gen template
-  // which __does__ exist for ESGetToken
-
-  template <template <typename, typename> class Gen, typename Tuple1, typename Tuple2>
-  struct TypeZipper;
-
-  template <template <typename, typename> class Gen, typename Tuple1, typename Tuple2, std::size_t... Is>
-  auto TypeZipperImpl(Tuple1 const& t1, Tuple2 const& t2, std::index_sequence<Is...>) {
-    return std::make_tuple(
-        Gen<typename std::tuple_element<Is, Tuple1>::type, typename std::tuple_element<Is, Tuple2>::type>{}...);
-  }
-
-  template <template <typename, typename> class Gen, typename... Ts1, typename... Ts2>
-  struct TypeZipper<Gen, std::tuple<Ts1...>, std::tuple<Ts2...>> {
-    static_assert(sizeof...(Ts1) == sizeof...(Ts2));
-    using type = typename std::decay<decltype(TypeZipperImpl<Gen>(
-        std::tuple<Ts1...>{}, std::tuple<Ts2...>{}, std::index_sequence_for<Ts1...>{}))>::type;
-  };
-
-}  // namespace detail
-
-template <typename CombinedRecord, typename Target, typename... Dependencies>
-class ConvertingESProducerWithDependenciesT;
-
-template <template <typename...> typename CombinedRecord,
-          typename... DepsRecords,
-          typename Target,
-          typename... Dependencies>
-class ConvertingESProducerWithDependenciesT<CombinedRecord<DepsRecords...>, Target, Dependencies...>
-    : public edm::ESProducer {
-public:
-  static constexpr std::size_t nsources = sizeof...(Dependencies);
-  static_assert(sizeof...(Dependencies) == sizeof...(DepsRecords));
-
-  explicit ConvertingESProducerWithDependenciesT(edm::ParameterSet const& ps) {
-    std::vector<edm::ESInputTag> tags(nsources);
-    for (std::size_t i = 0; i < nsources; i++)
-      tags[i] = edm::ESInputTag{"", ps.getParameter<std::string>("label" + std::to_string(i))};
-
-    std::string const& name = ps.getParameter<std::string>("ComponentName");
-    edm::ESConsumesCollectorT<CombinedRecord<DepsRecords...>> cc = setWhatProduced(this, name);
-    WalkConsumes<nsources - 1>::iterate(cc, tokens_, tags);
-  }
-
-  std::unique_ptr<Target> produce(CombinedRecord<DepsRecords...> const& record) {
-    auto handles = std::tuple<edm::ESHandle<Dependencies>...>{};
-    WalkAndCall<nsources - 1, edm::ESHandle<Dependencies>...>::iterate(record, handles, tokens_);
-
-    return std::apply([](auto const&... handles) { return std::make_unique<Target>((*handles)...); }, handles);
-  }
-
-  static void fillDescriptions(edm::ConfigurationDescriptions& confDesc) {
-    edm::ParameterSetDescription desc;
-
-    desc.add<std::string>("ComponentName", "");
-    for (std::size_t i = 0; i < nsources; i++)
-      desc.add<std::string>("label" + std::to_string(i), "")->setComment("Product Label");
-    confDesc.addWithDefaultLabel(desc);
-  }
-
-private:
-  using TokenType =
-      typename detail::TypeZipper<edm::ESGetToken, std::tuple<Dependencies...>, std::tuple<DepsRecords...>>::type;
-  TokenType tokens_;
-
-private:
-  template <std::size_t N>
-  struct WalkConsumes {
-    static void iterate(edm::ESConsumesCollectorT<CombinedRecord<DepsRecords...>>& cc,
-                        TokenType& tokens,
-                        std::vector<edm::ESInputTag> const& tags) {
-      if constexpr (N > 0)
-        WalkConsumes<N - 1>::iterate(cc, tokens, tags);
-      std::get<N>(tokens) = cc.consumes(tags[N]);
-    }
-  };
-
-  template <std::size_t N, typename... Types>
-  struct WalkAndCall {
-    static void iterate(CombinedRecord<DepsRecords...> const& containingRecord,
-                        std::tuple<Types...>& ts,
-                        TokenType const& tokens) {
-      using Record = typename std::tuple_element<N, std::tuple<DepsRecords...>>::type;
-      if constexpr (N > 0)
-        WalkAndCall<N - 1, Types...>::iterate(containingRecord, ts, tokens);
-      // get the right dependent record
-      auto const& record = containingRecord.template getRecord<Record>();
-      // assign the right element of the tuple
-      std::get<N>(ts) = record.getHandle(std::get<N>(tokens));
-    }
-  };
-};
-
-#endif  // HeterogeneousCore_CUDACore_interface_ConvertingESProducerWithDependenciesT_h
diff --git a/HeterogeneousCore/CUDACore/interface/ESProduct.h b/HeterogeneousCore/CUDACore/interface/ESProduct.h
deleted file mode 100644
index fbe1825b0fa4d..0000000000000
--- a/HeterogeneousCore/CUDACore/interface/ESProduct.h
+++ /dev/null
@@ -1,110 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_ESProduct_h
-#define HeterogeneousCore_CUDACore_ESProduct_h
-
-#include <atomic>
-#include <cassert>
-#include <mutex>
-#include <vector>
-
-#include "FWCore/Utilities/interface/thread_safety_macros.h"
-#include "HeterogeneousCore/CUDAServices/interface/numberOfDevices.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"
-
-namespace cms {
-  namespace cuda {
-    template <typename T>
-    class ESProduct {
-    public:
-      ESProduct() : gpuDataPerDevice_(numberOfDevices()) {
-        if (not gpuDataPerDevice_.empty()) {
-          cms::cuda::ScopedSetDevice scopedDevice;
-          for (size_t i = 0; i < gpuDataPerDevice_.size(); ++i) {
-            scopedDevice.set(i);
-            gpuDataPerDevice_[i].m_event = getEventCache().get();
-          }
-        }
-      }
-
-      ~ESProduct() = default;
-
-      // transferAsync should be a function of (T&, cudaStream_t)
-      // which enqueues asynchronous transfers (possibly kernels as well)
-      // to the CUDA stream
-      template <typename F>
-      const T& dataForCurrentDeviceAsync(cudaStream_t cudaStream, F transferAsync) const {
-        int device = currentDevice();
-        auto& data = gpuDataPerDevice_[device];
-
-        // If the GPU data has already been filled, we can return it immediately
-        if (not data.m_filled.load()) {
-          // It wasn't, so need to fill it
-          std::scoped_lock<std::mutex> lk{data.m_mutex};
-
-          if (data.m_filled.load()) {
-            // Other thread marked it filled while we were locking the mutex, so we're free to return it
-            return data.m_data;
-          }
-
-          if (data.m_fillingStream != nullptr) {
-            // Someone else is filling
-
-            // Check first if the recorded event has occurred
-            if (eventWorkHasCompleted(data.m_event.get())) {
-              // It was, so data is accessible from all CUDA streams on
-              // the device. Set the 'filled' for all subsequent calls and
-              // return the value
-              auto should_be_false = data.m_filled.exchange(true);
-              assert(not should_be_false);
-              data.m_fillingStream = nullptr;
-            } else if (data.m_fillingStream != cudaStream) {
-              // Filling is still going on. For other CUDA stream, add
-              // wait on the CUDA stream and return the value. Subsequent
-              // work queued on the stream will wait for the event to
-              // occur (i.e. transfer to finish).
-              cudaCheck(cudaStreamWaitEvent(cudaStream, data.m_event.get(), 0),
-                        "Failed to make a stream to wait for an event");
-            }
-            // else: filling is still going on. But for the same CUDA
-            // stream (which would be a bit strange but fine), we can just
-            // return as all subsequent work should be enqueued to the
-            // same CUDA stream (or stream to be explicitly synchronized
-            // by the caller)
-          } else {
-            // Now we can be sure that the data is not yet on the GPU, and
-            // this thread is the first to try that.
-            transferAsync(data.m_data, cudaStream);
-            assert(data.m_fillingStream == nullptr);
-            data.m_fillingStream = cudaStream;
-            // Record in the cudaStream an event to mark the readiness of the
-            // EventSetup data on the GPU, so other streams can check for it
-            cudaCheck(cudaEventRecord(data.m_event.get(), cudaStream));
-            // Now the filling has been enqueued to the cudaStream, so we
-            // can return the GPU data immediately, since all subsequent
-            // work must be either enqueued to the cudaStream, or the cudaStream
-            // must be synchronized by the caller
-          }
-        }
-
-        return data.m_data;
-      }
-
-    private:
-      struct Item {
-        mutable std::mutex m_mutex;
-        CMS_THREAD_GUARD(m_mutex) mutable SharedEventPtr m_event;
-        // non-null if some thread is already filling (cudaStream_t is just a pointer)
-        CMS_THREAD_GUARD(m_mutex) mutable cudaStream_t m_fillingStream = nullptr;
-        mutable std::atomic<bool> m_filled = false;  // easy check if data has been filled already or not
-        CMS_THREAD_GUARD(m_mutex) mutable T m_data;
-      };
-
-      std::vector<Item> gpuDataPerDevice_;
-    };
-  }  // namespace cuda
-}  // namespace cms
-
-#endif  // HeterogeneousCore_CUDACore_ESProduct_h
diff --git a/HeterogeneousCore/CUDACore/interface/JobConfigurationGPURecord.h b/HeterogeneousCore/CUDACore/interface/JobConfigurationGPURecord.h
deleted file mode 100644
index fd0b3864e909c..0000000000000
--- a/HeterogeneousCore/CUDACore/interface/JobConfigurationGPURecord.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_interface_JobConfigurationGPURecord_h
-#define HeterogeneousCore_CUDACore_interface_JobConfigurationGPURecord_h
-
-#include "FWCore/Framework/interface/EventSetupRecordImplementation.h"
-
-class JobConfigurationGPURecord : public edm::eventsetup::EventSetupRecordImplementation<JobConfigurationGPURecord> {};
-
-#endif  // HeterogeneousCore_CUDACore_interface_JobConfigurationGPURecord_h
diff --git a/HeterogeneousCore/CUDACore/interface/ScopedContext.h b/HeterogeneousCore/CUDACore/interface/ScopedContext.h
deleted file mode 100644
index 68dfa57e7719d..0000000000000
--- a/HeterogeneousCore/CUDACore/interface/ScopedContext.h
+++ /dev/null
@@ -1,200 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_ScopedContext_h
-#define HeterogeneousCore_CUDACore_ScopedContext_h
-
-#include <optional>
-
-#include "CUDADataFormats/Common/interface/Product.h"
-#include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Utilities/interface/EDGetToken.h"
-#include "FWCore/Utilities/interface/EDPutToken.h"
-#include "FWCore/Utilities/interface/StreamID.h"
-#include "HeterogeneousCore/CUDACore/interface/ContextState.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/SharedEventPtr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/SharedStreamPtr.h"
-
-namespace cms {
-  namespace cudatest {
-    class TestScopedContext;
-  }
-
-  namespace cuda {
-
-    namespace impl {
-      // This class is intended to be derived by other ScopedContext*, not for general use
-      class ScopedContextBase {
-      public:
-        int device() const { return currentDevice_; }
-
-        // cudaStream_t is a pointer to a thread-safe object, for which a
-        // mutable access is needed even if the ScopedContext itself
-        // would be const. Therefore it is ok to return a non-const
-        // pointer from a const method here.
-        cudaStream_t stream() const { return stream_.get(); }
-        const SharedStreamPtr& streamPtr() const { return stream_; }
-
-      protected:
-        // The constructors set the current device, but the device
-        // is not set back to the previous value at the destructor. This
-        // should be sufficient (and tiny bit faster) as all CUDA API
-        // functions relying on the current device should be called from
-        // the scope where this context is. The current device doesn't
-        // really matter between modules (or across TBB tasks).
-        explicit ScopedContextBase(edm::StreamID streamID);
-
-        explicit ScopedContextBase(const ProductBase& data);
-
-        explicit ScopedContextBase(int device, SharedStreamPtr stream);
-
-      private:
-        int currentDevice_;
-        SharedStreamPtr stream_;
-      };
-
-      class ScopedContextGetterBase : public ScopedContextBase {
-      public:
-        template <typename T>
-        const T& get(const Product<T>& data) {
-          synchronizeStreams(data.device(), data.stream(), data.isAvailable(), data.event());
-          return data.data_;
-        }
-
-        template <typename T>
-        const T& get(const edm::Event& iEvent, edm::EDGetTokenT<Product<T>> token) {
-          return get(iEvent.get(token));
-        }
-
-      protected:
-        template <typename... Args>
-        ScopedContextGetterBase(Args&&... args) : ScopedContextBase(std::forward<Args>(args)...) {}
-
-        void synchronizeStreams(int dataDevice, cudaStream_t dataStream, bool available, cudaEvent_t dataEvent);
-      };
-
-      class ScopedContextHolderHelper {
-      public:
-        ScopedContextHolderHelper(edm::WaitingTaskWithArenaHolder waitingTaskHolder)
-            : waitingTaskHolder_{std::move(waitingTaskHolder)} {}
-
-        template <typename F>
-        void pushNextTask(F&& f, ContextState const* state);
-
-        void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-          waitingTaskHolder_ = std::move(waitingTaskHolder);
-        }
-
-        void enqueueCallback(int device, cudaStream_t stream);
-
-      private:
-        edm::WaitingTaskWithArenaHolder waitingTaskHolder_;
-      };
-    }  // namespace impl
-
-    /**
-     * The aim of this class is to do necessary per-event "initialization" in ExternalWork acquire():
-     * - setting the current device
-     * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary
-     * - synchronizing between CUDA streams if necessary
-     * and enforce that those get done in a proper way in RAII fashion.
-     */
-    class ScopedContextAcquire : public impl::ScopedContextGetterBase {
-    public:
-      /// Constructor to create a new CUDA stream (no need for context beyond acquire())
-      explicit ScopedContextAcquire(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
-          : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)} {}
-
-      /// Constructor to create a new CUDA stream, and the context is needed after acquire()
-      explicit ScopedContextAcquire(edm::StreamID streamID,
-                                    edm::WaitingTaskWithArenaHolder waitingTaskHolder,
-                                    ContextState& state)
-          : ScopedContextGetterBase(streamID), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {}
-
-      /// Constructor to (possibly) re-use a CUDA stream (no need for context beyond acquire())
-      explicit ScopedContextAcquire(const ProductBase& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder)
-          : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)} {}
-
-      /// Constructor to (possibly) re-use a CUDA stream, and the context is needed after acquire()
-      explicit ScopedContextAcquire(const ProductBase& data,
-                                    edm::WaitingTaskWithArenaHolder waitingTaskHolder,
-                                    ContextState& state)
-          : ScopedContextGetterBase(data), holderHelper_{std::move(waitingTaskHolder)}, contextState_{&state} {}
-
-      ~ScopedContextAcquire() noexcept(false);
-
-      template <typename F>
-      void pushNextTask(F&& f) {
-        if (contextState_ == nullptr)
-          throwNoState();
-        holderHelper_.pushNextTask(std::forward<F>(f), contextState_);
-      }
-
-      void replaceWaitingTaskHolder(edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-        holderHelper_.replaceWaitingTaskHolder(std::move(waitingTaskHolder));
-      }
-
-    private:
-      void throwNoState();
-
-      impl::ScopedContextHolderHelper holderHelper_;
-      ContextState* contextState_ = nullptr;
-    };
-
-    /**
-     * The aim of this class is to do necessary per-event "initialization" in ExternalWork produce() or normal produce():
-     * - setting the current device
-     * - synchronizing between CUDA streams if necessary
-     * and enforce that those get done in a proper way in RAII fashion.
-     */
-    class ScopedContextProduce : public impl::ScopedContextGetterBase {
-    public:
-      /// Constructor to create a new CUDA stream (non-ExternalWork module)
-      explicit ScopedContextProduce(edm::StreamID streamID) : ScopedContextGetterBase(streamID) {}
-
-      /// Constructor to (possibly) re-use a CUDA stream (non-ExternalWork module)
-      explicit ScopedContextProduce(const ProductBase& data) : ScopedContextGetterBase(data) {}
-
-      /// Constructor to re-use the CUDA stream of acquire() (ExternalWork module)
-      explicit ScopedContextProduce(ContextState& state)
-          : ScopedContextGetterBase(state.device(), state.releaseStreamPtr()) {}
-
-      /// Record the CUDA event, all asynchronous work must have been queued before the destructor
-      ~ScopedContextProduce();
-
-      template <typename T>
-      std::unique_ptr<Product<T>> wrap(T data) {
-        // make_unique doesn't work because of private constructor
-        return std::unique_ptr<Product<T>>(new Product<T>(device(), streamPtr(), event_, std::move(data)));
-      }
-
-      template <typename T, typename... Args>
-      auto emplace(edm::Event& iEvent, edm::EDPutTokenT<T> token, Args&&... args) {
-        return iEvent.emplace(token, device(), streamPtr(), event_, std::forward<Args>(args)...);
-      }
-
-    private:
-      friend class cudatest::TestScopedContext;
-
-      // This construcor is only meant for testing
-      explicit ScopedContextProduce(int device, SharedStreamPtr stream, SharedEventPtr event)
-          : ScopedContextGetterBase(device, std::move(stream)), event_{std::move(event)} {}
-
-      // create the CUDA Event upfront to catch possible errors from its creation
-      SharedEventPtr event_ = getEventCache().get();
-    };
-
-    /**
-     * The aim of this class is to do necessary per-event "initialization" in analyze()
-     * - setting the current device
-     * - synchronizing between CUDA streams if necessary
-     * and enforce that those get done in a proper way in RAII fashion.
-     */
-    class ScopedContextAnalyze : public impl::ScopedContextGetterBase {
-    public:
-      /// Constructor to (possibly) re-use a CUDA stream
-      explicit ScopedContextAnalyze(const ProductBase& data) : ScopedContextGetterBase(data) {}
-    };
-  }  // namespace cuda
-}  // namespace cms
-
-#endif
diff --git a/HeterogeneousCore/CUDACore/src/ContextState.cc b/HeterogeneousCore/CUDACore/src/ContextState.cc
deleted file mode 100644
index 0670f01d472f3..0000000000000
--- a/HeterogeneousCore/CUDACore/src/ContextState.cc
+++ /dev/null
@@ -1,16 +0,0 @@
-#include "HeterogeneousCore/CUDACore/interface/ContextState.h"
-#include "FWCore/Utilities/interface/Exception.h"
-
-namespace cms::cuda {
-  void ContextState::throwIfStream() const {
-    if (stream_) {
-      throw cms::Exception("LogicError") << "Trying to set ContextState, but it already had a valid state";
-    }
-  }
-
-  void ContextState::throwIfNoStream() const {
-    if (not stream_) {
-      throw cms::Exception("LogicError") << "Trying to get ContextState, but it did not have a valid state";
-    }
-  }
-}  // namespace cms::cuda
diff --git a/HeterogeneousCore/CUDACore/src/JobConfigurationGPURecord.cc b/HeterogeneousCore/CUDACore/src/JobConfigurationGPURecord.cc
deleted file mode 100644
index b4bff09f07090..0000000000000
--- a/HeterogeneousCore/CUDACore/src/JobConfigurationGPURecord.cc
+++ /dev/null
@@ -1,4 +0,0 @@
-#include "FWCore/Framework/interface/eventsetuprecord_registration_macro.h"
-#include "HeterogeneousCore/CUDACore/interface/JobConfigurationGPURecord.h"
-
-EVENTSETUP_RECORD_REG(JobConfigurationGPURecord);
diff --git a/HeterogeneousCore/CUDACore/src/ScopedContext.cc b/HeterogeneousCore/CUDACore/src/ScopedContext.cc
deleted file mode 100644
index 367e0fef6c8ac..0000000000000
--- a/HeterogeneousCore/CUDACore/src/ScopedContext.cc
+++ /dev/null
@@ -1,92 +0,0 @@
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-
-#include "FWCore/Concurrency/interface/Async.h"
-#include "FWCore/MessageLogger/interface/MessageLogger.h"
-#include "FWCore/ServiceRegistry/interface/Service.h"
-#include "FWCore/Utilities/interface/Exception.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-
-#include "chooseDevice.h"
-
-namespace cms::cuda {
-  namespace impl {
-    ScopedContextBase::ScopedContextBase(edm::StreamID streamID) : currentDevice_(chooseDevice(streamID)) {
-      cudaCheck(cudaSetDevice(currentDevice_));
-      stream_ = getStreamCache().get();
-    }
-
-    ScopedContextBase::ScopedContextBase(const ProductBase& data) : currentDevice_(data.device()) {
-      cudaCheck(cudaSetDevice(currentDevice_));
-      if (data.mayReuseStream()) {
-        stream_ = data.streamPtr();
-      } else {
-        stream_ = getStreamCache().get();
-      }
-    }
-
-    ScopedContextBase::ScopedContextBase(int device, SharedStreamPtr stream)
-        : currentDevice_(device), stream_(std::move(stream)) {
-      cudaCheck(cudaSetDevice(currentDevice_));
-    }
-
-    ////////////////////
-
-    void ScopedContextGetterBase::synchronizeStreams(int dataDevice,
-                                                     cudaStream_t dataStream,
-                                                     bool available,
-                                                     cudaEvent_t dataEvent) {
-      if (dataDevice != device()) {
-        // Eventually replace with prefetch to current device (assuming unified memory works)
-        // If we won't go to unified memory, need to figure out something else...
-        throw cms::Exception("LogicError") << "Handling data from multiple devices is not yet supported";
-      }
-
-      if (dataStream != stream()) {
-        // Different streams, need to synchronize
-        if (not available) {
-          // Event not yet occurred, so need to add synchronization
-          // here. Sychronization is done by making the CUDA stream to
-          // wait for an event, so all subsequent work in the stream
-          // will run only after the event has "occurred" (i.e. data
-          // product became available).
-          cudaCheck(cudaStreamWaitEvent(stream(), dataEvent, 0), "Failed to make a stream to wait for an event");
-        }
-      }
-    }
-
-    void ScopedContextHolderHelper::enqueueCallback(int device, cudaStream_t stream) {
-      edm::Service<edm::Async> async;
-      SharedEventPtr event = getEventCache().get();
-      cudaCheck(cudaEventRecord(event.get(), stream));
-      async->runAsync(
-          std::move(waitingTaskHolder_),
-          [event = std::move(event)]() mutable { cudaCheck(cudaEventSynchronize(event.get())); },
-          []() { return "Enqueued by cms::cuda::ScopedContextHolderHelper::enqueueCallback()"; });
-    }
-  }  // namespace impl
-
-  ////////////////////
-
-  ScopedContextAcquire::~ScopedContextAcquire() noexcept(false) {
-    holderHelper_.enqueueCallback(device(), stream());
-    if (contextState_) {
-      contextState_->set(device(), streamPtr());
-    }
-  }
-
-  void ScopedContextAcquire::throwNoState() {
-    throw cms::Exception("LogicError")
-        << "Calling ScopedContextAcquire::insertNextTask() requires ScopedContextAcquire to be constructed with "
-           "ContextState, but that was not the case";
-  }
-
-  ////////////////////
-
-  ScopedContextProduce::~ScopedContextProduce() {
-    // Intentionally not checking the return value to avoid throwing
-    // exceptions. If this call would fail, we should get failures
-    // elsewhere as well.
-    cudaEventRecord(event_.get(), stream());
-  }
-}  // namespace cms::cuda
diff --git a/HeterogeneousCore/CUDACore/src/chooseDevice.cc b/HeterogeneousCore/CUDACore/src/chooseDevice.cc
deleted file mode 100644
index a768cea02c5ca..0000000000000
--- a/HeterogeneousCore/CUDACore/src/chooseDevice.cc
+++ /dev/null
@@ -1,28 +0,0 @@
-#include "FWCore/ServiceRegistry/interface/Service.h"
-#include "FWCore/Utilities/interface/Exception.h"
-#include "HeterogeneousCore/CUDAServices/interface/CUDAInterface.h"
-
-#include "chooseDevice.h"
-
-namespace cms::cuda {
-  int chooseDevice(edm::StreamID id) {
-    edm::Service<CUDAInterface> cuda;
-    if (not cuda or not cuda->enabled()) {
-      cms::Exception ex("CUDAError");
-      ex << "Unable to choose current device because CUDAService is not preset or disabled.\n"
-         << "If CUDAService was not explicitly disabled in the configuration, the probable\n"
-         << "cause is that there is no GPU or there is some problem in the CUDA runtime or\n"
-         << "drivers.";
-      ex.addContext("Calling cms::cuda::chooseDevice()");
-      throw ex;
-    }
-
-    // For startes we "statically" assign the device based on
-    // edm::Stream number. This is suboptimal if the number of
-    // edm::Streams is not a multiple of the number of CUDA devices
-    // (and even then there is no load balancing).
-    //
-    // TODO: improve the "assignment" logic
-    return id % cuda->numberOfDevices();
-  }
-}  // namespace cms::cuda
diff --git a/HeterogeneousCore/CUDACore/src/chooseDevice.h b/HeterogeneousCore/CUDACore/src/chooseDevice.h
deleted file mode 100644
index ab642325aaecf..0000000000000
--- a/HeterogeneousCore/CUDACore/src/chooseDevice.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_chooseDevice_h
-#define HeterogeneousCore_CUDACore_chooseDevice_h
-
-#include "FWCore/Utilities/interface/StreamID.h"
-
-namespace cms::cuda {
-  int chooseDevice(edm::StreamID id);
-}
-
-#endif
diff --git a/HeterogeneousCore/CUDACore/test/BuildFile.xml b/HeterogeneousCore/CUDACore/test/BuildFile.xml
index c11ac87aeb176..19807bdf7928b 100644
--- a/HeterogeneousCore/CUDACore/test/BuildFile.xml
+++ b/HeterogeneousCore/CUDACore/test/BuildFile.xml
@@ -1,16 +1,4 @@
 <iftool name="cuda-gcc-support">
-  <bin file="test_*.cc test_*.cu" name="testHeterogeneousCoreCUDACore">
-    <use name="catch2"/>
-    <use name="cuda"/>
-    <use name="tbb"/>
-    <use name="CUDADataFormats/Common"/>
-    <use name="FWCore/ParameterSet"/>
-    <use name="FWCore/ParameterSetReader"/>
-    <use name="FWCore/PluginManager"/>
-    <use name="FWCore/ServiceRegistry"/>
-    <use name="HeterogeneousCore/CUDACore"/>
-  </bin>
-
   <bin file="testStreamEvent.cu" name="testHeterogeneousCoreCUDACoreStreamEvent">
     <use name="cuda"/>
     <use name="HeterogeneousCore/CUDAUtilities"/>
diff --git a/HeterogeneousCore/CUDACore/test/test_ScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_ScopedContext.cc
deleted file mode 100644
index f2a514e3a7d48..0000000000000
--- a/HeterogeneousCore/CUDACore/test/test_ScopedContext.cc
+++ /dev/null
@@ -1,145 +0,0 @@
-#include "catch2/catch_all.hpp"
-
-#include "CUDADataFormats/Common/interface/Product.h"
-#include "FWCore/Concurrency/interface/FinalWaitingTask.h"
-#include "FWCore/Concurrency/interface/WaitingTask.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/eventWorkHasCompleted.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/EventCache.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/ScopedSetDevice.h"
-
-#include "test_ScopedContextKernels.h"
-
-#include "oneapi/tbb/task_arena.h"
-#include "oneapi/tbb/task_group.h"
-
-namespace cms::cudatest {
-  class TestScopedContext {
-  public:
-    static cuda::ScopedContextProduce make(int dev, bool createEvent) {
-      cms::cuda::SharedEventPtr event;
-      if (createEvent) {
-        event = cms::cuda::getEventCache().get();
-      }
-      return cuda::ScopedContextProduce(dev, cms::cuda::getStreamCache().get(), std::move(event));
-    }
-  };
-}  // namespace cms::cudatest
-
-namespace {
-  std::unique_ptr<cms::cuda::Product<int*>> produce(int device, int* d, int* h) {
-    auto ctx = cms::cudatest::TestScopedContext::make(device, true);
-    cudaCheck(cudaMemcpyAsync(d, h, sizeof(int), cudaMemcpyHostToDevice, ctx.stream()));
-    cms::cudatest::testScopedContextKernels_single(d, ctx.stream());
-    return ctx.wrap(d);
-  }
-}  // namespace
-
-TEST_CASE("Use of cms::cuda::ScopedContext", "[CUDACore]") {
-  if (not cms::cudatest::testDevices()) {
-    return;
-  }
-
-  constexpr int defaultDevice = 0;
-  {
-    auto ctx = cms::cudatest::TestScopedContext::make(defaultDevice, true);
-
-    SECTION("Construct from device ID") { REQUIRE(cms::cuda::currentDevice() == defaultDevice); }
-
-    SECTION("Wrap T to cms::cuda::Product<T>") {
-      std::unique_ptr<cms::cuda::Product<int>> dataPtr = ctx.wrap(10);
-      REQUIRE(dataPtr.get() != nullptr);
-      REQUIRE(dataPtr->device() == ctx.device());
-      REQUIRE(dataPtr->stream() == ctx.stream());
-    }
-
-    SECTION("Construct from from cms::cuda::Product<T>") {
-      std::unique_ptr<cms::cuda::Product<int>> dataPtr = ctx.wrap(10);
-      const auto& data = *dataPtr;
-
-      cms::cuda::ScopedContextProduce ctx2{data};
-      REQUIRE(cms::cuda::currentDevice() == data.device());
-      REQUIRE(ctx2.stream() == data.stream());
-
-      // Second use of a product should lead to new stream
-      cms::cuda::ScopedContextProduce ctx3{data};
-      REQUIRE(cms::cuda::currentDevice() == data.device());
-      REQUIRE(ctx3.stream() != data.stream());
-    }
-
-    SECTION("Storing state in cms::cuda::ContextState") {
-      oneapi::tbb::task_arena arena(1);
-      arena.execute([&ctx]() {
-        cms::cuda::ContextState ctxstate;
-        {  // acquire
-          std::unique_ptr<cms::cuda::Product<int>> dataPtr = ctx.wrap(10);
-          const auto& data = *dataPtr;
-          oneapi::tbb::task_group group;
-          edm::FinalWaitingTask waitTask{group};
-          {
-            edm::WaitingTaskWithArenaHolder dummy{group, &waitTask};
-            cms::cuda::ScopedContextAcquire ctx2{data, dummy, ctxstate};
-          }
-          waitTask.wait();
-        }
-
-        {  // produce
-          cms::cuda::ScopedContextProduce ctx2{ctxstate};
-          REQUIRE(cms::cuda::currentDevice() == ctx.device());
-          REQUIRE(ctx2.stream() == ctx.stream());
-        }
-      });
-    }
-
-    SECTION("Joining multiple CUDA streams") {
-      cms::cuda::ScopedSetDevice setDeviceForThisScope(defaultDevice);
-
-      // Mimick a producer on the first CUDA stream
-      int h_a1 = 1;
-      auto d_a1 = cms::cuda::make_device_unique<int>(nullptr);
-      auto wprod1 = produce(defaultDevice, d_a1.get(), &h_a1);
-
-      // Mimick a producer on the second CUDA stream
-      int h_a2 = 2;
-      auto d_a2 = cms::cuda::make_device_unique<int>(nullptr);
-      auto wprod2 = produce(defaultDevice, d_a2.get(), &h_a2);
-
-      REQUIRE(wprod1->stream() != wprod2->stream());
-
-      // Mimick a third producer "joining" the two streams
-      cms::cuda::ScopedContextProduce ctx2{*wprod1};
-
-      auto prod1 = ctx2.get(*wprod1);
-      auto prod2 = ctx2.get(*wprod2);
-
-      auto d_a3 = cms::cuda::make_device_unique<int>(nullptr);
-      cms::cudatest::testScopedContextKernels_join(prod1, prod2, d_a3.get(), ctx2.stream());
-      cudaCheck(cudaStreamSynchronize(ctx2.stream()));
-      REQUIRE(wprod2->isAvailable());
-      REQUIRE(cms::cuda::eventWorkHasCompleted(wprod2->event()));
-
-      h_a1 = 0;
-      h_a2 = 0;
-      int h_a3 = 0;
-
-      cudaCheck(cudaMemcpyAsync(&h_a1, d_a1.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
-      cudaCheck(cudaMemcpyAsync(&h_a2, d_a2.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
-      cudaCheck(cudaMemcpyAsync(&h_a3, d_a3.get(), sizeof(int), cudaMemcpyDeviceToHost, ctx.stream()));
-
-      REQUIRE(h_a1 == 2);
-      REQUIRE(h_a2 == 4);
-      REQUIRE(h_a3 == 6);
-    }
-  }
-
-  cudaCheck(cudaSetDevice(defaultDevice));
-  cudaCheck(cudaDeviceSynchronize());
-  // Note: CUDA resources are cleaned up by the destructors of the global cache objects
-}
diff --git a/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.cu b/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.cu
deleted file mode 100644
index b87f1e20a5f24..0000000000000
--- a/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.cu
+++ /dev/null
@@ -1,17 +0,0 @@
-#include "test_ScopedContextKernels.h"
-
-namespace {
-  __global__ void single_mul(int *d) { d[0] = d[0] * 2; }
-
-  __global__ void join_add(const int *d1, const int *d2, int *d3) { d3[0] = d1[0] + d2[0]; }
-}  // namespace
-
-namespace cms {
-  namespace cudatest {
-    void testScopedContextKernels_single(int *d, cudaStream_t stream) { single_mul<<<1, 1, 0, stream>>>(d); }
-
-    void testScopedContextKernels_join(const int *d1, const int *d2, int *d3, cudaStream_t stream) {
-      join_add<<<1, 1, 0, stream>>>(d1, d2, d3);
-    }
-  }  // namespace cudatest
-}  // namespace cms
diff --git a/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.h b/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.h
deleted file mode 100644
index dfc55682afc76..0000000000000
--- a/HeterogeneousCore/CUDACore/test/test_ScopedContextKernels.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_test_ScopedContextKernels_h
-#define HeterogeneousCore_CUDACore_test_ScopedContextKernels_h
-
-#include <cuda_runtime.h>
-
-namespace cms {
-  namespace cudatest {
-    void testScopedContextKernels_single(int *d, cudaStream_t stream);
-    void testScopedContextKernels_join(const int *d1, const int *d2, int *d3, cudaStream_t stream);
-  }  // namespace cudatest
-}  // namespace cms
-
-#endif
diff --git a/HeterogeneousCore/CUDACore/test/test_main.cc b/HeterogeneousCore/CUDACore/test/test_main.cc
deleted file mode 100644
index f817af48040a2..0000000000000
--- a/HeterogeneousCore/CUDACore/test/test_main.cc
+++ /dev/null
@@ -1,32 +0,0 @@
-#define CATCH_CONFIG_MAIN
-#include "catch2/catch_all.hpp"
-
-#include "FWCore/ParameterSetReader/interface/ParameterSetReader.h"
-#include "FWCore/PluginManager/interface/PluginManager.h"
-#include "FWCore/PluginManager/interface/standard.h"
-#include "FWCore/ServiceRegistry/interface/ServiceRegistry.h"
-
-class ServiceRegistryListener : public Catch::EventListenerBase {
-public:
-  using Catch::EventListenerBase::EventListenerBase;  // inherit constructor
-
-  void testRunStarting(Catch::TestRunInfo const& testRunInfo) override {
-    edmplugin::PluginManager::configure(edmplugin::standard::config());
-
-    const std::string config{
-        R"_(import FWCore.ParameterSet.Config as cms
-process = cms.Process('Test')
-process.CUDAService = cms.Service('CUDAService')
-process.AsyncService = cms.Service('AsyncService')
-)_"};
-
-    std::unique_ptr<edm::ParameterSet> params;
-    edm::makeParameterSets(config, params);
-    edm::ServiceToken tempToken(edm::ServiceRegistry::createServicesFromConfig(std::move(params)));
-    operate_.reset(new edm::ServiceRegistry::Operate(tempToken));
-  }
-
-private:
-  std::unique_ptr<edm::ServiceRegistry::Operate> operate_;
-};
-CATCH_REGISTER_LISTENER(ServiceRegistryListener);
diff --git a/HeterogeneousCore/CUDATest/BuildFile.xml b/HeterogeneousCore/CUDATest/BuildFile.xml
index 3d370b4248fa4..fcf5ae111da8e 100644
--- a/HeterogeneousCore/CUDATest/BuildFile.xml
+++ b/HeterogeneousCore/CUDATest/BuildFile.xml
@@ -1,6 +1,5 @@
 <iftool name="cuda-gcc-support">
-  <use name="DataFormats/Common"/>
-  <use name="CUDADataFormats/Common"/>
-  <use name="HeterogeneousCore/CUDAUtilities"/>
+  <use name="cuda"/>
   <use name="rootcore"/>
+  <use name="DataFormats/Common"/>
 </iftool>
diff --git a/HeterogeneousCore/CUDATest/interface/Thing.h b/HeterogeneousCore/CUDATest/interface/Thing.h
deleted file mode 100644
index 27dc58e1443f4..0000000000000
--- a/HeterogeneousCore/CUDATest/interface/Thing.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#ifndef HeterogeneousCore_CUDATest_Thing_H
-#define HeterogeneousCore_CUDATest_Thing_H
-
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-
-namespace cms {
-  namespace cudatest {
-    class Thing {
-    public:
-      Thing() = default;
-      explicit Thing(cms::cuda::device::unique_ptr<float[]> ptr) : ptr_(std::move(ptr)) {}
-
-      const float *get() const { return ptr_.get(); }
-
-    private:
-      cms::cuda::device::unique_ptr<float[]> ptr_;
-    };
-  }  // namespace cudatest
-}  // namespace cms
-
-#endif
diff --git a/HeterogeneousCore/CUDATest/plugins/BuildFile.xml b/HeterogeneousCore/CUDATest/plugins/BuildFile.xml
index e12fb032e7f8d..08d0d7bd46f32 100644
--- a/HeterogeneousCore/CUDATest/plugins/BuildFile.xml
+++ b/HeterogeneousCore/CUDATest/plugins/BuildFile.xml
@@ -1,12 +1,9 @@
 <iftool name="cuda-gcc-support">
-  <library file="*.cc *.cu" name="HeterogeneousCoreCUDATestPlugins">
-    <flags EDM_PLUGIN="1"/>
-    <use name="CUDADataFormats/Common"/>
+  <library file="*.cc" name="HeterogeneousCoreCUDATestPlugins">
+    <use name="cuda"/>
     <use name="FWCore/Framework"/>
     <use name="FWCore/ParameterSet"/>
-    <use name="HeterogeneousCore/CUDACore"/>
-    <use name="HeterogeneousCore/CUDAUtilities"/>
-    <use name="cuda"/>
+    <use name="HeterogeneousCore/CUDATest"/>
+    <flags EDM_PLUGIN="1"/>
   </library>
-
 </iftool>
diff --git a/HeterogeneousCore/CUDATest/plugins/TestAlgo.cc b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cc
deleted file mode 100644
index 721db32dda55b..0000000000000
--- a/HeterogeneousCore/CUDATest/plugins/TestAlgo.cc
+++ /dev/null
@@ -1,22 +0,0 @@
-#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
-
-#include "TestAlgo.h"
-
-namespace cudatest {
-
-  static void testAlgoKernel(cudatest::TestHostCollection::View view, int32_t size) {
-    const cudatest::Matrix matrix{{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}, {3, 6, 9, 12, 15, 18}};
-    const cudatest::Array flags = {{6, 4, 2, 0}};
-
-    view.r() = 1.;
-
-    for (auto i = 0; i < size; ++i) {
-      view[i] = {0., 0., 0., i, flags, matrix * i};
-    }
-  }
-
-  void TestAlgo::fill(cudatest::TestHostCollection& collection) const {
-    testAlgoKernel(collection.view(), collection->metadata().size());
-  }
-
-}  // namespace cudatest
diff --git a/HeterogeneousCore/CUDATest/plugins/TestAlgo.cu b/HeterogeneousCore/CUDATest/plugins/TestAlgo.cu
deleted file mode 100644
index 06e238e98c0c2..0000000000000
--- a/HeterogeneousCore/CUDATest/plugins/TestAlgo.cu
+++ /dev/null
@@ -1,32 +0,0 @@
-#include <cuda_runtime.h>
-
-#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
-
-#include "TestAlgo.h"
-
-namespace cudatest {
-
-  static __global__ void testAlgoKernel(cudatest::TestDeviceCollection::View view, int32_t size) {
-    const int32_t thread = blockIdx.x * blockDim.x + threadIdx.x;
-    const int32_t stride = blockDim.x * gridDim.x;
-    const cudatest::Matrix matrix{{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}, {3, 6, 9, 12, 15, 18}};
-    const cudatest::Array flags = {{6, 4, 2, 0}};
-
-    if (thread == 0) {
-      view.r() = 1.;
-    }
-    for (auto i = thread; i < size; i += stride) {
-      view[i] = {0., 0., 0., i, flags, matrix * i};
-    }
-  }
-
-  void TestAlgo::fill(cudatest::TestDeviceCollection& collection, cudaStream_t stream) const {
-    const uint32_t maxThreadsPerBlock = 1024;
-
-    uint32_t threadsPerBlock = maxThreadsPerBlock;
-    uint32_t blocksPerGrid = (collection->metadata().size() + threadsPerBlock - 1) / threadsPerBlock;
-
-    testAlgoKernel<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(collection.view(), collection->metadata().size());
-  }
-
-}  // namespace cudatest
diff --git a/HeterogeneousCore/CUDATest/plugins/TestAlgo.h b/HeterogeneousCore/CUDATest/plugins/TestAlgo.h
deleted file mode 100644
index a91a773234f68..0000000000000
--- a/HeterogeneousCore/CUDATest/plugins/TestAlgo.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#ifndef HeterogeneousCore_CUDATest_plugins_TestAlgo_h
-#define HeterogeneousCore_CUDATest_plugins_TestAlgo_h
-
-#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
-#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
-
-namespace cudatest {
-
-  class TestAlgo {
-  public:
-    void fill(cudatest::TestDeviceCollection& collection, cudaStream_t stream) const;
-    void fill(cudatest::TestHostCollection& collection) const;
-  };
-
-}  // namespace cudatest
-
-#endif  // HeterogeneousCore_CUDATest_plugins_TestAlgo_h
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc
deleted file mode 100644
index 09d85f6c1c47d..0000000000000
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPU.cc
+++ /dev/null
@@ -1,81 +0,0 @@
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/Frameworkfwd.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-#include "FWCore/Framework/interface/global/EDAnalyzer.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
-#include "FWCore/ServiceRegistry/interface/Service.h"
-
-#include "CUDADataFormats/Common/interface/Product.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "HeterogeneousCore/CUDAServices/interface/CUDAInterface.h"
-#include "HeterogeneousCore/CUDATest/interface/Thing.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
-
-#include "TestCUDAAnalyzerGPUKernel.h"
-
-class TestCUDAAnalyzerGPU : public edm::global::EDAnalyzer<> {
-public:
-  explicit TestCUDAAnalyzerGPU(edm::ParameterSet const& iConfig);
-  ~TestCUDAAnalyzerGPU() override = default;
-
-  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
-
-  void analyze(edm::StreamID, edm::Event const& iEvent, edm::EventSetup const& iSetup) const override;
-  void endJob() override;
-
-private:
-  std::string const label_;
-  edm::EDGetTokenT<cms::cuda::Product<cms::cudatest::Thing>> const srcToken_;
-  double const minValue_;
-  double const maxValue_;
-  // the public interface is thread safe
-  CMS_THREAD_SAFE mutable std::unique_ptr<TestCUDAAnalyzerGPUKernel> gpuAlgo_;
-};
-
-TestCUDAAnalyzerGPU::TestCUDAAnalyzerGPU(edm::ParameterSet const& iConfig)
-    : label_(iConfig.getParameter<std::string>("@module_label")),
-      srcToken_(consumes<cms::cuda::Product<cms::cudatest::Thing>>(iConfig.getParameter<edm::InputTag>("src"))),
-      minValue_(iConfig.getParameter<double>("minValue")),
-      maxValue_(iConfig.getParameter<double>("maxValue")) {
-  edm::Service<CUDAInterface> cuda;
-  if (cuda and cuda->enabled()) {
-    auto streamPtr = cms::cuda::getStreamCache().get();
-    gpuAlgo_ = std::make_unique<TestCUDAAnalyzerGPUKernel>(streamPtr.get());
-  }
-}
-
-void TestCUDAAnalyzerGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
-  edm::ParameterSetDescription desc;
-  desc.add<edm::InputTag>("src", edm::InputTag())->setComment("Source of cms::cuda::Product<cms::cudatest::Thing>.");
-  desc.add<double>("minValue", -1e308);
-  desc.add<double>("maxValue", 1e308);
-  descriptions.addWithDefaultLabel(desc);
-  descriptions.setComment("This EDAnalyzer is part of the TestCUDAProducer* family. It models a GPU analyzer.");
-}
-
-void TestCUDAAnalyzerGPU::analyze(edm::StreamID, edm::Event const& iEvent, edm::EventSetup const& iSetup) const {
-  edm::LogVerbatim("TestCUDAAnalyzerGPU") << label_ << " TestCUDAAnalyzerGPU::analyze begin event "
-                                          << iEvent.id().event() << " stream " << iEvent.streamID();
-
-  auto const& in = iEvent.get(srcToken_);
-  cms::cuda::ScopedContextAnalyze ctx{in};
-  cms::cudatest::Thing const& input = ctx.get(in);
-  gpuAlgo_->analyzeAsync(input.get(), ctx.stream());
-
-  edm::LogVerbatim("TestCUDAAnalyzerGPU")
-      << label_ << " TestCUDAAnalyzerGPU::analyze end event " << iEvent.id().event() << " stream " << iEvent.streamID();
-}
-
-void TestCUDAAnalyzerGPU::endJob() {
-  edm::LogVerbatim("TestCUDAAnalyzerGPU") << label_ << " TestCUDAAnalyzerGPU::endJob begin";
-
-  auto streamPtr = cms::cuda::getStreamCache().get();
-  auto value = gpuAlgo_->value(streamPtr.get());
-  edm::LogVerbatim("TestCUDAAnalyzerGPU") << label_ << "  accumulated value " << value;
-  assert(minValue_ <= value && value <= maxValue_);
-
-  edm::LogVerbatim("TestCUDAAnalyzerGPU") << label_ << " TestCUDAAnalyzerGPU::endJob end";
-}
-
-DEFINE_FWK_MODULE(TestCUDAAnalyzerGPU);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.cu
deleted file mode 100644
index 2b3951a2b5cfe..0000000000000
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.cu
+++ /dev/null
@@ -1,46 +0,0 @@
-#include "TestCUDAAnalyzerGPUKernel.h"
-
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-
-namespace {
-  __global__ void analyze(const float *input, float *sum, int numElements) {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i < numElements) {
-      atomicAdd(sum + i, input[i]);
-    }
-  }
-
-  __global__ void sum(const float *input, float *output, int numElements) {
-    float val = 0.f;
-    for (int i = 0; i < numElements; ++i) {
-      val += input[i];
-    }
-    *output = val;
-  }
-}  // namespace
-
-TestCUDAAnalyzerGPUKernel::TestCUDAAnalyzerGPUKernel(cudaStream_t stream) {
-  sum_ = cms::cuda::make_device_unique<float[]>(NUM_VALUES, stream);
-  cms::cuda::memsetAsync(sum_, 0, NUM_VALUES, stream);
-  // better to synchronize since there is no guarantee that the stream
-  // of analyzeAsync() would be otherwise synchronized with this one
-  cudaCheck(cudaStreamSynchronize(stream));
-}
-
-void TestCUDAAnalyzerGPUKernel::analyzeAsync(const float *d_input, cudaStream_t stream) {
-  analyze<<<int(ceil(float(NUM_VALUES) / 256)), 256, 0, stream>>>(d_input, sum_.get(), NUM_VALUES);
-}
-
-float TestCUDAAnalyzerGPUKernel::value(cudaStream_t stream) const {
-  auto accumulator = cms::cuda::make_device_unique<float>(stream);
-  auto h_accumulator = cms::cuda::make_host_unique<float>(stream);
-  sum<<<1, 1, 0, stream>>>(sum_.get(), accumulator.get(), NUM_VALUES);
-  cms::cuda::copyAsync(h_accumulator, accumulator, stream);
-  // need to synchronize
-  cudaCheck(cudaStreamSynchronize(stream));
-  return *h_accumulator;
-}
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.h b/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.h
deleted file mode 100644
index a9a6b962c2cc4..0000000000000
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAAnalyzerGPUKernel.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_TestCUDAAnalyzerGPUKernel_h
-#define HeterogeneousCore_CUDACore_TestCUDAAnalyzerGPUKernel_h
-
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-
-#include <cuda_runtime.h>
-
-class TestCUDAAnalyzerGPUKernel {
-public:
-  static constexpr int NUM_VALUES = 4000;
-
-  TestCUDAAnalyzerGPUKernel(cudaStream_t stream);
-  ~TestCUDAAnalyzerGPUKernel() = default;
-
-  // thread safe
-  void analyzeAsync(const float* d_input, cudaStream_t stream);
-  float value(cudaStream_t stream) const;
-
-private:
-  cms::cuda::device::unique_ptr<float[]> sum_;  // all writes are atomic in CUDA
-};
-
-#endif
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc
deleted file mode 100644
index c25a44023ebc0..0000000000000
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-#include "FWCore/Framework/interface/global/EDProducer.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/Frameworkfwd.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
-
-#include <chrono>
-#include <random>
-#include <thread>
-
-class TestCUDAProducerCPU : public edm::global::EDProducer<> {
-public:
-  explicit TestCUDAProducerCPU(edm::ParameterSet const& iConfig);
-  ~TestCUDAProducerCPU() override = default;
-
-  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
-
-  void produce(edm::StreamID id, edm::Event& iEvent, edm::EventSetup const& iSetup) const override;
-
-private:
-  std::string const label_;
-  edm::EDGetTokenT<int> srcToken_;
-  edm::EDPutTokenT<int> const dstToken_;
-};
-
-TestCUDAProducerCPU::TestCUDAProducerCPU(edm::ParameterSet const& iConfig)
-    : label_{iConfig.getParameter<std::string>("@module_label")}, dstToken_{produces<int>()} {
-  auto srcTag = iConfig.getParameter<edm::InputTag>("src");
-  if (!srcTag.label().empty()) {
-    srcToken_ = consumes<int>(srcTag);
-  }
-}
-
-void TestCUDAProducerCPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
-  edm::ParameterSetDescription desc;
-  desc.add<edm::InputTag>("src", edm::InputTag())->setComment("Optional source of another TestCUDAProducerCPU.");
-  descriptions.addWithDefaultLabel(desc);
-  descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models a CPU algorithm.");
-}
-
-void TestCUDAProducerCPU::produce(edm::StreamID id, edm::Event& iEvent, edm::EventSetup const& iSetup) const {
-  edm::LogVerbatim("TestCUDAProducerCPU")
-      << label_ << " TestCUDAProducerCPU::produce begin event " << iEvent.id().event() << " stream " << id;
-
-  int input = 0;
-  if (!srcToken_.isUninitialized()) {
-    input = iEvent.get(srcToken_);
-  }
-
-  std::random_device r;
-  std::mt19937 gen(r());
-  auto dist = std::uniform_real_distribution<>(0.2, 1.5);
-  auto dur = dist(gen);
-  edm::LogVerbatim("TestCUDAProducerCPU")
-      << " Task (CPU) for event " << iEvent.id().event() << " in stream " << id << " will take " << dur << " seconds";
-  std::this_thread::sleep_for(std::chrono::seconds(1) * dur);
-
-  unsigned int const output = input + id * 100 + iEvent.id().event();
-
-  iEvent.emplace(dstToken_, output);
-
-  edm::LogVerbatim("TestCUDAProducerCPU") << label_ << " TestCUDAProducerCPU::produce end event " << iEvent.id().event()
-                                          << " stream " << id << " result " << output;
-}
-
-DEFINE_FWK_MODULE(TestCUDAProducerCPU);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc
deleted file mode 100644
index 6f92ac91dd922..0000000000000
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/Frameworkfwd.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-#include "FWCore/Framework/interface/global/EDProducer.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
-
-#include "CUDADataFormats/Common/interface/Product.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "HeterogeneousCore/CUDATest/interface/Thing.h"
-
-#include "TestCUDAProducerGPUKernel.h"
-
-class TestCUDAProducerGPU : public edm::global::EDProducer<> {
-public:
-  explicit TestCUDAProducerGPU(const edm::ParameterSet& iConfig);
-  ~TestCUDAProducerGPU() override = default;
-
-  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
-
-  void produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup const& iSetup) const override;
-
-private:
-  std::string const label_;
-  edm::EDGetTokenT<cms::cuda::Product<cms::cudatest::Thing>> const srcToken_;
-  edm::EDPutTokenT<cms::cuda::Product<cms::cudatest::Thing>> const dstToken_;
-  TestCUDAProducerGPUKernel const gpuAlgo_;
-};
-
-TestCUDAProducerGPU::TestCUDAProducerGPU(edm::ParameterSet const& iConfig)
-    : label_(iConfig.getParameter<std::string>("@module_label")),
-      srcToken_(consumes<cms::cuda::Product<cms::cudatest::Thing>>(iConfig.getParameter<edm::InputTag>("src"))),
-      dstToken_(produces<cms::cuda::Product<cms::cudatest::Thing>>()) {}
-
-void TestCUDAProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
-  edm::ParameterSetDescription desc;
-  desc.add<edm::InputTag>("src", edm::InputTag())->setComment("Source of cms::cuda::Product<cms::cudatest::Thing>.");
-  descriptions.addWithDefaultLabel(desc);
-  descriptions.setComment(
-      "This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this is not the first "
-      "algorithm in the chain of the GPU EDProducers. Produces cms::cuda::Product<cms::cudatest::Thing>.");
-}
-
-void TestCUDAProducerGPU::produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup const& iSetup) const {
-  edm::LogVerbatim("TestCUDAProducerGPU") << label_ << " TestCUDAProducerGPU::produce begin event "
-                                          << iEvent.id().event() << " stream " << iEvent.streamID();
-
-  auto const& in = iEvent.get(srcToken_);
-  cms::cuda::ScopedContextProduce ctx{in};
-  cms::cudatest::Thing const& input = ctx.get(in);
-
-  ctx.emplace(iEvent, dstToken_, cms::cudatest::Thing{gpuAlgo_.runAlgo(label_, input.get(), ctx.stream())});
-
-  edm::LogVerbatim("TestCUDAProducerGPU")
-      << label_ << " TestCUDAProducerGPU::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID();
-}
-
-DEFINE_FWK_MODULE(TestCUDAProducerGPU);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
deleted file mode 100644
index b8b3f9058d496..0000000000000
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc
+++ /dev/null
@@ -1,95 +0,0 @@
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/Frameworkfwd.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
-#include "FWCore/ServiceRegistry/interface/Service.h"
-
-#include "CUDADataFormats/Common/interface/Product.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "HeterogeneousCore/CUDACore/interface/ContextState.h"
-#include "HeterogeneousCore/CUDAServices/interface/CUDAInterface.h"
-#include "HeterogeneousCore/CUDATest/interface/Thing.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"
-
-#include "TestCUDAProducerGPUKernel.h"
-
-class TestCUDAProducerGPUEW : public edm::stream::EDProducer<edm::ExternalWork> {
-public:
-  explicit TestCUDAProducerGPUEW(edm::ParameterSet const& iConfig);
-  ~TestCUDAProducerGPUEW() override = default;
-
-  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
-
-  void acquire(edm::Event const& iEvent,
-               edm::EventSetup const& iSetup,
-               edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
-  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;
-
-private:
-  std::string const label_;
-  edm::EDGetTokenT<cms::cuda::Product<cms::cudatest::Thing>> const srcToken_;
-  edm::EDPutTokenT<cms::cuda::Product<cms::cudatest::Thing>> const dstToken_;
-  TestCUDAProducerGPUKernel gpuAlgo_;
-  cms::cuda::ContextState ctxState_;
-  cms::cuda::device::unique_ptr<float[]> devicePtr_;
-  cms::cuda::host::noncached::unique_ptr<float> hostData_;
-};
-
-TestCUDAProducerGPUEW::TestCUDAProducerGPUEW(edm::ParameterSet const& iConfig)
-    : label_{iConfig.getParameter<std::string>("@module_label")},
-      srcToken_{consumes<cms::cuda::Product<cms::cudatest::Thing>>(iConfig.getParameter<edm::InputTag>("src"))},
-      dstToken_{produces<cms::cuda::Product<cms::cudatest::Thing>>()} {
-  edm::Service<CUDAInterface> cuda;
-  if (cuda and cuda->enabled()) {
-    hostData_ = cms::cuda::make_host_noncached_unique<float>();
-  }
-}
-
-void TestCUDAProducerGPUEW::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
-  edm::ParameterSetDescription desc;
-  desc.add<edm::InputTag>("src", edm::InputTag());
-  descriptions.addWithDefaultLabel(desc);
-  descriptions.setComment(
-      "This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this is not the first "
-      "algorithm in the chain of the GPU EDProducers, and that transfers some data from GPU to CPU and thus needs to "
-      "synchronize GPU and CPU. The synchronization is implemented with the ExternalWork extension. Produces "
-      "cms::cuda::Product<cms::cuda::Thing>.");
-}
-
-void TestCUDAProducerGPUEW::acquire(edm::Event const& iEvent,
-                                    edm::EventSetup const& iSetup,
-                                    edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire begin event "
-                                            << iEvent.id().event() << " stream " << iEvent.streamID();
-
-  auto const& in = iEvent.get(srcToken_);
-  cms::cuda::ScopedContextAcquire ctx{in, std::move(waitingTaskHolder), ctxState_};
-  cms::cudatest::Thing const& input = ctx.get(in);
-
-  devicePtr_ = gpuAlgo_.runAlgo(label_, input.get(), ctx.stream());
-  // Mimick the need to transfer some of the GPU data back to CPU to
-  // be used for something within this module, or to be put in the
-  // event.
-  cudaCheck(
-      cudaMemcpyAsync(hostData_.get(), devicePtr_.get() + 10, sizeof(float), cudaMemcpyDeviceToHost, ctx.stream()));
-  edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire end event "
-                                            << iEvent.id().event() << " stream " << iEvent.streamID();
-}
-
-void TestCUDAProducerGPUEW::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
-  edm::LogVerbatim("TestCUDAProducerGPUEW")
-      << label_ << " TestCUDAProducerGPUEW::produce begin event " << iEvent.id().event() << " stream "
-      << iEvent.streamID() << " 10th element " << *hostData_;
-
-  cms::cuda::ScopedContextProduce ctx{ctxState_};
-
-  ctx.emplace(iEvent, dstToken_, std::move(devicePtr_));
-
-  edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::produce end event "
-                                            << iEvent.id().event() << " stream " << iEvent.streamID();
-}
-
-DEFINE_FWK_MODULE(TestCUDAProducerGPUEW);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc
deleted file mode 100644
index b9752f6f41630..0000000000000
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc
+++ /dev/null
@@ -1,56 +0,0 @@
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/Frameworkfwd.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-#include "FWCore/Framework/interface/global/EDProducer.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
-
-#include "CUDADataFormats/Common/interface/Product.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "HeterogeneousCore/CUDATest/interface/Thing.h"
-
-#include "TestCUDAProducerGPUKernel.h"
-
-class TestCUDAProducerGPUFirst : public edm::global::EDProducer<> {
-public:
-  explicit TestCUDAProducerGPUFirst(edm::ParameterSet const& iConfig);
-  ~TestCUDAProducerGPUFirst() override = default;
-
-  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
-
-  void produce(edm::StreamID stream, edm::Event& iEvent, edm::EventSetup const& iSetup) const override;
-
-private:
-  std::string const label_;
-  edm::EDPutTokenT<cms::cuda::Product<cms::cudatest::Thing>> const dstToken_;
-  TestCUDAProducerGPUKernel const gpuAlgo_;
-};
-
-TestCUDAProducerGPUFirst::TestCUDAProducerGPUFirst(edm::ParameterSet const& iConfig)
-    : label_(iConfig.getParameter<std::string>("@module_label")),
-      dstToken_{produces<cms::cuda::Product<cms::cudatest::Thing>>()} {}
-
-void TestCUDAProducerGPUFirst::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
-  edm::ParameterSetDescription desc;
-  descriptions.addWithDefaultLabel(desc);
-  descriptions.setComment(
-      "This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this the first algorithm in "
-      "the chain of the GPU EDProducers. Produces cms::cuda::Productcms::cudatest::Thing>.");
-}
-
-void TestCUDAProducerGPUFirst::produce(edm::StreamID streamID,
-                                       edm::Event& iEvent,
-                                       edm::EventSetup const& iSetup) const {
-  edm::LogVerbatim("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce begin event "
-                                               << iEvent.id().event() << " stream " << iEvent.streamID();
-
-  cms::cuda::ScopedContextProduce ctx{streamID};
-
-  cms::cuda::device::unique_ptr<float[]> output = gpuAlgo_.runAlgo(label_, ctx.stream());
-  ctx.emplace(iEvent, dstToken_, std::move(output));
-
-  edm::LogVerbatim("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce end event "
-                                               << iEvent.id().event() << " stream " << iEvent.streamID();
-}
-
-DEFINE_FWK_MODULE(TestCUDAProducerGPUFirst);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu
deleted file mode 100644
index 69264a40aca62..0000000000000
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu
+++ /dev/null
@@ -1,131 +0,0 @@
-#include "FWCore/Utilities/interface/Exception.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/MessageLogger.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-
-#include "TestCUDAProducerGPUKernel.h"
-
-namespace {
-  template <typename T>
-  __global__ void vectorAddConstant(T *a, T b, int numElements) {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i < numElements) {
-      a[i] += b;
-    }
-  }
-
-  template <typename T>
-  __global__ void vectorAdd(const T *a, const T *b, T *c, int numElements) {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
-    if (i < numElements) {
-      c[i] = a[i] + b[i];
-    }
-  }
-
-  template <typename T>
-  __global__ void vectorProd(const T *a, const T *b, T *c, int numElements) {
-    int row = blockIdx.y * blockDim.y + threadIdx.y;
-    int col = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (row < numElements && col < numElements) {
-      c[row * numElements + col] = a[row] * b[col];
-    }
-  }
-
-  template <typename T>
-  __global__ void matrixMul(const T *a, const T *b, T *c, int numElements) {
-    int row = blockIdx.y * blockDim.y + threadIdx.y;
-    int col = blockIdx.x * blockDim.x + threadIdx.x;
-
-    if (row < numElements && col < numElements) {
-      T tmp = 0;
-      for (int i = 0; i < numElements; ++i) {
-        tmp += a[row * numElements + i] * b[i * numElements + col];
-      }
-      c[row * numElements + col] = tmp;
-    }
-  }
-
-  template <typename T>
-  __global__ void matrixMulVector(const T *a, const T *b, T *c, int numElements) {
-    int row = blockIdx.y * blockDim.y + threadIdx.y;
-
-    if (row < numElements) {
-      T tmp = 0;
-      for (int i = 0; i < numElements; ++i) {
-        tmp += a[row * numElements + i] * b[i];
-      }
-      c[row] = tmp;
-    }
-  }
-}  // namespace
-
-cms::cuda::device::unique_ptr<float[]> TestCUDAProducerGPUKernel::runAlgo(const std::string &label,
-                                                                          const float *d_input,
-                                                                          cudaStream_t stream) const {
-  // First make the sanity check
-  if (d_input != nullptr) {
-    auto h_check = std::make_unique<float[]>(NUM_VALUES);
-    cudaCheck(cudaMemcpyAsync(h_check.get(), d_input, NUM_VALUES * sizeof(float), cudaMemcpyDeviceToHost, stream));
-    cudaCheck(cudaStreamSynchronize(stream));
-    for (int i = 0; i < NUM_VALUES; ++i) {
-      if (h_check[i] != i) {
-        throw cms::Exception("Assert") << "Sanity check on element " << i << " failed, expected " << i << " got "
-                                       << h_check[i];
-      }
-    }
-  }
-
-  auto h_a = cms::cuda::make_host_unique<float[]>(NUM_VALUES, stream);
-  auto h_b = cms::cuda::make_host_unique<float[]>(NUM_VALUES, stream);
-
-  for (auto i = 0; i < NUM_VALUES; i++) {
-    h_a[i] = i;
-    h_b[i] = i * i;
-  }
-
-  auto d_a = cms::cuda::make_device_unique<float[]>(NUM_VALUES, stream);
-  auto d_b = cms::cuda::make_device_unique<float[]>(NUM_VALUES, stream);
-
-  cudaCheck(cudaMemcpyAsync(d_a.get(), h_a.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream));
-  cudaCheck(cudaMemcpyAsync(d_b.get(), h_b.get(), NUM_VALUES * sizeof(float), cudaMemcpyHostToDevice, stream));
-
-  int threadsPerBlock{32};
-  int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
-
-  auto d_c = cms::cuda::make_device_unique<float[]>(NUM_VALUES, stream);
-  auto current_device = cms::cuda::currentDevice();
-  cms::cuda::LogVerbatim("TestHeterogeneousEDProducerGPU")
-      << "  " << label << " GPU launching kernels device " << current_device << " CUDA stream " << stream;
-  vectorAdd<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(d_a.get(), d_b.get(), d_c.get(), NUM_VALUES);
-
-  auto d_ma = cms::cuda::make_device_unique<float[]>(NUM_VALUES * NUM_VALUES, stream);
-  auto d_mb = cms::cuda::make_device_unique<float[]>(NUM_VALUES * NUM_VALUES, stream);
-  auto d_mc = cms::cuda::make_device_unique<float[]>(NUM_VALUES * NUM_VALUES, stream);
-  dim3 threadsPerBlock3{NUM_VALUES, NUM_VALUES};
-  dim3 blocksPerGrid3{1, 1};
-  if (NUM_VALUES * NUM_VALUES > 32) {
-    threadsPerBlock3.x = 32;
-    threadsPerBlock3.y = 32;
-    blocksPerGrid3.x = ceil(double(NUM_VALUES) / double(threadsPerBlock3.x));
-    blocksPerGrid3.y = ceil(double(NUM_VALUES) / double(threadsPerBlock3.y));
-  }
-  vectorProd<<<blocksPerGrid3, threadsPerBlock3, 0, stream>>>(d_a.get(), d_b.get(), d_ma.get(), NUM_VALUES);
-  vectorProd<<<blocksPerGrid3, threadsPerBlock3, 0, stream>>>(d_a.get(), d_c.get(), d_mb.get(), NUM_VALUES);
-  matrixMul<<<blocksPerGrid3, threadsPerBlock3, 0, stream>>>(d_ma.get(), d_mb.get(), d_mc.get(), NUM_VALUES);
-
-  matrixMulVector<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(d_mc.get(), d_b.get(), d_c.get(), NUM_VALUES);
-
-  cms::cuda::LogVerbatim("TestHeterogeneousEDProducerGPU")
-      << "  " << label << " GPU kernels launched, returning return pointer device " << current_device << " CUDA stream "
-      << stream;
-  return d_a;
-}
-
-void TestCUDAProducerGPUKernel::runSimpleAlgo(float *d_data, cudaStream_t stream) const {
-  int threadsPerBlock{32};
-  int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock;
-  vectorAddConstant<<<blocksPerGrid, threadsPerBlock, 0, stream>>>(d_data, 1.0f, NUM_VALUES);
-}
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h
deleted file mode 100644
index 5eeba0009656e..0000000000000
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef HeterogeneousCore_CUDACore_TestCUDAProducerGPUKernel_h
-#define HeterogeneousCore_CUDACore_TestCUDAProducerGPUKernel_h
-
-#include <string>
-
-#include <cuda_runtime.h>
-
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-
-/**
- * This class models the actual CUDA implementation of an algorithm.
- *
- * Memory is allocated dynamically with the allocator in cms::cuda.
- *
- * The algorithm is intended to waste time with large matrix
- * operations so that the asynchronous nature of the CUDA integration
- * becomes visible with debug prints.
- */
-class TestCUDAProducerGPUKernel {
-public:
-  static constexpr int NUM_VALUES = 4000;
-
-  TestCUDAProducerGPUKernel() = default;
-  ~TestCUDAProducerGPUKernel() = default;
-
-  // returns (owning) pointer to device memory
-  cms::cuda::device::unique_ptr<float[]> runAlgo(const std::string& label, cudaStream_t stream) const {
-    return runAlgo(label, nullptr, stream);
-  }
-  cms::cuda::device::unique_ptr<float[]> runAlgo(const std::string& label,
-                                                 const float* d_input,
-                                                 cudaStream_t stream) const;
-
-  void runSimpleAlgo(float* d_data, cudaStream_t stream) const;
-};
-
-#endif
diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc
deleted file mode 100644
index dc07fc0add7f7..0000000000000
--- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/Frameworkfwd.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
-
-#include "CUDADataFormats/Common/interface/Product.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "HeterogeneousCore/CUDATest/interface/Thing.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-
-#include "TestCUDAProducerGPUKernel.h"
-
-class TestCUDAProducerGPUtoCPU : public edm::stream::EDProducer<edm::ExternalWork> {
-public:
-  explicit TestCUDAProducerGPUtoCPU(edm::ParameterSet const& iConfig);
-  ~TestCUDAProducerGPUtoCPU() override = default;
-
-  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
-
-  void acquire(edm::Event const& iEvent,
-               edm::EventSetup const& iSetup,
-               edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
-
-  void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override;
-
-private:
-  std::string const label_;
-  edm::EDGetTokenT<cms::cuda::Product<cms::cudatest::Thing>> const srcToken_;
-  edm::EDPutTokenT<int> const dstToken_;
-  cms::cuda::host::unique_ptr<float[]> buffer_;
-};
-
-TestCUDAProducerGPUtoCPU::TestCUDAProducerGPUtoCPU(edm::ParameterSet const& iConfig)
-    : label_{iConfig.getParameter<std::string>("@module_label")},
-      srcToken_{consumes<cms::cuda::Product<cms::cudatest::Thing>>(iConfig.getParameter<edm::InputTag>("src"))},
-      dstToken_{produces<int>()} {}
-
-void TestCUDAProducerGPUtoCPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
-  edm::ParameterSetDescription desc;
-  desc.add<edm::InputTag>("src", edm::InputTag())->setComment("Source for cms::cuda::Product<cms::cudatest::Thing>.");
-  descriptions.addWithDefaultLabel(desc);
-  descriptions.setComment(
-      "This EDProducer is part of the TestCUDAProducer* family. It models the GPU->CPU data transfer and formatting of "
-      "the data to legacy data format. Produces int, to be compatible with TestCUDAProducerCPU.");
-}
-
-void TestCUDAProducerGPUtoCPU::acquire(edm::Event const& iEvent,
-                                       edm::EventSetup const& iSetup,
-                                       edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
-  edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire begin event "
-                                               << iEvent.id().event() << " stream " << iEvent.streamID();
-
-  auto const& in = iEvent.get(srcToken_);
-  cms::cuda::ScopedContextAcquire ctx{in, std::move(waitingTaskHolder)};
-  cms::cudatest::Thing const& device = ctx.get(in);
-
-  buffer_ = cms::cuda::make_host_unique<float[]>(TestCUDAProducerGPUKernel::NUM_VALUES, ctx.stream());
-  // Enqueue async copy, continue in produce once finished
-  cudaCheck(cudaMemcpyAsync(buffer_.get(),
-                            device.get(),
-                            TestCUDAProducerGPUKernel::NUM_VALUES * sizeof(float),
-                            cudaMemcpyDeviceToHost,
-                            ctx.stream()));
-
-  edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire end event "
-                                               << iEvent.id().event() << " stream " << iEvent.streamID();
-}
-
-void TestCUDAProducerGPUtoCPU::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) {
-  edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::produce begin event "
-                                               << iEvent.id().event() << " stream " << iEvent.streamID();
-
-  int counter = 0;
-  for (int i = 0; i < TestCUDAProducerGPUKernel::NUM_VALUES; ++i) {
-    counter += buffer_[i];
-  }
-  buffer_.reset();  // not so nice, but no way around?
-
-  iEvent.emplace(dstToken_, counter);
-
-  edm::LogVerbatim("TestCUDAProducerGPUtoCPU")
-      << label_ << " TestCUDAProducerGPUtoCPU::produce end event " << iEvent.id().event() << " stream "
-      << iEvent.streamID() << " result " << counter;
-}
-
-DEFINE_FWK_MODULE(TestCUDAProducerGPUtoCPU);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableAnalyzer.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableAnalyzer.cc
deleted file mode 100644
index 1d3bd585ccdfe..0000000000000
--- a/HeterogeneousCore/CUDATest/plugins/TestPortableAnalyzer.cc
+++ /dev/null
@@ -1,125 +0,0 @@
-#include <cassert>
-
-#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/Frameworkfwd.h"
-#include "FWCore/Framework/interface/stream/EDAnalyzer.h"
-#include "FWCore/MessageLogger/interface/MessageLogger.h"
-#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
-#include "FWCore/Utilities/interface/EDGetToken.h"
-#include "FWCore/Utilities/interface/InputTag.h"
-
-namespace {
-
-  template <typename T>
-  class Column {
-  public:
-    Column(T const* data, size_t size) : data_(data), size_(size) {}
-
-    Column(std::span<T const> span) : data_(span.data()), size_(span.size()) {}
-
-    void print(std::ostream& out) const {
-      std::stringstream buffer;
-      buffer << "{ ";
-      if (size_ > 0) {
-        buffer << data_[0];
-      }
-      if (size_ > 1) {
-        buffer << ", " << data_[1];
-      }
-      if (size_ > 2) {
-        buffer << ", " << data_[2];
-      }
-      if (size_ > 3) {
-        buffer << ", ...";
-      }
-      buffer << '}';
-      out << buffer.str();
-    }
-
-  private:
-    T const* const data_;
-    size_t const size_;
-  };
-
-  template <typename T>
-  std::ostream& operator<<(std::ostream& out, Column<T> const& column) {
-    column.print(out);
-    return out;
-  }
-}  // namespace
-
-class TestPortableAnalyzer : public edm::stream::EDAnalyzer<> {
-public:
-  TestPortableAnalyzer(edm::ParameterSet const& config)
-      : source_{config.getParameter<edm::InputTag>("source")}, token_{consumes(source_)} {}
-
-  void analyze(edm::Event const& event, edm::EventSetup const&) override {
-    cudatest::TestHostCollection const& product = event.get(token_);
-
-    auto const& view = product.const_view();
-    for (int32_t i = 0; i < view.metadata().size(); ++i) {
-      assert(view[i].id() == i);
-    }
-
-    edm::LogInfo msg("TestPortableAnalyzer");
-    msg << source_.encode() << ".size() = " << view.metadata().size() << '\n';
-    msg << "  data  @ " << product.buffer().get() << ",\n"
-        << "  x     @ " << view.metadata().addressOf_x() << " = " << Column(view.x()) << ",\n"
-        << "  y     @ " << view.metadata().addressOf_y() << " = " << Column(view.y()) << ",\n"
-        << "  z     @ " << view.metadata().addressOf_z() << " = " << Column(view.z()) << ",\n"
-        << "  id    @ " << view.metadata().addressOf_id() << " = " << Column(view.id()) << ",\n"
-        << "  r     @ " << view.metadata().addressOf_r() << " = " << view.r() << '\n'
-        << "  flags @ " << view.metadata().addressOf_flags() << " = " << Column(view.flags()) << ",\n"
-        << "  m     @ " << view.metadata().addressOf_m() << " = { ... {" << view[1].m()(1, Eigen::indexing::all)
-        << " } ... } \n";
-    msg << std::hex << "  [y - x] = 0x"
-        << reinterpret_cast<intptr_t>(view.metadata().addressOf_y()) -
-               reinterpret_cast<intptr_t>(view.metadata().addressOf_x())
-        << "  [z - y] = 0x"
-        << reinterpret_cast<intptr_t>(view.metadata().addressOf_z()) -
-               reinterpret_cast<intptr_t>(view.metadata().addressOf_y())
-        << "  [id - z] = 0x"
-        << reinterpret_cast<intptr_t>(view.metadata().addressOf_id()) -
-               reinterpret_cast<intptr_t>(view.metadata().addressOf_z())
-        << "  [r - id] = 0x"
-        << reinterpret_cast<intptr_t>(view.metadata().addressOf_r()) -
-               reinterpret_cast<intptr_t>(view.metadata().addressOf_id())
-        << "  [flags - r] = 0x"
-        << reinterpret_cast<intptr_t>(view.metadata().addressOf_flags()) -
-               reinterpret_cast<intptr_t>(view.metadata().addressOf_r())
-        << "  [m - flags] = 0x"
-        << reinterpret_cast<intptr_t>(view.metadata().addressOf_m()) -
-               reinterpret_cast<intptr_t>(view.metadata().addressOf_flags());
-
-    const portabletest::Matrix matrix{{1, 2, 3, 4, 5, 6}, {2, 4, 6, 8, 10, 12}, {3, 6, 9, 12, 15, 18}};
-    const portabletest::Array flags = {{6, 4, 2, 0}};
-
-    assert(view.r() == 1.);
-    for (int32_t i = 0; i < view.metadata().size(); ++i) {
-      auto vi = view[i];
-      assert(vi.x() == 0.);
-      assert(vi.y() == 0.);
-      assert(vi.z() == 0.);
-      assert(vi.id() == i);
-      assert(vi.flags() == flags);
-      assert(vi.m() == matrix * i);
-    }
-  }
-
-  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
-    edm::ParameterSetDescription desc;
-    desc.add<edm::InputTag>("source");
-    descriptions.addWithDefaultLabel(desc);
-  }
-
-private:
-  const edm::InputTag source_;
-  const edm::EDGetTokenT<cudatest::TestHostCollection> token_;
-};
-
-#include "FWCore/Framework/interface/MakerMacros.h"
-DEFINE_FWK_MODULE(TestPortableAnalyzer);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCPU.cc
deleted file mode 100644
index 19d1611c6b899..0000000000000
--- a/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCPU.cc
+++ /dev/null
@@ -1,45 +0,0 @@
-#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/Frameworkfwd.h"
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
-#include "FWCore/ServiceRegistry/interface/Service.h"
-#include "FWCore/Utilities/interface/EDGetToken.h"
-#include "FWCore/Utilities/interface/InputTag.h"
-#include "FWCore/Utilities/interface/StreamID.h"
-
-#include "TestAlgo.h"
-
-class TestPortableProducerCPU : public edm::stream::EDProducer<> {
-public:
-  TestPortableProducerCPU(edm::ParameterSet const& config)
-      : hostToken_{produces()}, size_{config.getParameter<int32_t>("size")} {}
-
-  void produce(edm::Event& event, edm::EventSetup const&) override {
-    // run the algorithm
-    cudatest::TestHostCollection hostProduct{size_};
-    algo_.fill(hostProduct);
-
-    // put the product into the event
-    event.emplace(hostToken_, std::move(hostProduct));
-  }
-
-  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
-    edm::ParameterSetDescription desc;
-    desc.add<int32_t>("size");
-    descriptions.addWithDefaultLabel(desc);
-  }
-
-private:
-  const edm::EDPutTokenT<cudatest::TestHostCollection> hostToken_;
-  const int32_t size_;
-
-  // implementation of the algorithm
-  cudatest::TestAlgo algo_;
-};
-
-#include "FWCore/Framework/interface/MakerMacros.h"
-DEFINE_FWK_MODULE(TestPortableProducerCPU);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCUDA.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCUDA.cc
deleted file mode 100644
index debafd6b874dd..0000000000000
--- a/HeterogeneousCore/CUDATest/plugins/TestPortableProducerCUDA.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-#include "CUDADataFormats/Common/interface/Product.h"
-#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/Frameworkfwd.h"
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
-#include "FWCore/Utilities/interface/EDGetToken.h"
-#include "FWCore/Utilities/interface/InputTag.h"
-#include "FWCore/Utilities/interface/StreamID.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-
-#include "TestAlgo.h"
-
-class TestPortableProducerCUDA : public edm::stream::EDProducer<> {
-public:
-  TestPortableProducerCUDA(edm::ParameterSet const& config)
-      : deviceToken_{produces()}, size_{config.getParameter<int32_t>("size")} {}
-
-  void produce(edm::Event& event, edm::EventSetup const&) override {
-    // create a context based on the EDM stream number
-    cms::cuda::ScopedContextProduce ctx(event.streamID());
-
-    // run the algorithm, potentially asynchronously
-    cudatest::TestDeviceCollection deviceProduct{size_, ctx.stream()};
-    algo_.fill(deviceProduct, ctx.stream());
-
-    // put the asynchronous product into the event without waiting
-    ctx.emplace(event, deviceToken_, std::move(deviceProduct));
-  }
-
-  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
-    edm::ParameterSetDescription desc;
-    desc.add<int32_t>("size");
-    descriptions.addWithDefaultLabel(desc);
-  }
-
-private:
-  const edm::EDPutTokenT<cms::cuda::Product<cudatest::TestDeviceCollection>> deviceToken_;
-  const int32_t size_;
-
-  // implementation of the algorithm
-  cudatest::TestAlgo algo_;
-};
-
-#include "FWCore/Framework/interface/MakerMacros.h"
-DEFINE_FWK_MODULE(TestPortableProducerCUDA);
diff --git a/HeterogeneousCore/CUDATest/plugins/TestPortableTranscriber.cc b/HeterogeneousCore/CUDATest/plugins/TestPortableTranscriber.cc
deleted file mode 100644
index 4b251ad720d0e..0000000000000
--- a/HeterogeneousCore/CUDATest/plugins/TestPortableTranscriber.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-#include "CUDADataFormats/Common/interface/Product.h"
-#include "CUDADataFormats/PortableTestObjects/interface/TestDeviceCollection.h"
-#include "CUDADataFormats/PortableTestObjects/interface/TestHostCollection.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/EventSetup.h"
-#include "FWCore/Framework/interface/Frameworkfwd.h"
-#include "FWCore/Framework/interface/stream/EDProducer.h"
-#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
-#include "FWCore/Utilities/interface/EDGetToken.h"
-#include "FWCore/Utilities/interface/InputTag.h"
-#include "FWCore/Utilities/interface/StreamID.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
-
-class TestPortableTranscriber : public edm::stream::EDProducer<edm::ExternalWork> {
-public:
-  TestPortableTranscriber(edm::ParameterSet const& config)
-      : deviceToken_{consumes(config.getParameter<edm::InputTag>("source"))}, hostToken_{produces()} {}
-
-  void acquire(edm::Event const& event, edm::EventSetup const& setup, edm::WaitingTaskWithArenaHolder task) override {
-    // create a context reusing the same device and queue as the producer of the input collection
-    auto const& input = event.get(deviceToken_);
-    cms::cuda::ScopedContextAcquire ctx{input, std::move(task)};
-
-    cudatest::TestDeviceCollection const& deviceProduct = ctx.get(input);
-
-    // allocate a host product based on the metadata of the device product
-    hostProduct_ = cudatest::TestHostCollection{deviceProduct->metadata().size(), ctx.stream()};
-
-    // copy the content of the device product to the host product
-    cms::cuda::copyAsync(hostProduct_.buffer(), deviceProduct.const_buffer(), deviceProduct.bufferSize(), ctx.stream());
-
-    // do not wait for the asynchronous operation to complete
-  }
-
-  void produce(edm::Event& event, edm::EventSetup const&) override {
-    // produce() is called once the asynchronous operation has completed, so there is no need for an explicit wait
-    event.emplace(hostToken_, std::move(hostProduct_));
-  }
-
-  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
-    edm::ParameterSetDescription desc;
-    desc.add<edm::InputTag>("source");
-    descriptions.addWithDefaultLabel(desc);
-  }
-
-private:
-  const edm::EDGetTokenT<cms::cuda::Product<cudatest::TestDeviceCollection>> deviceToken_;
-  const edm::EDPutTokenT<cudatest::TestHostCollection> hostToken_;
-
-  // hold the output product between acquire() and produce()
-  cudatest::TestHostCollection hostProduct_;
-};
-
-#include "FWCore/Framework/interface/MakerMacros.h"
-DEFINE_FWK_MODULE(TestPortableTranscriber);
diff --git a/HeterogeneousCore/CUDATest/src/classes.h b/HeterogeneousCore/CUDATest/src/classes.h
index 57ad4cf679733..342d32f3dc871 100644
--- a/HeterogeneousCore/CUDATest/src/classes.h
+++ b/HeterogeneousCore/CUDATest/src/classes.h
@@ -1,6 +1,11 @@
+/*
+A simple data product used to test that the framework handles correctly the case of
+edm::Wrapper<T> where
+  - T has a dictionary
+  - edm::Wrapper<T> does not have a dictionary
+  - the corresponding classes.h file includes CUDA headers
+*/
 #include <cuda_runtime.h>
 
-#include "CUDADataFormats/Common/interface/Product.h"
 #include "DataFormats/Common/interface/Wrapper.h"
 #include "HeterogeneousCore/CUDATest/interface/MissingDictionaryCUDAObject.h"
-#include "HeterogeneousCore/CUDATest/interface/Thing.h"
diff --git a/HeterogeneousCore/CUDATest/src/classes_def.xml b/HeterogeneousCore/CUDATest/src/classes_def.xml
index fa11949429242..73ea428a7d7b3 100644
--- a/HeterogeneousCore/CUDATest/src/classes_def.xml
+++ b/HeterogeneousCore/CUDATest/src/classes_def.xml
@@ -1,7 +1,4 @@
 <lcgdict>
-    <class name="cms::cuda::Product<cms::cudatest::Thing>" persistent="false"/>
-    <class name="edm::Wrapper<cms::cuda::Product<cms::cudatest::Thing>>" persistent="false"/>
-
     <!--
     A simple data product used to test that the framework handles correctly the case of
     edm::Wrapper<T> where
diff --git a/HeterogeneousCore/CUDATest/test/BuildFile.xml b/HeterogeneousCore/CUDATest/test/BuildFile.xml
index 27f8a2bf91bdd..57b349035b1bf 100644
--- a/HeterogeneousCore/CUDATest/test/BuildFile.xml
+++ b/HeterogeneousCore/CUDATest/test/BuildFile.xml
@@ -1,12 +1,4 @@
 <iftool name="cuda-gcc-support">
-  <bin file="test*.cc" name="testHeterogeneousCoreCUDATest">
-    <use name="FWCore/TestProcessor"/>
-    <use name="HeterogeneousCore/CUDACore"/>
-    <use name="catch2"/>
-    <!-- dependence only to trigger the unit test when NVIDIA GPU is (expected to be) present -->
-    <use name="cuda"/>
-  </bin>
-
   <!--
   Test that the framework handles correctly the case of edm::Wrapper<T> where
     - T has a dictionary
diff --git a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc
deleted file mode 100644
index 75f9c438979c6..0000000000000
--- a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc
+++ /dev/null
@@ -1,93 +0,0 @@
-#include "catch2/catch_all.hpp"
-#include "FWCore/TestProcessor/interface/TestProcessor.h"
-#include "FWCore/Utilities/interface/Exception.h"
-
-#include "CUDADataFormats/Common/interface/Product.h"
-#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "HeterogeneousCore/CUDATest/interface/Thing.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-
-#include <iostream>
-
-static constexpr auto s_tag = "[TestCUDAProducerGPUFirst]";
-
-TEST_CASE("Standard checks of TestCUDAProducerGPUFirst", s_tag) {
-  const std::string baseConfig{
-      R"_(from FWCore.TestProcessor.TestProcess import *
-process = TestProcess()
-process.load("HeterogeneousCore.CUDACore.ProcessAcceleratorCUDA_cfi")
-process.toTest = cms.EDProducer("TestCUDAProducerGPUFirst")
-process.moduleToTest(process.toTest)
-)_"};
-
-  edm::test::TestProcessor::Config config{baseConfig};
-  SECTION("base configuration is OK") { REQUIRE_NOTHROW(edm::test::TestProcessor(config)); }
-
-  SECTION("No event data") {
-    // Calls produce(), so don't call without a GPU
-    if (not cms::cudatest::testDevices()) {
-      return;
-    }
-    edm::test::TestProcessor tester(config);
-
-    REQUIRE_NOTHROW(tester.test());
-  }
-
-  SECTION("beginJob and endJob only") {
-    edm::test::TestProcessor tester(config);
-
-    REQUIRE_NOTHROW(tester.testBeginAndEndJobOnly());
-  }
-
-  SECTION("Run with no LuminosityBlocks") {
-    edm::test::TestProcessor tester(config);
-
-    REQUIRE_NOTHROW(tester.testRunWithNoLuminosityBlocks());
-  }
-
-  SECTION("LuminosityBlock with no Events") {
-    edm::test::TestProcessor tester(config);
-
-    REQUIRE_NOTHROW(tester.testLuminosityBlockWithNoEvents());
-  }
-}
-
-TEST_CASE("TestCUDAProducerGPUFirst operation", s_tag) {
-  const std::string baseConfig{
-      R"_(from FWCore.TestProcessor.TestProcess import *
-process = TestProcess()
-process.load("HeterogeneousCore.CUDACore.ProcessAcceleratorCUDA_cfi")
-process.toTest = cms.EDProducer("TestCUDAProducerGPUFirst")
-process.moduleToTest(process.toTest)
-)_"};
-  edm::test::TestProcessor::Config config{baseConfig};
-
-  if (not cms::cudatest::testDevices()) {
-    return;
-  }
-
-  constexpr int defaultDevice = 0;
-
-  SECTION("Produce") {
-    edm::test::TestProcessor tester{config};
-    auto event = tester.test();
-    auto prod = event.get<cms::cuda::Product<cms::cudatest::Thing> >();
-    REQUIRE(prod->device() == defaultDevice);
-    auto ctx = cms::cuda::ScopedContextProduce(*prod);
-    const cms::cudatest::Thing& thing = ctx.get(*prod);
-    const float* data = thing.get();
-    REQUIRE(data != nullptr);
-
-    float firstElements[10];
-    cudaCheck(cudaMemcpyAsync(firstElements, data, sizeof(float) * 10, cudaMemcpyDeviceToHost, prod->stream()));
-
-    std::cout << "Synchronizing with CUDA stream" << std::endl;
-    auto stream = prod->stream();
-    cudaCheck(cudaStreamSynchronize(stream));
-    std::cout << "Synchronized" << std::endl;
-    REQUIRE(firstElements[0] == 0.f);
-    REQUIRE(firstElements[1] == 1.f);
-    REQUIRE(firstElements[9] == 9.f);
-  }
-};
diff --git a/HeterogeneousCore/CUDATest/test/test_main.cc b/HeterogeneousCore/CUDATest/test/test_main.cc
deleted file mode 100644
index b3ea47c29c7a7..0000000000000
--- a/HeterogeneousCore/CUDATest/test/test_main.cc
+++ /dev/null
@@ -1,2 +0,0 @@
-#define CATCH_CONFIG_MAIN
-#include "catch2/catch_all.hpp"