Merge pull request #30 from hleberre-sx/main

iwald-nvidia · web-flow · commit eb3c9eef64a4 · 2026-03-25T13:46:44.000-06:00
Thread-safe default `AsyncGpuMemoryResource`
diff --git a/cuBQL/builder/cuda.h b/cuBQL/builder/cuda.h
@@ -9,7 +9,7 @@
 # include <cuda_runtime_api.h>
 #endif
 # include "cuBQL/math/box.h"
-# include <map>
+# include <mutex>
 
 namespace cuBQL {
 
@@ -64,23 +64,26 @@ namespace cuBQL {
      associated. If you pass the default stream 0, your mallocs will
      always happen on the first device! */
   struct AsyncGpuMemoryResource final : GpuMemoryResource {
-    AsyncGpuMemoryResource(int devID)
+    AsyncGpuMemoryResource()
     {
-      static bool memPoolInitialized = false;
-      if (!memPoolInitialized) {
-        CUBQL_CUDA_CALL(GetDeviceCount(&numDevices));
-        for (int i=0;i<numDevices;i++) {
+      // Configure the default memory pool on all visible devices the first
+      // time an instance of this object is created. The operation is thread-safe.
+      static std::once_flag s_initializedFlag;
+
+      std::call_once(s_initializedFlag, [this]() {
+        CUBQL_CUDA_CALL(GetDeviceCount(&s_numDevices));
+
+        for (int iDevice = 0; iDevice < s_numDevices; iDevice++) {
           cudaMemPool_t mempool;
-          cudaDeviceGetDefaultMemPool(&mempool, devID);
+          CUBQL_CUDA_CALL(DeviceGetDefaultMemPool(&mempool, iDevice));
           uint64_t threshold = UINT64_MAX;
-          cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);
+          CUBQL_CUDA_CALL(MemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold));
         }
-        memPoolInitialized = true;;
-      }
+      } );
     }
     void malloc(void** ptr, size_t size, cudaStream_t s) override {
 #ifndef NDEBUG
-      if (numDevices > 1 && s == 0)
+      if (s_numDevices > 1 && s == 0)
         std::cerr << "@cuBQL: warning; async memory allocator used with default stream."
                   << std::endl;
 #endif
@@ -90,19 +93,17 @@ namespace cuBQL {
     {
       CUBQL_CUDA_CALL(FreeAsync(ptr, s));
     }
-    int numDevices = 0;
+
+  private:
+    static inline int s_numDevices = 0;
   };
 
   /* by default let's use cuda malloc async, which is much better and
      faster than regular malloc; but that's available on cuda 11, so
      let's add a fall back for older cuda's, too */
   inline GpuMemoryResource &defaultGpuMemResource() {
-    static std::map<int,AsyncGpuMemoryResource*> asyncMemPerDevice;
-    int devID;
-    CUBQL_CUDA_CALL(GetDevice(&devID));
-    if (asyncMemPerDevice[devID] == nullptr)
-      asyncMemPerDevice[devID] = new AsyncGpuMemoryResource(devID);
-    return *asyncMemPerDevice[devID];
+    static AsyncGpuMemoryResource memResource;
+    return memResource;
   }
 #else
   inline GpuMemoryResource &defaultGpuMemResource() {