99# include < cuda_runtime_api.h>
1010#endif
1111# include " cuBQL/math/box.h"
12- # include < map >
12+ # include < mutex >
1313
1414namespace cuBQL {
1515
@@ -64,23 +64,26 @@ namespace cuBQL {
6464 associated. If you pass the default stream 0, your mallocs will
6565 always happen on the first device! */
6666 struct AsyncGpuMemoryResource final : GpuMemoryResource {
67- AsyncGpuMemoryResource (int devID )
67+ AsyncGpuMemoryResource ()
6868 {
69- static bool memPoolInitialized = false ;
70- if (!memPoolInitialized) {
71- CUBQL_CUDA_CALL (GetDeviceCount (&numDevices));
72- for (int i=0 ;i<numDevices;i++) {
69+ // Configure the default memory pool on all visible devices the first
70+ // time an instance of this object is created. The operation is thread-safe.
71+ static std::once_flag s_initializedFlag;
72+
73+ std::call_once (s_initializedFlag, [this ]() {
74+ CUBQL_CUDA_CALL (GetDeviceCount (&s_numDevices));
75+
76+ for (int iDevice = 0 ; iDevice < s_numDevices; iDevice++) {
7377 cudaMemPool_t mempool;
74- cudaDeviceGetDefaultMemPool ( &mempool, devID );
78+ CUBQL_CUDA_CALL ( DeviceGetDefaultMemPool ( &mempool, iDevice) );
7579 uint64_t threshold = UINT64_MAX;
76- cudaMemPoolSetAttribute ( mempool, cudaMemPoolAttrReleaseThreshold, &threshold);
80+ CUBQL_CUDA_CALL ( MemPoolSetAttribute ( mempool, cudaMemPoolAttrReleaseThreshold, &threshold) );
7781 }
78- memPoolInitialized = true ;;
79- }
82+ } );
8083 }
8184 void malloc (void ** ptr, size_t size, cudaStream_t s) override {
8285#ifndef NDEBUG
83- if (numDevices > 1 && s == 0 )
86+ if (s_numDevices > 1 && s == 0 )
8487 std::cerr << " @cuBQL: warning; async memory allocator used with default stream."
8588 << std::endl;
8689#endif
@@ -90,19 +93,17 @@ namespace cuBQL {
9093 {
9194 CUBQL_CUDA_CALL (FreeAsync (ptr, s));
9295 }
93- int numDevices = 0 ;
96+
97+ private:
98+ static inline int s_numDevices = 0 ;
9499 };
95100
96101 /* by default let's use cuda malloc async, which is much better and
97102 faster than regular malloc; but that's available on cuda 11, so
98103 let's add a fall back for older cuda's, too */
99104 inline GpuMemoryResource &defaultGpuMemResource () {
100- static std::map<int ,AsyncGpuMemoryResource*> asyncMemPerDevice;
101- int devID;
102- CUBQL_CUDA_CALL (GetDevice (&devID));
103- if (asyncMemPerDevice[devID] == nullptr )
104- asyncMemPerDevice[devID] = new AsyncGpuMemoryResource (devID);
105- return *asyncMemPerDevice[devID];
105+ static AsyncGpuMemoryResource memResource;
106+ return memResource;
106107 }
107108#else
108109 inline GpuMemoryResource &defaultGpuMemResource () {
0 commit comments