Skip to content

Commit eb3c9ee

Browse files
authored
Merge pull request #30 from hleberre-sx/main
Thread-safe default `AsyncGpuMemoryResource`
2 parents 27128b4 + 60893e2 commit eb3c9ee

File tree

1 file changed

+19
-18
lines changed

1 file changed

+19
-18
lines changed

cuBQL/builder/cuda.h

Lines changed: 19 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
# include <cuda_runtime_api.h>
1010
#endif
1111
# include "cuBQL/math/box.h"
12-
# include <map>
12+
# include <mutex>
1313

1414
namespace cuBQL {
1515

@@ -64,23 +64,26 @@ namespace cuBQL {
6464
associated. If you pass the default stream 0, your mallocs will
6565
always happen on the first device! */
6666
struct AsyncGpuMemoryResource final : GpuMemoryResource {
67-
AsyncGpuMemoryResource(int devID)
67+
AsyncGpuMemoryResource()
6868
{
69-
static bool memPoolInitialized = false;
70-
if (!memPoolInitialized) {
71-
CUBQL_CUDA_CALL(GetDeviceCount(&numDevices));
72-
for (int i=0;i<numDevices;i++) {
69+
// Configure the default memory pool on all visible devices the first
70+
// time an instance of this object is created. The operation is thread-safe.
71+
static std::once_flag s_initializedFlag;
72+
73+
std::call_once(s_initializedFlag, [this]() {
74+
CUBQL_CUDA_CALL(GetDeviceCount(&s_numDevices));
75+
76+
for (int iDevice = 0; iDevice < s_numDevices; iDevice++) {
7377
cudaMemPool_t mempool;
74-
cudaDeviceGetDefaultMemPool(&mempool, devID);
78+
CUBQL_CUDA_CALL(DeviceGetDefaultMemPool(&mempool, iDevice));
7579
uint64_t threshold = UINT64_MAX;
76-
cudaMemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold);
80+
CUBQL_CUDA_CALL(MemPoolSetAttribute(mempool, cudaMemPoolAttrReleaseThreshold, &threshold));
7781
}
78-
memPoolInitialized = true;;
79-
}
82+
} );
8083
}
8184
void malloc(void** ptr, size_t size, cudaStream_t s) override {
8285
#ifndef NDEBUG
83-
if (numDevices > 1 && s == 0)
86+
if (s_numDevices > 1 && s == 0)
8487
std::cerr << "@cuBQL: warning; async memory allocator used with default stream."
8588
<< std::endl;
8689
#endif
@@ -90,19 +93,17 @@ namespace cuBQL {
9093
{
9194
CUBQL_CUDA_CALL(FreeAsync(ptr, s));
9295
}
93-
int numDevices = 0;
96+
97+
private:
98+
static inline int s_numDevices = 0;
9499
};
95100

96101
/* by default let's use cuda malloc async, which is much better and
97102
faster than regular malloc; but that's only available from cuda 11 on, so
98103
let's add a fallback for older cuda versions, too */
99104
inline GpuMemoryResource &defaultGpuMemResource() {
100-
static std::map<int,AsyncGpuMemoryResource*> asyncMemPerDevice;
101-
int devID;
102-
CUBQL_CUDA_CALL(GetDevice(&devID));
103-
if (asyncMemPerDevice[devID] == nullptr)
104-
asyncMemPerDevice[devID] = new AsyncGpuMemoryResource(devID);
105-
return *asyncMemPerDevice[devID];
105+
static AsyncGpuMemoryResource memResource;
106+
return memResource;
106107
}
107108
#else
108109
inline GpuMemoryResource &defaultGpuMemResource() {

0 commit comments

Comments
 (0)