Skip to content

Commit e3824a0

Browse files
committed
Merge branch 'iw/openmp'
2 parents 60cbf1f + 48c6704 commit e3824a0

9 files changed

Lines changed: 1164 additions & 23 deletions

File tree

CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ cmake_policy(SET CMP0048 NEW)
77
set(CMAKE_BUILD_TYPE_INIT "Release")
88
project(cuBQL VERSION 1.2.0 LANGUAGES C CXX)
99

10+
if (CUBQL_OMP)
11+
set(CUBQL_DISABLE_CUDA ON)
12+
endif()
1013
if (CUBQL_DISABLE_CUDA)
1114
message("#cuBQL: CUDA _DISABLED_ by user request")
1215
set(CUBQL_HAVE_CUDA OFF)
@@ -136,3 +139,5 @@ add_subdirectory(cuBQL)
136139
if (NOT CUBQL_IS_SUBPROJECT)
137140
add_subdirectory(samples)
138141
endif()
142+
143+
#add_subdirectory(testing)

cuBQL/builder/cuda/sm_builder.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -515,10 +515,10 @@ namespace cuBQL {
515515
while (true) {
516516
CUBQL_CUDA_CALL(MemcpyAsync(&numNodes,&buildState->numNodes,
517517
sizeof(numNodes),cudaMemcpyDeviceToHost,s));
518-
CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s));
519-
CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent));
520518
if (numNodes == numDone)
521519
break;
520+
CUBQL_CUDA_CALL(EventRecord(stateDownloadedEvent,s));
521+
CUBQL_CUDA_CALL(EventSynchronize(stateDownloadedEvent));
522522
#if CUBQL_PROFILE
523523
t_nodePass[pass].sync_start();
524524
#endif
@@ -529,7 +529,7 @@ namespace cuBQL {
529529
#if CUBQL_PROFILE
530530
t_nodePass[pass].sync_stop();
531531
t_primPass[pass].sync_start();
532-
#endif
532+
#endif
533533
numDone = numNodes;
534534

535535
// #if 1

cuBQL/builder/omp.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
// SPDX-License-Identifier: Apache-2.0

#pragma once

namespace cuBQL {
  namespace omp {
    /*! opaque per-device context for the OpenMP-offload backend;
        forward-declared here so the API below can be used without
        pulling in omp/common.h */
    struct Context;

    /*! refit an existing BVH to updated primitive boxes.
        NOTE(review): declaration only — the implementation is pulled
        in via the omp/refit.h include below */
    template<typename T, int D>
    void refit(BinaryBVH<T,D> &bvh,
               const box_t<T,D> *boxes,
               Context *ctx);
  }
}

#include "cuBQL/builder/omp/refit.h"
#include "cuBQL/builder/omp/spatialMedian.h"
19+

cuBQL/builder/omp/AtomicBox.h

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,168 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
2+
// CORPORATION & AFFILIATES. All rights reserved.
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
#pragma once
6+
7+
#include "cuBQL/builder/omp/common.h"
8+
9+
10+
namespace cuBQL {
11+
namespace omp {
12+
13+
template<typename box_t>
14+
struct AtomicBox : public box_t {
15+
16+
inline void set_empty()
17+
{
18+
*(box_t *)this = box_t();
19+
}
20+
};
21+
22+
template<typename T>
23+
inline void atomic_min(T *ptr, T v);
24+
template<typename T>
25+
inline void atomic_max(T *ptr, T v);
26+
27+
/*! iw - note: this implementation of atomic min/max via atomic
28+
compare-exchange (CAS); which is cetainly not optimal on any
29+
sort of modern GPU - but it works in any C++-21 compliant
30+
compiler, so it's what we do for now */
31+
inline void atomic_min(float *ptr, float value)
32+
{
33+
#ifdef __NVCOMPILER
34+
# if 1
35+
float &mem = *ptr;
36+
if (mem <= value) return;
37+
while (1) {
38+
float wasBefore;
39+
#pragma omp atomic capture
40+
{ wasBefore = mem; mem = value; }
41+
if (wasBefore >= value) break;
42+
value = wasBefore;
43+
}
44+
# else
45+
float current = *(volatile float *)ptr;
46+
while (current > value) {
47+
bool wasChanged
48+
= ((std::atomic<int>*)ptr)
49+
->compare_exchange_weak((int&)current,(int&)value);
50+
if (wasChanged) break;
51+
}
52+
# endif
53+
#else
54+
float &x = *ptr;
55+
#pragma omp atomic compare
56+
if (x > value) { x = value; }
57+
// float t;
58+
// #pragma omp atomic capture
59+
// { t = *ptr; *ptr = std::min(t,value); }
60+
#endif
61+
}
62+
63+
/*! iw - note: this implementation of atomic min/max via atomic
64+
compare-exchange (CAS); which is cetainly not optimal on any
65+
sort of modern GPU - but it works in any C++-21 compliant
66+
compiler, so it's what we do for now */
67+
inline void atomic_max(float *ptr, float value)
68+
{
69+
#ifdef __NVCOMPILER
70+
# if 1
71+
float &mem = *ptr;
72+
if (mem >= value) return;
73+
while (1) {
74+
float wasBefore;
75+
#pragma omp atomic capture
76+
{ wasBefore = mem; mem = value; }
77+
if (wasBefore <= value) break;
78+
value = wasBefore;
79+
}
80+
# else
81+
float current = *(volatile float *)ptr;
82+
while (current < value) {
83+
bool wasChanged
84+
= ((std::atomic<int>*)ptr)
85+
->compare_exchange_weak((int&)current,(int&)value);
86+
if (wasChanged) break;
87+
}
88+
# endif
89+
#else
90+
float &x = *ptr;
91+
#pragma omp atomic compare
92+
if (x < value) { x = value; }
93+
// float t;
94+
// #pragma omp atomic capture
95+
// { t = *ptr; *ptr = std::max(t,value); }
96+
#endif
97+
}
98+
99+
template<typename T, int D>
100+
inline void v_atomic_min(vec_t<T,D> *ptr, vec_t<T,D> v);
101+
template<typename T, int D>
102+
inline void v_atomic_max(vec_t<T,D> *ptr, vec_t<T,D> v);
103+
104+
105+
template<typename T>
106+
inline void v_atomic_min(vec_t<T,2> *ptr, vec_t<T,2> v)
107+
{
108+
atomic_min(&ptr->x,v.x);
109+
atomic_min(&ptr->y,v.y);
110+
}
111+
112+
template<typename T>
113+
inline void v_atomic_min(vec_t<T,3> *ptr, vec_t<T,3> v)
114+
{
115+
atomic_min(&ptr->x,v.x);
116+
atomic_min(&ptr->y,v.y);
117+
atomic_min(&ptr->z,v.z);
118+
}
119+
120+
template<typename T>
121+
inline void v_atomic_min(vec_t<T,4> *ptr, vec_t<T,4> v)
122+
{
123+
atomic_min(&ptr->x,v.x);
124+
atomic_min(&ptr->y,v.y);
125+
atomic_min(&ptr->z,v.z);
126+
atomic_min(&ptr->w,v.w);
127+
}
128+
129+
template<typename T>
130+
inline void v_atomic_max(vec_t<T,2> *ptr, vec_t<T,2> v)
131+
{
132+
atomic_max(&ptr->x,v.x);
133+
atomic_max(&ptr->y,v.y);
134+
}
135+
136+
template<typename T>
137+
inline void v_atomic_max(vec_t<T,3> *ptr, vec_t<T,3> v)
138+
{
139+
atomic_max(&ptr->x,v.x);
140+
atomic_max(&ptr->y,v.y);
141+
atomic_max(&ptr->z,v.z);
142+
}
143+
144+
template<typename T>
145+
inline void v_atomic_max(vec_t<T,4> *ptr, vec_t<T,4> v)
146+
{
147+
atomic_max(&ptr->x,v.x);
148+
atomic_max(&ptr->y,v.y);
149+
atomic_max(&ptr->z,v.z);
150+
atomic_max(&ptr->w,v.w);
151+
}
152+
153+
template<typename box_t>
154+
inline void atomic_grow(AtomicBox<box_t> &ab, typename box_t::vec_t P)
155+
{
156+
v_atomic_min(&ab.lower,P);
157+
v_atomic_max(&ab.upper,P);
158+
}
159+
160+
template<typename box_t>
161+
inline void atomic_grow(AtomicBox<box_t> &ab, box_t B)
162+
{
163+
v_atomic_min(&ab.lower,B.lower);
164+
v_atomic_max(&ab.upper,B.upper);
165+
}
166+
167+
}
168+
}

cuBQL/builder/omp/common.h

Lines changed: 162 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,162 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
2+
// CORPORATION & AFFILIATES. All rights reserved.
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
#pragma once
6+
7+
#include "cuBQL/bvh.h"
8+
#include <omp.h>
9+
#include <atomic>
10+
11+
namespace cuBQL {
12+
namespace omp {
13+
14+
    /*! wrapper around OpenMP target-offload memory management for one
        device: allocation, free, and host<->device copies */
    struct Context {
      /*! create a context for the given OpenMP device number; asserts
          that such a device exists */
      Context(int gpuID);

      /*! allocate numBytes of device memory on this context's device */
      void *alloc(size_t numBytes);

      /*! typed device allocation of Nelements elements of T */
      template<typename T>
      void alloc(T *&d_data, size_t Nelements);

      /*! allocate device memory for Nelements of T, then copy the
          given host data into it */
      template<typename T>
      void alloc_and_upload(T *&d_data, const T *h_data, size_t Nelements);

      /*! copy Nelements of host data into already-allocated device
          memory */
      template<typename T>
      void upload(T *d_data, const T *h_data, size_t Nelements);

      /*! convenience overload: allocate and upload a whole std::vector */
      template<typename T>
      void alloc_and_upload(T *&d_data, const std::vector<T> &h_vector);

      /*! copy N elements of device data into a newly created
          host-side vector */
      template<typename T>
      std::vector<T> download_vector(const T *d_data, size_t N);

      /*! copy a single value from device to host */
      template<typename T>
      void download(T &h_value, T *d_value);

      /*! release device memory previously obtained from alloc() */
      void free(void *);

      int gpuID;  //!< OpenMP device number this context operates on
      int hostID; //!< the OpenMP "initial device" (i.e., the host)
    };
42+
43+
struct Kernel {
44+
inline int workIdx() const { return _workIdx; }
45+
int _workIdx;
46+
};
47+
48+
inline uint32_t atomicAdd(uint32_t *ptr, uint32_t inc)
49+
{
50+
#ifdef __NVCOMPILER
51+
return (uint32_t)((std::atomic<int> *)ptr)->fetch_add((int)inc);
52+
#else
53+
uint32_t t;
54+
#pragma omp atomic capture
55+
{ t = *ptr; *ptr += inc; }
56+
// return ((std::atomic<int> *)p_value)->fetch_add(inc);
57+
return t;
58+
#endif
59+
}
60+
61+
62+
// ##################################################################
63+
// IMPLEMENTATION SECTION
64+
// ##################################################################
65+
Context::Context(int gpuID)
66+
: gpuID(gpuID),
67+
hostID(omp_get_initial_device())
68+
{
69+
assert(gpuID < omp_get_num_devices());
70+
printf("#cuBQL:omp:Context(gpu=%i/%i,host=%i)\n",
71+
gpuID,omp_get_num_devices(),hostID);
72+
}
73+
74+
void *Context::alloc(size_t numBytes)
75+
{ return omp_target_alloc(numBytes,gpuID); }
76+
77+
template<typename T> inline
78+
void Context::upload(T *d_data,
79+
const T *h_data,
80+
size_t N)
81+
{
82+
assert(d_data);
83+
omp_target_memcpy(d_data,h_data,N*sizeof(T),
84+
0,0,gpuID,hostID);
85+
}
86+
87+
template<typename T> inline
88+
void Context::alloc_and_upload(T *&d_data,
89+
const T *h_data,
90+
size_t N)
91+
{
92+
printf("target_alloc N %li gpu %i\n",N,gpuID);
93+
d_data = (T *)omp_target_alloc(N*sizeof(T),gpuID);
94+
printf("ptr %p\n",d_data);
95+
upload(d_data,h_data,N);
96+
}
97+
98+
template<typename T> inline
99+
void Context::alloc_and_upload(T *&d_data,
100+
const std::vector<T> &h_vector)
101+
{ alloc_and_upload(d_data,h_vector.data(),h_vector.size()); }
102+
103+
template<typename T>
104+
std::vector<T> Context::download_vector(const T *d_data, size_t N)
105+
{
106+
PRINT(N);
107+
PRINT(d_data);
108+
109+
std::vector<T> out(N);
110+
PRINT(out.data());
111+
PRINT(sizeof(T));
112+
omp_target_memcpy(out.data(),d_data,N*sizeof(T),
113+
0,0,hostID,gpuID);
114+
return out;
115+
}
116+
117+
inline void Context::free(void *ptr)
118+
{ omp_target_free(ptr,gpuID); }
119+
120+
template<typename T> inline
121+
void Context::alloc(T *&d_data, size_t N)
122+
{
123+
d_data = (T*)omp_target_alloc(N*sizeof(T),gpuID);
124+
}
125+
126+
// template<typename T> inline
127+
// void Context::alloc_and_upload(T *&d_data,
128+
// const T *h_data,
129+
// size_t N)
130+
// {
131+
// alloc(d_data,N);
132+
// upload(d_data,h_data,N);
133+
// }
134+
135+
// template<typename T> inline
136+
// void Context::alloc_and_upload(T *&d_data,
137+
// const std::vector<T> &h_vector)
138+
// {
139+
// alloc(d_data,h_vector.size());
140+
// upload(d_data,h_vector);
141+
// }
142+
143+
// template<typename T> inline
144+
// std::vector<T> Context::download_vector(const T *d_data,
145+
// size_t N)
146+
// {
147+
// std::vector<T> vec(N);
148+
// omp_target_memcpy(vec.data(),d_data,N*sizeof(T),
149+
// 0,0,hostID,gpuID);
150+
// return vec;
151+
// }
152+
153+
template<typename T>
154+
inline void Context::download(T &h_value, T *d_value)
155+
{
156+
omp_target_memcpy(&h_value,d_value,sizeof(T),
157+
0,0,hostID,gpuID);
158+
}
159+
160+
161+
} // ::cuBQL::omp
162+
} // ::cuBQL

0 commit comments

Comments
 (0)