Skip to content

Commit 101f42e

Browse files
authored
Merge pull request NVIDIA#23 from ingowald/iw/nbody
Support for adding (refitted) additional 'aggregate data' to BinaryBVH'es, and a tempalte for N-body style traversals
2 parents e50bb54 + bb1ce7f commit 101f42e

14 files changed

Lines changed: 763 additions & 143 deletions

File tree

cuBQL/builder/cuda.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,16 +186,32 @@ namespace cuBQL {
186186
cudaStream_t s=0,
187187
GpuMemoryResource &memResource=defaultGpuMemResource());
188188

189+
// ------------------------------------------------------------------
190+
/*! refit a previously built boxes to a new set of bounding
191+
boxes. The order of boxes in the array boxes[] has to
192+
correspond to that used when building the tree. */
193+
// ------------------------------------------------------------------
194+
template<typename T, int D>
195+
void refit(BinaryBVH<T,D> &bvh,
196+
const box_t<T,D> *boxes,
197+
cudaStream_t s=0,
198+
GpuMemoryResource &memResource=defaultGpuMemResource());
199+
200+
// ------------------------------------------------------------------
189201
/*! frees the bvh.nodes[] and bvh.primIDs[] memory allocated when
190202
building the BVH. this assumes that the 'memResource' provided
191203
here was the same that was used during building */
204+
// ------------------------------------------------------------------
192205
template<typename T, int D>
193206
void free(BinaryBVH<T,D> &bvh,
194207
cudaStream_t s=0,
195208
GpuMemoryResource& memResource=defaultGpuMemResource());
209+
210+
// ------------------------------------------------------------------
196211
/*! frees the bvh.nodes[] and bvh.primIDs[] memory allocated when
197212
building the BVH. this assumes that the 'memResource' provided
198213
here was the same that was used during building */
214+
// ------------------------------------------------------------------
199215
template<typename T, int D, int W>
200216
void free(WideBVH<T,D,W> &bvh,
201217
cudaStream_t s=0,

cuBQL/builder/cuda/builder_common.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,12 @@
1414
#include <float.h>
1515
#include <limits.h>
1616

17+
#ifdef __HIPCC__
18+
namespace cub {
19+
using namespace hipcub;
20+
}
21+
#endif
22+
1723
namespace cuBQL {
1824
namespace gpuBuilder_impl {
1925

cuBQL/builder/cuda/gpu_builder.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ namespace cuBQL {
5353
buildConfig.makeLeafThreshold = 1;
5454
gpuBuilder_impl::build(bvh,boxes,numBoxes,buildConfig,s,memResource);
5555
}
56-
gpuBuilder_impl::refit(bvh,boxes,s,memResource);
56+
cuBQL::cuda::refit(bvh,boxes,s,memResource);
5757
}
5858

5959
namespace cuda {
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
2+
// CORPORATION & AFFILIATES. All rights reserved.
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
#pragma once
6+
7+
namespace cuBQL {
8+
namespace gpuBuilder_impl {
9+
10+
//#define CUBQL_PROFILE 1
11+
12+
#if CUBQL_PROFILE
13+
struct Profile {
14+
void setName(std::string name, int sub=-1)
15+
{
16+
if (sub >= 0) {
17+
char suff[1000];
18+
sprintf(suff,"[%2i]",sub);
19+
this->name = name+suff;
20+
} else
21+
this->name = name;
22+
}
23+
~Profile() { ping(); }
24+
25+
void start() {
26+
t0 = getCurrentTime();
27+
}
28+
void sync_start() {
29+
CUBQL_CUDA_SYNC_CHECK();
30+
start();
31+
}
32+
void sync_stop() {
33+
CUBQL_CUDA_SYNC_CHECK();
34+
stop();
35+
}
36+
void stop(bool do_ping = false) {
37+
double t1 = getCurrentTime();
38+
t_sum += (t1-t0);
39+
count ++;
40+
if (do_ping) ping();
41+
}
42+
void ping()
43+
{
44+
if (count)
45+
std::cout << "#PROF " << name << " = " << prettyDouble(t_sum / count) << std::endl;
46+
}
47+
double t0 = 0.;
48+
double t_sum = 0.;
49+
int count = 0;
50+
std::string name = "";
51+
};
52+
#endif
53+
54+
}
55+
}

cuBQL/builder/cuda/radix.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -742,7 +742,7 @@ namespace cuBQL {
742742
// ==================================================================
743743
// done. all we need to do now is refit the bboxes
744744
// ==================================================================
745-
gpuBuilder_impl::refit(bvh,boxes,s,memResource);
745+
cuBQL::cuda::refit(bvh,boxes,s,memResource);
746746
}
747747
}
748748

cuBQL/builder/cuda/rebinMortonBuilder.h

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1459,20 +1459,24 @@ namespace cuBQL {
14591459
// ==================================================================
14601460
// done. all we need to do now is refit the bboxes
14611461
// ==================================================================
1462-
gpuBuilder_impl::refit(bvh,boxes,s,memResource);
1462+
cuBQL::cuda::refit(bvh,boxes,s,memResource);
14631463
}
14641464
}
1465-
1465+
14661466
namespace cuda {
14671467
template<typename T, int D>
14681468
void rebinRadixBuilder(BinaryBVH<T,D> &bvh,
1469-
const box_t<T,D> *boxes,
1470-
uint32_t numPrims,
1471-
BuildConfig buildConfig,
1472-
cudaStream_t s,
1473-
GpuMemoryResource &memResource)
1474-
{ rebinRadixBuilder_impl::build<T,D>(bvh,boxes,numPrims,buildConfig,s,memResource); }
1475-
}
1476-
}
1469+
const box_t<T,D> *boxes,
1470+
uint32_t numPrims,
1471+
BuildConfig buildConfig,
1472+
cudaStream_t s,
1473+
GpuMemoryResource &memResource)
1474+
{
1475+
rebinRadixBuilder_impl::build<T,D>
1476+
(bvh,boxes,numPrims,buildConfig,s,memResource);
1477+
}
1478+
1479+
} // ::cuBQL::cuda
1480+
} // ::cuBQL
14771481
#endif
14781482

cuBQL/builder/cuda/refit.h

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
2+
// CORPORATION & AFFILIATES. All rights reserved.
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
#pragma once
6+
7+
#include "cuBQL/builder/cuda/builder_common.h"
8+
9+
namespace cuBQL {
10+
namespace cuda {
11+
12+
template<typename T, int D>
13+
__global__ void
14+
refit_init(const typename BinaryBVH<T,D>::Node *nodes,
15+
uint32_t *refitData,
16+
int numNodes)
17+
{
18+
const int nodeID = threadIdx.x+blockIdx.x*blockDim.x;
19+
if (nodeID == 1 || nodeID >= numNodes) return;
20+
if (nodeID < 2)
21+
refitData[0] = 0;
22+
const auto &node = nodes[nodeID];
23+
if (node.admin.count) return;
24+
25+
refitData[node.admin.offset+0] = nodeID << 1;
26+
refitData[node.admin.offset+1] = nodeID << 1;
27+
}
28+
29+
template<typename T, int D>
30+
__global__
31+
void refit_run(BinaryBVH<T,D> bvh,
32+
uint32_t *refitData,
33+
const box_t<T,D> *boxes)
34+
{
35+
int nodeID = threadIdx.x+blockIdx.x*blockDim.x;
36+
if (nodeID == 1 || nodeID >= bvh.numNodes) return;
37+
38+
typename BinaryBVH<T,D>::Node *node = &bvh.nodes[nodeID];
39+
if (node->admin.count == 0)
40+
// this is a inner node - exit
41+
return;
42+
43+
box_t<T,D> bounds; bounds.set_empty();
44+
for (int i=0;i<node->admin.count;i++) {
45+
const box_t<T,D> primBox = boxes[bvh.primIDs[node->admin.offset+i]];
46+
bounds.lower = min(bounds.lower,primBox.lower);
47+
bounds.upper = max(bounds.upper,primBox.upper);
48+
}
49+
50+
int parentID = (refitData[nodeID] >> 1);
51+
while (true) {
52+
node->bounds = bounds;
53+
__threadfence();
54+
if (node == bvh.nodes)
55+
break;
56+
57+
uint32_t refitBits = atomicAdd(&refitData[parentID],1u);
58+
if ((refitBits & 1) == 0)
59+
// we're the first one - let other one do it
60+
break;
61+
62+
nodeID = parentID;
63+
node = &bvh.nodes[parentID];
64+
parentID = (refitBits >> 1);
65+
66+
typename BinaryBVH<T,D>::Node l = bvh.nodes[node->admin.offset+0];
67+
typename BinaryBVH<T,D>::Node r = bvh.nodes[node->admin.offset+1];
68+
bounds.lower = min(l.bounds.lower,r.bounds.lower);
69+
bounds.upper = max(l.bounds.upper,r.bounds.upper);
70+
}
71+
}
72+
73+
template<typename T, int D>
74+
void refit(BinaryBVH<T,D> &bvh,
75+
const box_t<T,D> *boxes,
76+
cudaStream_t s,
77+
GpuMemoryResource &memResource)
78+
{
79+
int numNodes = bvh.numNodes;
80+
81+
uint32_t *refitData = 0;
82+
memResource.malloc((void**)&refitData,numNodes*sizeof(*refitData),s);
83+
84+
refit_init<T,D><<<divRoundUp(numNodes,1024),1024,0,s>>>
85+
(bvh.nodes,refitData,numNodes);
86+
refit_run<<<divRoundUp(numNodes,32),32,0,s>>>
87+
(bvh,refitData,boxes);
88+
memResource.free((void*)refitData,s);
89+
// we're not syncing here - let APP do that
90+
}
91+
92+
} // ::cuBQL::gpuBuilder_impl
93+
} // ::cuBQL
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
2+
// CORPORATION & AFFILIATES. All rights reserved.
3+
// SPDX-License-Identifier: Apache-2.0
4+
5+
#pragma once
6+
7+
#include "cuBQL/builder/cuda/builder_common.h"
8+
#include "cuBQL/builder/cuda/refit.h"
9+
10+
namespace cuBQL {
11+
namespace cuda {
12+
13+
// ------------------------------------------------------------------
14+
// INTERFACE
15+
// ------------------------------------------------------------------
16+
17+
template<
18+
typename T,
19+
int D,
20+
typename AggregateNodeData
21+
// ,
22+
// typename AggregateFct
23+
>
24+
void refit_aggregate(BinaryBVH<T,D> bvh,
25+
AggregateNodeData *d_aggregateNodeData,
26+
void (*aggregateFct)(bvh3f,
27+
AggregateNodeData[],
28+
int),
29+
cudaStream_t s =0,
30+
GpuMemoryResource &memResource
31+
=defaultGpuMemResource());
32+
33+
template<typename T, int D,
34+
typename AggregateNodeData>
35+
__global__
36+
void refit_aggregate_run(BinaryBVH<T,D> bvh,
37+
AggregateNodeData *aggregateNodeData,
38+
void (*aggregateFct)(bvh3f,
39+
AggregateNodeData[],
40+
int),
41+
uint32_t *refitData)
42+
{
43+
int nodeID = threadIdx.x+blockIdx.x*blockDim.x;
44+
if (nodeID == 1 || nodeID >= bvh.numNodes) return;
45+
46+
typename BinaryBVH<T,D>::Node *node = &bvh.nodes[nodeID];
47+
if (node->admin.count == 0)
48+
// this is a inner node - exit
49+
return;
50+
51+
int parentID = (refitData[nodeID] >> 1);
52+
while (true) {
53+
aggregateFct(bvh,aggregateNodeData,nodeID);
54+
__threadfence();
55+
if (node == bvh.nodes)
56+
break;
57+
58+
uint32_t refitBits = atomicAdd(&refitData[parentID],1u);
59+
if ((refitBits & 1) == 0)
60+
// we're the first one - let other one do it
61+
break;
62+
63+
nodeID = parentID;
64+
node = &bvh.nodes[parentID];
65+
parentID = (refitBits >> 1);
66+
}
67+
}
68+
69+
70+
71+
// ------------------------------------------------------------------
72+
// IMPLEMENTATION
73+
// ------------------------------------------------------------------
74+
template<
75+
typename T,
76+
int D,
77+
typename AggregateNodeData>
78+
void refit_aggregate(BinaryBVH<T,D> bvh,
79+
AggregateNodeData *d_aggregateNodeData,
80+
void (*aggregateFct)(bvh3f,
81+
AggregateNodeData[],
82+
int),
83+
cudaStream_t s,
84+
GpuMemoryResource &memResource)
85+
{
86+
int numNodes = bvh.numNodes;
87+
88+
uint32_t *refitData = 0;
89+
memResource.malloc((void**)&refitData,numNodes*sizeof(*refitData),s);
90+
refit_init<T,D><<<divRoundUp(numNodes,1024),1024,0,s>>>
91+
(bvh.nodes,refitData,numNodes);
92+
refit_aggregate_run<<<divRoundUp(numNodes,32),32,0,s>>>
93+
(bvh,d_aggregateNodeData,aggregateFct,refitData);
94+
memResource.free((void*)refitData,s);
95+
// we're not syncing here - let APP do that
96+
}
97+
}
98+
}

cuBQL/builder/cuda/sah_builder.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -542,7 +542,7 @@ namespace cuBQL {
542542
_FREE(buildState,s,memResource);
543543
_FREE(sahBins,s,memResource);
544544

545-
gpuBuilder_impl::refit(bvh,boxes,s,memResource);
545+
cuBQL::cuda::refit(bvh,boxes,s,memResource);
546546
}
547547

548548
template<>

0 commit comments

Comments
 (0)