ingowald
diff --git a/‎cuBQL/builder/cuda.h‎
Lines changed: 16 additions & 0 deletions b/‎cuBQL/builder/cuda.h‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎cuBQL/builder/cuda/builder_common.h‎
Lines changed: 6 additions & 0 deletions b/‎cuBQL/builder/cuda/builder_common.h‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎cuBQL/builder/cuda/gpu_builder.h‎
Lines changed: 1 addition & 1 deletion b/‎cuBQL/builder/cuda/gpu_builder.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cuBQL/builder/cuda/profiling_helper.h‎
Lines changed: 55 additions & 0 deletions b/‎cuBQL/builder/cuda/profiling_helper.h‎
Lines changed: 55 additions & 0 deletions
diff --git a/‎cuBQL/builder/cuda/radix.h‎
Lines changed: 1 addition & 1 deletion b/‎cuBQL/builder/cuda/radix.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cuBQL/builder/cuda/rebinMortonBuilder.h‎
Lines changed: 14 additions & 10 deletions b/‎cuBQL/builder/cuda/rebinMortonBuilder.h‎
Lines changed: 14 additions & 10 deletions
diff --git a/‎cuBQL/builder/cuda/refit.h‎
Lines changed: 93 additions & 0 deletions b/‎cuBQL/builder/cuda/refit.h‎
Lines changed: 93 additions & 0 deletions
diff --git a/‎cuBQL/builder/cuda/refit_aggregate.h‎
Lines changed: 98 additions & 0 deletions b/‎cuBQL/builder/cuda/refit_aggregate.h‎
Lines changed: 98 additions & 0 deletions
diff --git a/‎cuBQL/builder/cuda/sah_builder.h‎
Lines changed: 1 addition & 1 deletion b/‎cuBQL/builder/cuda/sah_builder.h‎
Lines changed: 1 addition & 1 deletion
@@ -186,16 +186,32 @@ namespace cuBQL {
                            cudaStream_t       s=0,
                            GpuMemoryResource &memResource=defaultGpuMemResource());
 
+    // ------------------------------------------------------------------
+    /*! refits a previously built BVH to a new set of bounding boxes:
+        only the bounds stored in the BVH's nodes are recomputed. The
+        order of boxes in the array boxes[] has to correspond to that
+        used when building the tree.
+
+        \param bvh         the BVH whose node bounds get updated
+        \param boxes       the new primitive bounding boxes; these are
+                           read from within device kernels
+        \param s           stream that the refit work is issued to; the
+                           call itself does not synchronize on it
+        \param memResource resource used to allocate and free the
+                           temporary memory needed during the refit */
+    // ------------------------------------------------------------------
+    template<typename T, int D>
+    void refit(BinaryBVH<T,D>    &bvh,
+               const box_t<T,D>  *boxes,
+               cudaStream_t       s=0,
+               GpuMemoryResource &memResource=defaultGpuMemResource());
+    
+    // ------------------------------------------------------------------
     /*! frees the bvh.nodes[] and bvh.primIDs[] memory allocated when
       building the BVH. this assumes that the 'memResource' provided
       here was the same that was used during building */
+    // ------------------------------------------------------------------
     template<typename T, int D>
     void free(BinaryBVH<T,D> &bvh,
               cudaStream_t      s=0,
               GpuMemoryResource& memResource=defaultGpuMemResource());
+    
+    // ------------------------------------------------------------------
     /*! frees the bvh.nodes[] and bvh.primIDs[] memory allocated when
       building the BVH. this assumes that the 'memResource' provided
       here was the same that was used during building */
+    // ------------------------------------------------------------------
     template<typename T, int D, int W>
     void free(WideBVH<T,D,W> &bvh,
               cudaStream_t      s=0,
 
@@ -14,6 +14,12 @@
 #include <float.h>
 #include <limits.h>
 
+#ifdef __HIPCC__
+// when compiling with hipcc, pull everything from hipcub:: into a
+// namespace called cub:: - presumably so that builder code written
+// against cub:: names can be compiled unchanged for HIP
+namespace cub {
+  using namespace hipcub;
+}
+#endif
+
 namespace cuBQL {
   namespace gpuBuilder_impl {
 
 
@@ -53,7 +53,7 @@ namespace cuBQL {
         buildConfig.makeLeafThreshold = 1;
       gpuBuilder_impl::build(bvh,boxes,numBoxes,buildConfig,s,memResource);
     }
-    gpuBuilder_impl::refit(bvh,boxes,s,memResource);
+    cuBQL::cuda::refit(bvh,boxes,s,memResource);
   }
 
   namespace cuda {
 
@@ -0,0 +1,55 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
+// CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+namespace cuBQL {
+  namespace gpuBuilder_impl {
+
+    //#define CUBQL_PROFILE 1
+
+#if CUBQL_PROFILE
+    /*! simple host-side timer for profiling: accumulates the time
+        spent between matching start()/stop() calls and prints the
+        average time per such interval. Time stamps are whatever
+        getCurrentTime() returns */
+    struct Profile {
+      /*! sets the name that ping() prints for this profile; if 'sub'
+          is non-negative it gets appended to the name as a "[sub]"
+          suffix */
+      void setName(std::string name, int sub=-1)
+      {
+        if (sub >= 0) {
+          // bounded formatting: '[', at most 10 digits, ']' and the
+          // terminating 0 fit comfortably into 16 bytes
+          char suff[16];
+          snprintf(suff,sizeof(suff),"[%2i]",sub);
+          this->name = name+suff;
+        } else
+          this->name = name;
+      }
+      /*! prints the average one last time when the profile is destroyed */
+      ~Profile() { ping(); }
+      
+      /*! records the start time of the next measured interval */
+      void start() {
+        t0 = getCurrentTime();
+      }
+      /*! same as start(), but runs CUBQL_CUDA_SYNC_CHECK() first */
+      void sync_start() {
+        CUBQL_CUDA_SYNC_CHECK();
+        start();
+      }
+      /*! same as stop(), but runs CUBQL_CUDA_SYNC_CHECK() first */
+      void sync_stop() {
+        CUBQL_CUDA_SYNC_CHECK();
+        stop();
+      }
+      /*! ends the current interval: adds the time since the last
+          start() to the running sum, bumps the interval count, and
+          optionally prints the current average right away */
+      void stop(bool do_ping = false) {
+        double t1 = getCurrentTime();
+        t_sum += (t1-t0);
+        count ++;
+        if (do_ping) ping();
+      }
+      /*! prints "#PROF <name> = <average time per interval>"; prints
+          nothing as long as no interval has been completed */
+      void ping()
+      {
+        if (count)
+          std::cout << "#PROF " << name << " = " << prettyDouble(t_sum / count) << std::endl;
+      }
+      /*! time stamp taken by the most recent start() */
+      double t0 = 0.;
+      /*! sum of the lengths of all completed intervals */
+      double t_sum = 0.;
+      /*! number of completed intervals */
+      int count = 0;
+      /*! name printed by ping() */
+      std::string name = "";
+    };
+#endif
+    
+  }
+}
@@ -742,7 +742,7 @@ namespace cuBQL {
       // ==================================================================
       // done. all we need to do now is refit the bboxes
       // ==================================================================
-      gpuBuilder_impl::refit(bvh,boxes,s,memResource);
+      cuBQL::cuda::refit(bvh,boxes,s,memResource);
     }
   }
 
 
@@ -1459,20 +1459,24 @@ namespace cuBQL {
       // ==================================================================
       // done. all we need to do now is refit the bboxes
       // ==================================================================
-      gpuBuilder_impl::refit(bvh,boxes,s,memResource);
+      cuBQL::cuda::refit(bvh,boxes,s,memResource);
     }
   }
-
+  
   namespace cuda {
     template<typename T, int D>
     void rebinRadixBuilder(BinaryBVH<T,D>    &bvh,
-                            const box_t<T,D>  *boxes,
-                            uint32_t           numPrims,
-                            BuildConfig        buildConfig,
-                            cudaStream_t       s,
-                            GpuMemoryResource &memResource)
-    { rebinRadixBuilder_impl::build<T,D>(bvh,boxes,numPrims,buildConfig,s,memResource); }
-  }
-}
+                           const box_t<T,D>  *boxes,
+                           uint32_t           numPrims,
+                           BuildConfig        buildConfig,
+                           cudaStream_t       s,
+                           GpuMemoryResource &memResource)
+    {
+      rebinRadixBuilder_impl::build<T,D>
+        (bvh,boxes,numPrims,buildConfig,s,memResource);
+    }
+    
+  } // ::cuBQL::cuda
+} // ::cuBQL
 #endif
 
@@ -0,0 +1,93 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
+// CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "cuBQL/builder/cuda/builder_common.h"
+
+namespace cuBQL {
+  namespace cuda {
+
+    /*! set-up kernel for the refit kernels, launched with one thread
+        per node slot. Clears refitData[0] and, for every inner node,
+        writes (nodeID << 1) into the refitData[] entries of its two
+        children - so every child entry holds its parent's ID in the
+        upper bits, with the lowest bit cleared */
+    template<typename T, int D>
+    __global__ void
+    refit_init(const typename BinaryBVH<T,D>::Node *nodes,
+               uint32_t              *refitData,
+               int numNodes)
+    {
+      const int nodeID = blockIdx.x*blockDim.x + threadIdx.x;
+
+      // threads past the last node have nothing to do; node slot 1 is
+      // skipped here just as in the kernels that consume refitData[]
+      if (nodeID >= numNodes || nodeID == 1)
+        return;
+
+      // the root's entry is not written by any parent - reset it here
+      if (nodeID < 2)
+        refitData[0] = 0;
+
+      // leaves (count != 0) have no children whose entries need tagging
+      const auto &self = nodes[nodeID];
+      if (self.admin.count != 0)
+        return;
+
+      // both children get the same value: this node's ID shifted up
+      // by one bit
+      uint32_t *const childEntries = refitData + self.admin.offset;
+      childEntries[0] = nodeID << 1;
+      childEntries[1] = nodeID << 1;
+    }
+    
+    /*! refit kernel, launched with one thread per node slot. Threads
+        that land on an inner node exit right away; every leaf thread
+        computes its leaf's bounds from boxes[] and then walks up
+        towards the root. At each parent the lowest bit of
+        refitData[parent] acts as an arrival counter that is bumped
+        with atomicAdd: the first thread to arrive stops there, the
+        second one merges the two children's bounds and carries on
+        upwards. Expects refitData[] to have been set up by
+        refit_init(), i.e., each entry holding (parentID << 1) */
+    template<typename T, int D>
+    __global__
+    void refit_run(BinaryBVH<T,D> bvh,
+                   uint32_t *refitData,
+                   const box_t<T,D> *boxes)
+    {
+      int nodeID = threadIdx.x+blockIdx.x*blockDim.x;
+      // skip threads past the last node, as well as node slot 1
+      if (nodeID == 1 || nodeID >= bvh.numNodes) return;
+      
+      typename BinaryBVH<T,D>::Node *node = &bvh.nodes[nodeID];
+      if (node->admin.count == 0)
+        // this is an inner node - exit
+        return;
+
+      // leaf: its bounds are the union of the boxes of all its prims;
+      // for a leaf, admin.offset is where its prims start in primIDs[]
+      box_t<T,D> bounds; bounds.set_empty();
+      for (int i=0;i<node->admin.count;i++) {
+        const box_t<T,D> primBox = boxes[bvh.primIDs[node->admin.offset+i]];
+        bounds.lower = min(bounds.lower,primBox.lower);
+        bounds.upper = max(bounds.upper,primBox.upper);
+      }
+
+      // upper bits of this node's entry are the ID of its parent
+      int parentID = (refitData[nodeID] >> 1);
+      while (true) {
+        // write this node's bounds, and fence before signalling our
+        // arrival at the parent through the atomic below
+        node->bounds = bounds;
+        __threadfence();
+        if (node == bvh.nodes)
+          // just wrote the root (first node in the array) - done
+          break;
+
+        // bump the parent's arrival bit; the returned (old) value also
+        // still carries the ID of the parent's own parent
+        uint32_t refitBits = atomicAdd(&refitData[parentID],1u);
+        if ((refitBits & 1) == 0)
+          // we're the first one - let other one do it
+          break;
+
+        // we're the second one to arrive: step up to the parent ...
+        nodeID   = parentID;
+        node     = &bvh.nodes[parentID];
+        parentID = (refitBits >> 1);
+        
+        // ... and merge the bounds of its two children (for an inner
+        // node, admin.offset is the index of the first child)
+        typename BinaryBVH<T,D>::Node l = bvh.nodes[node->admin.offset+0];
+        typename BinaryBVH<T,D>::Node r = bvh.nodes[node->admin.offset+1];
+        bounds.lower = min(l.bounds.lower,r.bounds.lower);
+        bounds.upper = max(l.bounds.upper,r.bounds.upper);
+      }
+    }
+
+    /*! recomputes the bounds of all nodes of 'bvh' from the given
+        primitive boxes. Allocates one uint32_t of temporary memory
+        per node from 'memResource', runs refit_init() and refit_run()
+        on stream 's', and hands the temporary memory back to
+        'memResource'. Does not synchronize on the stream.
+
+        \param bvh         BVH whose node bounds get rewritten
+        \param boxes       primitive boxes, read from within the kernels
+        \param s           stream used for allocation, kernels, and free
+        \param memResource provider of the temporary refit memory */
+    template<typename T, int D>
+    void refit(BinaryBVH<T,D>    &bvh,
+               const box_t<T,D>  *boxes,
+               cudaStream_t       s,
+               GpuMemoryResource &memResource)
+    {
+      const int numNodes = bvh.numNodes;
+      // an empty BVH has nothing to refit - and launching the kernels
+      // below with a grid of zero blocks would be an invalid launch
+      // configuration
+      if (numNodes <= 0)
+        return;
+      
+      // one entry per node: parent ID plus an arrival bit, see refit_init()
+      uint32_t *refitData = 0;
+      memResource.malloc((void**)&refitData,numNodes*sizeof(*refitData),s);
+      
+      refit_init<T,D><<<divRoundUp(numNodes,1024),1024,0,s>>>
+        (bvh.nodes,refitData,numNodes);
+      refit_run<<<divRoundUp(numNodes,32),32,0,s>>>
+        (bvh,refitData,boxes);
+      memResource.free((void*)refitData,s);
+      // we're not syncing here - let APP do that
+    }
+    
+  } // ::cuBQL::cuda
+} // ::cuBQL
@@ -0,0 +1,98 @@
+// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA
+// CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+#pragma once
+
+#include "cuBQL/builder/cuda/builder_common.h"
+#include "cuBQL/builder/cuda/refit.h"
+
+namespace cuBQL {
+  namespace cuda {
+
+    // ------------------------------------------------------------------
+    // INTERFACE
+    // ------------------------------------------------------------------
+    
+    template<
+      typename T,
+      int D,
+      typename AggregateNodeData
+      // ,
+      // typename AggregateFct
+      >
+    void refit_aggregate(BinaryBVH<T,D> bvh,
+                         AggregateNodeData *d_aggregateNodeData,
+                         void (*aggregateFct)(bvh3f,
+                                              AggregateNodeData[],
+                                              int),
+                         cudaStream_t       s =0,
+                         GpuMemoryResource &memResource
+                         =defaultGpuMemResource());
+    
+    template<typename T, int D,
+             typename AggregateNodeData>
+    __global__
+    void refit_aggregate_run(BinaryBVH<T,D> bvh,
+                             AggregateNodeData *aggregateNodeData,
+                         void (*aggregateFct)(bvh3f,
+                                              AggregateNodeData[],
+                                              int),
+                             uint32_t *refitData)
+    {
+      int nodeID = threadIdx.x+blockIdx.x*blockDim.x;
+      if (nodeID == 1 || nodeID >= bvh.numNodes) return;
+      
+      typename BinaryBVH<T,D>::Node *node = &bvh.nodes[nodeID];
+      if (node->admin.count == 0)
+        // this is a inner node - exit
+        return;
+      
+      int parentID = (refitData[nodeID] >> 1);
+      while (true) {
+        aggregateFct(bvh,aggregateNodeData,nodeID);
+        __threadfence();
+        if (node == bvh.nodes)
+          break;
+
+        uint32_t refitBits = atomicAdd(&refitData[parentID],1u);
+        if ((refitBits & 1) == 0)
+          // we're the first one - let other one do it
+          break;
+
+        nodeID   = parentID;
+        node     = &bvh.nodes[parentID];
+        parentID = (refitBits >> 1);
+      }
+    }
+
+    
+    
+    // ------------------------------------------------------------------
+    // IMPLEMENTATION
+    // ------------------------------------------------------------------
+    template<
+      typename T,
+      int D,
+      typename AggregateNodeData>
+    void refit_aggregate(BinaryBVH<T,D> bvh,
+                         AggregateNodeData *d_aggregateNodeData,
+                         void (*aggregateFct)(bvh3f,
+                                              AggregateNodeData[],
+                                              int),
+                         cudaStream_t       s,
+                         GpuMemoryResource &memResource)
+    {
+      int numNodes = bvh.numNodes;
+      
+      uint32_t *refitData = 0;
+      memResource.malloc((void**)&refitData,numNodes*sizeof(*refitData),s);
+      refit_init<T,D><<<divRoundUp(numNodes,1024),1024,0,s>>>
+        (bvh.nodes,refitData,numNodes);
+      refit_aggregate_run<<<divRoundUp(numNodes,32),32,0,s>>>
+        (bvh,d_aggregateNodeData,aggregateFct,refitData);
+      memResource.free((void*)refitData,s);
+      // we're not syncing here - let APP do that
+    }
+  }
+}
@@ -542,7 +542,7 @@ namespace cuBQL {
       _FREE(buildState,s,memResource);
       _FREE(sahBins,s,memResource);
 
-      gpuBuilder_impl::refit(bvh,boxes,s,memResource);
+      cuBQL::cuda::refit(bvh,boxes,s,memResource);
     }
 
     template<>
Original file line number	Diff line number	Diff line change
`@@ -53,7 +53,7 @@ namespace cuBQL {`
`53`	`53`	`buildConfig.makeLeafThreshold = 1;`
`54`	`54`	`gpuBuilder_impl::build(bvh,boxes,numBoxes,buildConfig,s,memResource);`
`55`	`55`	`}`
`56`		`- gpuBuilder_impl::refit(bvh,boxes,s,memResource);`
	`56`	`+ cuBQL::cuda::refit(bvh,boxes,s,memResource);`
`57`	`57`	`}`
`58`	`58`
`59`	`59`	`namespace cuda {`
Original file line number	Diff line number	Diff line change
`@@ -742,7 +742,7 @@ namespace cuBQL {`
`742`	`742`	`// ==================================================================`
`743`	`743`	`// done. all we need to do now is refit the bboxes`
`744`	`744`	`// ==================================================================`
`745`		`- gpuBuilder_impl::refit(bvh,boxes,s,memResource);`
	`745`	`+ cuBQL::cuda::refit(bvh,boxes,s,memResource);`
`746`	`746`	`}`
`747`	`747`	`}`
`748`	`748`
Original file line number	Diff line number	Diff line change
`@@ -542,7 +542,7 @@ namespace cuBQL {`
`542`	`542`	`_FREE(buildState,s,memResource);`
`543`	`543`	`_FREE(sahBins,s,memResource);`
`544`	`544`
`545`		`- gpuBuilder_impl::refit(bvh,boxes,s,memResource);`
	`545`	`+ cuBQL::cuda::refit(bvh,boxes,s,memResource);`
`546`	`546`	`}`
`547`	`547`
`548`	`548`	`template<>`