Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
include/libsgm_config.h
build/
build/
# build directories (any arch/platform)
build*/
9 changes: 7 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,14 @@ option(ENABLE_SAMPLES "Build samples" OFF)
option(ENABLE_TESTS "Test library" OFF)
option(LIBSGM_SHARED "Build a shared library" OFF)
option(BUILD_OPENCV_WRAPPER "Make library compatible with cv::Mat and cv::cuda::GpuMat of OpenCV" OFF)
option(USE_HIP "Build the GPU kernels with ROCm/HIP instead of CUDA" OFF)

if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES "52;61;72;75;86")
# For HIP the architecture is auto-detected by enable_language(HIP) in src/, with
# a gfx90a fallback there; -DCMAKE_HIP_ARCHITECTURES overrides it.
if(NOT USE_HIP)
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
set(CMAKE_CUDA_ARCHITECTURES "52;61;72;75;86")
endif()
endif()

project(libSGM VERSION 3.1.0)
Expand Down
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,19 @@ $ cmake ../ # Several options available
$ make
```

### Building for AMD GPUs (ROCm/HIP)

libSGM also builds on AMD GPUs through ROCm/HIP. With a ROCm toolchain installed, configure with `-DUSE_HIP=ON`:

```
$ cmake .. -DUSE_HIP=ON -DCMAKE_PREFIX_PATH=/opt/rocm
$ make
```

If the ROCm install is not already on `CMAKE_PREFIX_PATH`, point `-DCMAKE_PREFIX_PATH` at it (`/opt/rocm` by default) so `find_package(hip)` can locate the HIP config; otherwise configuration fails with `hip_DIR-NOTFOUND`.

The GPU architecture is auto-detected from the build machine; target another AMD GPU with `-DCMAKE_HIP_ARCHITECTURES=<arch>` (e.g. `gfx90a`, `gfx1100`). The CUDA build path is unchanged (`USE_HIP=OFF`).

## Sample Execution
```
$ pwd
Expand Down
33 changes: 30 additions & 3 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,26 @@ set(LIBSGM_INCLUDE_DIR ${LIBSGM_ROOT_DIR}/include)

# create project
set(PROJECT_NAME sgm)
project(${PROJECT_NAME} LANGUAGES CXX CUDA)
if(USE_HIP)
project(${PROJECT_NAME} LANGUAGES CXX)
enable_language(HIP)
# enable_language(HIP) auto-detects the installed GPU(s) into
# CMAKE_HIP_ARCHITECTURES; fall back to gfx90a only if nothing was detected.
# -DCMAKE_HIP_ARCHITECTURES=<arch> overrides both.
if(NOT CMAKE_HIP_ARCHITECTURES)
set(CMAKE_HIP_ARCHITECTURES "gfx90a")
endif()
list(REMOVE_DUPLICATES CMAKE_HIP_ARCHITECTURES)
else()
project(${PROJECT_NAME} LANGUAGES CXX CUDA)
endif()

# dependent packages
find_package(CUDAToolkit REQUIRED)
if(USE_HIP)
find_package(hip REQUIRED)
else()
find_package(CUDAToolkit REQUIRED)
endif()

if(BUILD_OPENCV_WRAPPER)
find_package(OpenCV REQUIRED core)
Expand All @@ -26,7 +42,18 @@ add_library(${PROJECT_NAME} ${SGM_LIB_TYPE})
target_sources(${PROJECT_NAME} PRIVATE ${SRCS})
target_include_directories(${PROJECT_NAME} PRIVATE ${LIBSGM_INCLUDE_DIR} $<$<BOOL:${BUILD_OPENCV_WRAPPER}>:${OpenCV_INCLUDE_DIRS}>)
target_compile_features(${PROJECT_NAME} PRIVATE cxx_std_17)
target_link_libraries(${PROJECT_NAME} PUBLIC CUDA::cudart $<$<BOOL:${BUILD_OPENCV_WRAPPER}>:${OpenCV_LIBS}>)

if(USE_HIP)
# Compile the .cu (and .cpp, which include hip/hip_runtime.h) with the HIP
# toolchain; the device runtime is implicit, no CUDA::cudart link needed.
file(GLOB CU_SRCS ./*.cu)
set_source_files_properties(${CU_SRCS} PROPERTIES LANGUAGE HIP)
set_target_properties(${PROJECT_NAME} PROPERTIES HIP_ARCHITECTURES "${CMAKE_HIP_ARCHITECTURES}" POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(${PROJECT_NAME} PRIVATE USE_HIP)
target_link_libraries(${PROJECT_NAME} PUBLIC hip::host $<$<BOOL:${BUILD_OPENCV_WRAPPER}>:${OpenCV_LIBS}>)
else()
target_link_libraries(${PROJECT_NAME} PUBLIC CUDA::cudart $<$<BOOL:${BUILD_OPENCV_WRAPPER}>:${OpenCV_LIBS}>)
endif()
set_target_properties(${PROJECT_NAME} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${LIBSGM_INCLUDE_DIR})

target_compile_options(${PROJECT_NAME} PRIVATE
Expand Down
2 changes: 1 addition & 1 deletion src/census_transform.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ limitations under the License.

#include "internal.h"

#include <cuda_runtime.h>
#include "cuda_to_hip.h"

#include "types.h"
#include "host_utility.h"
Expand Down
2 changes: 1 addition & 1 deletion src/check_consistency.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ limitations under the License.

#include "internal.h"

#include <cuda_runtime.h>
#include "cuda_to_hip.h"

#include "constants.h"
#include "host_utility.h"
Expand Down
30 changes: 30 additions & 0 deletions src/constants.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,37 @@ limitations under the License.
namespace sgm
{

// WARP_SIZE is the true hardware wavefront width and parameterizes the whole
// aggregation/WTA design: shuffle-subgroup partitioning, the WTA per-lane data
// layout (REDUCTION_PER_THREAD = MAX_DISPARITY / WARP_SIZE), and the device-side
// block sizing (BLOCK_SIZE = WARP_SIZE * N). On CUDA the wavefront is 32; on HIP
// it is 64 on wave64 GCN (gfx8/gfx9, e.g. gfx90a) and 32 on RDNA (gfx10/11).
//
// The DEVICE value is keyed per arch off the __GFX*__ macros, so a multi-arch
// build (e.g. gfx90a;gfx1100) compiles each device slice with its own correct
// width. The HOST must NOT drive launch geometry from a compile-time width: the
// host pass can target several arches at once and there is no single right
// answer. Launch block/grid dims are recomputed from a runtime-queried warpSize
// (host_utility.h device_warp_size()), so host and device agree on every arch.
//
// hipcc still parses the __global__ kernel bodies (and their __shared__ sizing,
// subgroup_min<WARP_SIZE>, etc.) in the host pass to emit the launch stubs, so
// WARP_SIZE needs SOME compile-time value there to parse. The fallback below is
// for that parse only; it never reaches runtime, where the device pass owns the
// real width and the host owns launch dims via device_warp_size().
#if defined(USE_HIP)
# if defined(__HIP_DEVICE_COMPILE__)
# if defined(__GFX8__) || defined(__GFX9__)
static constexpr unsigned int WARP_SIZE = 64u;
# else
static constexpr unsigned int WARP_SIZE = 32u;
# endif
# else
static constexpr unsigned int WARP_SIZE = 64u;
# endif
#else
static constexpr unsigned int WARP_SIZE = 32u;
#endif
static constexpr output_type INVALID_DISP = static_cast<output_type>(-1);

} // namespace sgm
Expand Down
2 changes: 1 addition & 1 deletion src/correct_disparity_range.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ limitations under the License.

#include "internal.h"

#include <cuda_runtime.h>
#include "cuda_to_hip.h"

#include "constants.h"
#include "host_utility.h"
Expand Down
66 changes: 37 additions & 29 deletions src/cost_aggregation.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ limitations under the License.

#include "internal.h"

#include <cuda_runtime.h>
#include "cuda_to_hip.h"

#include "device_utility.h"
#include "host_utility.h"
Expand Down Expand Up @@ -110,7 +110,8 @@ namespace vertical
{

static constexpr unsigned int DP_BLOCK_SIZE = 16u;
static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * 8u;
static constexpr unsigned int WARPS_PER_BLOCK = 8u;
static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * WARPS_PER_BLOCK;

template <typename CENSUS_TYPE, int DIRECTION, unsigned int MAX_DISPARITY>
__global__ void aggregate_vertical_path_kernel(
Expand Down Expand Up @@ -210,10 +211,11 @@ void aggregate_up2down(
cudaStream_t stream)
{
static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
const unsigned int block_size = device_warp_size() * WARPS_PER_BLOCK;
const unsigned int paths_per_block = block_size / SUBGROUP_SIZE;

const int gdim = (width + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
const int bdim = BLOCK_SIZE;
const int gdim = (width + paths_per_block - 1) / paths_per_block;
const int bdim = block_size;
aggregate_vertical_path_kernel<CENSUS_TYPE, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
dest, left, right, width, height, p1, p2, min_disp);
CUDA_CHECK(cudaGetLastError());
Expand All @@ -232,10 +234,11 @@ void aggregate_down2up(
cudaStream_t stream)
{
static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
const unsigned int block_size = device_warp_size() * WARPS_PER_BLOCK;
const unsigned int paths_per_block = block_size / SUBGROUP_SIZE;

const int gdim = (width + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
const int bdim = BLOCK_SIZE;
const int gdim = (width + paths_per_block - 1) / paths_per_block;
const int bdim = block_size;
aggregate_vertical_path_kernel<CENSUS_TYPE, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
dest, left, right, width, height, p1, p2, min_disp);
CUDA_CHECK(cudaGetLastError());
Expand Down Expand Up @@ -368,11 +371,11 @@ void aggregate_left2right(
cudaStream_t stream)
{
static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
static const unsigned int PATHS_PER_BLOCK =
BLOCK_SIZE * DP_BLOCKS_PER_THREAD / SUBGROUP_SIZE;
const unsigned int block_size = device_warp_size() * WARPS_PER_BLOCK;
const unsigned int paths_per_block = block_size * DP_BLOCKS_PER_THREAD / SUBGROUP_SIZE;

const int gdim = (height + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
const int bdim = BLOCK_SIZE;
const int gdim = (height + paths_per_block - 1) / paths_per_block;
const int bdim = block_size;
aggregate_horizontal_path_kernel<CENSUS_TYPE, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
dest, left, right, width, height, p1, p2, min_disp);
CUDA_CHECK(cudaGetLastError());
Expand All @@ -391,11 +394,11 @@ void aggregate_right2left(
cudaStream_t stream)
{
static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
static const unsigned int PATHS_PER_BLOCK =
BLOCK_SIZE * DP_BLOCKS_PER_THREAD / SUBGROUP_SIZE;
const unsigned int block_size = device_warp_size() * WARPS_PER_BLOCK;
const unsigned int paths_per_block = block_size * DP_BLOCKS_PER_THREAD / SUBGROUP_SIZE;

const int gdim = (height + PATHS_PER_BLOCK - 1) / PATHS_PER_BLOCK;
const int bdim = BLOCK_SIZE;
const int gdim = (height + paths_per_block - 1) / paths_per_block;
const int bdim = block_size;
aggregate_horizontal_path_kernel<CENSUS_TYPE, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
dest, left, right, width, height, p1, p2, min_disp);
CUDA_CHECK(cudaGetLastError());
Expand All @@ -407,7 +410,8 @@ namespace oblique
{

static constexpr unsigned int DP_BLOCK_SIZE = 16u;
static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * 8u;
static constexpr unsigned int WARPS_PER_BLOCK = 8u;
static constexpr unsigned int BLOCK_SIZE = WARP_SIZE * WARPS_PER_BLOCK;

template <typename CENSUS_TYPE, int X_DIRECTION, int Y_DIRECTION, unsigned int MAX_DISPARITY>
__global__ void aggregate_oblique_path_kernel(
Expand Down Expand Up @@ -510,10 +514,11 @@ void aggregate_upleft2downright(
cudaStream_t stream)
{
static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
const unsigned int block_size = device_warp_size() * WARPS_PER_BLOCK;
const unsigned int paths_per_block = block_size / SUBGROUP_SIZE;

const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
const int bdim = BLOCK_SIZE;
const int gdim = (width + height + paths_per_block - 2) / paths_per_block;
const int bdim = block_size;
aggregate_oblique_path_kernel<CENSUS_TYPE, 1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
dest, left, right, width, height, p1, p2, min_disp);
CUDA_CHECK(cudaGetLastError());
Expand All @@ -532,10 +537,11 @@ void aggregate_upright2downleft(
cudaStream_t stream)
{
static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
const unsigned int block_size = device_warp_size() * WARPS_PER_BLOCK;
const unsigned int paths_per_block = block_size / SUBGROUP_SIZE;

const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
const int bdim = BLOCK_SIZE;
const int gdim = (width + height + paths_per_block - 2) / paths_per_block;
const int bdim = block_size;
aggregate_oblique_path_kernel<CENSUS_TYPE, -1, 1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
dest, left, right, width, height, p1, p2, min_disp);
CUDA_CHECK(cudaGetLastError());
Expand All @@ -554,10 +560,11 @@ void aggregate_downright2upleft(
cudaStream_t stream)
{
static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
const unsigned int block_size = device_warp_size() * WARPS_PER_BLOCK;
const unsigned int paths_per_block = block_size / SUBGROUP_SIZE;

const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
const int bdim = BLOCK_SIZE;
const int gdim = (width + height + paths_per_block - 2) / paths_per_block;
const int bdim = block_size;
aggregate_oblique_path_kernel<CENSUS_TYPE, -1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
dest, left, right, width, height, p1, p2, min_disp);
CUDA_CHECK(cudaGetLastError());
Expand All @@ -576,10 +583,11 @@ void aggregate_downleft2upright(
cudaStream_t stream)
{
static const unsigned int SUBGROUP_SIZE = MAX_DISPARITY / DP_BLOCK_SIZE;
static const unsigned int PATHS_PER_BLOCK = BLOCK_SIZE / SUBGROUP_SIZE;
const unsigned int block_size = device_warp_size() * WARPS_PER_BLOCK;
const unsigned int paths_per_block = block_size / SUBGROUP_SIZE;

const int gdim = (width + height + PATHS_PER_BLOCK - 2) / PATHS_PER_BLOCK;
const int bdim = BLOCK_SIZE;
const int gdim = (width + height + paths_per_block - 2) / paths_per_block;
const int bdim = block_size;
aggregate_oblique_path_kernel<CENSUS_TYPE, 1, -1, MAX_DISPARITY><<<gdim, bdim, 0, stream>>>(
dest, left, right, width, height, p1, p2, min_disp);
CUDA_CHECK(cudaGetLastError());
Expand Down
66 changes: 66 additions & 0 deletions src/cuda_to_hip.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
Copyright 2016 Fixstars Corporation

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http ://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

#ifndef __CUDA_TO_HIP_H__
#define __CUDA_TO_HIP_H__

// Single compatibility shim for the ROCm/HIP port. On a CUDA build
// this header is a thin passthrough to <cuda_runtime.h>; on HIP it pulls in the
// HIP runtime and supplies the small set of device intrinsics that ROCm 7.x does
// not provide. The .cu/.cpp sources include this instead of <cuda_runtime.h>.
//
// The two non-mechanical concerns are documented at their definitions below:
// - the wavefront width: device-side per-arch via __GFX*__ (constants.h); host
// side via the runtime hipGetDeviceProperties().warpSize query in
// host_utility.h device_warp_size(). No single compile-time host constant, so
// a multi-arch (gfx90a;gfx1100) build stays correct on each device slice.
// - the __v*u2/__v*u4 SIMD video intrinsics, software-emulated for HIP.

#if defined(USE_HIP)

#include <hip/hip_runtime.h>

// Alias the small CUDA runtime surface the host code uses onto HIP. The .cu
// kernel-launch syntax <<<>>> is accepted by hipcc directly; only these named
// runtime entry points and the stream/error types need mapping.
using cudaError_t = hipError_t;
using cudaStream_t = hipStream_t;

#define cudaSuccess hipSuccess
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost

#define cudaMalloc hipMalloc
#define cudaFree hipFree
#define cudaMemcpy hipMemcpy
#define cudaMemset hipMemset
#define cudaStreamCreate hipStreamCreate
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamDestroy hipStreamDestroy
#define cudaGetLastError hipGetLastError
#define cudaGetErrorString hipGetErrorString

#define cudaGetDevice hipGetDevice
#define cudaDeviceGetAttribute hipDeviceGetAttribute
#define cudaDevAttrWarpSize hipDeviceAttributeWarpSize

#else

#include <cuda_runtime.h>

#endif // USE_HIP

#endif // !__CUDA_TO_HIP_H__
2 changes: 1 addition & 1 deletion src/cuda_utils.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ limitations under the License.

#include "internal.h"

#include <cuda_runtime.h>
#include "cuda_to_hip.h"

#include "host_utility.h"

Expand Down
2 changes: 1 addition & 1 deletion src/device_allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ limitations under the License.

#include "device_allocator.h"

#include <cuda_runtime.h>
#include "cuda_to_hip.h"

#include "host_utility.h"

Expand Down
Loading