Skip to content

Commit 70b5309

Browse files
authored
Merge branch 'master' into master
2 parents 3552b49 + 61af4bc commit 70b5309

File tree

7 files changed

+239
-7
lines changed

7 files changed

+239
-7
lines changed
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
cmake_minimum_required(VERSION 3.20)
project(kompute_vulkan_extensions_printf)

set(CMAKE_CXX_STANDARD 14)

# Options
# NOTE: option() only creates boolean (ON/OFF) cache entries, so a git tag
# passed as its initial value would be coerced to OFF. The tag is therefore
# declared as a STRING cache variable; it can still be overridden with
# -DKOMPUTE_OPT_GIT_TAG=<tag> on the command line.
set(KOMPUTE_OPT_GIT_TAG "v0.9.0" CACHE STRING "The tag of the repo to use for the example")
option(KOMPUTE_OPT_FROM_SOURCE "Whether to build example from source or from git fetch repo" ON)

if(KOMPUTE_OPT_FROM_SOURCE)
    # Build Kompute from the enclosing source tree (two directories up).
    add_subdirectory(../../ ${CMAKE_CURRENT_BINARY_DIR}/kompute_build)
else()
    # Download Kompute at the requested tag and build it as part of this project.
    include(FetchContent)
    FetchContent_Declare(kompute GIT_REPOSITORY https://github.com/KomputeProject/kompute.git
                                 GIT_TAG ${KOMPUTE_OPT_GIT_TAG})
    FetchContent_MakeAvailable(kompute)
    include_directories(${kompute_SOURCE_DIR}/src/include)
endif()

# Compiling shader
# To add more shaders simply copy the vulkan_compile_shader command and replace it with your new shader
vulkan_compile_shader(
    INFILE shader/example_shader.comp
    OUTFILE shader/example_shader.hpp
    NAMESPACE "shader")

# Then add it to the library, so you can access it later in your code
add_library(shader INTERFACE "shader/example_shader.hpp")
target_include_directories(shader INTERFACE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>)

# Setting up main example code
add_executable(example_shader_printf src/main.cpp)
target_link_libraries(example_shader_printf PRIVATE shader kompute::kompute)
34+
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Kompute Debugging GLSL Shader using DebugPrint
2+
3+
This folder contains an end-to-end Kompute example that implements a GLSL shader with a debug print statement.
4+
This example is structured such that you will be able to extend it for your project.
5+
It contains a CMake build configuration that can be used in your production applications.
6+
7+
## Further information
8+
Debugging Vulkan shaders, especially compute shaders, can be very difficult to do even with the aid
9+
of a powerful debugging tool like RenderDoc. Debug Printf is a recent Vulkan feature that allows
10+
developers to debug their shaders by inserting Debug Print statements. This debug print statement operates
11+
quite like the C printf statement, except that it is executed on many compute cores at the same time, so you need some logic to restrict when the debug statement fires; otherwise you can easily be overwhelmed by debug messages.
12+
<br>
13+
14+
This feature is now supported within RenderDoc in a way that allows for per-invocation inspection of values in a shader.
15+
This article describes how to instrument your GLSL or HLSL shaders with Debug Printf and how to
16+
inspect and debug with them in the terminal, using vkconfig, or with environment variables.
17+
18+
For a full walkthrough of the process please see my article at https://medium.com/@evanokeeffe/using-debug-printf-in-vulkan-kompute-shaders-2aaf30bdb96c
19+
20+
## Building the example
21+
22+
You will notice that it's a standalone project, so you can re-use it for your application.
23+
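By default it builds Kompute from the enclosing source tree; when configured with `-DKOMPUTE_OPT_FROM_SOURCE=OFF` it uses CMake's [`FetchContent`](https://cmake.org/cmake/help/latest/module/FetchContent.html) to consume Kompute as a dependency.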
24+
To build you just need to run the CMake command in this folder as follows:
25+
26+
```bash
27+
git clone https://github.com/KomputeProject/kompute.git
28+
cd kompute/examples/vulkan_ext_printf
29+
mkdir build
30+
cd build
31+
cmake ..
32+
cmake --build .
33+
```
34+
35+
## Executing
36+
37+
From inside the `build/` directory run:
38+
39+
### Linux
40+
41+
```bash
42+
./example_shader_printf
43+
```
44+
45+
## Pre-requisites
46+
47+
In order to run this example, you will need the following dependencies:
48+
49+
* REQUIRED
50+
+ The Vulkan SDK must be installed
51+
52+
For the Vulkan SDK, the simplest way to install it is through [their website](https://vulkan.lunarg.com/sdk/home). You just have to follow the instructions for the relevant platform.
53+
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#version 450
2+
// To use Debug Printf in GLSL shaders, you need to enable the GL_EXT_debug_printf extension.
3+
// Then add debugPrintfEXT calls at the locations in your shader where you want to print
4+
// messages and/or values
5+
#extension GL_EXT_debug_printf : enable
6+
7+
// The execution structure
8+
layout (local_size_x = 1) in;
9+
10+
// The buffers are provided via the tensors
11+
layout(binding = 0) buffer bufA { float a[]; };
12+
layout(binding = 1) buffer bufB { float b[]; };
13+
layout(binding = 2) buffer bufOut { float o[]; };
14+
15+
// Multiplies one element of `a` by the matching element of `b`, stores the
// product in `o`, and prints the three values through Debug Printf.
void main() {
    // One invocation handles one element, selected by its global x id.
    uint i = gl_GlobalInvocationID.x;

    float lhs = a[i];
    float rhs = b[i];
    float product = lhs * rhs;
    o[i] = product;

    // debugPrintfEXT works much like printf in C. Keep each message short:
    // the default debug message size is 1024 bytes.
    debugPrintfEXT("The result of %f x %f = %f \n", lhs, rhs, product);
}
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#include <iostream>
2+
#include <memory>
3+
#include <vector>
4+
#include <string>
5+
#include <random>
6+
#include <bits/stdc++.h>
7+
#include <functional> // std::multiplies
8+
#include <algorithm> // std::transform
9+
10+
#include <kompute/Kompute.hpp>
11+
#include <shader/example_shader.hpp>
12+
13+
/**
 * Builds a vector of pseudo-random floats drawn from a uniform distribution.
 *
 * The generator is seeded from std::random_device on every call, so results
 * differ between calls and between runs.
 *
 * @param n        Number of values to generate. A value <= 0 yields an empty
 *                 vector.
 * @param min_val  One bound of the sampling interval.
 * @param max_val  The other bound of the sampling interval. The bounds are
 *                 ordered internally, so passing them swapped is safe. Equal
 *                 bounds produce a vector filled with that single value.
 * @return A vector of `n` floats sampled from [min(min_val, max_val),
 *         max(min_val, max_val)).
 */
std::vector<float> generate_random_floats(int n, float min_val, float max_val) {
    // A negative count would convert to a huge size_t and make the vector
    // constructor throw; treat any non-positive count as "no values".
    if (n <= 0) {
        return {};
    }

    // std::uniform_real_distribution requires a <= b, so order the bounds.
    const float lower = std::min(min_val, max_val);
    const float upper = std::max(min_val, max_val);

    // 1. Obtain a random seed from the hardware
    std::random_device rd;
    // 2. Initialize the generator with the seed
    std::mt19937 gen(rd());
    // 3. Define the distribution range [lower, upper)
    std::uniform_real_distribution<float> dis(lower, upper);

    // Allocate n floats up front and fill them with samples.
    std::vector<float> values(static_cast<std::size_t>(n));
    std::generate(values.begin(), values.end(), [&]() { return dis(gen); });
    return values;
}
27+
28+
int main(int argc, char *argv[])
29+
{
30+
int device_id = 0;
31+
32+
if(argc>1){
33+
device_id = atoi(argv[1]);
34+
}else{
35+
std::cout<<"Using device 0"<<std::endl;
36+
}
37+
38+
// make sure to add the extension, check vulkan_info to see if your GPU vulkan driver supports the extension
39+
// vulkaninfo | grep VK_KHR_shader_non_semantic_info
40+
const std::vector<std::string> desiredExtensions = std::vector<std::string>({
41+
"VK_KHR_shader_non_semantic_info",
42+
});
43+
const std::vector<uint32_t> familyQueueIndices = std::vector<uint32_t>({});
44+
45+
kp::Manager mgr(device_id, familyQueueIndices, desiredExtensions);
46+
47+
int vector_length = 10;
48+
49+
const std::vector<float> A = generate_random_floats(vector_length, 1.0, 10.0);
50+
const std::vector<float> B = generate_random_floats(vector_length, 1.0, 10.0);
51+
const std::vector<float> C = generate_random_floats(vector_length, 0.0, 0.0);
52+
53+
std::shared_ptr<kp::TensorT<float>> tensorInA = mgr.tensorT<float>(A);
54+
std::shared_ptr<kp::TensorT<float>> tensorInB = mgr.tensorT<float>(B);
55+
std::shared_ptr<kp::TensorT<float>> tensorOut = mgr.tensorT<float>(C);
56+
57+
const std::vector<std::shared_ptr<kp::Memory>> params = { tensorInA,
58+
tensorInB,
59+
tensorOut };
60+
61+
kp::Workgroup workgroup = { vector_length, 1, 1 };
62+
63+
const std::vector<uint32_t> shader = std::vector<uint32_t>(
64+
shader::EXAMPLE_SHADER_COMP_SPV.begin(), shader::EXAMPLE_SHADER_COMP_SPV.end());
65+
std::shared_ptr<kp::Algorithm> algo = mgr.algorithm(params, shader, workgroup);
66+
67+
mgr.sequence()
68+
->record<kp::OpSyncDevice>(params)
69+
->record<kp::OpAlgoDispatch>(algo)
70+
->record<kp::OpSyncLocal>(params)
71+
->eval();
72+
73+
std::cout << "Output: { ";
74+
for (const float& elem : tensorOut->vector()) { std::cout << elem << " ";}
75+
std::cout << "}" << std::endl;
76+
}

src/Manager.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,22 @@ Manager::createDevice(const std::vector<uint32_t>& familyQueueIndices,
398398

399399
this->mComputeQueueFamilyIndices.push_back(computeQueueFamilyIndex);
400400
} else {
401+
std::vector<vk::QueueFamilyProperties> allQueueFamilyProperties =
402+
physicalDevice.getQueueFamilyProperties();
403+
for (auto queueIndexGiven : familyQueueIndices) {
404+
if (queueIndexGiven >= allQueueFamilyProperties.size()) {
405+
throw std::runtime_error(
406+
"Given family queue index does not exists. Index given: " +
407+
std::to_string(queueIndexGiven));
408+
}
409+
if (!(allQueueFamilyProperties[queueIndexGiven].queueFlags &
410+
vk::QueueFlagBits::eCompute)) {
411+
throw std::runtime_error(
412+
"Given family queue index does not support compute "
413+
"operations. Index given: " +
414+
std::to_string(queueIndexGiven));
415+
}
416+
}
401417
this->mComputeQueueFamilyIndices = familyQueueIndices;
402418
}
403419

src/logger/CMakeLists.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,6 @@ target_compile_definitions(kp_logger INTERFACE KOMPUTE_OPT_ACTIVE_LOG_LEVEL=KOMP
5656
if(NOT KOMPUTE_OPT_LOG_LEVEL_DISABLED)
5757
if(KOMPUTE_OPT_USE_SPDLOG)
5858
target_link_libraries(kp_logger PUBLIC spdlog::spdlog)
59-
target_compile_definitions(spdlog INTERFACE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL})
6059
target_compile_definitions(kp_logger INTERFACE SPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL})
6160
message(STATUS "setting SPDLOG_ACTIVE_LEVEL to SPDLOG_LEVEL_${KOMPUTE_OPT_LOG_LEVEL}")
6261

test/TestAsyncOperations.cpp

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,32 @@
88
#include "kompute/logger/Logger.hpp"
99
#include "shaders/Utils.hpp"
1010

11+
namespace {
12+
std::vector<uint32_t>
13+
distinctFamilyQueueIndices(const vk::PhysicalDevice& device)
14+
{
15+
const std::vector<vk::QueueFamilyProperties> allQueueFamilyProperties =
16+
device.getQueueFamilyProperties();
17+
std::vector<uint32_t> distinctQueuesIndices;
18+
19+
for (uint32_t i = 0; i < allQueueFamilyProperties.size(); i++) {
20+
if (allQueueFamilyProperties[i].queueFlags &
21+
(vk::QueueFlagBits::eCompute)) {
22+
distinctQueuesIndices.push_back(i);
23+
}
24+
}
25+
return distinctQueuesIndices;
26+
}
27+
}
28+
1129
TEST(TestAsyncOperations, TestManagerParallelExecution)
1230
{
13-
// This test is built for NVIDIA 1650. It assumes:
14-
// * Queue family 0 and 2 have compute capabilities
31+
// This test assumes:
32+
// * There are at least 2 different Queue families with compute capabilities
1533
// * GPU is able to process parallel shader code across different families
16-
uint32_t size = 10;
34+
constexpr uint32_t size = 10;
1735

18-
uint32_t numParallel = 2;
36+
constexpr uint32_t numParallel = 2;
1937

2038
std::string shader(R"(
2139
#version 450
@@ -79,7 +97,18 @@ TEST(TestAsyncOperations, TestManagerParallelExecution)
7997
EXPECT_EQ(inputsSyncB[i]->vector<float>(), resultSync);
8098
}
8199

82-
kp::Manager mgrAsync(0, { 0, 2 });
100+
constexpr uint32_t deviceId =
101+
0u; // device 0 exists, because "mgr" could be created already
102+
auto queues = distinctFamilyQueueIndices(
103+
mgr.getVkInstance()->enumeratePhysicalDevices().at(deviceId));
104+
if (queues.size() < numParallel) {
105+
GTEST_SKIP() << "GPU does not support multiple compute queues. Only "
106+
<< queues.size() << " are supported. Skipping test.";
107+
}
108+
109+
queues.resize(numParallel);
110+
111+
kp::Manager mgrAsync(deviceId, std::move(queues));
83112

84113
std::vector<std::shared_ptr<kp::Memory>> inputsAsyncB;
85114

@@ -118,7 +147,9 @@ TEST(TestAsyncOperations, TestManagerParallelExecution)
118147
}
119148

120149
// The speedup should be at least 40%
121-
EXPECT_LT(durationAsync, durationSync * 0.6);
150+
EXPECT_LT(durationAsync, durationSync * 0.6)
151+
<< "There was no speedup in using multiple queues from different "
152+
"QueueFamilies. Maybe your GPU does not support parallel execution.";
122153
}
123154

124155
TEST(TestAsyncOperations, TestManagerAsyncExecution)

0 commit comments

Comments
 (0)