-
Notifications
You must be signed in to change notification settings - Fork 117
Expand file tree
/
Copy pathreduce_atomic.cpp
More file actions
123 lines (95 loc) · 3.68 KB
/
reduce_atomic.cpp
File metadata and controls
123 lines (95 loc) · 3.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
/*
* This code performs a reduce operation over an array.
*
* There are two implementations of the reduction using atomics:
*
* 1. Reduction where all values are naively reduced into global memory using
* an atomic add operation.
*
* 2. A reduction comprised of two atomic add operations:
* a. Reduce over a workgroup with a local memory atomic
* b. Reduce globally using a global atomic_ref
*
*/
#include <benchmark.h>
#include <sycl/sycl.hpp>
#include "../helpers.hpp"
template <typename T>
constexpr T my_min(T a, T b) {
return a < b ? a : b;
}
using T = float;
constexpr size_t dataSize = 32'768;
constexpr size_t workGroupSize = 1024;
constexpr int numIters = 100;
int main(int argc, char* argv[]) {
T a[dataSize];
T devAns[2] = {0, 0};
for (auto i = 0; i < dataSize; ++i) {
a[i] = static_cast<T>(i);
}
auto q = sycl::queue{};
T* devA = sycl::malloc_device<T>(dataSize, q);
T* devReduced = sycl::malloc_device<T>(1, q); // Holds intermediate values
T zeroVal = 0;
auto e1 = q.memcpy(devA, a, sizeof(T) * dataSize);
auto e2 = q.memcpy(devReduced, &zeroVal, sizeof(T));
auto myNd = sycl::nd_range(sycl::range(dataSize), sycl::range(workGroupSize));
util::benchmark(
[&]() {
q.submit([&](sycl::handler& cgh) {
cgh.depends_on({e1, e2});
cgh.parallel_for(myNd, [=](sycl::nd_item<1> item) {
auto globalIdx = item.get_global_linear_id();
sycl::atomic_ref<T, sycl::memory_order_relaxed,
sycl::memory_scope_device>(devReduced[0])
.fetch_add(devA[globalIdx]);
});
}).wait();
},
numIters, "Reduction using only global atomics");
auto e3 = q.memcpy(&devAns[0], devReduced, sizeof(T));
q.memcpy(devReduced, &zeroVal, sizeof(T), e3).wait();
util::benchmark(
[&]() {
q.submit([&](sycl::handler& cgh) {
cgh.depends_on({e1, e2});
sycl::local_accessor<T, 1> localMem(workGroupSize, cgh);
sycl::local_accessor<T, 1> localReduction(1, cgh);
cgh.parallel_for(myNd, [=](sycl::nd_item<1> item) {
auto localIdx = item.get_local_linear_id();
auto globalIdx = item.get_global_linear_id();
auto globalRange = item.get_global_range(0);
if (localIdx == 0) localReduction[0] = 0;
item.barrier();
// Accumulating thread local reductions into local memory
localMem[localIdx] = devA[globalIdx];
// Work group atomic add into local mem
sycl::atomic_ref<T, sycl::memory_order_relaxed,
sycl::memory_scope_work_group,
sycl::access::address_space::local_space>(
localReduction[0])
.fetch_add(localMem[localIdx]);
item.barrier();
if (localIdx == 0)
sycl::atomic_ref<T, sycl::memory_order_relaxed,
sycl::memory_scope_device,
sycl::access::address_space::global_space>(
devReduced[0])
.fetch_add(localReduction[0]);
});
}).wait();
},
numIters, "Reduction using local and global atomics");
q.memcpy(&devAns[1], devReduced, sizeof(T)).wait();
T serialAns = 0;
for (auto i = 0; i < dataSize; i++) {
serialAns += a[i];
}
std::cout << "Got global atomics device ans " << devAns[0] / numIters << '\n';
std::cout << "Got global and local atomics device ans "
<< devAns[1] / numIters << '\n';
std::cout << "vs serial ans " << serialAns << "\n\n";
sycl::free(devA, q);
sycl::free(devReduced, q);
}