// Origin: kompute/src/include/kompute/Sequence.hpp at 62d5b979eed2f01429698935234b1e406efd2b89 (troelsy/kompute, GitHub)
// SPDX-License-Identifier: Apache-2.0
#pragma once

#include <cstdint>
#include <memory>
#include <vector>

#include "kompute/Core.hpp"

#include "kompute/operations/OpAlgoDispatch.hpp"
#include "kompute/operations/OpBase.hpp"

namespace kp {

/**
 *  Container of operations that can be sent to GPU as batch.
 *
 *  Every record/eval style member returns a shared_ptr to the Sequence
 *  itself so that calls can be chained; the class derives from
 *  std::enable_shared_from_this for that purpose.
 */
class Sequence : public std::enable_shared_from_this<Sequence>
{
  public:
    /**
     * Main constructor for sequence which requires core vulkan components to
     * generate all dependent resources.
     *
     * @param physicalDevice Vulkan physical device
     * @param device Vulkan logical device
     * @param computeQueue Vulkan compute queue
     * @param queueIndex Vulkan compute queue index in device
     * @param totalTimestamps Maximum number of timestamps to allocate
     * (defaults to 0)
     */
    Sequence(std::shared_ptr<vk::PhysicalDevice> physicalDevice,
             std::shared_ptr<vk::Device> device,
             std::shared_ptr<vk::Queue> computeQueue,
             uint32_t queueIndex,
             uint32_t totalTimestamps = 0) noexcept;

    /**
     * @brief Make Sequence uncopyable and unmovable
     *
     * The move operations take a non-const rvalue reference (the canonical
     * move signature); combined with the deleted copy operations every copy
     * or move attempt is rejected at compile time.
     */
    Sequence(const Sequence&) = delete;
    Sequence(Sequence&&) = delete;
    Sequence& operator=(const Sequence&) = delete;
    Sequence& operator=(Sequence&&) = delete;

    /**
     * Destructor for sequence which is responsible for cleaning all subsequent
     * owned operations.
     */
    ~Sequence() noexcept;

    /**
     * Record function for operation to be added to the GPU queue in batch.
     * This function requires the Sequence to be recording, otherwise it will
     * not be able to add the operation.
     *
     * @param op Object derived from kp::OpBase that will be recorded by the
     * sequence which will be used when the operation is evaluated.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    std::shared_ptr<Sequence> record(std::shared_ptr<OpBase> op);

    /**
     * Constructs an operation of type T from the provided memory objects and
     * records it through record(std::shared_ptr<OpBase>). T must therefore be
     * convertible to kp::OpBase.
     *
     * @tparam T Operation type to construct
     * @param memObjects Vector of mem objects to use for the operation
     * @param params Extra arguments forwarded to the constructor of T after
     * memObjects, which allows for extensible configurations on
     * initialisation.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> record(
      std::vector<std::shared_ptr<Memory>> memObjects,
      TArgs&&... params)
    {
        // make_shared avoids the naked new. memObjects is deliberately still
        // passed as an lvalue so the same T constructors are selected.
        std::shared_ptr<T> op =
          std::make_shared<T>(memObjects, std::forward<TArgs>(params)...);
        return this->record(op);
    }
    /**
     * Constructs an operation of type T from the provided algorithm and
     * records it through record(std::shared_ptr<OpBase>). T must therefore be
     * convertible to kp::OpBase.
     *
     * @tparam T Operation type to construct
     * @param algorithm Algorithm to use for the record often used for OpAlgo
     * operations
     * @param params Extra arguments forwarded to the constructor of T after
     * algorithm, which allows for extensible configurations on
     * initialisation.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> record(std::shared_ptr<Algorithm> algorithm,
                                     TArgs&&... params)
    {
        std::shared_ptr<T> op =
          std::make_shared<T>(algorithm, std::forward<TArgs>(params)...);
        return this->record(op);
    }

    /**
     * Eval sends all the recorded and stored operations in the vector of
     * operations into the gpu as a submit job synchronously (with a barrier).
     *
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    std::shared_ptr<Sequence> eval();

    /**
     * Resets all the recorded and stored operations, records the operation
     * provided and submits into the gpu as a submit job synchronously (with a
     * barrier).
     *
     * @param op Operation to record and evaluate
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    std::shared_ptr<Sequence> eval(std::shared_ptr<OpBase> op);

    /**
     * Constructs an operation of type T from the provided memory objects and
     * evaluates it through eval(std::shared_ptr<OpBase>).
     *
     * @tparam T Operation type to construct
     * @param memObjects Vector of memory objects to use for the operation
     * @param params Extra arguments forwarded to the constructor of T after
     * memObjects, which allows for extensible configurations on
     * initialisation.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> eval(
      std::vector<std::shared_ptr<Memory>> memObjects,
      TArgs&&... params)
    {
        std::shared_ptr<T> op =
          std::make_shared<T>(memObjects, std::forward<TArgs>(params)...);
        return this->eval(op);
    }
    /**
     * Constructs an operation of type T from the provided algorithm and
     * evaluates it through eval(std::shared_ptr<OpBase>).
     *
     * @tparam T Operation type to construct
     * @param algorithm Algorithm to use for the record often used for OpAlgo
     * operations
     * @param params Extra arguments forwarded to the constructor of T after
     * algorithm, which allows for extensible configurations on
     * initialisation.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> eval(std::shared_ptr<Algorithm> algorithm,
                                   TArgs&&... params)
    {
        std::shared_ptr<T> op =
          std::make_shared<T>(algorithm, std::forward<TArgs>(params)...);
        return this->eval(op);
    }

    /**
     * Eval Async sends all the recorded and stored operations in the vector of
     * operations into the gpu as a submit job without a barrier. evalAwait()
     * must ALWAYS be called after to ensure the sequence is terminated
     * correctly.
     *
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    std::shared_ptr<Sequence> evalAsync();
    /**
     * Clears current operations to record the provided one in the vector of
     * operations into the gpu as a submit job without a barrier. evalAwait()
     * must ALWAYS be called after to ensure the sequence is terminated
     * correctly.
     *
     * @param op Operation to record and evaluate asynchronously
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    std::shared_ptr<Sequence> evalAsync(std::shared_ptr<OpBase> op);
    /**
     * Constructs an operation of type T from the provided memory objects and
     * evaluates it through evalAsync(std::shared_ptr<OpBase>). As with every
     * evalAsync overload, evalAwait() must be called afterwards.
     *
     * @tparam T Operation type to construct
     * @param memObjects Vector of memory objects to use for the operation
     * @param params Extra arguments forwarded to the constructor of T after
     * memObjects, which allows for extensible configurations on
     * initialisation.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> evalAsync(
      std::vector<std::shared_ptr<Memory>> memObjects,
      TArgs&&... params)
    {
        std::shared_ptr<T> op =
          std::make_shared<T>(memObjects, std::forward<TArgs>(params)...);
        return this->evalAsync(op);
    }
    /**
     * Constructs an operation of type T from the provided algorithm and
     * evaluates it through evalAsync(std::shared_ptr<OpBase>). As with every
     * evalAsync overload, evalAwait() must be called afterwards.
     *
     * @tparam T Operation type to construct
     * @param algorithm Algorithm to use for the record often used for OpAlgo
     * operations
     * @param params Extra arguments forwarded to the constructor of T after
     * algorithm, which allows for extensible configurations on
     * initialisation.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    template<typename T, typename... TArgs>
    std::shared_ptr<Sequence> evalAsync(std::shared_ptr<Algorithm> algorithm,
                                        TArgs&&... params)
    {
        std::shared_ptr<T> op =
          std::make_shared<T>(algorithm, std::forward<TArgs>(params)...);
        return this->evalAsync(op);
    }

    /**
     * Eval Await waits for the fence to finish processing and then once it
     * finishes, it runs the postEval of all operations.
     *
     * @param waitFor Maximum time to wait before timing out (defaults to
     * UINT64_MAX). NOTE(review): this was documented as milliseconds; if the
     * value is handed unchanged to vkWaitForFences the unit is nanoseconds -
     * confirm against the implementation.
     * @return shared_ptr<Sequence> of the Sequence class itself
     */
    std::shared_ptr<Sequence> evalAwait(uint64_t waitFor = UINT64_MAX);

    /**
     * Clear function clears all operations currently recorded and starts
     * recording again.
     */
    void clear();

    /**
     * Return the timestamps that were latched at the beginning and
     * after each operation during the last eval() call.
     *
     * @return Vector of timestamp values
     */
    std::vector<std::uint64_t> getTimestamps();

    /**
     * Begins recording commands for commands to be submitted into the command
     * buffer.
     */
    void begin();

    /**
     * Ends the recording and stops recording commands when the record command
     * is sent.
     */
    void end();

    /**
     * Returns true if the sequence is currently in recording activated.
     *
     * @return Boolean stating if recording ongoing.
     */
    bool isRecording() const;

    /**
     * Returns true if the sequence has been initialised, and it's based on the
     * GPU resources being referenced.
     *
     * @return Boolean stating if is initialized
     */
    bool isInit() const;

    /**
     * Clears command buffer and triggers re-record of all the current
     * operations saved, which is useful if the underlying kp::Memory objects
     * or kp::Algorithms are modified and need to be re-recorded.
     */
    void rerecord();

    /**
     * Returns true if the sequence is currently running - mostly used for async
     * workloads.
     *
     * @return Boolean stating if currently running.
     */
    bool isRunning() const;

    /**
     * Destroys and frees the GPU resources which include the buffer and memory
     * and sets the sequence as init=False.
     */
    void destroy();

  protected:
    // -------------- NEVER OWNED RESOURCES
    std::shared_ptr<vk::PhysicalDevice> mPhysicalDevice = nullptr;
    std::shared_ptr<vk::Device> mDevice = nullptr;
    std::shared_ptr<vk::Queue> mComputeQueue = nullptr;
    // Default is UINT32_MAX, the value the literal -1 converts to; presumably
    // an "unset" marker - confirm against the implementation.
    uint32_t mQueueIndex = UINT32_MAX;

    // -------------- OPTIONALLY OWNED RESOURCES
    // Each handle is paired with a flag; presumably the flag tells the
    // sequence whether it has to release the handle - TODO confirm.
    std::shared_ptr<vk::CommandPool> mCommandPool = nullptr;
    bool mFreeCommandPool = false;
    std::shared_ptr<vk::CommandBuffer> mCommandBuffer = nullptr;
    bool mFreeCommandBuffer = false;

    // -------------- ALWAYS OWNED RESOURCES
    vk::Fence mFence;
    std::vector<std::shared_ptr<OpBase>> mOperations{};
    std::shared_ptr<vk::QueryPool> timestampQueryPool = nullptr;

    // State
    bool mRecording = false;
    bool mIsRunning = false;

  private:
    // Create functions
    void createCommandPool();
    void createCommandBuffer();
    void createTimestampQueryPool(uint32_t totalTimestamps);
};

} // End namespace kp