Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions src/thorin/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ set(THORIN_SOURCES
be/emitter.h
be/c/c.cpp
be/c/c.h
be/lower_offload_intrinsics.cpp
be/lower_offload_intrinsics.h
be/runtime.h
be/kernel_config.h
tables/allnodes.h
Expand Down Expand Up @@ -104,10 +106,6 @@ if(LLVM_FOUND)
be/llvm/amdgpu_pal.h
be/llvm/nvvm.cpp
be/llvm/nvvm.h
be/llvm/parallel.cpp
be/llvm/runtime.inc
be/llvm/runtime.cpp
be/llvm/runtime.h
be/llvm/vectorize.cpp
)
endif()
Expand Down
114 changes: 41 additions & 73 deletions src/thorin/be/codegen.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,17 @@

#include "thorin/transform/hls_channels.h"
#include "thorin/transform/hls_kernel_launch.h"
#include "lower_offload_intrinsics.h"

namespace thorin {

void Backend::prepare_kernel_configs() {
device_code_.opt();

Cont2Config adjusted_configs_map;

auto conts = device_code_.world().copy_continuations();
for (auto continuation : kernels_) {
for (auto& [continuation, config] : kernel_configs_) {
// recover the imported continuation (lost after the call to opt)
Continuation* imported = nullptr;
for (auto original_cont : conts) {
Expand All @@ -37,27 +40,13 @@ void Backend::prepare_kernel_configs() {
if (original_cont->name() == continuation->name())
imported = original_cont;
}
assert(imported && "we lost a kernel ?");
if (!imported) continue;

visit_uses(continuation, [&] (Continuation* use) {
assert(use->has_body());

auto handler = backends_.intrinsics_.find(use->body()->callee()->as<Continuation>()->intrinsic());
assert(handler != backends_.intrinsics_.end());
auto [backend2, get_config] = handler->second;
assert(backend2 == this);

auto config = get_config(use->body(), imported);
if (config) {
auto p = kernel_configs_.emplace(imported, std::move(config));
assert_unused(p.second && "single kernel config entry expected");
}
return false;
}, true);

continuation->world().make_external(continuation);
continuation->destroy("codegen");
adjusted_configs_map[imported] = std::move(config);
}

std::swap(kernel_configs_, adjusted_configs_map);
}

static const App* get_alloc_call(const Def* def) {
Expand Down Expand Up @@ -211,7 +200,7 @@ struct ShadyBackend : public Backend {

struct HLSBackend : public Backend {
explicit HLSBackend(DeviceBackends& b, World& src, std::string& hls_flags) : Backend(b, src), hls_flags_(hls_flags) {
b.register_intrinsic(Intrinsic::HLS, *this, [&](const App* app, Continuation* imported) {
b.register_intrinsic(Intrinsic::HLS, *this, [&](const App* app, Continuation* kernel) {
HLSKernelConfig::Param2Size param_sizes;
for (size_t i = hls_free_vars_offset, e = app->num_args(); i != e; ++i) {
auto arg = app->arg(i);
Expand All @@ -237,7 +226,7 @@ struct HLSBackend : public Backend {
b.world().edef(arg, "only pointers to arrays of primitive types are supported");
auto num_elems = size / (multiplier * num_bits(prim_type->primtype_tag()) / 8);
// kernel (the offloaded continuation, previously named "imported") has type: fn (mem, fn (mem), ...)
param_sizes.emplace(imported->param(i - hls_free_vars_offset + 2), num_elems);
param_sizes.emplace(kernel->param(i - hls_free_vars_offset + 2), num_elems);
}
return std::make_unique<HLSKernelConfig>(param_sizes);
});
Expand All @@ -257,13 +246,13 @@ struct HLSBackend : public Backend {
std::string& hls_flags_;
};

DeviceBackends::DeviceBackends(thorin::World& world, int opt, bool debug, std::string& hls_flags) : world_(world), opt_(opt), debug_(debug) {
register_backend(std::make_unique<CudaBackend>(*this, world));
register_backend(std::make_unique<OpenCLBackend>(*this, world));
DeviceBackends::DeviceBackends(World& world, int opt, bool debug, std::string& hls_flags) : world_(world), opt_(opt), debug_(debug) {
register_backend(std::make_unique<CudaBackend>(*this, world_));
register_backend(std::make_unique<OpenCLBackend>(*this, world_));
#if THORIN_ENABLE_LLVM
register_backend(std::make_unique<AMDHSABackend>(*this, world));
register_backend(std::make_unique<AMDPALBackend>(*this, world));
register_backend(std::make_unique<NVVMBackend>(*this, world));
register_backend(std::make_unique<AMDHSABackend>(*this, world_));
register_backend(std::make_unique<AMDPALBackend>(*this, world_));
register_backend(std::make_unique<NVVMBackend>(*this, world_));
#endif
#if THORIN_ENABLE_SHADY
register_backend(std::make_unique<ShadyBackend>(*this, world))
Expand All @@ -272,9 +261,17 @@ DeviceBackends::DeviceBackends(thorin::World& world, int opt, bool debug, std::s
register_backend(std::make_unique<OpenCLSPIRVBackend>(*this, world));
register_backend(std::make_unique<LevelZeroSPIRVBackend>(*this, world));
#endif
register_backend(std::make_unique<HLSBackend>(*this, world, hls_flags));
register_backend(std::make_unique<HLSBackend>(*this, world_, hls_flags));

lower_offload_intrinsics(world, *this);

for (auto& backend : backends_) {
if (backend->thorin().world().empty())
continue;

search_for_device_code();
backend->prepare_kernel_configs();
cgs.emplace_back(backend->create_cg());
}
}

void DeviceBackends::register_backend(std::unique_ptr<Backend> backend) {
Expand All @@ -289,50 +286,21 @@ void DeviceBackends::register_intrinsic(thorin::Intrinsic intrinsic, Backend& ba
intrinsics_[intrinsic] = std::make_pair(&backend, f);
}

void DeviceBackends::search_for_device_code() {
// determine different parts of the world which need to be compiled differently
ScopesForest(world_).for_each([&] (const Scope& scope) {
auto continuation = scope.entry();
Continuation* imported = nullptr;

Intrinsic intrinsic = Intrinsic::None;
visit_capturing_intrinsics(continuation, [&] (Continuation* continuation) {
if (continuation->is_offload_intrinsic()) {
intrinsic = continuation->intrinsic();
return true;
}
return false;
});

if (intrinsic == Intrinsic::None)
return;

auto handler = intrinsics_.find(intrinsic);
assert(handler != intrinsics_.end());
auto [backend, get_config] = handler->second;

imported = backend->importer_->import(continuation)->as_nom<Continuation>();
if (imported == nullptr)
return;

// Necessary so that the names match in the original and imported worlds
imported->set_name(continuation->unique_name());
continuation->set_name(continuation->unique_name());
for (size_t i = 0, e = continuation->num_params(); i != e; ++i)
imported->param(i)->set_name(continuation->param(i)->name());
imported->world().make_external(imported);
imported->attributes().cc = CC::C;

backend->kernels_.emplace_back(continuation);
});

for (auto& backend : backends_) {
if (backend->thorin().world().empty())
continue;

backend->prepare_kernel_configs();
cgs.emplace_back(backend->create_cg());
}
void DeviceBackends::register_kernel_for_offloading(const App* launch, Continuation* kernel) {
Continuation* intrinsic_cont = launch->callee()->as_nom<Continuation>();
auto handler = intrinsics_.find(intrinsic_cont->intrinsic());
assert(handler != intrinsics_.end());
auto [backend, get_config] = handler->second;

// Import the continuation in the destination world
Continuation* imported = backend->importer_->import(kernel)->as_nom<Continuation>();
assert(imported);
imported->world().make_external(imported);
imported->attributes().cc = CC::C;

// Obtain the kernel config now
auto config = get_config(launch, kernel);
backend->kernel_configs_[kernel] = std::move(config);
}

CodeGen::CodeGen(Thorin& thorin, bool debug)
Expand Down
6 changes: 3 additions & 3 deletions src/thorin/be/codegen.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ struct Backend {
Thorin device_code_;
std::unique_ptr<Importer> importer_;

std::vector<Continuation*> kernels_;
Cont2Config kernel_configs_;

void prepare_kernel_configs();
Expand All @@ -53,6 +52,8 @@ struct Backend {
struct DeviceBackends {
DeviceBackends(World& world, int opt, bool debug, std::string& hls_flags);

DeviceBackends(DeviceBackends&) = delete;

World& world();
std::vector<std::unique_ptr<CodeGen>> cgs;

Expand All @@ -63,15 +64,14 @@ struct DeviceBackends {
using GetKernelConfigFn = std::function<std::unique_ptr<KernelConfig>(const App*, Continuation*)>;
void register_intrinsic(Intrinsic, Backend&, GetKernelConfigFn);

void register_kernel_for_offloading(const App* launch, Continuation*);
private:
World& world_;
std::vector<std::unique_ptr<Backend>> backends_;
std::unordered_map<Intrinsic, std::pair<Backend*, GetKernelConfigFn>> intrinsics_;

int opt_;
bool debug_;

void search_for_device_code();
friend Backend;
};

Expand Down
27 changes: 9 additions & 18 deletions src/thorin/be/llvm/llvm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,16 @@ CodeGen::CodeGen(
, function_calling_convention_(function_calling_convention)
, device_calling_convention_(device_calling_convention)
, kernel_calling_convention_(kernel_calling_convention)
, runtime_(std::make_unique<Runtime>(context(), module()))
{}

/// Resolves the function called `name` in this code generator's LLVM module
/// and tags it with the target CPU and target feature string of `code_gen`.
///
/// The function must already be declared in the module: its existing
/// declaration supplies the function type used for the lookup.
///
/// @param code_gen  code generator whose target machine provides the
///                  "target-cpu" and "target-features" attribute values
/// @param name      symbol name of the function to resolve
/// @return the resolved function (never null; a missing declaration trips
///         the assertion below)
llvm::Function* CodeGen::get(CodeGen& code_gen, const char* name) {
    // Validate the lookup before dereferencing it. Previously the null check
    // came after the result had already been used, so a missing declaration
    // crashed on a null pointer instead of reporting this message.
    auto declaration = module_->getFunction(name);
    assert(declaration != nullptr && "Required runtime function could not be resolved");

    auto callee = module_->getOrInsertFunction(name, declaration->getFunctionType()).getCallee();
    auto result = llvm::cast<llvm::Function>(callee->stripPointerCasts());
    result->addFnAttr("target-cpu", code_gen.machine().getTargetCPU());
    result->addFnAttr("target-features", code_gen.machine().getTargetFeatureString());
    return result;
}

void CodeGen::optimize() {
llvm::PassBuilder PB;
llvm::OptimizationLevel opt_level;
Expand Down Expand Up @@ -362,10 +369,6 @@ CodeGen::emit_module() {

verify();
optimize();

// We need to delete the runtime at this point, since the ownership of
// the context and module is handed away.
runtime_.reset();
return std::pair { std::move(context_), std::move(module_) };
}

Expand Down Expand Up @@ -1041,7 +1044,7 @@ void CodeGen::emit_phi_arg(llvm::IRBuilder<>& irbuilder, const Param* param, llv
*/

llvm::Value* CodeGen::emit_alloc(llvm::IRBuilder<>& irbuilder, const Type* type, const Def* extra) {
auto llvm_malloc = runtime_->get(*this, get_alloc_name().c_str());
auto llvm_malloc = get(*this, get_alloc_name().c_str());
auto alloced_type = convert(type);
llvm::CallInst* void_ptr;
auto layout = module().getDataLayout();
Expand Down Expand Up @@ -1303,19 +1306,7 @@ std::vector<llvm::Value*> CodeGen::emit_intrinsic(llvm::IRBuilder<>& irbuilder,
case Intrinsic::CmpXchgWeak: return emit_cmpxchg(irbuilder, continuation, true);
case Intrinsic::Fence: emit_fence(irbuilder, continuation); break;
case Intrinsic::Reserve: return { emit_reserve(irbuilder, continuation) };
case Intrinsic::CUDA: runtime_->emit_host_code(*this, irbuilder, Platform::CUDA_PLATFORM, ".cu", continuation); break;
case Intrinsic::NVVM: runtime_->emit_host_code(*this, irbuilder, Platform::CUDA_PLATFORM, ".nvvm", continuation); break;
case Intrinsic::OpenCL: runtime_->emit_host_code(*this, irbuilder, Platform::OPENCL_PLATFORM, ".cl", continuation); break;
case Intrinsic::OpenCL_SPIRV: runtime_->emit_host_code(*this, irbuilder, Platform::OPENCL_PLATFORM, ".spv", continuation); break;
case Intrinsic::LevelZero_SPIRV: runtime_->emit_host_code(*this, irbuilder, Platform::LEVEL_ZERO_PLATFORM, ".spv", continuation); break;
case Intrinsic::AMDGPUHSA: runtime_->emit_host_code(*this, irbuilder, Platform::HSA_PLATFORM, ".amdgpu", continuation); break;
case Intrinsic::AMDGPUPAL: runtime_->emit_host_code(*this, irbuilder, Platform::PAL_PLATFORM, ".amdgpu", continuation); break;
case Intrinsic::ShadyCompute: runtime_->emit_host_code(*this, irbuilder, Platform::SHADY_PLATFORM, ".shady", continuation); break;
case Intrinsic::HLS: emit_hls(irbuilder, continuation); break;
case Intrinsic::Parallel: emit_parallel(irbuilder, continuation); break;
case Intrinsic::Fibers: emit_fibers(irbuilder, continuation); break;
case Intrinsic::Spawn: return { emit_spawn(irbuilder, continuation) };
case Intrinsic::Sync: emit_sync(irbuilder, continuation); break;
#if THORIN_ENABLE_RV
case Intrinsic::Vectorize: emit_vectorize_continuation(irbuilder, continuation); break;
#else
Expand Down
7 changes: 1 addition & 6 deletions src/thorin/be/llvm/llvm.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
#include "thorin/analyses/schedule.h"
#include "thorin/be/codegen.h"
#include "thorin/be/emitter.h"
#include "thorin/be/llvm/runtime.h"
#include "thorin/be/kernel_config.h"
#include "thorin/transform/importer.h"

Expand Down Expand Up @@ -43,6 +42,7 @@ class CodeGen : public thorin::CodeGen, public thorin::Emitter<llvm::Value*, llv
const llvm::Module& module() const { return *module_; }
llvm::TargetMachine& machine() { return *machine_; }
int opt() const { return opt_; }
llvm::Function* get(CodeGen& code_gen, const char* name);
//@}

const char* file_ext() const override { return ".ll"; }
Expand Down Expand Up @@ -106,10 +106,6 @@ class CodeGen : public thorin::CodeGen, public thorin::Emitter<llvm::Value*, llv
Continuation* emit_peinfo(llvm::IRBuilder<>&, Continuation*);
std::vector<llvm::Value*> emit_intrinsic(llvm::IRBuilder<>&, Continuation*);
void emit_hls(llvm::IRBuilder<>&, Continuation*);
void emit_parallel(llvm::IRBuilder<>&, Continuation*);
void emit_fibers(llvm::IRBuilder<>&, Continuation*);
llvm::Value* emit_spawn(llvm::IRBuilder<>&, Continuation*);
void emit_sync(llvm::IRBuilder<>&, Continuation*);
void emit_vectorize_continuation(llvm::IRBuilder<>&, Continuation*);
llvm::Value* emit_atomic(llvm::IRBuilder<>&, Continuation*);
std::vector<llvm::Value*> emit_cmpxchg(llvm::IRBuilder<>&, Continuation*, bool);
Expand All @@ -136,7 +132,6 @@ class CodeGen : public thorin::CodeGen, public thorin::Emitter<llvm::Value*, llv
llvm::CallingConv::ID device_calling_convention_;
llvm::CallingConv::ID kernel_calling_convention_;
llvm::DIScope* discope_ = nullptr;
std::unique_ptr<Runtime> runtime_;
#if THORIN_ENABLE_RV
std::vector<std::tuple<u32, llvm::Function*, llvm::CallInst*>> vec_todo_;
#endif
Expand Down
Loading