diff --git a/mlir/lib/Target/LLVMIR/CMakeLists.txt b/mlir/lib/Target/LLVMIR/CMakeLists.txt --- a/mlir/lib/Target/LLVMIR/CMakeLists.txt +++ b/mlir/lib/Target/LLVMIR/CMakeLists.txt @@ -57,6 +57,8 @@ MLIROpenACCToLLVMIRTranslation MLIROpenMPToLLVMIRTranslation MLIRROCDLToLLVMIRTranslation + MLIRNVVMTarget + MLIRROCDLTarget ) add_mlir_translation_library(MLIRTargetLLVMIRImport diff --git a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp --- a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp +++ b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp @@ -13,6 +13,8 @@ #include "mlir/Dialect/DLTI/DLTI.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/IR/BuiltinOps.h" +#include "mlir/Target/LLVM/NVVM/Target.h" +#include "mlir/Target/LLVM/ROCDL/Target.h" #include "mlir/Target/LLVMIR/Dialect/All.h" #include "mlir/Target/LLVMIR/Export.h" #include "mlir/Tools/mlir-translate/Translation.h" @@ -36,6 +38,8 @@ }, [](DialectRegistry &registry) { registry.insert<DLTIDialect, func::FuncDialect>(); + registerNVVMTarget(registry); + registerROCDLTarget(registry); registerAllToLLVMIRTranslations(registry); }); } diff --git a/mlir/lib/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.cpp --- a/mlir/lib/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.cpp @@ -12,10 +12,28 @@ #include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" #include "mlir/Target/LLVMIR/LLVMTranslationInterface.h" +#include "llvm/ADT/TypeSwitch.h" using namespace mlir; namespace { +LogicalResult launchKernel(gpu::LaunchFuncOp launchOp, + llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + auto kernelBinary = SymbolTable::lookupNearestSymbolFrom<gpu::BinaryOp>( + launchOp, launchOp.getKernelModuleName()); + if (!kernelBinary) { + launchOp.emitError("Couldn't find the binary holding the kernel: ") + << 
launchOp.getKernelModuleName(); + return failure(); + } + auto offloadingHandler = + dyn_cast<gpu::OffloadingLLVMTranslationAttrInterface>( + kernelBinary.getOffloadingHandlerAttr()); + assert(offloadingHandler && "Invalid offloading handler."); + return offloadingHandler.launchKernel(launchOp, kernelBinary, builder, + moduleTranslation); +} class GPUDialectLLVMIRTranslationInterface : public LLVMTranslationDialectInterface { @@ -23,9 +41,23 @@ using LLVMTranslationDialectInterface::LLVMTranslationDialectInterface; LogicalResult - convertOperation(Operation *op, llvm::IRBuilderBase &builder, + convertOperation(Operation *operation, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) const override { - return isa<gpu::GPUModuleOp>(op) ? success() : failure(); + return llvm::TypeSwitch<Operation *, LogicalResult>(operation) + .Case([&](gpu::GPUModuleOp) { return success(); }) + .Case([&](gpu::BinaryOp op) { + auto offloadingHandler = + dyn_cast<gpu::OffloadingLLVMTranslationAttrInterface>( + op.getOffloadingHandlerAttr()); + assert(offloadingHandler && "Invalid offloading handler."); + return offloadingHandler.embedBinary(op, builder, moduleTranslation); + }) + .Case([&](gpu::LaunchFuncOp op) { + return launchKernel(op, builder, moduleTranslation); + }) + .Default([&](Operation *op) { + return op->emitError("unsupported GPU operation: ") << op->getName(); + }); } }; diff --git a/mlir/test/Target/LLVMIR/gpu.mlir b/mlir/test/Target/LLVMIR/gpu.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Target/LLVMIR/gpu.mlir @@ -0,0 +1,77 @@ +// RUN: mlir-translate -mlir-to-llvmir -split-input-file %s | FileCheck %s + +// Checking the translation of the `gpu.binary` & `gpu.launch_func` ops. 
+module attributes {gpu.container_module} { + // CHECK: [[ARGS_TY:%.*]] = type { i32, i32 } + // CHECK: @kernel_module_bin_cst = internal constant [4 x i8] c"BLOB", align 8 + // CHECK: @kernel_module_kernel_kernel_name = private unnamed_addr constant [7 x i8] c"kernel\00", align 1 + gpu.binary @kernel_module [#gpu.object<#nvvm.target, "BLOB">] + llvm.func @foo() { + // CHECK: [[ARGS:%.*]] = alloca %{{.*}}, align 8 + // CHECK: [[ARGS_ARRAY:%.*]] = alloca ptr, i64 2, align 8 + // CHECK: [[ARG0:%.*]] = getelementptr inbounds [[ARGS_TY]], ptr [[ARGS]], i32 0, i32 0 + // CHECK: store i32 32, ptr [[ARG0]], align 4 + // CHECK: %{{.*}} = getelementptr ptr, ptr [[ARGS_ARRAY]], i32 0 + // CHECK: store ptr [[ARG0]], ptr %{{.*}}, align 8 + // CHECK: [[ARG1:%.*]] = getelementptr inbounds [[ARGS_TY]], ptr [[ARGS]], i32 0, i32 1 + // CHECK: store i32 32, ptr [[ARG1]], align 4 + // CHECK: %{{.*}} = getelementptr ptr, ptr [[ARGS_ARRAY]], i32 1 + // CHECK: store ptr [[ARG1]], ptr %{{.*}}, align 8 + // CHECK: [[MODULE:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst) + // CHECK: [[FUNC:%.*]] = call ptr @mgpuModuleGetFunction(ptr [[MODULE]], ptr @kernel_module_kernel_kernel_name) + // CHECK: [[STREAM:%.*]] = call ptr @mgpuStreamCreate() + // CHECK: call void @mgpuLaunchKernel(ptr [[FUNC]], i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i32 256, ptr [[STREAM]], ptr [[ARGS_ARRAY]], ptr null) + // CHECK: call void @mgpuStreamSynchronize(ptr [[STREAM]]) + // CHECK: call void @mgpuStreamDestroy(ptr [[STREAM]]) + // CHECK: call void @mgpuModuleUnload(ptr [[MODULE]]) + %0 = llvm.mlir.constant(8 : index) : i64 + %1 = llvm.mlir.constant(32 : i32) : i32 + %2 = llvm.mlir.constant(256 : i32) : i32 + gpu.launch_func @kernel_module::@kernel blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64 dynamic_shared_memory_size %2 args(%1 : i32, %1 : i32) + llvm.return + } +} + +// ----- + +// Checking the correct selection of the second object using an index as a selector. 
+module { + // CHECK: @kernel_module_bin_cst = internal constant [1 x i8] c"1", align 8 + gpu.binary @kernel_module <#gpu.select_object<1>> [#gpu.object<#nvvm.target, "0">, #gpu.object<#nvvm.target, "1">] +} + +// ----- + +// Checking the correct selection of the second object using a target as a selector. +module { + // CHECK: @kernel_module_bin_cst = internal constant [6 x i8] c"AMDGPU", align 8 + gpu.binary @kernel_module <#gpu.select_object<#rocdl.target>> [#gpu.object<#nvvm.target, "NVPTX">, #gpu.object<#rocdl.target, "AMDGPU">] +} + +// ----- + +// Checking the translation of `gpu.launch_func` with an async dependency. +module attributes {gpu.container_module} { + // CHECK: @kernel_module_bin_cst = internal constant [4 x i8] c"BLOB", align 8 + gpu.binary @kernel_module [#gpu.object<#rocdl.target, "BLOB">] + llvm.func @foo() { + %0 = llvm.mlir.constant(8 : index) : i64 + // CHECK: = call ptr @mgpuStreamCreate() + // CHECK-NEXT: = alloca {{.*}}, align 8 + // CHECK-NEXT: [[ARGS:%.*]] = alloca ptr, i64 0, align 8 + // CHECK-NEXT: [[MODULE:%.*]] = call ptr @mgpuModuleLoad(ptr @kernel_module_bin_cst) + // CHECK-NEXT: [[FUNC:%.*]] = call ptr @mgpuModuleGetFunction(ptr [[MODULE]], ptr @kernel_module_kernel_kernel_name) + // CHECK-NEXT: call void @mgpuLaunchKernel(ptr [[FUNC]], i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i32 0, ptr {{.*}}, ptr [[ARGS]], ptr null) + // CHECK-NEXT: call void @mgpuModuleUnload(ptr [[MODULE]]) + // CHECK-NEXT: call void @mgpuStreamSynchronize(ptr %{{.*}}) + // CHECK-NEXT: call void @mgpuStreamDestroy(ptr %{{.*}}) + %1 = llvm.call @mgpuStreamCreate() : () -> !llvm.ptr + gpu.launch_func <%1 : !llvm.ptr> @kernel_module::@kernel blocks in (%0, %0, %0) threads in (%0, %0, %0) : i64 + llvm.call @mgpuStreamSynchronize(%1) : (!llvm.ptr) -> () + llvm.call @mgpuStreamDestroy(%1) : (!llvm.ptr) -> () + llvm.return + } + llvm.func @mgpuStreamCreate() -> !llvm.ptr + llvm.func @mgpuStreamSynchronize(!llvm.ptr) + llvm.func @mgpuStreamDestroy(!llvm.ptr) +}