diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -722,7 +722,22 @@
   RewritePatternSet patterns(&getContext());
   LLVMConversionTarget target(getContext());
-  target.addIllegalDialect<gpu::GPUDialect>();
+  SymbolTable symbolTable = SymbolTable(getOperation());
+
+  target.addDynamicallyLegalOp<gpu::GPUModuleOp>(
+      [](gpu::GPUModuleOp module) -> bool {
+        return module.getTargetsAttr() != nullptr;
+      });
+
+  target.addDynamicallyLegalOp<gpu::LaunchFuncOp>(
+      [&](gpu::LaunchFuncOp op) -> bool {
+        auto module =
+            symbolTable.lookup<gpu::GPUModuleOp>(op.getKernelModuleName());
+        return converter.isLegal(op->getOperandTypes()) &&
+               converter.isLegal(op->getResultTypes()) &&
+               (module && module.getTargetsAttr() &&
+                module.getTargetsAttr().size());
+      });
 
   mlir::arith::populateArithToLLVMConversionPatterns(converter, patterns);
   mlir::cf::populateControlFlowToLLVMConversionPatterns(converter, patterns);
@@ -1221,6 +1236,49 @@
       launchOp, launchOp.getKernelModuleName());
   assert(kernelModule && "expected a kernel module");
 
+  // If the module has Targets then just update the op operands.
+  if (ArrayAttr targets = kernelModule.getTargetsAttr()) {
+    Value stream = Value();
+    if (adaptor.getAsyncDependencies().size())
+      stream = adaptor.getAsyncDependencies().front();
+    // If the async keyword is present and there are no dependencies, then a
+    // stream must be created to pass to subsequent operations.
+    else if (launchOp.getAsyncToken())
+      stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult();
+
+    // Lower the kernel operands to match kernel parameters.
+    SmallVector<Value> arguments;
+    if (kernelBarePtrCallConv) {
+      // Hack the bare pointer value on just for the argument promotion
+      LLVMTypeConverter *converter = getTypeConverter();
+      LowerToLLVMOptions options = converter->getOptions();
+      LowerToLLVMOptions overrideToMatchKernelOpts = options;
+      overrideToMatchKernelOpts.useBarePtrCallConv = true;
+      converter->dangerousSetOptions(overrideToMatchKernelOpts);
+      arguments =
+          converter->promoteOperands(loc, launchOp.getKernelOperands(),
+                                     adaptor.getKernelOperands(), rewriter);
+      converter->dangerousSetOptions(options);
+    } else {
+      arguments = getTypeConverter()->promoteOperands(
+          loc, launchOp.getKernelOperands(), adaptor.getKernelOperands(),
+          rewriter);
+    }
+
+    rewriter.create<gpu::LaunchFuncOp>(
+        launchOp.getLoc(), launchOp.getKernelAttr(),
+        gpu::KernelDim3{adaptor.getGridSizeX(), adaptor.getGridSizeY(),
+                        adaptor.getGridSizeZ()},
+        gpu::KernelDim3{adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
+                        adaptor.getBlockSizeZ()},
+        adaptor.getDynamicSharedMemorySize(), arguments, stream);
+    if (launchOp.getAsyncToken())
+      rewriter.replaceOp(launchOp, {stream});
+    else
+      rewriter.eraseOp(launchOp);
+    return success();
+  }
+
   auto binaryAttr =
       kernelModule->getAttrOfType<StringAttr>(gpuBinaryAnnotation);
   if (!binaryAttr) {
diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
--- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
@@ -1,5 +1,5 @@
-// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=nvvm.cubin use-opaque-pointers=1" | FileCheck %s
-// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=rocdl.hsaco use-opaque-pointers=1" | FileCheck %s --check-prefix=ROCDL
+// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=nvvm.cubin use-opaque-pointers=1" -split-input-file | FileCheck %s
+// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=rocdl.hsaco use-opaque-pointers=1" -split-input-file | FileCheck %s --check-prefix=ROCDL
 
 module attributes {gpu.container_module} {
 
@@ -61,3 +61,37 @@
 // CHECK: llvm.call @mgpuStreamDestroy
 // CHECK: llvm.call @mgpuModuleUnload
 }
+
+// -----
+
+module attributes {gpu.container_module} {
+  // CHECK: gpu.module
+  // ROCDL: gpu.module
+  gpu.module @kernel_module [#gpu.nvptx] {
+    llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr,
+        %arg2: !llvm.ptr, %arg3: i64, %arg4: i64,
+        %arg5: i64) attributes {gpu.kernel} {
+      llvm.return
+    }
+  }
+
+  func.func @foo(%buffer: memref<?xf32>) {
+    // CHECK: [[C8:%.*]] = llvm.mlir.constant(8 : index) : i64
+    // CHECK: [[C32:%.*]] = llvm.mlir.constant(32 : i32) : i32
+    // CHECK: [[C256:%.*]] = llvm.mlir.constant(256 : i32) : i32
+    %c8 = arith.constant 8 : index
+    %c32 = arith.constant 32 : i32
+    %c256 = arith.constant 256 : i32
+
+    // CHECK: gpu.launch_func @kernel_module::@kernel
+    // CHECK: blocks in ([[C8]], [[C8]], [[C8]]) : i64 threads in ([[C8]], [[C8]], [[C8]]) : i64
+    // CHECK: dynamic_shared_memory_size [[C256]]
+    // CHECK: args([[C32]] : i32, %{{.*}} : !llvm.ptr, %{{.*}} : !llvm.ptr, %{{.*}} : i64, %{{.*}} : i64, %{{.*}} : i64)
+    gpu.launch_func @kernel_module::@kernel
+        blocks in (%c8, %c8, %c8)
+        threads in (%c8, %c8, %c8)
+        dynamic_shared_memory_size %c256
+        args(%c32 : i32, %buffer : memref<?xf32>)
+    return
+  }
+}