diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h --- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h +++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h @@ -14,10 +14,12 @@ namespace mlir { +class LLVMTypeConverter; class Location; struct LogicalResult; class ModuleOp; class Operation; +class OwningRewritePatternList; template class OperationPass; @@ -46,6 +48,11 @@ createConvertGpuLaunchFuncToGpuRuntimeCallsPass( StringRef gpuBinaryAnnotation = ""); +/// Collect a set of patterns to convert from the GPU dialect to LLVM. +void populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter, + OwningRewritePatternList &patterns, + StringRef gpuBinaryAnnotation); + /// Creates a pass to convert kernel functions into GPU target object blobs. /// /// This transformation takes the body of each function that is annotated with diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt --- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt @@ -34,4 +34,5 @@ MLIRLLVMIR MLIRPass MLIRSupport + MLIRStandardToLLVM ) diff --git a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp --- a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp +++ b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp @@ -16,6 +16,7 @@ #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" #include "../PassDetail.h" +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" #include "mlir/Dialect/GPU/GPUDialect.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/Attributes.h" @@ -34,21 +35,94 @@ using namespace mlir; -// To avoid name mangling, these are defined in the mini-runtime file. 
-static constexpr const char *kGpuModuleLoadName = "mgpuModuleLoad"; -static constexpr const char *kGpuModuleGetFunctionName = - "mgpuModuleGetFunction"; -static constexpr const char *kGpuLaunchKernelName = "mgpuLaunchKernel"; -static constexpr const char *kGpuStreamCreateName = "mgpuStreamCreate"; -static constexpr const char *kGpuStreamSynchronizeName = - "mgpuStreamSynchronize"; -static constexpr const char *kGpuMemHostRegisterName = "mgpuMemHostRegister"; static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst"; namespace { -/// A pass to convert gpu.launch_func operations into a sequence of GPU -/// runtime calls. Currently it supports CUDA and ROCm (HIP). +class GpuLaunchFuncToGpuRuntimeCallsPass + : public ConvertGpuLaunchFuncToGpuRuntimeCallsBase< + GpuLaunchFuncToGpuRuntimeCallsPass> { +public: + GpuLaunchFuncToGpuRuntimeCallsPass(StringRef gpuBinaryAnnotation) { + if (!gpuBinaryAnnotation.empty()) + this->gpuBinaryAnnotation = gpuBinaryAnnotation.str(); + } + + // Run the dialect converter on the module. 
+ void runOnOperation() override; +}; + +class FunctionCallBuilder { +public: + FunctionCallBuilder(StringRef functionName, LLVM::LLVMType returnType, + ArrayRef argumentTypes) + : functionName(functionName), + functionType(LLVM::LLVMType::getFunctionTy(returnType, argumentTypes, + /*isVarArg=*/false)) {} + LLVM::CallOp create(Location loc, OpBuilder &builder, + ArrayRef arguments) const; + +private: + StringRef functionName; + LLVM::LLVMType functionType; +}; + +template +class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern { +public: + explicit ConvertOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) + : ConvertOpToLLVMPattern(typeConverter) {} + +protected: + LLVM::LLVMDialect *llvmDialect = this->typeConverter.getDialect(); + + LLVM::LLVMType llvmVoidType = LLVM::LLVMType::getVoidTy(llvmDialect); + LLVM::LLVMType llvmPointerType = LLVM::LLVMType::getInt8PtrTy(llvmDialect); + LLVM::LLVMType llvmPointerPointerType = llvmPointerType.getPointerTo(); + LLVM::LLVMType llvmInt8Type = LLVM::LLVMType::getInt8Ty(llvmDialect); + LLVM::LLVMType llvmInt32Type = LLVM::LLVMType::getInt32Ty(llvmDialect); + LLVM::LLVMType llvmInt64Type = LLVM::LLVMType::getInt64Ty(llvmDialect); + LLVM::LLVMType llvmIntPtrType = LLVM::LLVMType::getIntNTy( + llvmDialect, + llvmDialect->getLLVMModule().getDataLayout().getPointerSizeInBits()); + + FunctionCallBuilder moduleLoadCallBuilder = { + "mgpuModuleLoad", + llvmPointerType /* void *module */, + {llvmPointerType /* void *cubin */}}; + FunctionCallBuilder moduleGetFunctionCallBuilder = { + "mgpuModuleGetFunction", + llvmPointerType /* void *function */, + { + llvmPointerType, /* void *module */ + llvmPointerType /* char *name */ + }}; + FunctionCallBuilder launchKernelCallBuilder = { + "mgpuLaunchKernel", + llvmVoidType, + { + llvmPointerType, /* void* f */ + llvmIntPtrType, /* intptr_t gridXDim */ + llvmIntPtrType, /* intptr_t gridYDim */ + llvmIntPtrType, /* intptr_t gridZDim */ + llvmIntPtrType, /* intptr_t 
blockXDim */ + llvmIntPtrType, /* intptr_t blockYDim */ + llvmIntPtrType, /* intptr_t blockZDim */ + llvmInt32Type, /* unsigned int sharedMemBytes */ + llvmPointerType, /* void *hstream */ + llvmPointerPointerType, /* void **kernelParams */ + llvmPointerPointerType /* void **extra */ + }}; + FunctionCallBuilder streamCreateCallBuilder = { + "mgpuStreamCreate", llvmPointerType /* void *stream */, {}}; + FunctionCallBuilder streamSynchronizeCallBuilder = { + "mgpuStreamSynchronize", + llvmVoidType, + {llvmPointerType /* void *stream */}}; +}; + +/// A rewrite pattern to convert gpu.launch_func operations into a sequence of +/// GPU runtime calls. Currently it supports CUDA and ROCm (HIP). /// /// In essence, a gpu.launch_func operations gets compiled into the following /// sequence of runtime calls: @@ -60,172 +134,65 @@ /// * streamSynchronize -- waits for operations on the stream to finish /// /// Intermediate data structures are allocated on the stack. -class GpuLaunchFuncToGpuRuntimeCallsPass - : public ConvertGpuLaunchFuncToGpuRuntimeCallsBase< - GpuLaunchFuncToGpuRuntimeCallsPass> { -private: - LLVM::LLVMDialect *getLLVMDialect() { return llvmDialect; } - - llvm::LLVMContext &getLLVMContext() { - return getLLVMDialect()->getLLVMContext(); - } - - void initializeCachedTypes() { - const llvm::Module &module = llvmDialect->getLLVMModule(); - llvmVoidType = LLVM::LLVMType::getVoidTy(llvmDialect); - llvmPointerType = LLVM::LLVMType::getInt8PtrTy(llvmDialect); - llvmPointerPointerType = llvmPointerType.getPointerTo(); - llvmInt8Type = LLVM::LLVMType::getInt8Ty(llvmDialect); - llvmInt32Type = LLVM::LLVMType::getInt32Ty(llvmDialect); - llvmInt64Type = LLVM::LLVMType::getInt64Ty(llvmDialect); - llvmIntPtrType = LLVM::LLVMType::getIntNTy( - llvmDialect, module.getDataLayout().getPointerSizeInBits()); - } - - LLVM::LLVMType getVoidType() { return llvmVoidType; } - - LLVM::LLVMType getPointerType() { return llvmPointerType; } - - LLVM::LLVMType getPointerPointerType() { 
return llvmPointerPointerType; } - - LLVM::LLVMType getInt8Type() { return llvmInt8Type; } - - LLVM::LLVMType getInt32Type() { return llvmInt32Type; } - - LLVM::LLVMType getInt64Type() { return llvmInt64Type; } - - LLVM::LLVMType getIntPtrType() { - const llvm::Module &module = getLLVMDialect()->getLLVMModule(); - return LLVM::LLVMType::getIntNTy( - getLLVMDialect(), module.getDataLayout().getPointerSizeInBits()); - } - - // Allocate a void pointer on the stack. - Value allocatePointer(OpBuilder &builder, Location loc) { - auto one = builder.create(loc, getInt32Type(), - builder.getI32IntegerAttr(1)); - return builder.create(loc, getPointerPointerType(), one, - /*alignment=*/0); - } +class ConvertLaunchFuncOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertLaunchFuncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter, + StringRef gpuBinaryAnnotation) + : ConvertOpToGpuRuntimeCallPattern(typeConverter), + gpuBinaryAnnotation(gpuBinaryAnnotation) {} - void declareGpuRuntimeFunctions(Location loc); - void addParamToList(OpBuilder &builder, Location loc, Value param, Value list, - unsigned pos, Value one); - Value setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder); +private: + void addParamToArray(OpBuilder &builder, Location loc, Value param, + Value array, unsigned pos, Value one) const; + Value generateParamsArray(gpu::LaunchFuncOp launchOp, unsigned numArguments, + OpBuilder &builder) const; Value generateKernelNameConstant(StringRef moduleName, StringRef name, - Location loc, OpBuilder &builder); - void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp); + Location loc, OpBuilder &builder) const; -public: - GpuLaunchFuncToGpuRuntimeCallsPass() = default; - GpuLaunchFuncToGpuRuntimeCallsPass(StringRef gpuBinaryAnnotation) { - this->gpuBinaryAnnotation = gpuBinaryAnnotation.str(); - } + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const 
override; - // Run the dialect converter on the module. - void runOnOperation() override { - // Cache the LLVMDialect for the current module. - llvmDialect = getContext().getRegisteredDialect(); - // Cache the used LLVM types. - initializeCachedTypes(); + llvm::SmallString<32> gpuBinaryAnnotation; +}; - getOperation().walk( - [this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); }); +class EraseGpuModuleOpPattern : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + LogicalResult matchAndRewrite(gpu::GPUModuleOp op, + PatternRewriter &rewriter) const override { // GPU kernel modules are no longer necessary since we have a global // constant with the CUBIN, or HSACO data. - for (auto m : - llvm::make_early_inc_range(getOperation().getOps())) - m.erase(); + rewriter.eraseOp(op); + return success(); } - -private: - LLVM::LLVMDialect *llvmDialect; - LLVM::LLVMType llvmVoidType; - LLVM::LLVMType llvmPointerType; - LLVM::LLVMType llvmPointerPointerType; - LLVM::LLVMType llvmInt8Type; - LLVM::LLVMType llvmInt32Type; - LLVM::LLVMType llvmInt64Type; - LLVM::LLVMType llvmIntPtrType; }; -} // anonymous namespace - -// Adds declarations for the needed helper functions from the runtime wrappers. -// The types in comments give the actual types expected/returned but the API -// uses void pointers. This is fine as they have the same linkage in C. -void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions( - Location loc) { - ModuleOp module = getOperation(); - OpBuilder builder(module.getBody()->getTerminator()); - if (!module.lookupSymbol(kGpuModuleLoadName)) { - builder.create( - loc, kGpuModuleLoadName, - LLVM::LLVMType::getFunctionTy(getPointerType(), - {getPointerType()}, /* void *cubin */ - /*isVarArg=*/false)); - } - if (!module.lookupSymbol(kGpuModuleGetFunctionName)) { - // The helper uses void* instead of CUDA's opaque CUmodule and - // CUfunction, or ROCm (HIP)'s opaque hipModule_t and hipFunction_t. 
- builder.create( - loc, kGpuModuleGetFunctionName, - LLVM::LLVMType::getFunctionTy(getPointerType(), - { - getPointerType(), /* void *module */ - getPointerType() /* char *name */ - }, - /*isVarArg=*/false)); - } - if (!module.lookupSymbol(kGpuLaunchKernelName)) { - // Other than the CUDA or ROCm (HIP) api, the wrappers use uintptr_t to - // match the LLVM type if MLIR's index type, which the GPU dialect uses. - // Furthermore, they use void* instead of CUDA's opaque CUfunction and - // CUstream, or ROCm (HIP)'s opaque hipFunction_t and hipStream_t. - builder.create( - loc, kGpuLaunchKernelName, - LLVM::LLVMType::getFunctionTy( - getVoidType(), - { - getPointerType(), /* void* f */ - getIntPtrType(), /* intptr_t gridXDim */ - getIntPtrType(), /* intptr_t gridyDim */ - getIntPtrType(), /* intptr_t gridZDim */ - getIntPtrType(), /* intptr_t blockXDim */ - getIntPtrType(), /* intptr_t blockYDim */ - getIntPtrType(), /* intptr_t blockZDim */ - getInt32Type(), /* unsigned int sharedMemBytes */ - getPointerType(), /* void *hstream */ - getPointerPointerType(), /* void **kernelParams */ - getPointerPointerType() /* void **extra */ - }, - /*isVarArg=*/false)); - } - if (!module.lookupSymbol(kGpuStreamCreateName)) { - // Helper function to get the current GPU compute stream. Uses void* - // instead of CUDA's opaque CUstream, or ROCm (HIP)'s opaque hipStream_t. 
- builder.create( - loc, kGpuStreamCreateName, - LLVM::LLVMType::getFunctionTy(getPointerType(), /*isVarArg=*/false)); - } - if (!module.lookupSymbol(kGpuStreamSynchronizeName)) { - builder.create( - loc, kGpuStreamSynchronizeName, - LLVM::LLVMType::getFunctionTy(getVoidType(), - {getPointerType()}, /* void *stream */ - /*isVarArg=*/false)); - } - if (!module.lookupSymbol(kGpuMemHostRegisterName)) { - builder.create( - loc, kGpuMemHostRegisterName, - LLVM::LLVMType::getFunctionTy(getVoidType(), - { - getPointerType(), /* void *ptr */ - getInt64Type() /* int64 sizeBytes*/ - }, - /*isVarArg=*/false)); - } +} // namespace + +void GpuLaunchFuncToGpuRuntimeCallsPass::runOnOperation() { + LLVMTypeConverter converter(&getContext()); + OwningRewritePatternList patterns; + populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation); + + LLVMConversionTarget target(getContext()); + if (failed(applyPartialConversion(getOperation(), target, patterns))) + signalPassFailure(); +} + +LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder, + ArrayRef arguments) const { + auto module = builder.getBlock()->getParent()->getParentOfType(); + auto function = [&] { + if (auto function = module.lookupSymbol(functionName)) + return function; + return OpBuilder(module.getBody()->getTerminator()) + .create(loc, functionName, functionType); + }(); + return builder.create( + loc, const_cast(functionType).getFunctionResultType(), + builder.getSymbolRefAttr(function), arguments); } /// Emits the IR with the following structure: @@ -233,28 +200,26 @@ /// %data = llvm.alloca 1 x type-of() /// llvm.store , %data /// %typeErased = llvm.bitcast %data to !llvm<"i8*"> -/// %addr = llvm.getelementptr [] +/// %addr = llvm.getelementptr [] /// llvm.store %typeErased, %addr /// -/// This is necessary to construct the list of arguments passed to the kernel -/// function as accepted by cuLaunchKernel, i.e. 
as a void** that points to list -/// of stack-allocated type-erased pointers to the actual arguments. -void GpuLaunchFuncToGpuRuntimeCallsPass::addParamToList(OpBuilder &builder, - Location loc, - Value param, Value list, - unsigned pos, - Value one) { +/// This is necessary to construct the array of arguments passed to the kernel +/// function as accepted by cuLaunchKernel, i.e. as a void** that points to an +/// array of stack-allocated type-erased pointers to the actual arguments. +void ConvertLaunchFuncOpToGpuRuntimeCallPattern::addParamToArray( + OpBuilder &builder, Location loc, Value param, Value array, unsigned pos, + Value one) const { auto memLocation = builder.create( loc, param.getType().cast().getPointerTo(), one, - /*alignment=*/1); + /*alignment=*/0); builder.create(loc, param, memLocation); auto casted = - builder.create(loc, getPointerType(), memLocation); + builder.create(loc, llvmPointerType, memLocation); - auto index = builder.create(loc, getInt32Type(), + auto index = builder.create(loc, llvmInt32Type, builder.getI32IntegerAttr(pos)); - auto gep = builder.create(loc, getPointerPointerType(), list, - ArrayRef{index}); + auto gep = builder.create(loc, llvmPointerPointerType, array, + index.getResult()); builder.create(loc, casted, gep); } @@ -266,24 +231,16 @@ // for (i : [0, NumKernelOperands)) // %array[i] = cast(KernelOperand[i]) // return %array -Value GpuLaunchFuncToGpuRuntimeCallsPass::setupParamsArray( - gpu::LaunchFuncOp launchOp, OpBuilder &builder) { - - // Get the launch target. 
- auto gpuFunc = SymbolTable::lookupNearestSymbolFrom( - launchOp, launchOp.kernel()); - if (!gpuFunc) - return {}; - - unsigned numArgs = gpuFunc.getNumArguments(); - +Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray( + gpu::LaunchFuncOp launchOp, unsigned numArguments, + OpBuilder &builder) const { auto numKernelOperands = launchOp.getNumKernelOperands(); Location loc = launchOp.getLoc(); - auto one = builder.create(loc, getInt32Type(), + auto one = builder.create(loc, llvmInt32Type, builder.getI32IntegerAttr(1)); auto arraySize = builder.create( - loc, getInt32Type(), builder.getI32IntegerAttr(numArgs)); - auto array = builder.create(loc, getPointerPointerType(), + loc, llvmInt32Type, builder.getI32IntegerAttr(numArguments)); + auto array = builder.create(loc, llvmPointerPointerType, arraySize, /*alignment=*/0); unsigned pos = 0; @@ -295,7 +252,7 @@ // hold anymore then we `launchOp` to lower from MemRefType and not after // LLVMConversion has taken place and the MemRef information is lost. 
if (!llvmType.isStructTy()) { - addParamToList(builder, loc, operand, array, pos++, one); + addParamToArray(builder, loc, operand, array, pos++, one); continue; } @@ -309,7 +266,7 @@ Value elem = builder.create( loc, elemType.getArrayElementType(), operand, builder.getI32ArrayAttr({j, k})); - addParamToList(builder, loc, elem, array, pos++, one); + addParamToArray(builder, loc, elem, array, pos++, one); } } else { assert((elemType.isIntegerTy() || elemType.isFloatTy() || @@ -317,7 +274,7 @@ "expected scalar type"); Value strct = builder.create( loc, elemType, operand, builder.getI32ArrayAttr(j)); - addParamToList(builder, loc, strct, array, pos++, one); + addParamToArray(builder, loc, strct, array, pos++, one); } } } @@ -335,8 +292,9 @@ // %1 = llvm.constant (0 : index) // %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*"> // } -Value GpuLaunchFuncToGpuRuntimeCallsPass::generateKernelNameConstant( - StringRef moduleName, StringRef name, Location loc, OpBuilder &builder) { +Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateKernelNameConstant( + StringRef moduleName, StringRef name, Location loc, + OpBuilder &builder) const { // Make sure the trailing zero is included in the constant. 
std::vector kernelName(name.begin(), name.end()); kernelName.push_back('\0'); @@ -357,93 +315,86 @@ // %2 = // %3 = call %moduleGetFunction(%1, %2) // %4 = call %streamCreate() -// %5 = +// %5 = // call %launchKernel(%3, , 0, %4, %5, nullptr) // call %streamSynchronize(%4) -void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls( - mlir::gpu::LaunchFuncOp launchOp) { - OpBuilder builder(launchOp); - Location loc = launchOp.getLoc(); - declareGpuRuntimeFunctions(loc); +LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite( + Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const { + Location loc = op->getLoc(); + auto launchOp = cast(op); + auto moduleOp = op->getParentOfType(); - auto zero = builder.create(loc, getInt32Type(), - builder.getI32IntegerAttr(0)); // Create an LLVM global with CUBIN extracted from the kernel annotation and // obtain a pointer to the first byte in it. - auto kernelModule = getOperation().lookupSymbol( - launchOp.getKernelModuleName()); + auto kernelModule = + moduleOp.lookupSymbol(launchOp.getKernelModuleName()); assert(kernelModule && "expected a kernel module"); auto binaryAttr = kernelModule.getAttrOfType(gpuBinaryAnnotation); if (!binaryAttr) { kernelModule.emitOpError() << "missing " << gpuBinaryAnnotation << " attribute"; - return signalPassFailure(); + return failure(); } SmallString<128> nameBuffer(kernelModule.getName()); nameBuffer.append(kGpuBinaryStorageSuffix); - Value data = LLVM::createGlobalString( - loc, builder, nameBuffer.str(), binaryAttr.getValue(), - LLVM::Linkage::Internal, getLLVMDialect()); - - // Emit the load module call to load the module data. Error checking is done - // in the called helper function. 
- auto gpuModuleLoad = - getOperation().lookupSymbol(kGpuModuleLoadName); - auto module = builder.create( - loc, ArrayRef{getPointerType()}, - builder.getSymbolRefAttr(gpuModuleLoad), ArrayRef{data}); + Value data = LLVM::createGlobalString(loc, rewriter, nameBuffer.str(), + binaryAttr.getValue(), + LLVM::Linkage::Internal, llvmDialect); + + auto module = moduleLoadCallBuilder.create(loc, rewriter, data); // Get the function from the module. The name corresponds to the name of // the kernel function. auto kernelName = generateKernelNameConstant( - launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, builder); - auto gpuModuleGetFunction = - getOperation().lookupSymbol(kGpuModuleGetFunctionName); - auto function = builder.create( - loc, ArrayRef{getPointerType()}, - builder.getSymbolRefAttr(gpuModuleGetFunction), - ArrayRef{module.getResult(0), kernelName}); + launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, rewriter); + auto function = moduleGetFunctionCallBuilder.create( + loc, rewriter, {module.getResult(0), kernelName}); // Grab the global stream needed for execution. - auto gpuStreamCreate = - getOperation().lookupSymbol(kGpuStreamCreateName); - auto stream = builder.create( - loc, ArrayRef{getPointerType()}, - builder.getSymbolRefAttr(gpuStreamCreate), ArrayRef{}); - // Invoke the function with required arguments. - auto gpuLaunchKernel = - getOperation().lookupSymbol(kGpuLaunchKernelName); - auto paramsArray = setupParamsArray(launchOp, builder); - if (!paramsArray) { - launchOp.emitOpError() << "cannot pass given parameters to the kernel"; - return signalPassFailure(); + auto stream = streamCreateCallBuilder.create(loc, rewriter, {}); + + // Get the launch target. + auto gpuFuncOp = SymbolTable::lookupNearestSymbolFrom( + launchOp, launchOp.kernel()); + if (!gpuFuncOp) { + launchOp.emitOpError() << "corresponding kernel function not found"; + return failure(); } + // Build array of kernel parameters. 
+ auto kernelParams = + generateParamsArray(launchOp, gpuFuncOp.getNumArguments(), rewriter); + + // Invoke the function with required arguments. + auto zero = rewriter.create(loc, llvmInt32Type, + rewriter.getI32IntegerAttr(0)); auto nullpointer = - builder.create(loc, getPointerPointerType(), zero); - builder.create( - loc, ArrayRef{getVoidType()}, - builder.getSymbolRefAttr(gpuLaunchKernel), - ArrayRef{function.getResult(0), launchOp.getOperand(0), - launchOp.getOperand(1), launchOp.getOperand(2), - launchOp.getOperand(3), launchOp.getOperand(4), - launchOp.getOperand(5), zero, /* sharedMemBytes */ - stream.getResult(0), /* stream */ - paramsArray, /* kernel params */ - nullpointer /* extra */}); - // Sync on the stream to make it synchronous. - auto gpuStreamSync = - getOperation().lookupSymbol(kGpuStreamSynchronizeName); - builder.create(loc, ArrayRef{getVoidType()}, - builder.getSymbolRefAttr(gpuStreamSync), - ArrayRef(stream.getResult(0))); - launchOp.erase(); + rewriter.create(loc, llvmPointerPointerType, zero); + launchKernelCallBuilder.create( + loc, rewriter, + {function.getResult(0), launchOp.gridSizeX(), launchOp.gridSizeY(), + launchOp.gridSizeZ(), launchOp.blockSizeX(), launchOp.blockSizeY(), + launchOp.blockSizeZ(), zero, /* sharedMemBytes */ + stream.getResult(0), /* stream */ + kernelParams, /* kernel params */ + nullpointer /* extra */}); + streamSynchronizeCallBuilder.create(loc, rewriter, stream.getResult(0)); + + rewriter.eraseOp(op); + return success(); } std::unique_ptr> mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass( StringRef gpuBinaryAnnotation) { - if (gpuBinaryAnnotation.empty()) - return std::make_unique(); return std::make_unique( gpuBinaryAnnotation); } + +void mlir::populateGpuToLLVMConversionPatterns( + LLVMTypeConverter &converter, OwningRewritePatternList &patterns, + StringRef gpuBinaryAnnotation) { + patterns.insert( + converter, gpuBinaryAnnotation); + patterns.insert(&converter.getContext()); +} diff --git 
a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp --- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp +++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp @@ -110,15 +110,17 @@ PassManager pm(m.getContext()); applyPassManagerCLOptions(pm); + const char gpuBinaryAnnotation[] = "nvvm.cubin"; pm.addPass(createGpuKernelOutliningPass()); auto &kernelPm = pm.nest(); kernelPm.addPass(createStripDebugInfoPass()); kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass()); kernelPm.addPass(createConvertGPUKernelToBlobPass( translateModuleToNVVMIR, compilePtxToCubin, "nvptx64-nvidia-cuda", - "sm_35", "+ptx60", "nvvm.cubin")); + "sm_35", "+ptx60", gpuBinaryAnnotation)); pm.addPass(createLowerToLLVMPass()); - pm.addPass(createConvertGpuLaunchFuncToGpuRuntimeCallsPass()); + pm.addPass( + createConvertGpuLaunchFuncToGpuRuntimeCallsPass(gpuBinaryAnnotation)); return pm.run(m); } diff --git a/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp b/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp --- a/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp +++ b/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp @@ -299,16 +299,17 @@ // Configure target features per ROCm / HIP version. configTargetFeatures(); + const char gpuBinaryAnnotation[] = "rocdl.hsaco"; pm.addPass(createGpuKernelOutliningPass()); auto &kernelPm = pm.nest(); kernelPm.addPass(createStripDebugInfoPass()); kernelPm.addPass(createLowerGpuOpsToROCDLOpsPass()); kernelPm.addPass(createConvertGPUKernelToBlobPass( compileModuleToROCDLIR, compileISAToHsaco, tripleName, targetChip, - features, /*gpuBinaryAnnotation=*/"rocdl.hsaco")); + features, gpuBinaryAnnotation)); pm.addPass(createLowerToLLVMPass()); - pm.addPass(createConvertGpuLaunchFuncToGpuRuntimeCallsPass( - /*gpuBinaryAnnotation=*/"rocdl.hsaco")); + pm.addPass( + createConvertGpuLaunchFuncToGpuRuntimeCallsPass(gpuBinaryAnnotation)); return pm.run(m); }