diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h @@ -0,0 +1,36 @@ +//===- GPUCommonPass.h - MLIR GPU runtime support -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_ +#define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_ + +#include "mlir/Support/LLVM.h" +#include +#include +#include +#include + +namespace mlir { + +class Location; +class ModuleOp; + +template +class OperationPass; + +/// Creates a pass to convert a gpu.launch_func operation into a sequence of +/// GPU runtime calls. +/// +/// This pass does not generate code to call GPU runtime APIs directly but +/// instead uses a small wrapper library that exports a stable and conveniently +/// typed ABI on top of GPU runtimes such as CUDA or ROCm (HIP). +std::unique_ptr> +createConvertGpuLaunchFuncToGpuRuntimeCallsPass(); + +} // namespace mlir + +#endif // MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_ diff --git a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h --- a/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h +++ b/mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h @@ -45,15 +45,6 @@ std::unique_ptr> createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator); -/// Creates a pass to convert a gpu.launch_func operation into a sequence of -/// CUDA calls. -/// -/// This pass does not generate code to call CUDA directly but instead uses a -/// small wrapper library that exports a stable and conveniently typed ABI -/// on top of CUDA. 
-std::unique_ptr> -createConvertGpuLaunchFuncToCudaCallsPass(); - } // namespace mlir #endif // MLIR_CONVERSION_GPUTOCUDA_GPUTOCUDAPASS_H_ diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -79,12 +79,18 @@ } //===----------------------------------------------------------------------===// -// GPUToCUDA +// GPUCommon //===----------------------------------------------------------------------===// -def ConvertGpuLaunchFuncToCudaCalls : Pass<"launch-func-to-cuda", "ModuleOp"> { - let summary = "Convert all launch_func ops to CUDA runtime calls"; - let constructor = "mlir::createConvertGpuLaunchFuncToCudaCallsPass()"; +def ConvertGpuLaunchFuncToGpuRuntimeCalls : Pass<"launch-func-to-gpu-runtime", + "ModuleOp"> { + let summary = "Convert all launch_func ops to GPU runtime calls"; + let constructor = "mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass()"; + let options = [ + Option<"gpuBinaryAnnotation", "gpu-binary-annotation", "std::string", + "\"nvvm.cubin\"", + "Annotation attribute string for GPU binary">, + ]; } //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/InitAllPasses.h b/mlir/include/mlir/InitAllPasses.h --- a/mlir/include/mlir/InitAllPasses.h +++ b/mlir/include/mlir/InitAllPasses.h @@ -15,6 +15,7 @@ #define MLIR_INITALLPASSES_H_ #include "mlir/Conversion/AVX512ToLLVM/ConvertAVX512ToLLVM.h" +#include "mlir/Conversion/GPUCommon/GPUCommonPass.h" #include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h" #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h" diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -1,5 +1,6 @@ add_subdirectory(AffineToStandard) add_subdirectory(AVX512ToLLVM) 
+add_subdirectory(GPUCommon) add_subdirectory(GPUToCUDA) add_subdirectory(GPUToNVVM) add_subdirectory(GPUToROCDL) diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt @@ -0,0 +1,21 @@ +set(SOURCES + ConvertLaunchFuncToRuntimeCalls.cpp +) + +add_mlir_conversion_library(MLIRGPUtoGPURuntimeTransforms + ${SOURCES} + + DEPENDS + MLIRConversionPassIncGen + intrinsics_gen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MLIRGPU + MLIRIR + MLIRLLVMIR + MLIRPass + MLIRSupport +) diff --git a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp rename from mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp rename to mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp --- a/mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp +++ b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp @@ -1,4 +1,4 @@ -//===- ConvertLaunchFuncToCudaCalls.cpp - MLIR CUDA lowering passes -------===// +//===- ConvertLaunchFuncToRuntimeCalls.cpp - MLIR GPU lowering passes -----===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -7,13 +7,13 @@ //===----------------------------------------------------------------------===// // // This file implements a pass to convert gpu.launch_func op into a sequence of -// CUDA runtime calls. As the CUDA runtime does not have a stable published ABI, -// this pass uses a slim runtime layer that builds on top of the public API from -// the CUDA headers. +// GPU runtime calls. As most GPU runtimes do not have a stable published +// ABI, this pass uses a slim runtime layer that builds on top of the public +// API from GPU runtime headers. 
// //===----------------------------------------------------------------------===// -#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h" +#include "mlir/Conversion/GPUCommon/GPUCommonPass.h" #include "../PassDetail.h" #include "mlir/Dialect/GPU/GPUDialect.h" @@ -35,33 +35,34 @@ using namespace mlir; // To avoid name mangling, these are defined in the mini-runtime file. -static constexpr const char *cuModuleLoadName = "mcuModuleLoad"; -static constexpr const char *cuModuleGetFunctionName = "mcuModuleGetFunction"; -static constexpr const char *cuLaunchKernelName = "mcuLaunchKernel"; -static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper"; -static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize"; -static constexpr const char *kMcuMemHostRegister = "mcuMemHostRegister"; - -static constexpr const char *kCubinAnnotation = "nvvm.cubin"; -static constexpr const char *kCubinStorageSuffix = "_cubin_cst"; +static constexpr const char *kGpuModuleLoadName = "mgpuModuleLoad"; +static constexpr const char *kGpuModuleGetFunctionName = + "mgpuModuleGetFunction"; +static constexpr const char *kGpuLaunchKernelName = "mgpuLaunchKernel"; +static constexpr const char *kGpuGetStreamHelperName = "mgpuGetStreamHelper"; +static constexpr const char *kGpuStreamSynchronizeName = + "mgpuStreamSynchronize"; +static constexpr const char *kGpuMemHostRegisterName = "mgpuMemHostRegister"; +static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst"; namespace { -/// A pass to convert gpu.launch_func operations into a sequence of CUDA -/// runtime calls. +/// A pass to convert gpu.launch_func operations into a sequence of GPU +/// runtime calls. Currently it supports CUDA and ROCm (HIP). 
/// /// In essence, a gpu.launch_func operations gets compiled into the following /// sequence of runtime calls: /// -/// * mcuModuleLoad -- loads the module given the cubin data -/// * mcuModuleGetFunction -- gets a handle to the actual kernel function -/// * mcuGetStreamHelper -- initializes a new CUDA stream -/// * mcuLaunchKernelName -- launches the kernel on a stream -/// * mcuStreamSynchronize -- waits for operations on the stream to finish +/// * moduleLoad -- loads the module given the cubin / hsaco data +/// * moduleGetFunction -- gets a handle to the actual kernel function +/// * getStreamHelper -- initializes a new compute stream on GPU +/// * launchKernel -- launches the kernel on a stream +/// * streamSynchronize -- waits for operations on the stream to finish /// /// Intermediate data structures are allocated on the stack. -class GpuLaunchFuncToCudaCallsPass - : public ConvertGpuLaunchFuncToCudaCallsBase { +class GpuLaunchFuncToGpuRuntimeCallsPass + : public ConvertGpuLaunchFuncToGpuRuntimeCallsBase< + GpuLaunchFuncToGpuRuntimeCallsPass> { private: LLVM::LLVMDialect *getLLVMDialect() { return llvmDialect; } @@ -99,8 +100,9 @@ getLLVMDialect(), module.getDataLayout().getPointerSizeInBits()); } - LLVM::LLVMType getCUResultType() { - // This is declared as an enum in CUDA but helpers use i32. + LLVM::LLVMType getGpuRuntimeResultType() { + // This is declared as an enum in both CUDA and ROCm (HIP), but helpers + // use i32. return getInt32Type(); } @@ -112,7 +114,7 @@ /*alignment=*/0); } - void declareCudaFunctions(Location loc); + void declareGpuRuntimeFunctions(Location loc); void addParamToList(OpBuilder &builder, Location loc, Value param, Value list, unsigned pos, Value one); Value setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder); @@ -132,7 +134,7 @@ [this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); }); // GPU kernel modules are no longer necessary since we have a global - // constant with the CUBIN data. 
+ // constant with the CUBIN, or HSACO data. for (auto m : llvm::make_early_inc_range(getOperation().getOps())) m.erase(); @@ -151,30 +153,31 @@ } // anonymous namespace -// Adds declarations for the needed helper functions from the CUDA wrapper. +// Adds declarations for the needed helper functions from the runtime wrappers. // The types in comments give the actual types expected/returned but the API // uses void pointers. This is fine as they have the same linkage in C. -void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) { +void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions( + Location loc) { ModuleOp module = getOperation(); OpBuilder builder(module.getBody()->getTerminator()); - if (!module.lookupSymbol(cuModuleLoadName)) { + if (!module.lookupSymbol(kGpuModuleLoadName)) { builder.create( - loc, cuModuleLoadName, + loc, kGpuModuleLoadName, LLVM::LLVMType::getFunctionTy( - getCUResultType(), + getGpuRuntimeResultType(), { getPointerPointerType(), /* CUmodule *module */ getPointerType() /* void *cubin */ }, /*isVarArg=*/false)); } - if (!module.lookupSymbol(cuModuleGetFunctionName)) { + if (!module.lookupSymbol(kGpuModuleGetFunctionName)) { // The helper uses void* instead of CUDA's opaque CUmodule and - // CUfunction. + // CUfunction, or ROCm (HIP)'s opaque hipModule_t and hipFunction_t. builder.create( - loc, cuModuleGetFunctionName, + loc, kGpuModuleGetFunctionName, LLVM::LLVMType::getFunctionTy( - getCUResultType(), + getGpuRuntimeResultType(), { getPointerPointerType(), /* void **function */ getPointerType(), /* void *module */ @@ -182,15 +185,15 @@ }, /*isVarArg=*/false)); } - if (!module.lookupSymbol(cuLaunchKernelName)) { - // Other than the CUDA api, the wrappers use uintptr_t to match the - // LLVM type if MLIR's index type, which the GPU dialect uses. 
+ if (!module.lookupSymbol(kGpuLaunchKernelName)) { + // Unlike the CUDA or ROCm (HIP) API, the wrappers use intptr_t to + // match the LLVM type of MLIR's index type, which the GPU dialect uses. // Furthermore, they use void* instead of CUDA's opaque CUfunction and - // CUstream. + // CUstream, or ROCm (HIP)'s opaque hipFunction_t and hipStream_t. builder.create( - loc, cuLaunchKernelName, + loc, kGpuLaunchKernelName, LLVM::LLVMType::getFunctionTy( - getCUResultType(), + getGpuRuntimeResultType(), { getPointerType(), /* void* f */ getIntPtrType(), /* intptr_t gridXDim */ @@ -206,23 +209,23 @@ }, /*isVarArg=*/false)); } - if (!module.lookupSymbol(cuGetStreamHelperName)) { - // Helper function to get the current CUDA stream. Uses void* instead of - // CUDAs opaque CUstream. + if (!module.lookupSymbol(kGpuGetStreamHelperName)) { + // Helper function to get the current GPU compute stream. Uses void* + // instead of CUDA's opaque CUstream, or ROCm (HIP)'s opaque hipStream_t. builder.create( - loc, cuGetStreamHelperName, + loc, kGpuGetStreamHelperName, LLVM::LLVMType::getFunctionTy(getPointerType(), /*isVarArg=*/false)); } - if (!module.lookupSymbol(cuStreamSynchronizeName)) { + if (!module.lookupSymbol(kGpuStreamSynchronizeName)) { builder.create( - loc, cuStreamSynchronizeName, - LLVM::LLVMType::getFunctionTy(getCUResultType(), + loc, kGpuStreamSynchronizeName, + LLVM::LLVMType::getFunctionTy(getGpuRuntimeResultType(), getPointerType() /* CUstream stream */, /*isVarArg=*/false)); } - if (!module.lookupSymbol(kMcuMemHostRegister)) { + if (!module.lookupSymbol(kGpuMemHostRegisterName)) { builder.create( - loc, kMcuMemHostRegister, + loc, kGpuMemHostRegisterName, LLVM::LLVMType::getFunctionTy(getVoidType(), { getPointerType(), /* void *ptr */ @@ -243,10 +246,11 @@ /// This is necessary to construct the list of arguments passed to the kernel /// function as accepted by cuLaunchKernel, i.e. 
as a void** that points to list /// of stack-allocated type-erased pointers to the actual arguments. -void GpuLaunchFuncToCudaCallsPass::addParamToList(OpBuilder &builder, - Location loc, Value param, - Value list, unsigned pos, - Value one) { +void GpuLaunchFuncToGpuRuntimeCallsPass::addParamToList(OpBuilder &builder, + Location loc, + Value param, Value list, + unsigned pos, + Value one) { auto memLocation = builder.create( loc, param.getType().cast().getPointerTo(), one, /*alignment=*/1); @@ -261,16 +265,16 @@ builder.create(loc, casted, gep); } -// Generates a parameters array to be used with a CUDA kernel launch call. The -// arguments are extracted from the launchOp. +// Generates a parameters array to be used with a CUDA / ROCm (HIP) kernel +// launch call. The arguments are extracted from the launchOp. // The generated code is essentially as follows: // // %array = alloca(numparams * sizeof(void *)) // for (i : [0, NumKernelOperands)) // %array[i] = cast(KernelOperand[i]) // return %array -Value GpuLaunchFuncToCudaCallsPass::setupParamsArray(gpu::LaunchFuncOp launchOp, - OpBuilder &builder) { +Value GpuLaunchFuncToGpuRuntimeCallsPass::setupParamsArray( + gpu::LaunchFuncOp launchOp, OpBuilder &builder) { // Get the launch target. auto gpuFunc = SymbolTable::lookupNearestSymbolFrom( @@ -338,7 +342,7 @@ // %1 = llvm.constant (0 : index) // %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*"> // } -Value GpuLaunchFuncToCudaCallsPass::generateKernelNameConstant( +Value GpuLaunchFuncToGpuRuntimeCallsPass::generateKernelNameConstant( StringRef moduleName, StringRef name, Location loc, OpBuilder &builder) { // Make sure the trailing zero is included in the constant. std::vector kernelName(name.begin(), name.end()); @@ -352,30 +356,26 @@ } // Emits LLVM IR to launch a kernel function. Expects the module that contains -// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute of the -// kernel function in the IR. 
-// While MLIR has no global constants, also expects a cubin getter function in -// an 'nvvm.cubingetter' attribute. Such function is expected to return a -// pointer to the cubin blob when invoked. -// With these given, the generated code in essence is +// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a +// hsaco in the 'rocdl.hsaco' attribute of the kernel module in the IR. // -// %0 = call %cubingetter +// %0 = call %binarygetter // %1 = alloca sizeof(void*) -// call %mcuModuleLoad(%2, %1) +// call %moduleLoad(%1, %0) // %2 = alloca sizeof(void*) // %3 = load %1 // %4 = -// call %mcuModuleGetFunction(%2, %3, %4) -// %5 = call %mcuGetStreamHelper() +// call %moduleGetFunction(%2, %3, %4) +// %5 = call %getStreamHelper() // %6 = load %2 // %7 = -// call %mcuLaunchKernel(%6, , 0, %5, %7, nullptr) -// call %mcuStreamSynchronize(%5) -void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls( +// call %launchKernel(%6, , 0, %5, %7, nullptr) +// call %streamSynchronize(%5) +void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls( mlir::gpu::LaunchFuncOp launchOp) { OpBuilder builder(launchOp); Location loc = launchOp.getLoc(); - declareCudaFunctions(loc); + declareGpuRuntimeFunctions(loc); auto zero = builder.create(loc, getInt32Type(), builder.getI32IntegerAttr(0)); @@ -385,51 +385,51 @@ launchOp.getKernelModuleName()); assert(kernelModule && "expected a kernel module"); - auto cubinAttr = kernelModule.getAttrOfType(kCubinAnnotation); - if (!cubinAttr) { + auto binaryAttr = kernelModule.getAttrOfType(gpuBinaryAnnotation); + if (!binaryAttr) { kernelModule.emitOpError() - << "missing " << kCubinAnnotation << " attribute"; + << "missing " << gpuBinaryAnnotation << " attribute"; return signalPassFailure(); } SmallString<128> nameBuffer(kernelModule.getName()); - nameBuffer.append(kCubinStorageSuffix); + nameBuffer.append(kGpuBinaryStorageSuffix); Value data = LLVM::createGlobalString( - loc, builder, nameBuffer.str(), 
cubinAttr.getValue(), + loc, builder, nameBuffer.str(), binaryAttr.getValue(), LLVM::Linkage::Internal, getLLVMDialect()); // Emit the load module call to load the module data. Error checking is done // in the called helper function. - auto cuModule = allocatePointer(builder, loc); - auto cuModuleLoad = - getOperation().lookupSymbol(cuModuleLoadName); - builder.create(loc, ArrayRef{getCUResultType()}, - builder.getSymbolRefAttr(cuModuleLoad), - ArrayRef{cuModule, data}); + auto gpuModule = allocatePointer(builder, loc); + auto gpuModuleLoad = + getOperation().lookupSymbol(kGpuModuleLoadName); + builder.create(loc, ArrayRef{getGpuRuntimeResultType()}, + builder.getSymbolRefAttr(gpuModuleLoad), + ArrayRef{gpuModule, data}); // Get the function from the module. The name corresponds to the name of // the kernel function. - auto cuOwningModuleRef = - builder.create(loc, getPointerType(), cuModule); + auto gpuOwningModuleRef = + builder.create(loc, getPointerType(), gpuModule); auto kernelName = generateKernelNameConstant( launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, builder); - auto cuFunction = allocatePointer(builder, loc); - auto cuModuleGetFunction = - getOperation().lookupSymbol(cuModuleGetFunctionName); + auto gpuFunction = allocatePointer(builder, loc); + auto gpuModuleGetFunction = + getOperation().lookupSymbol(kGpuModuleGetFunctionName); builder.create( - loc, ArrayRef{getCUResultType()}, - builder.getSymbolRefAttr(cuModuleGetFunction), - ArrayRef{cuFunction, cuOwningModuleRef, kernelName}); + loc, ArrayRef{getGpuRuntimeResultType()}, + builder.getSymbolRefAttr(gpuModuleGetFunction), + ArrayRef{gpuFunction, gpuOwningModuleRef, kernelName}); // Grab the global stream needed for execution. 
- auto cuGetStreamHelper = - getOperation().lookupSymbol(cuGetStreamHelperName); - auto cuStream = builder.create( + auto gpuGetStreamHelper = + getOperation().lookupSymbol(kGpuGetStreamHelperName); + auto gpuStream = builder.create( loc, ArrayRef{getPointerType()}, - builder.getSymbolRefAttr(cuGetStreamHelper), ArrayRef{}); + builder.getSymbolRefAttr(gpuGetStreamHelper), ArrayRef{}); // Invoke the function with required arguments. - auto cuLaunchKernel = - getOperation().lookupSymbol(cuLaunchKernelName); - auto cuFunctionRef = - builder.create(loc, getPointerType(), cuFunction); + auto gpuLaunchKernel = + getOperation().lookupSymbol(kGpuLaunchKernelName); + auto gpuFunctionRef = + builder.create(loc, getPointerType(), gpuFunction); auto paramsArray = setupParamsArray(launchOp, builder); if (!paramsArray) { launchOp.emitOpError() << "cannot pass given parameters to the kernel"; @@ -438,25 +438,25 @@ auto nullpointer = builder.create(loc, getPointerPointerType(), zero); builder.create( - loc, ArrayRef{getCUResultType()}, - builder.getSymbolRefAttr(cuLaunchKernel), - ArrayRef{cuFunctionRef, launchOp.getOperand(0), + loc, ArrayRef{getGpuRuntimeResultType()}, + builder.getSymbolRefAttr(gpuLaunchKernel), + ArrayRef{gpuFunctionRef, launchOp.getOperand(0), launchOp.getOperand(1), launchOp.getOperand(2), launchOp.getOperand(3), launchOp.getOperand(4), launchOp.getOperand(5), zero, /* sharedMemBytes */ - cuStream.getResult(0), /* stream */ + gpuStream.getResult(0), /* stream */ paramsArray, /* kernel params */ nullpointer /* extra */}); // Sync on the stream to make it synchronous. 
- auto cuStreamSync = - getOperation().lookupSymbol(cuStreamSynchronizeName); - builder.create(loc, ArrayRef{getCUResultType()}, - builder.getSymbolRefAttr(cuStreamSync), - ArrayRef(cuStream.getResult(0))); + auto gpuStreamSync = + getOperation().lookupSymbol(kGpuStreamSynchronizeName); + builder.create(loc, ArrayRef{getGpuRuntimeResultType()}, + builder.getSymbolRefAttr(gpuStreamSync), + ArrayRef(gpuStream.getResult(0))); launchOp.erase(); } std::unique_ptr> -mlir::createConvertGpuLaunchFuncToCudaCallsPass() { - return std::make_unique(); +mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass() { + return std::make_unique(); } diff --git a/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt b/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt --- a/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt @@ -2,12 +2,7 @@ ConvertKernelFuncToCubin.cpp ) -set(SOURCES - ConvertLaunchFuncToCudaCalls.cpp -) - if (MLIR_CUDA_CONVERSIONS_ENABLED) - list(APPEND SOURCES "ConvertKernelFuncToCubin.cpp") set(NVPTX_LIBS MC NVPTXCodeGen @@ -15,25 +10,26 @@ NVPTXInfo ) -endif() - -add_mlir_conversion_library(MLIRGPUtoCUDATransforms - ${SOURCES} + add_mlir_conversion_library(MLIRGPUtoCUDATransforms + ConvertKernelFuncToCubin.cpp - DEPENDS - MLIRConversionPassIncGen - intrinsics_gen + DEPENDS + MLIRConversionPassIncGen + intrinsics_gen - LINK_COMPONENTS - Core - ${NVPTX_LIBS} + LINK_COMPONENTS + Core + ${NVPTX_LIBS} - LINK_LIBS PUBLIC - MLIRGPU - MLIRIR - MLIRLLVMIR - MLIRNVVMIR - MLIRPass - MLIRSupport - MLIRTargetNVVMIR -) + LINK_LIBS PUBLIC + MLIRGPU + MLIRIR + MLIRLLVMIR + MLIRNVVMIR + MLIRPass + MLIRSupport + MLIRTargetNVVMIR + ) +else() + add_library(MLIRGPUtoCUDATransforms INTERFACE IMPORTED GLOBAL) +endif() diff --git a/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir rename from mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir rename to 
mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir @@ -1,11 +1,13 @@ -// RUN: mlir-opt -allow-unregistered-dialect %s --launch-func-to-cuda | FileCheck %s +// RUN: mlir-opt -allow-unregistered-dialect %s --launch-func-to-gpu-runtime="gpu-binary-annotation=nvvm.cubin" | FileCheck %s +// RUN: mlir-opt -allow-unregistered-dialect %s --launch-func-to-gpu-runtime="gpu-binary-annotation=rocdl.hsaco" | FileCheck %s --check-prefix=ROCDL module attributes {gpu.container_module} { // CHECK: llvm.mlir.global internal constant @[[kernel_name:.*]]("kernel\00") // CHECK: llvm.mlir.global internal constant @[[global:.*]]("CUBIN") + // ROCDL: llvm.mlir.global internal constant @[[global:.*]]("HSACO") - gpu.module @kernel_module attributes {nvvm.cubin = "CUBIN"} { + gpu.module @kernel_module attributes {nvvm.cubin = "CUBIN", rocdl.hsaco = "HSACO"} { llvm.func @kernel(%arg0: !llvm.float, %arg1: !llvm<"float*">) attributes {gpu.kernel} { llvm.return } @@ -18,15 +20,15 @@ // CHECK: %[[addressof:.*]] = llvm.mlir.addressof @[[global]] // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) - // CHECK: %[[cubin_ptr:.*]] = llvm.getelementptr %[[addressof]][%[[c0]], %[[c0]]] + // CHECK: %[[binary_ptr:.*]] = llvm.getelementptr %[[addressof]][%[[c0]], %[[c0]]] // CHECK-SAME: -> !llvm<"i8*"> // CHECK: %[[module_ptr:.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**"> - // CHECK: llvm.call @mcuModuleLoad(%[[module_ptr]], %[[cubin_ptr]]) : (!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32 + // CHECK: llvm.call @mgpuModuleLoad(%[[module_ptr]], %[[binary_ptr]]) : (!llvm<"i8**">, !llvm<"i8*">) -> !llvm.i32 // CHECK: %[[func_ptr:.*]] = llvm.alloca {{.*}} x !llvm<"i8*"> : (!llvm.i32) -> !llvm<"i8**"> - // CHECK: llvm.call @mcuModuleGetFunction(%[[func_ptr]], {{.*}}, {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">, !llvm<"i8*">) 
-> !llvm.i32 - // CHECK: llvm.call @mcuGetStreamHelper - // CHECK: llvm.call @mcuLaunchKernel - // CHECK: llvm.call @mcuStreamSynchronize + // CHECK: llvm.call @mgpuModuleGetFunction(%[[func_ptr]], {{.*}}, {{.*}}) : (!llvm<"i8**">, !llvm<"i8*">, !llvm<"i8*">) -> !llvm.i32 + // CHECK: llvm.call @mgpuGetStreamHelper + // CHECK: llvm.call @mgpuLaunchKernel + // CHECK: llvm.call @mgpuStreamSynchronize "gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1) { kernel = @kernel_module::@kernel } : (!llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.float, !llvm<"float*">) -> () diff --git a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp --- a/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp +++ b/mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp @@ -30,15 +30,15 @@ } } // anonymous namespace -extern "C" int32_t mcuModuleLoad(void **module, void *data) { +extern "C" int32_t mgpuModuleLoad(void **module, void *data) { int32_t err = reportErrorIfAny( cuModuleLoadData(reinterpret_cast(module), data), "ModuleLoad"); return err; } -extern "C" int32_t mcuModuleGetFunction(void **function, void *module, - const char *name) { +extern "C" int32_t mgpuModuleGetFunction(void **function, void *module, + const char *name) { return reportErrorIfAny( cuModuleGetFunction(reinterpret_cast(function), reinterpret_cast(module), name), @@ -48,11 +48,11 @@ // The wrapper uses intptr_t instead of CUDA's unsigned int to match // the type of MLIR's index type. This avoids the need for casts in the // generated MLIR code. 
-extern "C" int32_t mcuLaunchKernel(void *function, intptr_t gridX, - intptr_t gridY, intptr_t gridZ, - intptr_t blockX, intptr_t blockY, - intptr_t blockZ, int32_t smem, void *stream, - void **params, void **extra) { +extern "C" int32_t mgpuLaunchKernel(void *function, intptr_t gridX, + intptr_t gridY, intptr_t gridZ, + intptr_t blockX, intptr_t blockY, + intptr_t blockZ, int32_t smem, void *stream, + void **params, void **extra) { return reportErrorIfAny( cuLaunchKernel(reinterpret_cast(function), gridX, gridY, gridZ, blockX, blockY, blockZ, smem, @@ -60,13 +60,13 @@ "LaunchKernel"); } -extern "C" void *mcuGetStreamHelper() { +extern "C" void *mgpuGetStreamHelper() { CUstream stream; reportErrorIfAny(cuStreamCreate(&stream, CU_STREAM_DEFAULT), "StreamCreate"); return stream; } -extern "C" int32_t mcuStreamSynchronize(void *stream) { +extern "C" int32_t mgpuStreamSynchronize(void *stream) { return reportErrorIfAny( cuStreamSynchronize(reinterpret_cast(stream)), "StreamSync"); } @@ -75,7 +75,7 @@ // Allows to register byte array with the CUDA runtime. Helpful until we have // transfer functions implemented. 
-extern "C" void mcuMemHostRegister(void *ptr, uint64_t sizeBytes) { +extern "C" void mgpuMemHostRegister(void *ptr, uint64_t sizeBytes) { reportErrorIfAny(cuMemHostRegister(ptr, sizeBytes, /*flags=*/0), "MemHostRegister"); } @@ -99,7 +99,7 @@ assert(strides == llvm::makeArrayRef(denseStrides)); std::fill_n(pointer, count, value); - mcuMemHostRegister(pointer, count * sizeof(T)); + mgpuMemHostRegister(pointer, count * sizeof(T)); } extern "C" void mcuMemHostRegisterFloat(int64_t rank, void *ptr) { diff --git a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp --- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp +++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/STLExtras.h" +#include "mlir/Conversion/GPUCommon/GPUCommonPass.h" #include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h" #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" @@ -115,7 +116,7 @@ kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass()); kernelPm.addPass(createConvertGPUKernelToCubinPass(&compilePtxToCubin)); pm.addPass(createLowerToLLVMPass()); - pm.addPass(createConvertGpuLaunchFuncToCudaCallsPass()); + pm.addPass(createConvertGpuLaunchFuncToGpuRuntimeCallsPass()); return pm.run(m); }