diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h --- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h +++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h @@ -45,8 +45,7 @@ /// instead uses a small wrapper library that exports a stable and conveniently /// typed ABI on top of GPU runtimes such as CUDA or ROCm (HIP). std::unique_ptr> -createConvertGpuLaunchFuncToGpuRuntimeCallsPass( - StringRef gpuBinaryAnnotation = ""); +createGpuToLLVMConversionPass(StringRef gpuBinaryAnnotation = ""); /// Collect a set of patterns to convert from the GPU dialect to LLVM. void populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter, diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -82,14 +82,12 @@ // GPUCommon //===----------------------------------------------------------------------===// -def ConvertGpuLaunchFuncToGpuRuntimeCalls : Pass<"launch-func-to-gpu-runtime", - "ModuleOp"> { - let summary = "Convert all launch_func ops to GPU runtime calls"; - let constructor = "mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass()"; +def GpuToLLVMConversionPass : Pass<"gpu-to-llvm", "ModuleOp"> { + let summary = "Convert GPU dialect to LLVM dialect with GPU runtime calls"; + let constructor = "mlir::createGpuToLLVMConversionPass()"; let options = [ Option<"gpuBinaryAnnotation", "gpu-binary-annotation", "std::string", - "\"nvvm.cubin\"", - "Annotation attribute string for GPU binary">, + "", "Annotation attribute string for GPU binary">, ]; } diff --git a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp --- a/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp +++ b/mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp @@ -39,11 +39,10 @@ namespace { -class GpuLaunchFuncToGpuRuntimeCallsPass - : public ConvertGpuLaunchFuncToGpuRuntimeCallsBase< - GpuLaunchFuncToGpuRuntimeCallsPass> { +class GpuToLLVMConversionPass + : public GpuToLLVMConversionPassBase { public: - GpuLaunchFuncToGpuRuntimeCallsPass(StringRef gpuBinaryAnnotation) { + GpuToLLVMConversionPass(StringRef gpuBinaryAnnotation) { if (!gpuBinaryAnnotation.empty()) this->gpuBinaryAnnotation = gpuBinaryAnnotation.str(); } @@ -142,10 +141,8 @@ gpuBinaryAnnotation(gpuBinaryAnnotation) {} private: - void addParamToArray(OpBuilder &builder, Location loc, Value param, - Value array, unsigned pos, Value one) const; - Value generateParamsArray(gpu::LaunchFuncOp launchOp, unsigned numArguments, - OpBuilder &builder) const; + Value generateParamsArray(gpu::LaunchFuncOp launchOp, + ArrayRef operands, OpBuilder &builder) const; Value generateKernelNameConstant(StringRef moduleName, StringRef name, Location loc, OpBuilder &builder) const; @@ -170,9 +167,10 @@ } // namespace -void GpuLaunchFuncToGpuRuntimeCallsPass::runOnOperation() { +void GpuToLLVMConversionPass::runOnOperation() { LLVMTypeConverter converter(&getContext()); OwningRewritePatternList patterns; + populateStdToLLVMConversionPatterns(converter, patterns); populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation); LLVMConversionTarget target(getContext()); @@ -194,91 +192,57 @@ builder.getSymbolRefAttr(function), arguments); } -/// Emits the IR with the following structure: -/// -/// %data = llvm.alloca 1 x type-of() -/// llvm.store , %data -/// %typeErased = llvm.bitcast %data to !llvm<"i8*"> -/// %addr = llvm.getelementptr [] -/// llvm.store %typeErased, %addr -/// -/// This is necessary to construct the array of arguments passed to the kernel -/// function as accepted by cuLaunchKernel, i.e. as a void** that points to -/// array of stack-allocated type-erased pointers to the actual arguments. -void ConvertLaunchFuncOpToGpuRuntimeCallPattern::addParamToArray( - OpBuilder &builder, Location loc, Value param, Value array, unsigned pos, - Value one) const { - auto memLocation = builder.create( - loc, param.getType().cast().getPointerTo(), one, - /*alignment=*/0); - builder.create(loc, param, memLocation); - auto casted = - builder.create(loc, llvmPointerType, memLocation); - - auto index = builder.create(loc, llvmInt32Type, - builder.getI32IntegerAttr(pos)); - auto gep = builder.create(loc, llvmPointerPointerType, array, - index.getResult()); - builder.create(loc, casted, gep); -} - -// Generates a parameters array to be used with a CUDA / ROCm (HIP) kernel -// launch call. The arguments are extracted from the launchOp. +// Creates a struct containing all kernel parameters on the stack and returns +// an array of type-erased pointers to the fields of the struct. The array can +// then be passed to the CUDA / ROCm (HIP) kernel launch calls. // The generated code is essentially as follows: // -// %array = alloca(numparams * sizeof(void *)) -// for (i : [0, NumKernelOperands)) -// %array[i] = cast(KernelOperand[i]) +// %struct = alloca(sizeof(struct { Parameters... })) +// %array = alloca(NumParameters * sizeof(void *)) +// for (i : [0, NumParameters)) +// %fieldPtr = llvm.getelementptr %struct[0, i] +// llvm.store parameters[i], %fieldPtr +// %elementPtr = llvm.getelementptr %array[i] +// llvm.store %fieldPtr, %elementPtr // return %array Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray( - gpu::LaunchFuncOp launchOp, unsigned numArguments, + gpu::LaunchFuncOp launchOp, ArrayRef operands, OpBuilder &builder) const { + auto loc = launchOp.getLoc(); auto numKernelOperands = launchOp.getNumKernelOperands(); - Location loc = launchOp.getLoc(); + auto arguments = typeConverter.promoteOperands( + loc, launchOp.getOperands().take_back(numKernelOperands), + operands.take_back(numKernelOperands), builder); + auto numArguments = arguments.size(); + SmallVector argumentTypes; + argumentTypes.reserve(numArguments); + for (auto argument : arguments) + argumentTypes.push_back(argument.getType().cast()); + auto structType = LLVM::LLVMType::createStructTy(argumentTypes, StringRef()); auto one = builder.create(loc, llvmInt32Type, builder.getI32IntegerAttr(1)); + auto structPtr = builder.create( + loc, structType.getPointerTo(), one, /*alignment=*/0); auto arraySize = builder.create( loc, llvmInt32Type, builder.getI32IntegerAttr(numArguments)); - auto array = builder.create(loc, llvmPointerPointerType, - arraySize, /*alignment=*/0); - - unsigned pos = 0; - for (unsigned idx = 0; idx < numKernelOperands; ++idx) { - auto operand = launchOp.getKernelOperand(idx); - auto llvmType = operand.getType().cast(); - - // Assume all struct arguments come from MemRef. If this assumption does not - // hold anymore then we `launchOp` to lower from MemRefType and not after - // LLVMConversion has taken place and the MemRef information is lost. - if (!llvmType.isStructTy()) { - addParamToArray(builder, loc, operand, array, pos++, one); - continue; - } - - // Put individual components of a memref descriptor into the flat argument - // list. We cannot use unpackMemref from LLVM lowering here because we have - // no access to MemRefType that had been lowered away. - for (int32_t j = 0, ej = llvmType.getStructNumElements(); j < ej; ++j) { - auto elemType = llvmType.getStructElementType(j); - if (elemType.isArrayTy()) { - for (int32_t k = 0, ek = elemType.getArrayNumElements(); k < ek; ++k) { - Value elem = builder.create( - loc, elemType.getArrayElementType(), operand, - builder.getI32ArrayAttr({j, k})); - addParamToArray(builder, loc, elem, array, pos++, one); - } - } else { - assert((elemType.isIntegerTy() || elemType.isFloatTy() || - elemType.isDoubleTy() || elemType.isPointerTy()) && - "expected scalar type"); - Value strct = builder.create( - loc, elemType, operand, builder.getI32ArrayAttr(j)); - addParamToArray(builder, loc, strct, array, pos++, one); - } - } + auto arrayPtr = builder.create(loc, llvmPointerPointerType, + arraySize, /*alignment=*/0); + auto zero = builder.create(loc, llvmInt32Type, + builder.getI32IntegerAttr(0)); + for (auto en : llvm::enumerate(arguments)) { + auto index = builder.create( + loc, llvmInt32Type, builder.getI32IntegerAttr(en.index())); + auto fieldPtr = + builder.create(loc, structType.getPointerTo(), structPtr, + ArrayRef{zero, index.getResult()}); + builder.create(loc, en.value(), fieldPtr); + auto elementPtr = builder.create(loc, llvmPointerPointerType, + arrayPtr, index.getResult()); + auto casted = + builder.create(loc, llvmPointerType, fieldPtr); + builder.create(loc, casted, elementPtr); } - - return array; + return arrayPtr; } // Generates an LLVM IR dialect global that contains the name of the given @@ -305,6 +269,11 @@ LLVM::Linkage::Internal); } +// Returns whether value is of LLVM type. +static bool isLLVMType(Value value) { + return value.getType().isa(); +} + // Emits LLVM IR to launch a kernel function. Expects the module that contains // the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a // hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR. @@ -320,14 +289,17 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite( Operation *op, ArrayRef operands, ConversionPatternRewriter &rewriter) const { - Location loc = op->getLoc(); + if (!llvm::all_of(operands, isLLVMType)) + return rewriter.notifyMatchFailure( + op, "Cannot convert if operands aren't of LLVM type."); + auto launchOp = cast(op); - auto moduleOp = op->getParentOfType(); + Location loc = launchOp.getLoc(); // Create an LLVM global with CUBIN extracted from the kernel annotation and // obtain a pointer to the first byte in it. - auto kernelModule = - moduleOp.lookupSymbol(launchOp.getKernelModuleName()); + auto kernelModule = SymbolTable::lookupNearestSymbolFrom( + launchOp, launchOp.getKernelModuleName()); assert(kernelModule && "expected a kernel module"); auto binaryAttr = kernelModule.getAttrOfType(gpuBinaryAnnotation); @@ -350,25 +322,13 @@ launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, rewriter); auto function = moduleGetFunctionCallBuilder.create( loc, rewriter, {module.getResult(0), kernelName}); - // Grab the global stream needed for execution. - auto stream = streamCreateCallBuilder.create(loc, rewriter, {}); - - // Get the launch target. - auto gpuFuncOp = SymbolTable::lookupNearestSymbolFrom( - launchOp, launchOp.kernel()); - if (!gpuFuncOp) { - launchOp.emitOpError() << "corresponding kernel function not found"; - return failure(); - } - // Build array of kernel parameters. - auto kernelParams = - generateParamsArray(launchOp, gpuFuncOp.getNumArguments(), rewriter); - - // Invoke the function with required arguments. auto zero = rewriter.create(loc, llvmInt32Type, rewriter.getI32IntegerAttr(0)); - auto nullpointer = - rewriter.create(loc, llvmPointerPointerType, zero); + // Grab the global stream needed for execution. + auto stream = streamCreateCallBuilder.create(loc, rewriter, {}); + // Create array of pointers to kernel arguments. + auto kernelParams = generateParamsArray(launchOp, operands, rewriter); + auto nullpointer = rewriter.create(loc, llvmPointerPointerType); launchKernelCallBuilder.create( loc, rewriter, {function.getResult(0), launchOp.gridSizeX(), launchOp.gridSizeY(), @@ -384,10 +344,8 @@ } std::unique_ptr> -mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass( - StringRef gpuBinaryAnnotation) { - return std::make_unique( - gpuBinaryAnnotation); +mlir::createGpuToLLVMConversionPass(StringRef gpuBinaryAnnotation) { + return std::make_unique(gpuBinaryAnnotation); } void mlir::populateGpuToLLVMConversionPatterns( diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt -allow-unregistered-dialect %s --launch-func-to-gpu-runtime="gpu-binary-annotation=nvvm.cubin" | FileCheck %s -// RUN: mlir-opt -allow-unregistered-dialect %s --launch-func-to-gpu-runtime="gpu-binary-annotation=rocdl.hsaco" | FileCheck %s --check-prefix=ROCDL +// RUN: mlir-opt -allow-unregistered-dialect %s --gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" | FileCheck %s +// RUN: mlir-opt -allow-unregistered-dialect %s --gpu-to-llvm="gpu-binary-annotation=rocdl.hsaco" | FileCheck %s --check-prefix=ROCDL module attributes {gpu.container_module} { diff --git a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp --- a/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp +++ b/mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp @@ -118,9 +118,7 @@ kernelPm.addPass(createConvertGPUKernelToBlobPass( translateModuleToNVVMIR, compilePtxToCubin, "nvptx64-nvidia-cuda", "sm_35", "+ptx60", gpuBinaryAnnotation)); - pm.addPass(createLowerToLLVMPass()); - pm.addPass( - createConvertGpuLaunchFuncToGpuRuntimeCallsPass(gpuBinaryAnnotation)); + pm.addPass(createGpuToLLVMConversionPass(gpuBinaryAnnotation)); return pm.run(m); } diff --git a/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp b/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp --- a/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp +++ b/mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp @@ -309,9 +309,7 @@ kernelPm.addPass(createConvertGPUKernelToBlobPass( compileModuleToROCDLIR, compileISAToHsaco, tripleName, targetChip, features, gpuBinaryAnnotation)); - pm.addPass(createLowerToLLVMPass()); - pm.addPass( - createConvertGpuLaunchFuncToGpuRuntimeCallsPass(gpuBinaryAnnotation)); + pm.addPass(createGpuToLLVMConversionPass(gpuBinaryAnnotation)); return pm.run(m); }