diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
--- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td
@@ -24,6 +24,12 @@
   let name = "nvvm";
   let cppNamespace = "::mlir::NVVM";
   let dependentDialects = ["LLVM::LLVMDialect"];
+
+  let extraClassDeclaration = [{
+    /// Get the name of the attribute used to annotate external kernel
+    /// functions.
+    static StringRef getKernelFuncAttrName() { return "nvvm.kernel"; }
+  }];
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -24,6 +24,12 @@
   let name = "rocdl";
   let cppNamespace = "::mlir::ROCDL";
   let dependentDialects = ["LLVM::LLVMDialect"];
+
+  let extraClassDeclaration = [{
+    /// Get the name of the attribute used to annotate external kernel
+    /// functions.
+    static StringRef getKernelFuncAttrName() { return "rocdl.kernel"; }
+  }];
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
--- a/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUCommon/CMakeLists.txt
@@ -17,6 +17,7 @@
 add_mlir_conversion_library(MLIRGPUToGPURuntimeTransforms
   ConvertLaunchFuncToRuntimeCalls.cpp
   ConvertKernelFuncToBlob.cpp
+  GPUOpsLowering.cpp
 
   DEPENDS
   MLIRConversionPassIncGen
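Reviewer note: `extraClassDeclaration` is spliced verbatim into the C++ class that mlir-tblgen generates for the dialect, so each target dialect now owns the canonical spelling of its kernel attribute. A rough sketch of the resulting class shape (approximated for illustration; the exact mlir-tblgen output differs in detail):

```cpp
// Approximate shape of the generated NVVM dialect class after this change.
class NVVMDialect : public ::mlir::Dialect {
public:
  static ::llvm::StringRef getDialectNamespace() { return "nvvm"; }

  /// Get the name of the attribute used to annotate external kernel
  /// functions.
  static StringRef getKernelFuncAttrName() { return "nvvm.kernel"; }

  // ... constructor and op/type registration elided ...
};
```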
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
--- a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -11,145 +11,26 @@
 #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
-#include "mlir/Dialect/StandardOps/IR/Ops.h"
-#include "mlir/IR/Builders.h"
-#include "llvm/Support/FormatVariadic.h"
 
 namespace mlir {
 
-template <unsigned AllocaAddrSpace>
 struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
-  using ConvertOpToLLVMPattern<gpu::GPUFuncOp>::ConvertOpToLLVMPattern;
+  GPUFuncOpLowering(LLVMTypeConverter &converter, unsigned allocaAddrSpace,
+                    Identifier kernelAttributeName)
+      : ConvertOpToLLVMPattern<gpu::GPUFuncOp>(converter),
+        allocaAddrSpace(allocaAddrSpace),
+        kernelAttributeName(kernelAttributeName) {}
 
   LogicalResult
   matchAndRewrite(gpu::GPUFuncOp gpuFuncOp, ArrayRef<Value> operands,
-                  ConversionPatternRewriter &rewriter) const override {
-    assert(operands.empty() && "func op is not expected to have operands");
-    Location loc = gpuFuncOp.getLoc();
-
-    SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
-    workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
-    for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
-      Value attribution = en.value();
-
-      auto type = attribution.getType().dyn_cast<MemRefType>();
-      assert(type && type.hasStaticShape() && "unexpected type in attribution");
-
-      uint64_t numElements = type.getNumElements();
-
-      auto elementType = typeConverter->convertType(type.getElementType())
-                             .template cast<Type>();
-      auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
-      std::string name = std::string(
-          llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
-      auto globalOp = rewriter.create<LLVM::GlobalOp>(
-          gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
-          LLVM::Linkage::Internal, name, /*value=*/Attribute(),
-          gpu::GPUDialect::getWorkgroupAddressSpace());
-      workgroupBuffers.push_back(globalOp);
-    }
-
-    // Rewrite the original GPU function to an LLVM function.
-    auto funcType = typeConverter->convertType(gpuFuncOp.getType())
-                        .template cast<LLVM::LLVMPointerType>()
-                        .getElementType();
-
-    // Remap proper input types.
-    TypeConverter::SignatureConversion signatureConversion(
-        gpuFuncOp.front().getNumArguments());
-    getTypeConverter()->convertFunctionSignature(
-        gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);
-
-    // Create the new function operation. Only copy those attributes that are
-    // not specific to function modeling.
-    SmallVector<NamedAttribute, 3> attributes;
-    for (const auto &attr : gpuFuncOp.getAttrs()) {
-      if (attr.first == SymbolTable::getSymbolAttrName() ||
-          attr.first == impl::getTypeAttrName() ||
-          attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
-        continue;
-      attributes.push_back(attr);
-    }
-    auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
-        gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
-        LLVM::Linkage::External, attributes);
-
-    {
-      // Insert operations that correspond to converted workgroup and private
-      // memory attributions to the body of the function. This must operate on
-      // the original function, before the body region is inlined in the new
-      // function to maintain the relation between block arguments and the
-      // parent operation that assigns their semantics.
-      OpBuilder::InsertionGuard guard(rewriter);
-
-      // Rewrite workgroup memory attributions to addresses of global buffers.
-      rewriter.setInsertionPointToStart(&gpuFuncOp.front());
-      unsigned numProperArguments = gpuFuncOp.getNumArguments();
-      auto i32Type = IntegerType::get(rewriter.getContext(), 32);
-
-      Value zero = nullptr;
-      if (!workgroupBuffers.empty())
-        zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
-                                                 rewriter.getI32IntegerAttr(0));
-      for (auto en : llvm::enumerate(workgroupBuffers)) {
-        LLVM::GlobalOp global = en.value();
-        Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
-        auto elementType =
-            global.getType().cast<LLVM::LLVMArrayType>().getElementType();
-        Value memory = rewriter.create<LLVM::GEPOp>(
-            loc, LLVM::LLVMPointerType::get(elementType, global.addr_space()),
-            address, ArrayRef<Value>{zero, zero});
-
-        // Build a memref descriptor pointing to the buffer to plug with the
-        // existing memref infrastructure. This may use more registers than
-        // otherwise necessary given that memref sizes are fixed, but we can try
-        // and canonicalize that away later.
-        Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
-        auto type = attribution.getType().cast<MemRefType>();
-        auto descr = MemRefDescriptor::fromStaticShape(
-            rewriter, loc, *getTypeConverter(), type, memory);
-        signatureConversion.remapInput(numProperArguments + en.index(), descr);
-      }
-
-      // Rewrite private memory attributions to alloca'ed buffers.
-      unsigned numWorkgroupAttributions =
-          gpuFuncOp.getNumWorkgroupAttributions();
-      auto int64Ty = IntegerType::get(rewriter.getContext(), 64);
-      for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
-        Value attribution = en.value();
-        auto type = attribution.getType().cast<MemRefType>();
-        assert(type && type.hasStaticShape() &&
-               "unexpected type in attribution");
-
-        // Explicitly drop memory space when lowering private memory
-        // attributions since NVVM models it as `alloca`s in the default
-        // memory space and does not support `alloca`s with addrspace(5).
-        auto ptrType = LLVM::LLVMPointerType::get(
-            typeConverter->convertType(type.getElementType())
-                .template cast<Type>(),
-            AllocaAddrSpace);
-        Value numElements = rewriter.create<LLVM::ConstantOp>(
-            gpuFuncOp.getLoc(), int64Ty,
-            rewriter.getI64IntegerAttr(type.getNumElements()));
-        Value allocated = rewriter.create<LLVM::AllocaOp>(
-            gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0);
-        auto descr = MemRefDescriptor::fromStaticShape(
-            rewriter, loc, *getTypeConverter(), type, allocated);
-        signatureConversion.remapInput(
-            numProperArguments + numWorkgroupAttributions + en.index(), descr);
-      }
-    }
-
-    // Move the region to the new function, update the entry block signature.
-    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
-                                llvmFuncOp.end());
-    if (failed(rewriter.convertRegionTypes(
-            &llvmFuncOp.getBody(), *typeConverter, &signatureConversion)))
-      return failure();
-
-    rewriter.eraseOp(gpuFuncOp);
-    return success();
-  }
+                  ConversionPatternRewriter &rewriter) const override;
+
+private:
+  /// The address space to use for `alloca`s in private memory.
+  unsigned allocaAddrSpace;
+
+  /// The attribute name to use instead of `gpu.kernel`.
+  Identifier kernelAttributeName;
 };
 
 struct GPUReturnOpLowering : public ConvertOpToLLVMPattern<gpu::ReturnOp> {
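Reviewer note: the `AllocaAddrSpace` template parameter becomes a plain constructor argument, so one compiled pattern now serves every target, and the kernel attribute name travels the same way. Passing an `Identifier` rather than a raw string also means the name is interned once, when the pattern is built, not on every rewritten function. A construction sketch that simply mirrors the NVVM registration site later in this patch (assumes an `MLIRContext context` in scope):

```cpp
#include "GPUOpsLowering.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"

// Configure the shared pattern for the NVVM target: allocas in the default
// address space, kernels tagged with the nvvm.kernel unit attribute.
mlir::LLVMTypeConverter converter(&context);
mlir::GPUFuncOpLowering pattern(
    converter, /*allocaAddrSpace=*/0,
    mlir::Identifier::get(mlir::NVVM::NVVMDialect::getKernelFuncAttrName(),
                          &context));
```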
diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
new file mode 100644
--- /dev/null
+++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -0,0 +1,148 @@
+//===- GPUOpsLowering.cpp - GPU FuncOp / ReturnOp lowering ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "GPUOpsLowering.h"
+#include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/IR/Builders.h"
+#include "llvm/Support/FormatVariadic.h"
+
+using namespace mlir;
+
+LogicalResult
+GPUFuncOpLowering::matchAndRewrite(gpu::GPUFuncOp gpuFuncOp,
+                                   ArrayRef<Value> operands,
+                                   ConversionPatternRewriter &rewriter) const {
+  assert(operands.empty() && "func op is not expected to have operands");
+  Location loc = gpuFuncOp.getLoc();
+
+  SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
+  workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
+  for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
+    Value attribution = en.value();
+
+    auto type = attribution.getType().dyn_cast<MemRefType>();
+    assert(type && type.hasStaticShape() && "unexpected type in attribution");
+
+    uint64_t numElements = type.getNumElements();
+
+    auto elementType = typeConverter->convertType(type.getElementType())
+                           .template cast<Type>();
+    auto arrayType = LLVM::LLVMArrayType::get(elementType, numElements);
+    std::string name = std::string(
+        llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
+    auto globalOp = rewriter.create<LLVM::GlobalOp>(
+        gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
+        LLVM::Linkage::Internal, name, /*value=*/Attribute(),
+        gpu::GPUDialect::getWorkgroupAddressSpace());
+    workgroupBuffers.push_back(globalOp);
+  }
+
+  // Rewrite the original GPU function to an LLVM function.
+  auto funcType = typeConverter->convertType(gpuFuncOp.getType())
+                      .template cast<LLVM::LLVMPointerType>()
+                      .getElementType();
+
+  // Remap proper input types.
+  TypeConverter::SignatureConversion signatureConversion(
+      gpuFuncOp.front().getNumArguments());
+  getTypeConverter()->convertFunctionSignature(
+      gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);
+
+  // Create the new function operation. Only copy those attributes that are
+  // not specific to function modeling.
+  SmallVector<NamedAttribute, 3> attributes;
+  for (const auto &attr : gpuFuncOp.getAttrs()) {
+    if (attr.first == SymbolTable::getSymbolAttrName() ||
+        attr.first == impl::getTypeAttrName() ||
+        attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
+      continue;
+    attributes.push_back(attr);
+  }
+  // Add a dialect-specific kernel attribute in addition to the GPU kernel
+  // attribute. The former is necessary for further translation while the
+  // latter is expected by gpu.launch_func.
+  if (gpuFuncOp.isKernel())
+    attributes.emplace_back(kernelAttributeName, rewriter.getUnitAttr());
+  auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
+      gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
+      LLVM::Linkage::External, attributes);
+
+  {
+    // Insert operations that correspond to converted workgroup and private
+    // memory attributions to the body of the function. This must operate on
+    // the original function, before the body region is inlined in the new
+    // function to maintain the relation between block arguments and the
+    // parent operation that assigns their semantics.
+    OpBuilder::InsertionGuard guard(rewriter);
+
+    // Rewrite workgroup memory attributions to addresses of global buffers.
+    rewriter.setInsertionPointToStart(&gpuFuncOp.front());
+    unsigned numProperArguments = gpuFuncOp.getNumArguments();
+    auto i32Type = IntegerType::get(rewriter.getContext(), 32);
+
+    Value zero = nullptr;
+    if (!workgroupBuffers.empty())
+      zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
+                                               rewriter.getI32IntegerAttr(0));
+    for (auto en : llvm::enumerate(workgroupBuffers)) {
+      LLVM::GlobalOp global = en.value();
+      Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
+      auto elementType =
+          global.getType().cast<LLVM::LLVMArrayType>().getElementType();
+      Value memory = rewriter.create<LLVM::GEPOp>(
+          loc, LLVM::LLVMPointerType::get(elementType, global.addr_space()),
+          address, ArrayRef<Value>{zero, zero});
+
+      // Build a memref descriptor pointing to the buffer to plug with the
+      // existing memref infrastructure. This may use more registers than
+      // otherwise necessary given that memref sizes are fixed, but we can try
+      // and canonicalize that away later.
+      Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
+      auto type = attribution.getType().cast<MemRefType>();
+      auto descr = MemRefDescriptor::fromStaticShape(
+          rewriter, loc, *getTypeConverter(), type, memory);
+      signatureConversion.remapInput(numProperArguments + en.index(), descr);
+    }
+
+    // Rewrite private memory attributions to alloca'ed buffers.
+    unsigned numWorkgroupAttributions = gpuFuncOp.getNumWorkgroupAttributions();
+    auto int64Ty = IntegerType::get(rewriter.getContext(), 64);
+    for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
+      Value attribution = en.value();
+      auto type = attribution.getType().cast<MemRefType>();
+      assert(type && type.hasStaticShape() && "unexpected type in attribution");
+
+      // Explicitly drop memory space when lowering private memory
+      // attributions since NVVM models it as `alloca`s in the default
+      // memory space and does not support `alloca`s with addrspace(5).
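Reviewer note: the behavioral change relative to the old header version is the `isKernel()` branch above, which leaves a kernel carrying two unit attributes. A condensed sketch of the two consumers, using helpers that exist upstream (illustrative, not added by this patch):

```cpp
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"

static void inspectLoweredFunc(mlir::LLVM::LLVMFuncOp func) {
  // gpu.launch_func and its verifier keep keying on `gpu.kernel`...
  bool launchable = mlir::gpu::GPUDialect::isKernel(func);
  // ...while the per-target translation keys on the dialect attribute.
  bool translatedAsKernel =
      static_cast<bool>(func->getAttrOfType<mlir::UnitAttr>(
          mlir::NVVM::NVVMDialect::getKernelFuncAttrName()));
  (void)launchable;
  (void)translatedAsKernel;
}
```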
+      auto ptrType = LLVM::LLVMPointerType::get(
+          typeConverter->convertType(type.getElementType())
+              .template cast<Type>(),
+          allocaAddrSpace);
+      Value numElements = rewriter.create<LLVM::ConstantOp>(
+          gpuFuncOp.getLoc(), int64Ty,
+          rewriter.getI64IntegerAttr(type.getNumElements()));
+      Value allocated = rewriter.create<LLVM::AllocaOp>(
+          gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0);
+      auto descr = MemRefDescriptor::fromStaticShape(
+          rewriter, loc, *getTypeConverter(), type, allocated);
+      signatureConversion.remapInput(
+          numProperArguments + numWorkgroupAttributions + en.index(), descr);
+    }
+  }
+
+  // Move the region to the new function, update the entry block signature.
+  rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
+                              llvmFuncOp.end());
+  if (failed(rewriter.convertRegionTypes(&llvmFuncOp.getBody(), *typeConverter,
+                                         &signatureConversion)))
+    return failure();
+
+  rewriter.eraseOp(gpuFuncOp);
+  return success();
+}
diff --git a/mlir/lib/Conversion/GPUToNVVM/CMakeLists.txt b/mlir/lib/Conversion/GPUToNVVM/CMakeLists.txt
--- a/mlir/lib/Conversion/GPUToNVVM/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUToNVVM/CMakeLists.txt
@@ -11,6 +11,7 @@
 
   LINK_LIBS PUBLIC
   MLIRGPU
+  MLIRGPUToGPURuntimeTransforms
   MLIRLLVMIR
   MLIRNVVMIR
   MLIRPass
diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
--- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
+++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -167,11 +167,16 @@
                                   NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
       GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
                                   NVVM::GridDimYOp, NVVM::GridDimZOp>,
-      GPUShuffleOpLowering, GPUReturnOpLowering,
-      // Explicitly drop memory space when lowering private memory
-      // attributions since NVVM models it as `alloca`s in the default
-      // memory space and does not support `alloca`s with addrspace(5).
-      GPUFuncOpLowering<0>>(converter);
+      GPUShuffleOpLowering, GPUReturnOpLowering>(converter);
+
+  // Explicitly drop memory space when lowering private memory
+  // attributions since NVVM models it as `alloca`s in the default
+  // memory space and does not support `alloca`s with addrspace(5).
+  patterns.insert<GPUFuncOpLowering>(
+      converter, /*allocaAddrSpace=*/0,
+      Identifier::get(NVVM::NVVMDialect::getKernelFuncAttrName(),
+                      &converter.getContext()));
+
   patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__nv_fabsf",
                                                 "__nv_fabs");
   patterns.insert<OpToFuncCallLowering<AtanOp>>(converter, "__nv_atanf",
diff --git a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
--- a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
+++ b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt
@@ -11,6 +11,7 @@
 
   LINK_LIBS PUBLIC
   MLIRGPU
+  MLIRGPUToGPURuntimeTransforms
   MLIRLLVMIR
   MLIRROCDLIR
   MLIRPass
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -103,7 +103,11 @@
                                   ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>,
       GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
                                   ROCDL::GridDimYOp, ROCDL::GridDimZOp>,
-      GPUFuncOpLowering<5>, GPUReturnOpLowering>(converter);
+      GPUReturnOpLowering>(converter);
+  patterns.insert<GPUFuncOpLowering>(
+      converter, /*allocaAddrSpace=*/5,
+      Identifier::get(ROCDL::ROCDLDialect::getKernelFuncAttrName(),
+                      &converter.getContext()));
   patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__ocml_fabs_f32",
                                                 "__ocml_fabs_f64");
   patterns.insert<OpToFuncCallLowering<CeilFOp>>(
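Reviewer note: the two registrations above differ only in their constructor arguments, which is the point of dropping the template parameter. Any out-of-tree target could reuse the pattern the same way; a sketch with an invented `mygpu.kernel` attribute name (hypothetical, not part of this patch):

```cpp
#include "GPUOpsLowering.h"

// Hypothetical third-party target plugging its own kernel attribute and
// alloca address space into the shared pattern.
void populateMyGPUFuncLoweringPattern(
    mlir::LLVMTypeConverter &converter,
    mlir::OwningRewritePatternList &patterns) {
  patterns.insert<mlir::GPUFuncOpLowering>(
      converter, /*allocaAddrSpace=*/0,
      mlir::Identifier::get("mygpu.kernel", &converter.getContext()));
}
```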
diff --git a/mlir/lib/Target/LLVMIR/ConvertToNVVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToNVVMIR.cpp
--- a/mlir/lib/Target/LLVMIR/ConvertToNVVMIR.cpp
+++ b/mlir/lib/Target/LLVMIR/ConvertToNVVMIR.cpp
@@ -77,7 +77,8 @@
   // function as a kernel.
   for (auto func :
        ModuleTranslation::getModuleBody(m).getOps<LLVM::LLVMFuncOp>()) {
-    if (!gpu::GPUDialect::isKernel(func))
+    if (!func->getAttrOfType<UnitAttr>(
+            NVVM::NVVMDialect::getKernelFuncAttrName()))
       continue;
 
     auto *llvmFunc = llvmModule->getFunction(func.getName());
diff --git a/mlir/lib/Target/LLVMIR/ConvertToROCDLIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToROCDLIR.cpp
--- a/mlir/lib/Target/LLVMIR/ConvertToROCDLIR.cpp
+++ b/mlir/lib/Target/LLVMIR/ConvertToROCDLIR.cpp
@@ -87,7 +87,7 @@
   for (auto func :
        ModuleTranslation::getModuleBody(m).getOps<LLVM::LLVMFuncOp>()) {
     if (!func->getAttrOfType<UnitAttr>(
-            gpu::GPUDialect::getKernelFuncAttrName()))
+            ROCDL::ROCDLDialect::getKernelFuncAttrName()))
       continue;
 
     auto *llvmFunc = llvmModule->getFunction(func.getName());
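Reviewer note: for context, the NVVM translation loop above goes on (in lines outside this hunk) to emit the `nvvm.annotations` metadata that makes the NVPTX backend treat the function as a kernel; condensed, the per-function emission looks roughly like this (`llvmFunc` and `llvmModule` as in the hunk):

```cpp
// Mark the function as a kernel in the module-level nvvm.annotations
// metadata, e.g. !{ptr @kernel_func, !"kernel", i32 1}.
llvm::Metadata *llvmMetadata[] = {
    llvm::ValueAsMetadata::get(llvmFunc),
    llvm::MDString::get(llvmModule->getContext(), "kernel"),
    llvm::ValueAsMetadata::get(llvm::ConstantInt::get(
        llvm::Type::getInt32Ty(llvmModule->getContext()), 1))};
llvmModule->getOrInsertNamedMetadata("nvvm.annotations")
    ->addOperand(llvm::MDNode::get(llvmModule->getContext(), llvmMetadata));
```

The ROCDL path instead switches the function to the `amdgpu_kernel` calling convention, which is what the `rocdl.mlir` test below checks.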
diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
--- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
+++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir
@@ -423,3 +423,15 @@
     std.return %result32, %result64 : f32, f64
   }
 }
+
+// -----
+
+gpu.module @test_module {
+  // CHECK-LABEL: @kernel_func
+  // CHECK: attributes
+  // CHECK: gpu.kernel
+  // CHECK: nvvm.kernel
+  gpu.func @kernel_func() kernel {
+    gpu.return
+  }
+}
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -340,3 +340,15 @@
     std.return %result32, %result64 : f32, f64
   }
 }
+
+// -----
+
+gpu.module @test_module {
+  // CHECK-LABEL: @kernel_func
+  // CHECK: attributes
+  // CHECK: gpu.kernel
+  // CHECK: rocdl.kernel
+  gpu.func @kernel_func() kernel {
+    gpu.return
+  }
+}
diff --git a/mlir/test/Target/nvvmir.mlir b/mlir/test/Target/nvvmir.mlir
--- a/mlir/test/Target/nvvmir.mlir
+++ b/mlir/test/Target/nvvmir.mlir
@@ -75,7 +75,7 @@
 // This function has the "kernel" attribute attached and should appear in the
 // NVVM annotations after conversion.
-llvm.func @kernel_func() attributes {gpu.kernel} {
+llvm.func @kernel_func() attributes {nvvm.kernel} {
   llvm.return
 }
diff --git a/mlir/test/Target/rocdl.mlir b/mlir/test/Target/rocdl.mlir
--- a/mlir/test/Target/rocdl.mlir
+++ b/mlir/test/Target/rocdl.mlir
@@ -29,7 +29,7 @@
   llvm.return %1 : i32
 }
 
-llvm.func @kernel_func() attributes {gpu.kernel} {
+llvm.func @kernel_func() attributes {rocdl.kernel} {
   // CHECK-LABEL: amdgpu_kernel void @kernel_func
   llvm.return
 }