diff --git a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h --- a/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h +++ b/mlir/include/mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h @@ -11,11 +11,19 @@ #include namespace mlir { +class LLVMTypeConverter; +class OwningRewritePatternList; + +template +class OperationPass; namespace gpu { class GPUModuleOp; } // namespace gpu -template class OperationPass; + +/// Collect a set of patterns to convert from the GPU dialect to ROCDL. +void populateGpuToROCDLConversionPatterns(LLVMTypeConverter &converter, + OwningRewritePatternList &patterns); /// Creates a pass that lowers GPU dialect operations to ROCDL counterparts. std::unique_ptr> diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -87,5 +87,19 @@ def ROCDL_GridDimZOp : ROCDL_DeviceFunctionOp<"grid.dim.z", "__ockl_get_global_size", 2>; +//===----------------------------------------------------------------------===// +// Synchronization primitives + +def ROCDL_BarrierOp : ROCDL_Op<"barrier"> { + string llvmBuilder = [{ + llvm::LLVMContext &llvmContext = builder.getContext(); + builder.CreateFence(llvm::AtomicOrdering::Release, + llvmContext.getOrInsertSyncScopeID("workgroup")); + createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_s_barrier); + builder.CreateFence(llvm::AtomicOrdering::Acquire, + llvmContext.getOrInsertSyncScopeID("workgroup")); + }]; + let assemblyFormat = "attr-dict"; +} #endif // ROCDLIR_OPS diff --git a/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h new file mode 100644 --- /dev/null +++ b/mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h @@ -0,0 +1,171 @@ +//===- GPUOpsLowering.h - GPU FuncOp / ReturnOp lowering -------*- C++ -*--===// +// +// Part of the LLVM Project, under 
the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_ +#define MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_ + +#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" +#include "mlir/Dialect/GPU/GPUDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/StandardOps/IR/Ops.h" +#include "mlir/IR/Builders.h" + +namespace mlir { + +template +struct GPUFuncOpLowering : ConvertToLLVMPattern { + explicit GPUFuncOpLowering(LLVMTypeConverter &typeConverter) + : ConvertToLLVMPattern(gpu::GPUFuncOp::getOperationName(), + typeConverter.getDialect()->getContext(), + typeConverter) {} + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + assert(operands.empty() && "func op is not expected to have operands"); + auto gpuFuncOp = cast(op); + Location loc = gpuFuncOp.getLoc(); + + SmallVector workgroupBuffers; + workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions()); + for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) { + Value attribution = en.value(); + + auto type = attribution.getType().dyn_cast(); + assert(type && type.hasStaticShape() && "unexpected type in attribution"); + + uint64_t numElements = type.getNumElements(); + + auto elementType = typeConverter.convertType(type.getElementType()) + .cast(); + auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements); + std::string name = std::string( + llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index())); + auto globalOp = rewriter.create( + gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false, + LLVM::Linkage::Internal, name, /*value=*/Attribute(), + gpu::GPUDialect::getWorkgroupAddressSpace()); + 
workgroupBuffers.push_back(globalOp); + } + + // Rewrite the original GPU function to an LLVM function. + auto funcType = typeConverter.convertType(gpuFuncOp.getType()) + .cast() + .getPointerElementTy(); + + // Remap proper input types. + TypeConverter::SignatureConversion signatureConversion( + gpuFuncOp.front().getNumArguments()); + typeConverter.convertFunctionSignature( + gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion); + + // Create the new function operation. Only copy those attributes that are + // not specific to function modeling. + SmallVector attributes; + for (const auto &attr : gpuFuncOp.getAttrs()) { + if (attr.first == SymbolTable::getSymbolAttrName() || + attr.first == impl::getTypeAttrName() || + attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName()) + continue; + attributes.push_back(attr); + } + auto llvmFuncOp = rewriter.create( + gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType, + LLVM::Linkage::External, attributes); + + { + // Insert operations that correspond to converted workgroup and private + // memory attributions to the body of the function. This must operate on + // the original function, before the body region is inlined in the new + // function to maintain the relation between block arguments and the + // parent operation that assigns their semantics. + OpBuilder::InsertionGuard guard(rewriter); + + // Rewrite workgroup memory attributions to addresses of global buffers. 
+ rewriter.setInsertionPointToStart(&gpuFuncOp.front()); + unsigned numProperArguments = gpuFuncOp.getNumArguments(); + auto i32Type = LLVM::LLVMType::getInt32Ty(typeConverter.getDialect()); + + Value zero = nullptr; + if (!workgroupBuffers.empty()) + zero = rewriter.create(loc, i32Type, + rewriter.getI32IntegerAttr(0)); + for (auto en : llvm::enumerate(workgroupBuffers)) { + LLVM::GlobalOp global = en.value(); + Value address = rewriter.create(loc, global); + auto elementType = global.getType().getArrayElementType(); + Value memory = rewriter.create( + loc, elementType.getPointerTo(global.addr_space().getZExtValue()), + address, ArrayRef{zero, zero}); + + // Build a memref descriptor pointing to the buffer to plug with the + // existing memref infrastructure. This may use more registers than + // otherwise necessary given that memref sizes are fixed, but we can try + // and canonicalize that away later. + Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()]; + auto type = attribution.getType().cast(); + auto descr = MemRefDescriptor::fromStaticShape( + rewriter, loc, typeConverter, type, memory); + signatureConversion.remapInput(numProperArguments + en.index(), descr); + } + + // Rewrite private memory attributions to alloca'ed buffers. + unsigned numWorkgroupAttributions = + gpuFuncOp.getNumWorkgroupAttributions(); + auto int64Ty = LLVM::LLVMType::getInt64Ty(typeConverter.getDialect()); + for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) { + Value attribution = en.value(); + auto type = attribution.getType().cast(); + assert(type && type.hasStaticShape() && + "unexpected type in attribution"); + + // Allocate the buffer in the address space given by the `AllocaAddrSpace` + // template parameter. NVVM instantiates it with 0 since it does not + // support `alloca`s with addrspace(5); ROCDL instantiates it with 5. 
+ auto ptrType = typeConverter.convertType(type.getElementType()) + .cast() + .getPointerTo(AllocaAddrSpace); + Value numElements = rewriter.create( + gpuFuncOp.getLoc(), int64Ty, + rewriter.getI64IntegerAttr(type.getNumElements())); + Value allocated = rewriter.create( + gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0); + auto descr = MemRefDescriptor::fromStaticShape( + rewriter, loc, typeConverter, type, allocated); + signatureConversion.remapInput( + numProperArguments + numWorkgroupAttributions + en.index(), descr); + } + } + + // Move the region to the new function, update the entry block signature. + rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(), + llvmFuncOp.end()); + rewriter.applySignatureConversion(&llvmFuncOp.getBody(), + signatureConversion); + + rewriter.eraseOp(gpuFuncOp); + return success(); + } +}; + +struct GPUReturnOpLowering : public ConvertToLLVMPattern { + GPUReturnOpLowering(LLVMTypeConverter &typeConverter) + : ConvertToLLVMPattern(gpu::ReturnOp::getOperationName(), + typeConverter.getDialect()->getContext(), + typeConverter) {} + + LogicalResult + matchAndRewrite(Operation *op, ArrayRef operands, + ConversionPatternRewriter &rewriter) const override { + rewriter.replaceOpWithNewOp(op, operands); + return success(); + } +}; + +} // namespace mlir + +#endif // MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_ diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp --- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -21,6 +21,7 @@ #include "mlir/Transforms/DialectConversion.h" #include "llvm/Support/FormatVariadic.h" +#include "../GPUCommon/GPUOpsLowering.h" #include "../GPUCommon/IndexIntrinsicsOpLowering.h" #include "../GPUCommon/OpToFuncCallLowering.h" #include "../PassDetail.h" @@ -88,155 +89,6 @@ } }; -struct GPUFuncOpLowering : ConvertToLLVMPattern { - explicit 
GPUFuncOpLowering(LLVMTypeConverter &typeConverter) - : ConvertToLLVMPattern(gpu::GPUFuncOp::getOperationName(), - typeConverter.getDialect()->getContext(), - typeConverter) {} - - LogicalResult - matchAndRewrite(Operation *op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - assert(operands.empty() && "func op is not expected to have operands"); - auto gpuFuncOp = cast(op); - Location loc = gpuFuncOp.getLoc(); - - SmallVector workgroupBuffers; - workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions()); - for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) { - Value attribution = en.value(); - - auto type = attribution.getType().dyn_cast(); - assert(type && type.hasStaticShape() && "unexpected type in attribution"); - - uint64_t numElements = type.getNumElements(); - - auto elementType = typeConverter.convertType(type.getElementType()) - .cast(); - auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements); - std::string name = std::string( - llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index())); - auto globalOp = rewriter.create( - gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false, - LLVM::Linkage::Internal, name, /*value=*/Attribute(), - gpu::GPUDialect::getWorkgroupAddressSpace()); - workgroupBuffers.push_back(globalOp); - } - - // Rewrite the original GPU function to an LLVM function. - auto funcType = typeConverter.convertType(gpuFuncOp.getType()) - .cast() - .getPointerElementTy(); - - // Remap proper input types. - TypeConverter::SignatureConversion signatureConversion( - gpuFuncOp.front().getNumArguments()); - typeConverter.convertFunctionSignature( - gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion); - - // Create the new function operation. Only copy those attributes that are - // not specific to function modeling. 
- SmallVector attributes; - for (const auto &attr : gpuFuncOp.getAttrs()) { - if (attr.first == SymbolTable::getSymbolAttrName() || - attr.first == impl::getTypeAttrName() || - attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName()) - continue; - attributes.push_back(attr); - } - auto llvmFuncOp = rewriter.create( - gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType, - LLVM::Linkage::External, attributes); - - { - // Insert operations that correspond to converted workgroup and private - // memory attributions to the body of the function. This must operate on - // the original function, before the body region is inlined in the new - // function to maintain the relation between block arguments and the - // parent operation that assigns their semantics. - OpBuilder::InsertionGuard guard(rewriter); - - // Rewrite workgroup memory attributions to addresses of global buffers. - rewriter.setInsertionPointToStart(&gpuFuncOp.front()); - unsigned numProperArguments = gpuFuncOp.getNumArguments(); - auto i32Type = LLVM::LLVMType::getInt32Ty(typeConverter.getDialect()); - - Value zero = nullptr; - if (!workgroupBuffers.empty()) - zero = rewriter.create(loc, i32Type, - rewriter.getI32IntegerAttr(0)); - for (auto en : llvm::enumerate(workgroupBuffers)) { - LLVM::GlobalOp global = en.value(); - Value address = rewriter.create(loc, global); - auto elementType = global.getType().getArrayElementType(); - Value memory = rewriter.create( - loc, elementType.getPointerTo(global.addr_space().getZExtValue()), - address, ArrayRef{zero, zero}); - - // Build a memref descriptor pointing to the buffer to plug with the - // existing memref infrastructure. This may use more registers than - // otherwise necessary given that memref sizes are fixed, but we can try - // and canonicalize that away later. 
- Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()]; - auto type = attribution.getType().cast(); - auto descr = MemRefDescriptor::fromStaticShape( - rewriter, loc, typeConverter, type, memory); - signatureConversion.remapInput(numProperArguments + en.index(), descr); - } - - // Rewrite private memory attributions to alloca'ed buffers. - unsigned numWorkgroupAttributions = - gpuFuncOp.getNumWorkgroupAttributions(); - auto int64Ty = LLVM::LLVMType::getInt64Ty(typeConverter.getDialect()); - for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) { - Value attribution = en.value(); - auto type = attribution.getType().cast(); - assert(type && type.hasStaticShape() && - "unexpected type in attribution"); - - // Explicitly drop memory space when lowering private memory - // attributions since NVVM models it as `alloca`s in the default - // memory space and does not support `alloca`s with addrspace(5). - auto ptrType = typeConverter.convertType(type.getElementType()) - .cast() - .getPointerTo(); - Value numElements = rewriter.create( - gpuFuncOp.getLoc(), int64Ty, - rewriter.getI64IntegerAttr(type.getNumElements())); - Value allocated = rewriter.create( - gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0); - auto descr = MemRefDescriptor::fromStaticShape( - rewriter, loc, typeConverter, type, allocated); - signatureConversion.remapInput( - numProperArguments + numWorkgroupAttributions + en.index(), descr); - } - } - - // Move the region to the new function, update the entry block signature. 
- rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(), - llvmFuncOp.end()); - rewriter.applySignatureConversion(&llvmFuncOp.getBody(), - signatureConversion); - - rewriter.eraseOp(gpuFuncOp); - return success(); - } -}; - -struct GPUReturnOpLowering : public ConvertToLLVMPattern { - GPUReturnOpLowering(LLVMTypeConverter &typeConverter) - : ConvertToLLVMPattern(gpu::ReturnOp::getOperationName(), - typeConverter.getDialect()->getContext(), - typeConverter) {} - - LogicalResult - matchAndRewrite(Operation *op, ArrayRef operands, - ConversionPatternRewriter &rewriter) const override { - rewriter.replaceOpWithNewOp(op, operands); - return success(); - } -}; - /// Import the GPU Ops to NVVM Patterns. #include "GPUToNVVM.cpp.inc" @@ -300,8 +152,11 @@ NVVM::BlockIdYOp, NVVM::BlockIdZOp>, GPUIndexIntrinsicOpLowering, - GPUShuffleOpLowering, GPUFuncOpLowering, GPUReturnOpLowering>( - converter); + GPUShuffleOpLowering, GPUReturnOpLowering, + // Explicitly drop memory space when lowering private memory + // attributions since NVVM models it as `alloca`s in the default + // memory space and does not support `alloca`s with addrspace(5). 
+ GPUFuncOpLowering<0>>(converter); patterns.insert>(converter, "__nv_fabsf", "__nv_fabs"); patterns.insert>(converter, "__nv_ceilf", diff --git a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt --- a/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt +++ b/mlir/lib/Conversion/GPUToROCDL/CMakeLists.txt @@ -1,9 +1,15 @@ +set(LLVM_TARGET_DEFINITIONS GPUToROCDL.td) +mlir_tablegen(GPUToROCDL.cpp.inc -gen-rewriters) +add_public_tablegen_target(MLIRGPUToROCDLIncGen) + add_mlir_conversion_library(MLIRGPUtoROCDLTransforms LowerGpuOpsToROCDLOps.cpp DEPENDS MLIRConversionPassIncGen + MLIRGPUToROCDLIncGen ) + target_link_libraries(MLIRGPUtoROCDLTransforms PUBLIC LLVMSupport diff --git a/mlir/lib/Conversion/GPUToROCDL/GPUToROCDL.td b/mlir/lib/Conversion/GPUToROCDL/GPUToROCDL.td new file mode 100644 --- /dev/null +++ b/mlir/lib/Conversion/GPUToROCDL/GPUToROCDL.td @@ -0,0 +1,21 @@ +//==-- GPUToROCDL.td - GPU Ops to ROCDL Patterns -------------*- tablegen -*==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Defines Patterns to lower GPU ops to ROCDL. 
+// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CONVERSION_GPUTOROCDL_TD +#define MLIR_CONVERSION_GPUTOROCDL_TD + +include "mlir/Dialect/GPU/GPUOps.td" +include "mlir/Dialect/LLVMIR/ROCDLOps.td" + +def : Pat<(GPU_BarrierOp), (ROCDL_BarrierOp)>; + +#endif // MLIR_CONVERSION_GPUTOROCDL_TD diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -14,11 +14,16 @@ #include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" +#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" #include "mlir/Dialect/GPU/GPUDialect.h" +#include "mlir/Dialect/GPU/Passes.h" #include "mlir/Dialect/LLVMIR/ROCDLDialect.h" +#include "mlir/Dialect/Vector/VectorOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/DialectConversion.h" +#include "llvm/Support/FormatVariadic.h" +#include "../GPUCommon/GPUOpsLowering.h" #include "../GPUCommon/IndexIntrinsicsOpLowering.h" #include "../GPUCommon/OpToFuncCallLowering.h" #include "../PassDetail.h" @@ -27,6 +32,9 @@ namespace { +/// Import the GPU Ops to ROCDL Patterns. +#include "GPUToROCDL.cpp.inc" + // A pass that replaces all occurrences of GPU device operations with their // corresponding ROCDL equivalent. 
// @@ -38,41 +46,25 @@ void runOnOperation() override { gpu::GPUModuleOp m = getOperation(); - OwningRewritePatternList patterns; LLVMTypeConverter converter(m.getContext()); - populateStdToLLVMConversionPatterns(converter, patterns); - patterns.insert< - GPUIndexIntrinsicOpLowering, - GPUIndexIntrinsicOpLowering, - GPUIndexIntrinsicOpLowering, - GPUIndexIntrinsicOpLowering>( - converter); - patterns.insert>(converter, "__ocml_fabs_f32", - "__ocml_fabs_f64"); - patterns.insert>(converter, "__ocml_ceil_f32", - "__ocml_ceil_f64"); - patterns.insert>(converter, "__ocml_cos_f32", - "__ocml_cos_f64"); - patterns.insert>(converter, "__ocml_exp_f32", - "__ocml_exp_f64"); - patterns.insert>(converter, "__ocml_log_f32", - "__ocml_log_f64"); - patterns.insert>( - converter, "__ocml_log10_f32", "__ocml_log10_f64"); - patterns.insert>(converter, "__ocml_log2_f32", - "__ocml_log2_f64"); - patterns.insert>(converter, "__ocml_tanh_f32", - "__ocml_tanh_f64"); - ConversionTarget target(getContext()); - target.addLegalDialect(); + OwningRewritePatternList patterns; + + populateGpuRewritePatterns(m.getContext(), patterns); + applyPatternsAndFoldGreedily(m, patterns); + patterns.clear(); + + populateVectorToLLVMConversionPatterns(converter, patterns); + populateStdToLLVMConversionPatterns(converter, patterns); + populateGpuToROCDLConversionPatterns(converter, patterns); + LLVMConversionTarget target(getContext()); + target.addIllegalDialect(); target.addIllegalOp(); target.addIllegalOp(); + target.addLegalDialect(); + // TODO(whchung@gmail.com): Remove once we support replacing non-root ops. 
+ target.addLegalOp(); if (failed(applyPartialConversion(m, target, patterns, &converter))) signalPassFailure(); } @@ -80,6 +72,37 @@ } // anonymous namespace +void mlir::populateGpuToROCDLConversionPatterns( + LLVMTypeConverter &converter, OwningRewritePatternList &patterns) { + populateWithGenerated(converter.getDialect()->getContext(), &patterns); + patterns.insert< + GPUIndexIntrinsicOpLowering, + GPUIndexIntrinsicOpLowering, + GPUIndexIntrinsicOpLowering, + GPUIndexIntrinsicOpLowering, + GPUFuncOpLowering<5>, GPUReturnOpLowering>(converter); + patterns.insert>(converter, "__ocml_fabs_f32", + "__ocml_fabs_f64"); + patterns.insert>(converter, "__ocml_ceil_f32", + "__ocml_ceil_f64"); + patterns.insert>(converter, "__ocml_cos_f32", + "__ocml_cos_f64"); + patterns.insert>(converter, "__ocml_exp_f32", + "__ocml_exp_f64"); + patterns.insert>(converter, "__ocml_log_f32", + "__ocml_log_f64"); + patterns.insert>(converter, "__ocml_log10_f32", + "__ocml_log10_f64"); + patterns.insert>(converter, "__ocml_log2_f32", + "__ocml_log2_f64"); + patterns.insert>(converter, "__ocml_tanh_f32", + "__ocml_tanh_f64"); +} + std::unique_ptr> mlir::createLowerGpuOpsToROCDLOpsPass() { return std::make_unique(); diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -1,9 +1,10 @@ -// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s +// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s --dump-input-on-failure -gpu.module @kernel_module { +gpu.module @test_module { // CHECK-LABEL: func @gpu_index_ops() func @gpu_index_ops() - attributes { gpu.kernel } { + -> (index, index, index, index, index, index, + index, index, index, index, index, index) { // CHECK: rocdl.workitem.id.x : !llvm.i32 %tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index) // CHECK: rocdl.workitem.id.y 
: !llvm.i32 @@ -32,68 +33,82 @@ // CHECK: rocdl.grid.dim.z : !llvm.i32 %gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index) + std.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ, + %bIdX, %bIdY, %bIdZ, %gDimX, %gDimY, %gDimZ + : index, index, index, index, index, index, + index, index, index, index, index, index + } +} + +// ----- + +gpu.module @test_module { + // CHECK-LABEL: func @gpu_sync() + func @gpu_sync() { + // CHECK: rocdl.barrier + gpu.barrier std.return } } // ----- -gpu.module @kernel_module { +gpu.module @test_module { // CHECK: llvm.func @__ocml_fabs_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @__ocml_fabs_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_fabs - func @gpu_fabs(%arg_f32 : f32, %arg_f64 : f64) { + func @gpu_fabs(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { %result32 = std.absf %arg_f32 : f32 // CHECK: llvm.call @__ocml_fabs_f32(%{{.*}}) : (!llvm.float) -> !llvm.float %result64 = std.absf %arg_f64 : f64 // CHECK: llvm.call @__ocml_fabs_f64(%{{.*}}) : (!llvm.double) -> !llvm.double - std.return + std.return %result32, %result64 : f32, f64 } } // ----- -gpu.module @kernel_module { +gpu.module @test_module { // CHECK: llvm.func @__ocml_ceil_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @__ocml_ceil_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_ceil - func @gpu_ceil(%arg_f32 : f32, %arg_f64 : f64) { + func @gpu_ceil(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { %result32 = std.ceilf %arg_f32 : f32 // CHECK: llvm.call @__ocml_ceil_f32(%{{.*}}) : (!llvm.float) -> !llvm.float %result64 = std.ceilf %arg_f64 : f64 // CHECK: llvm.call @__ocml_ceil_f64(%{{.*}}) : (!llvm.double) -> !llvm.double - std.return + std.return %result32, %result64 : f32, f64 } } // ----- -gpu.module @kernel_module { +gpu.module @test_module { // CHECK: llvm.func @__ocml_cos_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @__ocml_cos_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_cos - func 
@gpu_cos(%arg_f32 : f32, %arg_f64 : f64) { + func @gpu_cos(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { %result32 = std.cos %arg_f32 : f32 // CHECK: llvm.call @__ocml_cos_f32(%{{.*}}) : (!llvm.float) -> !llvm.float %result64 = std.cos %arg_f64 : f64 // CHECK: llvm.call @__ocml_cos_f64(%{{.*}}) : (!llvm.double) -> !llvm.double - std.return + std.return %result32, %result64 : f32, f64 } } // ----- -gpu.module @kernel_module { +gpu.module @test_module { // CHECK: llvm.func @__ocml_exp_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @__ocml_exp_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_exp - func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) { + func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { %exp_f32 = std.exp %arg_f32 : f32 // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float - %result_f32 = std.exp %exp_f32 : f32 + %result32 = std.exp %exp_f32 : f32 // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float %result64 = std.exp %arg_f64 : f64 // CHECK: llvm.call @__ocml_exp_f64(%{{.*}}) : (!llvm.double) -> !llvm.double - std.return + std.return %result32, %result64 : f32, f64 } } @@ -101,20 +116,20 @@ // ----- // Test that we handled properly operation with SymbolTable other than module op -gpu.module @kernel_module { +gpu.module @test_module { "test.symbol_scope"() ({ // CHECK: test.symbol_scope // CHECK: llvm.func @__ocml_exp_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @__ocml_exp_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_exp - func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) { + func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { %exp_f32 = std.exp %arg_f32 : f32 // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float - %result_f32 = std.exp %exp_f32 : f32 + %result32 = std.exp %exp_f32 : f32 // CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float %result64 = std.exp %arg_f64 : f64 // CHECK: llvm.call 
@__ocml_exp_f64(%{{.*}}) : (!llvm.double) -> !llvm.double - std.return + std.return %result32, %result64 : f32, f64 } "test.finish" () : () -> () }) : () -> () @@ -122,60 +137,60 @@ // ----- -gpu.module @kernel_module { +gpu.module @test_module { // CHECK: llvm.func @__ocml_log_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @__ocml_log_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_log - func @gpu_log(%arg_f32 : f32, %arg_f64 : f64) { + func @gpu_log(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { %result32 = std.log %arg_f32 : f32 // CHECK: llvm.call @__ocml_log_f32(%{{.*}}) : (!llvm.float) -> !llvm.float %result64 = std.log %arg_f64 : f64 // CHECK: llvm.call @__ocml_log_f64(%{{.*}}) : (!llvm.double) -> !llvm.double - std.return + std.return %result32, %result64 : f32, f64 } } // ----- -gpu.module @kernel_module { +gpu.module @test_module { // CHECK: llvm.func @__ocml_log10_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @__ocml_log10_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_log10 - func @gpu_log10(%arg_f32 : f32, %arg_f64 : f64) { + func @gpu_log10(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { %result32 = std.log10 %arg_f32 : f32 // CHECK: llvm.call @__ocml_log10_f32(%{{.*}}) : (!llvm.float) -> !llvm.float %result64 = std.log10 %arg_f64 : f64 // CHECK: llvm.call @__ocml_log10_f64(%{{.*}}) : (!llvm.double) -> !llvm.double - std.return + std.return %result32, %result64 : f32, f64 } } // ----- -gpu.module @kernel_module { +gpu.module @test_module { // CHECK: llvm.func @__ocml_log2_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @__ocml_log2_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_log2 - func @gpu_log2(%arg_f32 : f32, %arg_f64 : f64) { + func @gpu_log2(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { %result32 = std.log2 %arg_f32 : f32 // CHECK: llvm.call @__ocml_log2_f32(%{{.*}}) : (!llvm.float) -> !llvm.float %result64 = std.log2 %arg_f64 : f64 // CHECK: llvm.call @__ocml_log2_f64(%{{.*}}) : 
(!llvm.double) -> !llvm.double - std.return + std.return %result32, %result64 : f32, f64 } } // ----- -gpu.module @kernel_module { +gpu.module @test_module { // CHECK: llvm.func @__ocml_tanh_f32(!llvm.float) -> !llvm.float // CHECK: llvm.func @__ocml_tanh_f64(!llvm.double) -> !llvm.double // CHECK-LABEL: func @gpu_tanh - func @gpu_tanh(%arg_f32 : f32, %arg_f64 : f64) { + func @gpu_tanh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) { %result32 = std.tanh %arg_f32 : f32 // CHECK: llvm.call @__ocml_tanh_f32(%{{.*}}) : (!llvm.float) -> !llvm.float %result64 = std.tanh %arg_f64 : f64 // CHECK: llvm.call @__ocml_tanh_f64(%{{.*}}) : (!llvm.double) -> !llvm.double - std.return + std.return %result32, %result64 : f32, f64 } } diff --git a/mlir/test/Conversion/GPUToROCDL/memory-attrbution.mlir b/mlir/test/Conversion/GPUToROCDL/memory-attrbution.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Conversion/GPUToROCDL/memory-attrbution.mlir @@ -0,0 +1,145 @@ +// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl --split-input-file %s | FileCheck %s + +gpu.module @kernel { + // CHECK-LABEL: llvm.func @private + gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, 5>) { + // Allocate private memory inside the function. + // CHECK: %[[size:.*]] = llvm.mlir.constant(4 : i64) : !llvm.i64 + // CHECK: %[[raw:.*]] = llvm.alloca %[[size]] x !llvm.float : (!llvm.i64) -> !llvm<"float addrspace(5)*"> + + // Populate the memref descriptor. 
+ // CHECK: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(5)*, float addrspace(5)*, i64, [1 x i64], [1 x i64] }"> + // CHECK: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0] + // CHECK: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1] + // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64 + // CHECK: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2] + // CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64 + // CHECK: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0] + // CHECK: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64 + // CHECK: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0] + + // "Store" lowering should work just as for any other memref; only check + // that the core instructions are emitted on the descriptor built above. + // CHECK: llvm.extractvalue %[[descr6]] + // CHECK: llvm.getelementptr + // CHECK: llvm.store + %c0 = constant 0 : index + store %arg0, %arg1[%c0] : memref<4xf32, 5> + + "terminator"() : () -> () + } +} + +// ----- + +gpu.module @kernel { + // Workgroup buffers are allocated as globals. + // CHECK: llvm.mlir.global internal @[[buffer:.*]]() + // CHECK-SAME: addr_space = 3 + // CHECK-SAME: !llvm<"[4 x float]"> + + // CHECK-LABEL: llvm.func @workgroup + // CHECK-SAME: { + gpu.func @workgroup(%arg0: f32) workgroup(%arg1: memref<4xf32, 3>) { + // Get the address of the first element in the global array. + // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32 + // CHECK: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[4 x float] addrspace(3)*"> + // CHECK: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]] + // CHECK-SAME: !llvm<"float addrspace(3)*"> + + // Populate the memref descriptor. 
+ // CHECK: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [1 x i64], [1 x i64] }"> + // CHECK: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0] + // CHECK: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1] + // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64 + // CHECK: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2] + // CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64 + // CHECK: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0] + // CHECK: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64 + // CHECK: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0] + + // "Store" lowering should work just as for any other memref; only check + // that the core instructions are emitted on the descriptor built above. + // CHECK: llvm.extractvalue %[[descr6]] + // CHECK: llvm.getelementptr + // CHECK: llvm.store + %c0 = constant 0 : index + store %arg0, %arg1[%c0] : memref<4xf32, 3> + + "terminator"() : () -> () + } +} + +// ----- + +gpu.module @kernel { + // Check that the total size was computed correctly. + // CHECK: llvm.mlir.global internal @[[buffer:.*]]() + // CHECK-SAME: addr_space = 3 + // CHECK-SAME: !llvm<"[48 x float]"> + + // CHECK-LABEL: llvm.func @workgroup3d + gpu.func @workgroup3d(%arg0: f32) workgroup(%arg1: memref<4x2x6xf32, 3>) { + // Get the address of the first element in the global array. + // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32 + // CHECK: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[48 x float] addrspace(3)*"> + // CHECK: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]] + // CHECK-SAME: !llvm<"float addrspace(3)*"> + + // Populate the memref descriptor. 
+ // CHECK: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [3 x i64], [3 x i64] }"> + // CHECK: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0] + // CHECK: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1] + // CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64 + // CHECK: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2] + // CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64 + // CHECK: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0] + // CHECK: %[[c12:.*]] = llvm.mlir.constant(12 : index) : !llvm.i64 + // CHECK: %[[descr6:.*]] = llvm.insertvalue %[[c12]], %[[descr5]][4, 0] + // CHECK: %[[c2:.*]] = llvm.mlir.constant(2 : index) : !llvm.i64 + // CHECK: %[[descr7:.*]] = llvm.insertvalue %[[c2]], %[[descr6]][3, 1] + // CHECK: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64 + // CHECK: %[[descr8:.*]] = llvm.insertvalue %[[c6]], %[[descr7]][4, 1] + // CHECK: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64 + // CHECK: %[[descr9:.*]] = llvm.insertvalue %[[c6]], %[[descr8]][3, 2] + // CHECK: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64 + // CHECK: %[[descr10:.*]] = llvm.insertvalue %[[c1]], %[[descr9]][4, 2] + + %c0 = constant 0 : index + store %arg0, %arg1[%c0,%c0,%c0] : memref<4x2x6xf32, 3> + "terminator"() : () -> () + } +} + +// ----- + +gpu.module @kernel { + // Check that several buffers are defined. + // CHECK: llvm.mlir.global internal @[[buffer1:.*]]() + // CHECK-SAME: !llvm<"[1 x float]"> + // CHECK: llvm.mlir.global internal @[[buffer2:.*]]() + // CHECK-SAME: !llvm<"[2 x float]"> + + // CHECK-LABEL: llvm.func @multiple + gpu.func @multiple(%arg0: f32) + workgroup(%arg1: memref<1xf32, 3>, %arg2: memref<2xf32, 3>) + private(%arg3: memref<3xf32, 5>, %arg4: memref<4xf32, 5>) { + + // Workgroup buffers. + // CHECK: llvm.mlir.addressof @[[buffer1]] + // CHECK: llvm.mlir.addressof @[[buffer2]] + + // Private buffers. 
+ // CHECK: %[[c3:.*]] = llvm.mlir.constant(3 : i64) + // CHECK: llvm.alloca %[[c3]] x !llvm.float + // CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : i64) + // CHECK: llvm.alloca %[[c4]] x !llvm.float + + %c0 = constant 0 : index + store %arg0, %arg1[%c0] : memref<1xf32, 3> + store %arg0, %arg2[%c0] : memref<2xf32, 3> + store %arg0, %arg3[%c0] : memref<3xf32, 5> + store %arg0, %arg4[%c0] : memref<4xf32, 5> + "terminator"() : () -> () + } +} diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -28,3 +28,9 @@ %11 = rocdl.grid.dim.z : !llvm.i32 llvm.return %0 : !llvm.i32 } + +func @rocdl.barrier() { + // CHECK: rocdl.barrier + rocdl.barrier + llvm.return +} diff --git a/mlir/test/Target/rocdl.mlir b/mlir/test/Target/rocdl.mlir --- a/mlir/test/Target/rocdl.mlir +++ b/mlir/test/Target/rocdl.mlir @@ -29,6 +29,14 @@ llvm.return %1 : !llvm.i32 } +llvm.func @rocdl.barrier() { + // CHECK: fence syncscope("workgroup") release + // CHECK-NEXT: call void @llvm.amdgcn.s.barrier() + // CHECK-NEXT: fence syncscope("workgroup") acquire + rocdl.barrier + llvm.return +} + llvm.func @kernel_func() attributes {gpu.kernel} { // CHECK-LABEL: amdgpu_kernel void @kernel_func llvm.return