diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -30,6 +30,12 @@
     /// Get the name of the attribute used to annotate external kernel
     /// functions.
     static StringRef getKernelFuncAttrName() { return "rocdl.kernel"; }
+    static constexpr ::llvm::StringLiteral getFlatWorkGroupSizeAttrName() {
+      return ::llvm::StringLiteral("rocdl.flat_work_group_size");
+    }
+    static constexpr ::llvm::StringLiteral getReqdWorkGroupSizeAttrName() {
+      return ::llvm::StringLiteral("rocdl.reqd_work_group_size");
+    }
   }];
 }
 
@@ -49,8 +55,9 @@
     list traits = []> :
   ROCDL_Op,
   Results<(outs LLVM_Type:$res)>, Arguments<(ins)> {
-  string llvmBuilder = "$res = createIntrinsicCall(builder,"
-    # "llvm::Intrinsic::amdgcn_" # !subst(".","_", mnemonic) # ");";
+  string llvmBuilder = "$res = createIntrinsicCallWithRange(builder,"
+    # "llvm::Intrinsic::amdgcn_" # !subst(".","_", mnemonic)
+    # ", op->getAttrOfType<::mlir::DenseI32ArrayAttr>(\"range\"));";
   let assemblyFormat = "attr-dict `:` type($res)";
 }
 
diff --git a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
--- a/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
+++ b/mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h
@@ -11,6 +11,7 @@
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/IR/BuiltinAttributes.h"
 
 namespace mlir {
 
@@ -23,11 +24,19 @@
 struct GPUIndexIntrinsicOpLowering : public ConvertOpToLLVMPattern {
 private:
   unsigned indexBitwidth;
+  StringRef boundsAttrName;
 
 public:
   explicit GPUIndexIntrinsicOpLowering(LLVMTypeConverter &typeConverter)
       : ConvertOpToLLVMPattern(typeConverter),
-        indexBitwidth(typeConverter.getIndexTypeBitwidth()) {}
+        indexBitwidth(typeConverter.getIndexTypeBitwidth()),
+        boundsAttrName("") {}
+
+  explicit GPUIndexIntrinsicOpLowering(LLVMTypeConverter &typeConverter,
+                                       StringRef boundsAttrName)
+      : ConvertOpToLLVMPattern(typeConverter),
+        indexBitwidth(typeConverter.getIndexTypeBitwidth()),
+        boundsAttrName(boundsAttrName) {}
 
   // Convert the kernel arguments to an LLVM type, preserve the rest.
   LogicalResult
@@ -35,7 +44,7 @@
                   ConversionPatternRewriter &rewriter) const override {
     auto loc = op->getLoc();
     MLIRContext *context = rewriter.getContext();
-    Value newOp;
+    Operation *newOp;
     switch (op.getDimension()) {
     case gpu::Dimension::x:
       newOp = rewriter.create(loc, IntegerType::get(context, 32));
@@ -48,15 +57,31 @@
       break;
     }
 
+    // Find the enclosing function, if any, so its bounds attribute can be
+    // consulted. Start from null: when neither lookup below succeeds the
+    // pointer must not be read uninitialized.
+    Operation *function = nullptr;
+    if (auto gpuFunc = op->template getParentOfType())
+      function = gpuFunc;
+    if (auto llvmFunc = op->template getParentOfType())
+      function = llvmFunc;
+    if (!boundsAttrName.empty() && function) {
+      if (auto attr = function->template getAttrOfType(
+              boundsAttrName)) {
+        int32_t maximum = attr[static_cast(op.getDimension())];
+        newOp->setAttr("range", rewriter.getDenseI32ArrayAttr({0, maximum}));
+      }
+    }
+
     if (indexBitwidth > 32) {
       newOp = rewriter.create(
-          loc, IntegerType::get(context, indexBitwidth), newOp);
+          loc, IntegerType::get(context, indexBitwidth), newOp->getResult(0));
     } else if (indexBitwidth < 32) {
       newOp = rewriter.create(
-          loc, IntegerType::get(context, indexBitwidth), newOp);
+          loc, IntegerType::get(context, indexBitwidth), newOp->getResult(0));
     }
 
-    rewriter.replaceOp(op, {newOp});
+    rewriter.replaceOp(op, newOp->getResults());
     return success();
   }
 };
diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
--- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
+++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp
@@ -26,9 +26,11 @@
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/GPU/Transforms/Passes.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -139,6 +141,27 @@
     configureGpuToROCDLConversionLegality(target);
     if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
       signalPassFailure();
+
+    // Manually rewrite known block size attributes so the LLVMIR translation
+    // infrastructure can pick them up.
+    m.walk([ctx](LLVM::LLVMFuncOp op) {
+      if (auto blockSizes =
+              op->removeAttr(gpu::GPUFuncOp::getKnownBlockSizeAttrName())
+                  .dyn_cast_or_null()) {
+        op->setAttr(ROCDL::ROCDLDialect::getReqdWorkGroupSizeAttrName(),
+                    blockSizes);
+        // Also set up the rocdl.flat_work_group_size attribute to prevent
+        // conflicting metadata.
+        uint32_t flatSize = 1;
+        for (uint32_t size : blockSizes.asArrayRef()) {
+          flatSize *= size;
+        }
+        StringAttr flatSizeAttr =
+            StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
+        op->setAttr(ROCDL::ROCDLDialect::getFlatWorkGroupSizeAttrName(),
+                    flatSizeAttr);
+      }
+    });
   }
 };
 
@@ -173,11 +196,14 @@
   populateWithGenerated(patterns);
   patterns
       .add,
-           GPUIndexIntrinsicOpLowering>(
+          converter, gpu::GPUFuncOp::getKnownBlockSizeAttrName());
+  patterns.add>(
+      converter, gpu::GPUFuncOp::getKnownGridSizeAttrName());
+  patterns
+      .add,
-           GPUIndexIntrinsicOpLowering,
            GPUIndexIntrinsicOpLowering,
            GPUReturnOpLowering>(converter);
diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
--- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
@@ -13,17 +13,37 @@
 
 #include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
+#include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/Operation.h"
 #include "mlir/Target/LLVMIR/ModuleTranslation.h"
 
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace mlir;
 using namespace mlir::LLVM;
 using mlir::LLVM::detail::createIntrinsicCall;
 
+// Emits a call to `intrinsic` (empty argument lists) and, when `maybeRange` is
+// present, tags the call with `!range` metadata built from its first two elements.
+static llvm::Value *createIntrinsicCallWithRange(llvm::IRBuilderBase &builder,
+                                                 llvm::Intrinsic::ID intrinsic,
+                                                 DenseI32ArrayAttr maybeRange) {
+  auto *inst = llvm::cast(
+      createIntrinsicCall(builder, intrinsic, {}, {}));
+  if (maybeRange) {
+    SmallVector apInts;
+    for (int32_t i : maybeRange.asArrayRef())
+      apInts.push_back(llvm::APInt(32, i));
+    llvm::MDBuilder mdBuilder(builder.getContext());
+    llvm::MDNode *range = mdBuilder.createRange(apInts[0], apInts[1]);
+    inst->setMetadata(llvm::LLVMContext::MD_range, range);
+  }
+  return inst;
+}
+
 // Create a call to ROCm-Device-Library function
 // Currently this routine will work only for calling ROCDL functions that
 // take a single int32 argument. It is likely that the interface of this
@@ -80,11 +100,13 @@
           moduleTranslation.lookupFunction(func.getName());
       llvmFunc->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
       if (!llvmFunc->hasFnAttribute("amdgpu-flat-work-group-size")) {
-        llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1, 256");
+        llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1,256");
       }
       llvmFunc->addFnAttr("amdgpu-implicitarg-num-bytes", "56");
     }
     // Override flat-work-group-size
+    // TODO: update clients to rocdl.flat_work_group_size instead,
+    // then remove this half of the branch
     if ("rocdl.max_flat_work_group_size" == attribute.getName()) {
       auto func = dyn_cast(op);
       if (!func)
@@ -97,9 +119,46 @@
           moduleTranslation.lookupFunction(func.getName());
       llvm::SmallString<8> llvmAttrValue;
       llvm::raw_svector_ostream attrValueStream(llvmAttrValue);
-      attrValueStream << "1, " << value.getInt();
+      attrValueStream << "1," << value.getInt();
+      llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue);
+    }
+    if (ROCDL::ROCDLDialect::getFlatWorkGroupSizeAttrName() ==
+        attribute.getName()) {
+      auto func = dyn_cast(op);
+      if (!func)
+        return failure();
+      auto value = attribute.getValue().dyn_cast();
+      if (!value)
+        return failure();
+
+      llvm::Function *llvmFunc =
+          moduleTranslation.lookupFunction(func.getName());
+      llvm::SmallString<8> llvmAttrValue;
+      llvmAttrValue.append(value.getValue());
       llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue);
     }
+
+    // Set reqd_work_group_size metadata
+    if (ROCDL::ROCDLDialect::getReqdWorkGroupSizeAttrName() ==
+        attribute.getName()) {
+      auto func = dyn_cast(op);
+      if (!func)
+        return failure();
+      auto value = attribute.getValue().dyn_cast();
+      if (!value)
+        return failure();
+      llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext();
+      SmallVector metadata;
+      llvm::Type *i32 = llvm::IntegerType::get(llvmContext, 32);
+      for (int32_t i : value.asArrayRef()) {
+        llvm::Constant *constant = llvm::ConstantInt::get(i32, i);
+        metadata.push_back(llvm::ConstantAsMetadata::get(constant));
+      }
+      llvm::Function *llvmFunc =
+          moduleTranslation.lookupFunction(func.getName());
+      llvm::MDNode *node = llvm::MDNode::get(llvmContext, metadata);
+      llvmFunc->setMetadata("reqd_work_group_size", node);
+    }
     return success();
   }
 };
diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
--- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
+++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir
@@ -58,6 +58,36 @@
 
 // -----
 
+gpu.module @test_module {
+  // CHECK-LABEL: func @gpu_index_ops_range()
+  // CHECK-SAME: rocdl.flat_work_group_size = "1536,1536"
+  // CHECK-SAME: rocdl.reqd_work_group_size = array
+  func.func @gpu_index_ops_range()
+      -> (index, index, index, index, index, index) attributes
+      {gpu.known_block_size = array,
+      gpu.known_grid_size = array} {
+
+    // CHECK: rocdl.workitem.id.x {range = array} : i32
+    %tIdX = gpu.thread_id x
+    // CHECK: rocdl.workitem.id.y {range = array} : i32
+    %tIdY = gpu.thread_id y
+    // CHECK: rocdl.workitem.id.z {range = array} : i32
+    %tIdZ = gpu.thread_id z
+
+    // CHECK: rocdl.workgroup.id.x {range = array} : i32
+    %bIdX = gpu.block_id x
+    // CHECK: rocdl.workgroup.id.y {range = array} : i32
+    %bIdY = gpu.block_id y
+    // CHECK: rocdl.workgroup.id.z {range = array} : i32
+    %bIdZ = gpu.block_id z
+
+    func.return %tIdX, %tIdY, %tIdZ, %bIdX, %bIdY, %bIdZ
+      : index, index, index, index, index, index
+  }
+}
+
+// -----
+
 gpu.module @test_module {
   // CHECK-LABEL: func @gpu_index_comp
   // CHECK32-LABEL: func @gpu_index_comp
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -26,6 +26,10 @@
   %11 = rocdl.grid.dim.y : i64
   // CHECK: call i64 @__ockl_get_global_size(i32 2)
   %12 = rocdl.grid.dim.z : i64
+
+  // CHECK: call i32 @llvm.amdgcn.workitem.id.x(),{{.*}} !range ![[$RANGE:[0-9]+]]
+  %13 = rocdl.workitem.id.x {range = array} : i32
+
   llvm.return %1 : i32
 }
 
@@ -42,6 +46,16 @@
   llvm.return
 }
 
+llvm.func @known_block_sizes()
+    attributes {rocdl.kernel,
+      rocdl.flat_work_group_size = "128,128",
+      rocdl.reqd_work_group_size = array} {
+  // CHECK-LABEL: amdgpu_kernel void @known_block_sizes()
+  // CHECK: #[[$KNOWN_BLOCK_SIZE_ATTRS:[0-9]+]]
+  // CHECK: !reqd_work_group_size ![[$REQD_WORK_GROUP_SIZE:[0-9]+]]
+  llvm.return
+}
+
 llvm.func @rocdl.barrier() {
   // CHECK: fence syncscope("workgroup") release
   // CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
@@ -225,5 +239,8 @@
   llvm.return
 }
 
-// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1, 256" "amdgpu-implicitarg-num-bytes"="56" }
-// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1, 1024"
+// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-implicitarg-num-bytes"="56" }
+// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
+// CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"
+// CHECK-DAG: ![[$RANGE]] = !{i32 0, i32 64}
+// CHECK-DAG: ![[$REQD_WORK_GROUP_SIZE]] = !{i32 16, i32 4, i32 2}