diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp --- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp @@ -18,6 +18,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/Support/raw_ostream.h" using namespace mlir; using namespace mlir::LLVM; @@ -71,15 +72,34 @@ // For GPU kernels, // 1. Insert AMDGPU_KERNEL calling convention. - // 2. Insert amdgpu-flat-workgroup-size(1, 256) attribute. + // 2. Insert amdgpu-flat-work-group-size(1, 256) attribute unless the user + // has overridden this value - 256 is the default in clang // 3. Insert amdgpu-implicitarg-num-bytes=56 (which must be set on OpenCL // and HIP kernels per Clang) llvm::Function *llvmFunc = moduleTranslation.lookupFunction(func.getName()); llvmFunc->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); - llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1, 256"); + if (!llvmFunc->hasFnAttribute("amdgpu-flat-work-group-size")) { + llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1, 256"); + } llvmFunc->addFnAttr("amdgpu-implicitarg-num-bytes", "56"); } + // Override flat-work-group-size + if ("rocdl.max_flat_work_group_size" == attribute.getName()) { + auto func = dyn_cast<LLVM::LLVMFuncOp>(op); + if (!func) + return failure(); + auto value = attribute.getValue().dyn_cast<IntegerAttr>(); + if (!value) + return failure(); + + llvm::Function *llvmFunc = + moduleTranslation.lookupFunction(func.getName()); + llvm::SmallString<8> llvmAttrValue; + llvm::raw_svector_ostream attrValueStream(llvmAttrValue); + attrValueStream << "1, " << value.getInt(); + llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue); + } return success(); } }; diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -30,7 
+30,15 @@ } llvm.func @kernel_func() attributes {rocdl.kernel} { - // CHECK-LABEL: amdgpu_kernel void @kernel_func + // CHECK-LABEL: amdgpu_kernel void @kernel_func() + // CHECK: #[[$KERNEL_ATTRS:[0-9]+]] + llvm.return +} + +llvm.func @kernel_func_workgroups() + attributes {rocdl.kernel, rocdl.max_flat_work_group_size = 1024 : index} { + // CHECK-LABEL: amdgpu_kernel void @kernel_func_workgroups() + // CHECK: #[[$KERNEL_WORKGROUP_ATTRS:[0-9]+]] llvm.return } @@ -177,3 +185,5 @@ llvm.return } +// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1, 256" "amdgpu-implicitarg-num-bytes"="56" } +// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1, 1024"