This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
mlir/
-
lib/Target/LLVMIR/Dialect/ROCDL/
-
Target/
-
LLVMIR/
-
Dialect/
-
ROCDL/
1/2
ROCDLToLLVMIRTranslation.cpp
-
test/Target/LLVMIR/
-
Target/
-
LLVMIR/
-
rocdl.mlir

Differential D115741

[MLIR][GPU] Make max flat work group size for ROCDL kernels configurable
ClosedPublic

Authored by krzysz00 on Dec 14 2021, 9:38 AM.

Download Raw Diff

Details

Reviewers

whchung
ftynse

Commits

rGc57b2a0635df: [MLIR][GPU] Make max flat work group size for ROCDL kernels configurable

Summary

While the default value for the amdgpu-flat-work-group-size attribute,
"1, 256", matches the defaults from Clang, some users of the ROCDL dialect,
namely Tensorflow, use larger workgroups, such as 1024. Therefore,
instead of hardcoding this value, we add a rocdl.max_flat_work_group_size
attribute that can be set on GPU kernels to override the default value.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

krzysz00 created this revision. · Dec 14 2021, 9:38 AM

Herald added a reviewer: ftynse. · View Herald TranscriptDec 14 2021, 9:38 AM

Herald added subscribers: sdasgup3, wenzhicui, wrengr and 24 others. · View Herald Transcript

krzysz00 requested review of this revision. · Dec 14 2021, 9:38 AM

Herald added a project: Restricted Project. · View Herald TranscriptDec 14 2021, 9:38 AM

Herald added subscribers: stephenneuendorffer, nicolasvasilache. · View Herald Transcript

How about adding a unit test to check that the attribute can be overridden?

mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
21	I assume we can live without this header file?

This revision now requires changes to proceed. · Dec 14 2021, 9:53 AM

Harbormaster completed remote builds in B139249: Diff 394292.Dec 14 2021, 10:23 AM

Fix bug, add tests

I've added a unit test

mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp
21	We do need one, since I'm using `raw_svector_ostream` below

whchung accepted this revision. · Dec 14 2021, 10:50 AM

This revision is now accepted and ready to land. · Dec 14 2021, 10:50 AM

Harbormaster completed remote builds in B139259: Diff 394311.Dec 14 2021, 11:03 AM

Closed by commit rGc57b2a0635df: [MLIR][GPU] Make max flat work group size for ROCDL kernels configurable (authored by krzysz00). · Explain Why · Dec 14 2021, 12:12 PM

This revision was automatically updated to reflect the committed changes.

krzysz00 added a commit: rGc57b2a0635df: [MLIR][GPU] Make max flat work group size for ROCDL kernels configurable.

Revision Contents

Path

Size

mlir/

lib/

Target/

LLVMIR/

Dialect/

ROCDL/

ROCDLToLLVMIRTranslation.cpp

24 lines

test/

Target/

LLVMIR/

rocdl.mlir

12 lines

Diff 394341

mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp

Show All 12 Lines

#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"		#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"		#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/IR/Operation.h"		#include "mlir/IR/Operation.h"
#include "mlir/Target/LLVMIR/ModuleTranslation.h"		#include "mlir/Target/LLVMIR/ModuleTranslation.h"

#include "llvm/IR/IRBuilder.h"		#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"		#include "llvm/IR/IntrinsicsAMDGPU.h"
		#include "llvm/Support/raw_ostream.h"
		whchung (inline comment, not done): I assume we can live without this header file?
		krzysz00 (author, inline reply, done): We do need one, since I'm using `raw_svector_ostream` below

using namespace mlir;		using namespace mlir;
using namespace mlir::LLVM;		using namespace mlir::LLVM;
using mlir::LLVM::detail::createIntrinsicCall;		using mlir::LLVM::detail::createIntrinsicCall;

// Create a call to ROCm-Device-Library function		// Create a call to ROCm-Device-Library function
// Currently this routine will work only for calling ROCDL functions that		// Currently this routine will work only for calling ROCDL functions that
// take a single int32 argument. It is likely that the interface of this		// take a single int32 argument. It is likely that the interface of this
Show All 37 Lines	amendOperation(Operation *op, NamedAttribute attribute,
LLVM::ModuleTranslation &moduleTranslation) const final {		LLVM::ModuleTranslation &moduleTranslation) const final {
if (attribute.getName() == ROCDL::ROCDLDialect::getKernelFuncAttrName()) {		if (attribute.getName() == ROCDL::ROCDLDialect::getKernelFuncAttrName()) {
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);		auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
if (!func)		if (!func)
return failure();		return failure();

// For GPU kernels,		// For GPU kernels,
// 1. Insert AMDGPU_KERNEL calling convention.		// 1. Insert AMDGPU_KERNEL calling convention.
// 2. Insert amdgpu-flat-workgroup-size(1, 256) attribute.		// 2. Insert amdgpu-flat-work-group-size(1, 256) attribute unless the user
		// has overriden this value - 256 is the default in clang
// 3. Insert amdgpu-implicitarg-num-bytes=56 (which must be set on OpenCL		// 3. Insert amdgpu-implicitarg-num-bytes=56 (which must be set on OpenCL
// and HIP kernels per Clang)		// and HIP kernels per Clang)
llvm::Function *llvmFunc =		llvm::Function *llvmFunc =
moduleTranslation.lookupFunction(func.getName());		moduleTranslation.lookupFunction(func.getName());
llvmFunc->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);		llvmFunc->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
		if (!llvmFunc->hasFnAttribute("amdgpu-flat-work-group-size")) {
llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1, 256");		llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1, 256");
		}
llvmFunc->addFnAttr("amdgpu-implicitarg-num-bytes", "56");		llvmFunc->addFnAttr("amdgpu-implicitarg-num-bytes", "56");
}		}
		// Override flat-work-group-size
		if ("rocdl.max_flat_work_group_size" == attribute.getName()) {
		auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
		if (!func)
		return failure();
		auto value = attribute.getValue().dyn_cast<IntegerAttr>();
		if (!value)
		return failure();

		llvm::Function *llvmFunc =
		moduleTranslation.lookupFunction(func.getName());
		llvm::SmallString<8> llvmAttrValue;
		llvm::raw_svector_ostream attrValueStream(llvmAttrValue);
		attrValueStream << "1, " << value.getInt();
		llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue);
		}
return success();		return success();
}		}
};		};
} // namespace		} // namespace

void mlir::registerROCDLDialectTranslation(DialectRegistry &registry) {		void mlir::registerROCDLDialectTranslation(DialectRegistry &registry) {
registry.insert<ROCDL::ROCDLDialect>();		registry.insert<ROCDL::ROCDLDialect>();
registry.addDialectInterface<ROCDL::ROCDLDialect,		registry.addDialectInterface<ROCDL::ROCDLDialect,
ROCDLDialectLLVMIRTranslationInterface>();		ROCDLDialectLLVMIRTranslationInterface>();
}		}

void mlir::registerROCDLDialectTranslation(MLIRContext &context) {		void mlir::registerROCDLDialectTranslation(MLIRContext &context) {
DialectRegistry registry;		DialectRegistry registry;
registerROCDLDialectTranslation(registry);		registerROCDLDialectTranslation(registry);
context.appendDialectRegistry(registry);		context.appendDialectRegistry(registry);
}		}

mlir/test/Target/LLVMIR/rocdl.mlir

Show All 24 Lines	llvm.func @rocdl_special_regs() -> i32 {
// CHECK: call i64 @__ockl_get_global_size(i32 1)		// CHECK: call i64 @__ockl_get_global_size(i32 1)
%11 = rocdl.grid.dim.y : i64		%11 = rocdl.grid.dim.y : i64
// CHECK: call i64 @__ockl_get_global_size(i32 2)		// CHECK: call i64 @__ockl_get_global_size(i32 2)
%12 = rocdl.grid.dim.z : i64		%12 = rocdl.grid.dim.z : i64
llvm.return %1 : i32		llvm.return %1 : i32
}		}

llvm.func @kernel_func() attributes {rocdl.kernel} {		llvm.func @kernel_func() attributes {rocdl.kernel} {
// CHECK-LABEL: amdgpu_kernel void @kernel_func		// CHECK-LABEL: amdgpu_kernel void @kernel_func()
		// CHECK: #[[$KERNEL_ATTRS:[0-9]+]]
		llvm.return
		}

		llvm.func @kernel_func_workgroups()
		attributes {rocdl.kernel, rocdl.max_flat_work_group_size = 1024 : index} {
		// CHECK-LABEL: amdgpu_kernel void @kernel_func_workgroups()
		// CHECK: #[[$KERNEL_WORKGROUP_ATTRS:[0-9]+]]
llvm.return		llvm.return
}		}

llvm.func @rocdl.barrier() {		llvm.func @rocdl.barrier() {
// CHECK: fence syncscope("workgroup") release		// CHECK: fence syncscope("workgroup") release
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier()		// CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
// CHECK-NEXT: fence syncscope("workgroup") acquire		// CHECK-NEXT: fence syncscope("workgroup") acquire
rocdl.barrier		rocdl.barrier
▲ Show 20 Lines • Show All 130 Lines • ▼ Show 20 Lines	llvm.func @rocdl.mubuf(%rsrc : vector<4xi32>, %vindex : i32,
// CHECK: call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %{{.}}, <4 x i32> %{{.}}, i32 %{{.}}, i32 %{{.}}, i1 {{.}}, i1 {{.}})		// CHECK: call void @llvm.amdgcn.buffer.store.v2f32(<2 x float> %{{.}}, <4 x i32> %{{.}}, i32 %{{.}}, i32 %{{.}}, i1 {{.}}, i1 {{.}})
rocdl.buffer.store %vdata2, %rsrc, %vindex, %offset, %glc, %slc : vector<2xf32>		rocdl.buffer.store %vdata2, %rsrc, %vindex, %offset, %glc, %slc : vector<2xf32>
// CHECK: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %{{.}}, <4 x i32> %{{.}}, i32 %{{.}}, i32 %{{.}}, i1 {{.}}, i1 {{.}})		// CHECK: call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %{{.}}, <4 x i32> %{{.}}, i32 %{{.}}, i32 %{{.}}, i1 {{.}}, i1 {{.}})
rocdl.buffer.store %vdata4, %rsrc, %vindex, %offset, %glc, %slc : vector<4xf32>		rocdl.buffer.store %vdata4, %rsrc, %vindex, %offset, %glc, %slc : vector<4xf32>

llvm.return		llvm.return
}		}

		// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1, 256" "amdgpu-implicitarg-num-bytes"="56" }
		// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1, 1024"