Diff 482200

mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td

Show All 24 Lines	def ROCDL_Dialect : Dialect {
let cppNamespace = "::mlir::ROCDL";		let cppNamespace = "::mlir::ROCDL";
let dependentDialects = ["LLVM::LLVMDialect"];		let dependentDialects = ["LLVM::LLVMDialect"];
let hasOperationAttrVerify = 1;		let hasOperationAttrVerify = 1;

let extraClassDeclaration = [{		let extraClassDeclaration = [{
/// Get the name of the attribute used to annotate external kernel		/// Get the name of the attribute used to annotate external kernel
/// functions.		/// functions.
static StringRef getKernelFuncAttrName() { return "rocdl.kernel"; }		static StringRef getKernelFuncAttrName() { return "rocdl.kernel"; }
		/// Get the name of the function attribute holding the flat work group
		/// size. The ROCDL-to-LLVM-IR translation copies its string value into
		/// the "amdgpu-flat-work-group-size" LLVM function attribute.
		static constexpr ::llvm::StringLiteral getFlatWorkGroupSizeAttrName() {
		return ::llvm::StringLiteral("rocdl.flat_work_group_size");
		}
		/// Get the name of the function attribute holding the required work
		/// group size. The ROCDL-to-LLVM-IR translation expects a dense i32 array
		/// and emits its elements as `reqd_work_group_size` function metadata.
		static constexpr ::llvm::StringLiteral getReqdWorkGroupSizeAttrName() {
		return ::llvm::StringLiteral("rocdl.reqd_work_group_size");
		}
}];		}];
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// ROCDL op definitions		// ROCDL op definitions
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

class ROCDL_Op<string mnemonic, list<Trait> traits = []> :		class ROCDL_Op<string mnemonic, list<Trait> traits = []> :
LLVM_OpBase<ROCDL_Dialect, mnemonic, traits> {		LLVM_OpBase<ROCDL_Dialect, mnemonic, traits> {
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// ROCDL special register op definitions		// ROCDL special register op definitions
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

class ROCDL_SpecialRegisterOp<string mnemonic,		class ROCDL_SpecialRegisterOp<string mnemonic,
list<Trait> traits = []> :		list<Trait> traits = []> :
ROCDL_Op<mnemonic, !listconcat(traits, [Pure])>,		ROCDL_Op<mnemonic, !listconcat(traits, [Pure])>,
Results<(outs LLVM_Type:$res)>, Arguments<(ins)> {		Results<(outs LLVM_Type:$res)>, Arguments<(ins)> {
string llvmBuilder = "$res = createIntrinsicCall(builder,"		string llvmBuilder = "$res = createIntrinsicCallWithRange(builder,"
# "llvm::Intrinsic::amdgcn_" # !subst(".","_", mnemonic) # ");";		# "llvm::Intrinsic::amdgcn_" # !subst(".","_", mnemonic)
		# ", op->getAttrOfType<::mlir::DenseI32ArrayAttr>(\"range\"));";
let assemblyFormat = "attr-dict `:` type($res)";		let assemblyFormat = "attr-dict `:` type($res)";
}		}

class ROCDL_DeviceFunctionOp<string mnemonic, string device_function,		class ROCDL_DeviceFunctionOp<string mnemonic, string device_function,
int parameter, list<Trait> traits = []> :		int parameter, list<Trait> traits = []> :
ROCDL_Op<mnemonic, !listconcat(traits, [Pure])>,		ROCDL_Op<mnemonic, !listconcat(traits, [Pure])>,
Results<(outs LLVM_Type:$res)>, Arguments<(ins)> {		Results<(outs LLVM_Type:$res)>, Arguments<(ins)> {
string llvmBuilder = "$res = createDeviceFunctionCall(builder, \""		string llvmBuilder = "$res = createDeviceFunctionCall(builder, \""
▲ Show 20 Lines • Show All 190 Lines • Show Last 20 Lines

mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h

	//===- IndexIntrinsicsOpLowering.h - GPU IndexOps Lowering class - C++ --===//			//===- IndexIntrinsicsOpLowering.h - GPU IndexOps Lowering class - C++ --===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	#ifndef MLIR_CONVERSION_GPUCOMMON_INDEXINTRINSICSOPLOWERING_H_			#ifndef MLIR_CONVERSION_GPUCOMMON_INDEXINTRINSICSOPLOWERING_H_
	#define MLIR_CONVERSION_GPUCOMMON_INDEXINTRINSICSOPLOWERING_H_			#define MLIR_CONVERSION_GPUCOMMON_INDEXINTRINSICSOPLOWERING_H_

	#include "mlir/Conversion/LLVMCommon/Pattern.h"			#include "mlir/Conversion/LLVMCommon/Pattern.h"
	#include "mlir/Dialect/GPU/IR/GPUDialect.h"			#include "mlir/Dialect/GPU/IR/GPUDialect.h"
	#include "mlir/Dialect/LLVMIR/LLVMDialect.h"			#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
				#include "mlir/IR/BuiltinAttributes.h"

	namespace mlir {			namespace mlir {

	// Rewriting that replaces Op with XOp, YOp, or ZOp depending on the dimension			// Rewriting that replaces Op with XOp, YOp, or ZOp depending on the dimension
	// that Op operates on. Op is assumed to return an `index` value and			// that Op operates on. Op is assumed to return an `index` value and
	// XOp, YOp and ZOp are assumed to return an `llvm.i32` value. Depending on			// XOp, YOp and ZOp are assumed to return an `llvm.i32` value. Depending on
	// `indexBitwidth`, sign-extend or truncate the resulting value to match the			// `indexBitwidth`, sign-extend or truncate the resulting value to match the
	// bitwidth expected by the consumers of the value.			// bitwidth expected by the consumers of the value.
	template <typename Op, typename XOp, typename YOp, typename ZOp>			template <typename Op, typename XOp, typename YOp, typename ZOp>
	struct GPUIndexIntrinsicOpLowering : public ConvertOpToLLVMPattern<Op> {			struct GPUIndexIntrinsicOpLowering : public ConvertOpToLLVMPattern<Op> {
	private:			private:
	unsigned indexBitwidth;			unsigned indexBitwidth;
				StringRef boundsAttrName;

	public:			public:
	explicit GPUIndexIntrinsicOpLowering(LLVMTypeConverter &typeConverter)			explicit GPUIndexIntrinsicOpLowering(LLVMTypeConverter &typeConverter)
	: ConvertOpToLLVMPattern<Op>(typeConverter),			: ConvertOpToLLVMPattern<Op>(typeConverter),
	indexBitwidth(typeConverter.getIndexTypeBitwidth()) {}			indexBitwidth(typeConverter.getIndexTypeBitwidth()),
				boundsAttrName("") {}

				explicit GPUIndexIntrinsicOpLowering(LLVMTypeConverter &typeConverter,
				StringRef boundsAttrName)
				: ConvertOpToLLVMPattern<Op>(typeConverter),
				indexBitwidth(typeConverter.getIndexTypeBitwidth()),
				boundsAttrName(boundsAttrName) {}

	// Convert the kernel arguments to an LLVM type, preserve the rest.			// Convert the kernel arguments to an LLVM type, preserve the rest.
	LogicalResult			LogicalResult
	matchAndRewrite(Op op, typename Op::Adaptor adaptor,			matchAndRewrite(Op op, typename Op::Adaptor adaptor,
	ConversionPatternRewriter &rewriter) const override {			ConversionPatternRewriter &rewriter) const override {
	auto loc = op->getLoc();			auto loc = op->getLoc();
	MLIRContext *context = rewriter.getContext();			MLIRContext *context = rewriter.getContext();
	Value newOp;			Operation *newOp;
	switch (op.getDimension()) {			switch (op.getDimension()) {
	case gpu::Dimension::x:			case gpu::Dimension::x:
	newOp = rewriter.create<XOp>(loc, IntegerType::get(context, 32));			newOp = rewriter.create<XOp>(loc, IntegerType::get(context, 32));
	break;			break;
	case gpu::Dimension::y:			case gpu::Dimension::y:
	newOp = rewriter.create<YOp>(loc, IntegerType::get(context, 32));			newOp = rewriter.create<YOp>(loc, IntegerType::get(context, 32));
	break;			break;
	case gpu::Dimension::z:			case gpu::Dimension::z:
	newOp = rewriter.create<ZOp>(loc, IntegerType::get(context, 32));			newOp = rewriter.create<ZOp>(loc, IntegerType::get(context, 32));
	break;			break;
	}			}

				// Locate the enclosing function, which may still be a gpu.func or may
				// already be an llvm.func. Start from nullptr so the null check below
				// is well-defined when the op has neither kind of parent.
				Operation *function = nullptr;
				if (auto gpuFunc = op->template getParentOfType<gpu::GPUFuncOp>())
				  function = gpuFunc;
				if (auto llvmFunc = op->template getParentOfType<LLVM::LLVMFuncOp>())
				  function = llvmFunc;
				if (!boundsAttrName.empty() && function) {
				  if (auto attr = function->template getAttrOfType<DenseI32ArrayAttr>(
				          boundsAttrName)) {
				    // Only annotate when the attribute has an entry for this dimension;
				    // a shorter array would otherwise be indexed out of bounds.
				    ArrayRef<int32_t> bounds = attr.asArrayRef();
				    auto dim = static_cast<uint32_t>(op.getDimension());
				    if (dim < bounds.size()) {
				      int32_t maximum = bounds[dim];
				      // Record {0, maximum} on the intrinsic op as its `range` attribute.
				      newOp->setAttr("range",
				                     rewriter.getDenseI32ArrayAttr({0, maximum}));
				    }
				  }
				}

	if (indexBitwidth > 32) {			if (indexBitwidth > 32) {
	newOp = rewriter.create<LLVM::SExtOp>(			newOp = rewriter.create<LLVM::SExtOp>(
	loc, IntegerType::get(context, indexBitwidth), newOp);			loc, IntegerType::get(context, indexBitwidth), newOp->getResult(0));
	} else if (indexBitwidth < 32) {			} else if (indexBitwidth < 32) {
	newOp = rewriter.create<LLVM::TruncOp>(			newOp = rewriter.create<LLVM::TruncOp>(
	loc, IntegerType::get(context, indexBitwidth), newOp);			loc, IntegerType::get(context, indexBitwidth), newOp->getResult(0));
	}			}

	rewriter.replaceOp(op, {newOp});			rewriter.replaceOp(op, newOp->getResults());
	return success();			return success();
	}			}
	};			};

	} // namespace mlir			} // namespace mlir

	#endif // MLIR_CONVERSION_GPUCOMMON_INDEXINTRINSICSOPLOWERING_H_			#endif // MLIR_CONVERSION_GPUCOMMON_INDEXINTRINSICSOPLOWERING_H_

mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp

Show All 20 Lines
#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"		#include "mlir/Conversion/LLVMCommon/LoweringOptions.h"
#include "mlir/Conversion/LLVMCommon/Pattern.h"		#include "mlir/Conversion/LLVMCommon/Pattern.h"
#include "mlir/Conversion/LLVMCommon/TypeConverter.h"		#include "mlir/Conversion/LLVMCommon/TypeConverter.h"
#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"		#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"		#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"		#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"		#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/GPU/Transforms/Passes.h"		#include "mlir/Dialect/GPU/Transforms/Passes.h"
		#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"		#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Math/IR/Math.h"		#include "mlir/Dialect/Math/IR/Math.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"		#include "mlir/Dialect/Vector/IR/VectorOps.h"
		#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/Pass/Pass.h"		#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"		#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"		#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "llvm/Support/FormatVariadic.h"		#include "llvm/Support/FormatVariadic.h"

#include "../GPUCommon/GPUOpsLowering.h"		#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"		#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
#include "../GPUCommon/OpToFuncCallLowering.h"		#include "../GPUCommon/OpToFuncCallLowering.h"
▲ Show 20 Lines • Show All 94 Lines • ▼ Show 20 Lines	void runOnOperation() override {
cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns);		cf::populateControlFlowToLLVMConversionPatterns(converter, llvmPatterns);
populateFuncToLLVMConversionPatterns(converter, llvmPatterns);		populateFuncToLLVMConversionPatterns(converter, llvmPatterns);
populateMemRefToLLVMConversionPatterns(converter, llvmPatterns);		populateMemRefToLLVMConversionPatterns(converter, llvmPatterns);
populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime);		populateGpuToROCDLConversionPatterns(converter, llvmPatterns, runtime);
LLVMConversionTarget target(getContext());		LLVMConversionTarget target(getContext());
configureGpuToROCDLConversionLegality(target);		configureGpuToROCDLConversionLegality(target);
if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))		if (failed(applyPartialConversion(m, target, std::move(llvmPatterns))))
signalPassFailure();		signalPassFailure();

		// Manually rewrite known block size attributes so the LLVMIR translation
		// infrastructure can pick them up.
		// NOTE(review): removeAttr runs before the type check, so a known-block-size
		// attribute that is not a DenseI32ArrayAttr is dropped without replacement.
		m.walk([ctx](LLVM::LLVMFuncOp op) {
		if (auto blockSizes =
		op->removeAttr(gpu::GPUFuncOp::getKnownBlockSizeAttrName())
		.dyn_cast_or_null<DenseI32ArrayAttr>()) {
		// Carry the per-dimension sizes over unchanged under the ROCDL name.
		op->setAttr(ROCDL::ROCDLDialect::getReqdWorkGroupSizeAttrName(),
		blockSizes);
		// Also set up the rocdl.flat_work_group_size attribute to prevent
		// conflicting metadata.
		// The flat size is the product of all entries, computed in 32-bit
		// unsigned arithmetic (wraps on overflow; entries are converted from
		// int32_t).
		uint32_t flatSize = 1;
		for (uint32_t size : blockSizes.asArrayRef()) {
		flatSize *= size;
		}
		// Encoded as "<size>,<size>": the same value appears on both sides of
		// the comma.
		StringAttr flatSizeAttr =
		StringAttr::get(ctx, Twine(flatSize) + "," + Twine(flatSize));
		op->setAttr(ROCDL::ROCDLDialect::getFlatWorkGroupSizeAttrName(),
		flatSizeAttr);
		}
		});
}		}
};		};

} // namespace		} // namespace

void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {		void mlir::configureGpuToROCDLConversionLegality(ConversionTarget &target) {
target.addIllegalOp<func::FuncOp>();		target.addIllegalOp<func::FuncOp>();
target.addLegalDialect<::mlir::LLVM::LLVMDialect>();		target.addLegalDialect<::mlir::LLVM::LLVMDialect>();
Show All 18 Lines
void mlir::populateGpuToROCDLConversionPatterns(		void mlir::populateGpuToROCDLConversionPatterns(
LLVMTypeConverter &converter, RewritePatternSet &patterns,		LLVMTypeConverter &converter, RewritePatternSet &patterns,
mlir::gpu::amd::Runtime runtime) {		mlir::gpu::amd::Runtime runtime) {
using mlir::gpu::amd::Runtime;		using mlir::gpu::amd::Runtime;

populateWithGenerated(patterns);		populateWithGenerated(patterns);
patterns		patterns
.add<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,		.add<GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>,		ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>>(
GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,		converter, gpu::GPUFuncOp::getKnownBlockSizeAttrName());
		patterns.add<GPUIndexIntrinsicOpLowering<
		gpu::BlockIdOp, ROCDL::BlockIdXOp, ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>>(
		converter, gpu::GPUFuncOp::getKnownGridSizeAttrName());
		patterns
		.add<GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>,		ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>,
GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, ROCDL::BlockIdXOp,
ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>,
GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,		GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
ROCDL::GridDimYOp, ROCDL::GridDimZOp>,		ROCDL::GridDimYOp, ROCDL::GridDimZOp>,
GPUReturnOpLowering>(converter);		GPUReturnOpLowering>(converter);
patterns.add<GPUFuncOpLowering>(		patterns.add<GPUFuncOpLowering>(
converter, /*allocaAddrSpace=*/5,		converter, /*allocaAddrSpace=*/5,
StringAttr::get(&converter.getContext(),		StringAttr::get(&converter.getContext(),
ROCDL::ROCDLDialect::getKernelFuncAttrName()));		ROCDL::ROCDLDialect::getKernelFuncAttrName()));
if (Runtime::HIP == runtime) {		if (Runtime::HIP == runtime) {
▲ Show 20 Lines • Show All 52 Lines • Show Last 20 Lines

mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp

//===- ROCDLToLLVMIRTranslation.cpp - Translate ROCDL to LLVM IR ----------===//		//===- ROCDLToLLVMIRTranslation.cpp - Translate ROCDL to LLVM IR ----------===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This file implements a translation between the MLIR ROCDL dialect and		// This file implements a translation between the MLIR ROCDL dialect and
// LLVM IR.		// LLVM IR.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"		#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"		#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
		#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Operation.h"		#include "mlir/IR/Operation.h"
#include "mlir/Target/LLVMIR/ModuleTranslation.h"		#include "mlir/Target/LLVMIR/ModuleTranslation.h"

#include "llvm/IR/IRBuilder.h"		#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"		#include "llvm/IR/IntrinsicsAMDGPU.h"
		#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/raw_ostream.h"		#include "llvm/Support/raw_ostream.h"

using namespace mlir;		using namespace mlir;
using namespace mlir::LLVM;		using namespace mlir::LLVM;
using mlir::LLVM::detail::createIntrinsicCall;		using mlir::LLVM::detail::createIntrinsicCall;

		/// Emit a call to the zero-argument intrinsic `intrinsic`. When
		/// `maybeRange` is non-null and holds exactly two values {lo, hi}, attach
		/// them to the call as `!range` metadata built from 32-bit integers. A null
		/// attribute, or one without exactly two elements, leaves the call
		/// unannotated.
		static llvm::Value *createIntrinsicCallWithRange(llvm::IRBuilderBase &builder,
		                                                 llvm::Intrinsic::ID intrinsic,
		                                                 DenseI32ArrayAttr maybeRange) {
		  auto *inst = llvm::cast<llvm::CallInst>(
		      createIntrinsicCall(builder, intrinsic, {}, {}));
		  if (!maybeRange)
		    return inst;

		  // A range is a {lo, hi} pair. Skip anything else instead of reading past
		  // the end of the array.
		  ArrayRef<int32_t> bounds = maybeRange.asArrayRef();
		  if (bounds.size() != 2)
		    return inst;

		  llvm::MDBuilder mdBuilder(builder.getContext());
		  llvm::MDNode *range = mdBuilder.createRange(llvm::APInt(32, bounds[0]),
		                                              llvm::APInt(32, bounds[1]));
		  inst->setMetadata(llvm::LLVMContext::MD_range, range);
		  return inst;
		}

// Create a call to ROCm-Device-Library function		// Create a call to ROCm-Device-Library function
// Currently this routine will work only for calling ROCDL functions that		// Currently this routine will work only for calling ROCDL functions that
// take a single int32 argument. It is likely that the interface of this		// take a single int32 argument. It is likely that the interface of this
// function will change to make it more generic.		// function will change to make it more generic.
static llvm::Value *createDeviceFunctionCall(llvm::IRBuilderBase &builder,		static llvm::Value *createDeviceFunctionCall(llvm::IRBuilderBase &builder,
StringRef fnName, int parameter) {		StringRef fnName, int parameter) {
llvm::Module *module = builder.GetInsertBlock()->getModule();		llvm::Module *module = builder.GetInsertBlock()->getModule();
llvm::FunctionType *functionType = llvm::FunctionType::get(		llvm::FunctionType *functionType = llvm::FunctionType::get(
Show All 40 Lines	if (attribute.getName() == ROCDL::ROCDLDialect::getKernelFuncAttrName()) {
// 2. Insert amdgpu-flat-work-group-size(1, 256) attribute unless the user		// 2. Insert amdgpu-flat-work-group-size(1, 256) attribute unless the user
// has overriden this value - 256 is the default in clang		// has overriden this value - 256 is the default in clang
// 3. Insert amdgpu-implicitarg-num-bytes=56 (which must be set on OpenCL		// 3. Insert amdgpu-implicitarg-num-bytes=56 (which must be set on OpenCL
// and HIP kernels per Clang)		// and HIP kernels per Clang)
llvm::Function *llvmFunc =		llvm::Function *llvmFunc =
moduleTranslation.lookupFunction(func.getName());		moduleTranslation.lookupFunction(func.getName());
llvmFunc->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);		llvmFunc->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL);
if (!llvmFunc->hasFnAttribute("amdgpu-flat-work-group-size")) {		if (!llvmFunc->hasFnAttribute("amdgpu-flat-work-group-size")) {
llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1, 256");		llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1,256");
}		}
llvmFunc->addFnAttr("amdgpu-implicitarg-num-bytes", "56");		llvmFunc->addFnAttr("amdgpu-implicitarg-num-bytes", "56");
}		}
// Override flat-work-group-size		// Override flat-work-group-size
		// TODO: update clients to rocdl.flat_work_group_size instead,
		// then remove this half of the branch
if ("rocdl.max_flat_work_group_size" == attribute.getName()) {		if ("rocdl.max_flat_work_group_size" == attribute.getName()) {
auto func = dyn_cast<LLVM::LLVMFuncOp>(op);		auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
if (!func)		if (!func)
return failure();		return failure();
auto value = attribute.getValue().dyn_cast<IntegerAttr>();		auto value = attribute.getValue().dyn_cast<IntegerAttr>();
if (!value)		if (!value)
return failure();		return failure();

llvm::Function *llvmFunc =		llvm::Function *llvmFunc =
moduleTranslation.lookupFunction(func.getName());		moduleTranslation.lookupFunction(func.getName());
llvm::SmallString<8> llvmAttrValue;		llvm::SmallString<8> llvmAttrValue;
llvm::raw_svector_ostream attrValueStream(llvmAttrValue);		llvm::raw_svector_ostream attrValueStream(llvmAttrValue);
attrValueStream << "1, " << value.getInt();		attrValueStream << "1," << value.getInt();
llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue);		llvmFunc->addFnAttr("amdgpu-flat-work-group-size", llvmAttrValue);
}		}
		// Forward rocdl.flat_work_group_size verbatim: the attribute must be a
		// string on an llvm.func, and its value becomes the
		// "amdgpu-flat-work-group-size" LLVM function attribute.
		if (ROCDL::ROCDLDialect::getFlatWorkGroupSizeAttrName() ==
		    attribute.getName()) {
		  auto func = dyn_cast<LLVM::LLVMFuncOp>(op);
		  if (!func)
		    return failure();
		  auto value = attribute.getValue().dyn_cast<StringAttr>();
		  if (!value)
		    return failure();

		  llvm::Function *llvmFunc =
		      moduleTranslation.lookupFunction(func.getName());
		  // Pass the string through directly; no intermediate buffer and no
		  // diagnostic output on the translation path.
		  llvmFunc->addFnAttr("amdgpu-flat-work-group-size", value.getValue());
		}

		// Translate rocdl.reqd_work_group_size into `reqd_work_group_size`
		// metadata on the corresponding LLVM function. The attribute must be a
		// dense i32 array attached to an llvm.func; anything else is a failure.
		if (ROCDL::ROCDLDialect::getReqdWorkGroupSizeAttrName() ==
		    attribute.getName()) {
		  auto llvmFuncOp = dyn_cast<LLVM::LLVMFuncOp>(op);
		  if (!llvmFuncOp)
		    return failure();
		  auto sizes = attribute.getValue().dyn_cast<DenseI32ArrayAttr>();
		  if (!sizes)
		    return failure();

		  // Wrap every array element in an i32 constant metadata operand.
		  llvm::LLVMContext &llvmContext = moduleTranslation.getLLVMContext();
		  llvm::Type *i32Ty = llvm::IntegerType::get(llvmContext, 32);
		  SmallVector<llvm::Metadata *, 3> operands;
		  for (int32_t dimSize : sizes.asArrayRef())
		    operands.push_back(llvm::ConstantAsMetadata::get(
		        llvm::ConstantInt::get(i32Ty, dimSize)));

		  llvm::MDNode *node = llvm::MDNode::get(llvmContext, operands);
		  llvm::Function *target =
		      moduleTranslation.lookupFunction(llvmFuncOp.getName());
		  target->setMetadata("reqd_work_group_size", node);
		}
return success();		return success();
}		}
};		};
} // namespace		} // namespace

void mlir::registerROCDLDialectTranslation(DialectRegistry &registry) {		void mlir::registerROCDLDialectTranslation(DialectRegistry &registry) {
registry.insert<ROCDL::ROCDLDialect>();		registry.insert<ROCDL::ROCDLDialect>();
registry.addExtension(+[](MLIRContext *ctx, ROCDL::ROCDLDialect *dialect) {		registry.addExtension(+[](MLIRContext *ctx, ROCDL::ROCDLDialect *dialect) {
Show All 9 Lines

mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir

Show First 20 Lines • Show All 53 Lines • ▼ Show 20 Lines	func.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ,
: index, index, index, index, index, index,		: index, index, index, index, index, index,
index, index, index, index, index, index		index, index, index, index, index, index
}		}
}		}

// -----		// -----

gpu.module @test_module {		gpu.module @test_module {
		// CHECK-LABEL: func @gpu_index_ops_range()
		// CHECK-SAME: rocdl.flat_work_group_size = "1536,1536"
		// CHECK-SAME: rocdl.reqd_work_group_size = array<i32: 8, 12, 16>
		func.func @gpu_index_ops_range()
		-> (index, index, index, index, index, index) attributes
		{gpu.known_block_size = array<i32: 8, 12, 16>,
		gpu.known_grid_size = array<i32: 20, 24, 28>} {

		// CHECK: rocdl.workitem.id.x {range = array<i32: 0, 8>} : i32
		%tIdX = gpu.thread_id x
		// CHECK: rocdl.workitem.id.y {range = array<i32: 0, 12>} : i32
		%tIdY = gpu.thread_id y
		// CHECK: rocdl.workitem.id.z {range = array<i32: 0, 16>} : i32
		%tIdZ = gpu.thread_id z

		// CHECK: rocdl.workgroup.id.x {range = array<i32: 0, 20>} : i32
		%bIdX = gpu.block_id x
		// CHECK: rocdl.workgroup.id.y {range = array<i32: 0, 24>} : i32
		%bIdY = gpu.block_id y
		// CHECK: rocdl.workgroup.id.z {range = array<i32: 0, 28>} : i32
		%bIdZ = gpu.block_id z

		func.return %tIdX, %tIdY, %tIdZ, %bIdX, %bIdY, %bIdZ
		: index, index, index, index, index, index
		}
		}

		// -----

		gpu.module @test_module {
// CHECK-LABEL: func @gpu_index_comp		// CHECK-LABEL: func @gpu_index_comp
// CHECK32-LABEL: func @gpu_index_comp		// CHECK32-LABEL: func @gpu_index_comp
func.func @gpu_index_comp(%idx : index) -> index {		func.func @gpu_index_comp(%idx : index) -> index {
// CHECK: = llvm.add %{{.*}}, %{{.*}} : i64		// CHECK: = llvm.add %{{.*}}, %{{.*}} : i64
// CHECK32: = llvm.add %{{.*}}, %{{.*}} : i32		// CHECK32: = llvm.add %{{.*}}, %{{.*}} : i32
%0 = arith.addi %idx, %idx : index		%0 = arith.addi %idx, %idx : index
// CHECK: llvm.return %{{.*}} : i64		// CHECK: llvm.return %{{.*}} : i64
// CHECK32: llvm.return %{{.*}} : i32		// CHECK32: llvm.return %{{.*}} : i32
▲ Show 20 Lines • Show All 333 Lines • Show Last 20 Lines

mlir/test/Target/LLVMIR/rocdl.mlir

Show All 20 Lines	llvm.func @rocdl_special_regs() -> i32 {
// CHECK: call i64 @__ockl_get_local_size(i32 2)		// CHECK: call i64 @__ockl_get_local_size(i32 2)
%9 = rocdl.workgroup.dim.z : i64		%9 = rocdl.workgroup.dim.z : i64
// CHECK: call i64 @__ockl_get_global_size(i32 0)		// CHECK: call i64 @__ockl_get_global_size(i32 0)
%10 = rocdl.grid.dim.x : i64		%10 = rocdl.grid.dim.x : i64
// CHECK: call i64 @__ockl_get_global_size(i32 1)		// CHECK: call i64 @__ockl_get_global_size(i32 1)
%11 = rocdl.grid.dim.y : i64		%11 = rocdl.grid.dim.y : i64
// CHECK: call i64 @__ockl_get_global_size(i32 2)		// CHECK: call i64 @__ockl_get_global_size(i32 2)
%12 = rocdl.grid.dim.z : i64		%12 = rocdl.grid.dim.z : i64

		// CHECK: call i32 @llvm.amdgcn.workitem.id.x(),{{.*}} !range ![[$RANGE:[0-9]+]]
		%13 = rocdl.workitem.id.x {range = array<i32: 0, 64>} : i32

llvm.return %1 : i32		llvm.return %1 : i32
}		}

llvm.func @kernel_func() attributes {rocdl.kernel} {		llvm.func @kernel_func() attributes {rocdl.kernel} {
// CHECK-LABEL: amdgpu_kernel void @kernel_func()		// CHECK-LABEL: amdgpu_kernel void @kernel_func()
// CHECK: #[[$KERNEL_ATTRS:[0-9]+]]		// CHECK: #[[$KERNEL_ATTRS:[0-9]+]]
llvm.return		llvm.return
}		}

llvm.func @kernel_func_workgroups()		llvm.func @kernel_func_workgroups()
attributes {rocdl.kernel, rocdl.max_flat_work_group_size = 1024 : index} {		attributes {rocdl.kernel, rocdl.max_flat_work_group_size = 1024 : index} {
// CHECK-LABEL: amdgpu_kernel void @kernel_func_workgroups()		// CHECK-LABEL: amdgpu_kernel void @kernel_func_workgroups()
// CHECK: #[[$KERNEL_WORKGROUP_ATTRS:[0-9]+]]		// CHECK: #[[$KERNEL_WORKGROUP_ATTRS:[0-9]+]]
llvm.return		llvm.return
}		}

		llvm.func @known_block_sizes()
		attributes {rocdl.kernel,
		rocdl.flat_work_group_size = "128,128",
		rocdl.reqd_work_group_size = array<i32: 16, 4, 2>} {
		// CHECK-LABEL: amdgpu_kernel void @known_block_sizes()
		// CHECK: #[[$KNOWN_BLOCK_SIZE_ATTRS:[0-9]+]]
		// CHECK: !reqd_work_group_size ![[$REQD_WORK_GROUP_SIZE:[0-9]+]]
		llvm.return
		}

llvm.func @rocdl.barrier() {		llvm.func @rocdl.barrier() {
// CHECK: fence syncscope("workgroup") release		// CHECK: fence syncscope("workgroup") release
// CHECK-NEXT: call void @llvm.amdgcn.s.barrier()		// CHECK-NEXT: call void @llvm.amdgcn.s.barrier()
// CHECK-NEXT: fence syncscope("workgroup") acquire		// CHECK-NEXT: fence syncscope("workgroup") acquire
rocdl.barrier		rocdl.barrier
llvm.return		llvm.return
}		}

▲ Show 20 Lines • Show All 167 Lines • ▼ Show 20 Lines	llvm.func @rocdl.raw.buffer.atomic(%rsrc : vector<4xi32>,
// CHECK-LABEL: rocdl.raw.buffer.atomic		// CHECK-LABEL: rocdl.raw.buffer.atomic
// CHECK: call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}		// CHECK: call float @llvm.amdgcn.raw.buffer.atomic.fadd.f32(float %{{.*}}, <4 x i32> %{{.*}}, i32 %{{.*}}, i32 %{{.*}}, i32 {{.*}}

rocdl.raw.buffer.atomic.fadd %vdata1, %rsrc, %offset, %soffset, %aux : f32		rocdl.raw.buffer.atomic.fadd %vdata1, %rsrc, %offset, %soffset, %aux : f32

llvm.return		llvm.return
}		}

// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1, 256" "amdgpu-implicitarg-num-bytes"="56" }		// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-implicitarg-num-bytes"="56" }
// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1, 1024"		// CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024"
		// CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128"
		// CHECK-DAG: ![[$RANGE]] = !{i32 0, i32 64}
		// CHECK-DAG: ![[$REQD_WORK_GROUP_SIZE]] = !{i32 16, i32 4, i32 2}

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][ROCDL] Translate known block size attributes to ROCDL
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 482200

mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td

mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h

mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp

mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp

mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir

mlir/test/Target/LLVMIR/rocdl.mlir

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][ROCDL] Translate known block size attributes to ROCDLClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 482200

mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td

mlir/lib/Conversion/GPUCommon/IndexIntrinsicsOpLowering.h

mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp

mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp

mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir

mlir/test/Target/LLVMIR/rocdl.mlir

[mlir][ROCDL] Translate known block size attributes to ROCDL
ClosedPublic