Diff 283547

mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h

	//===- GPUCommonPass.h - MLIR GPU runtime support -------------------------===//			//===- GPUCommonPass.h - MLIR GPU runtime support -------------------------===//
	//			//
	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.			// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
	// See https://llvm.org/LICENSE.txt for license information.			// See https://llvm.org/LICENSE.txt for license information.
	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception			// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	#ifndef MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_			#ifndef MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
	#define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_			#define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_

	#include "mlir/Support/LLVM.h"			#include "mlir/Support/LLVM.h"
	#include "llvm/IR/Module.h"			#include "llvm/IR/Module.h"
	#include <vector>			#include <vector>

	namespace mlir {			namespace mlir {

				class LLVMTypeConverter;
	class Location;			class Location;
	struct LogicalResult;			struct LogicalResult;
	class ModuleOp;			class ModuleOp;
	class Operation;			class Operation;
				class OwningRewritePatternList;

	template <typename T>			template <typename T>
	class OperationPass;			class OperationPass;

	namespace gpu {			namespace gpu {
	class GPUModuleOp;			class GPUModuleOp;
	} // namespace gpu			} // namespace gpu

	Show All 12 Lines
	///			///
	/// This pass does not generate code to call GPU runtime APIs directly but			/// This pass does not generate code to call GPU runtime APIs directly but
	/// instead uses a small wrapper library that exports a stable and conveniently			/// instead uses a small wrapper library that exports a stable and conveniently
	/// typed ABI on top of GPU runtimes such as CUDA or ROCm (HIP).			/// typed ABI on top of GPU runtimes such as CUDA or ROCm (HIP).
	std::unique_ptr<OperationPass<ModuleOp>>			std::unique_ptr<OperationPass<ModuleOp>>
	createConvertGpuLaunchFuncToGpuRuntimeCallsPass(			createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
	StringRef gpuBinaryAnnotation = "");			StringRef gpuBinaryAnnotation = "");

				/// Collect a set of patterns to convert from the GPU dialect to LLVM.
				void populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
				OwningRewritePatternList &patterns,
				StringRef gpuBinaryAnnotation);

	/// Creates a pass to convert kernel functions into GPU target object blobs.			/// Creates a pass to convert kernel functions into GPU target object blobs.
	///			///
	/// This transformation takes the body of each function that is annotated with			/// This transformation takes the body of each function that is annotated with
	/// the 'gpu.kernel' attribute, copies it to a new LLVM module, compiles the			/// the 'gpu.kernel' attribute, copies it to a new LLVM module, compiles the
	/// module with help of the GPU backend to target object and then invokes			/// module with help of the GPU backend to target object and then invokes
	/// the provided blobGenerator to produce a binary blob. Such blob is then			/// the provided blobGenerator to produce a binary blob. Such blob is then
	/// attached as a string attribute to the kernel function.			/// attached as a string attribute to the kernel function.
	///			///
	Show All 23 Lines

mlir/lib/Conversion/GPUCommon/CMakeLists.txt

Show All 28 Lines	add_mlir_conversion_library(MLIRGPUToGPURuntimeTransforms
${NVPTX_LIBS}		${NVPTX_LIBS}

LINK_LIBS PUBLIC		LINK_LIBS PUBLIC
MLIRGPU		MLIRGPU
MLIRIR		MLIRIR
MLIRLLVMIR		MLIRLLVMIR
MLIRPass		MLIRPass
MLIRSupport		MLIRSupport
		MLIRStandardToLLVM
)		)

mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp

	Show All 10 Lines
	// ABI, this pass uses a slim runtime layer that builds on top of the public			// ABI, this pass uses a slim runtime layer that builds on top of the public
	// API from GPU runtime headers.			// API from GPU runtime headers.
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"			#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"

	#include "../PassDetail.h"			#include "../PassDetail.h"
				#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
	#include "mlir/Dialect/GPU/GPUDialect.h"			#include "mlir/Dialect/GPU/GPUDialect.h"
	#include "mlir/Dialect/LLVMIR/LLVMDialect.h"			#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
	#include "mlir/IR/Attributes.h"			#include "mlir/IR/Attributes.h"
	#include "mlir/IR/Builders.h"			#include "mlir/IR/Builders.h"
	#include "mlir/IR/Function.h"			#include "mlir/IR/Function.h"
	#include "mlir/IR/Module.h"			#include "mlir/IR/Module.h"
	#include "mlir/IR/StandardTypes.h"			#include "mlir/IR/StandardTypes.h"

	#include "llvm/ADT/STLExtras.h"			#include "llvm/ADT/STLExtras.h"
	#include "llvm/IR/DataLayout.h"			#include "llvm/IR/DataLayout.h"
	#include "llvm/IR/DerivedTypes.h"			#include "llvm/IR/DerivedTypes.h"
	#include "llvm/IR/Module.h"			#include "llvm/IR/Module.h"
	#include "llvm/IR/Type.h"			#include "llvm/IR/Type.h"
	#include "llvm/Support/Error.h"			#include "llvm/Support/Error.h"
	#include "llvm/Support/FormatVariadic.h"			#include "llvm/Support/FormatVariadic.h"

	using namespace mlir;			using namespace mlir;

	// To avoid name mangling, these are defined in the mini-runtime file.
	static constexpr const char *kGpuModuleLoadName = "mgpuModuleLoad";
	static constexpr const char *kGpuModuleGetFunctionName =
	"mgpuModuleGetFunction";
	static constexpr const char *kGpuLaunchKernelName = "mgpuLaunchKernel";
	static constexpr const char *kGpuStreamCreateName = "mgpuStreamCreate";
	static constexpr const char *kGpuStreamSynchronizeName =
	"mgpuStreamSynchronize";
	static constexpr const char *kGpuMemHostRegisterName = "mgpuMemHostRegister";
	static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst";			static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst";

	namespace {			namespace {

	/// A pass to convert gpu.launch_func operations into a sequence of GPU
	/// runtime calls. Currently it supports CUDA and ROCm (HIP).
	///
	/// In essence, a gpu.launch_func operation gets compiled into the following
	/// sequence of runtime calls:
	///
	/// * moduleLoad -- loads the module given the cubin / hsaco data
	/// * moduleGetFunction -- gets a handle to the actual kernel function
	/// * getStreamHelper -- initializes a new compute stream on GPU
	/// * launchKernel -- launches the kernel on a stream
	/// * streamSynchronize -- waits for operations on the stream to finish
	///
	/// Intermediate data structures are allocated on the stack.
	class GpuLaunchFuncToGpuRuntimeCallsPass			class GpuLaunchFuncToGpuRuntimeCallsPass
	: public ConvertGpuLaunchFuncToGpuRuntimeCallsBase<			: public ConvertGpuLaunchFuncToGpuRuntimeCallsBase<
	GpuLaunchFuncToGpuRuntimeCallsPass> {			GpuLaunchFuncToGpuRuntimeCallsPass> {
	private:			public:
	LLVM::LLVMDialect *getLLVMDialect() { return llvmDialect; }			GpuLaunchFuncToGpuRuntimeCallsPass(StringRef gpuBinaryAnnotation) {
				if (!gpuBinaryAnnotation.empty())
	void initializeCachedTypes() {			this->gpuBinaryAnnotation = gpuBinaryAnnotation.str();
	llvmVoidType = LLVM::LLVMType::getVoidTy(&getContext());
	llvmPointerType = LLVM::LLVMType::getInt8PtrTy(&getContext());
	llvmPointerPointerType = llvmPointerType.getPointerTo();
	llvmInt8Type = LLVM::LLVMType::getInt8Ty(&getContext());
	llvmInt32Type = LLVM::LLVMType::getInt32Ty(&getContext());
	llvmInt64Type = LLVM::LLVMType::getInt64Ty(&getContext());
	llvmIntPtrType = LLVM::LLVMType::getIntNTy(
	&getContext(), llvmDialect->getDataLayout().getPointerSizeInBits());
	}			}

	LLVM::LLVMType getVoidType() { return llvmVoidType; }			// Run the dialect converter on the module.
				void runOnOperation() override;
	LLVM::LLVMType getPointerType() { return llvmPointerType; }			};

	LLVM::LLVMType getPointerPointerType() { return llvmPointerPointerType; }			class FunctionCallBuilder {
				public:
				FunctionCallBuilder(StringRef functionName, LLVM::LLVMType returnType,
				ArrayRef<LLVM::LLVMType> argumentTypes)
				: functionName(functionName),
				functionType(LLVM::LLVMType::getFunctionTy(returnType, argumentTypes,
				/*isVarArg=*/false)) {}
				LLVM::CallOp create(Location loc, OpBuilder &builder,
				ArrayRef<Value> arguments) const;

	LLVM::LLVMType getInt8Type() { return llvmInt8Type; }			private:
				StringRef functionName;
				LLVM::LLVMType functionType;
				};

	LLVM::LLVMType getInt32Type() { return llvmInt32Type; }			template <typename OpTy>
				class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
				public:
				explicit ConvertOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
				: ConvertOpToLLVMPattern<OpTy>(typeConverter) {}

	LLVM::LLVMType getInt64Type() { return llvmInt64Type; }			protected:
				MLIRContext *context = &this->typeConverter.getContext();

	LLVM::LLVMType getIntPtrType() {			LLVM::LLVMType llvmVoidType = LLVM::LLVMType::getVoidTy(context);
	return LLVM::LLVMType::getIntNTy(			LLVM::LLVMType llvmPointerType = LLVM::LLVMType::getInt8PtrTy(context);
	&getContext(),			LLVM::LLVMType llvmPointerPointerType = llvmPointerType.getPointerTo();
	getLLVMDialect()->getDataLayout().getPointerSizeInBits());			LLVM::LLVMType llvmInt8Type = LLVM::LLVMType::getInt8Ty(context);
	}			LLVM::LLVMType llvmInt32Type = LLVM::LLVMType::getInt32Ty(context);
				LLVM::LLVMType llvmInt64Type = LLVM::LLVMType::getInt64Ty(context);
				LLVM::LLVMType llvmIntPtrType = LLVM::LLVMType::getIntNTy(
				context, this->typeConverter.getPointerBitwidth(0));

				FunctionCallBuilder moduleLoadCallBuilder = {
				"mgpuModuleLoad",
				herhutUnsubmitted Not Done Reply Inline Actions What is the advantage of having these as field vs. a local helper function? herhut: What is the advantage of having these as field vs. a local helper function?
				csiggAuthorUnsubmitted Done Reply Inline Actions It doesn't make a big difference. Having a FunctionCallBuilder class maybe separates the details a little from the rest of the ConversionPattern. And probably overall it's a little shorter. But I can certainly change it, if you prefer functions. csigg: It doesn't make a big difference. Having a FunctionCallBuilder class maybe separates the…
				llvmPointerType /* void *module */,
				{llvmPointerType /* void *cubin */}};
				FunctionCallBuilder moduleGetFunctionCallBuilder = {
				"mgpuModuleGetFunction",
				llvmPointerType /* void *function */,
				{
				llvmPointerType, /* void *module */
				llvmPointerType /* char *name */
				}};
				FunctionCallBuilder launchKernelCallBuilder = {
				"mgpuLaunchKernel",
				llvmVoidType,
				{
				llvmPointerType, /* void* f */
				llvmIntPtrType, /* intptr_t gridXDim */
				llvmIntPtrType, /* intptr_t gridyDim */
				llvmIntPtrType, /* intptr_t gridZDim */
				llvmIntPtrType, /* intptr_t blockXDim */
				llvmIntPtrType, /* intptr_t blockYDim */
				llvmIntPtrType, /* intptr_t blockZDim */
				llvmInt32Type, /* unsigned int sharedMemBytes */
				llvmPointerType, /* void *hstream */
				llvmPointerPointerType, /* void **kernelParams */
				llvmPointerPointerType /* void **extra */
				}};
				FunctionCallBuilder streamCreateCallBuilder = {
				"mgpuStreamCreate", llvmPointerType /* void *stream */, {}};
				FunctionCallBuilder streamSynchronizeCallBuilder = {
				"mgpuStreamSynchronize",
				llvmVoidType,
				{llvmPointerType /* void *stream */}};
				};

	// Allocate a void pointer on the stack.			/// A rewrite pattern to convert gpu.launch_func operations into a sequence of
	Value allocatePointer(OpBuilder &builder, Location loc) {			/// GPU runtime calls. Currently it supports CUDA and ROCm (HIP).
	auto one = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),			///
	builder.getI32IntegerAttr(1));			/// In essence, a gpu.launch_func operation gets compiled into the following
	return builder.create<LLVM::AllocaOp>(loc, getPointerPointerType(), one,			/// sequence of runtime calls:
	/*alignment=*/0);			///
	}			/// * moduleLoad -- loads the module given the cubin / hsaco data
				/// * moduleGetFunction -- gets a handle to the actual kernel function
				/// * getStreamHelper -- initializes a new compute stream on GPU
				/// * launchKernel -- launches the kernel on a stream
				/// * streamSynchronize -- waits for operations on the stream to finish
				///
				/// Intermediate data structures are allocated on the stack.
				class ConvertLaunchFuncOpToGpuRuntimeCallPattern
				: public ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp> {
				public:
				ConvertLaunchFuncOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter,
				StringRef gpuBinaryAnnotation)
				: ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp>(typeConverter),
				gpuBinaryAnnotation(gpuBinaryAnnotation) {}

	void declareGpuRuntimeFunctions(Location loc);			private:
	void addParamToList(OpBuilder &builder, Location loc, Value param, Value list,			void addParamToArray(OpBuilder &builder, Location loc, Value param,
	unsigned pos, Value one);			Value array, unsigned pos, Value one) const;
	Value setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);			Value generateParamsArray(gpu::LaunchFuncOp launchOp, unsigned numArguments,
				OpBuilder &builder) const;
	Value generateKernelNameConstant(StringRef moduleName, StringRef name,			Value generateKernelNameConstant(StringRef moduleName, StringRef name,
	Location loc, OpBuilder &builder);			Location loc, OpBuilder &builder) const;
	void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);

	public:			LogicalResult
	GpuLaunchFuncToGpuRuntimeCallsPass() = default;			matchAndRewrite(Operation *op, ArrayRef<Value> operands,
	GpuLaunchFuncToGpuRuntimeCallsPass(StringRef gpuBinaryAnnotation) {			ConversionPatternRewriter &rewriter) const override;
	this->gpuBinaryAnnotation = gpuBinaryAnnotation.str();
	}

	// Run the dialect converter on the module.			llvm::SmallString<32> gpuBinaryAnnotation;
	void runOnOperation() override {			};
	// Cache the LLVMDialect for the current module.
	llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>();
	// Cache the used LLVM types.
	initializeCachedTypes();

	getOperation().walk(			class EraseGpuModuleOpPattern : public OpRewritePattern<gpu::GPUModuleOp> {
	[this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); });			using OpRewritePattern<gpu::GPUModuleOp>::OpRewritePattern;
				herhutUnsubmitted Not Done Reply Inline Actions This is dangerous, as the module could be deleted before the global has been created. Also, it is not strictly required as part of the lowering as long as we mark the nested gpu module as legal. It only needs to cleaned away shortly before exporting the module to llvm and that can be done by just traversing the outer module. herhut: This is dangerous, as the module could be deleted before the global has been created. Also, it…
				csiggAuthorUnsubmitted Done Reply Inline Actions I don't think it's dangerous as long as the `ConvertLaunchFuncOpToGpuRuntimeCallPattern` does not run after `EraseGpuModuleOpPattern`. In the same pass is OK, and the `populateGpuToLLVMConversionPatterns` function makes sure they are always part of the same PatternsList. Why would you like to not clean up the gpu modules when they are no longer needed, but have the user to do this manually? Doesn't `populateGpuToLLVMConversionPatterns` imply that it removes all gpu ops, instead of leaving one particular op type without uses? csigg: I don't think it's dangerous as long as the `ConvertLaunchFuncOpToGpuRuntimeCallPattern` does…
				herhutUnsubmitted Not Done Reply Inline Actions But how do you guarantee this? If the kernel module is before the functions that contain the launch, then it might get removed already, no? herhut: But how do you guarantee this? If the kernel module is before the functions that contain the…
				csiggAuthorUnsubmitted Done Reply Inline Actions I verified that it does not matter whether the kernel module is before the function or after. `rewriter.erase()` only marks it for removal, the actual removal happens during the pass' `finalize()`. So the kernel module will always be available for the gpu.launch_func rewrite in the same pass. csigg: I verified that it does not matter whether the kernel module is before the function or after.
				herhutUnsubmitted Not Done Reply Inline Actions Ah, ok, thanks for checking. In that case this is fine of course. herhut: Ah, ok, thanks for checking. In that case this is fine of course.

				LogicalResult matchAndRewrite(gpu::GPUModuleOp op,
				PatternRewriter &rewriter) const override {
	// GPU kernel modules are no longer necessary since we have a global			// GPU kernel modules are no longer necessary since we have a global
	// constant with the CUBIN, or HSACO data.			// constant with the CUBIN, or HSACO data.
	for (auto m :			rewriter.eraseOp(op);
	llvm::make_early_inc_range(getOperation().getOps<gpu::GPUModuleOp>()))			return success();
	m.erase();
	}			}

	private:
	LLVM::LLVMDialect *llvmDialect;
	LLVM::LLVMType llvmVoidType;
	LLVM::LLVMType llvmPointerType;
	LLVM::LLVMType llvmPointerPointerType;
	LLVM::LLVMType llvmInt8Type;
	LLVM::LLVMType llvmInt32Type;
	LLVM::LLVMType llvmInt64Type;
	LLVM::LLVMType llvmIntPtrType;
	};			};

	} // anonymous namespace			} // namespace

	// Adds declarations for the needed helper functions from the runtime wrappers.			void GpuLaunchFuncToGpuRuntimeCallsPass::runOnOperation() {
	// The types in comments give the actual types expected/returned but the API			LLVMTypeConverter converter(&getContext());
	// uses void pointers. This is fine as they have the same linkage in C.			OwningRewritePatternList patterns;
	void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions(			populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation);
	Location loc) {
	ModuleOp module = getOperation();			LLVMConversionTarget target(getContext());
	OpBuilder builder(module.getBody()->getTerminator());			if (failed(applyPartialConversion(getOperation(), target, patterns)))
	if (!module.lookupSymbol(kGpuModuleLoadName)) {			signalPassFailure();
	builder.create<LLVM::LLVMFuncOp>(			}
	loc, kGpuModuleLoadName,
	LLVM::LLVMType::getFunctionTy(getPointerType(),			LLVM::CallOp FunctionCallBuilder::create(Location loc, OpBuilder &builder,
	{getPointerType()}, /* void *cubin */			ArrayRef<Value> arguments) const {
	/*isVarArg=*/false));			auto module = builder.getBlock()->getParent()->getParentOfType<ModuleOp>();
	}			auto function = [&] {
	if (!module.lookupSymbol(kGpuModuleGetFunctionName)) {			if (auto function = module.lookupSymbol<LLVM::LLVMFuncOp>(functionName))
	// The helper uses void* instead of CUDA's opaque CUmodule and			return function;
	// CUfunction, or ROCm (HIP)'s opaque hipModule_t and hipFunction_t.			return OpBuilder(module.getBody()->getTerminator())
	builder.create<LLVM::LLVMFuncOp>(			.create<LLVM::LLVMFuncOp>(loc, functionName, functionType);
	loc, kGpuModuleGetFunctionName,			}();
	LLVM::LLVMType::getFunctionTy(getPointerType(),			return builder.create<LLVM::CallOp>(
	{			loc, const_cast<LLVM::LLVMType &>(functionType).getFunctionResultType(),
	getPointerType(), /* void *module */			builder.getSymbolRefAttr(function), arguments);
	getPointerType() /* char *name */
	},
	/*isVarArg=*/false));
	}
	if (!module.lookupSymbol(kGpuLaunchKernelName)) {
	// Other than the CUDA or ROCm (HIP) api, the wrappers use uintptr_t to
	// match the LLVM type if MLIR's index type, which the GPU dialect uses.
	// Furthermore, they use void* instead of CUDA's opaque CUfunction and
	// CUstream, or ROCm (HIP)'s opaque hipFunction_t and hipStream_t.
	builder.create<LLVM::LLVMFuncOp>(
	loc, kGpuLaunchKernelName,
	LLVM::LLVMType::getFunctionTy(
	getVoidType(),
	{
	getPointerType(), /* void* f */
	getIntPtrType(), /* intptr_t gridXDim */
	getIntPtrType(), /* intptr_t gridyDim */
	getIntPtrType(), /* intptr_t gridZDim */
	getIntPtrType(), /* intptr_t blockXDim */
	getIntPtrType(), /* intptr_t blockYDim */
	getIntPtrType(), /* intptr_t blockZDim */
	getInt32Type(), /* unsigned int sharedMemBytes */
	getPointerType(), /* void *hstream */
	getPointerPointerType(), /* void **kernelParams */
	getPointerPointerType() /* void **extra */
	},
	/*isVarArg=*/false));
	}
	if (!module.lookupSymbol(kGpuStreamCreateName)) {
	// Helper function to get the current GPU compute stream. Uses void*
	// instead of CUDA's opaque CUstream, or ROCm (HIP)'s opaque hipStream_t.
	builder.create<LLVM::LLVMFuncOp>(
	loc, kGpuStreamCreateName,
	LLVM::LLVMType::getFunctionTy(getPointerType(), /*isVarArg=*/false));
	}
	if (!module.lookupSymbol(kGpuStreamSynchronizeName)) {
	builder.create<LLVM::LLVMFuncOp>(
	loc, kGpuStreamSynchronizeName,
	LLVM::LLVMType::getFunctionTy(getVoidType(),
	{getPointerType()}, /* void *stream */
	/*isVarArg=*/false));
	}
	if (!module.lookupSymbol(kGpuMemHostRegisterName)) {
	builder.create<LLVM::LLVMFuncOp>(
	loc, kGpuMemHostRegisterName,
	LLVM::LLVMType::getFunctionTy(getVoidType(),
	{
	getPointerType(), /* void *ptr */
	getInt64Type() /* int64 sizeBytes*/
	},
	/*isVarArg=*/false));
	}
	}			}

	/// Emits the IR with the following structure:			/// Emits the IR with the following structure:
	///			///
	/// %data = llvm.alloca 1 x type-of(<param>)			/// %data = llvm.alloca 1 x type-of(<param>)
	/// llvm.store <param>, %data			/// llvm.store <param>, %data
	/// %typeErased = llvm.bitcast %data to !llvm<"i8*">			/// %typeErased = llvm.bitcast %data to !llvm<"i8*">
	/// %addr = llvm.getelementptr <list>[<pos>]			/// %addr = llvm.getelementptr <array>[<pos>]
	/// llvm.store %typeErased, %addr			/// llvm.store %typeErased, %addr
	///			///
	/// This is necessary to construct the list of arguments passed to the kernel			/// This is necessary to construct the array of arguments passed to the kernel
	/// function as accepted by cuLaunchKernel, i.e. as a void** that points to list			/// function as accepted by cuLaunchKernel, i.e. as a void** that points to
	/// of stack-allocated type-erased pointers to the actual arguments.			/// array of stack-allocated type-erased pointers to the actual arguments.
	void GpuLaunchFuncToGpuRuntimeCallsPass::addParamToList(OpBuilder &builder,			void ConvertLaunchFuncOpToGpuRuntimeCallPattern::addParamToArray(
	Location loc,			OpBuilder &builder, Location loc, Value param, Value array, unsigned pos,
	Value param, Value list,			Value one) const {
	unsigned pos,
	Value one) {
	auto memLocation = builder.create<LLVM::AllocaOp>(			auto memLocation = builder.create<LLVM::AllocaOp>(
	loc, param.getType().cast<LLVM::LLVMType>().getPointerTo(), one,			loc, param.getType().cast<LLVM::LLVMType>().getPointerTo(), one,
	/*alignment=*/1);			/*alignment=*/0);
				herhutUnsubmitted Not Done Reply Inline Actions This alignment seems odd. Not your doing but `0` would be canonical. herhut: This alignment seems odd. Not your doing but `0` would be canonical.
				csiggAuthorUnsubmitted Done Reply Inline Actions Changed to 0. csigg: Changed to 0.
	builder.create<LLVM::StoreOp>(loc, param, memLocation);			builder.create<LLVM::StoreOp>(loc, param, memLocation);
	auto casted =			auto casted =
	builder.create<LLVM::BitcastOp>(loc, getPointerType(), memLocation);			builder.create<LLVM::BitcastOp>(loc, llvmPointerType, memLocation);

	auto index = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),			auto index = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
	builder.getI32IntegerAttr(pos));			builder.getI32IntegerAttr(pos));
	auto gep = builder.create<LLVM::GEPOp>(loc, getPointerPointerType(), list,			auto gep = builder.create<LLVM::GEPOp>(loc, llvmPointerPointerType, array,
	ArrayRef<Value>{index});			index.getResult());
	builder.create<LLVM::StoreOp>(loc, casted, gep);			builder.create<LLVM::StoreOp>(loc, casted, gep);
	}			}

	// Generates a parameters array to be used with a CUDA / ROCm (HIP) kernel			// Generates a parameters array to be used with a CUDA / ROCm (HIP) kernel
	// launch call. The arguments are extracted from the launchOp.			// launch call. The arguments are extracted from the launchOp.
	// The generated code is essentially as follows:			// The generated code is essentially as follows:
	//			//
	// %array = alloca(numparams * sizeof(void *))			// %array = alloca(numparams * sizeof(void *))
	// for (i : [0, NumKernelOperands))			// for (i : [0, NumKernelOperands))
	// %array[i] = cast<void*>(KernelOperand[i])			// %array[i] = cast<void*>(KernelOperand[i])
	// return %array			// return %array
	Value GpuLaunchFuncToGpuRuntimeCallsPass::setupParamsArray(			Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray(
	gpu::LaunchFuncOp launchOp, OpBuilder &builder) {			gpu::LaunchFuncOp launchOp, unsigned numArguments,
				OpBuilder &builder) const {
	// Get the launch target.
	auto gpuFunc = SymbolTable::lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(
	launchOp, launchOp.kernel());
	if (!gpuFunc)
	return {};

	unsigned numArgs = gpuFunc.getNumArguments();

	auto numKernelOperands = launchOp.getNumKernelOperands();			auto numKernelOperands = launchOp.getNumKernelOperands();
	Location loc = launchOp.getLoc();			Location loc = launchOp.getLoc();
	auto one = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),			auto one = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type,
	builder.getI32IntegerAttr(1));			builder.getI32IntegerAttr(1));
	auto arraySize = builder.create<LLVM::ConstantOp>(			auto arraySize = builder.create<LLVM::ConstantOp>(
	loc, getInt32Type(), builder.getI32IntegerAttr(numArgs));			loc, llvmInt32Type, builder.getI32IntegerAttr(numArguments));
	auto array = builder.create<LLVM::AllocaOp>(loc, getPointerPointerType(),			auto array = builder.create<LLVM::AllocaOp>(loc, llvmPointerPointerType,
	arraySize, /*alignment=*/0);			arraySize, /*alignment=*/0);

	unsigned pos = 0;			unsigned pos = 0;
	for (unsigned idx = 0; idx < numKernelOperands; ++idx) {			for (unsigned idx = 0; idx < numKernelOperands; ++idx) {
	auto operand = launchOp.getKernelOperand(idx);			auto operand = launchOp.getKernelOperand(idx);
	auto llvmType = operand.getType().cast<LLVM::LLVMType>();			auto llvmType = operand.getType().cast<LLVM::LLVMType>();

	// Assume all struct arguments come from MemRef. If this assumption does not			// Assume all struct arguments come from MemRef. If this assumption does not
	// hold anymore then we need `launchOp` to lower from MemRefType and not after			// hold anymore then we need `launchOp` to lower from MemRefType and not after
	// LLVMConversion has taken place and the MemRef information is lost.			// LLVMConversion has taken place and the MemRef information is lost.
	if (!llvmType.isStructTy()) {			if (!llvmType.isStructTy()) {
	addParamToList(builder, loc, operand, array, pos++, one);			addParamToArray(builder, loc, operand, array, pos++, one);
	continue;			continue;
	}			}

	// Put individual components of a memref descriptor into the flat argument			// Put individual components of a memref descriptor into the flat argument
	// list. We cannot use unpackMemref from LLVM lowering here because we have			// list. We cannot use unpackMemref from LLVM lowering here because we have
	// no access to MemRefType that had been lowered away.			// no access to MemRefType that had been lowered away.
	for (int32_t j = 0, ej = llvmType.getStructNumElements(); j < ej; ++j) {			for (int32_t j = 0, ej = llvmType.getStructNumElements(); j < ej; ++j) {
	auto elemType = llvmType.getStructElementType(j);			auto elemType = llvmType.getStructElementType(j);
	if (elemType.isArrayTy()) {			if (elemType.isArrayTy()) {
	for (int32_t k = 0, ek = elemType.getArrayNumElements(); k < ek; ++k) {			for (int32_t k = 0, ek = elemType.getArrayNumElements(); k < ek; ++k) {
	Value elem = builder.create<LLVM::ExtractValueOp>(			Value elem = builder.create<LLVM::ExtractValueOp>(
	loc, elemType.getArrayElementType(), operand,			loc, elemType.getArrayElementType(), operand,
	builder.getI32ArrayAttr({j, k}));			builder.getI32ArrayAttr({j, k}));
	addParamToList(builder, loc, elem, array, pos++, one);			addParamToArray(builder, loc, elem, array, pos++, one);
	}			}
	} else {			} else {
	assert((elemType.isIntegerTy() || elemType.isFloatTy() ||			assert((elemType.isIntegerTy() || elemType.isFloatTy() ||
	elemType.isDoubleTy() || elemType.isPointerTy()) &&			elemType.isDoubleTy() || elemType.isPointerTy()) &&
	"expected scalar type");			"expected scalar type");
	Value strct = builder.create<LLVM::ExtractValueOp>(			Value strct = builder.create<LLVM::ExtractValueOp>(
	loc, elemType, operand, builder.getI32ArrayAttr(j));			loc, elemType, operand, builder.getI32ArrayAttr(j));
	addParamToList(builder, loc, strct, array, pos++, one);			addParamToArray(builder, loc, strct, array, pos++, one);
	}			}
	}			}
	}			}

	return array;			return array;
	}			}

	// Generates an LLVM IR dialect global that contains the name of the given			// Generates an LLVM IR dialect global that contains the name of the given
	// kernel function as a C string, and returns a pointer to its beginning.			// kernel function as a C string, and returns a pointer to its beginning.
	// The code is essentially:			// The code is essentially:
	//			//
	// llvm.global constant @kernel_name("function_name\00")			// llvm.global constant @kernel_name("function_name\00")
	// func(...) {			// func(...) {
	// %0 = llvm.addressof @kernel_name			// %0 = llvm.addressof @kernel_name
	// %1 = llvm.constant (0 : index)			// %1 = llvm.constant (0 : index)
	// %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">			// %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
	// }			// }
	Value GpuLaunchFuncToGpuRuntimeCallsPass::generateKernelNameConstant(			Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateKernelNameConstant(
	StringRef moduleName, StringRef name, Location loc, OpBuilder &builder) {			StringRef moduleName, StringRef name, Location loc,
				OpBuilder &builder) const {
	// Make sure the trailing zero is included in the constant.			// Make sure the trailing zero is included in the constant.
	std::vector<char> kernelName(name.begin(), name.end());			std::vector<char> kernelName(name.begin(), name.end());
	kernelName.push_back('\0');			kernelName.push_back('\0');

	std::string globalName =			std::string globalName =
	std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name));			std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name));
	return LLVM::createGlobalString(			return LLVM::createGlobalString(
	loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),			loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),
	LLVM::Linkage::Internal);			LLVM::Linkage::Internal);
	}			}

	// Emits LLVM IR to launch a kernel function. Expects the module that contains			// Emits LLVM IR to launch a kernel function. Expects the module that contains
	// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a			// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a
	// hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR.			// hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR.
	//			//
	// %0 = call %binarygetter			// %0 = call %binarygetter
	// %1 = call %moduleLoad(%0)			// %1 = call %moduleLoad(%0)
	// %2 = <see generateKernelNameConstant>			// %2 = <see generateKernelNameConstant>
	// %3 = call %moduleGetFunction(%1, %2)			// %3 = call %moduleGetFunction(%1, %2)
	// %4 = call %streamCreate()			// %4 = call %streamCreate()
	// %5 = <see setupParamsArray>			// %5 = <see generateParamsArray>
	// call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)			// call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
	// call %streamSynchronize(%4)			// call %streamSynchronize(%4)
	void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls(			LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
	mlir::gpu::LaunchFuncOp launchOp) {			Operation *op, ArrayRef<Value> operands,
	OpBuilder builder(launchOp);			ConversionPatternRewriter &rewriter) const {
	Location loc = launchOp.getLoc();			Location loc = op->getLoc();
				csiggAuthorUnsubmitted Done Reply Inline Actions Oops. I'm looking into this now. csigg: Oops. I'm looking into this now.
	declareGpuRuntimeFunctions(loc);			auto launchOp = cast<gpu::LaunchFuncOp>(op);
				herhutUnsubmitted Not Done Reply Inline Actions Please use the `rewriter` instead, herhut: Please use the `rewriter` instead,
				csiggAuthorUnsubmitted Done Reply Inline Actions Done, but I did not change the `builder` type in the other functions. Should I? csigg: Done, but I did not change the `builder` type in the other functions. Should I?
				herhutUnsubmitted Not Done Reply Inline Actions I cannot see that change. No need to change type as long as it is backed by the rewriter for the pattern. Otherwise the rollback would not work. herhut: I cannot see that change. No need to change type as long as it is backed by the rewriter for…
				csiggAuthorUnsubmitted Done Reply Inline Actions Yes, it's backed by the rewriter. You don't see the change because I haven't pushed it. It's always a bit of a dance and I wanted to wait for the other things to get resolved. csigg: Yes, it's backed by the rewriter. You don't see the change because I haven't pushed it. It's…
				auto moduleOp = op->getParentOfType<ModuleOp>();

	auto zero = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
	builder.getI32IntegerAttr(0));
	// Create an LLVM global with CUBIN extracted from the kernel annotation and			// Create an LLVM global with CUBIN extracted from the kernel annotation and
	// obtain a pointer to the first byte in it.			// obtain a pointer to the first byte in it.
	auto kernelModule = getOperation().lookupSymbol<gpu::GPUModuleOp>(			auto kernelModule =
	launchOp.getKernelModuleName());			moduleOp.lookupSymbol<gpu::GPUModuleOp>(launchOp.getKernelModuleName());
	assert(kernelModule && "expected a kernel module");			assert(kernelModule && "expected a kernel module");

	auto binaryAttr = kernelModule.getAttrOfType<StringAttr>(gpuBinaryAnnotation);			auto binaryAttr = kernelModule.getAttrOfType<StringAttr>(gpuBinaryAnnotation);
	if (!binaryAttr) {			if (!binaryAttr) {
	kernelModule.emitOpError()			kernelModule.emitOpError()
	<< "missing " << gpuBinaryAnnotation << " attribute";			<< "missing " << gpuBinaryAnnotation << " attribute";
	return signalPassFailure();			return failure();
	}			}

	SmallString<128> nameBuffer(kernelModule.getName());			SmallString<128> nameBuffer(kernelModule.getName());
	nameBuffer.append(kGpuBinaryStorageSuffix);			nameBuffer.append(kGpuBinaryStorageSuffix);
	Value data =			Value data =
	LLVM::createGlobalString(loc, builder, nameBuffer.str(),			LLVM::createGlobalString(loc, rewriter, nameBuffer.str(),
	binaryAttr.getValue(), LLVM::Linkage::Internal);			binaryAttr.getValue(), LLVM::Linkage::Internal);

	// Emit the load module call to load the module data. Error checking is done			auto module = moduleLoadCallBuilder.create(loc, rewriter, data);
	// in the called helper function.
	auto gpuModuleLoad =
	getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuModuleLoadName);
	auto module = builder.create<LLVM::CallOp>(
	loc, ArrayRef<Type>{getPointerType()},
	builder.getSymbolRefAttr(gpuModuleLoad), ArrayRef<Value>{data});
	// Get the function from the module. The name corresponds to the name of			// Get the function from the module. The name corresponds to the name of
	// the kernel function.			// the kernel function.
	auto kernelName = generateKernelNameConstant(			auto kernelName = generateKernelNameConstant(
	launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, builder);			launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, rewriter);
	auto gpuModuleGetFunction =			auto function = moduleGetFunctionCallBuilder.create(
	getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuModuleGetFunctionName);			loc, rewriter, {module.getResult(0), kernelName});
	auto function = builder.create<LLVM::CallOp>(
	loc, ArrayRef<Type>{getPointerType()},
	builder.getSymbolRefAttr(gpuModuleGetFunction),
	ArrayRef<Value>{module.getResult(0), kernelName});
	// Grab the global stream needed for execution.			// Grab the global stream needed for execution.
	auto gpuStreamCreate =			auto stream = streamCreateCallBuilder.create(loc, rewriter, {});
	getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuStreamCreateName);
	auto stream = builder.create<LLVM::CallOp>(			// Get the launch target.
	loc, ArrayRef<Type>{getPointerType()},			auto gpuFuncOp = SymbolTable::lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(
				herhutUnsubmitted Not Done Reply Inline Actions Would `launchOp.getNumKernelOperands()` work here, as well? It would avoid the search for the kernel. herhut: Would `launchOp.getNumKernelOperands()` work here, as well? It would avoid the search for the…
				csiggAuthorUnsubmitted Done Reply Inline Actions I don't think so because that would give you the non-flattened number of arguments, right? csigg: I don't think so because that would give you the non-flattened number of arguments, right?
				herhutUnsubmitted Done Reply Inline Actions Good point, thanks! herhut: Good point, thanks!
	builder.getSymbolRefAttr(gpuStreamCreate), ArrayRef<Value>{});			launchOp, launchOp.kernel());
				if (!gpuFuncOp) {
				launchOp.emitOpError() << "corresponding kernel function not found";
				return failure();
				}
				// Build array of kernel parameters.
				auto kernelParams =
				generateParamsArray(launchOp, gpuFuncOp.getNumArguments(), rewriter);

	// Invoke the function with required arguments.			// Invoke the function with required arguments.
	auto gpuLaunchKernel =			auto zero = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
	getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuLaunchKernelName);			rewriter.getI32IntegerAttr(0));
	auto paramsArray = setupParamsArray(launchOp, builder);
	if (!paramsArray) {
	launchOp.emitOpError() << "cannot pass given parameters to the kernel";
	return signalPassFailure();
	}
	auto nullpointer =			auto nullpointer =
	builder.create<LLVM::IntToPtrOp>(loc, getPointerPointerType(), zero);			rewriter.create<LLVM::IntToPtrOp>(loc, llvmPointerPointerType, zero);
	builder.create<LLVM::CallOp>(			launchKernelCallBuilder.create(
	loc, ArrayRef<Type>{getVoidType()},			loc, rewriter,
	builder.getSymbolRefAttr(gpuLaunchKernel),			{function.getResult(0), launchOp.gridSizeX(), launchOp.gridSizeY(),
	ArrayRef<Value>{function.getResult(0), launchOp.getOperand(0),			launchOp.gridSizeZ(), launchOp.blockSizeX(), launchOp.blockSizeY(),
	launchOp.getOperand(1), launchOp.getOperand(2),			launchOp.blockSizeZ(), zero, /* sharedMemBytes */
	launchOp.getOperand(3), launchOp.getOperand(4),
	launchOp.getOperand(5), zero, /* sharedMemBytes */
	stream.getResult(0), /* stream */			stream.getResult(0), /* stream */
	paramsArray, /* kernel params */			kernelParams, /* kernel params */
	nullpointer /* extra */});			nullpointer /* extra */});
	// Sync on the stream to make it synchronous.			streamSynchronizeCallBuilder.create(loc, rewriter, stream.getResult(0));
	auto gpuStreamSync =
	getOperation().lookupSymbol<LLVM::LLVMFuncOp>(kGpuStreamSynchronizeName);			rewriter.eraseOp(op);
	builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getVoidType()},			return success();
	builder.getSymbolRefAttr(gpuStreamSync),
	ArrayRef<Value>(stream.getResult(0)));
	launchOp.erase();
	}			}

	std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>			std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
	mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass(			mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass(
	StringRef gpuBinaryAnnotation) {			StringRef gpuBinaryAnnotation) {
	if (gpuBinaryAnnotation.empty())
	return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>();
	return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>(			return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>(
	gpuBinaryAnnotation);			gpuBinaryAnnotation);
	}			}

				void mlir::populateGpuToLLVMConversionPatterns(
				LLVMTypeConverter &converter, OwningRewritePatternList &patterns,
				StringRef gpuBinaryAnnotation) {
				patterns.insert<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
				converter, gpuBinaryAnnotation);
				patterns.insert<EraseGpuModuleOpPattern>(&converter.getContext());
				}

mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp

Show First 20 Lines • Show All 104 Lines • ▼ Show 20 Lines	OwnedBlob compilePtxToCubin(const std::string ptx, Location loc,

return result;		return result;
}		}

static LogicalResult runMLIRPasses(ModuleOp m) {		static LogicalResult runMLIRPasses(ModuleOp m) {
PassManager pm(m.getContext());		PassManager pm(m.getContext());
applyPassManagerCLOptions(pm);		applyPassManagerCLOptions(pm);

		const char gpuBinaryAnnotation[] = "nvvm.cubin";
pm.addPass(createGpuKernelOutliningPass());		pm.addPass(createGpuKernelOutliningPass());
auto &kernelPm = pm.nest<gpu::GPUModuleOp>();		auto &kernelPm = pm.nest<gpu::GPUModuleOp>();
kernelPm.addPass(createStripDebugInfoPass());		kernelPm.addPass(createStripDebugInfoPass());
kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass());		kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass());
kernelPm.addPass(createConvertGPUKernelToBlobPass(		kernelPm.addPass(createConvertGPUKernelToBlobPass(
translateModuleToNVVMIR, compilePtxToCubin, "nvptx64-nvidia-cuda",		translateModuleToNVVMIR, compilePtxToCubin, "nvptx64-nvidia-cuda",
"sm_35", "+ptx60", "nvvm.cubin"));		"sm_35", "+ptx60", gpuBinaryAnnotation));
pm.addPass(createLowerToLLVMPass());		pm.addPass(createLowerToLLVMPass());
pm.addPass(createConvertGpuLaunchFuncToGpuRuntimeCallsPass());		pm.addPass(
		createConvertGpuLaunchFuncToGpuRuntimeCallsPass(gpuBinaryAnnotation));

return pm.run(m);		return pm.run(m);
}		}

int main(int argc, char **argv) {		int main(int argc, char **argv) {
registerPassManagerCLOptions();		registerPassManagerCLOptions();
mlir::registerAllDialects();		mlir::registerAllDialects();
llvm::InitLLVM y(argc, argv);		llvm::InitLLVM y(argc, argv);
Show All 12 Lines

mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp

Show First 20 Lines • Show All 293 Lines • ▼ Show 20 Lines	static LogicalResult runMLIRPasses(ModuleOp m) {

// Configure target chip ISA version if it has not been specified.		// Configure target chip ISA version if it has not been specified.
if (!targetChip.size())		if (!targetChip.size())
configTargetChip();		configTargetChip();

// Configure target features per ROCm / HIP version.		// Configure target features per ROCm / HIP version.
configTargetFeatures();		configTargetFeatures();

		const char gpuBinaryAnnotation[] = "rocdl.hsaco";
pm.addPass(createGpuKernelOutliningPass());		pm.addPass(createGpuKernelOutliningPass());
auto &kernelPm = pm.nest<gpu::GPUModuleOp>();		auto &kernelPm = pm.nest<gpu::GPUModuleOp>();
kernelPm.addPass(createStripDebugInfoPass());		kernelPm.addPass(createStripDebugInfoPass());
kernelPm.addPass(createLowerGpuOpsToROCDLOpsPass());		kernelPm.addPass(createLowerGpuOpsToROCDLOpsPass());
kernelPm.addPass(createConvertGPUKernelToBlobPass(		kernelPm.addPass(createConvertGPUKernelToBlobPass(
compileModuleToROCDLIR, compileISAToHsaco, tripleName, targetChip,		compileModuleToROCDLIR, compileISAToHsaco, tripleName, targetChip,
features, /*gpuBinaryAnnotation=*/"rocdl.hsaco"));		features, gpuBinaryAnnotation));
pm.addPass(createLowerToLLVMPass());		pm.addPass(createLowerToLLVMPass());
pm.addPass(createConvertGpuLaunchFuncToGpuRuntimeCallsPass(		pm.addPass(
/*gpuBinaryAnnotation=*/"rocdl.hsaco"));		createConvertGpuLaunchFuncToGpuRuntimeCallsPass(gpuBinaryAnnotation));

return pm.run(m);		return pm.run(m);
}		}

int main(int argc, char **argv) {		int main(int argc, char **argv) {
registerPassManagerCLOptions();		registerPassManagerCLOptions();
mlir::registerAllDialects();		mlir::registerAllDialects();
llvm::InitLLVM y(argc, argv);		llvm::InitLLVM y(argc, argv);
Show All 13 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[MLIR] Change GpuLaunchFuncToGpuRuntimeCallsPass to wrap a RewritePattern with the same functionality.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 283547

mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h

mlir/lib/Conversion/GPUCommon/CMakeLists.txt

mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp

mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp

mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[MLIR] Change GpuLaunchFuncToGpuRuntimeCallsPass to wrap a RewritePattern with the same functionality.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 283547

mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h

mlir/lib/Conversion/GPUCommon/CMakeLists.txt

mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp

mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp

mlir/tools/mlir-rocm-runner/mlir-rocm-runner.cpp

[MLIR] Change GpuLaunchFuncToGpuRuntimeCallsPass to wrap a RewritePattern with the same functionality.
ClosedPublic