Diff 264738

mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h

This file was added.

				//===- GPUCommonPass.h - MLIR GPU runtime support ---------------*- C++ -*-===//
				//
				herhutUnsubmitted Done Reply Inline Actions Please fix comment. herhut: Please fix comment.
				// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
				// See https://llvm.org/LICENSE.txt for license information.
				// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
				//
				//===----------------------------------------------------------------------===//
				#ifndef MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_
				#define MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_

				#include "mlir/Support/LLVM.h"
				#include <functional>
				#include <memory>
				rriddleUnsubmitted Done Reply Inline Actions Where is functional used? rriddle: Where is functional used?
				#include <string>
				#include <vector>
				rriddleUnsubmitted Not Done Reply Inline Actions Why are these necessary? rriddle: Why are these necessary?
				whchungAuthorUnsubmitted Done Reply Inline Actions These headers were introduced back in commit c72c6c390710 , but they don't really seem necessary now. I can modify D80142 and remove them. whchung: These headers were introduced back in commit c72c6c390710 , but they don't really seem…

				namespace mlir {

				// Forward declarations so this header does not need the full definitions.
				// NOTE(review): `Location` is not referenced by anything declared below in
				// this header -- confirm whether its forward declaration is still needed.
				class Location;
				class ModuleOp;

				// Pass base class template; T is the operation type the pass runs on.
				template <typename T>
				class OperationPass;

				/// Creates a pass to convert a gpu.launch_func operation into a sequence of
				/// GPU runtime calls.
				///
				/// This pass does not generate code to call GPU runtime APIs directly but
				/// instead uses a small wrapper library that exports a stable and conveniently
				/// typed ABI on top of GPU runtimes such as CUDA or ROCm (HIP).
				///
				/// The returned pass operates on a ModuleOp; ownership is transferred to the
				/// caller through the returned std::unique_ptr.
				std::unique_ptr<OperationPass<ModuleOp>>
				createConvertGpuLaunchFuncToGpuRuntimeCallsPass();

				} // namespace mlir

				#endif // MLIR_CONVERSION_GPUCOMMON_GPUCOMMONPASS_H_

mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h

	Show All 39 Lines
	/// module with help of the nvptx backend to PTX and then invokes the provided			/// module with help of the nvptx backend to PTX and then invokes the provided
	/// cubinGenerator to produce a binary blob (the cubin). Such blob is then			/// cubinGenerator to produce a binary blob (the cubin). Such blob is then
	/// attached as a string attribute named 'nvvm.cubin' to the kernel function.			/// attached as a string attribute named 'nvvm.cubin' to the kernel function.
	/// After the transformation, the body of the kernel function is removed (i.e.,			/// After the transformation, the body of the kernel function is removed (i.e.,
	/// it is turned into a declaration).			/// it is turned into a declaration).
	std::unique_ptr<OperationPass<gpu::GPUModuleOp>>			std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
	createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator);			createConvertGPUKernelToCubinPass(CubinGenerator cubinGenerator);

	/// Creates a pass to convert a gpu.launch_func operation into a sequence of
	/// CUDA calls.
	///
	/// This pass does not generate code to call CUDA directly but instead uses a
	/// small wrapper library that exports a stable and conveniently typed ABI
	/// on top of CUDA.
	std::unique_ptr<OperationPass<ModuleOp>>
	createConvertGpuLaunchFuncToCudaCallsPass();

	} // namespace mlir			} // namespace mlir

	#endif // MLIR_CONVERSION_GPUTOCUDA_GPUTOCUDAPASS_H_			#endif // MLIR_CONVERSION_GPUTOCUDA_GPUTOCUDAPASS_H_

mlir/include/mlir/Conversion/Passes.td

	Show First 20 Lines • Show All 73 Lines • ▼ Show 20 Lines

	def ConvertAVX512ToLLVM : Pass<"convert-avx512-to-llvm", "ModuleOp"> {			def ConvertAVX512ToLLVM : Pass<"convert-avx512-to-llvm", "ModuleOp"> {
	let summary = "Convert the operations from the avx512 dialect into the LLVM "			let summary = "Convert the operations from the avx512 dialect into the LLVM "
	"dialect";			"dialect";
	let constructor = "mlir::createConvertAVX512ToLLVMPass()";			let constructor = "mlir::createConvertAVX512ToLLVMPass()";
	}			}

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// GPUToCUDA			// GPUCommon
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	def ConvertGpuLaunchFuncToCudaCalls : Pass<"launch-func-to-cuda", "ModuleOp"> {			def ConvertGpuLaunchFuncToGpuRuntimeCalls : Pass<"launch-func-to-gpu-runtime",
	let summary = "Convert all launch_func ops to CUDA runtime calls";			"ModuleOp"> {
	let constructor = "mlir::createConvertGpuLaunchFuncToCudaCallsPass()";			let summary = "Convert all launch_func ops to GPU runtime calls";
				let constructor = "mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass()";
				let options = [
				Option<"gpuModuleLoadName", "gpu-module-load-name", "std::string",
				"\"mcuModuleLoad\"",
				"GPU runtime API to load the module given the binary data">,
				Option<"gpuModuleGetFunctionName", "gpu-module-get-function-name",
				"std::string",
				"\"mcuModuleGetFunction\"",
				"GPU runtime API to get a handle to the actual kernel function">,
				Option<"gpuLaunchKernelName", "gpu-launch-kernel-name", "std::string",
				"\"mcuLaunchKernel\"",
				"GPU runtime API to launch the kernel on a stream">,
				Option<"gpuGetStreamHelperName", "gpu-get-stream-helper-name",
				"std::string",
				"\"mcuGetStreamHelper\"",
				"GPU runtime API to initialize a new compute stream on GPU">,
				Option<"gpuStreamSynchronizeName", "gpu-stream-synchronize-name",
				"std::string",
				"\"mcuStreamSynchronize\"",
				"GPU runtime API to wait for operations on the stream to finish">,
				Option<"gpuMemHostRegisterName", "gpu-mem-host-register-name",
				"std::string",
				"\"mcuMemHostRegister\"",
				"GPU runtime API to bind host memory buffer to be visible on GPU">,
				Option<"gpuBinaryAnnotation", "gpu-binary-annotation", "std::string",
				"\"nvvm.cubin\"",
				"Annotation attribute string for GPU binary">,
				Option<"gpuBinaryStorageSuffix", "gpu-binary-storage-suffix", "std::string",
				"\"_cubin_cst\"",
				"GPU binary Storage Suffix">,
				];
	}			}

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// GPUToNVVM			// GPUToNVVM
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	def ConvertGpuOpsToNVVMOps : Pass<"convert-gpu-to-nvvm", "gpu::GPUModuleOp"> {			def ConvertGpuOpsToNVVMOps : Pass<"convert-gpu-to-nvvm", "gpu::GPUModuleOp"> {
	let summary = "Generate NVVM operations for gpu operations";			let summary = "Generate NVVM operations for gpu operations";
	▲ Show 20 Lines • Show All 193 Lines • Show Last 20 Lines

mlir/include/mlir/InitAllPasses.h

	Show All 9 Lines
	// passes to the system.			// passes to the system.
	//			//
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	#ifndef MLIR_INITALLPASSES_H_			#ifndef MLIR_INITALLPASSES_H_
	#define MLIR_INITALLPASSES_H_			#define MLIR_INITALLPASSES_H_

	#include "mlir/Conversion/AVX512ToLLVM/ConvertAVX512ToLLVM.h"			#include "mlir/Conversion/AVX512ToLLVM/ConvertAVX512ToLLVM.h"
				#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
	#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"			#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
	#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"			#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
	#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"			#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
	#include "mlir/Conversion/GPUToROCm/GPUToROCmPass.h"			#include "mlir/Conversion/GPUToROCm/GPUToROCmPass.h"
	#include "mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.h"			#include "mlir/Conversion/GPUToSPIRV/ConvertGPUToSPIRVPass.h"
	#include "mlir/Conversion/GPUToVulkan/ConvertGPUToVulkanPass.h"			#include "mlir/Conversion/GPUToVulkan/ConvertGPUToVulkanPass.h"
	#include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h"			#include "mlir/Conversion/LinalgToLLVM/LinalgToLLVM.h"
	#include "mlir/Conversion/LinalgToSPIRV/LinalgToSPIRVPass.h"			#include "mlir/Conversion/LinalgToSPIRV/LinalgToSPIRVPass.h"
	▲ Show 20 Lines • Show All 81 Lines • Show Last 20 Lines

mlir/lib/Conversion/CMakeLists.txt

	add_subdirectory(AffineToStandard)			add_subdirectory(AffineToStandard)
	add_subdirectory(AVX512ToLLVM)			add_subdirectory(AVX512ToLLVM)
				add_subdirectory(GPUCommon)
	add_subdirectory(GPUToCUDA)			add_subdirectory(GPUToCUDA)
	add_subdirectory(GPUToNVVM)			add_subdirectory(GPUToNVVM)
	add_subdirectory(GPUToROCDL)			add_subdirectory(GPUToROCDL)
	add_subdirectory(GPUToROCm)			add_subdirectory(GPUToROCm)
	add_subdirectory(GPUToSPIRV)			add_subdirectory(GPUToSPIRV)
	add_subdirectory(GPUToVulkan)			add_subdirectory(GPUToVulkan)
	add_subdirectory(LinalgToLLVM)			add_subdirectory(LinalgToLLVM)
	add_subdirectory(LinalgToSPIRV)			add_subdirectory(LinalgToSPIRV)
	add_subdirectory(MIOpenToGPU)			add_subdirectory(MIOpenToGPU)
	add_subdirectory(LinalgToStandard)			add_subdirectory(LinalgToStandard)
	add_subdirectory(SCFToGPU)			add_subdirectory(SCFToGPU)
	add_subdirectory(SCFToStandard)			add_subdirectory(SCFToStandard)
	add_subdirectory(StandardToLLVM)			add_subdirectory(StandardToLLVM)
	add_subdirectory(StandardToSPIRV)			add_subdirectory(StandardToSPIRV)
	add_subdirectory(VectorToLLVM)			add_subdirectory(VectorToLLVM)
	add_subdirectory(VectorToSCF)			add_subdirectory(VectorToSCF)

mlir/lib/Conversion/GPUCommon/CMakeLists.txt

This file was added.

				set(SOURCES
				ConvertLaunchFuncToRuntimeCalls.cpp
				)

				add_mlir_conversion_library(MLIRGPUtoGPURuntimeTransforms
				${SOURCES}

				DEPENDS
				MLIRConversionPassIncGen
				intrinsics_gen

				LINK_COMPONENTS
				Core

				LINK_LIBS PUBLIC
				MLIRGPU
				MLIRIR
				MLIRLLVMIR
				#MLIRNVVMIR
				MLIRPass
				herhutUnsubmitted Done Reply Inline Actions Please remove commented-out lines. herhut: Please remove commented-out lines.
				MLIRSupport
				#MLIRTargetNVVMIR
				)

mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp

This file was moved from mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp.

//===- ConvertLaunchFuncToCudaCalls.cpp - MLIR CUDA lowering passes -------===//		//===- ConvertLaunchFuncToGpuRuntimeCalls.cpp - MLIR GPU lowering passes --===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This file implements a pass to convert gpu.launch_func op into a sequence of		// This file implements a pass to convert gpu.launch_func op into a sequence of
// CUDA runtime calls. As the CUDA runtime does not have a stable published ABI,		// GPU runtime calls. As most GPU runtimes do not have a stable published
// this pass uses a slim runtime layer that builds on top of the public API from		// ABI, this pass uses a slim runtime layer that builds on top of the public
// the CUDA headers.		// API from GPU runtime headers.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"		#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"

#include "../PassDetail.h"		#include "../PassDetail.h"
#include "mlir/Dialect/GPU/GPUDialect.h"		#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"		#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Attributes.h"		#include "mlir/IR/Attributes.h"
#include "mlir/IR/Builders.h"		#include "mlir/IR/Builders.h"
#include "mlir/IR/Function.h"		#include "mlir/IR/Function.h"
#include "mlir/IR/Module.h"		#include "mlir/IR/Module.h"
#include "mlir/IR/StandardTypes.h"		#include "mlir/IR/StandardTypes.h"

#include "llvm/ADT/STLExtras.h"		#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/DataLayout.h"		#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"		#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Module.h"		#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"		#include "llvm/IR/Type.h"
#include "llvm/Support/Error.h"		#include "llvm/Support/Error.h"
#include "llvm/Support/FormatVariadic.h"		#include "llvm/Support/FormatVariadic.h"

using namespace mlir;		using namespace mlir;

// To avoid name mangling, these are defined in the mini-runtime file.
static constexpr const char *cuModuleLoadName = "mcuModuleLoad";
static constexpr const char *cuModuleGetFunctionName = "mcuModuleGetFunction";
static constexpr const char *cuLaunchKernelName = "mcuLaunchKernel";
static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper";
static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize";
static constexpr const char *kMcuMemHostRegister = "mcuMemHostRegister";

static constexpr const char *kCubinAnnotation = "nvvm.cubin";
static constexpr const char *kCubinStorageSuffix = "_cubin_cst";

namespace {		namespace {

/// A pass to convert gpu.launch_func operations into a sequence of CUDA		/// A pass to convert gpu.launch_func operations into a sequence of GPU
/// runtime calls.		/// runtime calls. Currently it supports CUDA and ROCm (HIP).
///		///
/// In essence, a gpu.launch_func operations gets compiled into the following		/// In essence, a gpu.launch_func operation gets compiled into the following
/// sequence of runtime calls:		/// sequence of runtime calls:
///		///
/// * mcuModuleLoad -- loads the module given the cubin data		/// * moduleLoad -- loads the module given the cubin / hsaco data
/// * mcuModuleGetFunction -- gets a handle to the actual kernel function		/// * moduleGetFunction -- gets a handle to the actual kernel function
/// * mcuGetStreamHelper -- initializes a new CUDA stream		/// * getStreamHelper -- initializes a new compute stream on GPU
/// * mcuLaunchKernelName -- launches the kernel on a stream		/// * launchKernel -- launches the kernel on a stream
/// * mcuStreamSynchronize -- waits for operations on the stream to finish		/// * streamSynchronize -- waits for operations on the stream to finish
///		///
/// Intermediate data structures are allocated on the stack.		/// Intermediate data structures are allocated on the stack.
class GpuLaunchFuncToCudaCallsPass		class GpuLaunchFuncToGpuRuntimeCallsPass
: public ConvertGpuLaunchFuncToCudaCallsBase<GpuLaunchFuncToCudaCallsPass> {		: public ConvertGpuLaunchFuncToGpuRuntimeCallsBase<
		GpuLaunchFuncToGpuRuntimeCallsPass> {
private:		private:
LLVM::LLVMDialect *getLLVMDialect() { return llvmDialect; }		LLVM::LLVMDialect *getLLVMDialect() { return llvmDialect; }

llvm::LLVMContext &getLLVMContext() {		llvm::LLVMContext &getLLVMContext() {
return getLLVMDialect()->getLLVMContext();		return getLLVMDialect()->getLLVMContext();
}		}

void initializeCachedTypes() {		void initializeCachedTypes() {
Show All 21 Lines	private:
LLVM::LLVMType getInt64Type() { return llvmInt64Type; }		LLVM::LLVMType getInt64Type() { return llvmInt64Type; }

LLVM::LLVMType getIntPtrType() {		LLVM::LLVMType getIntPtrType() {
const llvm::Module &module = getLLVMDialect()->getLLVMModule();		const llvm::Module &module = getLLVMDialect()->getLLVMModule();
return LLVM::LLVMType::getIntNTy(		return LLVM::LLVMType::getIntNTy(
getLLVMDialect(), module.getDataLayout().getPointerSizeInBits());		getLLVMDialect(), module.getDataLayout().getPointerSizeInBits());
}		}

LLVM::LLVMType getCUResultType() {		LLVM::LLVMType getGpuRuntimeResultType() {
// This is declared as an enum in CUDA but helpers use i32.		// This is declared as an enum in both CUDA and ROCm (HIP), but helpers use i32.
		herhutUnsubmitted Done Reply Inline Actions There is a `CUDA` left-over here. herhut: There is a `CUDA` left-over here.
return getInt32Type();		return getInt32Type();
}		}

// Allocate a void pointer on the stack.		// Allocate a void pointer on the stack.
Value allocatePointer(OpBuilder &builder, Location loc) {		Value allocatePointer(OpBuilder &builder, Location loc) {
auto one = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),		auto one = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
builder.getI32IntegerAttr(1));		builder.getI32IntegerAttr(1));
return builder.create<LLVM::AllocaOp>(loc, getPointerPointerType(), one,		return builder.create<LLVM::AllocaOp>(loc, getPointerPointerType(), one,
/*alignment=*/0);		/*alignment=*/0);
}		}

void declareCudaFunctions(Location loc);		void declareGpuRuntimeFunctions(Location loc);
void addParamToList(OpBuilder &builder, Location loc, Value param, Value list,		void addParamToList(OpBuilder &builder, Location loc, Value param, Value list,
unsigned pos, Value one);		unsigned pos, Value one);
Value setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);		Value setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);
Value generateKernelNameConstant(StringRef moduleName, StringRef name,		Value generateKernelNameConstant(StringRef moduleName, StringRef name,
Location loc, OpBuilder &builder);		Location loc, OpBuilder &builder);
void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);		void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);

public:		public:
// Run the dialect converter on the module.		// Run the dialect converter on the module.
void runOnOperation() override {		void runOnOperation() override {
// Cache the LLVMDialect for the current module.		// Cache the LLVMDialect for the current module.
llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>();		llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>();
// Cache the used LLVM types.		// Cache the used LLVM types.
initializeCachedTypes();		initializeCachedTypes();

getOperation().walk(		getOperation().walk(
[this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); });		[this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); });

// GPU kernel modules are no longer necessary since we have a global		// GPU kernel modules are no longer necessary since we have a global
// constant with the CUBIN data.		// constant with the CUBIN, or HSACO data.
for (auto m :		for (auto m :
llvm::make_early_inc_range(getOperation().getOps<gpu::GPUModuleOp>()))		llvm::make_early_inc_range(getOperation().getOps<gpu::GPUModuleOp>()))
m.erase();		m.erase();
}		}

private:		private:
LLVM::LLVMDialect *llvmDialect;		LLVM::LLVMDialect *llvmDialect;
LLVM::LLVMType llvmVoidType;		LLVM::LLVMType llvmVoidType;
LLVM::LLVMType llvmPointerType;		LLVM::LLVMType llvmPointerType;
LLVM::LLVMType llvmPointerPointerType;		LLVM::LLVMType llvmPointerPointerType;
LLVM::LLVMType llvmInt8Type;		LLVM::LLVMType llvmInt8Type;
LLVM::LLVMType llvmInt32Type;		LLVM::LLVMType llvmInt32Type;
LLVM::LLVMType llvmInt64Type;		LLVM::LLVMType llvmInt64Type;
LLVM::LLVMType llvmIntPtrType;		LLVM::LLVMType llvmIntPtrType;
};		};

} // anonymous namespace		} // anonymous namespace

// Adds declarations for the needed helper functions from the CUDA wrapper.		// Adds declarations for the needed helper functions from the runtime wrappers.
// The types in comments give the actual types expected/returned but the API		// The types in comments give the actual types expected/returned but the API
// uses void pointers. This is fine as they have the same linkage in C.		// uses void pointers. This is fine as they have the same linkage in C.
void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) {		void GpuLaunchFuncToGpuRuntimeCallsPass::declareGpuRuntimeFunctions(
		Location loc) {
ModuleOp module = getOperation();		ModuleOp module = getOperation();
OpBuilder builder(module.getBody()->getTerminator());		OpBuilder builder(module.getBody()->getTerminator());
if (!module.lookupSymbol(cuModuleLoadName)) {		if (!module.lookupSymbol(gpuModuleLoadName)) {
builder.create<LLVM::LLVMFuncOp>(		builder.create<LLVM::LLVMFuncOp>(
loc, cuModuleLoadName,		loc, gpuModuleLoadName,
LLVM::LLVMType::getFunctionTy(		LLVM::LLVMType::getFunctionTy(
getCUResultType(),		getGpuRuntimeResultType(),
{		{
getPointerPointerType(), /* CUmodule *module */		getPointerPointerType(), /* CUmodule *module */
getPointerType() /* void *cubin */		getPointerType() /* void *cubin */
},		},
/*isVarArg=*/false));		/*isVarArg=*/false));
}		}
if (!module.lookupSymbol(cuModuleGetFunctionName)) {		if (!module.lookupSymbol(gpuModuleGetFunctionName)) {
// The helper uses void* instead of CUDA's opaque CUmodule and		// The helper uses void* instead of CUDA's opaque CUmodule and
// CUfunction.		// CUfunction.
builder.create<LLVM::LLVMFuncOp>(		builder.create<LLVM::LLVMFuncOp>(
loc, cuModuleGetFunctionName,		loc, gpuModuleGetFunctionName,
LLVM::LLVMType::getFunctionTy(		LLVM::LLVMType::getFunctionTy(
getCUResultType(),		getGpuRuntimeResultType(),
{		{
getPointerPointerType(), /* void **function */		getPointerPointerType(), /* void **function */
getPointerType(), /* void *module */		getPointerType(), /* void *module */
getPointerType() /* char *name */		getPointerType() /* char *name */
},		},
/*isVarArg=*/false));		/*isVarArg=*/false));
}		}
if (!module.lookupSymbol(cuLaunchKernelName)) {		if (!module.lookupSymbol(gpuLaunchKernelName)) {
// Other than the CUDA api, the wrappers use uintptr_t to match the		// Other than the CUDA or ROCm (HIP) api, the wrappers use uintptr_t to match the
		herhutUnsubmitted Done Reply Inline Actions Maybe remove the CUDA reference here, too. herhut: Maybe remove the CUDA reference here, too.
// LLVM type if MLIR's index type, which the GPU dialect uses.		// LLVM type of MLIR's index type, which the GPU dialect uses.
// Furthermore, they use void* instead of CUDA's opaque CUfunction and		// Furthermore, they use void* instead of CUDA's opaque CUfunction and
// CUstream.		// CUstream.
builder.create<LLVM::LLVMFuncOp>(		builder.create<LLVM::LLVMFuncOp>(
loc, cuLaunchKernelName,		loc, gpuLaunchKernelName,
LLVM::LLVMType::getFunctionTy(		LLVM::LLVMType::getFunctionTy(
getCUResultType(),		getGpuRuntimeResultType(),
{		{
getPointerType(), /* void* f */		getPointerType(), /* void* f */
getIntPtrType(), /* intptr_t gridXDim */		getIntPtrType(), /* intptr_t gridXDim */
getIntPtrType(), /* intptr_t gridyDim */		getIntPtrType(), /* intptr_t gridyDim */
getIntPtrType(), /* intptr_t gridZDim */		getIntPtrType(), /* intptr_t gridZDim */
getIntPtrType(), /* intptr_t blockXDim */		getIntPtrType(), /* intptr_t blockXDim */
getIntPtrType(), /* intptr_t blockYDim */		getIntPtrType(), /* intptr_t blockYDim */
getIntPtrType(), /* intptr_t blockZDim */		getIntPtrType(), /* intptr_t blockZDim */
getInt32Type(), /* unsigned int sharedMemBytes */		getInt32Type(), /* unsigned int sharedMemBytes */
getPointerType(), /* void *hstream */		getPointerType(), /* void *hstream */
getPointerPointerType(), /* void **kernelParams */		getPointerPointerType(), /* void **kernelParams */
getPointerPointerType() /* void **extra */		getPointerPointerType() /* void **extra */
},		},
/*isVarArg=*/false));		/*isVarArg=*/false));
}		}
if (!module.lookupSymbol(cuGetStreamHelperName)) {		if (!module.lookupSymbol(gpuGetStreamHelperName)) {
// Helper function to get the current CUDA stream. Uses void* instead of		// Helper function to get the current CUDA stream. Uses void* instead of
// CUDAs opaque CUstream.		// CUDA's opaque CUstream.
builder.create<LLVM::LLVMFuncOp>(		builder.create<LLVM::LLVMFuncOp>(
loc, cuGetStreamHelperName,		loc, gpuGetStreamHelperName,
LLVM::LLVMType::getFunctionTy(getPointerType(), /*isVarArg=*/false));		LLVM::LLVMType::getFunctionTy(getPointerType(), /*isVarArg=*/false));
}		}
if (!module.lookupSymbol(cuStreamSynchronizeName)) {		if (!module.lookupSymbol(gpuStreamSynchronizeName)) {
builder.create<LLVM::LLVMFuncOp>(		builder.create<LLVM::LLVMFuncOp>(
loc, cuStreamSynchronizeName,		loc, gpuStreamSynchronizeName,
LLVM::LLVMType::getFunctionTy(getCUResultType(),		LLVM::LLVMType::getFunctionTy(getGpuRuntimeResultType(),
getPointerType() /* CUstream stream */,		getPointerType() /* CUstream stream */,
/*isVarArg=*/false));		/*isVarArg=*/false));
}		}
if (!module.lookupSymbol(kMcuMemHostRegister)) {		if (!module.lookupSymbol(gpuMemHostRegisterName)) {
builder.create<LLVM::LLVMFuncOp>(		builder.create<LLVM::LLVMFuncOp>(
loc, kMcuMemHostRegister,		loc, gpuMemHostRegisterName,
LLVM::LLVMType::getFunctionTy(getVoidType(),		LLVM::LLVMType::getFunctionTy(getVoidType(),
{		{
getPointerType(), /* void *ptr */		getPointerType(), /* void *ptr */
getInt64Type() /* int64 sizeBytes*/		getInt64Type() /* int64 sizeBytes*/
},		},
/*isVarArg=*/false));		/*isVarArg=*/false));
}		}
}		}

/// Emits the IR with the following structure:		/// Emits the IR with the following structure:
///		///
/// %data = llvm.alloca 1 x type-of(<param>)		/// %data = llvm.alloca 1 x type-of(<param>)
/// llvm.store <param>, %data		/// llvm.store <param>, %data
/// %typeErased = llvm.bitcast %data to !llvm<"i8*">		/// %typeErased = llvm.bitcast %data to !llvm<"i8*">
/// %addr = llvm.getelementptr <list>[<pos>]		/// %addr = llvm.getelementptr <list>[<pos>]
/// llvm.store %typeErased, %addr		/// llvm.store %typeErased, %addr
///		///
/// This is necessary to construct the list of arguments passed to the kernel		/// This is necessary to construct the list of arguments passed to the kernel
/// function as accepted by cuLaunchKernel, i.e. as a void** that points to list		/// function as accepted by cuLaunchKernel, i.e. as a void** that points to list
/// of stack-allocated type-erased pointers to the actual arguments.		/// of stack-allocated type-erased pointers to the actual arguments.
void GpuLaunchFuncToCudaCallsPass::addParamToList(OpBuilder &builder,		void GpuLaunchFuncToGpuRuntimeCallsPass::addParamToList(OpBuilder &builder,
Location loc, Value param,		Location loc,
Value list, unsigned pos,		Value param, Value list,
		unsigned pos,
Value one) {		Value one) {
auto memLocation = builder.create<LLVM::AllocaOp>(		auto memLocation = builder.create<LLVM::AllocaOp>(
loc, param.getType().cast<LLVM::LLVMType>().getPointerTo(), one,		loc, param.getType().cast<LLVM::LLVMType>().getPointerTo(), one,
/*alignment=*/1);		/*alignment=*/1);
builder.create<LLVM::StoreOp>(loc, param, memLocation);		builder.create<LLVM::StoreOp>(loc, param, memLocation);
auto casted =		auto casted =
builder.create<LLVM::BitcastOp>(loc, getPointerType(), memLocation);		builder.create<LLVM::BitcastOp>(loc, getPointerType(), memLocation);

auto index = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),		auto index = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
builder.getI32IntegerAttr(pos));		builder.getI32IntegerAttr(pos));
auto gep = builder.create<LLVM::GEPOp>(loc, getPointerPointerType(), list,		auto gep = builder.create<LLVM::GEPOp>(loc, getPointerPointerType(), list,
ArrayRef<Value>{index});		ArrayRef<Value>{index});
builder.create<LLVM::StoreOp>(loc, casted, gep);		builder.create<LLVM::StoreOp>(loc, casted, gep);
}		}

// Generates a parameters array to be used with a CUDA kernel launch call. The		// Generates a parameters array to be used with a CUDA / ROCm (HIP) kernel
// arguments are extracted from the launchOp.		// launch call. The arguments are extracted from the launchOp.
// The generated code is essentially as follows:		// The generated code is essentially as follows:
//		//
// %array = alloca(numparams * sizeof(void *))		// %array = alloca(numparams * sizeof(void *))
// for (i : [0, NumKernelOperands))		// for (i : [0, NumKernelOperands))
// %array[i] = cast<void*>(KernelOperand[i])		// %array[i] = cast<void*>(KernelOperand[i])
// return %array		// return %array
Value GpuLaunchFuncToCudaCallsPass::setupParamsArray(gpu::LaunchFuncOp launchOp,		Value GpuLaunchFuncToGpuRuntimeCallsPass::setupParamsArray(
OpBuilder &builder) {		gpu::LaunchFuncOp launchOp, OpBuilder &builder) {

// Get the launch target.		// Get the launch target.
auto gpuFunc = SymbolTable::lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(		auto gpuFunc = SymbolTable::lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(
launchOp, launchOp.kernel());		launchOp, launchOp.kernel());
if (!gpuFunc)		if (!gpuFunc)
return {};		return {};

unsigned numArgs = gpuFunc.getNumArguments();		unsigned numArgs = gpuFunc.getNumArguments();
▲ Show 20 Lines • Show All 51 Lines • ▼ Show 20 Lines
// The code is essentially:		// The code is essentially:
//		//
// llvm.global constant @kernel_name("function_name\00")		// llvm.global constant @kernel_name("function_name\00")
// func(...) {		// func(...) {
// %0 = llvm.addressof @kernel_name		// %0 = llvm.addressof @kernel_name
// %1 = llvm.constant (0 : index)		// %1 = llvm.constant (0 : index)
// %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">		// %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
// }		// }
Value GpuLaunchFuncToCudaCallsPass::generateKernelNameConstant(		Value GpuLaunchFuncToGpuRuntimeCallsPass::generateKernelNameConstant(
StringRef moduleName, StringRef name, Location loc, OpBuilder &builder) {		StringRef moduleName, StringRef name, Location loc, OpBuilder &builder) {
// Make sure the trailing zero is included in the constant.		// Make sure the trailing zero is included in the constant.
std::vector<char> kernelName(name.begin(), name.end());		std::vector<char> kernelName(name.begin(), name.end());
kernelName.push_back('\0');		kernelName.push_back('\0');

std::string globalName =		std::string globalName =
std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name));		std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name));
return LLVM::createGlobalString(		return LLVM::createGlobalString(
loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),		loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),
LLVM::Linkage::Internal, llvmDialect);		LLVM::Linkage::Internal, llvmDialect);
}		}

// Emits LLVM IR to launch a kernel function. Expects the module that contains		// Emits LLVM IR to launch a kernel function. Expects the module that contains
// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute of the		// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a
// kernel function in the IR.		// hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR.
// While MLIR has no global constants, also expects a cubin getter function in		// While MLIR has no global constants, also expects a cubin getter function in
// an 'nvvm.cubingetter' attribute. Such function is expected to return a		// an 'nvvm.cubingetter' attribute, or a hsaco getter function in an
// pointer to the cubin blob when invoked.		// 'rocdl.hsacogetter' attribute. Such function is expected to return a pointer
		// to the binary blob when invoked.
// With these given, the generated code in essence is		// With these given, the generated code in essence is
//		//
// %0 = call %cubingetter		// %0 = call %binarygetter
// %1 = alloca sizeof(void*)		// %1 = alloca sizeof(void*)
// call %mcuModuleLoad(%2, %1)		// call %moduleLoad(%2, %1)
// %2 = alloca sizeof(void*)		// %2 = alloca sizeof(void*)
// %3 = load %1		// %3 = load %1
// %4 = <see generateKernelNameConstant>		// %4 = <see generateKernelNameConstant>
// call %mcuModuleGetFunction(%2, %3, %4)		// call %moduleGetFunction(%2, %3, %4)
// %5 = call %mcuGetStreamHelper()		// %5 = call %getStreamHelper()
// %6 = load %2		// %6 = load %2
// %7 = <see setupParamsArray>		// %7 = <see setupParamsArray>
// call %mcuLaunchKernel(%6, <launchOp operands 0..5>, 0, %5, %7, nullptr)		// call %launchKernel(%6, <launchOp operands 0..5>, 0, %5, %7, nullptr)
// call %mcuStreamSynchronize(%5)		// call %streamSynchronize(%5)
void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(		void GpuLaunchFuncToGpuRuntimeCallsPass::translateGpuLaunchCalls(
mlir::gpu::LaunchFuncOp launchOp) {		mlir::gpu::LaunchFuncOp launchOp) {
OpBuilder builder(launchOp);		OpBuilder builder(launchOp);
Location loc = launchOp.getLoc();		Location loc = launchOp.getLoc();
declareCudaFunctions(loc);		declareGpuRuntimeFunctions(loc);

auto zero = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),		auto zero = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
builder.getI32IntegerAttr(0));		builder.getI32IntegerAttr(0));
// Create an LLVM global with CUBIN extracted from the kernel annotation and		// Create an LLVM global with CUBIN extracted from the kernel annotation and
// obtain a pointer to the first byte in it.		// obtain a pointer to the first byte in it.
auto kernelModule = getOperation().lookupSymbol<gpu::GPUModuleOp>(		auto kernelModule = getOperation().lookupSymbol<gpu::GPUModuleOp>(
launchOp.getKernelModuleName());		launchOp.getKernelModuleName());
assert(kernelModule && "expected a kernel module");		assert(kernelModule && "expected a kernel module");

auto cubinAttr = kernelModule.getAttrOfType<StringAttr>(kCubinAnnotation);		auto binaryAttr = kernelModule.getAttrOfType<StringAttr>(gpuBinaryAnnotation);
if (!cubinAttr) {		if (!binaryAttr) {
kernelModule.emitOpError()		kernelModule.emitOpError()
<< "missing " << kCubinAnnotation << " attribute";		<< "missing " << gpuBinaryAnnotation << " attribute";
return signalPassFailure();		return signalPassFailure();
}		}

SmallString<128> nameBuffer(kernelModule.getName());		SmallString<128> nameBuffer(kernelModule.getName());
nameBuffer.append(kCubinStorageSuffix);		nameBuffer.append(gpuBinaryStorageSuffix);
Value data = LLVM::createGlobalString(		Value data = LLVM::createGlobalString(
loc, builder, nameBuffer.str(), cubinAttr.getValue(),		loc, builder, nameBuffer.str(), binaryAttr.getValue(),
LLVM::Linkage::Internal, getLLVMDialect());		LLVM::Linkage::Internal, getLLVMDialect());

// Emit the load module call to load the module data. Error checking is done		// Emit the load module call to load the module data. Error checking is done
// in the called helper function.		// in the called helper function.
auto cuModule = allocatePointer(builder, loc);		auto gpuModule = allocatePointer(builder, loc);
auto cuModuleLoad =		auto gpuModuleLoad =
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuModuleLoadName);		getOperation().lookupSymbol<LLVM::LLVMFuncOp>(gpuModuleLoadName);
builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getCUResultType()},		builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getGpuRuntimeResultType()},
builder.getSymbolRefAttr(cuModuleLoad),		builder.getSymbolRefAttr(gpuModuleLoad),
ArrayRef<Value>{cuModule, data});		ArrayRef<Value>{gpuModule, data});
// Get the function from the module. The name corresponds to the name of		// Get the function from the module. The name corresponds to the name of
// the kernel function.		// the kernel function.
auto cuOwningModuleRef =		auto gpuOwningModuleRef =
builder.create<LLVM::LoadOp>(loc, getPointerType(), cuModule);		builder.create<LLVM::LoadOp>(loc, getPointerType(), gpuModule);
auto kernelName = generateKernelNameConstant(		auto kernelName = generateKernelNameConstant(
launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, builder);		launchOp.getKernelModuleName(), launchOp.getKernelName(), loc, builder);
auto cuFunction = allocatePointer(builder, loc);		auto gpuFunction = allocatePointer(builder, loc);
auto cuModuleGetFunction =		auto gpuModuleGetFunction =
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuModuleGetFunctionName);		getOperation().lookupSymbol<LLVM::LLVMFuncOp>(gpuModuleGetFunctionName);
builder.create<LLVM::CallOp>(		builder.create<LLVM::CallOp>(
loc, ArrayRef<Type>{getCUResultType()},		loc, ArrayRef<Type>{getGpuRuntimeResultType()},
builder.getSymbolRefAttr(cuModuleGetFunction),		builder.getSymbolRefAttr(gpuModuleGetFunction),
ArrayRef<Value>{cuFunction, cuOwningModuleRef, kernelName});		ArrayRef<Value>{gpuFunction, gpuOwningModuleRef, kernelName});
// Grab the global stream needed for execution.		// Grab the global stream needed for execution.
auto cuGetStreamHelper =		auto gpuGetStreamHelper =
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuGetStreamHelperName);		getOperation().lookupSymbol<LLVM::LLVMFuncOp>(gpuGetStreamHelperName);
auto cuStream = builder.create<LLVM::CallOp>(		auto gpuStream = builder.create<LLVM::CallOp>(
loc, ArrayRef<Type>{getPointerType()},		loc, ArrayRef<Type>{getPointerType()},
builder.getSymbolRefAttr(cuGetStreamHelper), ArrayRef<Value>{});		builder.getSymbolRefAttr(gpuGetStreamHelper), ArrayRef<Value>{});
// Invoke the function with required arguments.		// Invoke the function with required arguments.
auto cuLaunchKernel =		auto gpuLaunchKernel =
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuLaunchKernelName);		getOperation().lookupSymbol<LLVM::LLVMFuncOp>(gpuLaunchKernelName);
auto cuFunctionRef =		auto gpuFunctionRef =
builder.create<LLVM::LoadOp>(loc, getPointerType(), cuFunction);		builder.create<LLVM::LoadOp>(loc, getPointerType(), gpuFunction);
auto paramsArray = setupParamsArray(launchOp, builder);		auto paramsArray = setupParamsArray(launchOp, builder);
if (!paramsArray) {		if (!paramsArray) {
launchOp.emitOpError() << "cannot pass given parameters to the kernel";		launchOp.emitOpError() << "cannot pass given parameters to the kernel";
return signalPassFailure();		return signalPassFailure();
}		}
auto nullpointer =		auto nullpointer =
builder.create<LLVM::IntToPtrOp>(loc, getPointerPointerType(), zero);		builder.create<LLVM::IntToPtrOp>(loc, getPointerPointerType(), zero);
builder.create<LLVM::CallOp>(		builder.create<LLVM::CallOp>(
loc, ArrayRef<Type>{getCUResultType()},		loc, ArrayRef<Type>{getGpuRuntimeResultType()},
builder.getSymbolRefAttr(cuLaunchKernel),		builder.getSymbolRefAttr(gpuLaunchKernel),
ArrayRef<Value>{cuFunctionRef, launchOp.getOperand(0),		ArrayRef<Value>{gpuFunctionRef, launchOp.getOperand(0),
launchOp.getOperand(1), launchOp.getOperand(2),		launchOp.getOperand(1), launchOp.getOperand(2),
launchOp.getOperand(3), launchOp.getOperand(4),		launchOp.getOperand(3), launchOp.getOperand(4),
launchOp.getOperand(5), zero, /* sharedMemBytes */		launchOp.getOperand(5), zero, /* sharedMemBytes */
cuStream.getResult(0), /* stream */		gpuStream.getResult(0), /* stream */
paramsArray, /* kernel params */		paramsArray, /* kernel params */
nullpointer /* extra */});		nullpointer /* extra */});
// Sync on the stream to make it synchronous.		// Sync on the stream to make it synchronous.
auto cuStreamSync =		auto gpuStreamSync =
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuStreamSynchronizeName);		getOperation().lookupSymbol<LLVM::LLVMFuncOp>(gpuStreamSynchronizeName);
builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getCUResultType()},		builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getGpuRuntimeResultType()},
builder.getSymbolRefAttr(cuStreamSync),		builder.getSymbolRefAttr(gpuStreamSync),
ArrayRef<Value>(cuStream.getResult(0)));		ArrayRef<Value>(gpuStream.getResult(0)));
launchOp.erase();		launchOp.erase();
}		}

std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>		std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
mlir::createConvertGpuLaunchFuncToCudaCallsPass() {		mlir::createConvertGpuLaunchFuncToGpuRuntimeCallsPass() {
return std::make_unique<GpuLaunchFuncToCudaCallsPass>();		return std::make_unique<GpuLaunchFuncToGpuRuntimeCallsPass>();
}		}

mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt

	set(LLVM_OPTIONAL_SOURCES			set(LLVM_OPTIONAL_SOURCES
	ConvertKernelFuncToCubin.cpp			ConvertKernelFuncToCubin.cpp
	)			)

	set(SOURCES
	nicolasvasilacheUnsubmitted Done Reply Inline Actions This seems to break the build, SOURCES can be empty when I try to build it. nicolasvasilache: This seems to break the build, SOURCES can be empty when I try to build it.
	whchungAuthorUnsubmitted Done Reply Inline Actions @nicolasvasilache in your setup, MLIR_CUDA_CONVERSIONS_ENABLED is OFF, correct? Let me see how to address this. whchung: @nicolasvasilache in your setup, MLIR_CUDA_CONVERSIONS_ENABLED is OFF, correct? Let me see how…
	ConvertLaunchFuncToCudaCalls.cpp
	)

	if (MLIR_CUDA_CONVERSIONS_ENABLED)			if (MLIR_CUDA_CONVERSIONS_ENABLED)
	list(APPEND SOURCES "ConvertKernelFuncToCubin.cpp")			list(APPEND SOURCES "ConvertKernelFuncToCubin.cpp")
	set(NVPTX_LIBS			set(NVPTX_LIBS
	MC			MC
	NVPTXCodeGen			NVPTXCodeGen
	NVPTXDesc			NVPTXDesc
	NVPTXInfo			NVPTXInfo
	)			)
	Show All 23 Lines

mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp

This file was moved to mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp.

mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir

This file was moved from mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir.

	// RUN: mlir-opt -allow-unregistered-dialect %s --launch-func-to-cuda \| FileCheck %s			// RUN: mlir-opt -allow-unregistered-dialect %s --launch-func-to-gpu-runtime \| FileCheck %s
				herhutUnsubmitted Done Reply Inline Actions Move this file to GPUCommon directory. herhut: Move this file to GPUCommon directory.
				whchungAuthorUnsubmitted Done Reply Inline Actions this file is in GPUCommon now. whchung: this file is in GPUCommon now.

	module attributes {gpu.container_module} {			module attributes {gpu.container_module} {

	// CHECK: llvm.mlir.global internal constant @[[kernel_name:.*]]("kernel\00")			// CHECK: llvm.mlir.global internal constant @[[kernel_name:.*]]("kernel\00")
	// CHECK: llvm.mlir.global internal constant @[[global:.*]]("CUBIN")			// CHECK: llvm.mlir.global internal constant @[[global:.*]]("CUBIN")

	gpu.module @kernel_module attributes {nvvm.cubin = "CUBIN"} {			gpu.module @kernel_module attributes {nvvm.cubin = "CUBIN"} {
	llvm.func @kernel(%arg0: !llvm.float, %arg1: !llvm<"float*">) attributes {gpu.kernel} {			llvm.func @kernel(%arg0: !llvm.float, %arg1: !llvm<"float*">) attributes {gpu.kernel} {
	Show All 27 Lines

mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir

This file was moved to mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir.

mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp

//===- mlir-cuda-runner.cpp - MLIR CUDA Execution Driver-------------------===//		//===- mlir-cuda-runner.cpp - MLIR CUDA Execution Driver-------------------===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This is a command line utility that executes an MLIR file on the GPU by		// This is a command line utility that executes an MLIR file on the GPU by
// translating MLIR to NVVM/LLVM IR before JIT-compiling and executing the		// translating MLIR to NVVM/LLVM IR before JIT-compiling and executing the
// latter.		// latter.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "llvm/ADT/STLExtras.h"		#include "llvm/ADT/STLExtras.h"

		#include "mlir/Conversion/GPUCommon/GPUCommonPass.h"
#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"		#include "mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h"
#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"		#include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"		#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"		#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
#include "mlir/Dialect/GPU/GPUDialect.h"		#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"		#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"		#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"		#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
▲ Show 20 Lines • Show All 85 Lines • ▼ Show 20 Lines	static LogicalResult runMLIRPasses(ModuleOp m) {
applyPassManagerCLOptions(pm);		applyPassManagerCLOptions(pm);

pm.addPass(createGpuKernelOutliningPass());		pm.addPass(createGpuKernelOutliningPass());
auto &kernelPm = pm.nest<gpu::GPUModuleOp>();		auto &kernelPm = pm.nest<gpu::GPUModuleOp>();
kernelPm.addPass(createStripDebugInfoPass());		kernelPm.addPass(createStripDebugInfoPass());
kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass());		kernelPm.addPass(createLowerGpuOpsToNVVMOpsPass());
kernelPm.addPass(createConvertGPUKernelToCubinPass(&compilePtxToCubin));		kernelPm.addPass(createConvertGPUKernelToCubinPass(&compilePtxToCubin));
pm.addPass(createLowerToLLVMPass());		pm.addPass(createLowerToLLVMPass());
pm.addPass(createConvertGpuLaunchFuncToCudaCallsPass());		pm.addPass(createConvertGpuLaunchFuncToGpuRuntimeCallsPass());

return pm.run(m);		return pm.run(m);
}		}

int main(int argc, char **argv) {		int main(int argc, char **argv) {
registerPassManagerCLOptions();		registerPassManagerCLOptions();
mlir::registerAllDialects();		mlir::registerAllDialects();
llvm::InitLLVM y(argc, argv);		llvm::InitLLVM y(argc, argv);
llvm::InitializeNativeTarget();		llvm::InitializeNativeTarget();
llvm::InitializeNativeTargetAsmPrinter();		llvm::InitializeNativeTargetAsmPrinter();
mlir::initializeLLVMPasses();		mlir::initializeLLVMPasses();
return mlir::JitRunnerMain(argc, argv, &runMLIRPasses);		return mlir::JitRunnerMain(argc, argv, &runMLIRPasses);
}		}

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][gpu] Refactor ConvertGpuLaunchFuncToCudaCalls pass.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 264738

mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h

mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h

mlir/include/mlir/Conversion/Passes.td

mlir/include/mlir/InitAllPasses.h

mlir/lib/Conversion/CMakeLists.txt

mlir/lib/Conversion/GPUCommon/CMakeLists.txt

mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp

mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt

mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp

mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir

mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir

mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][gpu] Refactor ConvertGpuLaunchFuncToCudaCalls pass.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 264738

mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h

mlir/include/mlir/Conversion/GPUToCUDA/GPUToCUDAPass.h

mlir/include/mlir/Conversion/Passes.td

mlir/include/mlir/InitAllPasses.h

mlir/lib/Conversion/CMakeLists.txt

mlir/lib/Conversion/GPUCommon/CMakeLists.txt

mlir/lib/Conversion/GPUCommon/ConvertLaunchFuncToRuntimeCalls.cpp

mlir/lib/Conversion/GPUToCUDA/CMakeLists.txt

mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp

mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir

mlir/test/Conversion/GPUToCUDA/lower-launch-func-to-cuda.mlir

mlir/tools/mlir-cuda-runner/mlir-cuda-runner.cpp

[mlir][gpu] Refactor ConvertGpuLaunchFuncToCudaCalls pass.
ClosedPublic