This is an archive of the discontinued LLVM Phabricator instance.

[MLIR][Prototype] async gpu ops
AbandonedPublic

Authored by csigg on Apr 22 2020, 7:53 AM.

Download Raw Diff

Details

Reviewers

herhut
rriddle
ftynse

Summary

For demonstration/discussion purposes only, not meant to be reviewed/submitted.

This is the code for the 'early prototype' presented in the GPU host-side dialect discussion at MLIR's design meetings (slides and recording:
https://drive.google.com/corp/drive/folders/1-93qa9Esu2m0_xoZrB_5x3CjkIUqL_DD)

Instead of a new 'async' op with a region, the prototype adds a variadic list of 'gpu.chain' inputs and one 'gpu.chain' output to individual async ops.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

csigg created this revision. · Apr 22 2020, 7:53 AM

Herald added a reviewer: herhut. · View Herald Transcript · Apr 22 2020, 7:53 AM

Herald added a reviewer: rriddle. · View Herald Transcript

Herald added a project: Restricted Project. · View Herald Transcript

Herald added subscribers: llvm-commits, frgossen, grosul1 and 12 others. · View Herald Transcript

csigg retitled this revision from "For demonstration/discussion purposes only, not meant to be reviewed/submitted." to "[MLIR][Prototype] async gpu ops". · Apr 22 2020, 7:55 AM

csigg edited the summary of this revision. (Show Details)

csigg removed reviewers: herhut, rriddle.

Herald added a reviewer: herhut. · View Herald Transcript · Apr 22 2020, 7:55 AM

Herald added a reviewer: rriddle. · View Herald Transcript

Harbormaster failed remote builds in B54249: Diff 259290! · Apr 22 2020, 8:38 AM

csigg abandoned this revision. · Dec 3 2020, 1:45 AM

Herald added a reviewer: ftynse. · View Herald Transcript · Dec 3 2020, 1:45 AM

Herald added a project: Restricted Project. · View Herald Transcript

Herald added subscribers: teijeong, rdzhabarov, tatianashp and 5 others. · View Herald Transcript

Revision Contents

Path

Size

mlir/

include/

mlir/

Conversion/

StandardToLLVM/

ConvertStandardToLLVM.h

2 lines

Dialect/

GPU/

GPUDialect.h

20 lines

GPUOps.td

71 lines

IR/

DialectSymbolRegistry.def

1 line

lib/

Conversion/

GPUToCUDA/

ConvertKernelFuncToCubin.cpp

6 lines

ConvertLaunchFuncToCudaCalls.cpp

232 lines

StandardToLLVM/

StandardToLLVM.cpp

16 lines

Dialect/

GPU/

IR/

GPUDialect.cpp

133 lines

Transforms/

KernelOutlining.cpp

8 lines

Target/

LLVMIR/

ModuleTranslation.cpp

2 lines

test/

Dialect/

GPU/

ops.mlir

14 lines

mlir-cuda-runner/

async.mlir

50 lines

simple.mlir

27 lines

tools/

mlir-cuda-runner/

cuda-runtime-wrappers.cpp

68 lines

Diff 259290

mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h

Show First 20 Lines • Show All 386 Lines • ▼ Show 20 Lines	public:
/// Gets the MLIR type wrapping the LLVM integer type whose bit width is		/// Gets the MLIR type wrapping the LLVM integer type whose bit width is
/// defined by the used type converter.		/// defined by the used type converter.
LLVM::LLVMType getIndexType() const;		LLVM::LLVMType getIndexType() const;

/// Gets the MLIR type wrapping the LLVM void type.		/// Gets the MLIR type wrapping the LLVM void type.
LLVM::LLVMType getVoidType() const;		LLVM::LLVMType getVoidType() const;

/// Get the MLIR type wrapping the LLVM i8* type.		/// Get the MLIR type wrapping the LLVM i8* type.
LLVM::LLVMType getVoidPtrType() const;		LLVM::LLVMType getVoidPtrType(unsigned addrspace = 0) const;

/// Create an LLVM dialect operation defining the given index constant.		/// Create an LLVM dialect operation defining the given index constant.
Value createIndexConstant(ConversionPatternRewriter &builder, Location loc,		Value createIndexConstant(ConversionPatternRewriter &builder, Location loc,
uint64_t value) const;		uint64_t value) const;

// Given subscript indices and array sizes in row-major order,		// Given subscript indices and array sizes in row-major order,
// i_n, i_{n-1}, ..., i_1		// i_n, i_{n-1}, ..., i_1
// s_n, s_{n-1}, ..., s_1		// s_n, s_{n-1}, ..., s_1
▲ Show 20 Lines • Show All 108 Lines • Show Last 20 Lines

mlir/include/mlir/Dialect/GPU/GPUDialect.h

	Show All 12 Lines

	#ifndef MLIR_DIALECT_GPU_GPUDIALECT_H			#ifndef MLIR_DIALECT_GPU_GPUDIALECT_H
	#define MLIR_DIALECT_GPU_GPUDIALECT_H			#define MLIR_DIALECT_GPU_GPUDIALECT_H

	#include "mlir/IR/Dialect.h"			#include "mlir/IR/Dialect.h"
	#include "mlir/IR/FunctionSupport.h"			#include "mlir/IR/FunctionSupport.h"
	#include "mlir/IR/OpDefinition.h"			#include "mlir/IR/OpDefinition.h"
	#include "mlir/IR/OpImplementation.h"			#include "mlir/IR/OpImplementation.h"
				#include "mlir/IR/StandardTypes.h"
	#include "mlir/IR/SymbolTable.h"			#include "mlir/IR/SymbolTable.h"
				#include "mlir/IR/Types.h"
	#include "mlir/Interfaces/SideEffects.h"			#include "mlir/Interfaces/SideEffects.h"

	namespace mlir {			namespace mlir {
	class FuncOp;			class FuncOp;

	namespace gpu {			namespace gpu {

	/// Utility class for the GPU dialect to represent triples of `Value`s			/// Utility class for the GPU dialect to represent triples of `Value`s
	/// accessible through `.x`, `.y`, and `.z` similarly to CUDA notation.			/// accessible through `.x`, `.y`, and `.z` similarly to CUDA notation.
	struct KernelDim3 {			struct KernelDim3 {
	Value x;			Value x;
	Value y;			Value y;
	Value z;			Value z;
	};			};

				enum GpuTypes {
				Chain = Type::FIRST_GPU_TYPE,
				LAST_USED_GPU_TYPE = Chain,
				};

				class ChainType : public Type::TypeBase<ChainType, Type> {
				public:
				// Used for generic hooks in TypeBase.
				using Base::Base;
				/// Construction hook.
				static ChainType get(MLIRContext *context) {
				/// Custom, uniq'ed construction in the MLIRContext.
				return Base::get(context, GpuTypes::Chain);
				}
				/// Used to implement llvm-style cast.
				static bool kindof(unsigned kind) { return kind == GpuTypes::Chain; }
				};

	#include "mlir/Dialect/GPU/GPUOpsDialect.h.inc"			#include "mlir/Dialect/GPU/GPUOpsDialect.h.inc"

	#define GET_OP_CLASSES			#define GET_OP_CLASSES
	#include "mlir/Dialect/GPU/GPUOps.h.inc"			#include "mlir/Dialect/GPU/GPUOps.h.inc"

	} // end namespace gpu			} // end namespace gpu
	} // end namespace mlir			} // end namespace mlir

	#endif // MLIR_DIALECT_GPU_GPUDIALECT_H			#endif // MLIR_DIALECT_GPU_GPUDIALECT_H

mlir/include/mlir/Dialect/GPU/GPUOps.td

Show All 17 Lines
include "mlir/Interfaces/SideEffects.td"		include "mlir/Interfaces/SideEffects.td"

// Type constraint accepting standard integers, indices and wrapped LLVM integer		// Type constraint accepting standard integers, indices and wrapped LLVM integer
// types.		// types.
def IntLikeOrLLVMInt : TypeConstraint<		def IntLikeOrLLVMInt : TypeConstraint<
Or<[AnySignlessInteger.predicate, Index.predicate, LLVMInt.predicate]>,		Or<[AnySignlessInteger.predicate, Index.predicate, LLVMInt.predicate]>,
"integer, index or LLVM dialect equivalent">;		"integer, index or LLVM dialect equivalent">;

		def GPU_ChainType : DialectType<
		GPU_Dialect, CPred<"$_self.isa<::mlir::gpu::ChainType>()">, "chain">,
		BuildableType<"mlir::gpu::ChainType::get($_builder.getContext())">;

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// GPU Dialect operations.		// GPU Dialect operations.
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

class GPU_Op<string mnemonic, list<OpTrait> traits = []> :		class GPU_Op<string mnemonic, list<OpTrait> traits = []> :
Op<GPU_Dialect, mnemonic, traits>;		Op<GPU_Dialect, mnemonic, traits>;

class GPU_IndexOp<string mnemonic, list<OpTrait> traits = []> :		class GPU_IndexOp<string mnemonic, list<OpTrait> traits = []> :
▲ Show 20 Lines • Show All 206 Lines • ▼ Show 20 Lines	let extraClassDeclaration = [{
LogicalResult verifyBody();		LogicalResult verifyBody();
}];		}];

// let verifier = [{ return ::verifFuncOpy(*this); }];		// let verifier = [{ return ::verifFuncOpy(*this); }];
let printer = [{ printGPUFuncOp(p, *this); }];		let printer = [{ printGPUFuncOp(p, *this); }];
let parser = [{ return parseGPUFuncOp(parser, result); }];		let parser = [{ return parseGPUFuncOp(parser, result); }];
}		}

def GPU_LaunchFuncOp : GPU_Op<"launch_func">,		def GPU_LaunchFuncOp : GPU_Op<"launch_func", [AttrSizedOperandSegments]>,
Arguments<(ins IntLikeOrLLVMInt:$gridSizeX, IntLikeOrLLVMInt:$gridSizeY,		Arguments<(ins Variadic<GPU_ChainType>:$chains, IntLikeOrLLVMInt:$gridSizeX, IntLikeOrLLVMInt:$gridSizeY,
IntLikeOrLLVMInt:$gridSizeZ, IntLikeOrLLVMInt:$blockSizeX,		IntLikeOrLLVMInt:$gridSizeZ, IntLikeOrLLVMInt:$blockSizeX,
IntLikeOrLLVMInt:$blockSizeY, IntLikeOrLLVMInt:$blockSizeZ,		IntLikeOrLLVMInt:$blockSizeY, IntLikeOrLLVMInt:$blockSizeZ,
Variadic<AnyType>:$operands)>,		Variadic<AnyType>:$operands)>,
Results<(outs)> {		Results<(outs GPU_ChainType)> {
let summary = "Launches a function as a GPU kerneel";		let summary = "Launches a function as a GPU kernel";

let description = [{		let description = [{
Launch a kernel function on the specified grid of thread blocks.		Launch a kernel function on the specified grid of thread blocks.
`gpu.launch` operations are lowered to `gpu.launch_func` operations by		`gpu.launch` operations are lowered to `gpu.launch_func` operations by
outlining the kernel body into a function in a dedicated module, which		outlining the kernel body into a function in a dedicated module, which
reflects the separate compilation process. The kernel function is required		reflects the separate compilation process. The kernel function is required
to have the `gpu.kernel` attribute. The module containing the kernel		to have the `gpu.kernel` attribute. The module containing the kernel
function is required to have the `gpu.kernel_module` attribute and must be		function is required to have the `gpu.kernel_module` attribute and must be
▲ Show 20 Lines • Show All 55 Lines • ▼ Show 20 Lines	module attributes {gpu.container_module} {
-> ()		-> ()
}		}
```		```
}];		}];

let skipDefaultBuilders = 1;		let skipDefaultBuilders = 1;

let builders = [		let builders = [
OpBuilder<"Builder *builder, OperationState &result, GPUFuncOp kernelFunc, "		OpBuilder<"Builder *builder, OperationState &result, ValueRange chains, GPUFuncOp kernelFunc, "
"Value gridSizeX, Value gridSizeY, Value gridSizeZ, "		"Value gridSizeX, Value gridSizeY, Value gridSizeZ, "
"Value blockSizeX, Value blockSizeY, Value blockSizeZ, "		"Value blockSizeX, Value blockSizeY, Value blockSizeZ, "
"ValueRange kernelOperands">,		"ValueRange kernelOperands">,
OpBuilder<"Builder *builder, OperationState &result, GPUFuncOp kernelFunc, "		OpBuilder<"Builder *builder, OperationState &result, ValueRange chains, GPUFuncOp kernelFunc, "
"KernelDim3 gridSize, KernelDim3 blockSize, "		"KernelDim3 gridSize, KernelDim3 blockSize, "
"ValueRange kernelOperands">		"ValueRange kernelOperands">
];		];

let extraClassDeclaration = [{		let extraClassDeclaration = [{
/// The kernel function specified by the operation's `kernel` attribute.		/// The kernel function specified by the operation's `kernel` attribute.
StringRef kernel();		StringRef kernel();

/// The number of operands passed to the kernel function.		/// The number of operands passed to the kernel function.
unsigned getNumKernelOperands();		unsigned getNumKernelOperands();

/// The name of the kernel module specified by the operation's		/// The name of the kernel module specified by the operation's
/// `kernel_module` attribute.		/// `kernel_module` attribute.
StringRef getKernelModuleName();		StringRef getKernelModuleName();

/// The i-th operand passed to the kernel function.		/// The i-th operand passed to the kernel function.
Value getKernelOperand(unsigned i);		Value getKernelOperand(unsigned i);

/// Get the SSA values passed as operands to specify the grid size.		/// Get the SSA values passed as operands to specify the grid size.
KernelDim3 getGridSizeOperandValues();		KernelDim3 getGridSizeOperandValues();

/// Get the SSA values passed as operands to specify the block size.		/// Get the SSA values passed as operands to specify the block size.
KernelDim3 getBlockSizeOperandValues();		KernelDim3 getBlockSizeOperandValues();

		OperandRange getChains();

/// The number of launch configuration operands, placed at the leading		/// The number of launch configuration operands, placed at the leading
/// positions of the operand list.		/// positions of the operand list.
static constexpr unsigned kNumConfigOperands = 6;		static constexpr unsigned kNumConfigOperands = 6;

// This needs to quietly verify if attributes with names defined below are		// This needs to quietly verify if attributes with names defined below are
// present since it is run before the verifier of this op.		// present since it is run before the verifier of this op.
friend LogicalResult GPUDialect::verifyOperationAttribute(Operation *,		friend LogicalResult GPUDialect::verifyOperationAttribute(Operation *,
NamedAttribute);		NamedAttribute);

/// The name of the symbolRef attribute specifying the kernel to launch.		/// The name of the symbolRef attribute specifying the kernel to launch.
static StringRef getKernelAttrName() { return "kernel"; }		static StringRef getKernelAttrName() { return "kernel"; }

/// The name of the symbolRef attribute specifying the name of the module		/// The name of the symbolRef attribute specifying the name of the module
/// containing the kernel to launch.		/// containing the kernel to launch.
static StringRef getKernelModuleAttrName() { return "kernel_module"; }		static StringRef getKernelModuleAttrName() { return "kernel_module"; }
}];		}];

let verifier = [{ return ::verify(*this); }];		let verifier = [{ return ::verify(*this); }];
}		}

def GPU_LaunchOp : GPU_Op<"launch">,		def GPU_LaunchOp : GPU_Op<"launch">,
Arguments<(ins Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,		Arguments<(ins Variadic<GPU_ChainType>:$chains,
		Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ)>,		Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ)>,
Results<(outs)> {		// Variadic (0 or 1) to be backwards compatible.
		Results<(outs Variadic<GPU_ChainType>)> {
let summary = "GPU kernel launch operation";		let summary = "GPU kernel launch operation";

let description = [{		let description = [{
Launch a kernel on the specified grid of thread blocks. The body of the		Launch a kernel on the specified grid of thread blocks. The body of the
kernel is defined by the single region that this operation contains. The		kernel is defined by the single region that this operation contains. The
operation takes six operands, with first three operands being grid sizes		operation takes six operands, with first three operands being grid sizes
along x,y,z dimensions and the following three arguments being block sizes		along x,y,z dimensions and the following three arguments being block sizes
along x,y,z dimension. When a lower-dimensional kernel is required,		along x,y,z dimension. When a lower-dimensional kernel is required,
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines	def GPU_LaunchOp : GPU_Op<"launch">,

let regions = (region AnyRegion:$body);		let regions = (region AnyRegion:$body);

let skipDefaultBuilders = 1;		let skipDefaultBuilders = 1;

let builders = [		let builders = [
OpBuilder<"Builder *builder, OperationState &result, Value gridSizeX,"		OpBuilder<"Builder *builder, OperationState &result, Value gridSizeX,"
"Value gridSizeY, Value gridSizeZ, Value blockSizeX,"		"Value gridSizeY, Value gridSizeZ, Value blockSizeX,"
"Value blockSizeY, Value blockSizeZ">		"Value blockSizeY, Value blockSizeZ">,
		OpBuilder<"Builder *builder, OperationState &result, ValueRange chains, "
		"Value gridSizeX, Value gridSizeY, Value gridSizeZ, "
		"Value blockSizeX, Value blockSizeY, Value blockSizeZ">
];		];

let extraClassDeclaration = [{		let extraClassDeclaration = [{
/// Get the SSA values corresponding to kernel block identifiers.		/// Get the SSA values corresponding to kernel block identifiers.
KernelDim3 getBlockIds();		KernelDim3 getBlockIds();
/// Get the SSA values corresponding to kernel thread identifiers.		/// Get the SSA values corresponding to kernel thread identifiers.
KernelDim3 getThreadIds();		KernelDim3 getThreadIds();
/// Get the SSA values corresponding to kernel grid size.		/// Get the SSA values corresponding to kernel grid size.
KernelDim3 getGridSize();		KernelDim3 getGridSize();
/// Get the SSA values corresponding to kernel block size.		/// Get the SSA values corresponding to kernel block size.
KernelDim3 getBlockSize();		KernelDim3 getBlockSize();

/// Get the SSA values passed as operands to specify the grid size.		/// Get the SSA values passed as operands to specify the grid size.
KernelDim3 getGridSizeOperandValues();		KernelDim3 getGridSizeOperandValues();
/// Get the SSA values passed as operands to specify the block size.		/// Get the SSA values passed as operands to specify the block size.
KernelDim3 getBlockSizeOperandValues();		KernelDim3 getBlockSizeOperandValues();

		OperandRange getChains();

static StringRef getBlocksKeyword() { return "blocks"; }		static StringRef getBlocksKeyword() { return "blocks"; }
static StringRef getThreadsKeyword() { return "threads"; }		static StringRef getThreadsKeyword() { return "threads"; }

/// The number of launch configuration operands, placed at the leading		/// The number of launch configuration operands, placed at the leading
/// positions of the operand list.		/// positions of the operand list.
static constexpr unsigned kNumConfigOperands = 6;		static constexpr unsigned kNumConfigOperands = 6;

/// The number of region attributes containing the launch configuration,		/// The number of region attributes containing the launch configuration,
▲ Show 20 Lines • Show All 161 Lines • ▼ Show 20 Lines	let description = [{

Either none or all work items of a workgroup need to execute this op		Either none or all work items of a workgroup need to execute this op
in convergence.		in convergence.
}];		}];
let parser = [{ return success(); }];		let parser = [{ return success(); }];
let printer = [{ p << getOperationName(); }];		let printer = [{ p << getOperationName(); }];
}		}

		def GPU_MemcpyOp : GPU_Op<"memcpy"> {
		let summary = "GPU memory copy operation";
		let description = [{
		The `memcpy` operation copies a region of memory from GPU to host or vice
		versa.
		}];

		// TODO(csigg): src and dst should be AnyUnrankedMemRef.
		let arguments = (ins Variadic<GPU_ChainType>:$chains, AnyType:$dst,
		AnyType:$src, OptionalAttr<I64Attr>:$element_size);
		let results = (outs GPU_ChainType);

		let builders = [OpBuilder<
		"Builder *builder, OperationState &result, ValueRange chains, Value dst, " #
		"Value src", [{
		result.addOperands(chains);
		result.addOperands({dst, src});
		result.types.push_back(ChainType::get(builder->getContext()));
		}]>];

		let extraClassDeclaration = [{
		}];

		let assemblyFormat = [{
		(`[` $chains^ `]`)? `(` $dst`,` $src `)` attr-dict `:` type($dst)`,` type($src)
		}];
		}

		def GPU_WaitOp : GPU_Op<"wait"> {
		let summary = "GPU async wait operation";
		let description = [{
		The `wait` operation blocks on a list of chains.
		}];

		let arguments = (ins Variadic<GPU_ChainType>:$chains);
		let results = (outs);

		let assemblyFormat = "`[` $chains `]` attr-dict";
		}

def GPU_GPUModuleOp : GPU_Op<"module", [		def GPU_GPUModuleOp : GPU_Op<"module", [
IsolatedFromAbove, SymbolTable, Symbol,		IsolatedFromAbove, SymbolTable, Symbol,
SingleBlockImplicitTerminator<"ModuleEndOp">		SingleBlockImplicitTerminator<"ModuleEndOp">
]> {		]> {
let summary = "A top level compilation unit containing code to be run on a GPU.";		let summary = "A top level compilation unit containing code to be run on a GPU.";
let description = [{		let description = [{
GPU module contains code that is intended to be run on a GPU. A host device		GPU module contains code that is intended to be run on a GPU. A host device
can launch this code through a gpu.launc_func that creates a fully		can launch this code through a gpu.launc_func that creates a fully
▲ Show 20 Lines • Show All 45 Lines • Show Last 20 Lines

mlir/include/mlir/IR/DialectSymbolRegistry.def

	Show All 19 Lines
	DEFINE_SYM_KIND_RANGE(IREE) // IREE stands for IR Execution Engine			DEFINE_SYM_KIND_RANGE(IREE) // IREE stands for IR Execution Engine
	DEFINE_SYM_KIND_RANGE(LINALG) // Linear Algebra Dialect			DEFINE_SYM_KIND_RANGE(LINALG) // Linear Algebra Dialect
	DEFINE_SYM_KIND_RANGE(FIR) // Flang Fortran IR Dialect			DEFINE_SYM_KIND_RANGE(FIR) // Flang Fortran IR Dialect
	DEFINE_SYM_KIND_RANGE(OPENMP) // OpenMP IR Dialect			DEFINE_SYM_KIND_RANGE(OPENMP) // OpenMP IR Dialect
	DEFINE_SYM_KIND_RANGE(TOY) // Toy language (tutorial) Dialect			DEFINE_SYM_KIND_RANGE(TOY) // Toy language (tutorial) Dialect
	DEFINE_SYM_KIND_RANGE(SPIRV) // SPIR-V dialect			DEFINE_SYM_KIND_RANGE(SPIRV) // SPIR-V dialect
	DEFINE_SYM_KIND_RANGE(XLA_HLO) // XLA HLO dialect			DEFINE_SYM_KIND_RANGE(XLA_HLO) // XLA HLO dialect
	DEFINE_SYM_KIND_RANGE(SHAPE) // Shape dialect			DEFINE_SYM_KIND_RANGE(SHAPE) // Shape dialect
				DEFINE_SYM_KIND_RANGE(GPU) // GPU dialect

	// The following ranges are reserved for experimenting with MLIR dialects in a			// The following ranges are reserved for experimenting with MLIR dialects in a
	// private context without having to register them here.			// private context without having to register them here.
	DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_0)			DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_0)
	DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_1)			DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_1)
	DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_2)			DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_2)
	DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_3)			DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_3)
	DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_4)			DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_4)
	DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_5)			DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_5)
	DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_6)			DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_6)
	DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_7)			DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_7)
	DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_8)			DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_8)
	DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_9)			DEFINE_SYM_KIND_RANGE(PRIVATE_EXPERIMENTAL_9)

	#undef DEFINE_SYM_KIND_RANGE			#undef DEFINE_SYM_KIND_RANGE

mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp

Show All 25 Lines
#include "mlir/Target/NVVMIR.h"		#include "mlir/Target/NVVMIR.h"

#include "llvm/ADT/Optional.h"		#include "llvm/ADT/Optional.h"
#include "llvm/ADT/Twine.h"		#include "llvm/ADT/Twine.h"
#include "llvm/IR/Constants.h"		#include "llvm/IR/Constants.h"
#include "llvm/IR/LegacyPassManager.h"		#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"		#include "llvm/IR/Module.h"
#include "llvm/Support/Error.h"		#include "llvm/Support/Error.h"
		#include "llvm/Support/Mutex.h"
#include "llvm/Support/TargetRegistry.h"		#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/TargetSelect.h"		#include "llvm/Support/TargetSelect.h"
#include "llvm/Target/TargetMachine.h"		#include "llvm/Target/TargetMachine.h"

using namespace mlir;		using namespace mlir;

namespace {		namespace {
// TODO(herhut): Move to shared location.		// TODO(herhut): Move to shared location.
Show All 10 Lines	: public PassWrapper<GpuKernelToCubinPass,
OperationPass<gpu::GPUModuleOp>> {		OperationPass<gpu::GPUModuleOp>> {
public:		public:
GpuKernelToCubinPass(CubinGenerator cubinGenerator)		GpuKernelToCubinPass(CubinGenerator cubinGenerator)
: cubinGenerator(cubinGenerator) {}		: cubinGenerator(cubinGenerator) {}

void runOnOperation() override {		void runOnOperation() override {
gpu::GPUModuleOp module = getOperation();		gpu::GPUModuleOp module = getOperation();

		auto llvmDialect =
		module.getContext()->getRegisteredDialect<mlir::LLVM::LLVMDialect>();
		llvm::sys::SmartScopedLock<true> scopedLock(
		llvmDialect->getLLVMContextMutex());

// Make sure the NVPTX target is initialized.		// Make sure the NVPTX target is initialized.
LLVMInitializeNVPTXTarget();		LLVMInitializeNVPTXTarget();
LLVMInitializeNVPTXTargetInfo();		LLVMInitializeNVPTXTargetInfo();
LLVMInitializeNVPTXTargetMC();		LLVMInitializeNVPTXTargetMC();
LLVMInitializeNVPTXAsmPrinter();		LLVMInitializeNVPTXAsmPrinter();

auto llvmModule = translateModuleToNVVMIR(module);		auto llvmModule = translateModuleToNVVMIR(module);
if (!llvmModule)		if (!llvmModule)
▲ Show 20 Lines • Show All 91 Lines • Show Last 20 Lines

mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp

Show All 35 Lines

// To avoid name mangling, these are defined in the mini-runtime file.		// To avoid name mangling, these are defined in the mini-runtime file.
static constexpr const char *cuModuleLoadName = "mcuModuleLoad";		static constexpr const char *cuModuleLoadName = "mcuModuleLoad";
static constexpr const char *cuModuleGetFunctionName = "mcuModuleGetFunction";		static constexpr const char *cuModuleGetFunctionName = "mcuModuleGetFunction";
static constexpr const char *cuLaunchKernelName = "mcuLaunchKernel";		static constexpr const char *cuLaunchKernelName = "mcuLaunchKernel";
static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper";		static constexpr const char *cuGetStreamHelperName = "mcuGetStreamHelper";
static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize";		static constexpr const char *cuStreamSynchronizeName = "mcuStreamSynchronize";
static constexpr const char *kMcuMemHostRegister = "mcuMemHostRegister";		static constexpr const char *kMcuMemHostRegister = "mcuMemHostRegister";
		static constexpr const char *cuGraphAddKernelNodeName = "mcuGraphAddKernelNode";
		static constexpr const char *cuGraphAddMemcpyNodeName = "mcuGraphAddMemcpyNode";
		static constexpr const char *cuGraphExecuteName = "mcuGraphExecute";
		static constexpr const char *cuGetGraphHelperName = "mcuGetGraphHelper";

static constexpr const char *kCubinAnnotation = "nvvm.cubin";		static constexpr const char *kCubinAnnotation = "nvvm.cubin";
static constexpr const char *kCubinStorageSuffix = "_cubin_cst";		static constexpr const char *kCubinStorageSuffix = "_cubin_cst";

namespace {		namespace {

/// A pass to convert gpu.launch_func operations into a sequence of CUDA		/// A pass to convert gpu.launch_func operations into a sequence of CUDA
/// runtime calls.		/// runtime calls.
Show All 22 Lines	void initializeCachedTypes() {
llvmVoidType = LLVM::LLVMType::getVoidTy(llvmDialect);		llvmVoidType = LLVM::LLVMType::getVoidTy(llvmDialect);
llvmPointerType = LLVM::LLVMType::getInt8PtrTy(llvmDialect);		llvmPointerType = LLVM::LLVMType::getInt8PtrTy(llvmDialect);
llvmPointerPointerType = llvmPointerType.getPointerTo();		llvmPointerPointerType = llvmPointerType.getPointerTo();
llvmInt8Type = LLVM::LLVMType::getInt8Ty(llvmDialect);		llvmInt8Type = LLVM::LLVMType::getInt8Ty(llvmDialect);
llvmInt32Type = LLVM::LLVMType::getInt32Ty(llvmDialect);		llvmInt32Type = LLVM::LLVMType::getInt32Ty(llvmDialect);
llvmInt64Type = LLVM::LLVMType::getInt64Ty(llvmDialect);		llvmInt64Type = LLVM::LLVMType::getInt64Ty(llvmDialect);
llvmIntPtrType = LLVM::LLVMType::getIntNTy(		llvmIntPtrType = LLVM::LLVMType::getIntNTy(
llvmDialect, module.getDataLayout().getPointerSizeInBits());		llvmDialect, module.getDataLayout().getPointerSizeInBits());
		llvmKernelNodeParamsType = LLVM::LLVMType::getStructTy(
		llvmDialect,
		{
		getPointerType(), // CUfunction
		getInt32Type(), getInt32Type(), getInt32Type(), // gridDim
		getInt32Type(), getInt32Type(), getInt32Type(), // blockDim
		getInt32Type(), // sharedMemBytes
		getPointerPointerType(), // kernelParams
		getPointerPointerType() // extras
		},
		false);
}		}

LLVM::LLVMType getVoidType() { return llvmVoidType; }		LLVM::LLVMType getVoidType() { return llvmVoidType; }

LLVM::LLVMType getPointerType() { return llvmPointerType; }		LLVM::LLVMType getPointerType() { return llvmPointerType; }

LLVM::LLVMType getPointerPointerType() { return llvmPointerPointerType; }		LLVM::LLVMType getPointerPointerType() { return llvmPointerPointerType; }

Show All 9 Lines	return LLVM::LLVMType::getIntNTy(
getLLVMDialect(), module.getDataLayout().getPointerSizeInBits());		getLLVMDialect(), module.getDataLayout().getPointerSizeInBits());
}		}

LLVM::LLVMType getCUResultType() {		LLVM::LLVMType getCUResultType() {
// This is declared as an enum in CUDA but helpers use i32.		// This is declared as an enum in CUDA but helpers use i32.
return getInt32Type();		return getInt32Type();
}		}

		LLVM::LLVMType getKernelNodeParamsType() { return llvmKernelNodeParamsType; }

// Allocate a void pointer on the stack.		// Allocate a void pointer on the stack.
Value allocatePointer(OpBuilder &builder, Location loc) {		Value allocatePointer(OpBuilder &builder, Location loc) {
auto one = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),		auto one = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
builder.getI32IntegerAttr(1));		builder.getI32IntegerAttr(1));
return builder.create<LLVM::AllocaOp>(loc, getPointerPointerType(), one,		return builder.create<LLVM::AllocaOp>(loc, getPointerPointerType(), one,
/*alignment=*/0);		/*alignment=*/0);
}		}

void declareCudaFunctions(Location loc);		void declareCudaFunctions(Location loc);
void addParamToList(OpBuilder &builder, Location loc, Value param, Value list,		void addParamToList(OpBuilder &builder, Location loc, Value param, Value list,
unsigned pos, Value one);		unsigned pos, Value one);
Value setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);		Value setupParamsArray(gpu::LaunchFuncOp launchOp, OpBuilder &builder);
Value generateKernelNameConstant(StringRef moduleName, StringRef name,		Value generateKernelNameConstant(StringRef moduleName, StringRef name,
Location loc, OpBuilder &builder);		Location loc, OpBuilder &builder);
void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);		void translateGpuLaunchCalls(mlir::gpu::LaunchFuncOp launchOp);
		void translateGpuMemcpyOp(mlir::gpu::MemcpyOp memcpyOp);
		void translateGpuWaitOp(mlir::gpu::WaitOp waitOp);

public:		public:
// Run the dialect converter on the module.		// Run the dialect converter on the module.
void runOnOperation() override {		void runOnOperation() override {
// Cache the LLVMDialect for the current module.		// Cache the LLVMDialect for the current module.
llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>();		llvmDialect = getContext().getRegisteredDialect<LLVM::LLVMDialect>();
// Cache the used LLVM types.		// Cache the used LLVM types.
initializeCachedTypes();		initializeCachedTypes();

getOperation().walk(		getOperation().walk(
[this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); });		[this](mlir::gpu::LaunchFuncOp op) { translateGpuLaunchCalls(op); });

		getOperation().walk(
		[this](mlir::gpu::MemcpyOp op) { translateGpuMemcpyOp(op); });
		getOperation().walk(
		[this](mlir::gpu::WaitOp op) { translateGpuWaitOp(op); });

// GPU kernel modules are no longer necessary since we have a global		// GPU kernel modules are no longer necessary since we have a global
// constant with the CUBIN data.		// constant with the CUBIN data.
for (auto m :		for (auto m :
llvm::make_early_inc_range(getOperation().getOps<gpu::GPUModuleOp>()))		llvm::make_early_inc_range(getOperation().getOps<gpu::GPUModuleOp>()))
m.erase();		m.erase();
}		}

private:		private:
LLVM::LLVMDialect *llvmDialect;		LLVM::LLVMDialect *llvmDialect;
LLVM::LLVMType llvmVoidType;		LLVM::LLVMType llvmVoidType;
LLVM::LLVMType llvmPointerType;		LLVM::LLVMType llvmPointerType;
LLVM::LLVMType llvmPointerPointerType;		LLVM::LLVMType llvmPointerPointerType;
LLVM::LLVMType llvmInt8Type;		LLVM::LLVMType llvmInt8Type;
LLVM::LLVMType llvmInt32Type;		LLVM::LLVMType llvmInt32Type;
LLVM::LLVMType llvmInt64Type;		LLVM::LLVMType llvmInt64Type;
LLVM::LLVMType llvmIntPtrType;		LLVM::LLVMType llvmIntPtrType;
		LLVM::LLVMType llvmKernelNodeParamsType;
};		};

} // anonymous namespace		} // anonymous namespace

// Adds declarations for the needed helper functions from the CUDA wrapper.		// Adds declarations for the needed helper functions from the CUDA wrapper.
// The types in comments give the actual types expected/returned but the API		// The types in comments give the actual types expected/returned but the API
// uses void pointers. This is fine as they have the same linkage in C.		// uses void pointers. This is fine as they have the same linkage in C.
void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) {		void GpuLaunchFuncToCudaCallsPass::declareCudaFunctions(Location loc) {
▲ Show 20 Lines • Show All 67 Lines • ▼ Show 20 Lines	builder.create<LLVM::LLVMFuncOp>(
loc, kMcuMemHostRegister,		loc, kMcuMemHostRegister,
LLVM::LLVMType::getFunctionTy(getVoidType(),		LLVM::LLVMType::getFunctionTy(getVoidType(),
{		{
getPointerType(), /* void *ptr */		getPointerType(), /* void *ptr */
getInt64Type() /* int64 sizeBytes*/		getInt64Type() /* int64 sizeBytes*/
},		},
/*isVarArg=*/false));		/*isVarArg=*/false));
}		}
		if (!module.lookupSymbol(cuGraphAddKernelNodeName)) {
		builder.create<LLVM::LLVMFuncOp>(
		loc, cuGraphAddKernelNodeName,
		LLVM::LLVMType::getFunctionTy(
		getCUResultType(),
		{
		getPointerPointerType(), /* CUgraphNode* graph_node */
		getPointerType(), /* CUgraph graph */
		getPointerPointerType(), /* const CUgraphNode* dependencies */
		getInt32Type(), /* size_t numDependencies */
		getKernelNodeParamsType()
		.getPointerTo(), /* CUDA_KERNEL_NODE_PARAMS* params */
		},
		/*isVarArg=*/false));
		}
		if (!module.lookupSymbol(cuGraphAddMemcpyNodeName)) {
		auto memrefTy = LLVM::LLVMType::getStructTy(
		llvmDialect, {getInt64Type(), getPointerType()});
		builder.create<LLVM::LLVMFuncOp>(
		loc, cuGraphAddMemcpyNodeName,
		LLVM::LLVMType::getFunctionTy(
		getCUResultType(),
		{
		getPointerPointerType(), /* CUgraphNode* graph_node */
		getPointerType(), /* CUgraph graph */
		getPointerPointerType(), /* const CUgraphNode* dependencies */
		getInt32Type(), /* size_t numDependencies */
		memrefTy, /* dst */
		memrefTy, /* src */
		getInt64Type(), /* size_t element_size */
		},
		/*isVarArg=*/false));
		}
		if (!module.lookupSymbol(cuGraphExecuteName)) {
		builder.create<LLVM::LLVMFuncOp>(
		loc, cuGraphExecuteName,
		LLVM::LLVMType::getFunctionTy(
		getCUResultType(),
		{
		getPointerType(), /* CUgraph graph */
		getPointerPointerType(), /* const CUgraphNode* dependencies */
		getInt32Type(), /* size_t numDependencies */
		},
		/*isVarArg=*/false));
		}
		if (!module.lookupSymbol(cuGetGraphHelperName)) {
		// Helper function to get the current CUDA graph. Uses void* instead of
		// CUDA's opaque CUgraph.
		builder.create<LLVM::LLVMFuncOp>(
		loc, cuGetGraphHelperName,
		LLVM::LLVMType::getFunctionTy(getPointerType(), /*isVarArg=*/false));
		}
}		}

/// Emits the IR with the following structure:		/// Emits the IR with the following structure:
///		///
/// %data = llvm.alloca 1 x type-of(<param>)		/// %data = llvm.alloca 1 x type-of(<param>)
/// llvm.store <param>, %data		/// llvm.store <param>, %data
/// %typeErased = llvm.bitcast %data to !llvm<"i8*">		/// %typeErased = llvm.bitcast %data to !llvm<"i8*">
/// %addr = llvm.getelementptr <list>[<pos>]		/// %addr = llvm.getelementptr <list>[<pos>]
▲ Show 20 Lines • Show All 180 Lines • ▼ Show 20 Lines	void GpuLaunchFuncToCudaCallsPass::translateGpuLaunchCalls(
auto cuFunction = allocatePointer(builder, loc);		auto cuFunction = allocatePointer(builder, loc);
auto cuModuleGetFunction =		auto cuModuleGetFunction =
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuModuleGetFunctionName);		getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuModuleGetFunctionName);
builder.create<LLVM::CallOp>(		builder.create<LLVM::CallOp>(
loc, ArrayRef<Type>{getCUResultType()},		loc, ArrayRef<Type>{getCUResultType()},
builder.getSymbolRefAttr(cuModuleGetFunction),		builder.getSymbolRefAttr(cuModuleGetFunction),
ArrayRef<Value>{cuFunction, cuOwningModuleRef, kernelName});		ArrayRef<Value>{cuFunction, cuOwningModuleRef, kernelName});
// Grab the global stream needed for execution.		// Grab the global stream needed for execution.
auto cuGetStreamHelper =		auto cuGetGraphHelper =
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuGetStreamHelperName);		getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuGetGraphHelperName);
auto cuStream = builder.create<LLVM::CallOp>(		auto cuGraph = builder.create<LLVM::CallOp>(
loc, ArrayRef<Type>{getPointerType()},		loc, ArrayRef<Type>{getPointerType()},
builder.getSymbolRefAttr(cuGetStreamHelper), ArrayRef<Value>{});		builder.getSymbolRefAttr(cuGetGraphHelper), ArrayRef<Value>{});
// Invoke the function with required arguments.		// Invoke the function with required arguments.
auto cuLaunchKernel =		auto cuGraphAddKernelNode =
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuLaunchKernelName);		getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuGraphAddKernelNodeName);
auto cuFunctionRef =		auto cuFunctionRef =
builder.create<LLVM::LoadOp>(loc, getPointerType(), cuFunction);		builder.create<LLVM::LoadOp>(loc, getPointerType(), cuFunction);
auto paramsArray = setupParamsArray(launchOp, builder);		auto paramsArray = setupParamsArray(launchOp, builder);
if (!paramsArray) {		if (!paramsArray) {
launchOp.emitOpError() << "cannot pass given parameters to the kernel";		launchOp.emitOpError() << "cannot pass given parameters to the kernel";
return signalPassFailure();		return signalPassFailure();
}		}

		auto one = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
		builder.getI32IntegerAttr(1));
		auto kernelNodeParams = builder.create<LLVM::AllocaOp>(
		loc, getKernelNodeParamsType().getPointerTo(), one, /*alignment=*/0);
		auto setParamsValue = [&](Value value, size_t i) {
		auto index = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
		builder.getI32IntegerAttr(i));
		auto ptrTy = value.getType().cast<LLVM::LLVMType>().getPointerTo();
		auto gep = builder.create<LLVM::GEPOp>(loc, ptrTy, kernelNodeParams,
		ArrayRef<Value>{zero, index});
		builder.create<LLVM::StoreOp>(loc, value, gep);
		};

		setParamsValue(cuFunctionRef, 0);
		for (const auto &en :
		llvm::enumerate(launchOp.getOperands()
		.drop_front(launchOp.getChains().size())
		.take_front(launchOp.kNumConfigOperands))) {
		auto truncOp =
		builder.create<LLVM::TruncOp>(loc, getInt32Type(), en.value());
		setParamsValue(truncOp, 1 + en.index());
		}
		setParamsValue(/*sharedMemSizeBytes=*/zero, 7);
		setParamsValue(paramsArray, 8);
auto nullpointer =		auto nullpointer =
builder.create<LLVM::IntToPtrOp>(loc, getPointerPointerType(), zero);		builder.create<LLVM::IntToPtrOp>(loc, getPointerPointerType(), zero);
		setParamsValue(nullpointer, 9);

		auto chainsSize = builder.create<LLVM::ConstantOp>(
		loc, getInt32Type(), builder.getI32IntegerAttr(launchOp.chains().size()));
		auto chainsArray = builder.create<LLVM::AllocaOp>(
		loc, getPointerPointerType(), chainsSize, /*alignment=*/0);
		for (size_t i = 0; i < launchOp.chains().size(); ++i) {
		auto index = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
		builder.getI32IntegerAttr(i));
		auto gep = builder.create<LLVM::GEPOp>(loc, getPointerPointerType(),
		chainsArray, ArrayRef<Value>{index});
		builder.create<LLVM::StoreOp>(loc, launchOp.chains()[i], gep);
		}

		auto nodePointer = allocatePointer(builder, loc);

builder.create<LLVM::CallOp>(		builder.create<LLVM::CallOp>(
loc, ArrayRef<Type>{getCUResultType()},		loc, ArrayRef<Type>{getCUResultType()},
builder.getSymbolRefAttr(cuLaunchKernel),		builder.getSymbolRefAttr(cuGraphAddKernelNode),
ArrayRef<Value>{cuFunctionRef, launchOp.getOperand(0),		ArrayRef<Value>{nodePointer, cuGraph.getResult(0), chainsArray,
launchOp.getOperand(1), launchOp.getOperand(2),		chainsSize, kernelNodeParams});
launchOp.getOperand(3), launchOp.getOperand(4),
launchOp.getOperand(5), zero, /* sharedMemBytes */		Value nodeRef =
cuStream.getResult(0), /* stream */		builder.create<LLVM::LoadOp>(loc, getPointerType(), nodePointer);
paramsArray, /* kernel params */
nullpointer /* extra */});		launchOp.replaceAllUsesWith(nodeRef);
// Sync on the stream to make it synchronous.
auto cuStreamSync =
getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuStreamSynchronizeName);
builder.create<LLVM::CallOp>(loc, ArrayRef<Type>{getCUResultType()},
builder.getSymbolRefAttr(cuStreamSync),
ArrayRef<Value>(cuStream.getResult(0)));
launchOp.erase();		launchOp.erase();
}		}

		/// Lowers a `gpu.memcpy` op into a call to the memcpy-node wrapper
		/// (`cuGraphAddMemcpyNodeName`).
		///
		/// The emitted IR:
		///   1. stores every chain operand of the op into a stack-allocated array
		///      that is passed as the dependency list,
		///   2. obtains the graph handle by calling the graph helper function,
		///   3. calls the wrapper with (node out-pointer, graph, dependency array,
		///      dependency count, dst, src, element size),
		///   4. loads the node handle written by the wrapper and substitutes it for
		///      all uses of the op, which is then erased.
		///
		/// The result of the wrapper call itself is not inspected.
		void GpuLaunchFuncToCudaCallsPass::translateGpuMemcpyOp(
		    mlir::gpu::MemcpyOp memcpyOp) {
		  OpBuilder builder(memcpyOp);
		  Location loc = memcpyOp.getLoc();
		  // Make sure the wrapper functions called below are declared in the module.
		  declareCudaFunctions(loc);

		  // Materialize the chain operands as an array of pointers on the stack.
		  auto chains = memcpyOp.chains();
		  auto chainsSize = builder.create<LLVM::ConstantOp>(
		      loc, getInt32Type(), builder.getI32IntegerAttr(chains.size()));
		  auto chainsArray = builder.create<LLVM::AllocaOp>(
		      loc, getPointerPointerType(), chainsSize, /*alignment=*/0);
		  for (size_t i = 0; i < chains.size(); ++i) {
		    auto index = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
		                                                  builder.getI32IntegerAttr(i));
		    auto gep = builder.create<LLVM::GEPOp>(loc, getPointerPointerType(),
		                                           chainsArray, ArrayRef<Value>{index});
		    builder.create<LLVM::StoreOp>(loc, chains[i], gep);
		  }

		  // Slot that receives the handle of the newly created node.
		  auto nodePointer = allocatePointer(builder, loc);

		  auto cuGetGraphHelper =
		      getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuGetGraphHelperName);
		  auto cuGraph = builder.create<LLVM::CallOp>(
		      loc, ArrayRef<Type>{getPointerType()},
		      builder.getSymbolRefAttr(cuGetGraphHelper), ArrayRef<Value>{});

		  // NOTE(review): element_size() is dereferenced without a presence check;
		  // confirm the attribute is always set on gpu.memcpy.
		  auto elementSize = builder.create<LLVM::ConstantOp>(
		      loc, getInt64Type(),
		      builder.getI64IntegerAttr(memcpyOp.element_size()->getLimitedValue()));

		  auto cuGraphAddMemcpyNode =
		      getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuGraphAddMemcpyNodeName);

		  builder.create<LLVM::CallOp>(
		      loc, ArrayRef<Type>{getCUResultType()},
		      builder.getSymbolRefAttr(cuGraphAddMemcpyNode),
		      ArrayRef<Value>{nodePointer, cuGraph.getResult(0), chainsArray,
		                      chainsSize, memcpyOp.dst(), memcpyOp.src(), elementSize});

		  // The node handle stands in for the op's result from here on.
		  Value nodeRef =
		      builder.create<LLVM::LoadOp>(loc, getPointerType(), nodePointer);

		  memcpyOp.replaceAllUsesWith(nodeRef);
		  memcpyOp.erase();
		}

		/// Lowers a `gpu.wait` op into a call to the graph-execute wrapper
		/// (`cuGraphExecuteName`).
		///
		/// The chain operands of the op are stored into a stack-allocated array and
		/// passed, together with their count and the graph handle returned by the
		/// graph helper function, to the wrapper. The op is erased afterwards; the
		/// result of the wrapper call is not inspected.
		void GpuLaunchFuncToCudaCallsPass::translateGpuWaitOp(
		    mlir::gpu::WaitOp waitOp) {
		  OpBuilder builder(waitOp);
		  Location loc = waitOp.getLoc();
		  // Make sure the wrapper functions called below are declared in the module.
		  declareCudaFunctions(loc);

		  // Materialize the chain operands as an array of pointers on the stack.
		  auto chains = waitOp.chains();
		  auto chainsSize = builder.create<LLVM::ConstantOp>(
		      loc, getInt32Type(), builder.getI32IntegerAttr(chains.size()));
		  auto chainsArray = builder.create<LLVM::AllocaOp>(
		      loc, getPointerPointerType(), chainsSize, /*alignment=*/0);
		  for (size_t i = 0; i < chains.size(); ++i) {
		    auto index = builder.create<LLVM::ConstantOp>(loc, getInt32Type(),
		                                                  builder.getI32IntegerAttr(i));
		    auto gep = builder.create<LLVM::GEPOp>(loc, getPointerPointerType(),
		                                           chainsArray, ArrayRef<Value>{index});
		    builder.create<LLVM::StoreOp>(loc, chains[i], gep);
		  }

		  auto cuGetGraphHelper =
		      getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuGetGraphHelperName);
		  auto cuGraph = builder.create<LLVM::CallOp>(
		      loc, ArrayRef<Type>{getPointerType()},
		      builder.getSymbolRefAttr(cuGetGraphHelper), ArrayRef<Value>{});

		  auto cuGraphExecute =
		      getOperation().lookupSymbol<LLVM::LLVMFuncOp>(cuGraphExecuteName);

		  builder.create<LLVM::CallOp>(
		      loc, ArrayRef<Type>{getCUResultType()},
		      builder.getSymbolRefAttr(cuGraphExecute),
		      ArrayRef<Value>{cuGraph.getResult(0), chainsArray, chainsSize});
		  waitOp.erase();
		}

std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>		std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
mlir::createConvertGpuLaunchFuncToCudaCallsPass() {		mlir::createConvertGpuLaunchFuncToCudaCallsPass() {
return std::make_unique<GpuLaunchFuncToCudaCallsPass>();		return std::make_unique<GpuLaunchFuncToCudaCallsPass>();
}		}

mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp

Show First 20 Lines • Show All 719 Lines • ▼ Show 20 Lines
LLVM::LLVMType ConvertToLLVMPattern::getIndexType() const {		LLVM::LLVMType ConvertToLLVMPattern::getIndexType() const {
return typeConverter.getIndexType();		return typeConverter.getIndexType();
}		}

LLVM::LLVMType ConvertToLLVMPattern::getVoidType() const {		LLVM::LLVMType ConvertToLLVMPattern::getVoidType() const {
return LLVM::LLVMType::getVoidTy(&getDialect());		return LLVM::LLVMType::getVoidTy(&getDialect());
}		}

LLVM::LLVMType ConvertToLLVMPattern::getVoidPtrType() const {		LLVM::LLVMType ConvertToLLVMPattern::getVoidPtrType(unsigned addrspace) const {
return LLVM::LLVMType::getInt8PtrTy(&getDialect());		return LLVM::LLVMType::getInt8Ty(&getDialect()).getPointerTo(addrspace);
}		}

Value ConvertToLLVMPattern::createIndexConstant(		Value ConvertToLLVMPattern::createIndexConstant(
ConversionPatternRewriter &builder, Location loc, uint64_t value) const {		ConversionPatternRewriter &builder, Location loc, uint64_t value) const {
return createIndexAttrConstant(builder, loc, getIndexType(), value);		return createIndexAttrConstant(builder, loc, getIndexType(), value);
}		}

Value ConvertToLLVMPattern::linearizeSubscripts(		Value ConvertToLLVMPattern::linearizeSubscripts(
▲ Show 20 Lines • Show All 781 Lines • ▼ Show 20 Lines	Value allocateBuffer(Location loc, Value cumulativeSize, Operation *op,
AllocOp allocOp = cast<AllocOp>(op);		AllocOp allocOp = cast<AllocOp>(op);

Optional<int64_t> allocationAlignment = getAllocationAlignment(allocOp);		Optional<int64_t> allocationAlignment = getAllocationAlignment(allocOp);
// Whether to use std lib function aligned_alloc that supports alignment.		// Whether to use std lib function aligned_alloc that supports alignment.
bool useAlignedAlloc = allocationAlignment.hasValue();		bool useAlignedAlloc = allocationAlignment.hasValue();

// Insert the malloc/aligned_alloc declaration if it is not already present.		// Insert the malloc/aligned_alloc declaration if it is not already present.
auto allocFuncName = useAlignedAlloc ? "aligned_alloc" : "malloc";		auto allocFuncName = useAlignedAlloc ? "aligned_alloc" : "malloc";
		if (memRefType.getMemorySpace() == 1)
		allocFuncName = "mcuMalloc";
auto module = allocOp.getParentOfType<ModuleOp>();		auto module = allocOp.getParentOfType<ModuleOp>();
auto allocFunc = module.lookupSymbol<LLVM::LLVMFuncOp>(allocFuncName);		auto allocFunc = module.lookupSymbol<LLVM::LLVMFuncOp>(allocFuncName);
if (!allocFunc) {		if (!allocFunc) {
OpBuilder moduleBuilder(op->getParentOfType<ModuleOp>().getBodyRegion());		OpBuilder moduleBuilder(op->getParentOfType<ModuleOp>().getBodyRegion());
SmallVector<LLVM::LLVMType, 2> callArgTypes = {getIndexType()};		SmallVector<LLVM::LLVMType, 2> callArgTypes = {getIndexType()};
// aligned_alloc(size_t alignment, size_t size)		// aligned_alloc(size_t alignment, size_t size)
if (useAlignedAlloc)		if (useAlignedAlloc)
callArgTypes.push_back(getIndexType());		callArgTypes.push_back(getIndexType());
allocFunc = moduleBuilder.create<LLVM::LLVMFuncOp>(		allocFunc = moduleBuilder.create<LLVM::LLVMFuncOp>(
rewriter.getUnknownLoc(), allocFuncName,		rewriter.getUnknownLoc(), allocFuncName,
LLVM::LLVMType::getFunctionTy(getVoidPtrType(), callArgTypes,		LLVM::LLVMType::getFunctionTy(
		getVoidPtrType(memRefType.getMemorySpace()), callArgTypes,
/*isVarArg=*/false));		/*isVarArg=*/false));
}		}

// Allocate the underlying buffer and store a pointer to it in the MemRef		// Allocate the underlying buffer and store a pointer to it in the MemRef
// descriptor.		// descriptor.
SmallVector<Value, 2> callArgs;		SmallVector<Value, 2> callArgs;
if (useAlignedAlloc) {		if (useAlignedAlloc) {
// Use aligned_alloc.		// Use aligned_alloc.
assert(allocationAlignment && "allocation alignment should be present");		assert(allocationAlignment && "allocation alignment should be present");
Show All 18 Lines	if (useAlignedAlloc) {
loc,		loc,
rewriter.create<LLVM::AddOp>(loc, cumulativeSize, accessAlignment),		rewriter.create<LLVM::AddOp>(loc, cumulativeSize, accessAlignment),
one);		one);
}		}
callArgs.push_back(cumulativeSize);		callArgs.push_back(cumulativeSize);
}		}
auto allocFuncSymbol = rewriter.getSymbolRefAttr(allocFunc);		auto allocFuncSymbol = rewriter.getSymbolRefAttr(allocFunc);
allocatedBytePtr = rewriter		allocatedBytePtr = rewriter
.create<LLVM::CallOp>(loc, getVoidPtrType(),		.create<LLVM::CallOp>(
		loc, getVoidPtrType(memRefType.getMemorySpace()),
allocFuncSymbol, callArgs)		allocFuncSymbol, callArgs)
.getResult(0);		.getResult(0);
// For heap allocations, the allocated pointer is a cast of the byte pointer		// For heap allocations, the allocated pointer is a cast of the byte pointer
// to the type pointer.		// to the type pointer.
return rewriter.create<LLVM::BitcastOp>(loc, elementPtrType,		return rewriter.create<LLVM::BitcastOp>(loc, elementPtrType,
allocatedBytePtr);		allocatedBytePtr);
}		}

// An `alloc` is converted into a definition of a memref descriptor value and		// An `alloc` is converted into a definition of a memref descriptor value and
▲ Show 20 Lines • Show All 1,400 Lines • Show Last 20 Lines

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

//===- GPUDialect.cpp - MLIR Dialect for GPU Kernels implementation -------===//		//===- GPUDialect.cpp - MLIR Dialect for GPU Kernels implementation -------===//
//		//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.		// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.		// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception		// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// This file implements the GPU kernel-related dialect and its operations.		// This file implements the GPU kernel-related dialect and its operations.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "mlir/Dialect/GPU/GPUDialect.h"		#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"		#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"		#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"		#include "mlir/IR/Builders.h"
		#include "mlir/IR/DialectImplementation.h"
#include "mlir/IR/Function.h"		#include "mlir/IR/Function.h"
#include "mlir/IR/FunctionImplementation.h"		#include "mlir/IR/FunctionImplementation.h"
#include "mlir/IR/Module.h"		#include "mlir/IR/Module.h"
#include "mlir/IR/OpImplementation.h"		#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/PatternMatch.h"		#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/StandardTypes.h"		#include "mlir/IR/StandardTypes.h"
		#include "mlir/IR/TypeUtilities.h"

using namespace mlir;		using namespace mlir;
using namespace mlir::gpu;		using namespace mlir::gpu;

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// GPUDialect		// GPUDialect
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

bool GPUDialect::isKernel(Operation *op) {		bool GPUDialect::isKernel(Operation *op) {
UnitAttr isKernelAttr = op->getAttrOfType<UnitAttr>(getKernelFuncAttrName());		UnitAttr isKernelAttr = op->getAttrOfType<UnitAttr>(getKernelFuncAttrName());
return static_cast<bool>(isKernelAttr);		return static_cast<bool>(isKernelAttr);
}		}

GPUDialect::GPUDialect(MLIRContext *context)		GPUDialect::GPUDialect(MLIRContext *context)
: Dialect(getDialectNamespace(), context) {		: Dialect(getDialectNamespace(), context) {
		addTypes<ChainType>();
addOperations<		addOperations<
#define GET_OP_LIST		#define GET_OP_LIST
#include "mlir/Dialect/GPU/GPUOps.cpp.inc"		#include "mlir/Dialect/GPU/GPUOps.cpp.inc"
>();		>();
}		}

		/// Parses a type of the gpu dialect from its textual form.
		///
		/// The only keyword accepted is `chain`, which yields a ChainType. If no
		/// keyword can be parsed a null Type is returned; for any other keyword an
		/// "unknown gpu type" error is emitted at the name location and a null Type
		/// is returned.
		Type GPUDialect::parseType(DialectAsmParser &parser) const {
		  StringRef typeName;
		  // Without a leading keyword there is nothing to dispatch on.
		  if (parser.parseKeyword(&typeName))
		    return Type();

		  // Everything except 'chain' is rejected.
		  if (typeName != "chain") {
		    parser.emitError(parser.getNameLoc(), "unknown gpu type: " + typeName);
		    return Type();
		  }

		  return ChainType::get(getContext());
		}

		/// Prints a gpu dialect type. The chain type is written as `chain`; any
		/// other type kind is treated as unreachable.
		void GPUDialect::printType(Type type, DialectAsmPrinter &os) const {
		  if (type.getKind() != GpuTypes::Chain)
		    llvm_unreachable("Unhandled gpu type");
		  os << "chain";
		}

LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,		LogicalResult GPUDialect::verifyOperationAttribute(Operation *op,
NamedAttribute attr) {		NamedAttribute attr) {
if (!attr.second.isa<UnitAttr>() ||		if (!attr.second.isa<UnitAttr>() ||
attr.first != getContainerModuleAttrName())		attr.first != getContainerModuleAttrName())
return success();		return success();

auto module = dyn_cast<ModuleOp>(op);		auto module = dyn_cast<ModuleOp>(op);
if (!module)		if (!module)
▲ Show 20 Lines • Show All 144 Lines • ▼ Show 20 Lines	if (parser.parseColonType(valueType) ||
return failure();		return failure();
return success();		return success();
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// LaunchOp		// LaunchOp
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

void LaunchOp::build(Builder *builder, OperationState &result, Value gridSizeX,		void LaunchOp::build(Builder *builder, OperationState &result,
Value gridSizeY, Value gridSizeZ, Value blockSizeX,		ValueRange chains, Value gridSizeX, Value gridSizeY,
Value blockSizeY, Value blockSizeZ) {		Value gridSizeZ, Value blockSizeX, Value blockSizeY,
		Value blockSizeZ) {
		result.addOperands(chains);
// Add grid and block sizes as op operands, followed by the data operands.		// Add grid and block sizes as op operands, followed by the data operands.
result.addOperands(		result.addOperands(
{gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ});		{gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ});

// Create a kernel body region with kNumConfigRegionAttributes + N arguments,		// Create a kernel body region with kNumConfigRegionAttributes + N arguments,
// where the first kNumConfigRegionAttributes arguments have `index` type and		// where the first kNumConfigRegionAttributes arguments have `index` type and
// the rest have the same types as the data operands.		// the rest have the same types as the data operands.
Region *kernelRegion = result.addRegion();		Region *kernelRegion = result.addRegion();
Block *body = new Block();		Block *body = new Block();
body->addArguments(		body->addArguments(
std::vector<Type>(kNumConfigRegionAttributes, builder->getIndexType()));		std::vector<Type>(kNumConfigRegionAttributes, builder->getIndexType()));
kernelRegion->push_back(body);		kernelRegion->push_back(body);
}		}
		// Convenience builder for a launch without chain operands: forwards to the
		// overload that takes `chains`, passing an empty list, so the op receives
		// only the six grid/block size operands.
		void LaunchOp::build(Builder *builder, OperationState &result, Value gridSizeX,
		Value gridSizeY, Value gridSizeZ, Value blockSizeX,
		Value blockSizeY, Value blockSizeZ) {
		build(builder, result, {}, gridSizeX, gridSizeY, gridSizeZ, blockSizeX,
		blockSizeY, blockSizeZ);
		}

KernelDim3 LaunchOp::getBlockIds() {		KernelDim3 LaunchOp::getBlockIds() {
assert(!body().getBlocks().empty() && "FuncOp body must not be empty.");		assert(!body().getBlocks().empty() && "FuncOp body must not be empty.");
auto args = body().getBlocks().front().getArguments();		auto args = body().getBlocks().front().getArguments();
return KernelDim3{args[0], args[1], args[2]};		return KernelDim3{args[0], args[1], args[2]};
}		}

KernelDim3 LaunchOp::getThreadIds() {		KernelDim3 LaunchOp::getThreadIds() {
Show All 10 Lines

KernelDim3 LaunchOp::getBlockSize() {		KernelDim3 LaunchOp::getBlockSize() {
assert(!body().getBlocks().empty() && "FuncOp body must not be empty.");		assert(!body().getBlocks().empty() && "FuncOp body must not be empty.");
auto args = body().getBlocks().front().getArguments();		auto args = body().getBlocks().front().getArguments();
return KernelDim3{args[9], args[10], args[11]};		return KernelDim3{args[9], args[10], args[11]};
}		}

KernelDim3 LaunchOp::getGridSizeOperandValues() {		KernelDim3 LaunchOp::getGridSizeOperandValues() {
return KernelDim3{getOperand(0), getOperand(1), getOperand(2)};		auto operands = getOperands().drop_front(getChains().size());
		return KernelDim3{operands[0], operands[1], operands[2]};
}		}

KernelDim3 LaunchOp::getBlockSizeOperandValues() {		KernelDim3 LaunchOp::getBlockSizeOperandValues() {
return KernelDim3{getOperand(3), getOperand(4), getOperand(5)};		auto operands = getOperands().drop_front(getChains().size());
		return KernelDim3{operands[3], operands[4], operands[5]};
		}

		OperandRange LaunchOp::getChains() {
		auto operands = getOperands();
		return {operands.begin(), llvm::find_if_not(operands, [](const Value &op) {
		return op.getType().isa<ChainType>();
		})};
}		}

static LogicalResult verify(LaunchOp op) {		static LogicalResult verify(LaunchOp op) {
// Kernel launch takes kNumConfigOperands leading operands for grid/block		// Kernel launch takes kNumConfigOperands leading operands for grid/block
// sizes and transforms them into kNumConfigRegionAttributes region arguments		// sizes and transforms them into kNumConfigRegionAttributes region arguments
// for block/thread identifiers and grid/block sizes.		// for block/thread identifiers and grid/block sizes.
if (!op.body().empty()) {		if (!op.body().empty()) {
Block &entryBlock = op.body().front();		Block &entryBlock = op.body().front();
if (entryBlock.getNumArguments() !=		if (entryBlock.getNumArguments() - LaunchOp::kNumConfigRegionAttributes !=
LaunchOp::kNumConfigOperands + op.getNumOperands())		op.getNumOperands() - op.getChains().size() -
		LaunchOp::kNumConfigOperands)
return op.emitOpError("unexpected number of region arguments");		return op.emitOpError("unexpected number of region arguments");
}		}

// Block terminators without successors are expected to exit the kernel region		// Block terminators without successors are expected to exit the kernel region
// and must be `gpu.terminator`.		// and must be `gpu.terminator`.
for (Block &block : op.body()) {		for (Block &block : op.body()) {
if (block.empty())		if (block.empty())
continue;		continue;
Show All 12 Lines	static LogicalResult verify(LaunchOp op) {
return success();		return success();
}		}

// Pretty-print the kernel grid/block size assignment as		// Pretty-print the kernel grid/block size assignment as
// (%iter-x, %iter-y, %iter-z) in		// (%iter-x, %iter-y, %iter-z) in
// (%size-x = %ssa-use, %size-y = %ssa-use, %size-z = %ssa-use)		// (%size-x = %ssa-use, %size-y = %ssa-use, %size-z = %ssa-use)
// where %size-* and %iter-* will correspond to the body region arguments.		// where %size-* and %iter-* will correspond to the body region arguments.
static void printSizeAssignment(OpAsmPrinter &p, KernelDim3 size,		static void printSizeAssignment(OpAsmPrinter &p, KernelDim3 size,
ValueRange operands, KernelDim3 ids) {		KernelDim3 operands, KernelDim3 ids) {
p << '(' << ids.x << ", " << ids.y << ", " << ids.z << ") in (";		p << '(' << ids.x << ", " << ids.y << ", " << ids.z << ") in (";
p << size.x << " = " << operands[0] << ", ";		p << size.x << " = " << operands.x << ", ";
p << size.y << " = " << operands[1] << ", ";		p << size.y << " = " << operands.y << ", ";
p << size.z << " = " << operands[2] << ')';		p << size.z << " = " << operands.z << ')';
}		}

static void printLaunchOp(OpAsmPrinter &p, LaunchOp op) {		static void printLaunchOp(OpAsmPrinter &p, LaunchOp op) {
ValueRange operands = op.getOperands();

// Print the launch configuration.		// Print the launch configuration.
p << LaunchOp::getOperationName() << ' ' << op.getBlocksKeyword();		p << LaunchOp::getOperationName();
printSizeAssignment(p, op.getGridSize(), operands.take_front(3),		// Note: only print [] if op returns a chain for backwards compatibility.
		if (!op.getResults().empty()) {
		p << '[';
		p.printOperands(op.getChains());
		p << ']';
		}
		p << ' ' << op.getBlocksKeyword();
		printSizeAssignment(p, op.getGridSize(), op.getGridSizeOperandValues(),
op.getBlockIds());		op.getBlockIds());
p << ' ' << op.getThreadsKeyword();		p << ' ' << op.getThreadsKeyword();
printSizeAssignment(p, op.getBlockSize(), operands.slice(3, 3),		printSizeAssignment(p, op.getBlockSize(), op.getBlockSizeOperandValues(),
op.getThreadIds());		op.getThreadIds());

p.printRegion(op.body(), /*printEntryBlockArgs=*/false);		p.printRegion(op.body(), /*printEntryBlockArgs=*/false);
p.printOptionalAttrDict(op.getAttrs());		p.printOptionalAttrDict(op.getAttrs());
}		}

// Parse the size assignment blocks for blocks and threads. These have the form		// Parse the size assignment blocks for blocks and threads. These have the form
// (%region_arg, %region_arg, %region_arg) in		// (%region_arg, %region_arg, %region_arg) in
Show All 26 Lines
}		}

// Parses a Launch operation.		// Parses a Launch operation.
// operation ::= `gpu.launch` `blocks` `(` ssa-id-list `)` `in` ssa-reassignment		// operation ::= `gpu.launch` `blocks` `(` ssa-id-list `)` `in` ssa-reassignment
// `threads` `(` ssa-id-list `)` `in` ssa-reassignment		// `threads` `(` ssa-id-list `)` `in` ssa-reassignment
// region attr-dict?		// region attr-dict?
// ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`		// ssa-reassignment ::= `(` ssa-id `=` ssa-use (`,` ssa-id `=` ssa-use)* `)`
static ParseResult parseLaunchOp(OpAsmParser &parser, OperationState &result) {		static ParseResult parseLaunchOp(OpAsmParser &parser, OperationState &result) {
		// Note: only return chain if it contains '[]' for backwards compatibility.
		if (!parser.parseOptionalLSquare()) {
		SmallVector<OpAsmParser::OperandType, 1> chains;
		auto chainTy = ChainType::get(parser.getBuilder().getContext());
		if (parser.parseOperandList(chains) ||
		parser.resolveOperands(chains, chainTy, result.operands) ||
		parser.parseOptionalRSquare() ||
		parser.addTypeToList(chainTy, result.types))
		return failure();
		}

// Sizes of the grid and block.		// Sizes of the grid and block.
SmallVector<OpAsmParser::OperandType, LaunchOp::kNumConfigOperands> sizes(		SmallVector<OpAsmParser::OperandType, LaunchOp::kNumConfigOperands> sizes(
LaunchOp::kNumConfigOperands);		LaunchOp::kNumConfigOperands);
MutableArrayRef<OpAsmParser::OperandType> sizesRef(sizes);		MutableArrayRef<OpAsmParser::OperandType> sizesRef(sizes);

// Actual (data) operands passed to the kernel.		// Actual (data) operands passed to the kernel.
SmallVector<OpAsmParser::OperandType, 4> dataOperands;		SmallVector<OpAsmParser::OperandType, 4> dataOperands;

Show All 30 Lines	return failure(parser.parseRegion(*body, regionArgs, dataTypes) ||
parser.parseOptionalAttrDict(result.attributes));		parser.parseOptionalAttrDict(result.attributes));
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// LaunchFuncOp		// LaunchFuncOp
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

void LaunchFuncOp::build(Builder *builder, OperationState &result,		void LaunchFuncOp::build(Builder *builder, OperationState &result,
GPUFuncOp kernelFunc, Value gridSizeX, Value gridSizeY,		ValueRange chains, GPUFuncOp kernelFunc,
Value gridSizeZ, Value blockSizeX, Value blockSizeY,		Value gridSizeX, Value gridSizeY, Value gridSizeZ,
Value blockSizeZ, ValueRange kernelOperands) {		Value blockSizeX, Value blockSizeY, Value blockSizeZ,
		ValueRange kernelOperands) {
		result.types.push_back(ChainType::get(builder->getContext()));
		result.addOperands(chains);
// Add grid and block sizes as op operands, followed by the data operands.		// Add grid and block sizes as op operands, followed by the data operands.
result.addOperands(		result.addOperands(
{gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ});		{gridSizeX, gridSizeY, gridSizeZ, blockSizeX, blockSizeY, blockSizeZ});
result.addOperands(kernelOperands);		result.addOperands(kernelOperands);
result.addAttribute(getKernelAttrName(),		result.addAttribute(getKernelAttrName(),
builder->getStringAttr(kernelFunc.getName()));		builder->getStringAttr(kernelFunc.getName()));
auto kernelModule = kernelFunc.getParentOfType<GPUModuleOp>();		auto kernelModule = kernelFunc.getParentOfType<GPUModuleOp>();
result.addAttribute(getKernelModuleAttrName(),		result.addAttribute(getKernelModuleAttrName(),
builder->getSymbolRefAttr(kernelModule.getName()));		builder->getSymbolRefAttr(kernelModule.getName()));
		SmallVector<int32_t, 8> operandSegmentSizes(8, 1);
		operandSegmentSizes.front() = static_cast<int32_t>(chains.size());
		operandSegmentSizes.back() = static_cast<int32_t>(kernelOperands.size());
		result.addAttribute(getOperandSegmentSizeAttr(),
		builder->getI32VectorAttr(operandSegmentSizes));
}		}

void LaunchFuncOp::build(Builder *builder, OperationState &result,		void LaunchFuncOp::build(Builder *builder, OperationState &result,
GPUFuncOp kernelFunc, KernelDim3 gridSize,		ValueRange chains, GPUFuncOp kernelFunc,
KernelDim3 blockSize, ValueRange kernelOperands) {		KernelDim3 gridSize, KernelDim3 blockSize,
build(builder, result, kernelFunc, gridSize.x, gridSize.y, gridSize.z,		ValueRange kernelOperands) {
		build(builder, result, chains, kernelFunc, gridSize.x, gridSize.y, gridSize.z,
blockSize.x, blockSize.y, blockSize.z, kernelOperands);		blockSize.x, blockSize.y, blockSize.z, kernelOperands);
}		}

StringRef LaunchFuncOp::kernel() {		StringRef LaunchFuncOp::kernel() {
return getAttrOfType<StringAttr>(getKernelAttrName()).getValue();		return getAttrOfType<StringAttr>(getKernelAttrName()).getValue();
}		}

unsigned LaunchFuncOp::getNumKernelOperands() {		unsigned LaunchFuncOp::getNumKernelOperands() {
return getNumOperands() - kNumConfigOperands;		// TODO(csigg): Use the attribute from AttrSizedOperandSegments?
		return getNumOperands() - getChains().size() - kNumConfigOperands;
}		}

StringRef LaunchFuncOp::getKernelModuleName() {		StringRef LaunchFuncOp::getKernelModuleName() {
return getAttrOfType<SymbolRefAttr>(getKernelModuleAttrName())		return getAttrOfType<SymbolRefAttr>(getKernelModuleAttrName())
.getRootReference();		.getRootReference();
}		}

Value LaunchFuncOp::getKernelOperand(unsigned i) {		Value LaunchFuncOp::getKernelOperand(unsigned i) {
return getOperation()->getOperand(i + kNumConfigOperands);		return getOperand(getChains().size() + kNumConfigOperands + i);
}		}

KernelDim3 LaunchFuncOp::getGridSizeOperandValues() {		KernelDim3 LaunchFuncOp::getGridSizeOperandValues() {
return KernelDim3{getOperand(0), getOperand(1), getOperand(2)};		auto operands = getOperands().drop_front(getChains().size());
		return KernelDim3{operands[0], operands[1], operands[2]};
}		}

KernelDim3 LaunchFuncOp::getBlockSizeOperandValues() {		KernelDim3 LaunchFuncOp::getBlockSizeOperandValues() {
return KernelDim3{getOperand(3), getOperand(4), getOperand(5)};		auto operands = getOperands().drop_front(getChains().size());
		return KernelDim3{operands[3], operands[4], operands[5]};
		}

		OperandRange LaunchFuncOp::getChains() {
		auto num_chains =
		*getAttrOfType<DenseIntElementsAttr>(getOperandSegmentSizeAttr()).begin();
		auto begin = getOperands().begin();
		return {begin, begin + num_chains.getLimitedValue()};
}		}

static LogicalResult verify(LaunchFuncOp op) {		static LogicalResult verify(LaunchFuncOp op) {
auto module = op.getParentOfType<ModuleOp>();		auto module = op.getParentOfType<ModuleOp>();
if (!module)		if (!module)
return op.emitOpError("expected to belong to a module");		return op.emitOpError("expected to belong to a module");

if (!module.getAttrOfType<UnitAttr>(GPUDialect::getContainerModuleAttrName()))		if (!module.getAttrOfType<UnitAttr>(GPUDialect::getContainerModuleAttrName()))
▲ Show 20 Lines • Show All 350 Lines • Show Last 20 Lines

mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp

	Show First 20 Lines • Show All 192 Lines • ▼ Show 20 Lines

	// Replace `gpu.launch` operations with an `gpu.launch_func` operation launching			// Replace `gpu.launch` operations with an `gpu.launch_func` operation launching
	// `kernelFunc`. The kernel func contains the body of the `gpu.launch` with			// `kernelFunc`. The kernel func contains the body of the `gpu.launch` with
	// constant region arguments inlined.			// constant region arguments inlined.
	static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,			static void convertToLaunchFuncOp(gpu::LaunchOp launchOp,
	gpu::GPUFuncOp kernelFunc,			gpu::GPUFuncOp kernelFunc,
	ValueRange operands) {			ValueRange operands) {
	OpBuilder builder(launchOp);			OpBuilder builder(launchOp);
	builder.create<gpu::LaunchFuncOp>(			auto launchFuncOp = builder.create<gpu::LaunchFuncOp>(
	launchOp.getLoc(), kernelFunc, launchOp.getGridSizeOperandValues(),			launchOp.getLoc(), launchOp.getChains(), kernelFunc,
	launchOp.getBlockSizeOperandValues(), operands);			launchOp.getGridSizeOperandValues(), launchOp.getBlockSizeOperandValues(),
				operands);
				launchOp.replaceAllUsesWith(launchFuncOp.getOperation());
	launchOp.erase();			launchOp.erase();
	}			}

	namespace {			namespace {
	/// Pass that moves the kernel of each LaunchOp into its separate nested module.			/// Pass that moves the kernel of each LaunchOp into its separate nested module.
	///			///
	/// This pass moves the kernel code of each LaunchOp into a function created			/// This pass moves the kernel code of each LaunchOp into a function created
	/// inside a nested module. It also creates an external function of the same			/// inside a nested module. It also creates an external function of the same
	▲ Show 20 Lines • Show All 91 Lines • Show Last 20 Lines

mlir/lib/Target/LLVMIR/ModuleTranslation.cpp

Show First 20 Lines • Show All 491 Lines • ▼ Show 20 Lines	LogicalResult ModuleTranslation::convertBlock(Block &bb, bool ignoreArguments) {

return success();		return success();
}		}

/// Create named global variables that correspond to llvm.mlir.global		/// Create named global variables that correspond to llvm.mlir.global
/// definitions.		/// definitions.
LogicalResult ModuleTranslation::convertGlobals() {		LogicalResult ModuleTranslation::convertGlobals() {
// Lock access to the llvm context.		// Lock access to the llvm context.
llvm::sys::SmartScopedLock<true> scopedLock(
llvmDialect->getLLVMContextMutex());
for (auto op : getModuleBody(mlirModule).getOps<LLVM::GlobalOp>()) {		for (auto op : getModuleBody(mlirModule).getOps<LLVM::GlobalOp>()) {
llvm::Type *type = op.getType().getUnderlyingType();		llvm::Type *type = op.getType().getUnderlyingType();
llvm::Constant *cst = llvm::UndefValue::get(type);		llvm::Constant *cst = llvm::UndefValue::get(type);
if (op.getValueOrNull()) {		if (op.getValueOrNull()) {
// String attributes are treated separately because they cannot appear as		// String attributes are treated separately because they cannot appear as
// in-function constants and are thus not supported by getLLVMConstant.		// in-function constants and are thus not supported by getLLVMConstant.
if (auto strAttr = op.getValueOrNull().dyn_cast_or_null<StringAttr>()) {		if (auto strAttr = op.getValueOrNull().dyn_cast_or_null<StringAttr>()) {
cst = llvm::ConstantDataArray::getString(		cst = llvm::ConstantDataArray::getString(
▲ Show 20 Lines • Show All 317 Lines • Show Last 20 Lines

mlir/test/Dialect/GPU/ops.mlir

// RUN: mlir-opt -allow-unregistered-dialect %s | FileCheck %s		// RUN: mlir-opt -allow-unregistered-dialect %s | FileCheck %s --dump-input-on-failure

module attributes {gpu.container_module} {		module attributes {gpu.container_module} {

// CHECK-LABEL:func @no_args(%{{.*}}: index)		// CHECK-LABEL:func @no_args(%{{.*}}: index)
func @no_args(%sz : index) {		func @no_args(%sz : index) {
// CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})		// CHECK: gpu.launch blocks(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}) threads(%{{.*}}, %{{.*}}, %{{.*}}) in (%{{.*}} = %{{.*}}, %{{.*}} = %{{.*}}, %{{.*}} = %{{.*}})
gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)		gpu.launch blocks(%bx, %by, %bz) in (%grid_x = %sz, %grid_y = %sz, %grid_z = %sz)
threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) {		threads(%tx, %ty, %tz) in (%block_x = %sz, %block_y = %sz, %block_z = %sz) {
▲ Show 20 Lines • Show All 68 Lines • ▼ Show 20 Lines	func @foo() {
// CHECK: "gpu.launch_func"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {kernel = "kernel_2", kernel_module = @kernels} : (index, index, index, index, index, index, f32, memref<?xf32, 1>) -> ()		// CHECK: "gpu.launch_func"(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}) {kernel = "kernel_2", kernel_module = @kernels} : (index, index, index, index, index, index, f32, memref<?xf32, 1>) -> ()
"gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1)		"gpu.launch_func"(%cst, %cst, %cst, %cst, %cst, %cst, %0, %1)
{ kernel = "kernel_2", kernel_module = @kernels }		{ kernel = "kernel_2", kernel_module = @kernels }
: (index, index, index, index, index, index, f32, memref<?xf32, 1>) -> ()		: (index, index, index, index, index, index, f32, memref<?xf32, 1>) -> ()

return		return
}		}

		// Round-trips the three textual forms of `gpu.alloc` used in this test:
		// without a chain list, with an empty chain list, and with one chain operand.
		func @async() {
		  %size = constant 32 : index
		  // No chain list: the op has a single (memref) result.
		  // CHECK: %{{.*}} = gpu.alloc (%{{.*}}) : memref<?xf32>
		  %b0 = gpu.alloc (%size) : memref<?xf32>
		  // Empty chain list: the op has a memref result and a chain result.
		  // CHECK: %{{.*}}, %{{.*}} = gpu.alloc[] (%{{.*}}) : memref<?xf32>
		  %b1, %t1 = gpu.alloc[] (%size) : memref<?xf32>
		  // One chain operand, taken from the previous allocation.
		  // CHECK: %{{.*}}, %{{.*}} = gpu.alloc[%{{.*}}] (%{{.*}}) : memref<?xf32>
		  %b2, %t2 = gpu.alloc[%t1] (%size) : memref<?xf32>

		  return
		}

module @gpu_funcs attributes {gpu.kernel_module} {		module @gpu_funcs attributes {gpu.kernel_module} {
// CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32)		// CHECK-LABEL: gpu.func @kernel_1({{.*}}: f32)
// CHECK: workgroup		// CHECK: workgroup
// CHECK: private		// CHECK: private
// CHECK: attributes		// CHECK: attributes
gpu.func @kernel_1(%arg0: f32)		gpu.func @kernel_1(%arg0: f32)
workgroup(%arg1: memref<42xf32, 3>)		workgroup(%arg1: memref<42xf32, 3>)
private(%arg2: memref<2xf32, 5>, %arg3: memref<1xf32, 5>)		private(%arg2: memref<2xf32, 5>, %arg3: memref<1xf32, 5>)
▲ Show 20 Lines • Show All 42 Lines • Show Last 20 Lines

mlir/test/mlir-cuda-runner/async.mlir

This file was added.

				// RUN: mlir-cuda-runner %s --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s

				// Chains async GPU ops into a fork/join pattern: two independent init
				// kernels, one kernel that depends on both, two copies that depend on that
				// kernel, and a final wait on both copies.
				func @main() {
				  %one = constant 1 : index
				  %count = constant 4 : index
				  // b0 = iota (init)
				  %b0 = alloc(%count) : memref<?xi32, 1>
				  %t1 = gpu.launch[] blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
				                     threads(%tx, %ty, %tz) in (%block_x = %count, %block_y = %one, %block_z = %one) {
				    %val = index_cast %tx : index to i32
				    store %val, %b0[%tx] : memref<?xi32, 1>
				    gpu.terminator
				  }
				  // b1 = iota (init)
				  %b1 = alloc(%count) : memref<?xi32, 1>
				  %t2 = gpu.launch[] blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
				                     threads(%tx, %ty, %tz) in (%block_x = %count, %block_y = %one, %block_z = %one) {
				    %val = index_cast %tx : index to i32
				    store %val, %b1[%tx] : memref<?xi32, 1>
				    gpu.terminator
				  }
				  // b2 = b0 + b1 (join)
				  %b2 = alloc(%count) : memref<?xi32, 1>
				  %t3 = gpu.launch[%t1, %t2] blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
				                             threads(%tx, %ty, %tz) in (%block_x = %count, %block_y = %one, %block_z = %one) {
				    %v0 = load %b0[%tx] : memref<?xi32, 1>
				    %v1 = load %b1[%tx] : memref<?xi32, 1>
				    %sum = addi %v0, %v1 : i32
				    store %sum, %b2[%tx] : memref<?xi32, 1>
				    gpu.terminator
				  }
				  // copy b2 to h0 and h1 (fork)
				  %h0 = alloc(%count) : memref<?xi32>
				  %h1 = alloc(%count) : memref<?xi32>
				  %g0 = memref_cast %h0 : memref<?xi32> to memref<*xi32>
				  %g1 = memref_cast %h1 : memref<?xi32> to memref<*xi32>
				  %c2 = memref_cast %b2 : memref<?xi32, 1> to memref<*xi32, 1>
				  // The operand types are the unranked types produced by the casts above.
				  %t4 = gpu.memcpy[%t3] (%g0, %c2) { element_size = 4 } : memref<*xi32>, memref<*xi32, 1>
				  %t5 = gpu.memcpy[%t3] (%g1, %c2) { element_size = 4 } : memref<*xi32>, memref<*xi32, 1>
				  // wait for copies to complete (sync)
				  gpu.wait[%t4, %t5]
				  // print result: each element is tx + tx for tx in [0, 4).
				  // CHECK: [0, 2, 4, 6]
				  call @print_memref_i32(%g0) : (memref<*xi32>) -> ()
				  // CHECK: [0, 2, 4, 6]
				  call @print_memref_i32(%g1) : (memref<*xi32>) -> ()
				  return
				}

				func @print_memref_i32(memref<*xi32>)

mlir/test/mlir-cuda-runner/simple.mlir

This file was added.

				// RUN: mlir-cuda-runner %s --shared-libs=%cuda_wrapper_library_dir/libcuda-runtime-wrappers%shlibext,%linalg_test_lib_dir/libmlir_runner_utils%shlibext --entry-point-result=void | FileCheck %s

				// Minimal async chain: one init kernel, one copy that depends on it, and a
				// wait on the copy before the host buffer is printed.
				func @main() {
				  %one = constant 1 : index
				  %count = constant 4 : index
				  // b0 = iota (init)
				  %b0 = alloc(%count) : memref<?xi32, 1>
				  %t1 = gpu.launch[] blocks(%bx, %by, %bz) in (%grid_x = %one, %grid_y = %one, %grid_z = %one)
				                     threads(%tx, %ty, %tz) in (%block_x = %count, %block_y = %one, %block_z = %one) {
				    %val = index_cast %tx : index to i32
				    store %val, %b0[%tx] : memref<?xi32, 1>
				    gpu.terminator
				  }
				  // copy b0 to h0
				  %h0 = alloc(%count) : memref<?xi32>
				  %g0 = memref_cast %h0 : memref<?xi32> to memref<*xi32>
				  %c0 = memref_cast %b0 : memref<?xi32, 1> to memref<*xi32, 1>
				  // The operand types are the unranked types produced by the casts above.
				  %t2 = gpu.memcpy[%t1] (%g0, %c0) { element_size = 4 } : memref<*xi32>, memref<*xi32, 1>
				  // wait for copy to complete (sync)
				  gpu.wait[%t2]
				  // print result
				  // CHECK: [0, 1, 2, 3]
				  call @print_memref_i32(%g0) : (memref<*xi32>) -> ()
				  return
				}

				func @print_memref_i32(memref<*xi32>)

mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp

	Show All 15 Lines
	#include <numeric>			#include <numeric>

	#include "llvm/ADT/ArrayRef.h"			#include "llvm/ADT/ArrayRef.h"
	#include "llvm/Support/raw_ostream.h"			#include "llvm/Support/raw_ostream.h"

	#include "cuda.h"			#include "cuda.h"

	namespace {			namespace {
	int32_t reportErrorIfAny(CUresult result, const char *where) {			CUresult reportErrorIfAny(CUresult result, const char *where) {
	if (result != CUDA_SUCCESS) {			if (result != CUDA_SUCCESS) {
	llvm::errs() << "CUDA failed with " << result << " in " << where << "\n";			llvm::errs() << "CUDA failed with " << result << " in " << where << "\n";
	}			}
	return result;			return result;
	}			}
	} // anonymous namespace			} // anonymous namespace

	extern "C" int32_t mcuModuleLoad(void **module, void *data) {			extern "C" int32_t mcuModuleLoad(void **module, void *data) {
	Show All 27 Lines
	}			}

	extern "C" void *mcuGetStreamHelper() {			extern "C" void *mcuGetStreamHelper() {
	CUstream stream;			CUstream stream;
	reportErrorIfAny(cuStreamCreate(&stream, CU_STREAM_DEFAULT), "StreamCreate");			reportErrorIfAny(cuStreamCreate(&stream, CU_STREAM_DEFAULT), "StreamCreate");
	return stream;			return stream;
	}			}

				/// Returns the single CUDA graph shared by all callers; it is created by the
				/// first call. A creation failure is only reported through reportErrorIfAny,
				/// it is not propagated to the caller.
				extern "C" CUgraph mcuGetGraphHelper() {
				  static const CUgraph sharedGraph = []() -> CUgraph {
				    CUgraph created = nullptr;
				    reportErrorIfAny(cuGraphCreate(&created, 0), "cuGraphCreate");
				    return created;
				  }();
				  return sharedGraph;
				}
				/// Instantiates `graph`, launches the resulting executable graph and destroys
				/// the executable graph again.
				///
				/// Returns the first error encountered: the instantiation error (in which
				/// case nothing is launched), otherwise the launch error, otherwise the
				/// result of destroying the executable graph. Every failure is also reported
				/// through reportErrorIfAny.
				extern "C" CUresult mcuGraphExecute(CUgraph graph) {
				  CUgraphExec exec = nullptr;
				  const CUresult instantiateStatus = reportErrorIfAny(
				      cuGraphInstantiate(&exec, graph, /*phErrorNode=*/nullptr,
				                         /*logBuffer=*/nullptr, /*bufferSize=*/0),
				      "cuGraphInstantiate");
				  // Without an executable graph there is nothing to launch or destroy.
				  if (instantiateStatus != CUDA_SUCCESS)
				    return instantiateStatus;

				  const CUresult launchStatus = reportErrorIfAny(
				      cuGraphLaunch(exec, /*hStream=*/nullptr), "cuGraphLaunch");
				  // Always release the executable graph, but do not let a successful
				  // destroy hide a failed launch.
				  const CUresult destroyStatus =
				      reportErrorIfAny(cuGraphExecDestroy(exec), "cuGraphExecDestroy");
				  return launchStatus != CUDA_SUCCESS ? launchStatus : destroyStatus;
				}
				/// Forwards all arguments to cuGraphAddKernelNode and returns its result
				/// after passing it through reportErrorIfAny.
				extern "C" CUresult mcuGraphAddKernelNode(CUgraphNode *node, CUgraph graph,
				                                          const CUgraphNode *deps,
				                                          int32_t num_deps,
				                                          CUDA_KERNEL_NODE_PARAMS *params) {
				  const CUresult status =
				      cuGraphAddKernelNode(node, graph, deps, num_deps, params);
				  return reportErrorIfAny(status, "cuGraphAddKernelNode");
				}

				/// Descriptor of a memref as it is handed to mcuGraphAddMemcpyNode: two
				/// pointers, an offset and a fixed-capacity array of sizes.
				/// NOTE(review): presumably mirrors the memref descriptor produced by the LLVM
				/// lowering (which would also carry strides) — confirm the layout against the
				/// caller.
				struct StridedMemRefDesc {
				void *basePtr;
				// Address used as the source/destination of the copy in
				// mcuGraphAddMemcpyNode.
				CUdeviceptr data;
				// NOTE(review): not read by mcuGraphAddMemcpyNode.
				int64_t offset;
				// mcuGraphAddMemcpyNode multiplies the first `rank` entries to get the element
				// count; the array holds at most 4 of them.
				int64_t sizes[4];
				};

				/// Adds a memcpy node to `graph` that copies the contents described by
				/// `src_desc` to the location described by `dst_desc` as one contiguous run of
				/// bytes (height and depth of the copy are 1).
				///
				/// \param node               receives the created graph node.
				/// \param graph              graph the node is added to.
				/// \param deps               `num_deps` nodes the new node depends on.
				/// \param dst_rank,src_rank  number of valid entries in the respective
				///                           descriptor's `sizes` array.
				/// \param element_size_bytes size of one element; the byte count of the copy
				///                           is the source element count times this value.
				/// \return the first CUDA error encountered (also reported through
				///         reportErrorIfAny), or the result of cuGraphAddMemcpyNode.
				extern "C" CUresult
				mcuGraphAddMemcpyNode(CUgraphNode *node, CUgraph graph, const CUgraphNode *deps,
				                      int32_t num_deps, int64_t dst_rank,
				                      StridedMemRefDesc *dst_desc, int64_t src_rank,
				                      StridedMemRefDesc *src_desc, size_t element_size_bytes) {
				  CUcontext ctx = nullptr;
				  const CUresult ctxStatus =
				      reportErrorIfAny(cuCtxGetCurrent(&ctx), "cuCtxGetCurrent");
				  // Do not add a node with a context handle that was never written.
				  if (ctxStatus != CUDA_SUCCESS)
				    return ctxStatus;

				  // Element count = product of the first `rank` sizes. The initial value is
				  // spelled as int64_t so the running product is not truncated to `int`.
				  auto get_size = [](int64_t rank, const StridedMemRefDesc *desc) {
				    constexpr int64_t kMaxRank =
				        sizeof(desc->sizes) / sizeof(desc->sizes[0]);
				    assert(rank >= 0 && rank <= kMaxRank && "rank exceeds descriptor capacity");
				    (void)kMaxRank; // Only used by the assert above.
				    const int64_t *begin = desc->sizes;
				    return std::accumulate(begin, begin + rank, int64_t{1},
				                           std::multiplies<int64_t>());
				  };
				  const int64_t src_size = get_size(src_rank, src_desc);
				  assert(src_size == get_size(dst_rank, dst_desc) &&
				         "source and destination element counts differ");

				  CUDA_MEMCPY3D copy_params = {};
				  copy_params.srcMemoryType = CU_MEMORYTYPE_UNIFIED;
				  copy_params.srcDevice = src_desc->data;
				  copy_params.dstMemoryType = CU_MEMORYTYPE_UNIFIED;
				  copy_params.dstDevice = dst_desc->data;
				  copy_params.WidthInBytes = src_size * element_size_bytes;
				  copy_params.Height = copy_params.Depth = 1;

				  return reportErrorIfAny(
				      cuGraphAddMemcpyNode(node, graph, deps, num_deps, &copy_params, ctx),
				      "cuGraphAddMemcpyNode");
				}

				extern "C" void *mcuMalloc(size_t size) {
				void *ptr;
				reportErrorIfAny(cuMemAlloc(reinterpret_cast<CUdeviceptr *>(&ptr), size),
				"cuMemAlloc");
				return ptr;
				}

	extern "C" int32_t mcuStreamSynchronize(void *stream) {			extern "C" int32_t mcuStreamSynchronize(void *stream) {
	return reportErrorIfAny(			return reportErrorIfAny(
	cuStreamSynchronize(reinterpret_cast<CUstream>(stream)), "StreamSync");			cuStreamSynchronize(reinterpret_cast<CUstream>(stream)), "StreamSync");
	}			}

	/// Helper functions for writing mlir example code			/// Helper functions for writing mlir example code

	// Allows to register byte array with the CUDA runtime. Helpful until we have			// Allows to register byte array with the CUDA runtime. Helpful until we have
	▲ Show 20 Lines • Show All 86 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[MLIR][Prototype] async gpu opsAbandonedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 259290

mlir/include/mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h

mlir/include/mlir/Dialect/GPU/GPUDialect.h

mlir/include/mlir/Dialect/GPU/GPUOps.td

mlir/include/mlir/IR/DialectSymbolRegistry.def

mlir/lib/Conversion/GPUToCUDA/ConvertKernelFuncToCubin.cpp

mlir/lib/Conversion/GPUToCUDA/ConvertLaunchFuncToCudaCalls.cpp

mlir/lib/Conversion/StandardToLLVM/StandardToLLVM.cpp

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

mlir/lib/Dialect/GPU/Transforms/KernelOutlining.cpp

mlir/lib/Target/LLVMIR/ModuleTranslation.cpp

mlir/test/Dialect/GPU/ops.mlir

mlir/test/mlir-cuda-runner/async.mlir

mlir/test/mlir-cuda-runner/simple.mlir

mlir/tools/mlir-cuda-runner/cuda-runtime-wrappers.cpp

[MLIR][Prototype] async gpu ops
AbandonedPublic