Diff 521108

mlir/include/mlir/Dialect/GPU/IR/GPUBase.td

	Show First 20 Lines • Show All 103 Lines • ▼ Show 20 Lines
	// Memref type acceptable to gpu.subgroup_mma_{load\|store}_matrix ops.			// Memref type acceptable to gpu.subgroup_mma_{load\|store}_matrix ops.
	def GPU_MMAMemRef : MemRefOf<[I8, I32, F16, F32, VectorOfRankAndType<[1], [I8, I32, F16, F32]>]>;			def GPU_MMAMemRef : MemRefOf<[I8, I32, F16, F32, VectorOfRankAndType<[1], [I8, I32, F16, F32]>]>;

	class MMAMatrixOf<list<Type> allowedTypes> :			class MMAMatrixOf<list<Type> allowedTypes> :
	ContainerType<AnyTypeOf<allowedTypes>, IsMMAMatrixTypePred,			ContainerType<AnyTypeOf<allowedTypes>, IsMMAMatrixTypePred,
	"$_self.cast<::mlir::gpu::MMAMatrixType>().getElementType()",			"$_self.cast<::mlir::gpu::MMAMatrixType>().getElementType()",
	"gpu.mma_matrix", "::mlir::gpu::MMAMatrixType">;			"gpu.mma_matrix", "::mlir::gpu::MMAMatrixType">;

				// Generic type for all sparse handles (could be refined).
				// Matches any value whose type is `::mlir::gpu::SparseHandleType`. The same
				// constraint is used for environment, dense vector and sparse matrix
				// handles, so it does not distinguish between them. Being a BuildableType,
				// the type is constructed from the builder's context alone.
				def GPU_SparseHandle : DialectType<
				GPU_Dialect, CPred<"$_self.isa<::mlir::gpu::SparseHandleType>()">, "sparse handle type">,
				BuildableType<"mlir::gpu::SparseHandleType::get($_builder.getContext())">;

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// GPU Interfaces.			// GPU Interfaces.
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	def GPU_AsyncOpInterface : OpInterface<"AsyncOpInterface"> {			def GPU_AsyncOpInterface : OpInterface<"AsyncOpInterface"> {
	let description = [{			let description = [{
	Interface for GPU operations that execute asynchronously on the device.			Interface for GPU operations that execute asynchronously on the device.

	▲ Show 20 Lines • Show All 49 Lines • Show Last 20 Lines

mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h

Show First 20 Lines • Show All 157 Lines • ▼ Show 20 Lines	public:
/// C += A*B. This function returns which operand in the given equation is		/// C += A*B. This function returns which operand in the given equation is
/// held by this type. String returned can be one of"AOp", "BOp" and "COp".		/// held by this type. String returned can be one of"AOp", "BOp" and "COp".
StringRef getOperand() const;		StringRef getOperand() const;
};		};

// Adds a `gpu.async.token` to the front of the argument list.		// Adds a `gpu.async.token` to the front of the argument list.
void addAsyncDependency(Operation *op, Value token);		void addAsyncDependency(Operation *op, Value token);

		// Represents any sparse handle.
		// The type is declared with the plain `TypeStorage`, i.e. it adds no
		// parameters of its own; a single handle type therefore stands for every
		// kind of sparse object (environment, dense vector, sparse matrix).
		class SparseHandleType
		: public Type::TypeBase<SparseHandleType, Type, TypeStorage> {
		public:
		// Used for generic hooks in TypeBase.
		using Base::Base;
		};

} // namespace gpu		} // namespace gpu
} // namespace mlir		} // namespace mlir

#include "mlir/Dialect/GPU/IR/GPUOpsEnums.h.inc"		#include "mlir/Dialect/GPU/IR/GPUOpsEnums.h.inc"

#include "mlir/Dialect/GPU/IR/GPUOpsDialect.h.inc"		#include "mlir/Dialect/GPU/IR/GPUOpsDialect.h.inc"

#include "mlir/Dialect/GPU/IR/GPUOpInterfaces.h.inc"		#include "mlir/Dialect/GPU/IR/GPUOpInterfaces.h.inc"
Show All 10 Lines

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

Show First 20 Lines • Show All 1,527 Lines • ▼ Show 20 Lines	let extraClassDeclaration = [{
}		}
}];		}];

let assemblyFormat = [{		let assemblyFormat = [{
$opType $args attr-dict `:` functional-type($args, $res)		$opType $args attr-dict `:` functional-type($args, $res)
}];		}];
}		}

		//
		// Operations on sparse matrices, called from the host
		// (currently lowered to cuSPARSE for CUDA only; there is no ROCm lowering).
		//

		// Creates the sparse environment handle. The op takes only the async
		// dependencies as operands and yields the handle, plus an optional async
		// token; neither result type is spelled out in the custom assembly.
		def GPU_CreateSparseEnvOp : GPU_Op<"create_sparse_env", [GPU_AsyncOpInterface]> {
		let summary = "Create sparse environment operation";
		let description = [{
		The `gpu.create_sparse_env` operation initializes a sparse environment.
		It must be executed prior to any other sparse operation. The operation
		returns a handle to the new sparse environment.

		If the `async` keyword is present, the op is executed asynchronously (i.e.
		it does not block until the execution has finished on the device). In
		that case, it returns a !gpu.async.token in addition to the environment.

		Example:

		```mlir
		%env, %token = gpu.create_sparse_env async [%dep]
		```
		}];

		let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);
		let results = (outs Res<GPU_SparseHandle>:$env, Optional<GPU_AsyncToken>:$asyncToken);

		let assemblyFormat = [{
		custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict
		}];
		}

		// Releases a sparse environment handle. The only result is the optional
		// async token.
		def GPU_DestroySparseEnvOp : GPU_Op<"destroy_sparse_env", [GPU_AsyncOpInterface]> {
		  let summary = "Destroy sparse environment operation";
		  let description = [{
		    The `gpu.destroy_sparse_env` operation releases all resources of a sparse
		    environment represented by a handle that was previously created by a
		    `gpu.create_sparse_env` operation.

		    If the `async` keyword is present, the op is executed asynchronously (i.e.
		    it does not block until the execution has finished on the device). In
		    that case, it returns a !gpu.async.token.

		    Example:

		    ```mlir
		    %token = gpu.destroy_sparse_env async [%dep] %env
		    ```
		  }];

		  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
		                       Arg<GPU_SparseHandle>:$env);
		  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);

		  let assemblyFormat = [{
		    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) $env attr-dict
		  }];
		}

		// Wraps a device buffer and its size into a dense vector handle. Only the
		// memref type is printed in the custom assembly.
		def GPU_CreateDnVecOp : GPU_Op<"create_dn_vec", [GPU_AsyncOpInterface]> {
		  let summary = "Create dense vector operation";
		  let description = [{
		    The `gpu.create_dn_vec` operation initializes a dense vector from
		    the given values buffer and size. The buffer must already be copied
		    from the host to the device prior to using this operation. The
		    operation returns a handle to the dense vector descriptor.

		    If the `async` keyword is present, the op is executed asynchronously (i.e.
		    it does not block until the execution has finished on the device). In
		    that case, it returns a !gpu.async.token in addition to the dense
		    vector handle.

		    Example:

		    ```mlir
		    %dvec, %token = gpu.create_dn_vec async [%dep] %mem, %size : memref<?xf64>
		    ```
		  }];

		  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
		                       AnyMemRef:$memref, Index:$size);
		  let results = (outs Res<GPU_SparseHandle>:$dvec,
		                      Optional<GPU_AsyncToken>:$asyncToken);

		  let assemblyFormat = [{
		    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
		    $memref `,` $size attr-dict `:` type($memref)
		  }];
		}

		// Releases a dense vector handle. The only result is the optional async
		// token.
		def GPU_DestroyDnVecOp : GPU_Op<"destroy_dn_vec", [GPU_AsyncOpInterface]> {
		  let summary = "Destroy dense vector operation";
		  let description = [{
		    The `gpu.destroy_dn_vec` operation releases all resources of a dense
		    vector represented by a handle that was previously created by a
		    `gpu.create_dn_vec` operation.

		    If the `async` keyword is present, the op is executed asynchronously (i.e.
		    it does not block until the execution has finished on the device). In
		    that case, it returns a !gpu.async.token.

		    Example:

		    ```mlir
		    %token = gpu.destroy_dn_vec async [%dep] %dvec
		    ```
		  }];

		  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
		                       Arg<GPU_SparseHandle>:$dvec);
		  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);

		  let assemblyFormat = [{
		    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) $dvec attr-dict
		  }];
		}

		// Builds a COO sparse matrix handle from three sizes and three device
		// buffers. Only the three buffer types are printed in the custom assembly.
		def GPU_CreateCooOp : GPU_Op<"create_coo", [GPU_AsyncOpInterface]> {
		  let summary = "Create sparse matrix in COO format operation";
		  let description = [{
		    The `gpu.create_coo` operation initializes a sparse matrix in COO format
		    with the given sizes from the given index and values buffers. The buffers
		    must already be copied from the host to the device prior to using this
		    operation. The operation returns a handle to the sparse matrix descriptor.

		    If the `async` keyword is present, the op is executed asynchronously (i.e.
		    it does not block until the execution has finished on the device). In
		    that case, it returns a !gpu.async.token in addition to the sparse
		    matrix handle.

		    Example:

		    ```mlir
		    %spmat, %token = gpu.create_coo async [%dep] %rows, %cols, %nnz, %rowIdx,
		        %colIdx, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
		    ```
		  }];

		  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
		                       Index:$rows,
		                       Index:$cols,
		                       Index:$nnz,
		                       AnyMemRef:$rowIdxs,
		                       AnyMemRef:$colIdxs,
		                       AnyMemRef:$values);
		  let results = (outs Res<GPU_SparseHandle>:$spmat,
		                      Optional<GPU_AsyncToken>:$asyncToken);

		  let assemblyFormat = [{
		    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
		    $rows `,` $cols `,` $nnz `,` $rowIdxs `,` $colIdxs `,` $values attr-dict
		    `:` type($rowIdxs) `,` type($colIdxs) `,` type($values)
		  }];
		}

		// Builds a CSR sparse matrix handle from three sizes and three device
		// buffers (row positions, column indices, values). Only the three buffer
		// types are printed in the custom assembly.
		def GPU_CreateCsrOp : GPU_Op<"create_csr", [GPU_AsyncOpInterface]> {
		  let summary = "Create sparse matrix in CSR format operation";
		  let description = [{
		    The `gpu.create_csr` operation initializes a sparse matrix in CSR format
		    with the given sizes from the given position, index, and values buffers.
		    The buffers must already be copied from the host to the device prior to
		    using this operation. The operation returns a handle to the sparse
		    matrix descriptor.

		    If the `async` keyword is present, the op is executed asynchronously (i.e.
		    it does not block until the execution has finished on the device). In
		    that case, it returns a !gpu.async.token in addition to the sparse
		    matrix handle.

		    Example:

		    ```mlir
		    %spmat, %token = gpu.create_csr async [%dep] %rows, %cols, %nnz, %rowPos,
		        %colIdx, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
		    ```
		  }];

		  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
		                       Index:$rows,
		                       Index:$cols,
		                       Index:$nnz,
		                       AnyMemRef:$rowPos,
		                       AnyMemRef:$colIdxs,
		                       AnyMemRef:$values);
		  let results = (outs Res<GPU_SparseHandle>:$spmat,
		                      Optional<GPU_AsyncToken>:$asyncToken);

		  let assemblyFormat = [{
		    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
		    $rows `,` $cols `,` $nnz `,` $rowPos `,` $colIdxs `,` $values attr-dict
		    `:` type($rowPos) `,` type($colIdxs) `,` type($values)
		  }];
		}

		// Releases a sparse matrix handle regardless of the format it was created
		// in. The only result is the optional async token.
		def GPU_DestroySpMatOp : GPU_Op<"destroy_sp_mat", [GPU_AsyncOpInterface]> {
		  let summary = "Destroy sparse matrix operation";
		  let description = [{
		    The `gpu.destroy_sp_mat` operation releases all resources of a sparse
		    matrix represented by a handle that was previously created by
		    one of the sparse matrix creation operations.

		    If the `async` keyword is present, the op is executed asynchronously (i.e.
		    it does not block until the execution has finished on the device). In
		    that case, it returns a !gpu.async.token.

		    Example:

		    ```mlir
		    %token = gpu.destroy_sp_mat async [%dep] %spmat
		    ```
		  }];

		  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
		                       Arg<GPU_SparseHandle>:$spmat);
		  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);

		  let assemblyFormat = [{
		    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) $spmat attr-dict
		  }];
		}

		// Queries the workspace size needed by a subsequent `gpu.spmv` on the same
		// handles. The size is returned as an `index`.
		def GPU_SpMVBufferSizeOp : GPU_Op<"spmv_buffer_size", [GPU_AsyncOpInterface]> {
		  let summary = "Precompute buffersize for SpMV operation";
		  let description = [{
		    The `gpu.spmv_buffer_size` operation returns the buffer size required
		    to perform the SpMV operation on the given sparse matrix and dense vectors.
		    The operation expects handles returned by previous sparse operations
		    to construct an environment and the operands for SpMV.

		    If the `async` keyword is present, the op is executed asynchronously (i.e.
		    it does not block until the execution has finished on the device). In
		    that case, it returns a !gpu.async.token in addition to the buffer size.

		    Example:

		    ```mlir
		    %buffersz, %token = gpu.spmv_buffer_size async [%dep] %env, %spmatA, %dnX, %dnY
		    ```
		  }];

		  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
		                       GPU_SparseHandle:$env,
		                       GPU_SparseHandle:$spmatA,
		                       GPU_SparseHandle:$dnX,
		                       GPU_SparseHandle:$dnY);
		  let results = (outs Res<Index>:$bufferSz,
		                      Optional<GPU_AsyncToken>:$asyncToken);

		  let assemblyFormat = [{
		    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
		    $env `,` $spmatA `,` $dnX `,` $dnY attr-dict
		  }];
		}

		// Performs SpMV on previously created handles using a caller-provided
		// device buffer. The only result is the optional async token, and only the
		// buffer type is printed in the custom assembly.
		def GPU_SpMVOp : GPU_Op<"spmv", [GPU_AsyncOpInterface]> {
		  let summary = "SpMV operation";
		  let description = [{
		    The `gpu.spmv` operation performs the SpMV operation on the given sparse matrix,
		    dense vectors, and buffer. The operation expects handles returned by previous
		    sparse operations to construct an environment and the operands for SpMV. The
		    buffer must have been allocated on the device.

		    If the `async` keyword is present, the op is executed asynchronously (i.e.
		    it does not block until the execution has finished on the device). In
		    that case, it returns a !gpu.async.token.

		    Example:

		    ```mlir
		    %token = gpu.spmv async [%dep] %env, %spmatA, %dnX, %dnY, %buffer : memref<?xf64>
		    ```
		  }];

		  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
		                       GPU_SparseHandle:$env,
		                       GPU_SparseHandle:$spmatA,
		                       GPU_SparseHandle:$dnX,
		                       GPU_SparseHandle:$dnY,
		                       AnyMemRef:$buffer);
		  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);

		  let assemblyFormat = [{
		    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
		    $env `,` $spmatA `,` $dnX `,` $dnY `,` $buffer attr-dict `:` type($buffer)
		  }];
		}

#endif // GPU_OPS		#endif // GPU_OPS

mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp

Show First 20 Lines • Show All 186 Lines • ▼ Show 20 Lines	FunctionCallBuilder memsetCallBuilder = {
llvmVoidType,		llvmVoidType,
{llvmPointerType /* void dst /, llvmInt32Type /* unsigned int value */,		{llvmPointerType /* void dst /, llvmInt32Type /* unsigned int value */,
llvmIntPtrType /* intptr_t sizeBytes */,		llvmIntPtrType /* intptr_t sizeBytes */,
llvmPointerType /* void stream /}};		llvmPointerType /* void stream /}};
FunctionCallBuilder setDefaultDeviceCallBuilder = {		FunctionCallBuilder setDefaultDeviceCallBuilder = {
"mgpuSetDefaultDevice",		"mgpuSetDefaultDevice",
llvmVoidType,		llvmVoidType,
{llvmInt32Type /* uint32_t devIndex */}};		{llvmInt32Type /* uint32_t devIndex */}};
		// Call builders for the sparse runtime wrappers. Each entry lists the
		// wrapper's symbol name, its LLVM result type, and its LLVM argument types
		// in the order in which the conversion patterns pass them. Every wrapper
		// takes the stream as its last argument.
		FunctionCallBuilder createSparseEnvCallBuilder = {
		    "mgpuCreateSparseEnv",
		    llvmPointerType /* env handle */,
		    {llvmPointerType /* void *stream */}};
		FunctionCallBuilder destroySparseEnvCallBuilder = {
		    "mgpuDestroySparseEnv",
		    llvmVoidType,
		    {llvmPointerType /* env handle */,
		     llvmPointerType /* void *stream */}};
		FunctionCallBuilder createDnVecCallBuilder = {
		    "mgpuCreateDnVec",
		    llvmPointerType /* dense vector handle */,
		    {llvmIntPtrType /* size */, llvmPointerType /* values buffer */,
		     llvmInt32Type /* value bit width */,
		     llvmPointerType /* void *stream */}};
		FunctionCallBuilder destroyDnVecCallBuilder = {
		    "mgpuDestroyDnVec",
		    llvmVoidType,
		    {llvmPointerType /* dense vector handle */,
		     llvmPointerType /* void *stream */}};
		FunctionCallBuilder createCooCallBuilder = {
		    "mgpuCreateCoo",
		    llvmPointerType /* sparse matrix handle */,
		    {llvmIntPtrType /* rows */, llvmIntPtrType /* cols */,
		     llvmIntPtrType /* nnz */, llvmPointerType /* row indices buffer */,
		     llvmPointerType /* column indices buffer */,
		     llvmPointerType /* values buffer */,
		     llvmInt32Type /* index bit width */,
		     llvmInt32Type /* value bit width */,
		     llvmPointerType /* void *stream */}};
		FunctionCallBuilder createCsrCallBuilder = {
		    "mgpuCreateCsr",
		    llvmPointerType /* sparse matrix handle */,
		    {llvmIntPtrType /* rows */, llvmIntPtrType /* cols */,
		     llvmIntPtrType /* nnz */, llvmPointerType /* row positions buffer */,
		     llvmPointerType /* column indices buffer */,
		     llvmPointerType /* values buffer */,
		     llvmInt32Type /* position bit width */,
		     llvmInt32Type /* index bit width */,
		     llvmInt32Type /* value bit width */,
		     llvmPointerType /* void *stream */}};
		FunctionCallBuilder destroySpMatCallBuilder = {
		    "mgpuDestroySpMat",
		    llvmVoidType,
		    {llvmPointerType /* sparse matrix handle */,
		     llvmPointerType /* void *stream */}};
		FunctionCallBuilder spMVBufferSizeCallBuilder = {
		    "mgpuSpMVBufferSize",
		    llvmIntPtrType /* buffer size */,
		    {llvmPointerType /* env handle */, llvmPointerType /* spmatA handle */,
		     llvmPointerType /* dnX handle */, llvmPointerType /* dnY handle */,
		     llvmPointerType /* void *stream */}};
		FunctionCallBuilder spMVCallBuilder = {
		    "mgpuSpMV",
		    llvmVoidType,
		    {llvmPointerType /* env handle */, llvmPointerType /* spmatA handle */,
		     llvmPointerType /* dnX handle */, llvmPointerType /* dnY handle */,
		     llvmPointerType /* buffer */, llvmPointerType /* void *stream */}};
};		};

/// A rewrite pattern to convert gpu.host_register operations into a GPU runtime		/// A rewrite pattern to convert gpu.host_register operations into a GPU runtime
/// call. Currently it supports CUDA and ROCm (HIP).		/// call. Currently it supports CUDA and ROCm (HIP).
class ConvertHostRegisterOpToGpuRuntimeCallPattern		class ConvertHostRegisterOpToGpuRuntimeCallPattern
: public ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp> {		: public ConvertOpToGpuRuntimeCallPattern<gpu::HostRegisterOp> {
public:		public:
ConvertHostRegisterOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)		ConvertHostRegisterOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
▲ Show 20 Lines • Show All 173 Lines • ▼ Show 20 Lines	ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern(
LLVMTypeConverter &typeConverter)		LLVMTypeConverter &typeConverter)
: ConvertOpToGpuRuntimeCallPattern<gpu::SetDefaultDeviceOp>(		: ConvertOpToGpuRuntimeCallPattern<gpu::SetDefaultDeviceOp>(
typeConverter) {}		typeConverter) {}

LogicalResult		LogicalResult
matchAndRewrite(gpu::SetDefaultDeviceOp op, OpAdaptor adaptor,		matchAndRewrite(gpu::SetDefaultDeviceOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override;		ConversionPatternRewriter &rewriter) const override;
};		};

		/// A rewrite pattern to convert gpu.create_sparse_env operations into a GPU
		/// runtime call (`mgpuCreateSparseEnv`).
		class ConvertCreateSparseEnvOpToGpuRuntimeCallPattern
		: public ConvertOpToGpuRuntimeCallPattern<gpu::CreateSparseEnvOp> {
		public:
		ConvertCreateSparseEnvOpToGpuRuntimeCallPattern(
		LLVMTypeConverter &typeConverter)
		: ConvertOpToGpuRuntimeCallPattern<gpu::CreateSparseEnvOp>(
		typeConverter) {}

		private:
		LogicalResult
		matchAndRewrite(gpu::CreateSparseEnvOp op, OpAdaptor adaptor,
		ConversionPatternRewriter &rewriter) const override;
		};

		/// A rewrite pattern to convert gpu.destroy_sparse_env operations into a GPU
		/// runtime call (`mgpuDestroySparseEnv`).
		class ConvertDestroySparseEnvOpToGpuRuntimeCallPattern
		: public ConvertOpToGpuRuntimeCallPattern<gpu::DestroySparseEnvOp> {
		public:
		ConvertDestroySparseEnvOpToGpuRuntimeCallPattern(
		LLVMTypeConverter &typeConverter)
		: ConvertOpToGpuRuntimeCallPattern<gpu::DestroySparseEnvOp>(
		typeConverter) {}

		private:
		LogicalResult
		matchAndRewrite(gpu::DestroySparseEnvOp op, OpAdaptor adaptor,
		ConversionPatternRewriter &rewriter) const override;
		};

		/// A rewrite pattern to convert gpu.create_dn_vec operations into a GPU
		/// runtime call (`mgpuCreateDnVec`).
		class ConvertCreateDnVecOpToGpuRuntimeCallPattern
		: public ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnVecOp> {
		public:
		ConvertCreateDnVecOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
		: ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnVecOp>(typeConverter) {}

		private:
		LogicalResult
		matchAndRewrite(gpu::CreateDnVecOp op, OpAdaptor adaptor,
		ConversionPatternRewriter &rewriter) const override;
		};

		/// A rewrite pattern to convert gpu.destroy_dn_vec operations into a GPU
		/// runtime call (`mgpuDestroyDnVec`).
		class ConvertDestroyDnVecOpToGpuRuntimeCallPattern
		: public ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnVecOp> {
		public:
		ConvertDestroyDnVecOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
		: ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnVecOp>(typeConverter) {}

		private:
		LogicalResult
		matchAndRewrite(gpu::DestroyDnVecOp op, OpAdaptor adaptor,
		ConversionPatternRewriter &rewriter) const override;
		};

		/// A rewrite pattern to convert gpu.create_coo operations into a GPU runtime
		/// call (`mgpuCreateCoo`).
		class ConvertCreateCooOpToGpuRuntimeCallPattern
		: public ConvertOpToGpuRuntimeCallPattern<gpu::CreateCooOp> {
		public:
		ConvertCreateCooOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
		: ConvertOpToGpuRuntimeCallPattern<gpu::CreateCooOp>(typeConverter) {}

		private:
		LogicalResult
		matchAndRewrite(gpu::CreateCooOp op, OpAdaptor adaptor,
		ConversionPatternRewriter &rewriter) const override;
		};

		/// A rewrite pattern to convert gpu.create_csr operations into a GPU runtime
		/// call (`mgpuCreateCsr`).
		class ConvertCreateCsrOpToGpuRuntimeCallPattern
		: public ConvertOpToGpuRuntimeCallPattern<gpu::CreateCsrOp> {
		public:
		ConvertCreateCsrOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
		: ConvertOpToGpuRuntimeCallPattern<gpu::CreateCsrOp>(typeConverter) {}

		private:
		LogicalResult
		matchAndRewrite(gpu::CreateCsrOp op, OpAdaptor adaptor,
		ConversionPatternRewriter &rewriter) const override;
		};

		/// A rewrite pattern to convert gpu.destroy_sp_mat operations into a GPU
		/// runtime call (`mgpuDestroySpMat`).
		class ConvertDestroySpMatOpToGpuRuntimeCallPattern
		: public ConvertOpToGpuRuntimeCallPattern<gpu::DestroySpMatOp> {
		public:
		ConvertDestroySpMatOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
		: ConvertOpToGpuRuntimeCallPattern<gpu::DestroySpMatOp>(typeConverter) {}

		private:
		LogicalResult
		matchAndRewrite(gpu::DestroySpMatOp op, OpAdaptor adaptor,
		ConversionPatternRewriter &rewriter) const override;
		};

		/// A rewrite pattern to convert gpu.spmv_buffer_size operations into a GPU
		/// runtime call (`mgpuSpMVBufferSize`).
		class ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern
		: public ConvertOpToGpuRuntimeCallPattern<gpu::SpMVBufferSizeOp> {
		public:
		ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern(
		LLVMTypeConverter &typeConverter)
		: ConvertOpToGpuRuntimeCallPattern<gpu::SpMVBufferSizeOp>(typeConverter) {
		}

		private:
		LogicalResult
		matchAndRewrite(gpu::SpMVBufferSizeOp op, OpAdaptor adaptor,
		ConversionPatternRewriter &rewriter) const override;
		};

		/// A rewrite pattern to convert gpu.spmv operations into a GPU runtime call
		/// (`mgpuSpMV`).
		class ConvertSpMVOpToGpuRuntimeCallPattern
		: public ConvertOpToGpuRuntimeCallPattern<gpu::SpMVOp> {
		public:
		ConvertSpMVOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
		: ConvertOpToGpuRuntimeCallPattern<gpu::SpMVOp>(typeConverter) {}

		private:
		LogicalResult
		matchAndRewrite(gpu::SpMVOp op, OpAdaptor adaptor,
		ConversionPatternRewriter &rewriter) const override;
		};

} // namespace		} // namespace

void GpuToLLVMConversionPass::runOnOperation() {		void GpuToLLVMConversionPass::runOnOperation() {
LowerToLLVMOptions options(&getContext());		LowerToLLVMOptions options(&getContext());
options.useOpaquePointers = useOpaquePointers;		options.useOpaquePointers = useOpaquePointers;

LLVMTypeConverter converter(&getContext(), options);		LLVMTypeConverter converter(&getContext(), options);
RewritePatternSet patterns(&getContext());		RewritePatternSet patterns(&getContext());
▲ Show 20 Lines • Show All 562 Lines • ▼ Show 20 Lines	LogicalResult ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern::matchAndRewrite(
gpu::SetDefaultDeviceOp op, OpAdaptor adaptor,		gpu::SetDefaultDeviceOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {		ConversionPatternRewriter &rewriter) const {
Location loc = op.getLoc();		Location loc = op.getLoc();
setDefaultDeviceCallBuilder.create(loc, rewriter, {adaptor.getDevIndex()});		setDefaultDeviceCallBuilder.create(loc, rewriter, {adaptor.getDevIndex()});
rewriter.replaceOp(op, {});		rewriter.replaceOp(op, {});
return success();		return success();
}		}

		/// Replaces gpu.create_sparse_env with a call built by
		/// `createSparseEnvCallBuilder`. Fails unless both `areAllLLVMTypes` and
		/// `isAsyncWithOneDependency` succeed for the op.
		LogicalResult ConvertCreateSparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite(
		    gpu::CreateSparseEnvOp op, OpAdaptor adaptor,
		    ConversionPatternRewriter &rewriter) const {
		  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
		      failed(isAsyncWithOneDependency(rewriter, op)))
		    return failure();
		  Location loc = op.getLoc();
		  // The first async dependency is passed to the runtime as the stream and is
		  // also what replaces the op's async token result.
		  auto stream = adaptor.getAsyncDependencies().front();
		  auto handle =
		      createSparseEnvCallBuilder.create(loc, rewriter, {stream}).getResult();
		  rewriter.replaceOp(op, {handle, stream});
		  return success();
		}

		/// Replaces gpu.destroy_sparse_env with a call built by
		/// `destroySparseEnvCallBuilder`. Fails unless both `areAllLLVMTypes` and
		/// `isAsyncWithOneDependency` succeed for the op.
		LogicalResult ConvertDestroySparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite(
		    gpu::DestroySparseEnvOp op, OpAdaptor adaptor,
		    ConversionPatternRewriter &rewriter) const {
		  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
		      failed(isAsyncWithOneDependency(rewriter, op)))
		    return failure();
		  Location loc = op.getLoc();
		  // The first async dependency doubles as the stream argument and as the
		  // replacement for the op's async token result.
		  auto stream = adaptor.getAsyncDependencies().front();
		  destroySparseEnvCallBuilder.create(loc, rewriter, {adaptor.getEnv(), stream});
		  rewriter.replaceOp(op, {stream});
		  return success();
		}

		/// Replaces gpu.create_dn_vec with a call built by `createDnVecCallBuilder`,
		/// passing the size, the buffer pointer, the element bit width and the
		/// stream. Fails unless both `areAllLLVMTypes` and
		/// `isAsyncWithOneDependency` succeed for the op.
		LogicalResult ConvertCreateDnVecOpToGpuRuntimeCallPattern::matchAndRewrite(
		    gpu::CreateDnVecOp op, OpAdaptor adaptor,
		    ConversionPatternRewriter &rewriter) const {
		  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
		      failed(isAsyncWithOneDependency(rewriter, op)))
		    return failure();
		  Location loc = op.getLoc();
		  auto stream = adaptor.getAsyncDependencies().front();
		  // NOTE(review): only the allocated pointer of the descriptor is forwarded;
		  // the descriptor's offset is not applied here. Confirm that is intended.
		  Value pVec =
		      MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc);
		  // With typed pointers, cast to the pointer type the call builder expects.
		  if (!getTypeConverter()->useOpaquePointers())
		    pVec = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pVec);
		  // The element type is read from the original (unconverted) memref type.
		  // NOTE(review): `index` elements are not special-cased here, unlike the
		  // index buffers of the COO/CSR lowerings.
		  Type dType = op.getMemref().getType().cast<MemRefType>().getElementType();
		  auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
		                                              dType.getIntOrFloatBitWidth());
		  auto handle =
		      createDnVecCallBuilder
		          .create(loc, rewriter, {adaptor.getSize(), pVec, dw, stream})
		          .getResult();
		  rewriter.replaceOp(op, {handle, stream});
		  return success();
		}

		/// Replaces gpu.destroy_dn_vec with a call built by
		/// `destroyDnVecCallBuilder`. Fails unless both `areAllLLVMTypes` and
		/// `isAsyncWithOneDependency` succeed for the op.
		LogicalResult ConvertDestroyDnVecOpToGpuRuntimeCallPattern::matchAndRewrite(
		    gpu::DestroyDnVecOp op, OpAdaptor adaptor,
		    ConversionPatternRewriter &rewriter) const {
		  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
		      failed(isAsyncWithOneDependency(rewriter, op)))
		    return failure();
		  Location loc = op.getLoc();
		  // The first async dependency doubles as the stream argument and as the
		  // replacement for the op's async token result.
		  auto stream = adaptor.getAsyncDependencies().front();
		  destroyDnVecCallBuilder.create(loc, rewriter, {adaptor.getDvec(), stream});
		  rewriter.replaceOp(op, {stream});
		  return success();
		}

		/// Replaces gpu.create_coo with a call built by `createCooCallBuilder`,
		/// passing the three sizes, the three buffer pointers, the index and value
		/// bit widths, and the stream. Fails unless both `areAllLLVMTypes` and
		/// `isAsyncWithOneDependency` succeed for the op.
		LogicalResult ConvertCreateCooOpToGpuRuntimeCallPattern::matchAndRewrite(
		    gpu::CreateCooOp op, OpAdaptor adaptor,
		    ConversionPatternRewriter &rewriter) const {
		  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
		      failed(isAsyncWithOneDependency(rewriter, op)))
		    return failure();
		  Location loc = op.getLoc();
		  auto stream = adaptor.getAsyncDependencies().front();
		  Value pRowIdxs =
		      MemRefDescriptor(adaptor.getRowIdxs()).allocatedPtr(rewriter, loc);
		  Value pColIdxs =
		      MemRefDescriptor(adaptor.getColIdxs()).allocatedPtr(rewriter, loc);
		  Value pValues =
		      MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc);
		  // With typed pointers, cast to the pointer type the call builder expects.
		  if (!getTypeConverter()->useOpaquePointers()) {
		    pRowIdxs = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pRowIdxs);
		    pColIdxs = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pColIdxs);
		    pValues = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pValues);
		  }
		  // NOTE(review): a single index width is derived from the column index
		  // buffer only; the row index buffer is presumably expected to use the same
		  // element type. Confirm.
		  Type iType = op.getColIdxs().getType().cast<MemRefType>().getElementType();
		  Type dType = op.getValues().getType().cast<MemRefType>().getElementType();
		  // `index` elements are reported as 64 bits wide.
		  auto iw = rewriter.create<LLVM::ConstantOp>(
		      loc, llvmInt32Type, iType.isIndex() ? 64 : iType.getIntOrFloatBitWidth());
		  auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
		                                              dType.getIntOrFloatBitWidth());
		  auto handle =
		      createCooCallBuilder
		          .create(loc, rewriter,
		                  {adaptor.getRows(), adaptor.getCols(), adaptor.getNnz(),
		                   pRowIdxs, pColIdxs, pValues, iw, dw, stream})
		          .getResult();
		  rewriter.replaceOp(op, {handle, stream});
		  return success();
		}

		/// Replaces gpu.create_csr with a call built by `createCsrCallBuilder`,
		/// passing the three sizes, the three buffer pointers, the position, index
		/// and value bit widths, and the stream. Fails unless both `areAllLLVMTypes`
		/// and `isAsyncWithOneDependency` succeed for the op.
		LogicalResult ConvertCreateCsrOpToGpuRuntimeCallPattern::matchAndRewrite(
		    gpu::CreateCsrOp op, OpAdaptor adaptor,
		    ConversionPatternRewriter &rewriter) const {
		  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
		      failed(isAsyncWithOneDependency(rewriter, op)))
		    return failure();
		  Location loc = op.getLoc();
		  auto stream = adaptor.getAsyncDependencies().front();
		  Value pRowPos =
		      MemRefDescriptor(adaptor.getRowPos()).allocatedPtr(rewriter, loc);
		  Value pColIdxs =
		      MemRefDescriptor(adaptor.getColIdxs()).allocatedPtr(rewriter, loc);
		  Value pValues =
		      MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc);
		  // With typed pointers, cast to the pointer type the call builder expects.
		  if (!getTypeConverter()->useOpaquePointers()) {
		    pRowPos = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pRowPos);
		    pColIdxs = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pColIdxs);
		    pValues = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pValues);
		  }
		  // Element types are read from the original (unconverted) memref types.
		  Type pType = op.getRowPos().getType().cast<MemRefType>().getElementType();
		  Type iType = op.getColIdxs().getType().cast<MemRefType>().getElementType();
		  Type dType = op.getValues().getType().cast<MemRefType>().getElementType();
		  // `index` elements are reported as 64 bits wide.
		  auto pw = rewriter.create<LLVM::ConstantOp>(
		      loc, llvmInt32Type, pType.isIndex() ? 64 : pType.getIntOrFloatBitWidth());
		  auto iw = rewriter.create<LLVM::ConstantOp>(
		      loc, llvmInt32Type, iType.isIndex() ? 64 : iType.getIntOrFloatBitWidth());
		  auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
		                                              dType.getIntOrFloatBitWidth());
		  auto handle =
		      createCsrCallBuilder
		          .create(loc, rewriter,
		                  {adaptor.getRows(), adaptor.getCols(), adaptor.getNnz(),
		                   pRowPos, pColIdxs, pValues, pw, iw, dw, stream})
		          .getResult();
		  rewriter.replaceOp(op, {handle, stream});
		  return success();
		}

		/// Replaces gpu.destroy_sp_mat with a call built by
		/// `destroySpMatCallBuilder`. Fails unless both `areAllLLVMTypes` and
		/// `isAsyncWithOneDependency` succeed for the op.
		LogicalResult ConvertDestroySpMatOpToGpuRuntimeCallPattern::matchAndRewrite(
		    gpu::DestroySpMatOp op, OpAdaptor adaptor,
		    ConversionPatternRewriter &rewriter) const {
		  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
		      failed(isAsyncWithOneDependency(rewriter, op)))
		    return failure();
		  Location loc = op.getLoc();
		  // The first async dependency doubles as the stream argument and as the
		  // replacement for the op's async token result.
		  auto stream = adaptor.getAsyncDependencies().front();
		  destroySpMatCallBuilder.create(loc, rewriter, {adaptor.getSpmat(), stream});
		  rewriter.replaceOp(op, {stream});
		  return success();
		}

		/// Replaces gpu.spmv_buffer_size with a call built by
		/// `spMVBufferSizeCallBuilder`; the call result replaces the op's buffer
		/// size result. Fails unless both `areAllLLVMTypes` and
		/// `isAsyncWithOneDependency` succeed for the op.
		LogicalResult ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern::matchAndRewrite(
		    gpu::SpMVBufferSizeOp op, OpAdaptor adaptor,
		    ConversionPatternRewriter &rewriter) const {
		  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
		      failed(isAsyncWithOneDependency(rewriter, op)))
		    return failure();
		  Location loc = op.getLoc();
		  // The first async dependency doubles as the stream argument and as the
		  // replacement for the op's async token result.
		  auto stream = adaptor.getAsyncDependencies().front();
		  auto bufferSize = spMVBufferSizeCallBuilder
		                        .create(loc, rewriter,
		                                {adaptor.getEnv(), adaptor.getSpmatA(),
		                                 adaptor.getDnX(), adaptor.getDnY(), stream})
		                        .getResult();
		  rewriter.replaceOp(op, {bufferSize, stream});
		  return success();
		}

		/// Replaces gpu.spmv with a call built by `spMVCallBuilder`, passing the
		/// four handles, the buffer pointer and the stream. Fails unless both
		/// `areAllLLVMTypes` and `isAsyncWithOneDependency` succeed for the op.
		LogicalResult ConvertSpMVOpToGpuRuntimeCallPattern::matchAndRewrite(
		    gpu::SpMVOp op, OpAdaptor adaptor,
		    ConversionPatternRewriter &rewriter) const {
		  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
		      failed(isAsyncWithOneDependency(rewriter, op)))
		    return failure();
		  Location loc = op.getLoc();
		  auto stream = adaptor.getAsyncDependencies().front();
		  // NOTE(review): only the allocated pointer of the descriptor is forwarded;
		  // the descriptor's offset is not applied here. Confirm that is intended.
		  Value pBuf =
		      MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc);
		  // With typed pointers, cast to the pointer type the call builder expects.
		  if (!getTypeConverter()->useOpaquePointers())
		    pBuf = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pBuf);
		  spMVCallBuilder.create(loc, rewriter,
		                         {adaptor.getEnv(), adaptor.getSpmatA(),
		                          adaptor.getDnX(), adaptor.getDnY(), pBuf, stream});
		  rewriter.replaceOp(op, {stream});
		  return success();
		}

/// Registers the GPU-to-LLVM type conversions on `converter` and adds the		/// Registers the GPU-to-LLVM type conversions on `converter` and adds the
/// runtime-call rewrite patterns to `patterns`. `gpuBinaryAnnotation` and		/// runtime-call rewrite patterns to `patterns`. `gpuBinaryAnnotation` and
/// `kernelBarePtrCallConv` are forwarded only to the launch-func pattern.		/// `kernelBarePtrCallConv` are forwarded only to the launch-func pattern.
void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,		void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
RewritePatternSet &patterns,		RewritePatternSet &patterns,
StringRef gpuBinaryAnnotation,		StringRef gpuBinaryAnnotation,
bool kernelBarePtrCallConv) {		bool kernelBarePtrCallConv) {
converter.addConversion([&converter](gpu::AsyncTokenType type) -> Type {		converter.addConversion([&converter](gpu::AsyncTokenType type) -> Type {
return converter.getPointerType(		return converter.getPointerType(
IntegerType::get(&converter.getContext(), 8));		IntegerType::get(&converter.getContext(), 8));
});		});
		// Sparse handles convert to the same i8-pointer type as async tokens.
		converter.addConversion([&converter](gpu::SparseHandleType type) -> Type {
		return converter.getPointerType(
		IntegerType::get(&converter.getContext(), 8));
		});

patterns.add<ConvertAllocOpToGpuRuntimeCallPattern,		patterns.add<ConvertAllocOpToGpuRuntimeCallPattern,
ConvertDeallocOpToGpuRuntimeCallPattern,		ConvertDeallocOpToGpuRuntimeCallPattern,
ConvertHostRegisterOpToGpuRuntimeCallPattern,		ConvertHostRegisterOpToGpuRuntimeCallPattern,
ConvertHostUnregisterOpToGpuRuntimeCallPattern,		ConvertHostUnregisterOpToGpuRuntimeCallPattern,
ConvertMemcpyOpToGpuRuntimeCallPattern,		ConvertMemcpyOpToGpuRuntimeCallPattern,
ConvertMemsetOpToGpuRuntimeCallPattern,		ConvertMemsetOpToGpuRuntimeCallPattern,
ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern,		ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern,
ConvertWaitAsyncOpToGpuRuntimeCallPattern,		ConvertWaitAsyncOpToGpuRuntimeCallPattern,
ConvertWaitOpToGpuRuntimeCallPattern,		ConvertWaitOpToGpuRuntimeCallPattern,
ConvertAsyncYieldToGpuRuntimeCallPattern>(converter);		ConvertAsyncYieldToGpuRuntimeCallPattern,
		ConvertCreateSparseEnvOpToGpuRuntimeCallPattern,
		ConvertDestroySparseEnvOpToGpuRuntimeCallPattern,
		ConvertCreateDnVecOpToGpuRuntimeCallPattern,
		ConvertDestroyDnVecOpToGpuRuntimeCallPattern,
		ConvertCreateCooOpToGpuRuntimeCallPattern,
		ConvertCreateCsrOpToGpuRuntimeCallPattern,
		ConvertDestroySpMatOpToGpuRuntimeCallPattern,
		ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern,
		ConvertSpMVOpToGpuRuntimeCallPattern>(converter);
patterns.add<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(		patterns.add<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
converter, gpuBinaryAnnotation, kernelBarePtrCallConv);		converter, gpuBinaryAnnotation, kernelBarePtrCallConv);
patterns.add<EraseGpuModuleOpPattern>(&converter.getContext());		patterns.add<EraseGpuModuleOpPattern>(&converter.getContext());
}		}

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

Show First 20 Lines • Show All 140 Lines • ▼ Show 20 Lines	bool isLegalToInline(Operation , Region , bool, IRMapping &) const final {
return true;		return true;
}		}
};		};
} // namespace		} // namespace

void GPUDialect::initialize() {		void GPUDialect::initialize() {
addTypes<AsyncTokenType>();		addTypes<AsyncTokenType>();
addTypes<MMAMatrixType>();		addTypes<MMAMatrixType>();
		addTypes<SparseHandleType>();
addOperations<		addOperations<
#define GET_OP_LIST		#define GET_OP_LIST
#include "mlir/Dialect/GPU/IR/GPUOps.cpp.inc"		#include "mlir/Dialect/GPU/IR/GPUOps.cpp.inc"
>();		>();
addAttributes<		addAttributes<
#define GET_ATTRDEF_LIST		#define GET_ATTRDEF_LIST
#include "mlir/Dialect/GPU/IR/GPUOpsAttributes.cpp.inc"		#include "mlir/Dialect/GPU/IR/GPUOpsAttributes.cpp.inc"
>();		>();
Show All 38 Lines	if (keyword == "mma_matrix") {
if (parser.parseGreater())		if (parser.parseGreater())
return nullptr;		return nullptr;

return MMAMatrixType::getChecked(mlir::detail::getDefaultDiagnosticEmitFn(		return MMAMatrixType::getChecked(mlir::detail::getDefaultDiagnosticEmitFn(
parser.getEncodedSourceLoc(beginLoc)),		parser.getEncodedSourceLoc(beginLoc)),
shape, elementType, operand);		shape, elementType, operand);
}		}

		if (keyword == "sparse.handle")
		return SparseHandleType::get(context);

parser.emitError(parser.getNameLoc(), "unknown gpu type: " + keyword);		parser.emitError(parser.getNameLoc(), "unknown gpu type: " + keyword);
return Type();		return Type();
}		}

void GPUDialect::printType(Type type, DialectAsmPrinter &os) const {		void GPUDialect::printType(Type type, DialectAsmPrinter &os) const {
TypeSwitch<Type>(type)		TypeSwitch<Type>(type)
.Case<AsyncTokenType>([&](Type) { os << "async.token"; })		.Case<AsyncTokenType>([&](Type) { os << "async.token"; })
		.Case<SparseHandleType>([&](Type) { os << "sparse.handle"; })
.Case<MMAMatrixType>([&](MMAMatrixType fragTy) {		.Case<MMAMatrixType>([&](MMAMatrixType fragTy) {
os << "mma_matrix<";		os << "mma_matrix<";
auto shape = fragTy.getShape();		auto shape = fragTy.getShape();
for (auto dim = shape.begin(), e = shape.end() - 1; dim != e; ++dim)		for (auto dim = shape.begin(), e = shape.end() - 1; dim != e; ++dim)
os << *dim << 'x';		os << *dim << 'x';
os << shape.back() << 'x' << fragTy.getElementType();		os << shape.back() << 'x' << fragTy.getElementType();
os << ", \"" << fragTy.getOperand() << "\"" << '>';		os << ", \"" << fragTy.getOperand() << "\"" << '>';
})		})
▲ Show 20 Lines • Show All 1,534 Lines • Show Last 20 Lines

mlir/lib/ExecutionEngine/CMakeLists.txt

Show First 20 Lines • Show All 184 Lines • ▼ Show 20 Lines	if(MLIR_ENABLE_CUDA_RUNNER)
else()		else()
message(SEND_ERROR		message(SEND_ERROR
"Building the mlir cuda runner requires a working CUDA install")		"Building the mlir cuda runner requires a working CUDA install")
endif()		endif()

# We need the libcuda.so library.		# We need the libcuda.so library.
find_library(CUDA_RUNTIME_LIBRARY cuda)		find_library(CUDA_RUNTIME_LIBRARY cuda)

		# We need the libcusparse.so library.
		find_library(CUDA_CUSPARSE_LIBRARY cusparse HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES})
		aartbikAuthorUnsubmitted Done Reply Inline Actions A question for the reviewers is of course if this is an acceptable dependence (cuSPARSE has been part of CUDA for a long time now) and this is only pulled in when built with the extra flag MLIR_ENABLE_CUDA_RUNNER aartbik: A question for the reviewers is of course if this is an acceptable dependence (cuSPARSE has…
		ThomasRaouxUnsubmitted Done Reply Inline Actions That looks reasonable to me, an alternative would be to introduce another flag but if we expect cuSPARSE to be there along with CUDA runtime I don't see a reason for that. ThomasRaoux: That looks reasonable to me, an alternative would be to introduce another flag but if we expect…
		aartbikAuthorUnsubmitted Done Reply Inline Actions Thanks for the pointer! Yeah, if this turns out to be a breaker, I will add the flag, but for now let's proceed with the assumption. aartbik: Thanks for the pointer! Yeah, if this turns out to be a breaker, I will add the flag, but for…

add_mlir_library(mlir_cuda_runtime		add_mlir_library(mlir_cuda_runtime
SHARED		SHARED
CudaRuntimeWrappers.cpp		CudaRuntimeWrappers.cpp

EXCLUDE_FROM_LIBMLIR		EXCLUDE_FROM_LIBMLIR
)		)
set_property(TARGET mlir_cuda_runtime PROPERTY CXX_STANDARD 14)		set_property(TARGET mlir_cuda_runtime PROPERTY CXX_STANDARD 14)
target_include_directories(mlir_cuda_runtime		target_include_directories(mlir_cuda_runtime
PRIVATE		PRIVATE
${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}		${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
)		)
target_link_libraries(mlir_cuda_runtime		target_link_libraries(mlir_cuda_runtime
PRIVATE		PRIVATE
${CUDA_RUNTIME_LIBRARY}		${CUDA_RUNTIME_LIBRARY}
		${CUDA_CUSPARSE_LIBRARY}
)		)
endif()		endif()

if(MLIR_ENABLE_ROCM_RUNNER)		if(MLIR_ENABLE_ROCM_RUNNER)
# Configure ROCm support.		# Configure ROCm support.
if (NOT DEFINED ROCM_PATH)		if (NOT DEFINED ROCM_PATH)
if (NOT DEFINED ENV{ROCM_PATH})		if (NOT DEFINED ENV{ROCM_PATH})
set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed")		set(ROCM_PATH "/opt/rocm" CACHE PATH "Path to which ROCm has been installed")
▲ Show 20 Lines • Show All 77 Lines • Show Last 20 Lines

mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp

Show All 11 Lines
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "mlir/ExecutionEngine/CRunnerUtils.h"		#include "mlir/ExecutionEngine/CRunnerUtils.h"

#include <stdio.h>		#include <stdio.h>

#include "cuda.h"		#include "cuda.h"
		#include "cusparse.h"

#ifdef _WIN32		#ifdef _WIN32
#define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport)		#define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport)
#else		#else
#define MLIR_CUDA_WRAPPERS_EXPORT		#define MLIR_CUDA_WRAPPERS_EXPORT
#endif // _WIN32		#endif // _WIN32

// Evaluates `expr` once as a CUresult. On a nonzero result, prints the		// Evaluates `expr` once as a CUresult. On a nonzero result, prints the
// stringified expression and the name reported by cuGetErrorName (or		// stringified expression and the name reported by cuGetErrorName (or
// "<unknown>" when no name is returned) to stderr. Execution continues.		// "<unknown>" when no name is returned) to stderr. Execution continues.
#define CUDA_REPORT_IF_ERROR(expr) \		#define CUDA_REPORT_IF_ERROR(expr) \
[](CUresult result) { \		[](CUresult result) { \
if (!result) \		if (!result) \
return; \		return; \
const char *name = nullptr; \		const char *name = nullptr; \
cuGetErrorName(result, &name); \		cuGetErrorName(result, &name); \
if (!name) \		if (!name) \
name = "<unknown>"; \		name = "<unknown>"; \
fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \		fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \
}(expr)		}(expr)

		// Evaluates `expr` once as a cusparseStatus_t. On anything other than
		// CUSPARSE_STATUS_SUCCESS, prints the stringified expression and the text
		// from cusparseGetErrorString to stderr. Execution continues.
		//
		// The macro expands to a bare braced block rather than an expression, and
		// the call sites in this file invoke it without a trailing semicolon.
		// Keep that in mind before using it as the body of an unbraced if/else.
		#define CUSPARSE_REPORT_IF_ERROR(expr) \
		{ \
		cusparseStatus_t status = (expr); \
		if (status != CUSPARSE_STATUS_SUCCESS) { \
		fprintf(stderr, "cuSPARSE '%s' failed with '%s'\n", #expr, \
		cusparseGetErrorString(status)); \
		} \
		}

thread_local static int32_t defaultDevice = 0;		thread_local static int32_t defaultDevice = 0;

// Make the primary context of the current default device current for the		// Make the primary context of the current default device current for the
// duration		// duration
// of the instance and restore the previous context on destruction.		// of the instance and restore the previous context on destruction.
class ScopedContext {		class ScopedContext {
public:		public:
ScopedContext() {		ScopedContext() {
▲ Show 20 Lines • Show All 160 Lines • ▼ Show 20 Lines	mgpuMemHostUnregisterMemRef(int64_t rank,
int64_t elementSizeBytes) {		int64_t elementSizeBytes) {
auto ptr = descriptor->data + descriptor->offset elementSizeBytes;		auto ptr = descriptor->data + descriptor->offset elementSizeBytes;
mgpuMemHostUnregister(ptr);		mgpuMemHostUnregister(ptr);
}		}

/// Stores `device` into the thread-local `defaultDevice` variable.		/// Stores `device` into the thread-local `defaultDevice` variable.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {		extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {
defaultDevice = device;		defaultDevice = device;
}		}

		/// Wrapper methods for the cuSparse library.

		ThomasRaouxUnsubmitted Done Reply Inline Actions nit: remove empty line? ThomasRaoux: nit: remove empty line?
		aartbikAuthorUnsubmitted Done Reply Inline Actions I followed the style found at L161 on introducing a new section. But I made it a bit more clear by adding extra /// on such lines. aartbik: I followed the style found at L161 on introducing a new section. But I made it a bit more clear…
		/// Maps a bit width to a CUDA real data type: 32 selects CUDA_R_32F and
		/// every other width selects CUDA_R_64F.
		static inline cudaDataType_t dataTp(int32_t width) {
		  return width == 32 ? CUDA_R_32F : CUDA_R_64F;
		}

		/// Maps a bit width to a cuSPARSE index type: 32 selects
		/// CUSPARSE_INDEX_32I and every other width selects CUSPARSE_INDEX_64I.
		static inline cusparseIndexType_t idxTp(int32_t width) {
		  return width == 32 ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
		}

		/// Creates a cuSPARSE handle with cusparseCreate and returns it as an
		/// opaque pointer. A failure is reported to stderr and the returned
		/// pointer is then whatever cusparseCreate left in `handle` (initialized to
		/// nullptr here). The stream argument is unused.
		extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
		mgpuCreateSparseEnv(CUstream /*stream*/) {
		  cusparseHandle_t handle = nullptr;
		  CUSPARSE_REPORT_IF_ERROR(cusparseCreate(&handle))
		  return reinterpret_cast<void *>(handle);
		}

		/// Destroys the cuSPARSE handle passed as the opaque pointer `h`. A
		/// failure is reported to stderr. The stream argument is unused.
		extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
		mgpuDestroySparseEnv(void *h, CUstream /*stream*/) {
		  cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
		  CUSPARSE_REPORT_IF_ERROR(cusparseDestroy(handle))
		}

		/// Creates a cuSPARSE dense-vector descriptor over `values` with `size`
		/// elements and returns it as an opaque pointer. `dw` is mapped to the
		/// element data type by dataTp (32 selects float, anything else double).
		/// A failure is reported to stderr. The stream argument is unused.
		extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
		mgpuCreateDnVec(intptr_t size, void *values, int32_t dw,
		                CUstream /*stream*/) {
		  cusparseDnVecDescr_t vec = nullptr;
		  cudaDataType_t dtp = dataTp(dw);
		  CUSPARSE_REPORT_IF_ERROR(cusparseCreateDnVec(&vec, size, values, dtp))
		  return reinterpret_cast<void *>(vec);
		}

		/// Destroys the dense-vector descriptor passed as the opaque pointer `v`.
		/// A failure is reported to stderr. The stream argument is unused.
		extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
		mgpuDestroyDnVec(void *v, CUstream /*stream*/) {
		  cusparseDnVecDescr_t vec = reinterpret_cast<cusparseDnVecDescr_t>(v);
		  CUSPARSE_REPORT_IF_ERROR(cusparseDestroyDnVec(vec))
		}

		/// Creates a zero-based COO sparse-matrix descriptor and returns it as an
		/// opaque pointer.
		///
		/// `rows`, `cols` and `nnz` give the matrix dimensions and stored-entry
		/// count. `rowIdxs`, `colIdxs` and `values` are the three COO arrays.
		/// `iw` selects the index type via idxTp and `dw` the value type via
		/// dataTp (32 selects the 32-bit variant, anything else the 64-bit one).
		/// A failure is reported to stderr. The stream argument is unused.
		extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
		mgpuCreateCoo(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowIdxs,
		              void *colIdxs, void *values, int32_t iw, int32_t dw,
		              CUstream /*stream*/) {
		  cusparseSpMatDescr_t mat = nullptr;
		  cusparseIndexType_t itp = idxTp(iw);
		  cudaDataType_t dtp = dataTp(dw);
		  CUSPARSE_REPORT_IF_ERROR(cusparseCreateCoo(&mat, rows, cols, nnz, rowIdxs,
		                                             colIdxs, values, itp,
		                                             CUSPARSE_INDEX_BASE_ZERO, dtp))
		  return reinterpret_cast<void *>(mat);
		}

		/// Creates a zero-based CSR sparse-matrix descriptor and returns it as an
		/// opaque pointer.
		///
		/// `rows`, `cols` and `nnz` give the matrix dimensions and stored-entry
		/// count. `rowPos`, `colIdxs` and `values` are the three CSR arrays.
		/// `pw` and `iw` select the position and index types via idxTp, and `dw`
		/// the value type via dataTp (32 selects the 32-bit variant, anything else
		/// the 64-bit one). A failure is reported to stderr. The stream argument is
		/// unused.
		extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
		mgpuCreateCsr(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowPos,
		              void *colIdxs, void *values, int32_t pw, int32_t iw,
		              int32_t dw, CUstream /*stream*/) {
		  cusparseSpMatDescr_t mat = nullptr;
		  cusparseIndexType_t ptp = idxTp(pw);
		  cusparseIndexType_t itp = idxTp(iw);
		  cudaDataType_t dtp = dataTp(dw);
		  CUSPARSE_REPORT_IF_ERROR(cusparseCreateCsr(&mat, rows, cols, nnz, rowPos,
		                                             colIdxs, values, ptp, itp,
		                                             CUSPARSE_INDEX_BASE_ZERO, dtp))
		  return reinterpret_cast<void *>(mat);
		}

		/// Destroys the sparse-matrix descriptor passed as the opaque pointer `m`.
		/// A failure is reported to stderr. The stream argument is unused.
		extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
		mgpuDestroySpMat(void *m, CUstream /*stream*/) {
		  cusparseSpMatDescr_t mat = reinterpret_cast<cusparseSpMatDescr_t>(m);
		  CUSPARSE_REPORT_IF_ERROR(cusparseDestroySpMat(mat))
		}

		/// Queries cusparseSpMV_bufferSize for the work-buffer size of a
		/// non-transposed SpMV with the default algorithm, and returns it.
		///
		/// `h` is the cuSPARSE handle, `a` the sparse-matrix descriptor, and `x`
		/// and `y` the dense-vector descriptors, all passed as opaque pointers. A
		/// reported size of zero is returned as 1 so callers never allocate zero
		/// bytes. A failure is reported to stderr. The stream argument is unused.
		///
		/// NOTE(review): alpha, beta and the compute type are fixed to double
		/// (CUDA_R_64F) here regardless of the data type the descriptors were
		/// created with. Confirm this is intended for 32-bit data.
		extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
		mgpuSpMVBufferSize(void *h, void *a, void *x, void *y,
		                   CUstream /*stream*/) {
		  cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
		  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
		  cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
		  cusparseDnVecDescr_t vecY = reinterpret_cast<cusparseDnVecDescr_t>(y);
		  double alpha = 1.0;
		  double beta = 1.0;
		  size_t bufferSize = 0;
		  CUSPARSE_REPORT_IF_ERROR(cusparseSpMV_bufferSize(
		      handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta,
		      vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize))
		  return bufferSize == 0 ? 1 : bufferSize; // avoid zero-alloc
		}

		/// Runs a non-transposed cusparseSpMV with the default algorithm, with
		/// alpha and beta both fixed to 1.0.
		///
		/// `h` is the cuSPARSE handle, `a` the sparse-matrix descriptor, `x` and
		/// `y` the dense-vector descriptors, and `b` the external work buffer, all
		/// passed as opaque pointers. A failure is reported to stderr. The stream
		/// argument is unused.
		///
		/// NOTE(review): the compute type is fixed to CUDA_R_64F regardless of the
		/// data type the descriptors were created with. Confirm this is intended
		/// for 32-bit data.
		extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
		mgpuSpMV(void *h, void *a, void *x, void *y, void *b, CUstream /*stream*/) {
		  cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
		  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
		  cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
		  cusparseDnVecDescr_t vecY = reinterpret_cast<cusparseDnVecDescr_t>(y);
		  double alpha = 1.0;
		  double beta = 1.0;
		  CUSPARSE_REPORT_IF_ERROR(
		      cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA,
		                   vecX, &beta, vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT,
		                   b))
		}

mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir

This file was added.

				// RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=1' | FileCheck %s

				module attributes {gpu.container_module} {

				  // Every sparse op in the async token chain below must lower to its
				  // mgpu* runtime call, in the order the ops appear.
				  // CHECK-LABEL: func @matvec
				  // CHECK: llvm.call @mgpuStreamCreate
				  // CHECK: llvm.call @mgpuMemAlloc
				  // CHECK: llvm.call @mgpuMemAlloc
				  // CHECK: llvm.call @mgpuCreateSparseEnv
				  // CHECK: llvm.call @mgpuCreateCoo
				  // CHECK: llvm.call @mgpuCreateDnVec
				  // CHECK: llvm.call @mgpuSpMVBufferSize
				  // CHECK: llvm.call @mgpuSpMV
				  // CHECK: llvm.call @mgpuDestroySpMat
				  // CHECK: llvm.call @mgpuDestroyDnVec
				  // CHECK: llvm.call @mgpuDestroySparseEnv
				  // CHECK: llvm.call @mgpuStreamSynchronize
				  // CHECK: llvm.call @mgpuStreamDestroy
				  func.func @matvec(%arg0: index) {
				    %token0 = gpu.wait async
				    %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
				    %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
				    %env, %token3 = gpu.create_sparse_env async [%token2]
				    %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
				    %dnvec, %token5 = gpu.create_dn_vec async [%token4] %mem2, %arg0 : memref<?xf64>
				    %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec
				    %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64>
				    %token8 = gpu.destroy_sp_mat async [%token7] %spmat
				    %token9 = gpu.destroy_dn_vec async [%token8] %dnvec
				    %token10 = gpu.destroy_sparse_env async [%token9] %env
				    gpu.wait [%token10]
				    return
				  }

				}

mlir/test/Dialect/GPU/ops.mlir

Show First 20 Lines • Show All 311 Lines • ▼ Show 20 Lines	module attributes {gpu.container_module} {
}		}

// CHECK-LABEL: func @set_default_device		// CHECK-LABEL: func @set_default_device
func.func @set_default_device(%arg0: i32) {		func.func @set_default_device(%arg0: i32) {
// CHECK: gpu.set_default_device		// CHECK: gpu.set_default_device
gpu.set_default_device %arg0		gpu.set_default_device %arg0
return		return
}		}

		// Exercises the sparse ops in one async token chain: each op takes the
		// previous op's token as its only dependency, and each directive below
		// expects the op to be printed with its `async` keyword.
		// CHECK-LABEL: func @sparse_ops
		func.func @sparse_ops(%arg0: index) {
		// CHECK: gpu.wait async
		%token0 = gpu.wait async
		// CHECK: gpu.alloc async
		%mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
		// CHECK: gpu.alloc async
		%mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
		// CHECK: gpu.create_sparse_env async
		%env, %token3 = gpu.create_sparse_env async [%token2]
		// CHECK: gpu.create_coo async
		%spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
		// CHECK: gpu.create_dn_vec async
		%dnvec, %token5 = gpu.create_dn_vec async [%token4] %mem2, %arg0 : memref<?xf64>
		// CHECK: gpu.spmv_buffer_size async
		%bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec
		// CHECK: gpu.spmv async
		%token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64>
		// CHECK: gpu.destroy_sp_mat async
		%token8 = gpu.destroy_sp_mat async [%token7] %spmat
		// CHECK: gpu.destroy_dn_vec async
		%token9 = gpu.destroy_dn_vec async [%token8] %dnvec
		// CHECK: gpu.destroy_sparse_env async
		%token10 = gpu.destroy_sparse_env async [%token9] %env
		// CHECK: gpu.wait
		gpu.wait [%token10]
		return
		}
}		}

// Just check that this doesn't crash.		// Just check that this doesn't crash.
gpu.module @module {		gpu.module @module {
"gpu.func"() ({		"gpu.func"() ({
gpu.return		gpu.return
}) {function_type = () -> (), sym_name = "func"} : () -> ()		}) {function_type = () -> (), sym_name = "func"} : () -> ()
}		}

utils/bazel/llvm-project-overlay/mlir/BUILD.bazel

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 7,751 Lines • ▼ Show 20 Lines	cc_library(
tags = [		tags = [
"manual", # External dependency		"manual", # External dependency
"nobuildkite", # TODO(gcmn): Add support for this target		"nobuildkite", # TODO(gcmn): Add support for this target
],		],
deps = [		deps = [
":LLVMSupportHeaders",		":LLVMSupportHeaders",
":mlir_c_runner_utils",		":mlir_c_runner_utils",
"@cuda//:cuda_headers",		"@cuda//:cuda_headers",
		"@cuda//:cusparse_static",
		aartbikAuthorUnsubmitted Done Reply Inline Actions I could use some help making sure that lib/ExecutionEngine/CMakeLists.txt also keeps working with cusparse.h dependence. aartbik: I could use some help making sure that lib/ExecutionEngine/CMakeLists.txt also keeps working…
"@cuda//:libcuda",		"@cuda//:libcuda",
],		],
)		)

# Indirection to avoid 'libmlir_cuda_runtime.so' filename clash.		# Indirection to avoid 'libmlir_cuda_runtime.so' filename clash.
alias(		alias(
name = "mlir_cuda_runtime",		name = "mlir_cuda_runtime",
actual = "_mlir_cuda_runtime",		actual = "_mlir_cuda_runtime",
▲ Show 20 Lines • Show All 3,566 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][gpu][sparse] add gpu ops for sparse matrix computations
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 521108

mlir/include/mlir/Dialect/GPU/IR/GPUBase.td

mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

mlir/lib/ExecutionEngine/CMakeLists.txt

mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp

mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir

mlir/test/Dialect/GPU/ops.mlir

utils/bazel/llvm-project-overlay/mlir/BUILD.bazel

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][gpu][sparse] add gpu ops for sparse matrix computations
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 521108

mlir/include/mlir/Dialect/GPU/IR/GPUBase.td

mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h

mlir/include/mlir/Dialect/GPU/IR/GPUOps.td

mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp

mlir/lib/Dialect/GPU/IR/GPUDialect.cpp

mlir/lib/ExecutionEngine/CMakeLists.txt

mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp

mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir

mlir/test/Dialect/GPU/ops.mlir

utils/bazel/llvm-project-overlay/mlir/BUILD.bazel

[mlir][gpu][sparse] add gpu ops for sparse matrix computations
ClosedPublic