This is an archive of the discontinued LLVM Phabricator instance.

[mlir][sparse][gpu] various cuSparse refinements
ClosedPublic

Authored by aartbik on May 24 2023, 8:55 PM.

Download Raw Diff

Details

Reviewers

Peiming
K-Wu
nicolasvasilache
herhut

Commits

rGbcb698bfdc72: [mlir][sparse][gpu] various cuSparse refinements

Summary

(1) keep all cuSparse ops on single stream without wait() in right order
(2) use more type precise memref types for COO
(3) use ToTensor on resulting memref (even though it folds away again)

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

aartbik created this revision.May 24 2023, 8:55 PM

Herald added a reviewer: K-Wu. · View Herald TranscriptMay 24 2023, 8:55 PM

Herald added a project: Restricted Project. · View Herald Transcript

Herald added subscribers: K-Wu, bviyer, hanchung and 25 others. · View Herald Transcript

aartbik requested review of this revision.May 24 2023, 8:55 PM

Herald added a reviewer: nicolasvasilache. · View Herald TranscriptMay 24 2023, 8:55 PM

Herald added a reviewer: herhut. · View Herald Transcript

Herald added a project: Restricted Project. · View Herald Transcript

Herald added subscribers: stephenneuendorffer, nicolasvasilache. · View Herald Transcript

aartbik mentioned this in D151279: [mlir][gpu][sparse] adding initial cusparse sddmm libgen support.May 24 2023, 9:09 PM

Harbormaster completed remote builds in B234393: Diff 525428.May 24 2023, 9:29 PM

K-Wu added inline comments.May 24 2023, 9:38 PM

mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
392	Can we do a colA assertion at the beginning of the function instead of doing it twice?

aartbik added inline comments.May 24 2023, 9:45 PM

mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
392	Ah, in this case we could but.... if we implement the "unreachable" part (which I have in my local workspace with a COO_AOS) then you have a path without col coming in. So it looks a bit redundant now, but think of "unreachable" testing for !col at least ;-)

K-Wu added inline comments.May 24 2023, 9:47 PM

mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
392	okay that makes sense. Thank you.

LGTM

This revision is now accepted and ready to land.May 24 2023, 9:48 PM

Closed by commit rGbcb698bfdc72: [mlir][sparse][gpu] various cuSparse refinements (authored by aartbik). · Explain WhyMay 24 2023, 10:33 PM

This revision was automatically updated to reflect the committed changes.

aartbik added a commit: rGbcb698bfdc72: [mlir][sparse][gpu] various cuSparse refinements.

Revision Contents

Path

Size

mlir/

lib/

Dialect/

SparseTensor/

Transforms/

SparseGPUCodegen.cpp

31 lines

ExecutionEngine/

CudaRuntimeWrappers.cpp

9 lines

test/

Dialect/

SparseTensor/

GPU/

gpu_matmul_lib.mlir

39 lines

gpu_matvec_lib.mlir

27 lines

Diff 525442

mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp

Show First 20 Lines • Show All 372 Lines • ▼ Show 20 Lines	static Value genFirstPosOrCrds(OpBuilder &builder, Location loc, Value a,
return genToPositions(builder, loc, a, 1);		return genToPositions(builder, loc, a, 1);
}		}

/// Generates the second coordinates of a sparse matrix.		/// Generates the second coordinates of a sparse matrix.
static Value genSecondCrds(OpBuilder &builder, Location loc, Value a,		static Value genSecondCrds(OpBuilder &builder, Location loc, Value a,
bool isCOO, bool enableRT) {		bool isCOO, bool enableRT) {
if (isCOO && !enableRT)		if (isCOO && !enableRT)
return Value(); // nothing needed		return Value(); // nothing needed
return genToCoordinates(builder, loc, a, 1, /*cooStart=*/0);		return genToCoordinates(builder, loc, a, 1, /*cooStart=*/isCOO ? 0 : 2);
}		}

/// Generates the sparse matrix multiplication.		/// Generates the sparse matrix multiplication.
static Operation *genSpMat(OpBuilder &builder, Location loc, Type handleTp,		static Operation *genSpMat(OpBuilder &builder, Location loc, Type handleTp,
Type tokenTp, Value token, Value szY, Value szX,		Type tokenTp, Value token, Value sz1, Value sz2,
Value nseA, Value rowA, Value colA, Value valA,		Value nseA, Value rowA, Value colA, Value valA,
bool isCOO, bool enableRT) {		bool isCOO, bool enableRT) {
if (isCOO) {		if (isCOO) {
// Library uses SoA COO, direct IR uses AoS COO.		// Library uses SoA COO, direct IR uses AoS COO.
if (enableRT)		if (enableRT) {
		assert(colA);
		K-WuUnsubmitted Not Done Reply Inline Actions Can we do a colA assertion at the beginning of the function instead of doing it twice? K-Wu: Can we do a colA assertion at the beginning of the function instead of doing it twice?
		aartbikAuthorUnsubmitted Done Reply Inline Actions Ah, in this case we could but.... if we implement the "unreachable" part (which I have in my local workspace with a COO_AOS) then you have a path without col coming in. So it looks a bit redundant now, but think of "unreachable" testing for !col at least ;-) aartbik: Ah, in this case we could but.... if we implement the "unreachable" part (which I have in my…
		K-WuUnsubmitted Done Reply Inline Actions okay that makes sense. Thank you. K-Wu: okay that makes sense. Thank you.
return builder.create<gpu::CreateCooOp>(loc, handleTp, tokenTp, token,		return builder.create<gpu::CreateCooOp>(loc, handleTp, tokenTp, token,
szY, szX, nseA, rowA, colA, valA);		sz1, sz2, nseA, rowA, colA, valA);
		}
llvm_unreachable("gpu::CreateCooAoSOp is deprecated");		llvm_unreachable("gpu::CreateCooAoSOp is deprecated");
}		}
return builder.create<gpu::CreateCsrOp>(loc, handleTp, tokenTp, token, szY,		assert(colA);
szX, nseA, rowA, colA, valA);		return builder.create<gpu::CreateCsrOp>(loc, handleTp, tokenTp, token, sz1,
		sz2, nseA, rowA, colA, valA);
}		}

/// Match and rewrite SpMV kernel.		/// Match and rewrite SpMV kernel.
static LogicalResult rewriteSpMV(PatternRewriter &rewriter,		static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
linalg::GenericOp op, bool enableRT) {		linalg::GenericOp op, bool enableRT) {
Location loc = op.getLoc();		Location loc = op.getLoc();
Value a = op.getOperand(0);		Value a = op.getOperand(0);
Value x = op.getOperand(1);		Value x = op.getOperand(1);
▲ Show 20 Lines • Show All 71 Lines • ▼ Show 20 Lines	static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)		token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)
.getAsyncToken();		.getAsyncToken();
token = rewriter.create<gpu::DestroyDnVecOp>(loc, tokenTp, token, dnX)		token = rewriter.create<gpu::DestroyDnVecOp>(loc, tokenTp, token, dnX)
.getAsyncToken();		.getAsyncToken();
token = rewriter.create<gpu::DestroyDnVecOp>(loc, tokenTp, token, dnY)		token = rewriter.create<gpu::DestroyDnVecOp>(loc, tokenTp, token, dnY)
.getAsyncToken();		.getAsyncToken();
token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)		token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
.getAsyncToken();		.getAsyncToken();
tokens.push_back(token);
genBlockingWait(rewriter, loc, tokens);
tokens.clear();
token = genFirstWait(rewriter, loc);
token = genCopyMemRef(rewriter, loc, memY, vecY, token);
token = genDeallocMemRef(rewriter, loc, rowA, token);		token = genDeallocMemRef(rewriter, loc, rowA, token);
if (colA)		if (colA)
token = genDeallocMemRef(rewriter, loc, colA, token);		token = genDeallocMemRef(rewriter, loc, colA, token);
token = genDeallocMemRef(rewriter, loc, valA, token);		token = genDeallocMemRef(rewriter, loc, valA, token);
token = genDeallocMemRef(rewriter, loc, buffer, token);		token = genDeallocMemRef(rewriter, loc, buffer, token);
token = genDeallocMemRef(rewriter, loc, vecX, token);		token = genDeallocMemRef(rewriter, loc, vecX, token);
		token = genCopyMemRef(rewriter, loc, memY, vecY, token);
token = genDeallocMemRef(rewriter, loc, vecY, token);		token = genDeallocMemRef(rewriter, loc, vecY, token);
tokens.push_back(token);		tokens.push_back(token);
genBlockingWait(rewriter, loc, tokens);		genBlockingWait(rewriter, loc, tokens);
tokens.clear();		tokens.clear();

// Done.		// Done.
rewriter.replaceOp(op, op.getDpsInitOperand(0)->get());		rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, memY);
return success();		return success();
}		}

/// Match and rewrite SpMM kernel.		/// Match and rewrite SpMM kernel.
static LogicalResult rewriteSpMM(PatternRewriter &rewriter,		static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
linalg::GenericOp op, bool enableRT) {		linalg::GenericOp op, bool enableRT) {
Location loc = op.getLoc();		Location loc = op.getLoc();
Value a = op.getOperand(0);		Value a = op.getOperand(0);
▲ Show 20 Lines • Show All 73 Lines • ▼ Show 20 Lines	static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)		token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)
.getAsyncToken();		.getAsyncToken();
token = rewriter.create<gpu::DestroyDnMatOp>(loc, tokenTp, token, dnB)		token = rewriter.create<gpu::DestroyDnMatOp>(loc, tokenTp, token, dnB)
.getAsyncToken();		.getAsyncToken();
token = rewriter.create<gpu::DestroyDnMatOp>(loc, tokenTp, token, dnC)		token = rewriter.create<gpu::DestroyDnMatOp>(loc, tokenTp, token, dnC)
.getAsyncToken();		.getAsyncToken();
token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)		token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
.getAsyncToken();		.getAsyncToken();
tokens.push_back(token);
genBlockingWait(rewriter, loc, tokens);
tokens.clear();
token = genFirstWait(rewriter, loc);
token = genCopyMemRef(rewriter, loc, bufC, matC, token);
token = genDeallocMemRef(rewriter, loc, rowA, token);		token = genDeallocMemRef(rewriter, loc, rowA, token);
if (colA)		if (colA)
token = genDeallocMemRef(rewriter, loc, colA, token);		token = genDeallocMemRef(rewriter, loc, colA, token);
token = genDeallocMemRef(rewriter, loc, valA, token);		token = genDeallocMemRef(rewriter, loc, valA, token);
token = genDeallocMemRef(rewriter, loc, buffer, token);		token = genDeallocMemRef(rewriter, loc, buffer, token);
token = genDeallocMemRef(rewriter, loc, matB, token);		token = genDeallocMemRef(rewriter, loc, matB, token);
		token = genCopyMemRef(rewriter, loc, bufC, matC, token);
token = genDeallocMemRef(rewriter, loc, matC, token);		token = genDeallocMemRef(rewriter, loc, matC, token);
tokens.push_back(token);		tokens.push_back(token);
genBlockingWait(rewriter, loc, tokens);		genBlockingWait(rewriter, loc, tokens);
tokens.clear();		tokens.clear();

// Done.		// Done.
rewriter.replaceOp(op, op.getDpsInitOperand(0)->get());		rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, matC);
return success();		return success();
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Rewriting rules for direct code generation.		// Rewriting rules for direct code generation.
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

/// Proof-of-concept rewriter. This rule generates a GPU implementation		/// Proof-of-concept rewriter. This rule generates a GPU implementation
▲ Show 20 Lines • Show All 171 Lines • Show Last 20 Lines

mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp

Show First 20 Lines • Show All 242 Lines • ▼ Show 20 Lines	case 32:
return CUSPARSE_INDEX_32I;		return CUSPARSE_INDEX_32I;
default:		default:
return CUSPARSE_INDEX_64I;		return CUSPARSE_INDEX_64I;
}		}
}		}

// Some macro magic to get float/double alpha and beta on host.		// Some macro magic to get float/double alpha and beta on host.
#define ALPHABETA(w, alpha, beta) \		#define ALPHABETA(w, alpha, beta) \
float(alpha##f) = 1.0, (beta##f) = 1.0; \		float(alpha##f) = 1.0f; \
double(alpha##d) = 1.0, (beta##d) = 1.0; \		float(beta##f) = 1.0f; \
void *(alpha##p), *(beta##p); \		double(alpha##d) = 1.0; \
		double(beta##d) = 1.0; \
		const void *(alpha##p) = nullptr; \
		const void *(beta##p) = nullptr; \
if ((w) == 32) { \		if ((w) == 32) { \
(alpha##p) = reinterpret_cast<void *>(&(alpha##f)); \		(alpha##p) = reinterpret_cast<void *>(&(alpha##f)); \
(beta##p) = reinterpret_cast<void *>(&(beta##f)); \		(beta##p) = reinterpret_cast<void *>(&(beta##f)); \
} else { \		} else { \
(alpha##p) = reinterpret_cast<void *>(&(alpha##d)); \		(alpha##p) = reinterpret_cast<void *>(&(alpha##d)); \
(beta##p) = reinterpret_cast<void *>(&(beta##d)); \		(beta##p) = reinterpret_cast<void *>(&(beta##d)); \
}		}

▲ Show 20 Lines • Show All 137 Lines • Show Last 20 Lines

mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir

	// RUN: mlir-opt %s --linalg-generalize-named-ops \			// RUN: mlir-opt %s --linalg-generalize-named-ops \
	// RUN: --sparsification="enable-gpu-libgen" \| FileCheck %s			// RUN: --sparsification="enable-gpu-libgen" \| FileCheck %s

	#CSR = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>			#CSR = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>

	//			//
	// Compute matrix matrix C = AB			// Compute matrix matrix C = AB
	//			//
	// CHECK-LABEL: func.func @matmul(			// CHECK-LABEL: func.func @matmul(
	// CHECK-SAME: %[[VAL_0:.]]: tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>>,			// CHECK-SAME: %[[VAL_0:.]]: tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>>,
	// CHECK-SAME: %[[VAL_1:.*]]: tensor<?x?xf64>,			// CHECK-SAME: %[[VAL_1:.*]]: tensor<?x?xf64>,
	// CHECK-SAME: %[[VAL_2:.*]]: tensor<?x?xf64>) -> tensor<?x?xf64> {			// CHECK-SAME: %[[VAL_2:.*]]: tensor<?x?xf64>) -> tensor<?x?xf64> {
	// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index			// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index
	// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index			// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index
	// CHECK-DAG: %[[VAL_5:.]] = sparse_tensor.number_of_entries %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>>			// CHECK-DAG: %[[VAL_5:.]] = sparse_tensor.number_of_entries %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>>
	// CHECK-DAG: %[[VAL_6:.]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>>			// CHECK-DAG: %[[VAL_6:.]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>>
	// CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor<?x?xf64>			// CHECK-DAG: %[[VAL_7:.]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>>
	// CHECK-DAG: %[[VAL_8:.]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>>			// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor<?x?xf64>
	// CHECK-DAG: %[[VAL_9:.]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>> to memref<?xindex>			// CHECK-DAG: %[[VAL_9:.]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>> to memref<?xindex>
	// CHECK-DAG: %[[VAL_10:.]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>> to memref<?xindex, strided<[?], offset: ?>>			// CHECK-DAG: %[[VAL_10:.]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>> to memref<?xindex>
	// CHECK: %[[VAL_11:.]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>> to memref<?xf64>			// CHECK-DAG: %[[VAL_11:.]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>> to memref<?xf64>
	// CHECK: %[[VAL_12:.*]] = gpu.wait async			// CHECK: %[[VAL_12:.*]] = gpu.wait async
	// CHECK: %[[VAL_13:.*]] = memref.dim %[[VAL_9]], %[[VAL_3]] : memref<?xindex>			// CHECK: %[[VAL_13:.*]] = memref.dim %[[VAL_9]], %[[VAL_3]] : memref<?xindex>
	// CHECK: %[[VAL_14:.]], %[[VAL_15:.]] = gpu.alloc async {{\[}}%[[VAL_12]]] (%[[VAL_13]]) : memref<?xindex>			// CHECK: %[[VAL_14:.]], %[[VAL_15:.]] = gpu.alloc async {{\[}}%[[VAL_12]]] (%[[VAL_13]]) : memref<?xindex>
	// CHECK: %[[VAL_16:.*]] = gpu.memcpy async {{\[}}%[[VAL_15]]] %[[VAL_14]], %[[VAL_9]] : memref<?xindex>, memref<?xindex>			// CHECK: %[[VAL_16:.*]] = gpu.memcpy async {{\[}}%[[VAL_15]]] %[[VAL_14]], %[[VAL_9]] : memref<?xindex>, memref<?xindex>
	// CHECK: %[[VAL_17:.*]] = gpu.wait async			// CHECK: %[[VAL_17:.*]] = gpu.wait async
	// CHECK: %[[VAL_18:.*]] = memref.dim %[[VAL_10]], %[[VAL_3]] : memref<?xindex, strided<[?], offset: ?>>			// CHECK: %[[VAL_18:.*]] = memref.dim %[[VAL_10]], %[[VAL_3]] : memref<?xindex>
	// CHECK: %[[VAL_19:.]], %[[VAL_20:.]] = gpu.alloc async {{\[}}%[[VAL_17]]] (%[[VAL_18]]) : memref<?xindex>			// CHECK: %[[VAL_19:.]], %[[VAL_20:.]] = gpu.alloc async {{\[}}%[[VAL_17]]] (%[[VAL_18]]) : memref<?xindex>
	// CHECK: %[[VAL_21:.*]] = gpu.memcpy async {{\[}}%[[VAL_20]]] %[[VAL_19]], %[[VAL_10]] : memref<?xindex>, memref<?xindex, strided<[?], offset: ?>>			// CHECK: %[[VAL_21:.*]] = gpu.memcpy async {{\[}}%[[VAL_20]]] %[[VAL_19]], %[[VAL_10]] : memref<?xindex>, memref<?xindex>
	// CHECK: %[[VAL_22:.*]] = gpu.wait async			// CHECK: %[[VAL_22:.*]] = gpu.wait async
	// CHECK: %[[VAL_23:.*]] = memref.dim %[[VAL_11]], %[[VAL_3]] : memref<?xf64>			// CHECK: %[[VAL_23:.*]] = memref.dim %[[VAL_11]], %[[VAL_3]] : memref<?xf64>
	// CHECK: %[[VAL_24:.]], %[[VAL_25:.]] = gpu.alloc async {{\[}}%[[VAL_22]]] (%[[VAL_23]]) : memref<?xf64>			// CHECK: %[[VAL_24:.]], %[[VAL_25:.]] = gpu.alloc async {{\[}}%[[VAL_22]]] (%[[VAL_23]]) : memref<?xf64>
	// CHECK: %[[VAL_26:.*]] = gpu.memcpy async {{\[}}%[[VAL_25]]] %[[VAL_24]], %[[VAL_11]] : memref<?xf64>, memref<?xf64>			// CHECK: %[[VAL_26:.*]] = gpu.memcpy async {{\[}}%[[VAL_25]]] %[[VAL_24]], %[[VAL_11]] : memref<?xf64>, memref<?xf64>
	// CHECK: %[[VAL_27:.*]] = bufferization.to_memref %[[VAL_1]] : memref<?x?xf64>			// CHECK: %[[VAL_27:.*]] = bufferization.to_memref %[[VAL_1]] : memref<?x?xf64>
	// CHECK: %[[VAL_28:.*]] = gpu.wait async			// CHECK: %[[VAL_28:.*]] = gpu.wait async
	// CHECK: %[[VAL_29:.*]] = memref.dim %[[VAL_27]], %[[VAL_3]] : memref<?x?xf64>			// CHECK: %[[VAL_29:.*]] = memref.dim %[[VAL_27]], %[[VAL_3]] : memref<?x?xf64>
	// CHECK: %[[VAL_30:.*]] = memref.dim %[[VAL_27]], %[[VAL_4]] : memref<?x?xf64>			// CHECK: %[[VAL_30:.*]] = memref.dim %[[VAL_27]], %[[VAL_4]] : memref<?x?xf64>
	// CHECK: %[[VAL_31:.]], %[[VAL_32:.]] = gpu.alloc async {{\[}}%[[VAL_28]]] (%[[VAL_29]], %[[VAL_30]]) : memref<?x?xf64>			// CHECK: %[[VAL_31:.]], %[[VAL_32:.]] = gpu.alloc async {{\[}}%[[VAL_28]]] (%[[VAL_29]], %[[VAL_30]]) : memref<?x?xf64>
	// CHECK: %[[VAL_33:.*]] = gpu.memcpy async {{\[}}%[[VAL_32]]] %[[VAL_31]], %[[VAL_27]] : memref<?x?xf64>, memref<?x?xf64>			// CHECK: %[[VAL_33:.*]] = gpu.memcpy async {{\[}}%[[VAL_32]]] %[[VAL_31]], %[[VAL_27]] : memref<?x?xf64>, memref<?x?xf64>
	// CHECK: %[[VAL_34:.*]] = bufferization.to_memref %[[VAL_2]] : memref<?x?xf64>			// CHECK: %[[VAL_34:.*]] = bufferization.to_memref %[[VAL_2]] : memref<?x?xf64>
	// CHECK: %[[VAL_35:.*]] = gpu.wait async			// CHECK: %[[VAL_35:.*]] = gpu.wait async
	// CHECK: %[[VAL_36:.*]] = memref.dim %[[VAL_34]], %[[VAL_3]] : memref<?x?xf64>			// CHECK: %[[VAL_36:.*]] = memref.dim %[[VAL_34]], %[[VAL_3]] : memref<?x?xf64>
	// CHECK: %[[VAL_37:.*]] = memref.dim %[[VAL_34]], %[[VAL_4]] : memref<?x?xf64>			// CHECK: %[[VAL_37:.*]] = memref.dim %[[VAL_34]], %[[VAL_4]] : memref<?x?xf64>
	// CHECK: %[[VAL_38:.]], %[[VAL_39:.]] = gpu.alloc async {{\[}}%[[VAL_35]]] (%[[VAL_36]], %[[VAL_37]]) : memref<?x?xf64>			// CHECK: %[[VAL_38:.]], %[[VAL_39:.]] = gpu.alloc async {{\[}}%[[VAL_35]]] (%[[VAL_36]], %[[VAL_37]]) : memref<?x?xf64>
	// CHECK: %[[VAL_40:.*]] = gpu.memcpy async {{\[}}%[[VAL_39]]] %[[VAL_38]], %[[VAL_34]] : memref<?x?xf64>, memref<?x?xf64>			// CHECK: %[[VAL_40:.*]] = gpu.memcpy async {{\[}}%[[VAL_39]]] %[[VAL_38]], %[[VAL_34]] : memref<?x?xf64>, memref<?x?xf64>
	// CHECK: gpu.wait {{\[}}%[[VAL_16]], %[[VAL_21]], %[[VAL_26]], %[[VAL_33]], %[[VAL_40]]]			// CHECK: gpu.wait {{\[}}%[[VAL_16]], %[[VAL_21]], %[[VAL_26]], %[[VAL_33]], %[[VAL_40]]]
	// CHECK: %[[VAL_41:.*]] = gpu.wait async			// CHECK: %[[VAL_41:.*]] = gpu.wait async
	// CHECK: %[[VAL_42:.]], %[[VAL_43:.]] = gpu.create_sparse_env async {{\[}}%[[VAL_41]]]			// CHECK: %[[VAL_42:.]], %[[VAL_43:.]] = gpu.create_sparse_env async {{\[}}%[[VAL_41]]]
	// CHECK: %[[VAL_44:.]], %[[VAL_45:.]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_8]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref<?xindex>, memref<?xindex>, memref<?xf64>			// CHECK: %[[VAL_44:.]], %[[VAL_45:.]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
	// CHECK: %[[VAL_46:.]], %[[VAL_47:.]] = gpu.create_dn_mat async {{\[}}%[[VAL_45]]] %[[VAL_8]], %[[VAL_7]], %[[VAL_31]] : memref<?x?xf64>			// CHECK: %[[VAL_46:.]], %[[VAL_47:.]] = gpu.create_dn_mat async {{\[}}%[[VAL_45]]] %[[VAL_7]], %[[VAL_8]], %[[VAL_31]] : memref<?x?xf64>
	// CHECK: %[[VAL_48:.]], %[[VAL_49:.]] = gpu.create_dn_mat async {{\[}}%[[VAL_47]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_38]] : memref<?x?xf64>			// CHECK: %[[VAL_48:.]], %[[VAL_49:.]] = gpu.create_dn_mat async {{\[}}%[[VAL_47]]] %[[VAL_6]], %[[VAL_8]], %[[VAL_38]] : memref<?x?xf64>
	// CHECK: %[[VAL_50:.]], %[[VAL_51:.]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]]			// CHECK: %[[VAL_50:.]], %[[VAL_51:.]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]]
	// CHECK: %[[VAL_52:.]], %[[VAL_53:.]] = gpu.alloc async {{\[}}%[[VAL_51]]] (%[[VAL_50]]) : memref<?xi8>			// CHECK: %[[VAL_52:.]], %[[VAL_53:.]] = gpu.alloc async {{\[}}%[[VAL_51]]] (%[[VAL_50]]) : memref<?xi8>
	// CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref<?xi8>			// CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref<?xi8>
	// CHECK: %[[VAL_55:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_54]]] %[[VAL_44]]			// CHECK: %[[VAL_55:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_54]]] %[[VAL_44]]
	// CHECK: %[[VAL_56:.*]] = gpu.destroy_dn_mat async {{\[}}%[[VAL_55]]] %[[VAL_46]]			// CHECK: %[[VAL_56:.*]] = gpu.destroy_dn_mat async {{\[}}%[[VAL_55]]] %[[VAL_46]]
	// CHECK: %[[VAL_57:.*]] = gpu.destroy_dn_mat async {{\[}}%[[VAL_56]]] %[[VAL_48]]			// CHECK: %[[VAL_57:.*]] = gpu.destroy_dn_mat async {{\[}}%[[VAL_56]]] %[[VAL_48]]
	// CHECK: %[[VAL_58:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_57]]] %[[VAL_42]]			// CHECK: %[[VAL_58:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_57]]] %[[VAL_42]]
	// CHECK: gpu.wait {{\[}}%[[VAL_58]]]			// CHECK: %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_14]] : memref<?xindex>
	// CHECK: %[[VAL_59:.*]] = gpu.wait async			// CHECK: %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_59]]] %[[VAL_19]] : memref<?xindex>
	// CHECK: %[[VAL_60:.*]] = gpu.memcpy async {{\[}}%[[VAL_59]]] %[[VAL_34]], %[[VAL_38]] : memref<?x?xf64>, memref<?x?xf64>			// CHECK: %[[VAL_61:.*]] = gpu.dealloc async {{\[}}%[[VAL_60]]] %[[VAL_24]] : memref<?xf64>
	// CHECK: %[[VAL_61:.*]] = gpu.dealloc async {{\[}}%[[VAL_60]]] %[[VAL_14]] : memref<?xindex>			// CHECK: %[[VAL_62:.*]] = gpu.dealloc async {{\[}}%[[VAL_61]]] %[[VAL_52]] : memref<?xi8>
	// CHECK: %[[VAL_62:.*]] = gpu.dealloc async {{\[}}%[[VAL_61]]] %[[VAL_19]] : memref<?xindex>			// CHECK: %[[VAL_63:.*]] = gpu.dealloc async {{\[}}%[[VAL_62]]] %[[VAL_31]] : memref<?x?xf64>
	// CHECK: %[[VAL_63:.*]] = gpu.dealloc async {{\[}}%[[VAL_62]]] %[[VAL_24]] : memref<?xf64>			// CHECK: %[[VAL_64:.*]] = gpu.memcpy async {{\[}}%[[VAL_63]]] %[[VAL_34]], %[[VAL_38]] : memref<?x?xf64>, memref<?x?xf64>
	// CHECK: %[[VAL_64:.*]] = gpu.dealloc async {{\[}}%[[VAL_63]]] %[[VAL_52]] : memref<?xi8>			// CHECK: %[[VAL_65:.*]] = gpu.dealloc async {{\[}}%[[VAL_64]]] %[[VAL_38]] : memref<?x?xf64>
	// CHECK: %[[VAL_65:.*]] = gpu.dealloc async {{\[}}%[[VAL_64]]] %[[VAL_31]] : memref<?x?xf64>			// CHECK: gpu.wait {{\[}}%[[VAL_65]]]
	// CHECK: %[[VAL_66:.*]] = gpu.dealloc async {{\[}}%[[VAL_65]]] %[[VAL_38]] : memref<?x?xf64>			// CHECK: %[[VAL_66:.*]] = bufferization.to_tensor %[[VAL_38]] : memref<?x?xf64>
	// CHECK: gpu.wait {{\[}}%[[VAL_66]]]			// CHECK: return %[[VAL_66]] : tensor<?x?xf64>
	// CHECK: return %[[VAL_2]] : tensor<?x?xf64>
	// CHECK: }			// CHECK: }
	func.func @matmul(%A: tensor<?x?xf64, #CSR>, %B: tensor<?x?xf64>, %C_in: tensor<?x?xf64>) -> tensor<?x?xf64> {			func.func @matmul(%A: tensor<?x?xf64, #CSR>, %B: tensor<?x?xf64>, %C_in: tensor<?x?xf64>) -> tensor<?x?xf64> {
	%C_out = linalg.matmul			%C_out = linalg.matmul
	ins(%A, %B: tensor<?x?xf64, #CSR>, tensor<?x?xf64>)			ins(%A, %B: tensor<?x?xf64, #CSR>, tensor<?x?xf64>)
	outs(%C_in: tensor<?x?xf64>) -> tensor<?x?xf64>			outs(%C_in: tensor<?x?xf64>) -> tensor<?x?xf64>
	return %C_out : tensor<?x?xf64>			return %C_out : tensor<?x?xf64>
	}			}

mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir

	Show All 10 Lines
	// CHECK-SAME: %[[VAL_0:.]]: tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>>,			// CHECK-SAME: %[[VAL_0:.]]: tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>>,
	// CHECK-SAME: %[[VAL_1:.*]]: tensor<?xf64>,			// CHECK-SAME: %[[VAL_1:.*]]: tensor<?xf64>,
	// CHECK-SAME: %[[VAL_2:.*]]: tensor<?xf64>) -> tensor<?xf64> {			// CHECK-SAME: %[[VAL_2:.*]]: tensor<?xf64>) -> tensor<?xf64> {
	// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index			// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index
	// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index			// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index
	// CHECK-DAG: %[[VAL_5:.]] = sparse_tensor.number_of_entries %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>>			// CHECK-DAG: %[[VAL_5:.]] = sparse_tensor.number_of_entries %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>>
	// CHECK-DAG: %[[VAL_6:.]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>>			// CHECK-DAG: %[[VAL_6:.]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>>
	// CHECK-DAG: %[[VAL_7:.]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>>			// CHECK-DAG: %[[VAL_7:.]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>>
	// CHECK-DAG: %[[VAL_8:.]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>>			// CHECK-DAG: %[[VAL_8:.]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>> to memref<?xindex, strided<[?], offset: ?>>
	// CHECK-DAG: %[[VAL_9:.]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>>			// CHECK-DAG: %[[VAL_9:.]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>> to memref<?xindex, strided<[?], offset: ?>>
	// CHECK-DAG: %[[VAL_10:.]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>>			// CHECK-DAG: %[[VAL_10:.]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.}}}>> to memref<?xf64>
	// CHECK: %[[VAL_11:.*]] = gpu.wait async			// CHECK: %[[VAL_11:.*]] = gpu.wait async
	// CHECK: %[[VAL_12:.*]] = memref.dim %[[VAL_8]], %[[VAL_3]] : memref<?xindex, strided<[?], offset: ?>>			// CHECK: %[[VAL_12:.*]] = memref.dim %[[VAL_8]], %[[VAL_3]] : memref<?xindex, strided<[?], offset: ?>>
	// CHECK: %[[VAL_13:.]], %[[VAL_14:.]] = gpu.alloc async {{\[}}%[[VAL_11]]] (%[[VAL_12]]) : memref<?xindex>			// CHECK: %[[VAL_13:.]], %[[VAL_14:.]] = gpu.alloc async {{\[}}%[[VAL_11]]] (%[[VAL_12]]) : memref<?xindex>
	// CHECK: %[[VAL_15:.*]] = gpu.memcpy async {{\[}}%[[VAL_14]]] %[[VAL_13]], %[[VAL_8]] : memref<?xindex>, memref<?xindex, strided<[?], offset: ?>>			// CHECK: %[[VAL_15:.*]] = gpu.memcpy async {{\[}}%[[VAL_14]]] %[[VAL_13]], %[[VAL_8]] : memref<?xindex>, memref<?xindex, strided<[?], offset: ?>>
	// CHECK: %[[VAL_16:.*]] = gpu.wait async			// CHECK: %[[VAL_16:.*]] = gpu.wait async
	// CHECK: %[[VAL_17:.*]] = memref.dim %[[VAL_9]], %[[VAL_3]] : memref<?xindex, strided<[?], offset: ?>>			// CHECK: %[[VAL_17:.*]] = memref.dim %[[VAL_9]], %[[VAL_3]] : memref<?xindex, strided<[?], offset: ?>>
	// CHECK: %[[VAL_18:.]], %[[VAL_19:.]] = gpu.alloc async {{\[}}%[[VAL_16]]] (%[[VAL_17]]) : memref<?xindex>			// CHECK: %[[VAL_18:.]], %[[VAL_19:.]] = gpu.alloc async {{\[}}%[[VAL_16]]] (%[[VAL_17]]) : memref<?xindex>
	// CHECK: %[[VAL_20:.*]] = gpu.memcpy async {{\[}}%[[VAL_19]]] %[[VAL_18]], %[[VAL_9]] : memref<?xindex>, memref<?xindex, strided<[?], offset: ?>>			// CHECK: %[[VAL_20:.*]] = gpu.memcpy async {{\[}}%[[VAL_19]]] %[[VAL_18]], %[[VAL_9]] : memref<?xindex>, memref<?xindex, strided<[?], offset: ?>>
	Show All 19 Lines
	// CHECK: %[[VAL_45:.]], %[[VAL_46:.]] = gpu.create_dn_vec async {{\[}}%[[VAL_44]]] %[[VAL_35]], %[[VAL_6]] : memref<?xf64>			// CHECK: %[[VAL_45:.]], %[[VAL_46:.]] = gpu.create_dn_vec async {{\[}}%[[VAL_44]]] %[[VAL_35]], %[[VAL_6]] : memref<?xf64>
	// CHECK: %[[VAL_47:.]], %[[VAL_48:.]] = gpu.spmv_buffer_size async {{\[}}%[[VAL_46]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]]			// CHECK: %[[VAL_47:.]], %[[VAL_48:.]] = gpu.spmv_buffer_size async {{\[}}%[[VAL_46]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]]
	// CHECK: %[[VAL_49:.]], %[[VAL_50:.]] = gpu.alloc async {{\[}}%[[VAL_48]]] (%[[VAL_47]]) : memref<?xi8>			// CHECK: %[[VAL_49:.]], %[[VAL_50:.]] = gpu.alloc async {{\[}}%[[VAL_48]]] (%[[VAL_47]]) : memref<?xi8>
	// CHECK: %[[VAL_51:.*]] = gpu.spmv async {{\[}}%[[VAL_50]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]], %[[VAL_49]] : memref<?xi8>			// CHECK: %[[VAL_51:.*]] = gpu.spmv async {{\[}}%[[VAL_50]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]], %[[VAL_49]] : memref<?xi8>
	// CHECK: %[[VAL_52:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_51]]] %[[VAL_41]]			// CHECK: %[[VAL_52:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_51]]] %[[VAL_41]]
	// CHECK: %[[VAL_53:.*]] = gpu.destroy_dn_vec async {{\[}}%[[VAL_52]]] %[[VAL_43]]			// CHECK: %[[VAL_53:.*]] = gpu.destroy_dn_vec async {{\[}}%[[VAL_52]]] %[[VAL_43]]
	// CHECK: %[[VAL_54:.*]] = gpu.destroy_dn_vec async {{\[}}%[[VAL_53]]] %[[VAL_45]]			// CHECK: %[[VAL_54:.*]] = gpu.destroy_dn_vec async {{\[}}%[[VAL_53]]] %[[VAL_45]]
	// CHECK: %[[VAL_55:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_54]]] %[[VAL_39]]			// CHECK: %[[VAL_55:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_54]]] %[[VAL_39]]
	// CHECK: gpu.wait {{\[}}%[[VAL_55]]]			// CHECK: %[[VAL_56:.*]] = gpu.dealloc async {{\[}}%[[VAL_55]]] %[[VAL_13]] : memref<?xindex>
	// CHECK: %[[VAL_56:.*]] = gpu.wait async			// CHECK: %[[VAL_57:.*]] = gpu.dealloc async {{\[}}%[[VAL_56]]] %[[VAL_18]] : memref<?xindex>
	// CHECK: %[[VAL_57:.*]] = gpu.memcpy async {{\[}}%[[VAL_56]]] %[[VAL_32]], %[[VAL_35]] : memref<?xf64>, memref<?xf64>			// CHECK: %[[VAL_58:.*]] = gpu.dealloc async {{\[}}%[[VAL_57]]] %[[VAL_23]] : memref<?xf64>
	// CHECK: %[[VAL_58:.*]] = gpu.dealloc async {{\[}}%[[VAL_57]]] %[[VAL_13]] : memref<?xindex>			// CHECK: %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_49]] : memref<?xi8>
	// CHECK: %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_18]] : memref<?xindex>			// CHECK: %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_59]]] %[[VAL_29]] : memref<?xf64>
	// CHECK: %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_59]]] %[[VAL_23]] : memref<?xf64>			// CHECK: %[[VAL_61:.*]] = gpu.memcpy async {{\[}}%[[VAL_60]]] %[[VAL_32]], %[[VAL_35]] : memref<?xf64>, memref<?xf64>
	// CHECK: %[[VAL_61:.*]] = gpu.dealloc async {{\[}}%[[VAL_60]]] %[[VAL_49]] : memref<?xi8>			// CHECK: %[[VAL_62:.*]] = gpu.dealloc async {{\[}}%[[VAL_61]]] %[[VAL_35]] : memref<?xf64>
	// CHECK: %[[VAL_62:.*]] = gpu.dealloc async {{\[}}%[[VAL_61]]] %[[VAL_29]] : memref<?xf64>			// CHECK: gpu.wait {{\[}}%[[VAL_62]]]
	// CHECK: %[[VAL_63:.*]] = gpu.dealloc async {{\[}}%[[VAL_62]]] %[[VAL_35]] : memref<?xf64>			// CHECK: %[[VAL_63:.*]] = bufferization.to_tensor %[[VAL_32]] : memref<?xf64>
	// CHECK: gpu.wait {{\[}}%[[VAL_63]]]			// CHECK: return %[[VAL_63]] : tensor<?xf64>
	// CHECK: return %[[VAL_2]] : tensor<?xf64>
	// CHECK: }			// CHECK: }
	func.func @matvec(%A: tensor<?x?xf64, #SortedCOO>,			func.func @matvec(%A: tensor<?x?xf64, #SortedCOO>,
	%x: tensor<?xf64>,			%x: tensor<?xf64>,
	%y_in: tensor<?xf64>) -> tensor<?xf64> {			%y_in: tensor<?xf64>) -> tensor<?xf64> {
	%y_out = linalg.matvec			%y_out = linalg.matvec
	ins(%A, %x: tensor<?x?xf64, #SortedCOO>, tensor<?xf64>)			ins(%A, %x: tensor<?x?xf64, #SortedCOO>, tensor<?xf64>)
	outs(%y_in: tensor<?xf64>) -> tensor<?xf64>			outs(%y_in: tensor<?xf64>) -> tensor<?xf64>
	return %y_out : tensor<?xf64>			return %y_out : tensor<?xf64>
	}			}

	}			}