This is an archive of the discontinued LLVM Phabricator instance.

The only real concern I have (as mentioned in chat) is all the boilerplate of explicitly threading the async tokens through. There're some well-known design patterns for cleaning that up, and in the long-term I definitely think we should implement that (and upstream it so that other users of the gpu dialect can reuse it); but I don't think that should block the current CL

mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
524	Nit: why not "nseA"?
525–527	Nit: personally I think it'd be cleaner to have these in the order `szm;szk;szn` so that the dimops on `a` are together
553	This is more of a long-term nit, rather than something that needs to be fixed in the current CL, but: I mislike the asymmetry of not using `getAsyncToken` here. Of course, the only way to fix that would be to adjust `genSpMat` to return `gpu::AsyncOpInterface` in lieu of `Operation*`. Which is doable since both `gpu::Create{Coo,Csr}Op` support that interface; however, I'm not sure if that change would cause other problems for the various `genSpMat` callsites.

Thanks Wren. And yes, the codedup will be cleaned up somehow, but I did not want to optimize too early, before we have a better idea what direction we are going.
But as we recognize the other cuSparse methods, most of this should be moved into more general utils indeed.

mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
524	Yeah, as you know I prefer the nse too over nnz, but cuSparse follows the nnz convention. But since that is not visible here, I agree using nse is better
525–527	made it so
553	Yeah, agreed there is some redesign lingering there

rebased with main, addressed comments

Harbormaster completed remote builds in B232909: Diff 523430. May 18 2023, 10:36 AM

aartbik added inline comments. May 18 2023, 11:11 AM

mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
5	note to self: rebase with the renaming in main when ready

rebased with main, and renamed lvlTypes

Harbormaster completed remote builds in B233287: Diff 523915. May 19 2023, 3:32 PM

Peiming accepted this revision. May 19 2023, 5:21 PM

This revision is now accepted and ready to land. May 19 2023, 5:21 PM

Closed by commit rGb75d6a40f15e: [mlir][sparse][gpu] recognize SpMM cuSparse during sparsification (authored by aartbik). · Explain Why · May 19 2023, 5:23 PM

This revision was automatically updated to reflect the committed changes.

aartbik added a commit: rGb75d6a40f15e: [mlir][sparse][gpu] recognize SpMM cuSparse during sparsification.

Revision Contents

Path

Size

mlir/

lib/

Dialect/

SparseTensor/

Transforms/

SparseGPUCodegen.cpp

139 lines

test/

Dialect/

SparseTensor/

GPU/

gpu_matmul_lib.mlir

76 lines

Diff 523974

mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp

Show First 20 Lines • Show All 341 Lines • ▼ Show 20 Lines
static bool isAdmissibleCSR(SparseTensorType &aTp) {		static bool isAdmissibleCSR(SparseTensorType &aTp) {
return aTp.isDenseLvl(0) && aTp.isCompressedLvl(1) && aTp.isOrderedLvl(1) &&		return aTp.isDenseLvl(0) && aTp.isCompressedLvl(1) && aTp.isOrderedLvl(1) &&
aTp.isUniqueLvl(1) &&		aTp.isUniqueLvl(1) &&
(aTp.getElementType().isF64() || aTp.getElementType().isF32()) &&		(aTp.getElementType().isF64() || aTp.getElementType().isF32()) &&
(aTp.getCrdWidth() == 0 || aTp.getCrdWidth() == 32 ||		(aTp.getCrdWidth() == 0 || aTp.getCrdWidth() == 32 ||
aTp.getCrdWidth() == 64);		aTp.getCrdWidth() == 64);
}		}

		/// Test for admissible types on operands (with output parameter `isCOO`).
		static bool areAdmissibleTypes(SparseTensorType aTp, SparseTensorType bTp,
		SparseTensorType cTp, bool enableRT,
		bool &isCOO) {
		if (bTp.hasEncoding() || cTp.hasEncoding())
		return false;
		if (isAdmissibleCOO(aTp)) {
		isCOO = true;
		return enableRT; // TODO: CreateCooAoSOp was deprecated, find another way
		}
		return isAdmissibleCSR(aTp);
		}

/// Generates the first positions/coordinates of a sparse matrix.		/// Generates the first positions/coordinates of a sparse matrix.
static Value genFirstPosOrCrds(OpBuilder &builder, Location loc, Value a,		static Value genFirstPosOrCrds(OpBuilder &builder, Location loc, Value a,
bool isCOO, bool enableRT) {		bool isCOO, bool enableRT) {
if (isCOO) {		if (isCOO) {
// Library uses SoA COO, direct IR uses AoS COO.		// Library uses SoA COO, direct IR uses AoS COO.
if (enableRT)		if (enableRT)
return genToCoordinates(builder, loc, a, 0, /*cooStart=*/0);		return genToCoordinates(builder, loc, a, 0, /*cooStart=*/0);
return genToCoordinatesBuffer(builder, loc, a);		return genToCoordinatesBuffer(builder, loc, a);
}		}
// CSR uses positions.		// CSR uses positions.
return genToPositions(builder, loc, a, 1);		return genToPositions(builder, loc, a, 1);
}		}

/// Generates the second coordinates of a sparse matrix.		/// Generates the second coordinates of a sparse matrix.
static Value genSecondCrds(OpBuilder &builder, Location loc, Value a,		static Value genSecondCrds(OpBuilder &builder, Location loc, Value a,
bool isCOO, bool enableRT) {		bool isCOO, bool enableRT) {
if (isCOO && !enableRT)		if (isCOO && !enableRT)
return Value(); // nothing needed		return Value(); // nothing needed
return genToCoordinates(builder, loc, a, 1, /*cooStart=*/0);		return genToCoordinates(builder, loc, a, 1, /*cooStart=*/0);
}		}

/// Generates the sparse matrix multiplication.		/// Generates the sparse matrix multiplication.
static Operation *genSpMat(OpBuilder &builder, Location loc, Type handleTp,		static Operation *genSpMat(OpBuilder &builder, Location loc, Type handleTp,
Type tokenTp, Value token, Value szY, Value szX,		Type tokenTp, Value token, Value szY, Value szX,
Value nnzA, Value rowA, Value colA, Value valA,		Value nseA, Value rowA, Value colA, Value valA,
bool isCOO, bool enableRT) {		bool isCOO, bool enableRT) {
if (isCOO) {		if (isCOO) {
// Library uses SoA COO, direct IR uses AoS COO.		// Library uses SoA COO, direct IR uses AoS COO.
if (enableRT)		if (enableRT)
return builder.create<gpu::CreateCooOp>(loc, handleTp, tokenTp, token,		return builder.create<gpu::CreateCooOp>(loc, handleTp, tokenTp, token,
szY, szX, nnzA, rowA, colA, valA);		szY, szX, nseA, rowA, colA, valA);
llvm_unreachable("gpu::CreateCooAoSOp is deprecated");		llvm_unreachable("gpu::CreateCooAoSOp is deprecated");
}		}
return builder.create<gpu::CreateCsrOp>(loc, handleTp, tokenTp, token, szY,		return builder.create<gpu::CreateCsrOp>(loc, handleTp, tokenTp, token, szY,
szX, nnzA, rowA, colA, valA);		szX, nseA, rowA, colA, valA);
}		}

/// Match and rewrite SpMV kernel.		/// Match and rewrite SpMV kernel.
static LogicalResult rewriteSpMV(PatternRewriter &rewriter,		static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
linalg::GenericOp op, bool enableRT) {		linalg::GenericOp op, bool enableRT) {
Location loc = op.getLoc();		Location loc = op.getLoc();
Value a = op.getOperand(0);		Value a = op.getOperand(0);
Value x = op.getOperand(1);		Value x = op.getOperand(1);
Value y = op.getOperand(2); // we have y = Ax		Value y = op.getOperand(2); // we have y = Ax
SmallVector<Value> tokens;		SmallVector<Value> tokens;

// Only admissible sparse matrix format and dense vectors for now.		// Only admissible sparse matrix format and dense vectors.
bool isCOO = false;		bool isCOO = false;
SparseTensorType aTp = getSparseTensorType(a);		SparseTensorType aTp = getSparseTensorType(a);
SparseTensorType xTp = getSparseTensorType(x);		SparseTensorType xTp = getSparseTensorType(x);
SparseTensorType yTp = getSparseTensorType(y);		SparseTensorType yTp = getSparseTensorType(y);
if (xTp.hasEncoding() || yTp.hasEncoding())		if (!areAdmissibleTypes(aTp, xTp, yTp, enableRT, isCOO))
return failure();
if (isAdmissibleCOO(aTp)) {
isCOO = true;
// TODO: CreateCooAoSOp was deprecated, find another way
if (!enableRT)
return failure();		return failure();
} else if (isAdmissibleCSR(aTp)) {
isCOO = false;
} else {
return failure();
}

// Start sparse kernel and copy data from host to device.		// Start sparse kernel and copy data from host to device.
// a : memR/memC/memV -> rowA,colA,valA		// a : memR/memC/memV -> rowA,colA,valA
// x : memX -> vecX		// x : memX -> vecX
// y : memY -> vecY		// y : memY -> vecY
Value nnzA = rewriter.create<NumberOfEntriesOp>(loc, a);		Value nseA = rewriter.create<NumberOfEntriesOp>(loc, a);
Value szY = linalg::createOrFoldDimOp(rewriter, loc, a, 0);		Value szY = linalg::createOrFoldDimOp(rewriter, loc, a, 0);
Value szX = linalg::createOrFoldDimOp(rewriter, loc, a, 1);		Value szX = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
Value memR = genFirstPosOrCrds(rewriter, loc, a, isCOO, enableRT);		Value memR = genFirstPosOrCrds(rewriter, loc, a, isCOO, enableRT);
Value memC = genSecondCrds(rewriter, loc, a, isCOO, enableRT);		Value memC = genSecondCrds(rewriter, loc, a, isCOO, enableRT);
Value memV = genToValues(rewriter, loc, a);		Value memV = genToValues(rewriter, loc, a);
Value rowA = genAllocCopy(rewriter, loc, memR, tokens);		Value rowA = genAllocCopy(rewriter, loc, memR, tokens);
Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();		Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
Value valA = genAllocCopy(rewriter, loc, memV, tokens);		Value valA = genAllocCopy(rewriter, loc, memV, tokens);
Show All 9 Lines	static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
Type handleTp = rewriter.getType<gpu::SparseHandleType>();		Type handleTp = rewriter.getType<gpu::SparseHandleType>();
Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();		Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
Value token = genFirstWait(rewriter, loc);		Value token = genFirstWait(rewriter, loc);
auto env =		auto env =
rewriter.create<gpu::CreateSparseEnvOp>(loc, handleTp, tokenTp, token);		rewriter.create<gpu::CreateSparseEnvOp>(loc, handleTp, tokenTp, token);
Value handle = env.getResult(0);		Value handle = env.getResult(0);
token = env.getAsyncToken();		token = env.getAsyncToken();
Operation *spGenA = genSpMat(rewriter, loc, handleTp, tokenTp, token, szY,		Operation *spGenA = genSpMat(rewriter, loc, handleTp, tokenTp, token, szY,
szX, nnzA, rowA, colA, valA, isCOO, enableRT);		szX, nseA, rowA, colA, valA, isCOO, enableRT);
Value spMatA = spGenA->getResult(0);		Value spMatA = spGenA->getResult(0);
token = spGenA->getResult(1);		token = spGenA->getResult(1);
auto dvecX = rewriter.create<gpu::CreateDnVecOp>(loc, handleTp, tokenTp,		auto dvecX = rewriter.create<gpu::CreateDnVecOp>(loc, handleTp, tokenTp,
token, vecX, szX);		token, vecX, szX);
Value dnX = dvecX.getResult(0);		Value dnX = dvecX.getResult(0);
token = dvecX.getAsyncToken();		token = dvecX.getAsyncToken();
auto dvecY = rewriter.create<gpu::CreateDnVecOp>(loc, handleTp, tokenTp,		auto dvecY = rewriter.create<gpu::CreateDnVecOp>(loc, handleTp, tokenTp,
token, vecY, szY);		token, vecY, szY);
▲ Show 20 Lines • Show All 42 Lines • ▼ Show 20 Lines	static LogicalResult rewriteSpMV(PatternRewriter &rewriter,
// Done.		// Done.
rewriter.replaceOp(op, op.getDpsInitOperand(0)->get());		rewriter.replaceOp(op, op.getDpsInitOperand(0)->get());
return success();		return success();
}		}

/// Match and rewrite SpMM kernel.		/// Match and rewrite SpMM kernel.
static LogicalResult rewriteSpMM(PatternRewriter &rewriter,		static LogicalResult rewriteSpMM(PatternRewriter &rewriter,
linalg::GenericOp op, bool enableRT) {		linalg::GenericOp op, bool enableRT) {
return failure(); // TODO: implement		Location loc = op.getLoc();
		Value a = op.getOperand(0);
		Value b = op.getOperand(1);
		Value c = op.getOperand(2); // we have C = AB
		SmallVector<Value> tokens;

		// Only admissible sparse matrix format and dense matrices.
		bool isCOO = false;
		SparseTensorType aTp = getSparseTensorType(a);
		SparseTensorType bTp = getSparseTensorType(b);
		SparseTensorType cTp = getSparseTensorType(c);
		if (!areAdmissibleTypes(aTp, bTp, cTp, enableRT, isCOO))
		return failure();

		// Start sparse kernel and copy data from host to device.
		// a : memR/memC/memV -> rowA,colA,valA
		// b : bufB -> matA
		// c : bufC -> matC
		Value nseA = rewriter.create<NumberOfEntriesOp>(loc, a);
		wrengrUnsubmitted Done Reply Inline Actions Nit: why not "nseA"? wrengr: Nit: why not "nseA"?
		aartbikAuthorUnsubmitted Done Reply Inline Actions Yeah, as you know I prefer the nse too over nnz, but cuSparse follows the nnz convention. But since that is not visible here, I agree using nse is better aartbik: Yeah, as you know I prefer the nse too over nnz, but cuSparse follows the nnz convention. But…
		Value szm = linalg::createOrFoldDimOp(rewriter, loc, a, 0);
		Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
		Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);
		wrengrUnsubmitted Done Reply Inline Actions Nit: personally I think it'd be cleaner to have these in the order `szm;szk;szn` so that the dimops on `a` are together wrengr: Nit: personally I think it'd be cleaner to have these in the order `szm;szk;szn` so that the…
		aartbikAuthorUnsubmitted Done Reply Inline Actions made it so aartbik: made it so
		Value memR = genFirstPosOrCrds(rewriter, loc, a, isCOO, enableRT);
		Value memC = genSecondCrds(rewriter, loc, a, isCOO, enableRT);
		Value memV = genToValues(rewriter, loc, a);
		Value rowA = genAllocCopy(rewriter, loc, memR, tokens);
		Value colA = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
		Value valA = genAllocCopy(rewriter, loc, memV, tokens);
		Value bufB = genTensorToMemref(rewriter, loc, b);
		Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
		Value bufC = genTensorToMemref(rewriter, loc, c);
		Value matC = genAllocCopy(rewriter, loc, bufC, tokens);
		genBlockingWait(rewriter, loc, tokens);
		tokens.clear();

		// Create sparse environment and sparse matrix/dense matrix handles.
		Type indexTp = rewriter.getIndexType();
		Type handleTp = rewriter.getType<gpu::SparseHandleType>();
		Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
		Value token = genFirstWait(rewriter, loc);
		auto env =
		rewriter.create<gpu::CreateSparseEnvOp>(loc, handleTp, tokenTp, token);
		Value handle = env.getResult(0);
		token = env.getAsyncToken();
		Operation *spGenA = genSpMat(rewriter, loc, handleTp, tokenTp, token, szm,
		szk, nseA, rowA, colA, valA, isCOO, enableRT);
		Value spMatA = spGenA->getResult(0);
		token = spGenA->getResult(1);
		wrengrUnsubmitted Done Reply Inline Actions This is more of a long-term nit, rather than something that needs to be fixed in the current CL, but: I mislike the asymmetry of not using `getAsyncToken` here. Of course, the only way to fix that would be to adjust `genSpMat` to return `gpu::AsyncOpInterface` in lieu of `Operation*`. Which is doable since both `gpu::Create{Coo,Csr}Op` support that interface; however, I'm not sure if that change would cause other problems for the various `genSpMat` callsites. wrengr: This is more of a long-term nit, rather than something that needs to be fixed in the current CL…
		aartbikAuthorUnsubmitted Done Reply Inline Actions Yeah, agreed there is some redesign lingering there aartbik: Yeah, agreed there is some redesign lingering there
		auto dmatB = rewriter.create<gpu::CreateDnMatOp>(loc, handleTp, tokenTp,
		token, szk, szn, matB);
		Value dnB = dmatB.getResult(0);
		token = dmatB.getAsyncToken();
		auto dmatC = rewriter.create<gpu::CreateDnMatOp>(loc, handleTp, tokenTp,
		token, szm, szn, matC);
		Value dnC = dmatC.getResult(0);
		token = dmatC.getAsyncToken();

		// Precompute buffersize for SpMM.
		auto bufferComp = rewriter.create<gpu::SpMMBufferSizeOp>(
		loc, indexTp, tokenTp, token, handle, spMatA, dnB, dnC);
		Value bufferSz = bufferComp.getResult(0);
		token = bufferComp.getAsyncToken();
		auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);
		Value buffer = buf.getResult(0);
		token = buf.getAsyncToken();

		// Perform the SpMM.
		auto spmmComp = rewriter.create<gpu::SpMMOp>(loc, tokenTp, token, handle,
		spMatA, dnB, dnC, buffer);
		token = spmmComp.getAsyncToken();

		// Copy data back to host and free all the resources.
		token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatA)
		.getAsyncToken();
		token = rewriter.create<gpu::DestroyDnMatOp>(loc, tokenTp, token, dnB)
		.getAsyncToken();
		token = rewriter.create<gpu::DestroyDnMatOp>(loc, tokenTp, token, dnC)
		.getAsyncToken();
		token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
		.getAsyncToken();
		tokens.push_back(token);
		genBlockingWait(rewriter, loc, tokens);
		tokens.clear();
		token = genFirstWait(rewriter, loc);
		token = genCopyMemRef(rewriter, loc, bufC, matC, token);
		token = genDeallocMemRef(rewriter, loc, rowA, token);
		if (colA)
		token = genDeallocMemRef(rewriter, loc, colA, token);
		token = genDeallocMemRef(rewriter, loc, valA, token);
		token = genDeallocMemRef(rewriter, loc, buffer, token);
		token = genDeallocMemRef(rewriter, loc, matB, token);
		token = genDeallocMemRef(rewriter, loc, matC, token);
		tokens.push_back(token);
		genBlockingWait(rewriter, loc, tokens);
		tokens.clear();

		// Done.
		rewriter.replaceOp(op, op.getDpsInitOperand(0)->get());
		return success();
}		}

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Rewriting rules for direct code generation.		// Rewriting rules for direct code generation.
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

/// Proof-of-concept rewriter. This rule generates a GPU implementation		/// Proof-of-concept rewriter. This rule generates a GPU implementation
/// for each outermost forall loop generated by the sparse compiler.		/// for each outermost forall loop generated by the sparse compiler.
▲ Show 20 Lines • Show All 93 Lines • ▼ Show 20 Lines	private:
unsigned numThreads;		unsigned numThreads;
};		};

//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
// Rewriting rules for library recognition and code generation.		// Rewriting rules for library recognition and code generation.
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

/// Proof-of-concept rewriter. This rule recognizes certain math kernels		/// Proof-of-concept rewriter. This rule recognizes certain math kernels
/// and replaces these with corresponding calls into the sparse library.		/// and replaces these with corresponding calls into a sparse library.
struct LinalgOpRewriter : public OpRewritePattern<linalg::GenericOp> {		struct LinalgOpRewriter : public OpRewritePattern<linalg::GenericOp> {
using OpRewritePattern<linalg::GenericOp>::OpRewritePattern;		using OpRewritePattern<linalg::GenericOp>::OpRewritePattern;

LinalgOpRewriter(MLIRContext *context, bool rt)		LinalgOpRewriter(MLIRContext *context, bool rt)
: OpRewritePattern(context), enableRT(rt) {}		: OpRewritePattern(context), enableRT(rt) {}

LogicalResult matchAndRewrite(linalg::GenericOp op,		LogicalResult matchAndRewrite(linalg::GenericOp op,
PatternRewriter &rewriter) const override {		PatternRewriter &rewriter) const override {
▲ Show 20 Lines • Show All 60 Lines • Show Last 20 Lines

mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir

This file was added.

				// RUN: mlir-opt %s --linalg-generalize-named-ops \
				// RUN: --sparsification="enable-gpu-libgen" | FileCheck %s

				#CSR = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>

				aartbikAuthorUnsubmitted Done Reply Inline Actions note to self: rebase with the renaming in main when ready aartbik: note to self: rebase with the renaming in main when ready
				//
				// Compute matrix matrix C = AB
				//
				// CHECK-LABEL: func.func @matmul(
				// CHECK-SAME: %[[VAL_0:.*]]: tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>,
				// CHECK-SAME: %[[VAL_1:.*]]: tensor<?x?xf64>,
				// CHECK-SAME: %[[VAL_2:.*]]: tensor<?x?xf64>) -> tensor<?x?xf64> {
				// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 0 : index
				// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : index
				// CHECK-DAG: %[[VAL_5:.*]] = sparse_tensor.number_of_entries %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
				// CHECK-DAG: %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
				// CHECK-DAG: %[[VAL_7:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor<?x?xf64>
				// CHECK-DAG: %[[VAL_8:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
				// CHECK-DAG: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
				// CHECK-DAG: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex, strided<[?], offset: ?>>
				// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xf64>
				// CHECK: %[[VAL_12:.*]] = gpu.wait async
				// CHECK: %[[VAL_13:.*]] = memref.dim %[[VAL_9]], %[[VAL_3]] : memref<?xindex>
				// CHECK: %[[VAL_14:.*]], %[[VAL_15:.*]] = gpu.alloc async {{\[}}%[[VAL_12]]] (%[[VAL_13]]) : memref<?xindex>
				// CHECK: %[[VAL_16:.*]] = gpu.memcpy async {{\[}}%[[VAL_15]]] %[[VAL_14]], %[[VAL_9]] : memref<?xindex>, memref<?xindex>
				// CHECK: %[[VAL_17:.*]] = gpu.wait async
				// CHECK: %[[VAL_18:.*]] = memref.dim %[[VAL_10]], %[[VAL_3]] : memref<?xindex, strided<[?], offset: ?>>
				// CHECK: %[[VAL_19:.*]], %[[VAL_20:.*]] = gpu.alloc async {{\[}}%[[VAL_17]]] (%[[VAL_18]]) : memref<?xindex>
				// CHECK: %[[VAL_21:.*]] = gpu.memcpy async {{\[}}%[[VAL_20]]] %[[VAL_19]], %[[VAL_10]] : memref<?xindex>, memref<?xindex, strided<[?], offset: ?>>
				// CHECK: %[[VAL_22:.*]] = gpu.wait async
				// CHECK: %[[VAL_23:.*]] = memref.dim %[[VAL_11]], %[[VAL_3]] : memref<?xf64>
				// CHECK: %[[VAL_24:.*]], %[[VAL_25:.*]] = gpu.alloc async {{\[}}%[[VAL_22]]] (%[[VAL_23]]) : memref<?xf64>
				// CHECK: %[[VAL_26:.*]] = gpu.memcpy async {{\[}}%[[VAL_25]]] %[[VAL_24]], %[[VAL_11]] : memref<?xf64>, memref<?xf64>
				// CHECK: %[[VAL_27:.*]] = bufferization.to_memref %[[VAL_1]] : memref<?x?xf64>
				// CHECK: %[[VAL_28:.*]] = gpu.wait async
				// CHECK: %[[VAL_29:.*]] = memref.dim %[[VAL_27]], %[[VAL_3]] : memref<?x?xf64>
				// CHECK: %[[VAL_30:.*]] = memref.dim %[[VAL_27]], %[[VAL_4]] : memref<?x?xf64>
				// CHECK: %[[VAL_31:.*]], %[[VAL_32:.*]] = gpu.alloc async {{\[}}%[[VAL_28]]] (%[[VAL_29]], %[[VAL_30]]) : memref<?x?xf64>
				// CHECK: %[[VAL_33:.*]] = gpu.memcpy async {{\[}}%[[VAL_32]]] %[[VAL_31]], %[[VAL_27]] : memref<?x?xf64>, memref<?x?xf64>
				// CHECK: %[[VAL_34:.*]] = bufferization.to_memref %[[VAL_2]] : memref<?x?xf64>
				// CHECK: %[[VAL_35:.*]] = gpu.wait async
				// CHECK: %[[VAL_36:.*]] = memref.dim %[[VAL_34]], %[[VAL_3]] : memref<?x?xf64>
				// CHECK: %[[VAL_37:.*]] = memref.dim %[[VAL_34]], %[[VAL_4]] : memref<?x?xf64>
				// CHECK: %[[VAL_38:.*]], %[[VAL_39:.*]] = gpu.alloc async {{\[}}%[[VAL_35]]] (%[[VAL_36]], %[[VAL_37]]) : memref<?x?xf64>
				// CHECK: %[[VAL_40:.*]] = gpu.memcpy async {{\[}}%[[VAL_39]]] %[[VAL_38]], %[[VAL_34]] : memref<?x?xf64>, memref<?x?xf64>
				// CHECK: gpu.wait {{\[}}%[[VAL_16]], %[[VAL_21]], %[[VAL_26]], %[[VAL_33]], %[[VAL_40]]]
				// CHECK: %[[VAL_41:.*]] = gpu.wait async
				// CHECK: %[[VAL_42:.*]], %[[VAL_43:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_41]]]
				// CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_8]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
				// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_45]]] %[[VAL_8]], %[[VAL_7]], %[[VAL_31]] : memref<?x?xf64>
				// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_47]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_38]] : memref<?x?xf64>
				// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]]
				// CHECK: %[[VAL_52:.*]], %[[VAL_53:.*]] = gpu.alloc async {{\[}}%[[VAL_51]]] (%[[VAL_50]]) : memref<?xi8>
				// CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref<?xi8>
				// CHECK: %[[VAL_55:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_54]]] %[[VAL_44]]
				// CHECK: %[[VAL_56:.*]] = gpu.destroy_dn_mat async {{\[}}%[[VAL_55]]] %[[VAL_46]]
				// CHECK: %[[VAL_57:.*]] = gpu.destroy_dn_mat async {{\[}}%[[VAL_56]]] %[[VAL_48]]
				// CHECK: %[[VAL_58:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_57]]] %[[VAL_42]]
				// CHECK: gpu.wait {{\[}}%[[VAL_58]]]
				// CHECK: %[[VAL_59:.*]] = gpu.wait async
				// CHECK: %[[VAL_60:.*]] = gpu.memcpy async {{\[}}%[[VAL_59]]] %[[VAL_34]], %[[VAL_38]] : memref<?x?xf64>, memref<?x?xf64>
				// CHECK: %[[VAL_61:.*]] = gpu.dealloc async {{\[}}%[[VAL_60]]] %[[VAL_14]] : memref<?xindex>
				// CHECK: %[[VAL_62:.*]] = gpu.dealloc async {{\[}}%[[VAL_61]]] %[[VAL_19]] : memref<?xindex>
				// CHECK: %[[VAL_63:.*]] = gpu.dealloc async {{\[}}%[[VAL_62]]] %[[VAL_24]] : memref<?xf64>
				// CHECK: %[[VAL_64:.*]] = gpu.dealloc async {{\[}}%[[VAL_63]]] %[[VAL_52]] : memref<?xi8>
				// CHECK: %[[VAL_65:.*]] = gpu.dealloc async {{\[}}%[[VAL_64]]] %[[VAL_31]] : memref<?x?xf64>
				// CHECK: %[[VAL_66:.*]] = gpu.dealloc async {{\[}}%[[VAL_65]]] %[[VAL_38]] : memref<?x?xf64>
				// CHECK: gpu.wait {{\[}}%[[VAL_66]]]
				// CHECK: return %[[VAL_2]] : tensor<?x?xf64>
				// CHECK: }
				func.func @matmul(%A: tensor<?x?xf64, #CSR>, %B: tensor<?x?xf64>, %C_in: tensor<?x?xf64>) -> tensor<?x?xf64> {
				%C_out = linalg.matmul
				ins(%A, %B: tensor<?x?xf64, #CSR>, tensor<?x?xf64>)
				outs(%C_in: tensor<?x?xf64>) -> tensor<?x?xf64>
				return %C_out : tensor<?x?xf64>
				}

This is an archive of the discontinued LLVM Phabricator instance.

[mlir][sparse][gpu] recognize SpMM cuSparse during sparsification — Closed, Public

Details

Diff Detail

Event Timeline

Revision Contents

Diff 523974

mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp

mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir

[mlir][sparse][gpu] recognize SpMM cuSparse during sparsification
ClosedPublic