diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -605,6 +605,138 @@
   return success();
 }
+
+/// Match and rewrite SDDMM kernel.
+///
+/// Replaces the given linalg.generic `op` with a sequence of GPU dialect
+/// operations that copies the operands to the device, builds the sparse
+/// environment together with the dense (A, B) and sparse (C) matrix handles,
+/// runs the SDDMM computation, and finally copies the buffers of C back to
+/// the host and releases all device resources.
+///
+/// `enableRT` is forwarded unchanged to the admissibility check and to the
+/// helpers that extract and wrap the sparse storage of C.
+///
+/// Returns failure(), without modifying the IR, when the operand types are
+/// rejected by areAdmissibleTypes(); otherwise replaces `op` by its DPS init
+/// operand and returns success().
+static LogicalResult rewriteSDDMM(PatternRewriter &rewriter,
+                                  linalg::GenericOp op, bool enableRT) {
+  Location loc = op.getLoc();
+  Value a = op.getOperand(0);
+  Value b = op.getOperand(1);
+  Value c = op.getOperand(2); // we have C = AB
+  SmallVector<Value> tokens;
+
+  // Only admissible sparse matrix format and dense matrices.
+  // NOTE(review): C is the operand treated as sparse below while A and B are
+  // handled as dense; confirm that areAdmissibleTypes() expects its type
+  // arguments in this (aTp, bTp, cTp) order for that combination.
+  bool isCOO = false;
+  SparseTensorType aTp = getSparseTensorType(a);
+  SparseTensorType bTp = getSparseTensorType(b);
+  SparseTensorType cTp = getSparseTensorType(c);
+  if (!areAdmissibleTypes(aTp, bTp, cTp, enableRT, isCOO))
+    return failure();
+
+  // Start sparse kernel and copy data from host to device.
+  //   a : bufA -> matA
+  //   b : bufB -> matB
+  //   c : memR/memC/memV -> rowC,colC,valC
+  // The number of stored entries is a property of the sparse operand C (it
+  // is passed to genSpMat() for C below), so it must be queried on `c`.
+  Value nseC = rewriter.create<NumberOfEntriesOp>(loc, c);
+  Value szm = linalg::createOrFoldDimOp(rewriter, loc, a, 0);
+  Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
+  Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);
+  Value bufA = genTensorToMemref(rewriter, loc, a);
+  Value matA = genAllocCopy(rewriter, loc, bufA, tokens);
+  Value bufB = genTensorToMemref(rewriter, loc, b);
+  Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
+  Value memR = genFirstPosOrCrds(rewriter, loc, c, isCOO, enableRT);
+  Value memC = genSecondCrds(rewriter, loc, c, isCOO, enableRT);
+  Value memV = genToValues(rewriter, loc, c);
+  Value rowC = genAllocCopy(rewriter, loc, memR, tokens);
+  Value colC = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
+  Value valC = genAllocCopy(rewriter, loc, memV, tokens);
+  genBlockingWait(rewriter, loc, tokens);
+  tokens.clear();
+
+  // Create sparse environment and sparse matrix/dense matrix handles.
+  Type indexTp = rewriter.getIndexType();
+  Type envHandleTp = rewriter.getType<gpu::SparseEnvHandleType>();
+  Type dnMatHandleTp = rewriter.getType<gpu::SparseDnMatHandleType>();
+  Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
+  Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
+  Value token = genFirstWait(rewriter, loc);
+  auto env =
+      rewriter.create<gpu::CreateSparseEnvOp>(loc, envHandleTp, tokenTp, token);
+  Value handle = env.getResult(0);
+  token = env.getAsyncToken();
+
+  auto dmatA = rewriter.create<gpu::CreateDnMatOp>(loc, dnMatHandleTp, tokenTp,
+                                                   token, szm, szk, matA);
+  Value dnA = dmatA.getResult(0);
+  token = dmatA.getAsyncToken();
+  auto dmatB = rewriter.create<gpu::CreateDnMatOp>(loc, dnMatHandleTp, tokenTp,
+                                                   token, szk, szn, matB);
+  Value dnB = dmatB.getResult(0);
+  token = dmatB.getAsyncToken();
+
+  Operation *spGenC =
+      genSpMat(rewriter, loc, spMatHandleTp, tokenTp, token, szm, szn, nseC,
+               rowC, colC, valC, isCOO, enableRT);
+  Value spMatC = spGenC->getResult(0);
+  token = spGenC->getResult(1);
+
+  // Precompute buffersize for SDDMM.
+  auto bufferComp = rewriter.create<gpu::SDDMMBufferSizeOp>(
+      loc, indexTp, tokenTp, token, handle, dnA, dnB, spMatC);
+  Value bufferSz = bufferComp.getResult(0);
+  token = bufferComp.getAsyncToken();
+  auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);
+  Value buffer = buf.getResult(0);
+  token = buf.getAsyncToken();
+
+  // Perform the SDDMM.
+  auto sddmmComp = rewriter.create<gpu::SDDMMOp>(loc, tokenTp, token, handle,
+                                                 dnA, dnB, spMatC, buffer);
+  token = sddmmComp.getAsyncToken();
+
+  // Copy data back to host and free all the resources.
+  token = rewriter.create<gpu::DestroyDnMatOp>(loc, tokenTp, token, dnA)
+              .getAsyncToken();
+  token = rewriter.create<gpu::DestroyDnMatOp>(loc, tokenTp, token, dnB)
+              .getAsyncToken();
+  token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatC)
+              .getAsyncToken();
+  token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
+              .getAsyncToken();
+  tokens.push_back(token);
+  genBlockingWait(rewriter, loc, tokens);
+  tokens.clear();
+  token = genFirstWait(rewriter, loc);
+  token = genCopyMemRef(rewriter, loc, memR, rowC, token);
+  // The second coordinate buffer is absent when memC is null (see colC).
+  if (colC)
+    token = genCopyMemRef(rewriter, loc, memC, colC, token);
+  token = genCopyMemRef(rewriter, loc, memV, valC, token);
+  token = genDeallocMemRef(rewriter, loc, buffer, token);
+  token = genDeallocMemRef(rewriter, loc, matA, token);
+  token = genDeallocMemRef(rewriter, loc, matB, token);
+  token = genDeallocMemRef(rewriter, loc, rowC, token);
+  if (colC)
+    token = genDeallocMemRef(rewriter, loc, colC, token);
+  token = genDeallocMemRef(rewriter, loc, valC, token);
+  tokens.push_back(token);
+  genBlockingWait(rewriter, loc, tokens);
+  tokens.clear();
+
+  // Done.
+  rewriter.replaceOp(op, op.getDpsInitOperand(0)->get());
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // Rewriting rules for direct code generation.
 //===----------------------------------------------------------------------===//