diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -625,6 +625,122 @@
   return success();
 }
 
+/// Match and rewrite SDDMM kernel.
+static LogicalResult rewriteSDDMM(PatternRewriter &rewriter,
+                                  linalg::GenericOp op, bool enableRT) {
+  Location loc = op.getLoc();
+  Value a = op.getOperand(1);
+  Value b = op.getOperand(2);
+  Value c = op.getOperand(0); // we have C = spy(C) o (A * B)
+  SmallVector<Value> tokens;
+
+  // Only admissible sparse matrix format and dense matrices.
+  bool isCOO = false;
+  SparseTensorType aTp = getSparseTensorType(a);
+  SparseTensorType bTp = getSparseTensorType(b);
+  SparseTensorType cTp = getSparseTensorType(c);
+  if (!areAdmissibleTypes(cTp, bTp, aTp, enableRT, isCOO))
+    return failure();
+
+  // SDDMM operates on C in place. If the sparse matrix C is reused, e.g.,
+  // later on when we remove redundant memcpys, we may need to duplicate it
+  // before the operation so that other users can read the new copy instead.
+  // Start sparse kernel and copy data from host to device.
+  //   a : bufA -> matA
+  //   b : bufB -> matB
+  //   c : memR/memC/memV -> rowC,colC,valC
+  Value nseC = rewriter.create<NumberOfEntriesOp>(loc, c);
+  Value szm = linalg::createOrFoldDimOp(rewriter, loc, a, 0);
+  Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
+  Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);
+  Value bufA = genTensorToMemref(rewriter, loc, a);
+  Value matA = genAllocCopy(rewriter, loc, bufA, tokens);
+  Value bufB = genTensorToMemref(rewriter, loc, b);
+  Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
+  Value memR = genFirstPosOrCrds(rewriter, loc, c, isCOO, enableRT);
+  Value memC = genSecondCrds(rewriter, loc, c, isCOO, enableRT);
+  Value memV = genToValues(rewriter, loc, c);
+  Value rowC = genAllocCopy(rewriter, loc, memR, tokens);
+  Value colC = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
+  Value valC = genAllocCopy(rewriter, loc, memV, tokens);
+  genBlockingWait(rewriter, loc, tokens);
+  tokens.clear();
+
+  // Create sparse environment and sparse matrix/dense matrix handles.
+  Type indexTp = rewriter.getIndexType();
+  Type envHandleTp = rewriter.getType<gpu::SparseEnvHandleType>();
+  Type dnMatHandleTp = rewriter.getType<gpu::SparseDnMatHandleType>();
+  Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
+  Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
+  Value token = genFirstWait(rewriter, loc);
+  auto env =
+      rewriter.create<gpu::CreateSparseEnvOp>(loc, envHandleTp, tokenTp, token);
+  Value handle = env.getResult(0);
+  token = env.getAsyncToken();
+
+  auto dmatA = rewriter.create<gpu::CreateDnMatOp>(
+      loc, dnMatHandleTp, tokenTp, handle, token, szm, szk, matA);
+  Value dnA = dmatA.getResult(0);
+  token = dmatA.getAsyncToken();
+  auto dmatB = rewriter.create<gpu::CreateDnMatOp>(
+      loc, dnMatHandleTp, tokenTp, handle, token, szk, szn, matB);
+  Value dnB = dmatB.getResult(0);
+  token = dmatB.getAsyncToken();
+
+  Operation *spGenC =
+      genSpMat(rewriter, loc, spMatHandleTp, tokenTp, token, szm, szn, nseC,
+               rowC, colC, valC, isCOO, enableRT);
+  Value spMatC = spGenC->getResult(0);
+  token = spGenC->getResult(1);
+
+  auto dnCType = llvm::cast<ShapedType>(c.getType()).getElementType();
+  // Precompute buffer size for SDDMM.
+  auto bufferComp = rewriter.create<gpu::SDDMMBufferSizeOp>(
+      loc, indexTp, tokenTp, token, handle, dnA, dnB, spMatC, dnCType);
+  Value bufferSz = bufferComp.getResult(0);
+  token = bufferComp.getAsyncToken();
+  auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);
+  Value buffer = buf.getResult(0);
+  token = buf.getAsyncToken();
+
+  // Perform the SDDMM.
+  auto sddmmComp = rewriter.create<gpu::SDDMMOp>(
+      loc, tokenTp, token, handle, dnA, dnB, spMatC, dnCType, buffer);
+  token = sddmmComp.getAsyncToken();
+
+  // Copy data back to host and free all the resources.
+  token = rewriter.create<gpu::DestroyDnMatOp>(loc, tokenTp, token, dnA)
+              .getAsyncToken();
+  token = rewriter.create<gpu::DestroyDnMatOp>(loc, tokenTp, token, dnB)
+              .getAsyncToken();
+  token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatC)
+              .getAsyncToken();
+  token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
+              .getAsyncToken();
+  tokens.push_back(token);
+  genBlockingWait(rewriter, loc, tokens);
+  tokens.clear();
+  token = genFirstWait(rewriter, loc);
+  token = genCopyMemRef(rewriter, loc, memR, rowC, token);
+  if (memC)
+    token = genCopyMemRef(rewriter, loc, memC, colC, token);
+  token = genCopyMemRef(rewriter, loc, memV, valC, token);
+  token = genDeallocMemRef(rewriter, loc, buffer, token);
+  token = genDeallocMemRef(rewriter, loc, matA, token);
+  token = genDeallocMemRef(rewriter, loc, matB, token);
+  token = genDeallocMemRef(rewriter, loc, rowC, token);
+  if (colC)
+    token = genDeallocMemRef(rewriter, loc, colC, token);
+  token = genDeallocMemRef(rewriter, loc, valC, token);
+  tokens.push_back(token);
+  genBlockingWait(rewriter, loc, tokens);
+  tokens.clear();
+
+  // Done.
+  rewriter.replaceOp(op, op.getDpsInitOperand(0)->get());
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // Rewriting rules for direct code generation.
 //===----------------------------------------------------------------------===//
@@ -776,6 +891,17 @@
     return rewriteSpMM(rewriter, op, enableRT);
   }
 
+  // Recognize a SDDMM kernel.
+  // TODO: check numLoops == 3 && numTensors == 4 &&
+  //       maps == infer({{i, j}, {i, k}, {k, j}, {i, j}})
+  // TODO: add transposed {i, k}, {k, j}
+  // TODO: maybe add transposed {i, j} in future
+  if (linalg::isParallelIterator(iteratorTypes[0]) &&
+      linalg::isParallelIterator(iteratorTypes[1]) &&
+      linalg::isReductionIterator(iteratorTypes[2])) {
+    return rewriteSDDMM(rewriter, op, enableRT);
+  }
+
   return failure();
 }
 
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
new file
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
@@ -0,0 +1,202 @@
+// RUN: mlir-opt %s --linalg-generalize-named-ops \
+// RUN: --sparsification="enable-gpu-libgen" | FileCheck %s
+
+#trait_sampled_dense_dense = {
+  indexing_maps = [
+    affine_map<(i,j,k) -> (i,j)>,  // S
+    affine_map<(i,j,k) -> (i,k)>,  // A
+    affine_map<(i,j,k) -> (k,j)>,  // B
+    affine_map<(i,j,k) -> (i,j)>   // X (out)
+  ],
+  iterator_types = ["parallel", "parallel", "reduction"],
+  doc = "X(i,j) += S(i,j) SUM_k A(i,k) B(k,j)"
+}
+
+#trait_vec_op = {
+  indexing_maps = [
+    affine_map<(i,j) -> (i,j)>,  // a (in)
+    affine_map<(i,j) -> (i,j)>,  // b (in)
+    affine_map<(i,j) -> (i,j)>   // x (out)
+  ],
+  iterator_types = ["parallel", "parallel"]
+}
+
+#CSR = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>
+
+#SortedCOO = #sparse_tensor.encoding<{
+  lvlTypes = [ "compressed-nu", "singleton" ]
+}>
+
+module {
+
+// CHECK-LABEL: func.func @sparse_sampled_dd(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<8x8xf64>,
+// CHECK-SAME: %[[VAL_2:.*]]: tensor<8x8xf64>) -> tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> {
+// CHECK: %[[VAL_3:.*]] = arith.constant 8 : index
+// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_6:.*]] = arith.constant false
+// CHECK: %[[VAL_7:.*]] = arith.constant true
+// CHECK: %[[VAL_8:.*]] = bufferization.alloc_tensor() : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xindex>
+// CHECK: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xindex>
+// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xf64>
+// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : memref<8x8xf64>
+// CHECK: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<8x8xf64>
+// CHECK: %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] iter_args(%[[VAL_16:.*]] = %[[VAL_8]]) -> (tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>) {
+// CHECK: %[[VAL_17:.*]], %[[VAL_18:.*]], %[[VAL_19:.*]], %[[VAL_20:.*]] = sparse_tensor.expand %[[VAL_8]] : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xf64>, memref<?xi1>, memref<?xindex>
+// CHECK: %[[VAL_21:.*]] = scf.for %[[VAL_22:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] iter_args(%[[VAL_23:.*]] = %[[VAL_20]]) -> (index) {
+// CHECK: %[[VAL_24:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_15]], %[[VAL_22]]] : memref<8x8xf64>
+// CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_15]]] : memref<?xindex>
+// CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_15]], %[[VAL_5]] : index
+// CHECK: %[[VAL_27:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_26]]] : memref<?xindex>
+// CHECK: %[[VAL_28:.*]] = scf.for %[[VAL_29:.*]] = %[[VAL_25]] to %[[VAL_27]] step %[[VAL_5]] iter_args(%[[VAL_30:.*]] = %[[VAL_23]]) -> (index) {
+// CHECK: %[[VAL_31:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_29]]] : memref<?xindex>
+// CHECK: %[[VAL_32:.*]] = memref.load %[[VAL_17]]{{\[}}%[[VAL_31]]] : memref<?xf64>
+// CHECK: %[[VAL_33:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_29]]] : memref<?xf64>
+// CHECK: %[[VAL_34:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_22]], %[[VAL_31]]] : memref<8x8xf64>
+// CHECK: %[[VAL_35:.*]] = arith.mulf %[[VAL_24]], %[[VAL_34]] : f64
+// CHECK: %[[VAL_36:.*]] = arith.mulf %[[VAL_33]], %[[VAL_35]] : f64
+// CHECK: %[[VAL_37:.*]] = arith.addf %[[VAL_32]], %[[VAL_36]] : f64
+// CHECK: %[[VAL_38:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_31]]] : memref<?xi1>
+// CHECK: %[[VAL_39:.*]] = arith.cmpi eq, %[[VAL_38]], %[[VAL_6]] : i1
+// CHECK: %[[VAL_40:.*]] = scf.if %[[VAL_39]] -> (index) {
+// CHECK: memref.store %[[VAL_7]], %[[VAL_18]]{{\[}}%[[VAL_31]]] : memref<?xi1>
+// CHECK: memref.store %[[VAL_31]], %[[VAL_19]]{{\[}}%[[VAL_30]]] : memref<?xindex>
+// CHECK: %[[VAL_41:.*]] = arith.addi %[[VAL_30]], %[[VAL_5]] : index
+// CHECK: scf.yield %[[VAL_41]] : index
+// CHECK: } else {
+// CHECK: scf.yield %[[VAL_30]] : index
+// CHECK: }
+// CHECK: memref.store %[[VAL_37]], %[[VAL_17]]{{\[}}%[[VAL_31]]] : memref<?xf64>
+// CHECK: scf.yield %[[VAL_42:.*]] : index
+// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: scf.yield %[[VAL_43:.*]] : index
+// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: %[[VAL_44:.*]] = sparse_tensor.compress %[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %[[VAL_45:.*]] into %[[VAL_16]]{{\[}}%[[VAL_15]]] : memref<?xf64>, memref<?xi1>, memref<?xindex>, tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: scf.yield %[[VAL_44]] : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: %[[VAL_46:.*]] = sparse_tensor.load %[[VAL_47:.*]] hasInserts : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: return %[[VAL_46]] : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: }
+//
+// A kernel that computes a direct sampled matrix-matrix multiplication
+// (with sparse result).
+// Compute SDDMM C = C\spy AB
+// (in the CHECK lines above, VAL_0 is the sparse matrix C)
+func.func @sparse_sampled_dd(%args: tensor<8x8xf64, #SortedCOO>, %arga: tensor<8x8xf64>, %argb: tensor<8x8xf64>) -> tensor<8x8xf64, #SortedCOO> {
+  %1 = bufferization.alloc_tensor() : tensor<8x8xf64, #SortedCOO>
+  %2 = linalg.generic #trait_sampled_dense_dense
+    ins(%args, %arga, %argb: tensor<8x8xf64, #SortedCOO>,
+                             tensor<8x8xf64>, tensor<8x8xf64>)
+    outs(%1: tensor<8x8xf64, #SortedCOO>) {
+      ^bb(%s: f64, %a: f64, %b: f64, %x: f64):
+        %p = arith.mulf %a, %b : f64
+        %q = arith.mulf %s, %p : f64
+        %r = arith.addf %x, %q : f64
+        linalg.yield %r : f64
+  } -> tensor<8x8xf64, #SortedCOO>
+  return %2 : tensor<8x8xf64, #SortedCOO>
+}
+
+// CHECK-LABEL: func.func @sparse_sampled_dd_with_reuse(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<8x8xf64>,
+// CHECK-SAME: %[[VAL_2:.*]]: tensor<8x8xf64>) -> tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> {
+// CHECK: %[[VAL_3:.*]] = arith.constant 8 : index
+// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_6:.*]] = arith.constant false
+// CHECK: %[[VAL_7:.*]] = arith.constant true
+// CHECK: %[[VAL_8:.*]] = bufferization.alloc_tensor() : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xindex>
+// CHECK: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xindex>
+// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xf64>
+// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : memref<8x8xf64>
+// CHECK: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<8x8xf64>
+// CHECK: %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] iter_args(%[[VAL_16:.*]] = %[[VAL_8]]) -> (tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>) {
+// CHECK: %[[VAL_17:.*]], %[[VAL_18:.*]], %[[VAL_19:.*]], %[[VAL_20:.*]] = sparse_tensor.expand %[[VAL_8]] : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xf64>, memref<?xi1>, memref<?xindex>
+// CHECK: %[[VAL_21:.*]] = scf.for %[[VAL_22:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] iter_args(%[[VAL_23:.*]] = %[[VAL_20]]) -> (index) {
+// CHECK: %[[VAL_24:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_15]], %[[VAL_22]]] : memref<8x8xf64>
+// CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_15]]] : memref<?xindex>
+// CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_15]], %[[VAL_5]] : index
+// CHECK: %[[VAL_27:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_26]]] : memref<?xindex>
+// CHECK: %[[VAL_28:.*]] = scf.for %[[VAL_29:.*]] = %[[VAL_25]] to %[[VAL_27]] step %[[VAL_5]] iter_args(%[[VAL_30:.*]] = %[[VAL_23]]) -> (index) {
+// CHECK: %[[VAL_31:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_29]]] : memref<?xindex>
+// CHECK: %[[VAL_32:.*]] = memref.load %[[VAL_17]]{{\[}}%[[VAL_31]]] : memref<?xf64>
+// CHECK: %[[VAL_33:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_29]]] : memref<?xf64>
+// CHECK: %[[VAL_34:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_22]], %[[VAL_31]]] : memref<8x8xf64>
+// CHECK: %[[VAL_35:.*]] = arith.mulf %[[VAL_24]], %[[VAL_34]] : f64
+// CHECK: %[[VAL_36:.*]] = arith.mulf %[[VAL_33]], %[[VAL_35]] : f64
+// CHECK: %[[VAL_37:.*]] = arith.addf %[[VAL_32]], %[[VAL_36]] : f64
+// CHECK: %[[VAL_38:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_31]]] : memref<?xi1>
+// CHECK: %[[VAL_39:.*]] = arith.cmpi eq, %[[VAL_38]], %[[VAL_6]] : i1
+// CHECK: %[[VAL_40:.*]] = scf.if %[[VAL_39]] -> (index) {
+// CHECK: memref.store %[[VAL_7]], %[[VAL_18]]{{\[}}%[[VAL_31]]] : memref<?xi1>
+// CHECK: memref.store %[[VAL_31]], %[[VAL_19]]{{\[}}%[[VAL_30]]] : memref<?xindex>
+// CHECK: %[[VAL_41:.*]] = arith.addi %[[VAL_30]], %[[VAL_5]] : index
+// CHECK: scf.yield %[[VAL_41]] : index
+// CHECK: } else {
+// CHECK: scf.yield %[[VAL_30]] : index
+// CHECK: }
+// CHECK: memref.store %[[VAL_37]], %[[VAL_17]]{{\[}}%[[VAL_31]]] : memref<?xf64>
+// CHECK: scf.yield %[[VAL_42:.*]] : index
+// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: scf.yield %[[VAL_43:.*]] : index
+// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: %[[VAL_44:.*]] = sparse_tensor.compress %[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %[[VAL_45:.*]] into %[[VAL_16]]{{\[}}%[[VAL_15]]] : memref<?xf64>, memref<?xi1>, memref<?xindex>, tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: scf.yield %[[VAL_44]] : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: %[[VAL_46:.*]] = sparse_tensor.load %[[VAL_47:.*]] hasInserts : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: %[[VAL_48:.*]] = bufferization.alloc_tensor() : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: %[[VAL_49:.*]] = bufferization.alloc_tensor() : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: return %[[VAL_46]] : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: }
+func.func @sparse_sampled_dd_with_reuse(%args: tensor<8x8xf64, #SortedCOO>, %arga: tensor<8x8xf64>, %argb: tensor<8x8xf64>) -> tensor<8x8xf64, #SortedCOO> {
+  %1 = bufferization.alloc_tensor() : tensor<8x8xf64, #SortedCOO>
+  %2 = linalg.generic #trait_sampled_dense_dense
+    ins(%args, %arga, %argb: tensor<8x8xf64, #SortedCOO>,
+                             tensor<8x8xf64>, tensor<8x8xf64>)
+    outs(%1: tensor<8x8xf64, #SortedCOO>) {
+      ^bb(%s: f64, %a: f64, %b: f64, %x: f64):
+        %p = arith.mulf %a, %b : f64
+        %q = arith.mulf %s, %p : f64
+        %r = arith.addf %x, %q : f64
+        linalg.yield %r : f64
+  } -> tensor<8x8xf64, #SortedCOO>
+
+  // Reuse the input by doing the computation again.
+  %3 = bufferization.alloc_tensor() : tensor<8x8xf64, #SortedCOO>
+  %4 = linalg.generic #trait_sampled_dense_dense
+    ins(%args, %arga, %argb: tensor<8x8xf64, #SortedCOO>,
+                             tensor<8x8xf64>, tensor<8x8xf64>)
+    outs(%3: tensor<8x8xf64, #SortedCOO>) {
+      ^bb(%s: f64, %a: f64, %b: f64, %x: f64):
+        %p = arith.mulf %a, %b : f64
+        %q = arith.mulf %s, %p : f64
+        %r = arith.addf %x, %q : f64
+        linalg.yield %r : f64
+  } -> tensor<8x8xf64, #SortedCOO>
+
+  // Elementwise-min operation (its result is currently unused).
+  %5 = bufferization.alloc_tensor() : tensor<8x8xf64, #SortedCOO>
+  %6 = linalg.generic #trait_vec_op
+    ins(%2, %4: tensor<8x8xf64, #SortedCOO>, tensor<8x8xf64, #SortedCOO>)
+    outs(%5: tensor<8x8xf64, #SortedCOO>) {
+      ^bb(%a: f64, %b: f64, %x: f64):
+        %d0 = sparse_tensor.binary %a, %b : f64, f64 to f64
+          overlap={
+            ^bb0(%a0: f64, %b0: f64):
+              %c0 = arith.minf %a0, %b0: f64
+              sparse_tensor.yield %c0 : f64
+          }
+          left=identity
+          right=identity
+        linalg.yield %d0 : f64
+  } -> tensor<8x8xf64, #SortedCOO>
+
+  return %2 : tensor<8x8xf64, #SortedCOO>
+}
+
+}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
new file
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
@@ -0,0 +1,119 @@
+//
+// NOTE: this test requires gpu-sm80
+//
+//
+// RUN: mlir-opt %s \
+// RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN: | mlir-cpu-runner \
+// RUN: --shared-libs=%mlir_cuda_runtime \
+// RUN: --shared-libs=%mlir_c_runner_utils \
+// RUN: --e entry --entry-point-result=void \
+// RUN: | FileCheck %s
+//
+
+!Filename = !llvm.ptr
+
+#SparseMatrix = #sparse_tensor.encoding<{
+  lvlTypes = [ "compressed", "compressed" ],
+  posWidth = 32,
+  crdWidth = 32
+}>
+
+#trait_sampled_dense_dense = {
+  indexing_maps = [
+    affine_map<(i,j,k) -> (i,j)>,  // S
+    affine_map<(i,j,k) -> (i,k)>,  // A
+    affine_map<(i,j,k) -> (k,j)>,  // B
+    affine_map<(i,j,k) -> (i,j)>   // X (out)
+  ],
+  iterator_types = ["parallel", "parallel", "reduction"],
+  doc = "X(i,j) += S(i,j) SUM_k A(i,k) B(k,j)"
+}
+
+//
+// Integration test that lowers a kernel annotated as sparse to
+// actual sparse code, initializes a matching sparse storage scheme
+// from file, and runs the resulting code with the JIT compiler.
+//
+module {
+  //
+  // A kernel that computes a sampled matrix-matrix multiplication.
+  //
+  func.func @sampled_dense_dense(%args: tensor<?x?xf32, #SparseMatrix>,
+                                 %arga: tensor<?x?xf32>,
+                                 %argb: tensor<?x?xf32>,
+                                 %argx: tensor<?x?xf32>) -> tensor<?x?xf32> {
+    %0 = linalg.generic #trait_sampled_dense_dense
+      ins(%args, %arga, %argb: tensor<?x?xf32, #SparseMatrix>, tensor<?x?xf32>, tensor<?x?xf32>)
+      outs(%argx: tensor<?x?xf32>) {
+        ^bb(%s: f32, %a: f32, %b: f32, %x: f32):
+          %0 = arith.mulf %a, %b : f32
+          %1 = arith.mulf %s, %0 : f32
+          %2 = arith.addf %x, %1 : f32
+          linalg.yield %2 : f32
+    } -> tensor<?x?xf32>
+    return %0 : tensor<?x?xf32>
+  }
+
+  func.func private @getTensorFilename(index) -> (!Filename)
+
+  //
+  // Main driver that reads matrix from file and calls the sparse kernel.
+  //
+  func.func @entry() {
+    %d0 = arith.constant 0.0 : f32
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c5 = arith.constant 5 : index
+    %c10 = arith.constant 10 : index
+
+    // Initialize dense matrices.
+    %x = tensor.generate %c5, %c5 {
+    ^bb0(%i : index, %j : index):
+      tensor.yield %d0 : f32
+    } : tensor<?x?xf32>
+
+    %a = tensor.generate %c5, %c10 {
+    ^bb0(%i: index, %j: index):
+      %p = arith.addi %i, %c1 : index
+      %q = arith.index_cast %p : index to i32
+      %d = arith.sitofp %q : i32 to f32
+      tensor.yield %d : f32
+    } : tensor<?x?xf32>
+
+    %b = tensor.generate %c10, %c5 {
+    ^bb0(%i: index, %j: index):
+      %p = arith.addi %j, %c1 : index
+      %q = arith.index_cast %p : index to i32
+      %d = arith.sitofp %q : i32 to f32
+      tensor.yield %d : f32
+    } : tensor<?x?xf32>
+
+    // Read the sparse matrix from file, construct sparse storage.
+    %fileName = call @getTensorFilename(%c0) : (index) -> (!Filename)
+    %s = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #SparseMatrix>
+
+    // Call the kernel.
+    %0 = call @sampled_dense_dense(%s, %a, %b, %x)
+       : (tensor<?x?xf32, #SparseMatrix>,
+          tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
+
+    // Print the result for verification.
+    //
+    // CHECK: ( 10, 0, 0, 56, 0 )
+    // CHECK: ( 0, 80, 0, 0, 250 )
+    // CHECK: ( 0, 0, 270, 0, 0 )
+    // CHECK: ( 164, 0, 0, 640, 0 )
+    // CHECK: ( 0, 520, 0, 0, 1250 )
+    //
+    scf.for %i = %c0 to %c5 step %c1 {
+      %v = vector.transfer_read %0[%i, %c0], %d0: tensor<?x?xf32>, vector<5xf32>
+      vector.print %v : vector<5xf32>
+    }
+
+    // Release the resources.
+    bufferization.dealloc_tensor %s : tensor<?x?xf32, #SparseMatrix>
+
+    return
+  }
+}
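
Reviewer note (not part of the patch): the guard that dispatches to rewriteSDDMM only inspects the iterator kinds of the linalg.generic. The following is a minimal standalone C++ sketch of that same check, useful for experimenting with the matching logic in isolation; the helper name looksLikeSDDMM and its placement are assumptions, not code from this change, and it relies only on the public Linalg utilities already used in the patch.

#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"

using namespace mlir;

// Returns true if `op` has the SDDMM iteration shape used as the dispatch
// condition above: two parallel dimensions (i, j) over the sampling matrix
// and one reduction dimension (k) over the shared inner dimension.
static bool looksLikeSDDMM(linalg::GenericOp op) {
  SmallVector<utils::IteratorType> iteratorTypes = op.getIteratorTypesArray();
  return iteratorTypes.size() == 3 &&
         linalg::isParallelIterator(iteratorTypes[0]) &&
         linalg::isParallelIterator(iteratorTypes[1]) &&
         linalg::isReductionIterator(iteratorTypes[2]);
}

As the TODOs in the patch note, a production-quality match would additionally verify the number of tensors and the indexing maps ({i, j}, {i, k}, {k, j}, {i, j}); this sketch deliberately mirrors only what the current guard checks.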