diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -41,7 +41,12 @@
 namespace {
 
 // Iteration graph sorting.
-enum SortMask { kSparseOnly = 0x0, kIncludeDense = 0x1, kIncludeUndef = 0x2 };
+enum SortMask {
+  kSparseOnly = 0x0,
+  kIncludeDense = 0x1,
+  kIncludeUndef = 0x2,
+  kIncludeAll = 0x3
+};
 
 // Reduction kinds.
 enum Reduction { kNoReduc, kSum, kProduct, kAnd, kOr, kXor };
@@ -105,6 +110,17 @@
 // Sparse compiler analysis methods.
 //===----------------------------------------------------------------------===//
 
+/// Helper method to construct a permuted dimension ordering
+/// that adheres to the given topological sort.
+static AffineMap permute(MLIRContext *context, AffineMap m,
+                         std::vector<unsigned> &topSort) {
+  unsigned sz = topSort.size();
+  SmallVector<unsigned> perm(sz);
+  for (unsigned i = 0; i < sz; i++)
+    perm[i] = m.getPermutedPosition(topSort[i]);
+  return AffineMap::getPermutationMap(perm, context);
+}
+
 /// Helper method to apply dimension ordering permutation.
 static unsigned perm(const SparseTensorEncodingAttr &enc, unsigned d) {
   if (enc) {
@@ -231,8 +247,8 @@
 /// dimensions. Even for dense storage formats, however, the natural index
 /// order yields innermost unit-stride access with better spatial locality.
 static bool computeIterationGraph(Merger &merger, linalg::GenericOp op,
-                                  std::vector<unsigned> &topSort,
-                                  unsigned mask) {
+                                  std::vector<unsigned> &topSort, unsigned mask,
+                                  OpOperand *skip = nullptr) {
   // Set up an n x n from/to adjacency matrix of the iteration graph
   // for the implicit loop indices i_0 .. i_n-1.
   unsigned n = op.getNumLoops();
@@ -240,6 +256,10 @@
   // Iterate over the indexing maps of every tensor in the tensor expression.
   for (OpOperand *t : op.getInputAndOutputOperands()) {
+    // Skip tensor during cycle resolution.
+    if (t == skip)
+      continue;
+
     // Get map and encoding.
     auto map = op.getTiedIndexingMap(t);
     auto enc = getSparseTensorEncoding(t->get().getType());
     assert(map.getNumDims() == n);
@@ -328,7 +348,7 @@
   // A tensor expression with a sparse output tensor that changes its values
   // but not its nonzero structure, an operation called "simply dynamic" in
   // [Bik96,Ch9], is also admissable without special codegen, provided
-  // the tensor's underlying sparse storage scheme can be modified in place.
+  // the tensor's underlying sparse storage scheme can be modified in-place.
   if (merger.isSingleCondition(tensor, exp) && isInPlace(lhs->get()))
     return true;
   // Accept "truly dynamic" if the output tensor materializes uninitialized
@@ -1725,18 +1745,18 @@
     if (!findSparseAnnotations(merger, op))
       return failure();
 
-    // Computes a topologically sorted iteration graph to ensure
-    // tensors are visited in natural index order. Fails on cycles.
-    // This assumes that higher-level passes have already put the
-    // tensors in each tensor expression in a feasible order.
+    // Computes a topologically sorted iteration graph to ensure tensors
+    // are visited in natural index order. Gradually relaxes the considered
+    // constraints until an acyclic iteration graph results, such that sparse
+    // code generation can proceed. As a last resort, an attempt is made
+    // to resolve cycles by inserting a conversion.
     std::vector<unsigned> topSort;
-    if (!computeIterationGraph(merger, op, topSort,
-                               SortMask::kIncludeUndef |
-                                   SortMask::kIncludeDense) &&
+    if (!computeIterationGraph(merger, op, topSort, SortMask::kIncludeAll) &&
         !computeIterationGraph(merger, op, topSort, SortMask::kIncludeUndef) &&
         !computeIterationGraph(merger, op, topSort, SortMask::kIncludeDense) &&
-        !computeIterationGraph(merger, op, topSort, SortMask::kSparseOnly))
-      return failure();
+        !computeIterationGraph(merger, op, topSort, SortMask::kSparseOnly)) {
+      return resolveCycle(merger, rewriter, op);
+    }
 
     // Builds the tensor expression for the Linalg operation in SSA form.
     Optional<unsigned> optExp = merger.buildTensorExpFromLinalg(op);
@@ -1761,6 +1781,45 @@
   }
 
 private:
+  // Last resort cycle resolution.
+  LogicalResult resolveCycle(Merger &merger, PatternRewriter &rewriter,
+                             linalg::GenericOp op) const {
+    // Compute topological sort while leaving out every
+    // sparse input tensor in succession until an acyclic
+    // iteration graph results.
+    std::vector<unsigned> topSort;
+    for (OpOperand *t : op.getInputOperands()) {
+      unsigned tensor = t->getOperandNumber();
+      Value tval = t->get();
+      auto srcEnc = getSparseTensorEncoding(tval.getType());
+      if (!srcEnc ||
+          !computeIterationGraph(merger, op, topSort, SortMask::kSparseOnly, t))
+        continue;
+      // Found an input tensor that resolves the cycle by inserting a
+      // conversion into a sparse tensor that adheres to the iteration
+      // graph order. Also releases the temporary sparse tensor.
+      //
+      // TODO: investigate fusing the conversion with computation,
+      //       especially if it is a direct yield!
+      //
+      auto srcTp = tval.getType().cast<RankedTensorType>();
+      auto dstEnc = SparseTensorEncodingAttr::get(
+          op->getContext(), srcEnc.getDimLevelType(),
+          permute(getContext(), op.getTiedIndexingMap(t), topSort), // new order
+          srcEnc.getPointerBitWidth(), srcEnc.getIndexBitWidth());
+      auto dstTp = RankedTensorType::get(srcTp.getShape(),
+                                         srcTp.getElementType(), dstEnc);
+      auto convert = rewriter.create<ConvertOp>(tval.getLoc(), dstTp, tval);
+      op->setOperand(tensor, convert);
+      rewriter.setInsertionPointAfter(op);
+      rewriter.create<ReleaseOp>(tval.getLoc(), convert);
+      return success();
+    }
+    // Cannot be resolved with a single conversion.
+    // TODO: convert more than one?
+    return failure();
+  }
+
   /// Options to control sparse code generation.
   SparsificationOptions options;
 };
diff --git a/mlir/test/Dialect/SparseTensor/sparse_transpose.mlir b/mlir/test/Dialect/SparseTensor/sparse_transpose.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/sparse_transpose.mlir
@@ -0,0 +1,62 @@
+// RUN: mlir-opt %s -sparsification | FileCheck %s
+
+#DCSR = #sparse_tensor.encoding<{
+  dimLevelType = [ "compressed", "compressed" ]
+}>
+
+#transpose_trait = {
+  indexing_maps = [
+    affine_map<(i,j) -> (j,i)>, // A
+    affine_map<(i,j) -> (i,j)> // X
+  ],
+  iterator_types = ["parallel", "parallel"],
+  doc = "X(i,j) = A(j,i)"
+}
+
+// TODO: improve auto-conversion followed by yield
+
+// CHECK-LABEL: func.func @sparse_transpose_auto(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<3x4xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>) -> tensor<4x3xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> {
+// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 0 : index
+// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 1 : index
+// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 2 : index
+// CHECK: %[[VAL_4:.*]] = bufferization.alloc_tensor() : tensor<4x3xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>
+// CHECK: %[[VAL_5:.*]] = sparse_tensor.convert %[[VAL_0]] : tensor<3x4xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>> to tensor<3x4xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)>, pointerBitWidth = 0, indexBitWidth = 0 }>>
+// CHECK: %[[VAL_6:.*]] = sparse_tensor.pointers %[[VAL_5]], %[[VAL_1]] : tensor<3x4xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)>, pointerBitWidth = 0, indexBitWidth = 0 }>> to memref<?xindex>
+// CHECK: %[[VAL_7:.*]] = sparse_tensor.indices %[[VAL_5]], %[[VAL_1]] : tensor<3x4xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)>, pointerBitWidth = 0, indexBitWidth = 0 }>> to memref<?xindex>
+// CHECK: %[[VAL_8:.*]] = sparse_tensor.pointers %[[VAL_5]], %[[VAL_2]] : tensor<3x4xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)>, pointerBitWidth = 0, indexBitWidth = 0 }>> to memref<?xindex>
+// CHECK: %[[VAL_9:.*]] = sparse_tensor.indices %[[VAL_5]], %[[VAL_2]] : tensor<3x4xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)>, pointerBitWidth = 0, indexBitWidth = 0 }>> to memref<?xindex>
+// CHECK: %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_5]] : tensor<3x4xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)>, pointerBitWidth = 0, indexBitWidth = 0 }>> to memref<?xf64>
+// CHECK: %[[VAL_11:.*]] = memref.alloca(%[[VAL_3]]) : memref<?xindex>
+// CHECK: %[[VAL_12:.*]] = memref.alloca() : memref<f64>
+// CHECK: %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_1]]] : memref<?xindex>
+// CHECK: %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_2]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_15:.*]] = %[[VAL_13]] to %[[VAL_14]] step %[[VAL_2]] {
+// CHECK: %[[VAL_16:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_15]]] : memref<?xindex>
+// CHECK: memref.store %[[VAL_16]], %[[VAL_11]]{{\[}}%[[VAL_1]]] : memref<?xindex>
+// CHECK: %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_15]]] : memref<?xindex>
+// CHECK: %[[VAL_18:.*]] = arith.addi %[[VAL_15]], %[[VAL_2]] : index
+// CHECK: %[[VAL_19:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_18]]] : memref<?xindex>
+// CHECK: scf.for %[[VAL_20:.*]] = %[[VAL_17]] to %[[VAL_19]] step %[[VAL_2]] {
+// CHECK: %[[VAL_21:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_20]]] : memref<?xindex>
+// CHECK: memref.store %[[VAL_21]], %[[VAL_11]]{{\[}}%[[VAL_2]]] : memref<?xindex>
+// CHECK: %[[VAL_22:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_20]]] : memref<?xf64>
+// CHECK: memref.store %[[VAL_22]], %[[VAL_12]][] : memref<f64>
+// CHECK: sparse_tensor.lex_insert %[[VAL_4]], %[[VAL_11]], %[[VAL_12]] : tensor<4x3xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>, memref<?xindex>, memref<f64>
+// CHECK: }
+// CHECK: }
+// CHECK: %[[VAL_23:.*]] = sparse_tensor.load %[[VAL_4]] hasInserts : tensor<4x3xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>
+// CHECK: sparse_tensor.release %[[VAL_5]] : tensor<3x4xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], dimOrdering = affine_map<(d0, d1) -> (d1, d0)>, pointerBitWidth = 0, indexBitWidth = 0 }>>
+// CHECK: return %[[VAL_23]] : tensor<4x3xf64, #sparse_tensor.encoding<{ dimLevelType = [ "compressed", "compressed" ], pointerBitWidth = 0, indexBitWidth = 0 }>>
+// CHECK: }
+func.func @sparse_transpose_auto(%arga: tensor<3x4xf64, #DCSR>)
+                                     -> tensor<4x3xf64, #DCSR> {
+  %i = bufferization.alloc_tensor() : tensor<4x3xf64, #DCSR>
+  %0 = linalg.generic #transpose_trait
+    ins(%arga: tensor<3x4xf64, #DCSR>)
+    outs(%i: tensor<4x3xf64, #DCSR>) {
+      ^bb(%a: f64, %x: f64):
+        linalg.yield %a : f64
+  } -> tensor<4x3xf64, #DCSR>
+  return %0 : tensor<4x3xf64, #DCSR>
+}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_transpose.mlir
@@ -25,25 +25,42 @@
   //
   // Transposing a sparse row-wise matrix into another sparse row-wise
-  // matrix would fail direct codegen, since it introduces a cycle in
-  // the iteration graph. This can be avoided by converting the incoming
+  // matrix introduces a cycle in the iteration graph. This complication
+  // can be avoided by manually inserting a conversion of the incoming
   // matrix into a sparse column-wise matrix first.
   //
-  func.func @sparse_transpose(%arga: tensor<3x4xf64, #DCSR>) -> tensor<4x3xf64, #DCSR> {
-    %t = sparse_tensor.convert %arga : tensor<3x4xf64, #DCSR> to tensor<3x4xf64, #DCSC>
+  func.func @sparse_transpose(%arga: tensor<3x4xf64, #DCSR>)
+                                  -> tensor<4x3xf64, #DCSR> {
+    %t = sparse_tensor.convert %arga
+      : tensor<3x4xf64, #DCSR> to tensor<3x4xf64, #DCSC>
     %i = bufferization.alloc_tensor() : tensor<4x3xf64, #DCSR>
-    %0 = linalg.generic #transpose_trait ins(%t: tensor<3x4xf64, #DCSC>) outs(%i: tensor<4x3xf64, #DCSR>) {
       ^bb(%a: f64, %x: f64):
         linalg.yield %a : f64
-    } -> tensor<4x3xf64, #DCSR>
+    } -> tensor<4x3xf64, #DCSR>
+
+    sparse_tensor.release %t : tensor<3x4xf64, #DCSC>
-    sparse_tensor.release %t : tensor<3x4xf64, #DCSC>
+    return %0 : tensor<4x3xf64, #DCSR>
+  }
-    return %0 : tensor<4x3xf64, #DCSR>
+  //
+  // However, even better, the sparse compiler is able to insert such a
+  // conversion automatically to resolve a cycle in the iteration graph!
+  //
+  func.func @sparse_transpose_auto(%arga: tensor<3x4xf64, #DCSR>)
+                                       -> tensor<4x3xf64, #DCSR> {
+    %i = bufferization.alloc_tensor() : tensor<4x3xf64, #DCSR>
+    %0 = linalg.generic #transpose_trait
+      ins(%arga: tensor<3x4xf64, #DCSR>)
+      outs(%i: tensor<4x3xf64, #DCSR>) {
+        ^bb(%a: f64, %x: f64):
+          linalg.yield %a : f64
+    } -> tensor<4x3xf64, #DCSR>
+    return %0 : tensor<4x3xf64, #DCSR>
   }
 
   //
@@ -63,8 +80,11 @@
     ]> : tensor<3x4xf64>
     %a = sparse_tensor.convert %d : tensor<3x4xf64> to tensor<3x4xf64, #DCSR>
 
-    // Call the kernel.
-    %0 = call @sparse_transpose(%a) : (tensor<3x4xf64, #DCSR>) -> tensor<4x3xf64, #DCSR>
+    // Call the kernels.
+    %0 = call @sparse_transpose(%a)
+        : (tensor<3x4xf64, #DCSR>) -> tensor<4x3xf64, #DCSR>
+    %1 = call @sparse_transpose_auto(%a)
+        : (tensor<3x4xf64, #DCSR>) -> tensor<4x3xf64, #DCSR>
 
     //
    // Verify result.
    //
@@ -74,17 +94,30 @@
    // CHECK: ( 1.1, 0, 3.1 )
    // CHECK-NEXT: ( 1.2, 0, 0 )
    // CHECK-NEXT: ( 0, 0, 3.3 )
    // CHECK-NEXT: ( 1.4, 0, 3.4 )
    //
+    // CHECK-NEXT: ( 1.1, 0, 3.1 )
+    // CHECK-NEXT: ( 1.2, 0, 0 )
+    // CHECK-NEXT: ( 0, 0, 3.3 )
+    // CHECK-NEXT: ( 1.4, 0, 3.4 )
+    //
     %x = sparse_tensor.convert %0 : tensor<4x3xf64, #DCSR> to tensor<4x3xf64>
     %m = bufferization.to_memref %x : memref<4x3xf64>
     scf.for %i = %c0 to %c4 step %c1 {
-      %v = vector.transfer_read %m[%i, %c0], %du: memref<4x3xf64>, vector<3xf64>
-      vector.print %v : vector<3xf64>
+      %v1 = vector.transfer_read %m[%i, %c0], %du: memref<4x3xf64>, vector<3xf64>
+      vector.print %v1 : vector<3xf64>
+    }
+    %y = sparse_tensor.convert %1 : tensor<4x3xf64, #DCSR> to tensor<4x3xf64>
+    %n = bufferization.to_memref %y : memref<4x3xf64>
+    scf.for %i = %c0 to %c4 step %c1 {
+      %v2 = vector.transfer_read %n[%i, %c0], %du: memref<4x3xf64>, vector<3xf64>
+      vector.print %v2 : vector<3xf64>
     }
 
    // Release resources.
    sparse_tensor.release %a : tensor<3x4xf64, #DCSR>
    sparse_tensor.release %0 : tensor<4x3xf64, #DCSR>
+    sparse_tensor.release %1 : tensor<4x3xf64, #DCSR>
    memref.dealloc %m : memref<4x3xf64>
+    memref.dealloc %n : memref<4x3xf64>
    return
  }
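To make the new permute() helper concrete, the following is a minimal standalone sketch (not part of the patch, and it assumes a working MLIR C++ build to compile against) that reproduces the dimension ordering the sparsifier derives for the transpose kernel above: A's indexing map is (i,j) -> (j,i), and once A is skipped the remaining DCSR output constraint admits the topological sort [i, j], so the auto-inserted conversion receives dimOrdering = (d0, d1) -> (d1, d0), i.e. a column-wise (DCSC-like) clone of A. The main() harness and the concrete values are purely illustrative.

// Illustrative sketch only; mirrors the computation done by the permute()
// helper added to Sparsification.cpp in this patch.
#include "mlir/IR/AffineMap.h"
#include "mlir/IR/MLIRContext.h"
#include "llvm/ADT/SmallVector.h"
#include <vector>

using namespace mlir;

int main() {
  MLIRContext context;
  // Indexing map of input A in #transpose_trait: (d0, d1) -> (d1, d0).
  SmallVector<unsigned> aMap = {1, 0};
  AffineMap m = AffineMap::getPermutationMap(aMap, &context);
  // Topological sort of the loop indices once A is skipped: i before j,
  // as demanded by the row-wise (DCSR) output tensor.
  std::vector<unsigned> topSort = {0, 1};
  // Same loop as permute(): find where each loop index appears in A's map.
  SmallVector<unsigned> perm(topSort.size());
  for (unsigned i = 0, e = topSort.size(); i < e; i++)
    perm[i] = m.getPermutedPosition(topSort[i]);
  // Dumps (d0, d1) -> (d1, d0): the dimOrdering of the auto-inserted
  // sparse_tensor.convert, i.e. a column-wise copy of A.
  AffineMap::getPermutationMap(perm, &context).dump();
  return 0;
}

With A stored column-wise, iterating the converted tensor in its natural storage order coincides with the (i, j) order required by the DCSR output, which is why a single conversion plus a release of the temporary is enough to break the cycle, as the CHECK lines in sparse_transpose.mlir show.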