diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -259,7 +259,7 @@
   // for the implicit loop indices i_0 .. i_n-1.
   unsigned n = op.getNumLoops();
   std::vector<std::vector<bool>> adjM(n, std::vector<bool>(n, false));
-
+  auto iteratorTypes = op.iterator_types().getValue();
   // Iterate over the indexing maps of every tensor in the tensor expression.
   for (OpOperand *t : op.getInputAndOutputOperands()) {
     // Skip tensor during cycle resolution.
@@ -298,8 +298,20 @@
   topSort.clear();
   topSort.reserve(n);
   std::vector<unsigned> visit(n, 0);
+  // Start the topological sort from the reduction iterators (then visit the
+  // parallel iterators). This heuristic makes reduction iterators appear
+  // later in the topologically sorted result, which increases the chance
+  // that the GenericOp is admissible later in isAdmissableTensorExp().
+  // NOTE: This is still suboptimal, e.g., for R1->P1->P2, R2->P3 the
+  // optimal sequence is P2, P1, *R1*, P3, R2, while the heuristic can
+  // return P3, *R2*, P2, P1, R1.
+  for (unsigned i = 0; i < n; i++)
+    if (visit[i] == 0 && linalg::isReductionIterator(iteratorTypes[i]))
+      if (!topSortDFS(i, visit, topSort, adjM))
+        return false; // cycle!
+  // Then visit the parallel iterators.
   for (unsigned i = 0; i < n; i++)
-    if (visit[i] == 0)
+    if (visit[i] == 0 && !linalg::isReductionIterator(iteratorTypes[i]))
       if (!topSortDFS(i, visit, topSort, adjM))
         return false; // cycle!
   std::reverse(std::begin(topSort), std::end(topSort));
@@ -1774,7 +1786,6 @@
       !computeIterationGraph(merger, op, topSort, SortMask::kSparseOnly)) {
     return resolveCycle(merger, rewriter, op);
   }
-
   // Builds the tensor expression for the Linalg operation in SSA form.
   Optional<unsigned> optExp = merger.buildTensorExpFromLinalg(op);
   if (!optExp.has_value())
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_filter_conv2d.mlir
@@ -24,6 +24,15 @@
     return %0 : tensor<6x6xi32>
   }
 
+  func.func @conv2d_sparse_out(%input: tensor<8x8xi32>,
+               %filter: tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR> {
+    %s = bufferization.alloc_tensor() : tensor<6x6xi32, #DCSR>
+    %0 = linalg.conv_2d
+      ins  (%input, %filter: tensor<8x8xi32>, tensor<3x3xi32, #DCSR>)
+      outs (%s: tensor<6x6xi32, #DCSR>) -> tensor<6x6xi32, #DCSR>
+    return %0 : tensor<6x6xi32, #DCSR>
+  }
+
   func.func @entry() {
     %c0 = arith.constant 0 : index
     %i0 = arith.constant 0 : i32
@@ -53,7 +62,10 @@
     %0 = call @conv2d(%input, %sparse_filter, %output)
        : (tensor<8x8xi32>,
           tensor<3x3xi32, #DCSR>, tensor<6x6xi32>) -> tensor<6x6xi32>
-
+    %1 = call @conv2d_sparse_out(%input, %sparse_filter)
+       : (tensor<8x8xi32>,
+          tensor<3x3xi32, #DCSR>) -> tensor<6x6xi32, #DCSR>
+
     // Verify the output.
     //
     // CHECK: ( ( 0, 0, -1, -6, -1, 6 ),
@@ -67,9 +79,24 @@
        : tensor<6x6xi32>, vector<6x6xi32>
     vector.print %v : vector<6x6xi32>
 
+    //
+    // Should be the same as the dense output.
+    // CHECK: ( ( 0, 0, -1, -6, -1, 6 ),
+    // CHECK-SAME: ( -1, 0, 1, 0, 1, 0 ),
+    // CHECK-SAME: ( 0, -1, 1, 0, 0, 0 ),
+    // CHECK-SAME: ( -1, 0, 0, 0, 0, 0 ),
+    // CHECK-SAME: ( 0, 0, 3, 6, -3, -6 ),
+    // CHECK-SAME: ( 2, -1, 3, 0, -3, 0 ) )
+    //
+    %sparse_ret = sparse_tensor.convert %1
+      : tensor<6x6xi32, #DCSR> to tensor<6x6xi32>
+    %v1 = vector.transfer_read %sparse_ret[%c0, %c0], %i0
+      : tensor<6x6xi32>, vector<6x6xi32>
+    vector.print %v1 : vector<6x6xi32>
+
     // Release the resources.
     bufferization.dealloc_tensor %sparse_filter : tensor<3x3xi32, #DCSR>
-
+    bufferization.dealloc_tensor %1 : tensor<6x6xi32, #DCSR>
     return
   }
 }
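
Note on the heuristic: to see the reduction-first seeding in isolation, below is a minimal self-contained C++ sketch. It mirrors the reverse-post-order shape of the patch's topSortDFS, but the surrounding names (topoSortWithSeeds, seedFirst) and the three-loop demo graph are hypothetical illustrations, not code from this patch.

// Standalone sketch of the seeded topological sort; topoSortWithSeeds,
// seedFirst, and the demo graph below are hypothetical, not MLIR code.
#include <algorithm>
#include <cstdio>
#include <vector>

using AdjMatrix = std::vector<std::vector<bool>>;

// Post-order DFS, modeled after the patch's topSortDFS.
// visit[i]: 0 = unvisited, 1 = on the DFS stack, 2 = done.
static bool topSortDFS(unsigned i, std::vector<unsigned> &visit,
                       std::vector<unsigned> &topSort, const AdjMatrix &adjM) {
  if (visit[i] != 0)
    return visit[i] != 1; // revisiting a node on the stack means a cycle
  visit[i] = 1;
  for (unsigned j = 0, e = visit.size(); j < e; j++)
    if (adjM[i][j] && !topSortDFS(j, visit, topSort, adjM))
      return false;
  visit[i] = 2;
  topSort.push_back(i); // post-order; reversed by the caller
  return true;
}

// Runs the DFS from the seeded roots first, then from everything else.
// Roots of the *first* DFS trees land at the front of the post-order
// vector, hence at the back after the reverse: seeding with the reduction
// loops pushes them toward the innermost positions of the loop nest.
static bool topoSortWithSeeds(const AdjMatrix &adjM,
                              const std::vector<bool> &seedFirst,
                              std::vector<unsigned> &topSort) {
  unsigned n = adjM.size();
  std::vector<unsigned> visit(n, 0);
  topSort.clear();
  topSort.reserve(n);
  for (unsigned i = 0; i < n; i++)
    if (visit[i] == 0 && seedFirst[i] && !topSortDFS(i, visit, topSort, adjM))
      return false; // cycle!
  for (unsigned i = 0; i < n; i++)
    if (visit[i] == 0 && !seedFirst[i] && !topSortDFS(i, visit, topSort, adjM))
      return false; // cycle!
  std::reverse(topSort.begin(), topSort.end());
  return true;
}

int main() {
  // Three loops: edge 0 -> 1 ("loop 0 must be outer to loop 1");
  // loop 2 is an unconstrained reduction.
  AdjMatrix adjM(3, std::vector<bool>(3, false));
  adjM[0][1] = true;
  std::vector<bool> isReduction = {false, false, true};

  std::vector<unsigned> order;
  if (topoSortWithSeeds(adjM, isReduction, order)) // seeded: 0 1 2
    for (unsigned i : order)
      printf("%u ", i);
  printf("\n");

  std::vector<bool> noSeeds(3, false);
  if (topoSortWithSeeds(adjM, noSeeds, order)) // unseeded: 2 0 1
    for (unsigned i : order)
      printf("%u ", i);
  printf("\n");
  return 0;
}

With this demo graph, the seeded sort prints 0 1 2 (the reduction loop ends up innermost), while the unseeded sort prints 2 0 1, placing the reduction outermost, which is exactly the placement the patch's heuristic tries to avoid.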