diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
@@ -1162,15 +1162,6 @@
 void LoopEmitter::exitForLoop(RewriterBase &rewriter, Location loc,
                               MutableArrayRef<Value> reduc) {
   const LoopInfo &loopInfo = loopStack.back();
-  rewriter.setInsertionPointToEnd(loopInfo.userCodeBlock);
-  if (!loopInfo.userCodeBlock->empty() &&
-      llvm::isa<scf::ForOp>(loopInfo.loop) &&
-      llvm::isa<scf::YieldOp>(&loopInfo.userCodeBlock->back())) {
-    // scf::For inserts an implicit yield op when there is no loop iter args. In
-    // this case, we need to insert the code before the yield.
-    assert(reduc.empty());
-    rewriter.setInsertionPoint(&loopInfo.userCodeBlock->back());
-  }
   for (auto [tid, lvl, reduced] : loopInfo.sliceDrivenInfo) {
     SliceInfo &info = sliceStack[tid].back();
     assert(isDenseDLT(lvlTypes[tid][lvl]));
@@ -1262,7 +1253,6 @@
                               MutableArrayRef<Value> reduc) {
   const LoopInfo &loopInfo = loopStack.back();
   auto whileOp = llvm::cast<scf::WhileOp>(loopInfo.loop);
-  builder.setInsertionPointToEnd(loopInfo.userCodeBlock);
   Value iv = loopInfo.iv;
 
   // Finalize the induction. Note that the induction could be performed
@@ -1361,7 +1351,9 @@
   }
   assert(o == operands.size() + delta);
-  YIELD(operands);
+  if (!operands.empty())
+    YIELD(operands);
+
   builder.setInsertionPointAfter(whileOp);
 }
@@ -1370,7 +1362,17 @@
   // Clean up the values, it would help use to discover potential bug at a
   // earlier stage (instead of silently using a wrong value).
   const LoopInfo &loopInfo = loopStack.back();
-  SmallVector<Value> red;
+
+  // Sets the insertion point to the right position.
+  rewriter.setInsertionPointToEnd(loopInfo.userCodeBlock);
+  if (!loopInfo.userCodeBlock->empty() &&
+      llvm::isa<scf::YieldOp>(&loopInfo.userCodeBlock->back())) {
+    // scf::While/For inserts an implicit yield op when there are no loop
+    // iter args. In this case, we need to insert the code before the yield.
+    assert(loopInfo.userCodeBlock->back().getNumResults() == 0);
+    rewriter.setInsertionPoint(&loopInfo.userCodeBlock->back());
+  }
+
   if (llvm::isa<scf::WhileOp>(loopInfo.loop)) {
     exitWhileLoop(rewriter, loc, reduc);
   } else {
@@ -1449,7 +1451,8 @@
   Value cont = CMPI(ult, coord, sliceHi);
   TypeRange types = args.drop_front(2).getTypes();
 
-  auto ifOp = builder.create<scf::IfOp>(loc, types, cont, true);
+  auto ifOp = builder.create<scf::IfOp>(loc, types, cont,
+                                        /*withElseBlock=*/!types.empty());
   {
     // 2 reduction variable maintained by us.
     SmallVector<Value> ifRet = args.drop_front(2);
@@ -1457,8 +1460,10 @@
     OpBuilder::InsertionGuard guard(builder);
 
     // If coord >= sliceHi.
-    builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
-    YIELD(ifRet);
+    if (!ifRet.empty()) {
+      builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
+      YIELD(ifRet);
+    }
 
     // If coord < sliceHi.
     builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
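Note on the LoopEmitter.cpp changes above: they all guard against loops and branches that carry no values. An scf.for or scf.while whose body has no iter args already ends in an implicit scf.yield, so code appended "at the end" of the user block would land after the terminator; likewise, a zero-result scf.if needs neither an else block nor explicit yields. A minimal MLIR sketch of the two shapes involved (a schematic fragment, not part of the patch; the demo.* ops and the SSA operands are placeholders):

    // The body of an scf.for with no iter args already ends in an implicit
    // scf.yield, so epilogue code must be inserted before that terminator.
    scf.for %iv = %lo to %hi step %c1 {
      "demo.user_code"(%iv) : (index) -> ()
    }

    // A zero-result scf.if may omit the else region and explicit yields.
    scf.if %cond {
      "demo.side_effect"() : () -> ()
    }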
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -699,14 +699,18 @@
       const AffineExpr fa = map.getResult(toOrigDim(enc, lvl - 1));
       const AffineExpr ta = map.getResult(toOrigDim(enc, lvl));
 
-      if (auto fdim = fa.dyn_cast<AffineDimExpr>()) {
+      if (fa.isa<AffineDimExpr>() || ta.isa<AffineDimExpr>()) {
+        AffineDimCollector fCollector;
+        fCollector.walkPostOrder(fa);
+
         AffineDimCollector tCollector;
         tCollector.walkPostOrder(ta);
-
-        const LoopId f = env.makeLoopId(fdim.getPosition());
-        for (auto td : tCollector.dims) {
-          const LoopId t = env.makeLoopId(td.getPosition());
-          addIterOrdering(f, t, adjM, inDegree);
+        for (auto fd : fCollector.dims) {
+          for (auto td : tCollector.dims) {
+            const LoopId f = env.makeLoopId(fd.getPosition());
+            const LoopId t = env.makeLoopId(td.getPosition());
+            addIterOrdering(f, t, adjM, inDegree);
+          }
         }
         continue;
       }
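Note on the Sparsification.cpp change above: it generalizes iteration-graph construction. The old code added ordering constraints only when the previous level's expression (fa) was a plain AffineDimExpr; the new code also handles a compound expression on either side by collecting every dimension of both levels and adding pairwise f-before-t orderings. Pooling hits exactly this case: with unit strides and dilations, the input of linalg.pooling_nhwc_sum is indexed by adjacent compound level expressions, roughly the map sketched below (an illustration based on the linalg named-op definition, not copied from the patch):

    // d0 = N, d1 = OH, d2 = OW, d3 = KH, d4 = KW, d5 = C; levels 1 and 2
    // (d1 + d3 and d2 + d4) are both compound, so neither is an AffineDimExpr.
    affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1 + d3, d2 + d4, d5)>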
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pooling_nhwc.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pooling_nhwc.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_pooling_nhwc.mlir
@@ -0,0 +1,93 @@
+// DEFINE: %{option} = "enable-runtime-library=false enable-index-reduction=true"
+// DEFINE: %{compile} = mlir-opt %s --sparse-compiler=%{option}
+// DEFINE: %{run} = mlir-cpu-runner \
+// DEFINE:  -e entry -entry-point-result=void \
+// DEFINE:  -shared-libs=%mlir_c_runner_utils | \
+// DEFINE: FileCheck %s
+//
+// RUN: %{compile} | %{run}
+//
+// Do the same run, but now with the runtime library.
+// REDEFINE: %{option} = "enable-runtime-library=true enable-buffer-initialization=true enable-index-reduction=true"
+// RUN: %{compile} | %{run}
+//
+// Do the same run, but now with direct IR generation and vectorization.
+// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true enable-index-reduction=true"
+// RUN: %{compile} | %{run}
+
+// Do the same run, but now with direct IR generation and, if available, VLA
+// vectorization.
+// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-arm-sve=%ENABLE_VLA enable-index-reduction=true"
+// REDEFINE: %{run} = %lli_host_or_aarch64_cmd \
+// REDEFINE:   --entry-function=entry_lli \
+// REDEFINE:   --extra-module=%S/Inputs/main_for_lli.ll \
+// REDEFINE:   %VLA_ARCH_ATTR_OPTIONS \
+// REDEFINE:   --dlopen=%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext | \
+// REDEFINE: FileCheck %s
+// RUN: %{compile} | mlir-translate -mlir-to-llvmir | %{run}
+
+#CCCC = #sparse_tensor.encoding<{ lvlTypes = [ "compressed", "compressed", "compressed", "compressed" ], posWidth = 32, crdWidth = 32 }>
+
+func.func @pooling_nhwc_sum_CCCC(%input: tensor<1x4x4x1xf32, #CCCC>, %filter: tensor<2x2xf32>) -> tensor<1x3x3x1xf32, #CCCC> {
+  %init = bufferization.alloc_tensor() : tensor<1x3x3x1xf32, #CCCC>
+  %0 = linalg.pooling_nhwc_sum {dilations = dense<1> : tensor<2xi64>,
+                                strides = dense<1> : tensor<2xi64>}
+     ins (%input, %filter: tensor<1x4x4x1xf32, #CCCC>, tensor<2x2xf32>)
+    outs (%init: tensor<1x3x3x1xf32, #CCCC>) -> tensor<1x3x3x1xf32, #CCCC>
+  return %0 : tensor<1x3x3x1xf32, #CCCC>
+}
+
+func.func @pooling_nhwc_sum(%input: tensor<1x4x4x1xf32>, %filter: tensor<2x2xf32>) -> tensor<1x3x3x1xf32> {
+  %init = arith.constant dense<[[ [[0.0], [0.0], [0.0]],
+                                  [[0.0], [0.0], [0.0]],
+                                  [[0.0], [0.0], [0.0]] ]]> : tensor<1x3x3x1xf32>
+  %0 = linalg.pooling_nhwc_sum {dilations = dense<1> : tensor<2xi64>,
+                                strides = dense<1> : tensor<2xi64>}
+     ins (%input, %filter: tensor<1x4x4x1xf32>, tensor<2x2xf32>)
+    outs (%init: tensor<1x3x3x1xf32>) -> tensor<1x3x3x1xf32>
+  return %0 : tensor<1x3x3x1xf32>
+}
+
+
+func.func @entry() {
+  %c0 = arith.constant 0 : index
+  %zero = arith.constant 0.00000e+00 : f32
+
+  %filter = arith.constant dense<
+     [[ 1.0, 0.0],
+      [ 0.0, 1.0]]
+  > : tensor<2x2xf32>
+
+  %in_dense = arith.constant dense<
+     [[[[1.0], [2.0], [1.0], [2.0]],
+       [[1.0], [2.0], [1.0], [2.0]],
+       [[1.0], [2.0], [1.0], [2.0]],
+       [[1.0], [2.0], [1.0], [2.0]]]]
+  > : tensor<1x4x4x1xf32>
+
+  %in_CCCC = sparse_tensor.convert %in_dense : tensor<1x4x4x1xf32> to tensor<1x4x4x1xf32, #CCCC>
+
+  %dense_ret = call @pooling_nhwc_sum(%in_dense, %filter) : (tensor<1x4x4x1xf32>, tensor<2x2xf32>) -> tensor<1x3x3x1xf32>
+  %CCCC_ret = call @pooling_nhwc_sum_CCCC(%in_CCCC, %filter) : (tensor<1x4x4x1xf32, #CCCC>, tensor<2x2xf32>) -> tensor<1x3x3x1xf32, #CCCC>
+
+  // CHECK: ( ( ( ( 6 ), ( 6 ), ( 6 ) ), ( ( 6 ), ( 6 ), ( 6 ) ), ( ( 6 ), ( 6 ), ( 6 ) ) ) )
+  %dense_v = vector.transfer_read %dense_ret[%c0, %c0, %c0, %c0], %zero
+      : tensor<1x3x3x1xf32>, vector<1x3x3x1xf32>
+  vector.print %dense_v : vector<1x3x3x1xf32>
+
+  //
+  // Sparse pooling should have the same output.
+  //
+
+  // CHECK-NEXT: ( ( ( ( 6 ), ( 6 ), ( 6 ) ), ( ( 6 ), ( 6 ), ( 6 ) ), ( ( 6 ), ( 6 ), ( 6 ) ) ) )
+  %s1 = sparse_tensor.convert %CCCC_ret : tensor<1x3x3x1xf32, #CCCC> to tensor<1x3x3x1xf32>
+  %v1 = vector.transfer_read %s1[%c0, %c0, %c0, %c0], %zero
+      : tensor<1x3x3x1xf32>, vector<1x3x3x1xf32>
+  vector.print %v1 : vector<1x3x3x1xf32>
+
+  // Releases resources.
+  bufferization.dealloc_tensor %in_CCCC : tensor<1x4x4x1xf32, #CCCC>
+  bufferization.dealloc_tensor %CCCC_ret : tensor<1x3x3x1xf32, #CCCC>
+  bufferization.dealloc_tensor %dense_ret : tensor<1x3x3x1xf32>
+  return
+}
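A quick sanity check of the CHECK values in the new test: linalg pooling ops read the filter operand only for its window shape, so the values in %filter do not enter the sum. Every 2x2 window of the 4x4 input covers two 1s and two 2s, hence each of the nine output entries is

    1.0 + 2.0 + 1.0 + 2.0 = 6.0

which matches the all-6 vectors printed by both the dense and the sparse run.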