diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h
--- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h
@@ -452,11 +452,11 @@

   /// Generates a nested loop that iterates over tid on all the coordinates on
   /// lvl.
-  ValueRange
-  genUnResolvedSliceTreeTraverse(OpBuilder &builder, Location loc, TensorId tid,
-                                 ArrayRef<const SliceInfo *> unResLvls,
-                                 ValueRange userReduc,
-                                 LoopBodyBuilder bodyBuilder);
+  ValueRange genUnResolvedSliceTreeTraverse(
+      OpBuilder &builder, Location loc, TensorId tid,
+      ArrayRef<const SliceInfo *> unResLvls,
+      std::optional<std::pair<TensorId, Level>> firstResLvl,
+      ValueRange userReduc, LoopBodyBuilder bodyBuilder);

   /// Generates code to get the first non-empty slice of tid on lvl, when all
   /// the previous level before `lvl` are resolved (or lvl is the first level).
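The new `firstResLvl` parameter lets a caller tell the traversal that the outermost slice level is already fully resolved, in which case no outer loop needs to be emitted and the resolved position can be reused directly. Below is a standalone sketch of that optional-pair pattern with simplified stand-in types; `positionOf` is hypothetical and only models the emitter's `posits[tid][lvl]` lookup, not the actual MLIR LoopEmitter API.

#include <cstdio>
#include <optional>
#include <utility>

using TensorId = unsigned;
using Level = unsigned;

// Hypothetical stand-in for the emitter's posits[tid][lvl] table.
static int positionOf(TensorId tid, Level lvl) {
  return static_cast<int>(tid * 10 + lvl);
}

// Mirrors the control flow implied by the new signature: an engaged optional
// means "the first level is resolved, reuse its position"; an empty optional
// means "a loop over the unresolved slice must be generated instead".
static int traverse(std::optional<std::pair<TensorId, Level>> firstResLvl) {
  if (firstResLvl.has_value())
    return positionOf(firstResLvl->first, firstResLvl->second);
  return -1; // placeholder for the loop-emitting path
}

int main() {
  printf("%d\n", traverse(std::make_pair(TensorId{1}, Level{2}))); // prints 12
  printf("%d\n", traverse(std::nullopt));                          // prints -1
}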
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
@@ -1521,65 +1521,69 @@
 // }
 ValueRange LoopEmitter::genUnResolvedSliceTreeTraverse(
     OpBuilder &builder, Location loc, TensorId tid,
-    ArrayRef<const SliceInfo *> unResLvls, ValueRange userReduc,
+    ArrayRef<const SliceInfo *> unResLvls,
+    std::optional<std::pair<TensorId, Level>> firstResLvl, ValueRange userReduc,
     LoopBodyBuilder bodyBuilder) {
-  // assert(unResLvls.size() == 1 && "TODO");
-  Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2);
-
-  const SliceInfo &frontSlice = *unResLvls.back();
-  Level firstLvl = *frontSlice.slicedOnLvl;
-  assert(!lvlFullyResolved(tid, firstLvl) && "TODO");
-  // FIXME: it is not zero when the first level is fully resolved.
+  Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2);
   Value pos = c0;
   OpBuilder::InsertPoint ip;
   SmallVector<Value> innerArgs(userReduc.begin(), userReduc.end());
-  scf::ForOp outerMost = nullptr;
-  if (!lvlFullyResolved(tid, firstLvl)) {
-    if (isCompressedDLT(lvlTypes[tid][firstLvl])) {
-      unsigned depth = frontSlice.depth - 1;
-      Value offset = frontSlice.offset;
-      Value sPtrBuf = slicePosBuffer[tid][firstLvl][depth];
-      Value mSz = genIndexLoad(builder, loc, sPtrBuf, c0); // memSize
-      outerMost = builder.create<scf::ForOp>(
-          loc, c2, mSz, c2, innerArgs,
-          [this, c1, tid, firstLvl, offset, sPtrBuf, &ip, &pos, &innerArgs](
-              OpBuilder &builder, Location loc, Value iv, ValueRange iterArgs) {
-            // generate traversal for each level.
-            Value loopLo = genIndexLoad(builder, loc, sPtrBuf, iv);
-            Value loopHi = genIndexLoad(builder, loc, sPtrBuf, ADDI(iv, c1));
-            ValueRange itArgs =
-                genSliceLvlTraverseLoop(
-                    builder, loc, loopLo, loopHi, offset,
-                    sliceSizes[tid][firstLvl].back(), tid, firstLvl, iterArgs,
-                    false,
-                    [&](OpBuilder &builder, Location, Value iv,
-                        MutableArrayRef<Value> reduc) {
-                      ip = builder.saveInsertionPoint();
-                      pos = iv;
-                      innerArgs.assign(reduc.begin(), reduc.end());
-                    })
-                    .second;
-            YIELD(itArgs);
-          });
-    } else if (isDenseDLT(lvlTypes[tid][firstLvl])) {
-      assert(firstLvl == 0); // This must be the first level.
-      Value lb = frontSlice.offset;
-      Value sliceSz =
-          sliceSizes[tid][*frontSlice.slicedOnLvl][frontSlice.depth - 1];
-      Value ub = ADDI(lb, sliceSz);
-      outerMost = builder.create<scf::ForOp>(
-          loc, lb, ub, c1, innerArgs,
-          [&](OpBuilder &builder, Location loc, Value iv, ValueRange iterArgs) {
-            ip = builder.saveInsertionPoint();
-            pos = iv;
-            innerArgs.assign(iterArgs.begin(), iterArgs.end());
-          });
+  scf::ForOp outerMost = nullptr; // the outermost loop.
+  if (firstResLvl.has_value()) {
+    // Overwrite position when the first level is fully resolved.
+    pos = posits[firstResLvl->first][firstResLvl->second];
+    ip = builder.saveInsertionPoint();
+  } else {
+    const SliceInfo &frontSlice = *unResLvls.back();
+    Level firstLvl = *frontSlice.slicedOnLvl;
+    if (!lvlFullyResolved(tid, firstLvl)) {
+      if (isCompressedDLT(lvlTypes[tid][firstLvl])) {
+        unsigned depth = frontSlice.depth - 1;
+        Value offset = frontSlice.offset;
+        Value sPtrBuf = slicePosBuffer[tid][firstLvl][depth];
+        Value mSz = genIndexLoad(builder, loc, sPtrBuf, c0); // memSize
+        outerMost = builder.create<scf::ForOp>(
+            loc, c2, mSz, c2, innerArgs,
+            [this, c1, tid, firstLvl, offset, sPtrBuf, &ip, &pos,
+             &innerArgs](OpBuilder &builder, Location loc, Value iv,
+                         ValueRange iterArgs) {
+              // generate traversal for each level.
+              Value loopLo = genIndexLoad(builder, loc, sPtrBuf, iv);
+              Value loopHi = genIndexLoad(builder, loc, sPtrBuf, ADDI(iv, c1));
+              ValueRange itArgs =
+                  genSliceLvlTraverseLoop(
+                      builder, loc, loopLo, loopHi, offset,
+                      sliceSizes[tid][firstLvl].back(), tid, firstLvl, iterArgs,
+                      false,
+                      [&](OpBuilder &builder, Location, Value iv,
+                          MutableArrayRef<Value> reduc) {
+                        ip = builder.saveInsertionPoint();
+                        pos = iv;
+                        innerArgs.assign(reduc.begin(), reduc.end());
+                      })
+                      .second;
+              YIELD(itArgs);
+            });
+      } else if (isDenseDLT(lvlTypes[tid][firstLvl])) {
+        assert(firstLvl == 0); // This must be the first level.
+        Value lb = frontSlice.offset;
+        Value sliceSz =
+            sliceSizes[tid][*frontSlice.slicedOnLvl][frontSlice.depth - 1];
+        Value ub = ADDI(lb, sliceSz);
+        outerMost = builder.create<scf::ForOp>(
+            loc, lb, ub, c1, innerArgs,
+            [&](OpBuilder &builder, Location loc, Value iv,
+                ValueRange iterArgs) {
+              ip = builder.saveInsertionPoint();
+              pos = iv;
+              innerArgs.assign(iterArgs.begin(), iterArgs.end());
+            });
+      }
+      // We generated the loop for the first slice above, now remove it.
+      unResLvls = unResLvls.drop_back();
     }
   }
-    // We generated the loop for the first slice above, now remove it.
-    unResLvls = unResLvls.drop_back();
-  }
-  // Reset the insertion point into the loop body.
   builder.restoreInsertionPoint(ip);

   if (!unResLvls.empty()) {
@@ -1611,12 +1615,21 @@
           bodyBuilder(builder, loc, pos, innerArgs);
           return innerArgs;
         });
-    YIELD(denseNest.results);
+
+    if (!outerMost) {
+      // If the outermost loop has not been set, this is the outermost loop.
+      outerMost = denseNest.loops.front();
+    } else {
+      // Otherwise we need to generate yield operations to link the SSA chain.
+      YIELD(denseNest.results);
+    }
   } else {
+    assert(outerMost);
     // Generates user request loop body.
     bodyBuilder(builder, loc, pos, innerArgs);
     YIELD(innerArgs);
   }
+  assert(outerMost);
   // Insert after current while operation.
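The compressed-level branch above walks the slice position buffer in pairs: slot 0 holds the number of slots in use (memSize), slot 1 holds a loop index, and [loopLo, loopHi) bounds start at slot 2, which is why the emitted scf.for runs from c2 to mSz with stride c2. A host-side sketch of that buffer walk, using illustrative values rather than the generated IR:

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical slice position buffer following the convention above:
  // [memSize, index, pLo0, pHi0, pLo1, pHi1, ...].
  std::vector<int> sPtrBuf = {6, 0, 3, 5, 7, 9};
  int memSize = sPtrBuf[0];

  // Mirrors "for (iv = 2; iv < memSize; iv += 2)": each iteration picks up
  // one [loopLo, loopHi) range, which would drive genSliceLvlTraverseLoop.
  for (int iv = 2; iv < memSize; iv += 2) {
    int loopLo = sPtrBuf[iv];
    int loopHi = sPtrBuf[iv + 1];
    printf("traverse coordinates in [%d, %d)\n", loopLo, loopHi);
  }
}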
   builder.setInsertionPointAfter(outerMost);
   return outerMost.getResults();
 }

@@ -1624,7 +1637,6 @@

 void LoopEmitter::genResolvedSliceBegin(OpBuilder &builder, Location loc,
                                         TensorId tid, Level lvl) {
-  assert(lvl == 0 && "TODO: handle non-first level");
   Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2), c3 = C_IDX(3),
         c4 = C_IDX(4);
   if (isDenseDLT(lvlTypes[tid][lvl])) {
@@ -1634,14 +1646,23 @@
                           lvl, /*depth=*/1);
     return;
   }
-  Value size = sliceSizes[tid][0][0];
-  Value sPtrBuf = slicePosBuffer[tid][0][0];
-  Value pHi = genIndexLoad(builder, loc, positionsBuffers[tid][0], c1);
+  Value size = sliceSizes[tid][lvl][0];
+  Value sPtrBuf = slicePosBuffer[tid][lvl][0];
+  Value pHi, pLo;
+  if (lvl == 0) {
+    pLo = c0;
+    pHi = genIndexLoad(builder, loc, positionsBuffers[tid][0], c1);
+  } else {
+    pLo = genIndexLoad(builder, loc, positionsBuffers[tid][lvl],
+                       posits[tid][lvl - 1]);
+    pHi = genIndexLoad(builder, loc, positionsBuffers[tid][lvl],
+                       ADDI(posits[tid][lvl - 1], c1));
+  }
   // Fills out pIdxBuffer[tid][lvl][0] with [/*memSize =*/4, 0, 0, pHi]
   builder.create<memref::StoreOp>(loc, c4, sPtrBuf, c0);  // memSize = 4
   builder.create<memref::StoreOp>(loc, c0, sPtrBuf, c1);  // index = 0
-  builder.create<memref::StoreOp>(loc, c0, sPtrBuf, c2);  // pLo = 0;
-  builder.create<memref::StoreOp>(loc, pHi, sPtrBuf, c3); // loaded pHi.
+  builder.create<memref::StoreOp>(loc, pLo, sPtrBuf, c2); // pLo
+  builder.create<memref::StoreOp>(loc, pHi, sPtrBuf, c3); // pHi

   // This is an non empty tensor if 0 < pHi.
   Value isNonEmpty = CMPI(ult, c0, pHi);
@@ -1703,10 +1724,15 @@
   assert(slicePosBuffer[tid][lvl - 1].size() == sliceStack[tid].back().depth);

   SmallVector<const SliceInfo *> unResSlices;
+  std::optional<std::pair<TensorId, Level>> firstResLvl;
   for (Level curLvl = lvl; curLvl >= 1; curLvl--) {
     Level prevLvl = curLvl - 1;
+    if (lvlFullyResolved(tid, prevLvl)) {
+      firstResLvl = std::make_pair(tid, prevLvl);
+      break;
+    }
     unResSlices.push_back(&getMostRecentSliceOnLvl(tid, prevLvl));
-    if (!isDenseDLT(lvlTypes[tid][prevLvl]) || lvlFullyResolved(tid, prevLvl)) {
+    if (!isDenseDLT(lvlTypes[tid][prevLvl])) {
       break;
     }
   }
@@ -1722,7 +1748,7 @@
   };

   ValueRange result = genUnResolvedSliceTreeTraverse(
-      builder, loc, tid, unResSlices, reduc,
+      builder, loc, tid, unResSlices, firstResLvl, reduc,
       [this, c1, c2, tid, lvl, sPtrBuf](OpBuilder &builder, Location loc,
                                         Value iv,
                                         MutableArrayRef<Value> reduc) {
@@ -1869,7 +1895,7 @@
 void LoopEmitter::invalidateSliceIterIdx(OpBuilder &builder, Location loc,
                                          TensorId tid, Level lvl) {
   for (unsigned i = 0; i <= lvl; i++) {
-    if (!isDenseDLT(lvlTypes[tid][i])) {
+    if (!isDenseDLT(lvlTypes[tid][i]) && !dependentLvlMap[tid][i].empty()) {
       builder.create<memref::StoreOp>(loc, C_IDX(0),
                                       slicePosBuffer[tid][i].back(), C_IDX(1));
     }
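With the `assert(lvl == 0 ...)` dropped, genResolvedSliceBegin now handles non-first levels: for lvl == 0 the slice spans positions [0, positions[1]), while for deeper levels the bounds come from the position pair at the parent's already-resolved coordinate, and the four stores fill the slice position buffer as [memSize = 4, index = 0, pLo, pHi]. A small sketch of that derivation with stand-in data (plain arrays, not the MLIR builder API; the values are made up):

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical positions buffer of one compressed level.
  std::vector<int> positions = {0, 3, 5, 9};
  unsigned lvl = 1;  // some non-first level
  int parentPos = 2; // stand-in for posits[tid][lvl - 1], already resolved

  int pLo, pHi;
  if (lvl == 0) {
    pLo = 0;
    pHi = positions[1];
  } else {
    pLo = positions[parentPos];
    pHi = positions[parentPos + 1];
  }

  // The layout written to the slice position buffer by the stores above.
  int sPtrBuf[4] = {/*memSize=*/4, /*index=*/0, pLo, pHi};
  printf("[%d, %d, %d, %d]\n", sPtrBuf[0], sPtrBuf[1], sPtrBuf[2], sPtrBuf[3]);
}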
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_1d_nwc_wcf.mlir
@@ -1,9 +1,4 @@
-// UNSUPPORTED: target={{.*}}
-// FIXME: The test case is disabled (for now) because affine index on sparse tensor
-// are not handled efficiently by sparse compiler, the test case will be re-enabled
-// after new algorithm is implemented.
-
-// DEFINE: %{option} = enable-runtime-library=true
+// DEFINE: %{option} = "enable-runtime-library=true enable-index-reduction=true"
 // DEFINE: %{compile} = mlir-opt %s --sparse-compiler=%{option}
 // DEFINE: %{run} = mlir-cpu-runner \
 // DEFINE:  -e entry -entry-point-result=void \
@@ -13,16 +8,16 @@
 // RUN: %{compile} | %{run}
 //
 // Do the same run, but now with direct IR generation.
-// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true"
+// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true enable-index-reduction=true"
 // RUN: %{compile} | %{run}
 //
 // Do the same run, but now with direct IR generation and vectorization.
-// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true"
+// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true enable-index-reduction=true"
 // RUN: %{compile} | %{run}

 // Do the same run, but now with direct IR generation and, if available, VLA
 // vectorization.
-// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-arm-sve=%ENABLE_VLA"
+// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-arm-sve=%ENABLE_VLA enable-index-reduction=true"
 // REDEFINE: %{run} = %lli \
 // REDEFINE:   --entry-function=entry_lli \
 // REDEFINE:   --extra-module=%S/Inputs/main_for_lli.ll \
@@ -55,26 +50,26 @@
   return %ret : tensor<?x?x?xf32>
 }

-func.func @conv_1d_nwc_wcf_CCC(%arg0: tensor<?x?x?xf32, #CCC>, %arg1: tensor<?x?x?xf32, #CCC>) -> tensor<?x?x?xf32, #CCC> {
+func.func @conv_1d_nwc_wcf_CCC(%arg0: tensor<?x?x?xf32, #CCC>, %arg1: tensor<?x?x?xf32>) -> tensor<?x?x?xf32, #CCC> {
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
   %c6 = arith.constant 6 : index
   %s = bufferization.alloc_tensor(%c3, %c6, %c1) : tensor<?x?x?xf32, #CCC>
   %ret = linalg.conv_1d_nwc_wcf {dilations = dense<1> : tensor<1xi64>,
                                  strides = dense<1> : tensor<1xi64>}
-     ins (%arg0, %arg1: tensor<?x?x?xf32, #CCC>, tensor<?x?x?xf32, #CCC>)
+     ins (%arg0, %arg1: tensor<?x?x?xf32, #CCC>, tensor<?x?x?xf32>)
     outs (%s: tensor<?x?x?xf32, #CCC>) -> tensor<?x?x?xf32, #CCC>
   return %ret : tensor<?x?x?xf32, #CCC>
 }

-func.func @conv_1d_nwc_wcf_CDC(%arg0: tensor<?x?x?xf32, #CDC>, %arg1: tensor<?x?x?xf32, #CDC>) -> tensor<?x?x?xf32, #CDC> {
+func.func @conv_1d_nwc_wcf_CDC(%arg0: tensor<?x?x?xf32, #CDC>, %arg1: tensor<?x?x?xf32>) -> tensor<?x?x?xf32, #CDC> {
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
   %c6 = arith.constant 6 : index
   %s = bufferization.alloc_tensor(%c3, %c6, %c1) : tensor<?x?x?xf32, #CDC>
   %ret = linalg.conv_1d_nwc_wcf {dilations = dense<1> : tensor<1xi64>,
                                  strides = dense<1> : tensor<1xi64>}
-     ins (%arg0, %arg1: tensor<?x?x?xf32, #CDC>, tensor<?x?x?xf32, #CDC>)
+     ins (%arg0, %arg1: tensor<?x?x?xf32, #CDC>, tensor<?x?x?xf32>)
    outs (%s: tensor<?x?x?xf32, #CDC>) -> tensor<?x?x?xf32, #CDC>
   return %ret : tensor<?x?x?xf32, #CDC>
 }
@@ -91,22 +86,18 @@
   %in1D_tmp = call @alloc_3d_filled_f32(%c3, %c8, %c1, %val) : (index, index, index, f32) -> (tensor<?x?x?xf32>)
   %in1D_nwc = tensor.insert %f10 into %in1D_tmp[%c0, %c3, %c0] : tensor<?x?x?xf32>
+
   %filter1D_nwc = call @alloc_3d_filled_f32(%c3, %c1, %c1, %val) : (index, index, index, f32) -> (tensor<?x?x?xf32>)
   %out1D_nwc = call @alloc_3d_filled_f32(%c3, %c6, %c1, %zero) : (index, index, index, f32) -> (tensor<?x?x?xf32>)

   %in1D_nwc_CCC = sparse_tensor.convert %in1D_nwc
     : tensor<?x?x?xf32> to tensor<?x?x?xf32, #CCC>
-  %filter1D_nwc_CCC = sparse_tensor.convert %filter1D_nwc
-    : tensor<?x?x?xf32> to tensor<?x?x?xf32, #CCC>
-
   %in1D_nwc_CDC = sparse_tensor.convert %in1D_nwc
     : tensor<?x?x?xf32> to tensor<?x?x?xf32, #CDC>
-  %filter1D_nwc_CDC = sparse_tensor.convert %filter1D_nwc
-    : tensor<?x?x?xf32> to tensor<?x?x?xf32, #CDC>

   %dense_ret = call @conv_1d_nwc_wcf(%in1D_nwc, %filter1D_nwc, %out1D_nwc) : (tensor<?x?x?xf32>, tensor<?x?x?xf32>, tensor<?x?x?xf32>) -> (tensor<?x?x?xf32>)
-  %CCC_ret = call @conv_1d_nwc_wcf_CCC(%in1D_nwc_CCC, %filter1D_nwc_CCC) : (tensor<?x?x?xf32, #CCC>, tensor<?x?x?xf32, #CCC>) -> (tensor<?x?x?xf32, #CCC>)
-  %CDC_ret = call @conv_1d_nwc_wcf_CDC(%in1D_nwc_CDC, %filter1D_nwc_CDC) : (tensor<?x?x?xf32, #CDC>, tensor<?x?x?xf32, #CDC>) -> (tensor<?x?x?xf32, #CDC>)
+  %CCC_ret = call @conv_1d_nwc_wcf_CCC(%in1D_nwc_CCC, %filter1D_nwc) : (tensor<?x?x?xf32, #CCC>, tensor<?x?x?xf32>) -> (tensor<?x?x?xf32, #CCC>)
+  %CDC_ret = call @conv_1d_nwc_wcf_CDC(%in1D_nwc_CDC, %filter1D_nwc) : (tensor<?x?x?xf32, #CDC>, tensor<?x?x?xf32>) -> (tensor<?x?x?xf32, #CDC>)

   // CHECK: ( ( ( 12 ), ( 28 ), ( 28 ), ( 28 ), ( 12 ), ( 12 ) ),
   // CHECK-SAME:  ( ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ), ( 12 ) ),
@@ -139,9 +130,7 @@
   bufferization.dealloc_tensor %out1D_nwc : tensor<?x?x?xf32>

   bufferization.dealloc_tensor %in1D_nwc_CDC : tensor<?x?x?xf32, #CDC>
-  bufferization.dealloc_tensor %filter1D_nwc_CDC : tensor<?x?x?xf32, #CDC>
   bufferization.dealloc_tensor %in1D_nwc_CCC : tensor<?x?x?xf32, #CCC>
-  bufferization.dealloc_tensor %filter1D_nwc_CCC : tensor<?x?x?xf32, #CCC>

   bufferization.dealloc_tensor %CCC_ret : tensor<?x?x?xf32, #CCC>
   bufferization.dealloc_tensor %CDC_ret : tensor<?x?x?xf32, #CDC>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nhwc_hwcf.mlir
@@ -1,9 +1,4 @@
-// UNSUPPORTED: target={{.*}}
-// FIXME: The test case is disabled (for now) because affine index on sparse tensor
-// are not handled efficiently by sparse compiler, the test case will be re-enabled
-// after new algorithm is implemented.
-
-// DEFINE: %{option} = enable-runtime-library=true
+// DEFINE: %{option} = "enable-runtime-library=true enable-index-reduction=true"
 // DEFINE: %{compile} = mlir-opt %s --sparse-compiler=%{option}
 // DEFINE: %{run} = mlir-cpu-runner \
 // DEFINE:  -e entry -entry-point-result=void \
@@ -13,16 +8,16 @@
 // RUN: %{compile} | %{run}
 //
 // Do the same run, but now with direct IR generation.
-// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true"
+// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true enable-index-reduction=true"
 // RUN: %{compile} | %{run}
 //
 // Do the same run, but now with direct IR generation and vectorization.
-// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true"
+// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true enable-index-reduction=true"
 // RUN: %{compile} | %{run}

 // Do the same run, but now with direct IR generation and, if available, VLA
 // vectorization.
-// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-arm-sve=%ENABLE_VLA"
+// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-arm-sve=%ENABLE_VLA enable-index-reduction=true"
 // REDEFINE: %{run} = %lli \
 // REDEFINE:   --entry-function=entry_lli \
 // REDEFINE:   --extra-module=%S/Inputs/main_for_lli.ll \
@@ -54,26 +49,26 @@
   return %ret : tensor<?x?x?x?xf32>
 }

-func.func @conv_2d_nhwc_hwcf_CCCC(%arg0: tensor<?x?x?x?xf32, #CCCC>, %arg1: tensor<?x?x?x?xf32, #CCCC>) -> tensor<?x?x?x?xf32, #CCCC> {
+func.func @conv_2d_nhwc_hwcf_CCCC(%arg0: tensor<?x?x?x?xf32, #CCCC>, %arg1: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32, #CCCC> {
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
   %c6 = arith.constant 6 : index
   %s = bufferization.alloc_tensor(%c3, %c6, %c6, %c1) : tensor<?x?x?x?xf32, #CCCC>
   %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>,
                                    strides = dense<1> : tensor<2xi64>}
-     ins (%arg0, %arg1: tensor<?x?x?x?xf32, #CCCC>, tensor<?x?x?x?xf32, #CCCC>)
+     ins (%arg0, %arg1: tensor<?x?x?x?xf32, #CCCC>, tensor<?x?x?x?xf32>)
     outs (%s: tensor<?x?x?x?xf32, #CCCC>) -> tensor<?x?x?x?xf32, #CCCC>
   return %ret : tensor<?x?x?x?xf32, #CCCC>
 }

-func.func @conv_2d_nhwc_hwcf_CDCD(%arg0: tensor<?x?x?x?xf32, #CDCD>, %arg1: tensor<?x?x?x?xf32, #CDCD>) -> tensor<?x?x?x?xf32, #CDCD> {
+func.func @conv_2d_nhwc_hwcf_CDCD(%arg0: tensor<?x?x?x?xf32, #CDCD>, %arg1: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32, #CDCD> {
   %c1 = arith.constant 1 : index
   %c3 = arith.constant 3 : index
   %c6 = arith.constant 6 : index
   %s = bufferization.alloc_tensor(%c3, %c6, %c6, %c1) : tensor<?x?x?x?xf32, #CDCD>
   %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>,
                                    strides = dense<1> : tensor<2xi64>}
-     ins (%arg0, %arg1: tensor<?x?x?x?xf32, #CDCD>, tensor<?x?x?x?xf32, #CDCD>)
+     ins (%arg0, %arg1: tensor<?x?x?x?xf32, #CDCD>, tensor<?x?x?x?xf32>)
     outs (%s: tensor<?x?x?x?xf32, #CDCD>) -> tensor<?x?x?x?xf32, #CDCD>
   return %ret : tensor<?x?x?x?xf32, #CDCD>
 }
@@ -95,17 +90,12 @@
   %in2D_nhwc_CCCC = sparse_tensor.convert %in2D_nhwc
     : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32, #CCCC>
-  %filter2D_nhwc_CCCC = sparse_tensor.convert %filter2D_nhwc
-    : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32, #CCCC>
-
   %in2D_nhwc_CDCD = sparse_tensor.convert %in2D_nhwc
     : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32, #CDCD>
-  %filter2D_nhwc_CDCD = sparse_tensor.convert %filter2D_nhwc
-    : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32, #CDCD>

   %dense_ret = call @conv_2d_nhwc_hwcf(%in2D_nhwc, %filter2D_nhwc, %out2D_nhwc) : (tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)
-  %CCCC_ret = call @conv_2d_nhwc_hwcf_CCCC(%in2D_nhwc_CCCC, %filter2D_nhwc_CCCC) : (tensor<?x?x?x?xf32, #CCCC>, tensor<?x?x?x?xf32, #CCCC>) -> (tensor<?x?x?x?xf32, #CCCC>)
-  %CDCD_ret = call @conv_2d_nhwc_hwcf_CDCD(%in2D_nhwc_CDCD, %filter2D_nhwc_CDCD) : (tensor<?x?x?x?xf32, #CDCD>, tensor<?x?x?x?xf32, #CDCD>) -> (tensor<?x?x?x?xf32, #CDCD>)
+  %CCCC_ret = call @conv_2d_nhwc_hwcf_CCCC(%in2D_nhwc_CCCC, %filter2D_nhwc) : (tensor<?x?x?x?xf32, #CCCC>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32, #CCCC>)
+  %CDCD_ret = call @conv_2d_nhwc_hwcf_CDCD(%in2D_nhwc_CDCD, %filter2D_nhwc) : (tensor<?x?x?x?xf32, #CDCD>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32, #CDCD>)

   // CHECK: ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ),
   // CHECK-SAME:    ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
@@ -183,9 +173,7 @@
   bufferization.dealloc_tensor %out2D_nhwc : tensor<?x?x?x?xf32>

   bufferization.dealloc_tensor %in2D_nhwc_CDCD : tensor<?x?x?x?xf32, #CDCD>
-  bufferization.dealloc_tensor %filter2D_nhwc_CDCD : tensor<?x?x?x?xf32, #CDCD>
   bufferization.dealloc_tensor %in2D_nhwc_CCCC : tensor<?x?x?x?xf32, #CCCC>
-  bufferization.dealloc_tensor %filter2D_nhwc_CCCC : tensor<?x?x?x?xf32, #CCCC>

   bufferization.dealloc_tensor %CCCC_ret : tensor<?x?x?x?xf32, #CCCC>
   bufferization.dealloc_tensor %CDCD_ret : tensor<?x?x?x?xf32, #CDCD>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_3d_ndhwc_dhwcf.mlir
@@ -1,9 +1,4 @@
-// UNSUPPORTED: target={{.*}}
-// FIXME: The test case is disabled (for now) because affine index on sparse tensor
-// are not handled efficiently by sparse compiler, the test case will be re-enabled
-// after new algorithm is implemented.
-
-// DEFINE: %{option} = enable-runtime-library=true
+// DEFINE: %{option} = "enable-runtime-library=true enable-index-reduction=true"
 // DEFINE: %{compile} = mlir-opt %s --sparse-compiler=%{option}
 // DEFINE: %{run} = mlir-cpu-runner \
 // DEFINE:  -e entry -entry-point-result=void \
@@ -13,16 +8,16 @@
 // RUN: %{compile} | %{run}
 //
 // Do the same run, but now with direct IR generation.
-// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true"
+// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true enable-index-reduction=true"
 // RUN: %{compile} | %{run}
 //
 // Do the same run, but now with direct IR generation and vectorization.
-// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true"
+// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true enable-index-reduction=true"
 // RUN: %{compile} | %{run}

 // Do the same run, but now with direct IR generation and, if available, VLA
 // vectorization.
-// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-arm-sve=%ENABLE_VLA"
+// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-arm-sve=%ENABLE_VLA enable-index-reduction=true"
 // REDEFINE: %{run} = %lli \
 // REDEFINE:   --entry-function=entry_lli \
 // REDEFINE:   --extra-module=%S/Inputs/main_for_lli.ll \
@@ -57,7 +52,7 @@
 }

 func.func @conv_3d_ndhwc_dhwcf_CCCCC(%arg0: tensor<?x?x?x?x?xf32, #CCCCC>,
-                                     %arg1: tensor<?x?x?x?x?xf32, #CCCCC>)
+                                     %arg1: tensor<?x?x?x?x?xf32>)
     -> tensor<?x?x?x?x?xf32, #CCCCC> {
   %c1 = arith.constant 1 : index
   %c6 = arith.constant 6 : index
@@ -65,13 +60,13 @@
     : tensor<?x?x?x?x?xf32, #CCCCC>
   %ret = linalg.conv_3d_ndhwc_dhwcf {dilations = dense<1> : tensor<3xi64>,
                                      strides = dense<1> : tensor<3xi64>}
-     ins (%arg0, %arg1: tensor<?x?x?x?x?xf32, #CCCCC>, tensor<?x?x?x?x?xf32, #CCCCC>)
+     ins (%arg0, %arg1: tensor<?x?x?x?x?xf32, #CCCCC>, tensor<?x?x?x?x?xf32>)
     outs (%s: tensor<?x?x?x?x?xf32, #CCCCC>) -> tensor<?x?x?x?x?xf32, #CCCCC>
   return %ret : tensor<?x?x?x?x?xf32, #CCCCC>
 }

 func.func @conv_3d_ndhwc_dhwcf_CDCDC(%arg0: tensor<?x?x?x?x?xf32, #CDCDC>,
-                                     %arg1: tensor<?x?x?x?x?xf32, #CDCDC>)
+                                     %arg1: tensor<?x?x?x?x?xf32>)
     -> tensor<?x?x?x?x?xf32, #CDCDC> {
   %c1 = arith.constant 1 : index
   %c6 = arith.constant 6 : index
@@ -79,7 +74,7 @@
     : tensor<?x?x?x?x?xf32, #CDCDC>
   %ret = linalg.conv_3d_ndhwc_dhwcf {dilations = dense<1> : tensor<3xi64>,
                                      strides = dense<1> : tensor<3xi64>}
-     ins (%arg0, %arg1: tensor<?x?x?x?x?xf32, #CDCDC>, tensor<?x?x?x?x?xf32, #CDCDC>)
+     ins (%arg0, %arg1: tensor<?x?x?x?x?xf32, #CDCDC>, tensor<?x?x?x?x?xf32>)
     outs (%s: tensor<?x?x?x?x?xf32, #CDCDC>) -> tensor<?x?x?x?x?xf32, #CDCDC>
   return %ret : tensor<?x?x?x?x?xf32, #CDCDC>
 }
@@ -102,13 +97,8 @@
   %in3D_ndhwc_CCCCC = sparse_tensor.convert %in3D_ndhwc
     : tensor<?x?x?x?x?xf32> to tensor<?x?x?x?x?xf32, #CCCCC>
-  %filter3D_ndhwc_CCCCC = sparse_tensor.convert %filter3D_ndhwc
-    : tensor<?x?x?x?x?xf32> to tensor<?x?x?x?x?xf32, #CCCCC>
-
   %in3D_ndhwc_CDCDC = sparse_tensor.convert %in3D_ndhwc
     : tensor<?x?x?x?x?xf32> to tensor<?x?x?x?x?xf32, #CDCDC>
-  %filter3D_ndhwc_CDCDC = sparse_tensor.convert %filter3D_ndhwc
-    : tensor<?x?x?x?x?xf32> to tensor<?x?x?x?x?xf32, #CDCDC>

   // CHECK:( ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ),
   // CHECK-SAME:   ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
@@ -152,9 +142,9 @@
     : tensor<?x?x?x?x?xf32>, vector<1x6x6x6x1xf32>
   vector.print %dense_v : vector<1x6x6x6x1xf32>

-  %CCCCC_ret = call @conv_3d_ndhwc_dhwcf_CCCCC(%in3D_ndhwc_CCCCC, %filter3D_ndhwc_CCCCC)
+  %CCCCC_ret = call @conv_3d_ndhwc_dhwcf_CCCCC(%in3D_ndhwc_CCCCC, %filter3D_ndhwc)
        : (tensor<?x?x?x?x?xf32, #CCCCC>,
-          tensor<?x?x?x?x?xf32, #CCCCC>) -> (tensor<?x?x?x?x?xf32, #CCCCC>)
+          tensor<?x?x?x?x?xf32>) -> (tensor<?x?x?x?x?xf32, #CCCCC>)

   // CHECK-NEXT:( ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ),
   // CHECK-SAME:   ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
@@ -198,9 +188,9 @@
     : tensor<?x?x?x?x?xf32>, vector<1x6x6x6x1xf32>
   vector.print %v1 : vector<1x6x6x6x1xf32>

-  %CDCDC_ret = call @conv_3d_ndhwc_dhwcf_CDCDC(%in3D_ndhwc_CDCDC, %filter3D_ndhwc_CDCDC)
+  %CDCDC_ret = call @conv_3d_ndhwc_dhwcf_CDCDC(%in3D_ndhwc_CDCDC, %filter3D_ndhwc)
        : (tensor<?x?x?x?x?xf32, #CDCDC>,
-          tensor<?x?x?x?x?xf32, #CDCDC>) -> (tensor<?x?x?x?x?xf32, #CDCDC>)
+          tensor<?x?x?x?x?xf32>) -> (tensor<?x?x?x?x?xf32, #CDCDC>)

   // CHECK-NEXT:( ( ( ( ( 108 ), ( 124 ), ( 124 ), ( 124 ), ( 108 ), ( 108 ) ),
   // CHECK-SAME:   ( ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ), ( 108 ) ),
@@ -250,9 +240,7 @@
   bufferization.dealloc_tensor %out3D_ndhwc : tensor<?x?x?x?x?xf32>

   bufferization.dealloc_tensor %in3D_ndhwc_CDCDC : tensor<?x?x?x?x?xf32, #CDCDC>
-  bufferization.dealloc_tensor %filter3D_ndhwc_CDCDC : tensor<?x?x?x?x?xf32, #CDCDC>
   bufferization.dealloc_tensor %in3D_ndhwc_CCCCC : tensor<?x?x?x?x?xf32, #CCCCC>
-  bufferization.dealloc_tensor %filter3D_ndhwc_CCCCC : tensor<?x?x?x?x?xf32, #CCCCC>

   bufferization.dealloc_tensor %CCCCC_ret : tensor<?x?x?x?x?xf32, #CCCCC>
   bufferization.dealloc_tensor %CDCDC_ret : tensor<?x?x?x?x?xf32, #CDCDC>