diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp @@ -1163,6 +1163,14 @@ MutableArrayRef reduc) { const LoopInfo &loopInfo = loopStack.back(); rewriter.setInsertionPointToEnd(loopInfo.userCodeBlock); + if (!loopInfo.userCodeBlock->empty() && + llvm::isa(loopInfo.loop) && + llvm::isa(&loopInfo.userCodeBlock->back())) { + // Annoyingly, scf::For inserts an implicit yield op when there are no loop + // iter args. In this case, we need to insert the code before the yield. + assert(reduc.empty()); + rewriter.setInsertionPoint(&loopInfo.userCodeBlock->back()); + } for (auto [tid, lvl, reduced] : loopInfo.sliceDrivenInfo) { SliceInfo &info = sliceStack[tid].back(); assert(isDenseDLT(lvlTypes[tid][lvl])); diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir --- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir @@ -30,8 +30,8 @@ // TODO: we can only support dense output for nchw input because 'c' is a reduction loop -#CCCD = #sparse_tensor.encoding<{ - lvlTypes = [ "dense", "dense", "dense", "compressed" ] +#CDCD = #sparse_tensor.encoding<{ + lvlTypes = [ "compressed", "dense", "compressed", "dense" ] }> @@ -39,8 +39,6 @@ lvlTypes = [ "compressed", "compressed", "compressed", "compressed" ] }> -// FIXME: CDCD encoding crashes! 
- // Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f func.func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> tensor { %buf = bufferization.alloc_tensor(%s1, %s2, %s3, %s4) : tensor @@ -56,10 +54,10 @@ return %ret : tensor } -func.func @conv_2d_nchw_fchw_CCCD(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { +func.func @conv_2d_nchw_fchw_CDCD(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { %ret = linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} - ins (%arg0, %arg1: tensor, tensor) + ins (%arg0, %arg1: tensor, tensor) outs (%arg2: tensor) -> tensor return %ret : tensor } @@ -90,12 +88,12 @@ %out2D_nhwc_CCCC = call @alloc_4d_filled_f32(%c3, %c1, %c6, %c6, %zero) : (index, index, index, index, f32) -> (tensor) %in2D_nhwc_CCCD = sparse_tensor.convert %in2D_nhwc - : tensor to tensor + : tensor to tensor %in2D_nhwc_CCCC = sparse_tensor.convert %in2D_nhwc : tensor to tensor %dense_ret = call @conv_2d_nchw_fchw(%in2D_nhwc, %filter2D_nhwc, %out2D_nhwc) : (tensor, tensor, tensor) -> (tensor) - %CCCC_ret = call @conv_2d_nchw_fchw_CCCD(%in2D_nhwc_CCCD, %filter2D_nhwc, %out2D_nhwc_CCCD) : (tensor, tensor, tensor) -> (tensor) + %CCCC_ret = call @conv_2d_nchw_fchw_CDCD(%in2D_nhwc_CCCD, %filter2D_nhwc, %out2D_nhwc_CCCD) : (tensor, tensor, tensor) -> (tensor) %CDCD_ret = call @conv_2d_nchw_fchw_CCCC(%in2D_nhwc_CCCC, %filter2D_nhwc, %out2D_nhwc_CCCC) : (tensor, tensor, tensor) -> (tensor) @@ -173,6 +171,6 @@ bufferization.dealloc_tensor %out2D_nhwc_CCCC : tensor bufferization.dealloc_tensor %in2D_nhwc_CCCC : tensor - bufferization.dealloc_tensor %in2D_nhwc_CCCD : tensor + bufferization.dealloc_tensor %in2D_nhwc_CCCD : tensor return }