diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -515,6 +515,15 @@
   return env.topSortSize() == numLoops;
 }
 
+static void addIterOrdering(LoopId f, LoopId t,
+                            std::vector<std::vector<bool>> &adjM,
+                            std::vector<unsigned> &inDegree) {
+  if (!adjM[f][t] && f != t) {
+    adjM[f][t] = true;
+    inDegree[t]++;
+  }
+}
+
 /// Helper method to add all constraints from the indices in one affine
 /// expression before all indices in the other affine expression. For
 /// example i0+i1 < i2+i3+1 yields i0<i2, i0<i3, i1<i2, and i1<i3.
@@ -679,3 +688,15 @@
+      if (auto fdim = fa.dyn_cast<AffineDimExpr>()) {
+        AffineDimCollector tCollector;
+        tCollector.walkPostOrder(ta);
+
+        const LoopId f = env.makeLoopId(fdim.getPosition());
+        for (auto td : tCollector.dims) {
+          const LoopId t = env.makeLoopId(td.getPosition());
+          addIterOrdering(f, t, adjM, inDegree);
+        }
+        continue;
+      }
+
       // This is a heuristic, we pick an arbitrary reduction loop from lhs and
       // rhs and use them as d_x and d_y.
       finder.walkPostOrder(fa);
@@ -704,10 +722,7 @@
       const LoopId tldx = env.makeLoopId(texp.getPosition());
 
       // d_x > d_y
-      if (!adjM[fldx][tldx]) {
-        adjM[fldx][tldx] = true;
-        inDegree[tldx]++;
-      }
+      addIterOrdering(fldx, tldx, adjM, inDegree);
 
       AffineDimCollector fCollector;
       fCollector.walkPostOrder(fa);
@@ -717,21 +732,11 @@
       // make sure d_x and d_y are the last;
       for (auto fd : fCollector.dims) {
         const LoopId f = env.makeLoopId(fd.getPosition());
-        if (f == fldx)
-          continue;
-        if (!adjM[f][fldx]) {
-          adjM[f][fldx] = true;
-          inDegree[fldx]++;
-        }
+        addIterOrdering(f, fldx, adjM, inDegree);
       }
       for (auto td : tCollector.dims) {
         const LoopId t = env.makeLoopId(td.getPosition());
-        if (t == tldx)
-          continue;
-        if (!adjM[t][tldx]) {
-          adjM[t][tldx] = true;
-          inDegree[tldx]++;
-        }
+        addIterOrdering(t, tldx, adjM, inDegree);
       }
       // Since we only support affine addition, the order between two dim
       // expressions does not really matter.
@@ -746,15 +751,11 @@
         const LoopId f = env.makeLoopId(fd.getPosition());
         if (f == fldx) // skip d_x
           continue;
-
         for (auto td : tCollector.dims) {
           const LoopId t = env.makeLoopId(td.getPosition());
           if (t == tldx) // skip d_y
             continue;
-          if (!adjM[f][t]) {
-            adjM[f][t] = true;
-            inDegree[t]++;
-          }
+          addIterOrdering(f, t, adjM, inDegree);
         }
       }
     }
@@ -796,7 +797,7 @@
     if (isCompressedDLT(dltI) || isCompressedWithHiDLT(dltI) ||
         isSingletonDLT(dltI)) {
       for (LoopId j = 0; j < numLoops; j++)
-        if (isUndefDLT(env.dlt(tid, j))) {
+        if (isUndefDLT(env.dlt(tid, j)) && !adjM[i][j]) {
           adjM[i][j] = true;
           inDegree[j]++;
         }
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir
new file
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conv_2d_nchw_fchw.mlir
@@ -0,0 +1,178 @@
+// DEFINE: %{option} = "enable-runtime-library=true enable-index-reduction=true"
+// DEFINE: %{compile} = mlir-opt %s --sparse-compiler=%{option}
+// DEFINE: %{run} = mlir-cpu-runner \
+// DEFINE:  -e entry -entry-point-result=void \
+// DEFINE:  -shared-libs=%mlir_c_runner_utils | \
+// DEFINE: FileCheck %s
+//
+// RUN: %{compile} | %{run}
+//
+// Do the same run, but now with direct IR generation.
+// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true enable-index-reduction=true"
+// RUN: %{compile} | %{run}
+//
+// Do the same run, but now with direct IR generation and vectorization.
+// REDEFINE: %{option} = "enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true enable-index-reduction=true" +// RUN: %{compile} | %{run} + +// Do the same run, but now with direct IR generation and, if available, VLA +// vectorization. +// REDEFINE: %{option} = "enable-runtime-library=false vl=4 enable-arm-sve=%ENABLE_VLA enable-index-reduction=true" +// REDEFINE: %{run} = %lli_host_or_aarch64_cmd \ +// REDEFINE: --entry-function=entry_lli \ +// REDEFINE: --extra-module=%S/Inputs/main_for_lli.ll \ +// REDEFINE: %VLA_ARCH_ATTR_OPTIONS \ +// REDEFINE: --dlopen=%mlir_native_utils_lib_dir/libmlir_c_runner_utils%shlibext | \ +// REDEFINE: FileCheck %s +// RUN: %{compile} | mlir-translate -mlir-to-llvmir | %{run} + + +// TODO: we can only support dense output for nchw input because 'c' is a reduction loop + + +#CCCD = #sparse_tensor.encoding<{ + lvlTypes = [ "dense", "dense", "dense", "compressed" ] +}> + + +#CCCC = #sparse_tensor.encoding<{ + lvlTypes = [ "compressed", "compressed", "compressed", "compressed" ] +}> + +// FIXME: CDCD encoding crashes! + +// Creates and returns 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f +func.func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> tensor { + %buf = bufferization.alloc_tensor(%s1, %s2, %s3, %s4) : tensor + %ret = linalg.fill ins(%f : f32) outs(%buf : tensor) -> tensor + return %ret : tensor +} + +func.func @conv_2d_nchw_fchw(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + %ret = linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>, + strides = dense<1> : tensor<2xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%arg2: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_2d_nchw_fchw_CCCD(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + %ret = linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>, + strides = dense<1> : tensor<2xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%arg2: tensor) -> tensor + return %ret : tensor +} + +func.func @conv_2d_nchw_fchw_CCCC(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + %ret = linalg.conv_2d_nchw_fchw {dilations = dense<1> : tensor<2xi64>, + strides = dense<1> : tensor<2xi64>} + ins (%arg0, %arg1: tensor, tensor) + outs (%arg2: tensor) -> tensor + return %ret : tensor +} + +func.func @entry() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c3 = arith.constant 3 : index + %c6 = arith.constant 6 : index + %c8 = arith.constant 8 : index + %f10 = arith.constant 10.00000e+00 : f32 + %val = arith.constant 2.00000e+00 : f32 + %zero = arith.constant 0.00000e+00 : f32 + + %filter2D_nhwc = call @alloc_4d_filled_f32(%c1, %c3, %c3, %c3, %val) :(index, index, index, index, f32) -> (tensor) + %in2D_tmp = call @alloc_4d_filled_f32(%c3, %c3, %c8, %c8, %val) : (index, index, index, index, f32) -> (tensor) + %in2D_nhwc = tensor.insert %f10 into %in2D_tmp[%c0, %c0, %c0, %c3] : tensor + %out2D_nhwc = call @alloc_4d_filled_f32(%c3, %c1, %c6, %c6, %zero) : (index, index, index, index, f32) -> (tensor) + %out2D_nhwc_CCCD = call @alloc_4d_filled_f32(%c3, %c1, %c6, %c6, %zero) : (index, index, index, index, f32) -> (tensor) + %out2D_nhwc_CCCC = call @alloc_4d_filled_f32(%c3, %c1, %c6, %c6, %zero) : (index, index, index, index, f32) -> (tensor) + + %in2D_nhwc_CCCD = sparse_tensor.convert %in2D_nhwc + : tensor to tensor + %in2D_nhwc_CCCC = sparse_tensor.convert %in2D_nhwc + : tensor to tensor + + %dense_ret = call 
+  %dense_ret = call @conv_2d_nchw_fchw(%in2D_nhwc, %filter2D_nhwc, %out2D_nhwc) : (tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)
+  %CCCD_ret = call @conv_2d_nchw_fchw_CCCD(%in2D_nhwc_CCCD, %filter2D_nhwc, %out2D_nhwc_CCCD) : (tensor<?x?x?x?xf32, #CCCD>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)
+  %CCCC_ret = call @conv_2d_nchw_fchw_CCCC(%in2D_nhwc_CCCC, %filter2D_nhwc, %out2D_nhwc_CCCC) : (tensor<?x?x?x?xf32, #CCCC>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)
+
+  // CHECK:      ( ( ( ( 108, 124, 124, 124, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ) ) ),
+  // CHECK-SAME:   ( ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ) ) ),
+  // CHECK-SAME:   ( ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ) ) ) )
+  %dense_v = vector.transfer_read %dense_ret[%c0, %c0, %c0, %c0], %zero
+    : tensor<?x?x?x?xf32>, vector<3x1x6x6xf32>
+  vector.print %dense_v : vector<3x1x6x6xf32>
+
+  // CHECK:      ( ( ( ( 108, 124, 124, 124, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ) ) ),
+  // CHECK-SAME:   ( ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ) ) ),
+  // CHECK-SAME:   ( ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ) ) ) )
+  %v1 = vector.transfer_read %CCCD_ret[%c0, %c0, %c0, %c0], %zero
+    : tensor<?x?x?x?xf32>, vector<3x1x6x6xf32>
+  vector.print %v1 : vector<3x1x6x6xf32>
+
+  // CHECK:      ( ( ( ( 108, 124, 124, 124, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ) ) ),
+  // CHECK-SAME:   ( ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ) ) ),
+  // CHECK-SAME:   ( ( ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ),
+  // CHECK-SAME:       ( 108, 108, 108, 108, 108, 108 ) ) ) )
+  %v2 = vector.transfer_read %CCCC_ret[%c0, %c0, %c0, %c0], %zero
tensor, vector<3x1x6x6xf32> + vector.print %v2 : vector<3x1x6x6xf32> + + // Free the resources + bufferization.dealloc_tensor %in2D_nhwc : tensor + bufferization.dealloc_tensor %filter2D_nhwc : tensor + bufferization.dealloc_tensor %out2D_nhwc : tensor + bufferization.dealloc_tensor %out2D_nhwc_CCCD : tensor + bufferization.dealloc_tensor %out2D_nhwc_CCCC : tensor + + bufferization.dealloc_tensor %in2D_nhwc_CCCC : tensor + bufferization.dealloc_tensor %in2D_nhwc_CCCD : tensor + return +}