diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h
--- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.h
@@ -393,7 +393,7 @@
   }
   static bool isTrivalIdxCond(LoopCondKind k) { return !isAffineIdxCond(k); }

-  /// Whether the affine index expression is fully reduced.
+  /// Whether the affine index expression is not fully reduced.
   static bool isAffineIdxUnRedCond(LoopCondKind k) {
     return isAffineIdxCond(k) && static_cast<uint8_t>(k) & kAffineIdxCondUnRed;
   }
@@ -405,7 +405,7 @@
   // E.g., to iterate over sparse tensor slice, we need to check whether the
   // current cooridnate is on the slice (e.g., due to stride) or not.
   static bool isCondWithExtraCheck(LoopCondKind k) {
-    return isSparseCond(k) && isSliceCond(k);
+    return isSparseCond(k) && (isSliceCond(k) || isAffineIdxUnRedCond(k));
   }

   static LoopCondKind makeLoopCondKind(bool isSparse, bool isSlice,
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
@@ -17,6 +17,7 @@
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/SparseTensor/IR/SparseTensorType.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"

 using namespace mlir;
 using namespace mlir::sparse_tensor;
@@ -35,6 +36,8 @@
 #define ANDI(lhs, rhs) (builder.create<arith::AndIOp>(loc, (lhs), (rhs)))
 #define SUBI(lhs, rhs) (builder.create<arith::SubIOp>(loc, (lhs), (rhs)))
 #define MULI(lhs, rhs) (builder.create<arith::MulIOp>(loc, (lhs), (rhs)))
+#define REMUI(lhs, rhs) (builder.create<arith::RemUIOp>(loc, (lhs), (rhs)))
+#define DIVUI(lhs, rhs) (builder.create<arith::DivUIOp>(loc, (lhs), (rhs)))
 #define SELECT(c, l, r) (builder.create<arith::SelectOp>(loc, (c), (l), (r)))

 //===----------------------------------------------------------------------===//
@@ -117,8 +120,8 @@
                                                Level lvl) {
   // sliceCrd = (tensorCrd - offset) / stride
   crd = SUBI(crd, offset);
-  Value rem = builder.create<arith::RemUIOp>(loc, crd, stride);
-  crd = builder.create<arith::DivUIOp>(loc, crd, stride);
+  Value rem = REMUI(crd, stride);
+  crd = DIVUI(crd, stride);
   return std::make_pair(crd, rem);
 }

@@ -725,6 +728,7 @@
   }
   case LoopCondKind::SparseAffineCond: {
     assert(ivs.size() == 1);
+
     Value crdHi; // loop upper bound
     {
       OpBuilder::InsertionGuard guard(builder);
@@ -732,9 +736,9 @@
       // crdHi is a loop invariant, hosit the computation outside the loop.
       if (llvm::isa_and_nonnull<scf::ForOp>(loop))
        builder.setInsertionPoint(loop);
-      auto [size, stride] = sliceMeta[tid][lvl].back();
+      auto [remSz, stride] = sliceMeta[tid][lvl].back();
       assert(stride == 1 && "Not yet implemented");
-      crdHi = ADDI(getMostRecentSliceOnLvl(tid, lvl).offset, size);
+      crdHi = ADDI(getMostRecentSliceOnLvl(tid, lvl).offset, remSz);
     }
     assert(crdHi);
     return genSparseReducedAffineCond(builder, loc,
@@ -792,18 +796,33 @@
     return std::nullopt;
   }
   case LoopCondKind::SparseAffineUnRedCond: {
+    unsigned depth = sliceStack[tid].back().depth;
+    unsigned curStride = sliceMeta[tid][lvl][depth - 1].second;
     assert(ivs.size() == 3);

-    // Coord is the relative offset related to its parents.
-    // Update c = absOffset[lvl][depth] - absOffset[lvl][depth - 1]
-    assert(sliceStack[tid].back().depth == 1 && "TODO: not yet implement");
+    // Updates the current slice info.
     SliceInfo &sliceInfo = sliceStack[tid].back();
     sliceInfo.isNonEmpty = ivs[0];
     sliceInfo.minCrd = ivs[1];
     sliceInfo.offset = ivs[2];
-    coords[tid][lvl] = sliceInfo.offset;
+
+    // Crd (the value we use to co-iterate) is the offset relative to its
+    // parents; we can use the absolute offset here because when depth == 1,
+    // absOffset[lvl][depth - 1] always equals zero.
+    // TODO: update crd = absOffset[lvl][depth] - absOffset[lvl][depth - 1].
+    assert(depth == 1 && "TODO: not yet implemented");
+    Value crd = sliceInfo.offset;
+
+    Value onStride = constantI1(builder, loc, true);
+    if (curStride != 1) {
+      Value strideVal = C_IDX(curStride);
+      Value rem = REMUI(crd, strideVal);
+      crd = DIVUI(crd, strideVal);
+      onStride = CMPI(eq, rem, C_IDX(0));
+    }
+    coords[tid][lvl] = crd;
     // No extra check is needed before accessing the tensor level.
-    return std::nullopt;
+    return onStride;
   }
   default:
     llvm_unreachable("Unhandled LoopCondKind");
@@ -814,11 +833,44 @@
 ValueRange LoopEmitter::genCheckedValue(OpBuilder &builder, Location loc,
                                         Value pred, ValueRange curArgs,
                                         TensorLvlCond cond) {
-  // Currently only sparse slice condition need extra check.
-  assert(isSliceCond(cond.second) && isSparseCond(cond.second));
-  assert(curArgs.size() == 1);
-  Value nextPos = ADDI(curArgs.front(), C_IDX(1));
-  return SELECT(pred, curArgs.front(), nextPos)->getResults();
+  assert(isSparseCond(cond.second));
+  auto [tid, lvl] = unpackTensorLevel(cond.first);
+  if (isAffineIdxUnRedCond(cond.second)) {
+    unsigned depth = sliceStack[tid].back().depth;
+    unsigned curStride = sliceMeta[tid][lvl][depth - 1].second;
+    if (curStride == 1)
+      return curArgs;
+    // Build
+    // if (onStride) {
+    //   yield curSlice
+    // } else {
+    //   yield nxSlice
+    // }
+    assert(curArgs.size() == 3);
+    auto ifOp = builder.create<scf::IfOp>(loc, curArgs.getTypes(), pred, true);
+    {
+      OpBuilder::InsertionGuard guard(builder);
+      // The coordinate is on stride, yield the current slice unchanged.
+      builder.setInsertionPointToStart(&ifOp.getThenRegion().front());
+      YIELD(curArgs);
+
+      // Not on stride, yield the induction values of the next slice instead.
+      builder.setInsertionPointToStart(&ifOp.getElseRegion().front());
+      auto [nonEmpty, minCrd, offset] =
+          genSliceNextInduction(builder, loc, tid, lvl);
+      SmallVector<Value> nxSlice{nonEmpty, minCrd, offset};
+      YIELD(nxSlice);
+    }
+    // Either way, the results of the scf.if are the checked values.
+    return ifOp.getResults();
+  } else {
+    // Currently only the sparse slice condition needs an extra check.
+    assert(isSliceCond(cond.second) && isSparseCond(cond.second));
+    assert(curArgs.size() == 1);
+    Value nextPos = ADDI(curArgs.front(), C_IDX(1));
+    return SELECT(pred, curArgs.front(), nextPos)->getResults();
+  }
+  llvm_unreachable("unhandled case");
 }

 std::pair<Operation *, Value> LoopEmitter::emitWhileLoopOverTensorsAtLvls(
@@ -1878,7 +1930,7 @@
   Value c0 = C_IDX(0), c1 = C_IDX(1), c2 = C_IDX(2);
   unsigned depth = levelReducedDep[tid][lvl];
   // TODO: handle case when the current slice stride is not one.
-  assert(sliceMeta[tid][lvl][depth].second == 1 && "Not yet implemented");
+  // assert(sliceMeta[tid][lvl][depth].second == 1 && "Not yet implemented");

   // The remaining slice size after reduction.
   Value remSz = sliceMeta[tid][lvl][depth + 1].first;
@@ -2251,8 +2303,8 @@
   // FIXME: compute relative offset.
   assert(info.depth - 1 == 0);
-  Value nextRelOffset = nextAbsOffset;
-  nextRelOffset = SELECT(nextNonEmpty, nextRelOffset, c0);
+  // Value nextRelOffset = nextAbsOffset;
+  // nextRelOffset = SELECT(nextNonEmpty, nextRelOffset, c0);

   return std::make_tuple(nextNonEmpty, nextMinCrd, nextAbsOffset);
 }

diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_strided_conv_2d_nhwc_hwcf.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_strided_conv_2d_nhwc_hwcf.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_strided_conv_2d_nhwc_hwcf.mlir
@@ -0,0 +1,102 @@
+//--------------------------------------------------------------------------------------------------
+// WHEN CREATING A NEW TEST, PLEASE JUST COPY & PASTE WITHOUT EDITS.
+//
+// Set-up that's shared across all tests in this directory. In principle, this
+// config could be moved to lit.local.cfg. However, there are downstream users that
+// do not use these LIT config files. Hence why this is kept inline.
+//
+// DEFINE: %{sparse_compiler_opts} = enable-runtime-library=true
+// DEFINE: %{sparse_compiler_opts_sve} = enable-arm-sve=true %{sparse_compiler_opts}
+// DEFINE: %{compile} = mlir-opt %s --sparse-compiler="%{sparse_compiler_opts}"
+// DEFINE: %{compile_sve} = mlir-opt %s --sparse-compiler="%{sparse_compiler_opts_sve}"
+// DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
+// DEFINE: %{run_opts} = -e entry -entry-point-result=void
+// DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
+// DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
+//
+// DEFINE: %{env} =
+//--------------------------------------------------------------------------------------------------
+
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation.
+// REDEFINE: %{sparse_compiler_opts} = enable-runtime-library=false enable-buffer-initialization=true enable-index-reduction=true
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation and vectorization.
+// REDEFINE: %{sparse_compiler_opts} = enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true enable-index-reduction=true
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation and VLA vectorization.
+// RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | %{run_sve} | FileCheck %s %}
+
+#CCCC = #sparse_tensor.encoding<{
+  lvlTypes = [ "compressed", "compressed", "compressed", "compressed" ]
+}>
+
+// Creates and returns a 4-D buffer of size (%s1, %s2, %s3, %s4) filled with the value %f.
+func.func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> tensor<?x?x?x?xf32> {
+  %buf = bufferization.alloc_tensor(%s1, %s2, %s3, %s4) : tensor<?x?x?x?xf32>
+  %ret = linalg.fill ins(%f : f32) outs(%buf : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+  return %ret : tensor<?x?x?x?xf32>
+}
+
+func.func @conv_2d_nhwc_hwcf(%arg0: tensor<?x?x?x?xf32>, %arg1: tensor<?x?x?x?xf32>, %arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
+  %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>,
+                                   strides = dense<2> : tensor<2xi64>}
+     ins (%arg0, %arg1: tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>)
+    outs (%arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+  return %ret : tensor<?x?x?x?xf32>
+}
+
+func.func @conv_2d_nhwc_hwcf_CCCC(%arg0: tensor<?x?x?x?xf32, #CCCC>, %arg1: tensor<?x?x?x?xf32>, %arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
+  %ret = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>,
+                                   strides = dense<2> : tensor<2xi64>}
+     ins (%arg0, %arg1: tensor<?x?x?x?xf32, #CCCC>, tensor<?x?x?x?xf32>)
+    outs (%arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
+  return %ret : tensor<?x?x?x?xf32>
+}
+
+func.func @entry() {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %c3 = arith.constant 3 : index
+  %c6 = arith.constant 6 : index
+  %c9 = arith.constant 9 : index
+  %f10 = arith.constant 10.00000e+00 : f32
+  %val = arith.constant 2.00000e+00 : f32
+  %zero = arith.constant 0.00000e+00 : f32
+
+  %filter2D_nhwc = call @alloc_4d_filled_f32(%c3, %c3, %c3, %c1, %val) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
+  %in2D_tmp = call @alloc_4d_filled_f32(%c3, %c9, %c9, %c3, %zero) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
+  %in2D_nhwc = tensor.insert %f10 into %in2D_tmp[%c0, %c3, %c3, %c0] : tensor<?x?x?x?xf32>
+  %out2D_nhwc = call @alloc_4d_filled_f32(%c3, %c3, %c3, %c1, %zero) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
+
+  %in2D_nhwc_CCCC = sparse_tensor.convert %in2D_nhwc
+    : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32, #CCCC>
+
+  %dense_ret = call @conv_2d_nhwc_hwcf(%in2D_nhwc, %filter2D_nhwc, %out2D_nhwc) : (tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)
+  %CCCC_ret = call @conv_2d_nhwc_hwcf_CCCC(%in2D_nhwc_CCCC, %filter2D_nhwc, %out2D_nhwc) : (tensor<?x?x?x?xf32, #CCCC>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)
+
+  // CHECK:      ( ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 20 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ),
+  // CHECK-SAME:   ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ),
+  // CHECK-SAME:   ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ) )
+  %dense_v = vector.transfer_read %dense_ret[%c0, %c0, %c0, %c0], %zero
+      : tensor<?x?x?x?xf32>, vector<3x3x3x1xf32>
+  vector.print %dense_v : vector<3x3x3x1xf32>
+
+  // CHECK:      ( ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 20 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ),
+  // CHECK-SAME:   ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ),
+  // CHECK-SAME:   ( ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ), ( ( 0 ), ( 0 ), ( 0 ) ) ) )
+  %v1 = vector.transfer_read %CCCC_ret[%c0, %c0, %c0, %c0], %zero
+      : tensor<?x?x?x?xf32>, vector<3x3x3x1xf32>
+  vector.print %v1 : vector<3x3x3x1xf32>
+
+  // Free the resources.
+  bufferization.dealloc_tensor %in2D_nhwc : tensor<?x?x?x?xf32>
+  bufferization.dealloc_tensor %filter2D_nhwc : tensor<?x?x?x?xf32>
+  bufferization.dealloc_tensor %out2D_nhwc : tensor<?x?x?x?xf32>
+
+  bufferization.dealloc_tensor %in2D_nhwc_CCCC : tensor<?x?x?x?xf32, #CCCC>
+  return
+}
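
For reference only (not part of the patch): a minimal standalone C++ sketch of the slice-coordinate arithmetic the patch generates, i.e. the SUBI/REMUI/DIVUI sequence in toSliceCrd and the onStride predicate added for the SparseAffineUnRedCond case. The helper name and the example offset/stride values below are illustrative, not taken from the emitter's API.

// Illustrative sketch: plain C++ mirroring the generated arithmetic
//   sliceCrd = (tensorCrd - offset) / stride
//   onStride = ((tensorCrd - offset) % stride == 0)
#include <cstdint>
#include <cstdio>
#include <utility>

// Returns {sliceCrd, onStride} for one coordinate of a strided slice.
static std::pair<uint64_t, bool> toSliceCrd(uint64_t tensorCrd, uint64_t offset,
                                            uint64_t stride) {
  uint64_t shifted = tensorCrd - offset; // SUBI
  uint64_t rem = shifted % stride;       // REMUI
  uint64_t crd = shifted / stride;       // DIVUI
  return {crd, rem == 0};                // CMPI eq, rem, 0
}

int main() {
  // A slice with offset 1 and stride 2 (e.g., from a stride-2 convolution):
  // tensor coordinates 1, 3, 5 are on the slice; 2 and 4 are skipped.
  for (uint64_t crd = 1; crd <= 5; ++crd) {
    auto [sliceCrd, onStride] = toSliceCrd(crd, /*offset=*/1, /*stride=*/2);
    std::printf("tensorCrd=%llu -> sliceCrd=%llu onStride=%s\n",
                static_cast<unsigned long long>(crd),
                static_cast<unsigned long long>(sliceCrd),
                onStride ? "true" : "false");
  }
  return 0;
}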