diff --git a/mlir/lib/Dialect/Vector/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/VectorTransforms.cpp
--- a/mlir/lib/Dialect/Vector/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/VectorTransforms.cpp
@@ -2379,6 +2379,7 @@
       xferOp.indices().take_front(xferOp.getLeadingShapedRank());
   SmallVector<OpFoldResult, 4> sizes;
   sizes.append(leadingIndices.begin(), leadingIndices.end());
+  auto isaWrite = isa<vector::TransferWriteOp>(xferOp);
   xferOp.zipResultAndIndexing([&](int64_t resultIdx, int64_t indicesIdx) {
     using MapList = ArrayRef<ArrayRef<AffineExpr>>;
     Value dimMemRef = memref_dim(xferOp.source(), indicesIdx);
@@ -2397,7 +2398,7 @@
   SmallVector<OpFoldResult, 4> indices = llvm::to_vector<4>(llvm::map_range(
       xferOp.indices(), [](Value idx) -> OpFoldResult { return idx; }));
   return memref_sub_view(
-      xferOp.source(), indices, sizes,
+      isaWrite ? alloc : xferOp.source(), indices, sizes,
       SmallVector<OpFoldResult>(memrefRank, OpBuilder(xferOp).getIndexAttr(1)));
 }
@@ -2509,14 +2510,119 @@
   return fullPartialIfOp;
 }
 
+/// Given an `xferOp` for which:
+///   1. `inBoundsCond` and a `compatibleMemRefType` have been computed.
+///   2. a memref of single vector `alloc` has been allocated.
+/// Produce IR resembling:
+/// ```
+///    %1:3 = scf.if (%inBounds) {
+///      memref.cast %A: memref<A...> to compatibleMemRefType
+///      scf.yield %view, ... : compatibleMemRefType, index, index
+///    } else {
+///      %3 = vector.type_cast %extra_alloc :
+///        memref<...> to memref<vector<...>>
+///      %4 = memref.cast %alloc: memref<B...> to compatibleMemRefType
+///      scf.yield %4, ... : compatibleMemRefType, index, index
+///   }
+/// ```
+static ValueRange getLocationToWriteFullVec(vector::TransferWriteOp xferOp,
+                                            TypeRange returnTypes,
+                                            Value inBoundsCond,
+                                            MemRefType compatibleMemRefType,
+                                            Value alloc) {
+  using namespace edsc;
+  using namespace edsc::intrinsics;
+  Value zero = std_constant_index(0);
+  Value memref = xferOp.source();
+  return conditionBuilder(
+      returnTypes, inBoundsCond,
+      [&]() -> scf::ValueVector {
+        Value res = memref;
+        if (compatibleMemRefType != xferOp.getShapedType())
+          res = memref_cast(memref, compatibleMemRefType);
+        scf::ValueVector viewAndIndices{res};
+        viewAndIndices.insert(viewAndIndices.end(), xferOp.indices().begin(),
+                              xferOp.indices().end());
+        return viewAndIndices;
+      },
+      [&]() -> scf::ValueVector {
+        Value casted = memref_cast(alloc, compatibleMemRefType);
+        scf::ValueVector viewAndIndices{casted};
+        viewAndIndices.insert(viewAndIndices.end(), xferOp.getTransferRank(),
+                              zero);
+        return viewAndIndices;
+      });
+}
+
+/// Given an `xferOp` for which:
+///   1. `inBoundsCond` has been computed.
+///   2. a memref of single vector `alloc` has been allocated.
+///   3. it originally wrote to %view
+/// Produce IR resembling:
+/// ```
+///    %notInBounds = xor %inBounds, %true
+///    scf.if (%notInBounds) {
+///      %3 = subview %alloc [...][...][...]
+///      linalg.copy(%3, %view)
+///    }
+/// ```
+static void createScopedFullPartialLinalgCopy(vector::TransferWriteOp xferOp,
+                                              Value inBoundsCond, Value alloc) {
+  using namespace edsc;
+  using namespace edsc::intrinsics;
+  auto &b = ScopedContext::getBuilderRef();
+  auto notInBounds = b.create<XOrOp>(
+      xferOp->getLoc(), inBoundsCond,
+      b.create<::mlir::ConstantIntOp>(xferOp.getLoc(), true, 1));
+
+  conditionBuilder(notInBounds, [&]() {
+    Value memRefSubView = createScopedSubViewIntersection(
+        cast<VectorTransferOpInterface>(xferOp.getOperation()), alloc);
+    linalg_copy(memRefSubView, xferOp.source());
+  });
+}
+
+/// Given an `xferOp` for which:
+///   1. `inBoundsCond` has been computed.
+///   2. a memref of single vector `alloc` has been allocated.
+///   3. it originally wrote to %view
+/// Produce IR resembling:
+/// ```
+///    %notInBounds = xor %inBounds, %true
+///    scf.if (%notInBounds) {
+///      %2 = load %alloc : memref<vector<...>>
+///      vector.transfer_write %2, %view[...] : memref<A...>, vector<...>
+///    }
+/// ```
+static void
+createScopedFullPartialVectorTransferWrite(vector::TransferWriteOp xferOp,
+                                           Value inBoundsCond, Value alloc) {
+  using namespace edsc;
+  using namespace edsc::intrinsics;
+  auto &b = ScopedContext::getBuilderRef();
+  auto notInBounds = b.create<XOrOp>(
+      xferOp->getLoc(), inBoundsCond,
+      b.create<::mlir::ConstantIntOp>(xferOp.getLoc(), true, 1));
+  conditionBuilder(notInBounds, [&]() {
+    BlockAndValueMapping mapping;
+
+    Value load = memref_load(vector_type_cast(
+        MemRefType::get({}, xferOp.vector().getType()), alloc));
+
+    mapping.map(xferOp.vector(), load);
+    b.clone(*xferOp.getOperation(), mapping);
+  });
+}
+
 /// Split a vector.transfer operation into an in-bounds (i.e., no out-of-bounds
 /// masking) fastpath and a slowpath.
+///
+/// For vector.transfer_read:
 /// If `ifOp` is not null and the result is `success, the `ifOp` points to the
 /// newly created conditional upon function return.
 /// To accomodate for the fact that the original vector.transfer indexing may be
 /// arbitrary and the slow path indexes @[0...0] in the temporary buffer, the
 /// scf.if op returns a view and values of type index.
-/// At this time, only vector.transfer_read case is implemented.
 ///
 /// Example (a 2-D vector.transfer_read):
 /// ```
@@ -2537,6 +2643,32 @@
 /// ```
 /// where `alloc` is a top of the function alloca'ed buffer of one vector.
 ///
+/// For vector.transfer_write:
+/// There are two conditional blocks: first, a block that decides which memref
+/// and indices to use for an unmasked, in-bounds write; then, a conditional
+/// block that further copies a partial buffer into the final result in the
+/// slow path case.
+///
+/// Example (a 2-D vector.transfer_write):
+/// ```
+///    vector.transfer_write %arg, %0[...], %pad : memref<A...>, vector<...>
+/// ```
+/// is transformed into:
+/// ```
+///    %1:3 = scf.if (%inBounds) {
+///      memref.cast %A: memref<A...> to compatibleMemRefType
+///      scf.yield %view : compatibleMemRefType, index, index
+///    } else {
+///      memref.cast %alloc: memref<B...> to compatibleMemRefType
+///      scf.yield %4 : compatibleMemRefType, index, index
+///    }
+///    %0 = vector.transfer_write %arg, %1#0[%1#1, %1#2] {in_bounds = [true ...
+///                                                                    true]}
+///    scf.if (%notInBounds) {
+///      // slowpath: not in-bounds vector.transfer or linalg.copy.
+///    }
+/// ```
+/// where `alloc` is a top of the function alloca'ed buffer of one vector.
+///
 /// Preconditions:
 ///  1. `xferOp.permutation_map()` must be a minor identity map
 ///  2. the rank of the `xferOp.source()` and the rank of the `xferOp.vector()`
@@ -2554,27 +2686,29 @@
   SmallVector<bool, 4> bools(xferOp.getTransferRank(), true);
   auto inBoundsAttr = b.getBoolArrayAttr(bools);
   if (options.vectorTransferSplit == VectorTransferSplit::ForceInBounds) {
-    xferOp->setAttr(vector::TransferReadOp::getInBoundsAttrName(),
-                    inBoundsAttr);
+    xferOp->setAttr(xferOp.getInBoundsAttrName(), inBoundsAttr);
     return success();
   }
 
-  assert(succeeded(splitFullAndPartialTransferPrecondition(xferOp)) &&
-         "Expected splitFullAndPartialTransferPrecondition to hold");
-  auto xferReadOp = dyn_cast<vector::TransferReadOp>(xferOp.getOperation());
+  // Assert preconditions. Additionally, keep the variables in an inner scope to
+  // ensure they aren't used in the wrong scopes further down.
+  {
+    assert(succeeded(splitFullAndPartialTransferPrecondition(xferOp)) &&
+           "Expected splitFullAndPartialTransferPrecondition to hold");
-  // TODO: add support for write case.
-  if (!xferReadOp)
-    return failure();
+    auto xferReadOp = dyn_cast<vector::TransferReadOp>(xferOp.getOperation());
+    auto xferWriteOp = dyn_cast<vector::TransferWriteOp>(xferOp.getOperation());
-  if (xferReadOp.mask())
-    return failure();
+    if (!(xferReadOp || xferWriteOp))
+      return failure();
+    if (xferWriteOp && xferWriteOp.mask())
+      return failure();
+    if (xferReadOp && xferReadOp.mask())
+      return failure();
+  }
 
   OpBuilder::InsertionGuard guard(b);
-  if (Operation *sourceOp = xferOp.source().getDefiningOp())
-    b.setInsertionPointAfter(sourceOp);
-  else
-    b.setInsertionPoint(xferOp);
+  b.setInsertionPoint(xferOp);
   ScopedContext scope(b, xferOp.getLoc());
   Value inBoundsCond = createScopedInBoundsCond(
       cast<VectorTransferOpInterface>(xferOp.getOperation()));
@@ -2596,26 +2730,57 @@
   MemRefType compatibleMemRefType =
       getCastCompatibleMemRefType(xferOp.getShapedType().cast<MemRefType>(),
                                   alloc.getType().cast<MemRefType>());
-
-  // Read case: full fill + partial copy -> in-bounds vector.xfer_read.
   SmallVector<Type, 4> returnTypes(1 + xferOp.getTransferRank(),
                                    b.getIndexType());
   returnTypes[0] = compatibleMemRefType;
-  scf::IfOp fullPartialIfOp =
-      options.vectorTransferSplit == VectorTransferSplit::VectorTransfer
-          ? createScopedFullPartialVectorTransferRead(
-                xferReadOp, returnTypes, inBoundsCond, compatibleMemRefType,
-                alloc)
-          : createScopedFullPartialLinalgCopy(xferReadOp, returnTypes,
-                                              inBoundsCond,
-                                              compatibleMemRefType, alloc);
-  if (ifOp)
-    *ifOp = fullPartialIfOp;
-
-  // Set existing read op to in-bounds, it always reads from a full buffer.
-  for (unsigned i = 0, e = returnTypes.size(); i != e; ++i)
-    xferReadOp.setOperand(i, fullPartialIfOp.getResult(i));
-  xferOp->setAttr(vector::TransferReadOp::getInBoundsAttrName(), inBoundsAttr);
+
+  if (auto xferReadOp =
+          dyn_cast<vector::TransferReadOp>(xferOp.getOperation())) {
+    // Read case: full fill + partial copy -> in-bounds vector.xfer_read.
+    scf::IfOp fullPartialIfOp =
+        options.vectorTransferSplit == VectorTransferSplit::VectorTransfer
+            ? createScopedFullPartialVectorTransferRead(
+                  xferReadOp, returnTypes, inBoundsCond, compatibleMemRefType,
+                  alloc)
+            : createScopedFullPartialLinalgCopy(xferReadOp, returnTypes,
+                                                inBoundsCond,
+                                                compatibleMemRefType, alloc);
+    if (ifOp)
+      *ifOp = fullPartialIfOp;
+
+    // Set existing read op to in-bounds, it always reads from a full buffer.
+    for (unsigned i = 0, e = returnTypes.size(); i != e; ++i)
+      xferReadOp.setOperand(i, fullPartialIfOp.getResult(i));
+
+    xferOp->setAttr(xferOp.getInBoundsAttrName(), inBoundsAttr);
+
+    return success();
+  }
+
+  auto xferWriteOp = cast<vector::TransferWriteOp>(xferOp.getOperation());
+
+  // Decide which location to write the entire vector to.
+  auto memrefAndIndices = getLocationToWriteFullVec(
+      xferWriteOp, returnTypes, inBoundsCond, compatibleMemRefType, alloc);
+
+  // Do an in-bounds write to either the output or the extra allocated buffer.
+  // The operation is cloned to prevent deleting information needed for the
+  // later IR creation.
+  BlockAndValueMapping mapping;
+  mapping.map(xferWriteOp.source(), memrefAndIndices.front());
+  mapping.map(xferWriteOp.indices(), memrefAndIndices.drop_front());
+  auto *clone = b.clone(*xferWriteOp, mapping);
+  clone->setAttr(xferWriteOp.getInBoundsAttrName(), inBoundsAttr);
+
+  // Create a potential copy from the allocated buffer to the final output in
+  // the slow path case.
+  if (options.vectorTransferSplit == VectorTransferSplit::VectorTransfer)
+    createScopedFullPartialVectorTransferWrite(xferWriteOp, inBoundsCond,
+                                               alloc);
+  else
+    createScopedFullPartialLinalgCopy(xferWriteOp, inBoundsCond, alloc);
+
+  xferOp->erase();
   return success();
 }
diff --git a/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir b/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir
--- a/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-full-partial-split.mlir
@@ -1,5 +1,5 @@
-// RUN: mlir-opt %s -test-vector-transfer-full-partial-split | FileCheck %s
-// RUN: mlir-opt %s -test-vector-transfer-full-partial-split=use-linalg-copy | FileCheck %s --check-prefix=LINALG
+// RUN: mlir-opt %s -test-vector-transfer-full-partial-split -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -test-vector-transfer-full-partial-split=use-linalg-copy -split-input-file | FileCheck %s --check-prefix=LINALG
 
 // CHECK-DAG: #[[$map_p4:.*]] = affine_map<()[s0] -> (s0 + 4)>
 // CHECK-DAG: #[[$map_p8:.*]] = affine_map<()[s0] -> (s0 + 8)>
@@ -186,3 +186,206 @@
 // CHECK: return %[[res]] : vector<4x8xf32>
   return %1 : vector<4x8xf32>
 }
+
+// -----
+
+func @split_vector_transfer_write_2d(%V: vector<4x8xf32>, %A: memref<?x8xf32>, %i: index, %j: index) {
+  vector.transfer_write %V, %A[%i, %j] :
+    vector<4x8xf32>, memref<?x8xf32>
+  return
+}
+
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 + 4)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 8)>
+// CHECK:     func @split_vector_transfer_write_2d(
+// CHECK-SAME:    %[[VEC:.*]]: vector<4x8xf32>,
+// CHECK-SAME:    %[[DEST:.*]]: memref<?x8xf32>,
+// CHECK-SAME:    %[[I:.*]]: index,
+// CHECK-SAME:    %[[J:.*]]: index) {
+// CHECK-DAG:   %[[C8:.*]] = constant 8 : index
+// CHECK-DAG:   %[[C0:.*]] = constant 0 : index
+// CHECK-DAG:   %[[CT:.*]] = constant true
+// CHECK:       %[[TEMP:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32>
+// CHECK:       %[[VAL_8:.*]] = affine.apply #[[MAP0]]()[%[[I]]]
+// CHECK:       %[[DIM0:.*]] = memref.dim %[[DEST]], %[[C0]] : memref<?x8xf32>
+// CHECK:       %[[DIM0_IN:.*]] = cmpi sle, %[[VAL_8]], %[[DIM0]] : index
+// CHECK:       %[[DIM1:.*]] = affine.apply #[[MAP1]]()[%[[J]]]
+// CHECK:       %[[DIM1_IN:.*]] = cmpi sle, %[[DIM1]], %[[C8]] : index
+// CHECK:       %[[IN_BOUNDS:.*]] = and %[[DIM0_IN]], %[[DIM1_IN]] : i1
+// CHECK:       %[[IN_BOUND_DEST:.*]]:3 = scf.if %[[IN_BOUNDS]] ->
+// CHECK-SAME:      (memref<?x8xf32>, index, index) {
+// CHECK:         scf.yield %[[DEST]], %[[I]], %[[J]] : memref<?x8xf32>, index, index
+// CHECK:       } else {
+// CHECK:         %[[VAL_15:.*]] = memref.cast %[[TEMP]]
+// CHECK-SAME:        : memref<4x8xf32> to memref<?x8xf32>
+// CHECK:         scf.yield %[[VAL_15]], %[[C0]], %[[C0]]
+// CHECK-SAME:        : memref<?x8xf32>, index, index
+// CHECK:       }
+// CHECK:       vector.transfer_write %[[VEC]],
+// CHECK-SAME:      %[[IN_BOUND_DEST:.*]]#0[%[[IN_BOUND_DEST]]#1, %[[IN_BOUND_DEST]]#2]
+// CHECK-SAME:      {in_bounds = [true, true]} : vector<4x8xf32>, memref<?x8xf32>
+// CHECK:       %[[OUT_BOUNDS:.*]] = xor %[[IN_BOUNDS]], %[[CT]] : i1
+// CHECK:       scf.if %[[OUT_BOUNDS]] {
+// CHECK:         %[[CASTED:.*]] = vector.type_cast %[[TEMP]]
+// CHECK-SAME:        : memref<4x8xf32> to memref<vector<4x8xf32>>
+// CHECK:         %[[RESULT_COPY:.*]] = memref.load %[[CASTED]][]
+// CHECK-SAME:        : memref<vector<4x8xf32>>
+// CHECK:         vector.transfer_write %[[RESULT_COPY]],
+// CHECK-SAME:        %[[DEST]][%[[I]], %[[J]]]
+// CHECK-SAME:        : vector<4x8xf32>, memref<?x8xf32>
+// CHECK:       }
+// CHECK:       return
+// CHECK:     }
+
+// LINALG-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 + 4)>
+// LINALG-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 8)>
+// LINALG-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (d0 - d1, 4)>
+// LINALG-DAG: #[[MAP3:.*]] = affine_map<(d0, d1, d2) -> (d0 - d1, 8)>
+// LINALG-DAG: #[[MAP4:.*]] = affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>
+// LINALG:     func @split_vector_transfer_write_2d(
+// LINALG-SAME:    %[[VEC:.*]]: vector<4x8xf32>,
+// LINALG-SAME:    %[[DEST:.*]]: memref<?x8xf32>,
+// LINALG-SAME:    %[[I:.*]]: index,
+// LINALG-SAME:    %[[J:.*]]: index) {
+// LINALG-DAG:   %[[CT:.*]] = constant true
+// LINALG-DAG:   %[[C0:.*]] = constant 0 : index
+// LINALG-DAG:   %[[C4:.*]] = constant 4 : index
+// LINALG-DAG:   %[[C8:.*]] = constant 8 : index
+// LINALG:       %[[TEMP:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32>
+// LINALG:       %[[IDX0:.*]] = affine.apply #[[MAP0]]()[%[[I]]]
+// LINALG:       %[[DIM0:.*]] = memref.dim %[[DEST]], %[[C0]] : memref<?x8xf32>
+// LINALG:       %[[DIM0_IN:.*]] = cmpi sle, %[[IDX0]], %[[DIM0]] : index
+// LINALG:       %[[DIM1:.*]] = affine.apply #[[MAP1]]()[%[[J]]]
+// LINALG:       %[[DIM1_IN:.*]] = cmpi sle, %[[DIM1]], %[[C8]] : index
+// LINALG:       %[[IN_BOUNDS:.*]] = and %[[DIM0_IN]], %[[DIM1_IN]] : i1
+// LINALG:       %[[IN_BOUND_DEST:.*]]:3 = scf.if %[[IN_BOUNDS]]
+// LINALG-SAME:      -> (memref<?x8xf32>, index, index) {
+// LINALG:         scf.yield %[[DEST]], %[[I]], %[[J]] : memref<?x8xf32>, index, index
+// LINALG:       } else {
+// LINALG:         %[[VAL_16:.*]] = memref.cast %[[TEMP]] : memref<4x8xf32> to memref<?x8xf32>
+// LINALG:         scf.yield %[[VAL_16]], %[[C0]], %[[C0]] : memref<?x8xf32>, index, index
+// LINALG:       }
+// LINALG:       vector.transfer_write %[[VEC]],
+// LINALG-SAME:      %[[IN_BOUND_DEST:.*]]#0[%[[IN_BOUND_DEST]]#1, %[[IN_BOUND_DEST]]#2]
+// LINALG-SAME:      {in_bounds = [true, true]} : vector<4x8xf32>, memref<?x8xf32>
+// LINALG:       %[[OUT_BOUNDS:.*]] = xor %[[IN_BOUNDS]], %[[CT]] : i1
+// LINALG:       scf.if %[[OUT_BOUNDS]] {
+// LINALG:         %[[VAL_19:.*]] = memref.dim %[[DEST]], %[[C0]] : memref<?x8xf32>
+// LINALG-DAG:     %[[VAL_20:.*]] = affine.min #[[MAP2]](%[[VAL_19]], %[[I]], %[[C4]])
+// LINALG-DAG:     %[[VAL_21:.*]] = affine.min #[[MAP3]](%[[C8]], %[[J]], %[[C8]])
+// LINALG:         %[[VAL_22:.*]] = memref.subview %[[TEMP]]
+// LINALG-SAME:        [%[[I]], %[[J]]] [%[[VAL_20]], %[[VAL_21]]]
+// LINALG-SAME:        [1, 1] : memref<4x8xf32> to memref<?x?xf32, #[[MAP4]]>
+// LINALG:         linalg.copy(%[[VAL_22]], %[[DEST]])
+// LINALG-SAME:        : memref<?x?xf32, #[[MAP4]]>, memref<?x8xf32>
+// LINALG:       }
+// LINALG:       return
+// LINALG:     }
+
+// -----
+
+func @split_vector_transfer_write_strided_2d(
+    %V: vector<4x8xf32>, %A: memref<7x8xf32, offset:?, strides:[?, 1]>,
+    %i: index, %j: index) {
+  vector.transfer_write %V, %A[%i, %j] :
+    vector<4x8xf32>, memref<7x8xf32, offset:?, strides:[?, 1]>
+  return
+}
+
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 4)>
+// CHECK-DAG: #[[MAP2:.*]] = affine_map<()[s0] -> (s0 + 8)>
+// CHECK:     func @split_vector_transfer_write_strided_2d(
+// CHECK-SAME:    %[[VEC:.*]]: vector<4x8xf32>,
+// CHECK-SAME:    %[[DEST:.*]]: memref<7x8xf32, #[[MAP0]]>,
+// CHECK-SAME:    %[[I:.*]]: index,
+// CHECK-SAME:    %[[J:.*]]: index) {
+// CHECK-DAG:   %[[C7:.*]] = constant 7 : index
+// CHECK-DAG:   %[[C8:.*]] = constant 8 : index
+// CHECK-DAG:   %[[C0:.*]] = constant 0 : index
+// CHECK-DAG:   %[[CT:.*]] = constant true
+// CHECK:       %[[TEMP:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32>
+// CHECK:       %[[DIM0:.*]] = affine.apply #[[MAP1]]()[%[[I]]]
+// CHECK:       %[[DIM0_IN:.*]] = cmpi sle, %[[DIM0]], %[[C7]] : index
+// CHECK:       %[[DIM1:.*]] = affine.apply #[[MAP2]]()[%[[J]]]
+// CHECK:       %[[DIM1_IN:.*]] = cmpi sle, %[[DIM1]], %[[C8]] : index
+// CHECK:       %[[IN_BOUNDS:.*]] = and %[[DIM0_IN]], %[[DIM1_IN]] : i1
+// CHECK:       %[[IN_BOUND_DEST:.*]]:3 = scf.if %[[IN_BOUNDS]]
+// CHECK-SAME:      -> (memref<?x8xf32, #[[MAP0]]>, index, index) {
+// CHECK:         %[[VAL_15:.*]] = memref.cast %[[DEST]]
+// CHECK-SAME:        : memref<7x8xf32, #[[MAP0]]> to memref<?x8xf32, #[[MAP0]]>
+// CHECK:         scf.yield %[[VAL_15]], %[[I]], %[[J]]
+// CHECK-SAME:        : memref<?x8xf32, #[[MAP0]]>, index, index
+// CHECK:       } else {
+// CHECK:         %[[VAL_16:.*]] = memref.cast %[[TEMP]]
+// CHECK-SAME:        : memref<4x8xf32> to memref<?x8xf32, #[[MAP0]]>
+// CHECK:         scf.yield %[[VAL_16]], %[[C0]], %[[C0]]
+// CHECK-SAME:        : memref<?x8xf32, #[[MAP0]]>, index, index
+// CHECK:       }
+// CHECK:       vector.transfer_write %[[VEC]],
+// CHECK-SAME:      %[[IN_BOUND_DEST:.*]]#0
+// CHECK-SAME:      [%[[IN_BOUND_DEST]]#1, %[[IN_BOUND_DEST]]#2]
+// CHECK-SAME:      {in_bounds = [true, true]} : vector<4x8xf32>, memref<?x8xf32, #[[MAP0]]>
+// CHECK:       %[[OUT_BOUNDS:.*]] = xor %[[IN_BOUNDS]], %[[CT]] : i1
+// CHECK:       scf.if %[[OUT_BOUNDS]] {
+// CHECK:         %[[VAL_19:.*]] = vector.type_cast %[[TEMP]]
+// CHECK-SAME:        : memref<4x8xf32> to memref<vector<4x8xf32>>
+// CHECK:         %[[VAL_20:.*]] = memref.load %[[VAL_19]][]
+// CHECK-SAME:        : memref<vector<4x8xf32>>
+// CHECK:         vector.transfer_write %[[VAL_20]], %[[DEST]][%[[I]], %[[J]]]
+// CHECK-SAME:        : vector<4x8xf32>, memref<7x8xf32, #[[MAP0]]>
+// CHECK:       }
+// CHECK:       return
+// CHECK:     }
+
+// LINALG-DAG: #[[MAP0:.*]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)>
+// LINALG-DAG: #[[MAP1:.*]] = affine_map<()[s0] -> (s0 + 4)>
+// LINALG-DAG: #[[MAP2:.*]] = affine_map<()[s0] -> (s0 + 8)>
+// LINALG-DAG: #[[MAP3:.*]] = affine_map<(d0, d1, d2) -> (d0 - d1, 4)>
+// LINALG-DAG: #[[MAP4:.*]] = affine_map<(d0, d1, d2) -> (d0 - d1, 8)>
+// LINALG-DAG: #[[MAP5:.*]] = affine_map<(d0, d1)[s0] -> (d0 * 8 + s0 + d1)>
+// LINALG:     func @split_vector_transfer_write_strided_2d(
+// LINALG-SAME:    %[[VEC:.*]]: vector<4x8xf32>,
+// LINALG-SAME:    %[[DEST:.*]]: memref<7x8xf32, #[[MAP0]]>,
+// LINALG-SAME:    %[[I:.*]]: index,
+// LINALG-SAME:    %[[J:.*]]: index) {
+// LINALG-DAG:   %[[C0:.*]] = constant 0 : index
+// LINALG-DAG:   %[[CT:.*]] = constant true
+// LINALG-DAG:   %[[C7:.*]] = constant 7 : index
+// LINALG-DAG:   %[[C4:.*]] = constant 4 : index
+// LINALG-DAG:   %[[C8:.*]] = constant 8 : index
+// LINALG:       %[[TEMP:.*]] = memref.alloca() {alignment = 32 : i64} : memref<4x8xf32>
+// LINALG:       %[[DIM0:.*]] = affine.apply #[[MAP1]]()[%[[I]]]
+// LINALG:       %[[DIM0_IN:.*]] = cmpi sle, %[[DIM0]], %[[C7]] : index
+// LINALG:       %[[DIM1:.*]] = affine.apply #[[MAP2]]()[%[[J]]]
+// LINALG:       %[[DIM1_IN:.*]] = cmpi sle, %[[DIM1]], %[[C8]] : index
+// LINALG:       %[[IN_BOUNDS:.*]] = and %[[DIM0_IN]], %[[DIM1_IN]] : i1
+// LINALG:       %[[IN_BOUND_DEST:.*]]:3 = scf.if %[[IN_BOUNDS]]
+// LINALG-SAME:      -> (memref<?x8xf32, #[[MAP0]]>, index, index) {
+// LINALG:         %[[VAL_16:.*]] = memref.cast %[[DEST]]
+// LINALG-SAME:        : memref<7x8xf32, #[[MAP0]]> to memref<?x8xf32, #[[MAP0]]>
+// LINALG:         scf.yield %[[VAL_16]], %[[I]], %[[J]]
+// LINALG-SAME:        : memref<?x8xf32, #[[MAP0]]>, index, index
+// LINALG:       } else {
+// LINALG:         %[[VAL_17:.*]] = memref.cast %[[TEMP]]
+// LINALG-SAME:        : memref<4x8xf32> to memref<?x8xf32, #[[MAP0]]>
+// LINALG:         scf.yield %[[VAL_17]], %[[C0]], %[[C0]]
+// LINALG-SAME:        : memref<?x8xf32, #[[MAP0]]>, index, index
+// LINALG:       }
+// LINALG:       vector.transfer_write %[[VEC]],
+// LINALG-SAME:      %[[IN_BOUND_DEST:.*]]#0
+// LINALG-SAME:      [%[[IN_BOUND_DEST]]#1, %[[IN_BOUND_DEST]]#2]
+// LINALG-SAME:      {in_bounds = [true, true]}
+// LINALG-SAME:      : vector<4x8xf32>, memref<?x8xf32, #[[MAP0]]>
+// LINALG:       %[[OUT_BOUNDS:.*]] = xor %[[IN_BOUNDS]], %[[CT]] : i1
+// LINALG:       scf.if %[[OUT_BOUNDS]] {
+// LINALG-DAG:     %[[VAL_20:.*]] = affine.min #[[MAP3]](%[[C7]], %[[I]], %[[C4]])
+// LINALG-DAG:     %[[VAL_21:.*]] = affine.min #[[MAP4]](%[[C8]], %[[J]], %[[C8]])
+// LINALG:         %[[VAL_22:.*]] = memref.subview %[[TEMP]]
+// LINALG-SAME:        [%[[I]], %[[J]]] [%[[VAL_20]], %[[VAL_21]]]
+// LINALG-SAME:        [1, 1] : memref<4x8xf32> to memref<?x?xf32, #[[MAP5]]>
+// LINALG:         linalg.copy(%[[VAL_22]], %[[DEST]])
+// LINALG-SAME:        : memref<?x?xf32, #[[MAP5]]>, memref<7x8xf32, #[[MAP0]]>
+// LINALG:       }
+// LINALG:       return
+// LINALG:     }
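For reference, the sketch below shows one plausible way to drive the new write splitting directly from C++, outside the -test-vector-transfer-full-partial-split pass exercised by the RUN lines above. It is a minimal sketch, not part of this patch: the helper `splitAllTransferWrites` is hypothetical, the exact signature of `vector::splitFullAndPartialTransfer` is assumed to match its declaration in VectorTransforms.h at the time of this change, and the transfers are assumed to already satisfy the documented preconditions (minor identity permutation map, matching ranks, no mask), since the function asserts them.

```cpp
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Dialect/Vector/VectorTransforms.h"
#include "mlir/IR/BuiltinOps.h"

using namespace mlir;

// Hypothetical helper: split every vector.transfer_write in `funcOp` into an
// in-bounds fastpath and an out-of-bounds slowpath, using linalg.copy for the
// slowpath (VectorTransfer would select the vector.transfer-based slowpath).
static void splitAllTransferWrites(FuncOp funcOp) {
  vector::VectorTransformsOptions options;
  options.vectorTransferSplit = vector::VectorTransferSplit::LinalgCopy;

  // Collect the ops first: splitting erases the original transfer_write and
  // creates new ones that should not be revisited.
  SmallVector<Operation *, 8> writes;
  funcOp.walk([&](vector::TransferWriteOp op) { writes.push_back(op); });

  for (Operation *op : writes) {
    OpBuilder b(op);
    // Assumed signature: (OpBuilder &, VectorTransferOpInterface,
    // VectorTransformsOptions, scf::IfOp * = nullptr) -> LogicalResult.
    (void)vector::splitFullAndPartialTransfer(
        b, cast<VectorTransferOpInterface>(op), options);
  }
}
```

In-tree, the same splitting is normally applied through the `VectorTransferFullPartialRewriter` pattern rather than by calling the function directly; the direct call is shown here only to make the control flow of the new write path easy to follow.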